diff --git a/charts/controlplane/Chart.yaml b/charts/controlplane/Chart.yaml index 4fa0b630..cefb6d5c 100644 --- a/charts/controlplane/Chart.yaml +++ b/charts/controlplane/Chart.yaml @@ -3,7 +3,7 @@ name: controlplane description: Deploys the Union controlplane components to onboard a kubernetes cluster to the Union Cloud. type: application icon: https://i.ibb.co/JxfDQsL/Union-Symbol-yellow-2.png -version: 2026.4.2 +version: 2026.4.4 appVersion: 2026.4.5 kubeVersion: '>= 1.28.0-0' dependencies: diff --git a/charts/controlplane/templates/_helpers.tpl b/charts/controlplane/templates/_helpers.tpl index 816d6ca2..a57d4990 100644 --- a/charts/controlplane/templates/_helpers.tpl +++ b/charts/controlplane/templates/_helpers.tpl @@ -284,7 +284,7 @@ app.kubernetes.io/instance: {{ .Release.Name }} {{- include "unionai.fullname" . | trim -}} {{- end }} {{- else }} -default +union {{- end }} {{- end }} @@ -580,7 +580,7 @@ app.kubernetes.io/instance: {{ .Release.Name }} {{- if .Values.console.serviceAccount.create }} {{- default (include "console.fullname" .) 
.Values.console.serviceAccount.name }} {{- else }} -{{- default "default" .Values.console.serviceAccount.name }} +{{- default "union" .Values.console.serviceAccount.name }} {{- end }} {{- end }} diff --git a/charts/controlplane/templates/union-serviceaccount.yaml b/charts/controlplane/templates/union-serviceaccount.yaml new file mode 100644 index 00000000..ec3eeff2 --- /dev/null +++ b/charts/controlplane/templates/union-serviceaccount.yaml @@ -0,0 +1,9 @@ +{{- if (index .Values "controlplane" | default dict).enabled }} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: union + namespace: {{ .Release.Namespace }} + labels: + {{- include "unionai.labels" (dict "key" "union" "Release" .Release "Values" .Values "Chart" .Chart) | nindent 4 }} +{{- end }} diff --git a/charts/dataplane-crds/Chart.yaml b/charts/dataplane-crds/Chart.yaml index b5d635ee..20a24af3 100644 --- a/charts/dataplane-crds/Chart.yaml +++ b/charts/dataplane-crds/Chart.yaml @@ -3,8 +3,8 @@ name: dataplane-crds description: Deploys the Union dataplane CRDs. type: application icon: https://i.ibb.co/JxfDQsL/Union-Symbol-yellow-2.png -version: 2026.4.0 -appVersion: 2026.3.6 +version: 2026.4.3 +appVersion: 2026.4.1 kubeVersion: '>= 1.28.0-0' dependencies: - name: prometheus-operator-crds diff --git a/charts/dataplane/Chart.yaml b/charts/dataplane/Chart.yaml index 450aecc0..d1c7c07e 100644 --- a/charts/dataplane/Chart.yaml +++ b/charts/dataplane/Chart.yaml @@ -3,26 +3,32 @@ name: dataplane description: Deploys the Union dataplane components to onboard a kubernetes cluster to the Union Cloud. 
type: application icon: https://i.ibb.co/JxfDQsL/Union-Symbol-yellow-2.png -version: 2026.4.2 +version: 2026.4.4 appVersion: 2026.4.5 kubeVersion: '>= 1.28.0-0' dependencies: -- name: kube-state-metrics - repository: https://prometheus-community.github.io/helm-charts - version: 5.30.1 - name: kube-prometheus-stack repository: https://prometheus-community.github.io/helm-charts version: 80.8.0 alias: monitoring condition: monitoring.enabled +- name: prometheus + repository: https://prometheus-community.github.io/helm-charts + version: 25.27.0 + alias: prometheus + condition: prometheus.enabled - name: metrics-server repository: https://kubernetes-sigs.github.io/metrics-server/ version: 3.12.2 condition: metrics-server.enabled alias: metrics-server +- name: knative-operator-crds + repository: file://../knative-operator-crds + version: 2025.6.3 + condition: knative-operator-crds.enabled - name: knative-operator - repository: https://unionai.github.io/helm-charts - version: 2025.5.0 + repository: file://../knative-operator + version: 2025.6.3 alias: knative-operator condition: knative-operator.enabled - name: fluent-bit diff --git a/charts/dataplane/files/recording-rules.yml b/charts/dataplane/files/recording-rules.yml new file mode 100644 index 00000000..2ca18f57 --- /dev/null +++ b/charts/dataplane/files/recording-rules.yml @@ -0,0 +1,710 @@ +groups: + - name: cost_calculations_15s + interval: 15s + rules: + - record: pod_gpu_allocation + expr: | + sum by (namespace, pod) (DCGM_FI_DEV_GPU_UTIL >= bool 0) * on (namespace, pod) group_left() (max by (namespace, pod) (kube_pod_status_phase{phase=~"Running|Pending"} == 1)) + - record: execution_info # A join metric to look up execution-level info. Used to disambiguate workflow/task executions, apps, and workspaces. 
+ expr: | + max by (label_domain, label_project, label_entity_name, label_execution_id, label_entity_id)( + label_replace( + label_replace( + label_replace( + label_replace( + label_replace( + flyte:propeller:all:round:execution_info{domain!="", project!="", workflow_name!="", execution_id!=""}, # filter for workflow/task executions + "label_entity_id", "$1", "execution_id", "(.*)" # join key + ), "label_entity_name", "$1", "workflow_name", "(.*)" # set label_entity_name to the workflow/task name from the workflow_execution_id + ), + "label_execution_id", "$1", "execution_id", "(.*)" + ), + "label_project", "$1", "project", "(.*)" # project + ), + "label_domain", "$1", "domain", "(.*)" # domain + ) + ) + - record: app_info # A join metric to look up app-level info. Used to disambiguate workflow/task executions, apps, and workspaces. + expr: | + max by (label_domain, label_project, label_app_name, label_app_version, label_entity_id)( + label_replace( + label_replace( + label_replace( + kube_pod_labels{ + label_domain!="", + label_project!="", + label_serving_unionai_dev_app_name!="", + label_serving_knative_dev_revision!="" + }, # this filters for apps + "label_app_name", "$1", "label_serving_unionai_dev_app_name", "(.*)" # rename to cleanup + ), + "label_app_version", "$1", "label_serving_knative_dev_revision", "(.*)" # the app_version is equivalent to an execution_id for workflows (lowest level of granularity) + ), + "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # join key + ) + ) + - record: workspace_info # A join metric to look up workspace info. Used to disambiguate workflow/task executions, apps, and workspaces. 
+ expr: | + max by (label_domain, label_project, label_workspace_name, label_entity_id)( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # filter for workspaces + "label_entity_id", "$1", "label_node_id", "(.*)" # join key + ), "label_workspace_name", "$1", "label_node_id", "(.*)" # set label_workspace_name to the workspace name from the kube_pod_labels + ) + ) + - record: entity_id:mem_usage_bytes_total_per_node:sum # Allocated memory (max(requested, consumed)) aggregated per node and entity, where entity is either a task/workflow execution or an app. + expr: | + sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( # aggregate up to entity + # First, calculate the allocated memory for each pod + max by (namespace, pod) ( # this is the case where consumed (the memory working set) exceeds requested memory + ( + sum by (namespace, pod) ( + container_memory_working_set_bytes{namespace!="",pod!="",image!=""} + ) + > sum by (namespace, pod) ( + kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="memory"} + ) + ) + or sum by (namespace, pod) ( # this is the case where memory requests are <= consumed memory + kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="memory"} # needed to add node!="" to dedupe + ) + ) + # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but we do not want to double the number of pod-level metrics we save + * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) + "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") + ), + "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels + label_replace( + label_replace( + kube_pod_labels{ + label_domain!="", + label_project!="", + label_serving_unionai_dev_app_name!="", + label_serving_knative_dev_revision!="" + }, # this filters for apps only + "label_entity_type", "app", "", "" # set label_entity_type to "app" + ), + "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) + "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" + ), 
+ "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + ) + # Then filter for pods only in the "Running" or "Pending" phase + * on (namespace, pod) group_left() ( + max by (namespace, pod) ( + kube_pod_status_phase{phase=~"Running|Pending"} == 1 + ) + ) + # Now join in node identifiers which are used for subsequent overhead calculations + * on (namespace, pod) group_left(node) ( + max by (namespace, pod, node) (kube_pod_info{node!=""}) # needed to add node!="" to dedupe + ) + ) + - record: entity_id:cpu_usage_per_node:sum # Allocated cpu (max(requested, consumed)) aggregated per node and entity, where entity is either a task/workflow execution or an app. + expr: | + sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( + # First, calculate the allocated cpu for each pod + max by (namespace, pod) ( # this is the case where consumed (the cpu usage seconds total) exceeds requested cpu + ( + sum by (namespace, pod) ( + irate(container_cpu_usage_seconds_total{namespace!="",pod!="",image!=""}[5m]) + ) + > sum by (namespace, pod) ( + kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="cpu"} + ) + ) + or sum by (namespace, pod) ( # this is the case where cpu requests are <= consumed cpu + kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="cpu"} + ) + ) + # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but I didn't want to double the number of pod-level metrics we save + * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) + "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") + ), + "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels + label_replace( + label_replace( + kube_pod_labels{ + label_domain!="", + label_project!="", + label_serving_unionai_dev_app_name!="", + label_serving_knative_dev_revision!="" + }, # this filters for apps only + "label_entity_type", "app", "", "" # set label_entity_type to "app" + ), + "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) + "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" + ), 
+ "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + ) + # Then filter for pods only in the "Running" or "Pending" phase + * on (namespace, pod) group_left() ( + max by (namespace, pod) ( + kube_pod_status_phase{phase=~"Running|Pending"} == 1 + ) + ) + # Now join in node identifiers which are used for subsequent overhead calculations + * on (namespace, pod) group_left(node) ( + max by (namespace, pod, node) (kube_pod_info{node!=""}) # needed to add node!="" to dedupe + ) + ) + - record: entity_id:gpu_usage_per_node:sum # Allocated gpu aggregated per node and entity, where entity is either a task/workflow execution or an app. + expr: | + sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( + # First, grab the allocated gpu for each pod (which is always either 1 or zero, since k8s can't split gpus the way it can with cpu/memory) + max by (namespace, pod) ( + pod_gpu_allocation + ) + # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but I didn't want to double the number of pod-level metrics we save + * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) + "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") + ), + "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels + label_replace( + label_replace( + kube_pod_labels{ + label_domain!="", + label_project!="", + label_serving_unionai_dev_app_name!="", + label_serving_knative_dev_revision!="" + }, # this filters for apps only + "label_entity_type", "app", "", "" # set label_entity_type to "app" + ), + "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) + "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" + ), 
+ "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + ) + # Then filter for pods only in the "Running" or "Pending" phase + * on (namespace, pod) group_left() ( + max by (namespace, pod) ( + kube_pod_status_phase{phase=~"Running|Pending"} == 1 + ) + ) + # Now join in node identifiers which are used for subsequent overhead calculations + * on (namespace, pod) group_left(node) ( + max by (namespace, pod, node) (kube_pod_info{node!=""}) # needed to add node!="" to dedupe + ) + ) + - record: entity_id:used_mem_bytes:sum # the sum of used memory across all containers in an entity (numerator for aggregate utilization calculations) + expr: | + sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # aggregate up to entity + # First, calculate the used memory for each pod + sum by (namespace, pod) ( + container_memory_working_set_bytes{namespace!="",pod!="",image!=""} + ) + # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but we do not want to double the number of pod-level metrics we save + * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) + "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") + ), + "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels + label_replace( + label_replace( + kube_pod_labels{ + label_domain!="", + label_project!="", + label_serving_unionai_dev_app_name!="", + label_serving_knative_dev_revision!="" + }, # this filters for apps only + "label_entity_type", "app", "", "" # set label_entity_type to "app" + ), + "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) + "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" + ), 
+ "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + ) + # Then filter for pods only in the "Running" or "Pending" phase + * on (namespace, pod) group_left() ( + max by (namespace, pod) ( + kube_pod_status_phase{phase=~"Running|Pending"} == 1 + ) + ) + ) + - record: entity_id:allocated_mem_bytes:sum # the sum of allocated memory across all containers in an entity (denominator for aggregate utilization calculations) + expr: | + sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # aggregate up to entity (remove node) + entity_id:mem_usage_bytes_total_per_node:sum + ) + - record: entity_id:used_cpu:sum # the sum of used cpu across all containers in an entity (numerator for aggregate utilization calculations) + expr: | + sum by (label_entity_type, label_domain, label_project, label_entity_id) ( + sum by (namespace, pod) ( + irate(container_cpu_usage_seconds_total{namespace!="",pod!="",image!=""}[5m]) + ) + # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but I didn't want to double the number of pod-level metrics we save + * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) + "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") + ), + "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels + label_replace( + label_replace( + kube_pod_labels{ + label_domain!="", + label_project!="", + label_serving_unionai_dev_app_name!="", + label_serving_knative_dev_revision!="" + }, # this filters for apps only + "label_entity_type", "app", "", "" # set label_entity_type to "app" + ), + "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) + "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" + ), 
+ "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + ) + # Then filter for pods only in the "Running" or "Pending" phase + * on (namespace, pod) group_left() ( + max by (namespace, pod) ( + kube_pod_status_phase{phase=~"Running|Pending"} == 1 + ) + ) + ) + - record: entity_id:allocated_cpu:sum # the sum of allocated cpu across all containers in an entity (denominator for aggregate utilization calculations) + expr: | + sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # aggregate up to entity (remove node) + entity_id:cpu_usage_per_node:sum + ) + - record: entity_id:sm_occupancy:avg # the simple average of SM occupancy (a good generic measure of GPU utilization) per entity + expr: | + avg by (label_entity_type, label_domain, label_project, label_entity_id) ( + # First, grab the SM occupancy for each pod + max by (namespace, pod) ( + DCGM_FI_PROF_SM_OCCUPANCY # SM occupancy is a good proxy for actual GPU usage + ) + # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but I didn't want to double the number of pod-level metrics we save + * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) + "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") + ), + "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels + label_replace( + label_replace( + kube_pod_labels{ + label_domain!="", + label_project!="", + label_serving_unionai_dev_app_name!="", + label_serving_knative_dev_revision!="" + }, # this filters for apps only + "label_entity_type", "app", "", "" # set label_entity_type to "app" + ), + "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) + "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" + ), 
+ "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + ) + # Then filter for pods only in the "Running" or "Pending" phase + * on (namespace, pod) group_left() ( + max by (namespace, pod) ( + kube_pod_status_phase{phase=~"Running|Pending"} == 1 + ) + ) + ) + - record: entity_id:gpu_count:sum # the count of running gpu pods per entity (need this to weight the gpu utilization when aggregating upwards - i.e. project-level) + expr: | + sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( + # First, grab the allocated gpu for each pod (which is always either 1 or zero, since k8s can't split gpus the way it can with cpu/memory) + max by (namespace, pod) ( + pod_gpu_allocation + ) + # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). Note that this is repetitive but I didn't want to double the number of pod-level metrics we save + * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) + "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") + ), + "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + or + max by (label_entity_type, label_domain, 
label_project, label_entity_id, namespace, pod)( # adds app labels + label_replace( + label_replace( + kube_pod_labels{ + label_domain!="", + label_project!="", + label_serving_unionai_dev_app_name!="", + label_serving_knative_dev_revision!="" + }, # this filters for apps only + "label_entity_type", "app", "", "" # set label_entity_type to "app" + ), + "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) + ) + ) + or + max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels + label_replace( + label_replace( + label_replace( + label_replace( + kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) + "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" + ), + "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) + ), + "label_domain", "$1", "label_domain", "(.*)" + ), + "label_project", "$1", "label_project", "(.*)" + ) + ) + ) + # Then filter for pods only in the "Running" or "Pending" phase + * on (namespace, pod) group_left() ( + max by (namespace, pod) ( + kube_pod_status_phase{phase=~"Running|Pending"} == 1 + ) + ) + ) + - record: entity_id:weighted_sm_occupancy:sum # product of SM occupancy and allocated GPU count (something like "used memory", numerator of weighted calcs) + expr: | + entity_id:sm_occupancy:avg + * on (label_domain, label_project, label_entity_type, label_entity_id) entity_id:gpu_count:sum + - record: entity_id:allocated_mem_cost:sum # Allocated cost of memory for each workflow/task execution and app. 
+ expr: | + sum by (label_entity_type, label_domain, label_project, label_entity_id, type) ( + entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB + * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type + ) + - record: entity_id:allocated_cpu_cost:sum # Allocated cost of cpu for each workflow/task execution and app. + expr: | + sum by (label_entity_type, label_domain, label_project, label_entity_id, type)( + entity_id:cpu_usage_per_node:sum + * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type + ) + - record: entity_id:allocated_gpu_cost:sum # Allocated cost of gpu for each workflow/task execution and app. + expr: | + sum by (label_entity_type, label_domain, label_project, label_entity_id, type)( + entity_id:gpu_usage_per_node:sum + * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type + ) + - record: entity_id:allocated_cost:sum # Allocated cost of memory, cpu, and gpu for each workflow/task execution and app. 
+ expr: | + label_replace( + sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # for the sum to work, the labels need to be different on each "or" element (type label) + entity_id:allocated_mem_cost:sum + or + entity_id:allocated_cpu_cost:sum + or + entity_id:allocated_gpu_cost:sum + ), + "type", "allocated", "", "" # add type info + ) + - record: entity_id:overhead_cost:sum # The amount of overhead costs (node costs that we can't allocate with container resources) to allocate to each entity (workflow/task execution or app) + expr: | + label_replace( + sum by (label_entity_type, label_entity_id, label_domain, label_project)( # Aggregate the per-node metrics up to workflow/task execution or app (label_entity_id) + # Start with each execution's and app's allocated cost per node + sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( # for the sum to work, the labels need to be different on each "or" element (type label) + entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB + * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type + or + entity_id:cpu_usage_per_node:sum + * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type + or + entity_id:gpu_usage_per_node:sum + * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type + ) + # Then divide out the total allocated cost per node to get the proportion of allocated cost associated with each entity + / on (node) group_left()( + sum by (node) ( # for the sum to work, the labels need to be different on each "or" element (type label) + entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 
1024 * 1024) # convert bytes to GB + * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type + or + entity_id:cpu_usage_per_node:sum + * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type + or + entity_id:gpu_usage_per_node:sum + * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type + ) + > 0 # need to avoid dividing by zero, or gaps in the data can cause NaNs to proliferate, borking all charts + ) + # Then multiply by the overhead cost per node + * on (node) group_left() ( + # To calculate overhead, start with the true cost of running each node + avg by (node)(kube_node_labels{label_flyte_org_node_role="worker"}) # only look at worker nodes + * on (node) max by (node) ( + node_total_hourly_cost{instance_type!=""} # sometimes, the instance_type can be null, causing an unlabeled label to show up in the Compute Costs dashboard charts + ) * (15 / 3600) # convert hourly cost to 15-secondly cost + # Then subtract out the total allocated cost on each node + - on (node) group_left()( + sum by (node) ( # for the sum to work, the labels need to be different on each "or" element (type label) + entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB + * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type + or + entity_id:cpu_usage_per_node:sum + * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type + or + entity_id:gpu_usage_per_node:sum + * on (node) 
group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type + ) + ) + ) + ), + "type", "overhead", "", "" # add type info + ) + - record: entity_id:total_cost:sum # Total cost of each entity (workflow/task execution or app), including allocated (from container resources) and overhead (proportion of unallocated node costs) + expr: | + label_replace( + sum by (label_domain, label_project, label_entity_id, label_entity_type) ( + entity_id:allocated_cost:sum + or + entity_id:overhead_cost:sum + ), + "type", "total", "", "" # add type info + ) + - record: node:total_cost:sum # Total cost of all nodes + expr: | + sum ( + avg by (node)(kube_node_labels{label_flyte_org_node_role="worker", label_node_kubernetes_io_instance_type!=""}) # only look at worker nodes + * on (node) group_left() node_total_hourly_cost{instance_type!=""} * (15 / 3600) # convert hourly cost to 15-secondly cost + ) + - record: node_type:total_cost:sum # Total cost of nodes grouped by node type + expr: | + sum by (node_type)( + avg by (node)(kube_node_labels{label_flyte_org_node_role="worker", label_node_kubernetes_io_instance_type!=""}) # only look at worker nodes + * on (node) group_left(node_type) label_replace(node_total_hourly_cost{instance_type!=""}, "node_type", "$1", "instance_type", "(.*)") * (15 / 3600) # convert hourly cost to 15-secondly cost and rename label + ) + - record: node_type:uptime_hours:sum # Total uptime of nodes grouped by node type + expr: | + sum by (node_type)( + avg by (node, node_type)( # dedupe + label_replace(kube_node_labels{label_flyte_org_node_role="worker", label_node_kubernetes_io_instance_type!=""}, "node_type", "$1", "label_node_kubernetes_io_instance_type", "(.*)") # relabel + ) + ) * (15 / 3600) # convert to number of hours per 15-second observation # Aggregate the above into visible metrics + - name: cost_rollup_15m + interval: 15m + rules: + - record: 
execution_info15m + expr: | + max_over_time(execution_info[15m:15s]) + - record: app_info15m + expr: | + max_over_time(app_info[15m:15s]) + - record: workspace_info15m + expr: | + max_over_time(workspace_info[15m:15s]) + - record: entity_id:allocated_mem_bytes:sum15m + expr: | + sum_over_time(entity_id:allocated_mem_bytes:sum[15m:15s]) + - record: entity_id:used_mem_bytes:sum15m + expr: | + sum_over_time(entity_id:used_mem_bytes:sum[15m:15s]) + - record: entity_id:allocated_cpu:sum15m + expr: | + sum_over_time(entity_id:allocated_cpu:sum[15m:15s]) + - record: entity_id:used_cpu:sum15m + expr: | + sum_over_time(entity_id:used_cpu:sum[15m:15s]) + - record: entity_id:weighted_sm_occupancy:sum15m + expr: | + sum_over_time(entity_id:weighted_sm_occupancy:sum[15m:15s]) + - record: entity_id:gpu_count:sum15m + expr: | + sum_over_time(entity_id:gpu_count:sum[15m:15s]) + - record: entity_id:allocated_mem_cost:sum15m + expr: | + sum_over_time(entity_id:allocated_mem_cost:sum[15m:15s]) + - record: entity_id:allocated_cpu_cost:sum15m + expr: | + sum_over_time(entity_id:allocated_cpu_cost:sum[15m:15s]) + - record: entity_id:allocated_gpu_cost:sum15m + expr: | + sum_over_time(entity_id:allocated_gpu_cost:sum[15m:15s]) + - record: entity_id:allocated_cost:sum15m + expr: | + sum_over_time(entity_id:allocated_cost:sum[15m:15s]) + - record: entity_id:overhead_cost:sum15m + expr: | + sum_over_time(entity_id:overhead_cost:sum[15m:15s]) + - record: entity_id:total_cost:sum15m + expr: | + sum_over_time(entity_id:total_cost:sum[15m:15s]) + - record: node:total_cost:sum15m + expr: | + sum_over_time(node:total_cost:sum[15m:15s]) + - record: node_type:total_cost:sum15m + expr: | + sum_over_time(node_type:total_cost:sum[15m:15s]) + - record: node_type:uptime_hours:sum15m + expr: | + sum_over_time(node_type:uptime_hours:sum[15m:15s]) diff --git a/charts/dataplane/templates/_helpers.tpl b/charts/dataplane/templates/_helpers.tpl index b1f9d611..ed7149c6 100644 --- 
a/charts/dataplane/templates/_helpers.tpl +++ b/charts/dataplane/templates/_helpers.tpl @@ -241,7 +241,7 @@ tolerations: {{- end -}} {{- define "flytepropellerwebhook.selectorLabels" -}} -app.kubernetes.io/name: flyte-pod-webhook +app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: {{ .Release.Name }} {{- end -}} @@ -526,7 +526,11 @@ tolerations: Create the name of the service account to use */}} {{- define "operator.serviceAccountName" -}} -{{- default "operator-system" .Values.operator.serviceAccount.name }} +{{- if include "useCommonServiceAccount" . -}} +{{- include "common.serviceAccountName" . -}} +{{- else -}} +{{- default "operator-system" .Values.operator.serviceAccount.name -}} +{{- end -}} {{- end }} {{- define "operator.serviceAccount.annotations" -}} @@ -634,7 +638,11 @@ clusterData: Create the name of the service account to use */}} {{- define "proxy.serviceAccountName" -}} -{{- default "proxy-system" .Values.proxy.serviceAccount.name }} +{{- if include "useCommonServiceAccount" . -}} +{{- include "common.serviceAccountName" . -}} +{{- else -}} +{{- default "proxy-system" .Values.proxy.serviceAccount.name -}} +{{- end -}} {{- end }} {{- define "proxy.serviceAccount.annotations" -}} @@ -724,18 +732,26 @@ access the storage is injected. {{- end }} {{- end -}} +{{- define "prometheus.service.name" -}} +union-operator-prometheus +{{- end -}} + {{- define "prometheus.health.url" -}} -http://{{ include "union-operator.fullname" . }}-prometheus:80/-/healthy +http://{{ include "prometheus.service.name" . }}:80/-/healthy {{- end -}} {{- define "prometheus.service.url" -}} -http://{{ include "union-operator.fullname" . }}-prometheus:80 +http://{{ include "prometheus.service.name" . }}:80 {{- end -}} {{- define "propeller.health.url" -}} http://flytepropeller:10254 {{- end -}} +{{- define "executor.health.url" -}} +http://union-operator-executor:10254 +{{- end -}} + {{- define "proxy.health.url" -}} http://{{ include "union-operator.fullname" . 
}}-proxy:10254 {{- end -}} @@ -1062,6 +1078,14 @@ platform.union.ai/service-group: {{ .Release.Name }} app.kubernetes.io/managed-by: {{ .Release.Service }} {{- end -}} +{{/* +Check if Depot should be enabled for image building. +True when imageBuilder is enabled, buildkit is not, and no custom buildkitUri is set. +*/}} +{{- define "operator.enableDepot" -}} +{{- if and .Values.imageBuilder.enabled (not .Values.imageBuilder.buildkit.enabled) (not .Values.imageBuilder.buildkitUri) -}}true{{- end -}} +{{- end -}} + {{/* Check if both imageBuilder and imageBuilder.buildkit are enabled */}} @@ -1141,22 +1165,251 @@ app: executor {{ include "global.podLabels" . }} {{ $labels := include "executor.labels" . | fromYaml -}} {{- $podLabels := .Values.executor.podLabels | default dict -}} -{{- mustMergeOverwrite $podLabels $labels | toYaml }} +{{- tpl (mustMergeOverwrite $podLabels $labels | toYaml) . }} +{{- end -}} + +{{/* +Webhook certificate helpers +*/}} + +{{/* +Get the webhook service name +*/}} +{{- define "flytepropellerwebhook.serviceName" -}} +union-pod-webhook +{{- end -}} + +{{/* +Get the webhook secret name +*/}} +{{- define "flytepropellerwebhook.secretName" -}} +union-pod-webhook +{{- end -}} + +{{/* +Get the webhook service DNS names for certificate generation +*/}} +{{- define "flytepropellerwebhook.certDnsNames" -}} +{{- $serviceName := include "flytepropellerwebhook.serviceName" . -}} +{{- $namespace := .Release.Namespace -}} +- {{ $serviceName }} +- {{ $serviceName }}.{{ $namespace }} +- {{ $serviceName }}.{{ $namespace }}.svc +- {{ $serviceName }}.{{ $namespace }}.svc.cluster.local +{{- end -}} + +{{/* +Check if cert-manager CRDs are available in the cluster. +Uses lookup to detect cert-manager, with fallback for template-only environments. +Returns "true" if cert-manager is available, empty string otherwise. 
+*/}} +{{- define "flytepropellerwebhook.certManagerAvailable" -}} +{{- if .Values.flytepropellerwebhook.certificate.certManager.issuerRef -}} +{{- /* User explicitly configured cert-manager issuer, assume it's available */ -}} +true +{{- else if .Capabilities.APIVersions.Has "cert-manager.io/v1" -}} +{{- /* cert-manager CRDs are registered */ -}} +true +{{- else -}} +{{- /* Try lookup as last resort - this works during helm install but not helm template */ -}} +{{- $crd := lookup "apiextensions.k8s.io/v1" "CustomResourceDefinition" "" "certificates.cert-manager.io" -}} +{{- if $crd -}} +true +{{- end -}} +{{- end -}} +{{- end -}} + +{{/* +Determine if we should use cert-manager based on configuration and availability. +For Flux/ArgoCD compatibility, when provider is "certManager", we trust the user's configuration. +*/}} +{{- define "flytepropellerwebhook.useCertManager" -}} +{{- if eq .Values.flytepropellerwebhook.certificate.provider "certManager" -}} +true +{{- end -}} +{{- end -}} + +{{/* +Generate self-signed CA and server certificates for the webhook. +This uses Helm's genCA and genSignedCert functions. +The certificates are cached in a lookup to ensure consistency across template renders. +*/}} +{{- define "flytepropellerwebhook.generateCerts" -}} +{{- $serviceName := include "flytepropellerwebhook.serviceName" . -}} +{{- $namespace := .Release.Namespace -}} +{{- $secretName := include "flytepropellerwebhook.secretName" . 
-}} +{{- /* Check if secret already exists to maintain certificate stability */ -}} +{{- $existingSecret := lookup "v1" "Secret" $namespace $secretName -}} +{{- if and $existingSecret $existingSecret.data (index $existingSecret.data "ca.crt") -}} +{{- /* Reuse existing certificates (new key names) */ -}} +caCert: {{ index $existingSecret.data "ca.crt" }} +serverCert: {{ index $existingSecret.data "tls.crt" }} +serverKey: {{ index $existingSecret.data "tls.key" }} +{{- else if and $existingSecret $existingSecret.data (index $existingSecret.data "ca-cert.pem") -}} +{{- /* Reuse existing certificates (old key names - for backward compatibility) */ -}} +caCert: {{ index $existingSecret.data "ca-cert.pem" }} +serverCert: {{ index $existingSecret.data "server-cert.pem" }} +serverKey: {{ index $existingSecret.data "server-key.pem" }} +{{- else -}} +{{- /* Generate new certificates */ -}} +{{- $dnsNames := list $serviceName (printf "%s.%s" $serviceName $namespace) (printf "%s.%s.svc" $serviceName $namespace) (printf "%s.%s.svc.cluster.local" $serviceName $namespace) -}} +{{- $ca := genCA (printf "%s.%s.svc" $serviceName $namespace) 3650 -}} +{{- $cert := genSignedCert (printf "%s.%s.svc" $serviceName $namespace) nil $dnsNames 365 $ca -}} +caCert: {{ $ca.Cert | b64enc }} +serverCert: {{ $cert.Cert | b64enc }} +serverKey: {{ $cert.Key | b64enc }} +{{- end -}} +{{- end -}} + +{{/* +Get the CA bundle for the webhook configuration. +This returns the base64-encoded CA certificate based on the certificate provider. +*/}} +{{- define "flytepropellerwebhook.caBundle" -}} +{{- if eq .Values.flytepropellerwebhook.certificate.provider "external" -}} +{{- .Values.flytepropellerwebhook.certificate.external.caCert -}} +{{- else if eq .Values.flytepropellerwebhook.certificate.provider "helm" -}} +{{- $certs := include "flytepropellerwebhook.generateCerts" . 
| fromYaml -}} +{{- $certs.caCert -}} +{{- else if eq .Values.flytepropellerwebhook.certificate.provider "certManager" -}} +{{- /* For cert-manager, caBundle is injected by cert-manager's cainjector */ -}} +{{- /* Return empty to signal that cainjector should handle it */ -}} +{{- end -}} +{{- end -}} +{{/* +Returns "true" when a common service account should be used for all components. +Enabled explicitly via commonServiceAccount.enabled or implicitly via singleNamespace mode. +*/}} +{{- define "useCommonServiceAccount" -}} +{{- if or .Values.commonServiceAccount.enabled (include "singleNamespace" .) -}}true{{- end -}} +{{- end -}} + +{{/* +Returns the common service account name. +*/}} +{{- define "common.serviceAccountName" -}} +{{- .Values.commonServiceAccount.name | default "union-system" -}} +{{- end -}} + +{{/* +Returns the executor service account name, using the common SA when enabled. +*/}} +{{- define "executor.serviceAccountName" -}} +{{- if include "useCommonServiceAccount" . -}} +{{- include "common.serviceAccountName" . -}} +{{- else -}} +executor +{{- end -}} +{{- end -}} + +{{/* +Returns the webhook service account name, using the common SA when enabled. +*/}} +{{- define "webhook.serviceAccountName" -}} +{{- if include "useCommonServiceAccount" . -}} +{{- include "common.serviceAccountName" . -}} +{{- else -}} +union-webhook-system +{{- end -}} +{{- end -}} + +{{/* +Returns the fluentbit service account name, using the common SA when enabled. +*/}} +{{- define "fluentbit.serviceAccountName" -}} +{{- if include "useCommonServiceAccount" . -}} +{{- include "common.serviceAccountName" . -}} +{{- else -}} +{{- .Values.fluentbit.serviceAccount.name | default "fluentbit-system" -}} +{{- end -}} +{{- end -}} + +{{/* +Returns the buildkit service account name, using the common SA when enabled. +*/}} +{{- define "buildkit.serviceAccountName" -}} +{{- if include "useCommonServiceAccount" . -}} +{{- include "common.serviceAccountName" . 
-}} +{{- else -}} +{{- .Values.imageBuilder.buildkit.serviceAccount.name | default "union-imagebuilder" -}} +{{- end -}} +{{- end -}} + +{{- define "buildkit.serviceAccount.annotations" -}} +{{- include "global.serviceAccountAnnotations" . }} +{{- with .Values.imageBuilder.buildkit.serviceAccount.annotations }} +{{ toYaml . }} +{{- end }} +{{- end -}} + +{{/* +Returns the default container image repository URL. +If imageBuilder.defaultRepository is set, use it as-is. +Otherwise, auto-generate from the cloud provider, region, project, and registryName. +Checks both storage.provider and the top-level provider field (Azure uses storage.provider=custom). +*/}} +{{- define "imagebuilder.defaultRepository" -}} +{{- if .Values.imageBuilder.defaultRepository -}} + {{- tpl .Values.imageBuilder.defaultRepository . -}} +{{- else if eq (tpl .Values.storage.provider .) "aws" -}} + {{- $region := tpl .Values.storage.region . -}} + {{- $accountId := .Values.global.AWS_ACCOUNT_ID -}} + {{- $registryName := .Values.imageBuilder.registryName -}} + {{- printf "%s.dkr.ecr.%s.amazonaws.com/%s" $accountId $region $registryName -}} +{{- else if or (eq (tpl .Values.storage.provider .) "gcp") (eq (tpl .Values.storage.provider .) "gcs") (eq (.Values.provider | default "") "gcp") -}} + {{- $region := tpl .Values.storage.region . -}} + {{- $projectId := tpl .Values.storage.gcp.projectId . -}} + {{- $registryName := .Values.imageBuilder.registryName -}} + {{- printf "%s-docker.pkg.dev/%s/%s" $region $projectId $registryName -}} +{{- else if or (eq (tpl .Values.storage.provider .) "azure") (eq (.Values.provider | default "") "azure") -}} + {{- $registryName := .Values.imageBuilder.registryName -}} + {{- printf "%s.azurecr.io" $registryName -}} +{{- else -}} + {{- .Values.imageBuilder.registryName -}} +{{- end -}} +{{- end -}} + +{{/* +Returns the image builder authentication type. +If imageBuilder.authenticationType is explicitly set (non-empty, not "noop"), use it. 
+Otherwise, auto-detect from the cloud provider. +*/}} +{{- define "imagebuilder.authenticationType" -}} +{{- if and .Values.imageBuilder.authenticationType (ne .Values.imageBuilder.authenticationType "noop") -}} + {{- .Values.imageBuilder.authenticationType -}} +{{- else if eq (tpl .Values.storage.provider .) "aws" -}} + {{- "aws" -}} +{{- else if or (eq (tpl .Values.storage.provider .) "gcp") (eq (tpl .Values.storage.provider .) "gcs") (eq (.Values.provider | default "") "gcp") -}} + {{- "google" -}} +{{- else if or (eq (tpl .Values.storage.provider .) "azure") (eq (.Values.provider | default "") "azure") -}} + {{- "azure" -}} +{{- else -}} + {{- .Values.imageBuilder.authenticationType | default "noop" -}} +{{- end -}} +{{- end -}} + +{{/* +Returns "true" when namespaces.enabled is false, indicating single-namespace mode. +In this mode, templates auto-inject namespace-scoping config (limitNamespace, limit-namespace, +namespace_mapping) so users only need to set namespaces.enabled: false. 
+*/}} +{{- define "singleNamespace" -}} +{{- if or (not .Values.namespaces.enabled) .Values.low_privilege -}}true{{- end -}} {{- end -}} {{- define "operator.dependenciesHeartbeat" -}} -{{- if .Values.flytepropeller.enabled }} -{{- tpl (toYaml .Values.config.operator.dependenciesHeartbeat) $ | nindent 8 }} -{{- else }} {{- $heartbeat := dict }} {{- range $key, $value := .Values.config.operator.dependenciesHeartbeat }} -{{- if ne $key "propeller" }} +{{- if and (eq $key "propeller") (not $.Values.flytepropeller.enabled) }} +{{- else if and (eq $key "executor") (not $.Values.executor.enabled) }} +{{- else if and (eq $key "prometheus") $.Values.low_privilege }} +{{- else }} {{- $_ := set $heartbeat $key $value }} {{- end }} {{- end }} {{- tpl (toYaml $heartbeat) $ | nindent 8 }} -{{- end }} {{- end -}} + {{- define "flyte-pod-webhook.name" -}} union-pod-webhook {{- end -}} @@ -1167,9 +1420,12 @@ union-pod-webhook */}} {{- define "propeller.webhookConfigMinimal" -}} {{- $webhook := deepCopy .Values.config.core.webhook }} -{{- $_ := set $webhook "serviceName" (include "flyte-pod-webhook.name" .) }} -{{- $_ := set $webhook "secretName" (include "flyte-pod-webhook.name" .) }} +{{- $_ := set $webhook "serviceName" (include "flytepropellerwebhook.serviceName" .) }} +{{- $_ := set $webhook "secretName" (include "flytepropellerwebhook.secretName" .) }} {{- $_ := set $webhook "localCert" true }} +{{- if .Values.low_privilege }} +{{- $_ := set $webhook "disableCreateMutatingWebhookConfig" true }} +{{- end }} webhook: {{- tpl (toYaml $webhook) . 
| nindent 2 }} {{- end -}} diff --git a/charts/dataplane/templates/clusterresourcesync/configmap.yaml b/charts/dataplane/templates/clusterresourcesync/configmap.yaml index a9b2484c..335b7a3a 100644 --- a/charts/dataplane/templates/clusterresourcesync/configmap.yaml +++ b/charts/dataplane/templates/clusterresourcesync/configmap.yaml @@ -1,4 +1,4 @@ -{{- if .Values.clusterresourcesync.enabled }} +{{- if and .Values.clusterresourcesync.enabled (not (include "singleNamespace" .)) }} apiVersion: v1 kind: ConfigMap metadata: diff --git a/charts/dataplane/templates/clusterresourcesync/deployment.yaml b/charts/dataplane/templates/clusterresourcesync/deployment.yaml index 48d8f09d..4d28ccb5 100644 --- a/charts/dataplane/templates/clusterresourcesync/deployment.yaml +++ b/charts/dataplane/templates/clusterresourcesync/deployment.yaml @@ -1,4 +1,4 @@ -{{- if .Values.clusterresourcesync.enabled }} +{{- if and .Values.clusterresourcesync.enabled (not (include "singleNamespace" .)) }} apiVersion: apps/v1 kind: Deployment metadata: diff --git a/charts/dataplane/templates/clusterresourcesync/serviceaccount.yaml b/charts/dataplane/templates/clusterresourcesync/serviceaccount.yaml index 6e3af9e1..45303dd7 100644 --- a/charts/dataplane/templates/clusterresourcesync/serviceaccount.yaml +++ b/charts/dataplane/templates/clusterresourcesync/serviceaccount.yaml @@ -1,4 +1,4 @@ -{{- if .Values.clusterresourcesync.enabled -}} +{{- if and .Values.clusterresourcesync.enabled (not (include "singleNamespace" .)) -}} apiVersion: v1 kind: ServiceAccount metadata: diff --git a/charts/dataplane/templates/common/default-serviceaccount.yaml b/charts/dataplane/templates/common/default-serviceaccount.yaml deleted file mode 100644 index 73b14c94..00000000 --- a/charts/dataplane/templates/common/default-serviceaccount.yaml +++ /dev/null @@ -1,10 +0,0 @@ -{{- if and (not .Values.clusterresourcesync.enabled) .Values.low_privilege }} -apiVersion: v1 -kind: ServiceAccount -metadata: - name: default - 
namespace: {{ .Release.Namespace }} - annotations: - {{.Values.userRoleAnnotationKey}}: {{.Values.userRoleAnnotationValue}} -automountServiceAccountToken: true -{{- end }} \ No newline at end of file diff --git a/charts/dataplane/templates/common/depot-token-secret.yaml b/charts/dataplane/templates/common/depot-token-secret.yaml new file mode 100644 index 00000000..ec6530d3 --- /dev/null +++ b/charts/dataplane/templates/common/depot-token-secret.yaml @@ -0,0 +1,10 @@ +{{- if include "operator.enableDepot" . }} +apiVersion: v1 +kind: Secret +metadata: + name: depot-token + namespace: {{ .Release.Namespace }} +type: kubernetes.io/dockerconfigjson +data: + .dockerconfigjson: {{ printf `{"auths":{}}` | b64enc }} +{{- end }} diff --git a/charts/dataplane/templates/common/namespaces.yaml b/charts/dataplane/templates/common/namespaces.yaml index da7c3cec..d16fdfcb 100644 --- a/charts/dataplane/templates/common/namespaces.yaml +++ b/charts/dataplane/templates/common/namespaces.yaml @@ -1,4 +1,4 @@ -{{- if .Values.namespaces.enabled }} +{{- if and .Values.namespaces.enabled (not .Values.low_privilege) }} --- apiVersion: v1 kind: Namespace diff --git a/charts/dataplane/templates/common/system-serviceaccount.yaml b/charts/dataplane/templates/common/system-serviceaccount.yaml new file mode 100644 index 00000000..0812ef03 --- /dev/null +++ b/charts/dataplane/templates/common/system-serviceaccount.yaml @@ -0,0 +1,15 @@ +{{- if include "useCommonServiceAccount" . }} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "common.serviceAccountName" . }} + namespace: {{ .Release.Namespace }} + annotations: + {{- include "global.serviceAccountAnnotations" . | nindent 4 }} + {{- with .Values.commonServiceAccount.annotations }} + {{- tpl (toYaml .) $ | nindent 4 }} + {{- end }} +{{- with .Values.commonServiceAccount.imagePullSecrets }} +imagePullSecrets: {{ tpl (toYaml .) 
$ | nindent 2 }} +{{- end }} +{{- end }} diff --git a/charts/dataplane/templates/common/task-podtemplate.yaml b/charts/dataplane/templates/common/task-podtemplate.yaml new file mode 100644 index 00000000..ac778b06 --- /dev/null +++ b/charts/dataplane/templates/common/task-podtemplate.yaml @@ -0,0 +1,19 @@ +{{- if or (include "singleNamespace" .) (include "operator.enableDepot" .) }} +apiVersion: v1 +kind: PodTemplate +metadata: + name: task-template + namespace: {{ .Release.Namespace }} +template: + spec: +{{- if include "singleNamespace" . }} + serviceAccountName: union +{{- end }} +{{- if include "operator.enableDepot" . }} + imagePullSecrets: + - name: depot-token +{{- end }} + containers: + - name: default + image: docker.io/rwgrim/docker-noop +{{- end }} diff --git a/charts/dataplane/templates/common/union-serviceaccount.yaml b/charts/dataplane/templates/common/union-serviceaccount.yaml new file mode 100644 index 00000000..d7d74234 --- /dev/null +++ b/charts/dataplane/templates/common/union-serviceaccount.yaml @@ -0,0 +1,10 @@ +{{- if include "singleNamespace" . }} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: union + namespace: {{ .Release.Namespace }} + annotations: + {{ tpl .Values.userRoleAnnotationKey . }}: {{ tpl .Values.userRoleAnnotationValue . 
}} +automountServiceAccountToken: true +{{- end }} \ No newline at end of file diff --git a/charts/dataplane/templates/common/validation.yaml b/charts/dataplane/templates/common/validation.yaml new file mode 100644 index 00000000..2f3e5aa9 --- /dev/null +++ b/charts/dataplane/templates/common/validation.yaml @@ -0,0 +1 @@ +{{/* Validation checks - rendered as empty output */}} diff --git a/charts/dataplane/templates/fluent-bit/serviceaccount.yaml b/charts/dataplane/templates/fluent-bit/serviceaccount.yaml new file mode 100644 index 00000000..a15f6664 --- /dev/null +++ b/charts/dataplane/templates/fluent-bit/serviceaccount.yaml @@ -0,0 +1,16 @@ +{{- if .Values.fluentbit.enabled }} +{{- if not (include "useCommonServiceAccount" .) }} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "fluentbit.serviceAccountName" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "operator.labels" . | nindent 4 }} + annotations: + {{- include "global.serviceAccountAnnotations" . | nindent 4 }} + {{- with .Values.fluentbit.serviceAccount.annotations }} + {{- tpl (toYaml .) 
$ | nindent 4 }} + {{- end }} +{{- end }} +{{- end }} diff --git a/charts/dataplane/templates/flyteconnector/deployment.yaml b/charts/dataplane/templates/flyteconnector/deployment.yaml index bf5e82e0..c9a90bf5 100644 --- a/charts/dataplane/templates/flyteconnector/deployment.yaml +++ b/charts/dataplane/templates/flyteconnector/deployment.yaml @@ -24,9 +24,7 @@ spec: {{- end }} containers: - command: - - flyte - - serve - - connector + - c0 image: "{{ .Values.flyteconnector.image.repository }}:{{ .Values.flyteconnector.image.tag }}" imagePullPolicy: "{{ .Values.flyteconnector.image.pullPolicy }}" name: flyteconnector diff --git a/charts/dataplane/templates/imagebuilder/build-image-configmap.yaml b/charts/dataplane/templates/imagebuilder/build-image-configmap.yaml new file mode 100644 index 00000000..5665a86c --- /dev/null +++ b/charts/dataplane/templates/imagebuilder/build-image-configmap.yaml @@ -0,0 +1,17 @@ +{{- if and .Values.imageBuilder.enabled (include "singleNamespace" .) }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ .Values.imageBuilder.targetConfigMapName }} + labels: + {{- include "operator.labels" . | nindent 4 }} +data: + storage.yaml: | {{ tpl (include "storage" .) $ | nindent 4 }} +{{- if or .Values.imageBuilder.buildkitUri .Values.imageBuilder.buildkit.enabled }} + buildkit-uri: {{ (include "imagebuilder.buildkit.uri" .) | quote }} +{{- else }} + buildkit-uri: "" +{{- end }} + default-repository: {{ include "imagebuilder.defaultRepository" . | quote }} + authentication-type: {{ include "imagebuilder.authenticationType" . | quote }} +{{- end }} diff --git a/charts/dataplane/templates/imagebuilder/deployment.yaml b/charts/dataplane/templates/imagebuilder/deployment.yaml index e6334e33..edb46678 100644 --- a/charts/dataplane/templates/imagebuilder/deployment.yaml +++ b/charts/dataplane/templates/imagebuilder/deployment.yaml @@ -29,7 +29,7 @@ spec: platform.union.ai/zone: "dataplane" {{- include "imagebuilder.buildkit.selectorLabels" . 
| nindent 8 }} spec: - serviceAccountName: {{ .Values.imageBuilder.buildkit.serviceAccount.name | default "union-imagebuilder" | quote }} + serviceAccountName: {{ include "buildkit.serviceAccountName" . | quote }} containers: - name: "buildkit" image: {{ include "imagebuilder.buildkit.image" . | quote }} diff --git a/charts/dataplane/templates/imagebuilder/serviceaccount.yaml b/charts/dataplane/templates/imagebuilder/serviceaccount.yaml index ce41b55f..6a9a4b84 100644 --- a/charts/dataplane/templates/imagebuilder/serviceaccount.yaml +++ b/charts/dataplane/templates/imagebuilder/serviceaccount.yaml @@ -1,12 +1,10 @@ -{{- if and (include "imagebuilder.buildkit.enabled" .) (index .Values.imageBuilder.buildkit.serviceAccount "create" | default true) }} +{{- if and (include "imagebuilder.buildkit.enabled" .) (not (include "useCommonServiceAccount" .)) (index .Values.imageBuilder.buildkit.serviceAccount "create" | default true) }} apiVersion: v1 kind: ServiceAccount metadata: - name: {{ .Values.imageBuilder.buildkit.serviceAccount.name }} - {{- with .Values.imageBuilder.buildkit.serviceAccount.annotations }} + name: {{ include "buildkit.serviceAccountName" . }} annotations: - {{- toYaml . | nindent 4 }} - {{- end }} + {{- include "buildkit.serviceAccount.annotations" . 
| nindent 4 }} {{ if .Values.imageBuilder.buildkit.serviceAccount.imagePullSecret }} imagePullSecrets: - name: {{ .Values.imageBuilder.buildkit.serviceAccount.imagePullSecret }} diff --git a/charts/dataplane/templates/monitoring/prometheus-rbac.yaml b/charts/dataplane/templates/monitoring/prometheus-rbac.yaml new file mode 100644 index 00000000..c9c3261e --- /dev/null +++ b/charts/dataplane/templates/monitoring/prometheus-rbac.yaml @@ -0,0 +1,158 @@ +{{- if and .Values.prometheus.enabled .Values.low_privilege -}} +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: union-operator-prometheus-rbac + namespace: {{ .Release.Namespace }} + labels: + release: {{ .Release.Name }} +rules: + # Prometheus server scrape permissions + - apiGroups: [""] + resources: + - nodes + - nodes/proxy + - nodes/metrics + - services + - endpoints + - pods + - ingresses + - configmaps + verbs: + - get + - list + - watch + - apiGroups: ["extensions", "networking.k8s.io"] + resources: + - ingresses/status + - ingresses + verbs: + - get + - list + - watch + - apiGroups: ["discovery.k8s.io"] + resources: + - endpointslices + verbs: + - get + - list + - watch + # kube-state-metrics permissions (rbac.create=false on the subchart; managed here instead) + - apiGroups: ["certificates.k8s.io"] + resources: ["certificatesigningrequests"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["configmaps"] + verbs: ["list", "watch"] + - apiGroups: ["batch"] + resources: ["cronjobs"] + verbs: ["list", "watch"] + - apiGroups: ["extensions", "apps"] + resources: ["daemonsets"] + verbs: ["list", "watch"] + - apiGroups: ["extensions", "apps"] + resources: ["deployments"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["endpoints"] + verbs: ["list", "watch"] + - apiGroups: ["discovery.k8s.io"] + resources: ["endpointslices"] + verbs: ["list", "watch"] + - apiGroups: ["autoscaling"] + resources: ["horizontalpodautoscalers"] + verbs: ["list", "watch"] + - apiGroups: 
["extensions", "networking.k8s.io"] + resources: ["ingresses"] + verbs: ["list", "watch"] + - apiGroups: ["batch"] + resources: ["jobs"] + verbs: ["list", "watch"] + - apiGroups: ["coordination.k8s.io"] + resources: ["leases"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["limitranges"] + verbs: ["list", "watch"] + - apiGroups: ["admissionregistration.k8s.io"] + resources: ["mutatingwebhookconfigurations"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["namespaces"] + verbs: ["list", "watch"] + - apiGroups: ["networking.k8s.io"] + resources: ["networkpolicies"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["persistentvolumeclaims"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["persistentvolumes"] + verbs: ["list", "watch"] + - apiGroups: ["policy"] + resources: ["poddisruptionbudgets"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["pods"] + verbs: ["list", "watch"] + - apiGroups: ["extensions", "apps"] + resources: ["replicasets"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["replicationcontrollers"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["resourcequotas"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["secrets"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["services"] + verbs: ["list", "watch"] + - apiGroups: ["apps"] + resources: ["statefulsets"] + verbs: ["list", "watch"] + - apiGroups: ["storage.k8s.io"] + resources: ["storageclasses"] + verbs: ["list", "watch"] + - apiGroups: ["admissionregistration.k8s.io"] + resources: ["validatingwebhookconfigurations"] + verbs: ["list", "watch"] + - apiGroups: ["storage.k8s.io"] + resources: ["volumeattachments"] + verbs: ["list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: union-operator-prometheus-rbac + namespace: {{ .Release.Namespace }} + labels: + release: {{ .Release.Name }} +subjects: + - kind: ServiceAccount + name: 
union-operator-prometheus + namespace: {{ .Release.Namespace }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: union-operator-prometheus-rbac +--- +# kube-state-metrics service account RoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: {{ .Release.Name }}-prometheus-kube-state-metrics + namespace: {{ .Release.Namespace }} + labels: + release: {{ .Release.Name }} +subjects: + - kind: ServiceAccount + name: {{ .Release.Name }}-prometheus-kube-state-metrics + namespace: {{ .Release.Namespace }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: union-operator-prometheus-rbac +{{- end }} diff --git a/charts/dataplane/templates/monitoring/prometheusrule.yaml b/charts/dataplane/templates/monitoring/prometheusrule.yaml index 3740f4d3..216052c0 100644 --- a/charts/dataplane/templates/monitoring/prometheusrule.yaml +++ b/charts/dataplane/templates/monitoring/prometheusrule.yaml @@ -1,4 +1,5 @@ -{{- if and .Values.monitoring.prometheusRules.enabled (.Capabilities.APIVersions.Has "monitoring.coreos.com/v1") }} +{{- if and .Values.monitoring.prometheusRules.enabled (.Capabilities.APIVersions.Has "monitoring.coreos.com/v1") + .Values.cost.enabled (not .Values.low_privilege) .Values.monitoring.enabled }} apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: @@ -6,6 +7,35 @@ metadata: namespace: {{ .Release.Namespace }} labels: release: {{ .Release.Name }} +spec: +{{ .Files.Get "files/recording-rules.yml" | indent 2 }} +{{- end }} +--- +{{- if .Values.prometheus.enabled }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: union-recording-rules + namespace: {{ .Release.Namespace }} + labels: + release: {{ .Release.Name }} +data: + recording_rules.yml: | +{{- if and .Values.cost.enabled (not .Values.low_privilege) }} +{{ .Files.Get "files/recording-rules.yml" | indent 4 }} +{{- else }} + groups: [] +{{- end }} +{{- end }} +--- +{{- if and .Values.monitoring.prometheusRules.enabled 
(.Capabilities.APIVersions.Has "monitoring.coreos.com/v1") }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ include "unionai-dataplane.fullname" . }}-monitoring-rules + namespace: {{ .Release.Namespace }} + labels: + release: {{ .Release.Name }} spec: groups: # --- Recording rules (always active) --- diff --git a/charts/dataplane/templates/monitoring/servicemonitor.yaml b/charts/dataplane/templates/monitoring/servicemonitor.yaml index 6022c18e..0fa4ac87 100644 --- a/charts/dataplane/templates/monitoring/servicemonitor.yaml +++ b/charts/dataplane/templates/monitoring/servicemonitor.yaml @@ -1,8 +1,37 @@ -{{- if and .Values.monitoring.serviceMonitors.enabled (.Capabilities.APIVersions.Has "monitoring.coreos.com/v1") }} +{{- if and .Values.cost.enabled (not .Values.low_privilege) .Values.monitoring.enabled -}} apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: - name: {{ include "unionai-dataplane.fullname" . }}-services + name: {{ .Values.cost.serviceMonitor.name }} + namespace: {{ .Release.Namespace }} + labels: + release: {{ .Release.Name }} +spec: + selector: + {{- with .Values.cost.serviceMonitor.matchLabels }} + matchLabels: + {{- toYaml . 
| nindent 6 }} + {{- end }} + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" + endpoints: + - port: http + interval: 1m + path: /metrics + honorLabels: true + metricRelabelings: + - sourceLabels: [ "__name__" ] + separator: ";" + regex: "kube_node_labels|kube_pod_labels|node_total_hourly_cost|node_ram_hourly_cost|node_cpu_hourly_cost|node_gpu_hourly_cost" + action: keep +{{- end }} +--- +{{- if and .Values.monitoring.enabled (not .Values.low_privilege) }} +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: union-service-monitor namespace: {{ .Release.Namespace }} labels: release: {{ .Release.Name }} diff --git a/charts/dataplane/templates/nodeexecutor/configmap.yaml b/charts/dataplane/templates/nodeexecutor/configmap.yaml index 3c980fb8..46c460b2 100644 --- a/charts/dataplane/templates/nodeexecutor/configmap.yaml +++ b/charts/dataplane/templates/nodeexecutor/configmap.yaml @@ -25,7 +25,11 @@ data: {{- with .Values.config.task_resource_defaults -}} {{ toYaml . | nindent 6 }} {{- end }} - {{- with .Values.executor.raw_config }} + {{- $rawConfig := deepCopy (.Values.executor.raw_config | default dict) }} + {{- if and (include "singleNamespace" $) (not (index $rawConfig "namespace_mapping")) }} + {{- $_ := set $rawConfig "namespace_mapping" (dict "template" $.Release.Namespace) }} + {{- end }} + {{- with $rawConfig }} {{- tpl (toYaml .) $ | nindent 4 }} {{- end }} {{- if not (dig "namespace_mapping" false .Values.executor.raw_config) }} @@ -59,14 +63,21 @@ data: logger: {{- tpl (omit . "pythonLevel" | toYaml) $ | nindent 6 }} {{- end }} - {{- with .Values.config.namespace_config -}} - {{- tpl (toYaml .) 
$ | nindent 4 }} + {{- if .Values.config.namespace_config -}} + {{- tpl (toYaml .Values.config.namespace_config) $ | nindent 4 }} + {{- else if and (include "singleNamespace" $) (not (index $rawConfig "namespace_mapping")) }} + namespace_mapping: + template: {{ .Release.Namespace }} {{- end }} {{- with .Values.executor.sharedService }} sharedService: {{- tpl (toYaml .) $ | nindent 6 }} {{- end }} - {{- with .Values.executor.propeller }} + {{- $execPropeller := deepCopy (.Values.executor.propeller | default dict) }} + {{- if and (include "singleNamespace" $) (not (index $execPropeller "limit-namespace")) }} + {{- $_ := set $execPropeller "limit-namespace" $.Release.Namespace }} + {{- end }} + {{- with $execPropeller }} propeller: {{- tpl (toYaml .) $ | nindent 6 }} {{- end }} diff --git a/charts/dataplane/templates/nodeexecutor/deployment.yaml b/charts/dataplane/templates/nodeexecutor/deployment.yaml index ee547928..8140d0ae 100644 --- a/charts/dataplane/templates/nodeexecutor/deployment.yaml +++ b/charts/dataplane/templates/nodeexecutor/deployment.yaml @@ -25,7 +25,7 @@ spec: spec: securityContext: fsGroup: 1337 - serviceAccountName: executor + serviceAccountName: {{ include "executor.serviceAccountName" . }} volumes: - name: config-volume configMap: diff --git a/charts/dataplane/templates/nodeexecutor/service.yaml b/charts/dataplane/templates/nodeexecutor/service.yaml index 5b00d6c5..6e9e4cab 100644 --- a/charts/dataplane/templates/nodeexecutor/service.yaml +++ b/charts/dataplane/templates/nodeexecutor/service.yaml @@ -2,7 +2,7 @@ apiVersion: v1 kind: Service metadata: - name: {{ include "unionai-dataplane.fullname" . }}-executor + name: {{ include "union-operator.fullname" . }}-executor labels: platform.union.ai/prometheus-group: "union-services" {{- include "executor.labels" . 
| nindent 4 }} diff --git a/charts/dataplane/templates/nodeexecutor/serviceaccount.yaml b/charts/dataplane/templates/nodeexecutor/serviceaccount.yaml index 73f5d6c9..82bcaf29 100644 --- a/charts/dataplane/templates/nodeexecutor/serviceaccount.yaml +++ b/charts/dataplane/templates/nodeexecutor/serviceaccount.yaml @@ -1,8 +1,9 @@ {{- if .Values.executor.enabled }} +{{- if not (include "useCommonServiceAccount" .) }} apiVersion: v1 kind: ServiceAccount metadata: - name: executor + name: {{ include "executor.serviceAccountName" . }} namespace: {{ .Release.Namespace }} labels: {{- include "executor.labels" . | nindent 4 }} @@ -12,12 +13,17 @@ metadata: {{- end }} --- +{{- end }} {{- if $.Capabilities.APIVersions.Has "rbac.authorization.k8s.io/v1" }} apiVersion: rbac.authorization.k8s.io/v1 {{- else }} apiVersion: rbac.authorization.k8s.io/v1beta1 {{- end }} +{{- if .Values.low_privilege }} +kind: Role +{{- else }} kind: ClusterRole +{{- end }} metadata: name: {{ .Release.Namespace }}-executor labels: @@ -70,17 +76,25 @@ rules: --- apiVersion: rbac.authorization.k8s.io/v1 +{{- if .Values.low_privilege }} +kind: RoleBinding +{{- else }} kind: ClusterRoleBinding +{{- end }} metadata: name: {{ .Release.Namespace }}-executor labels: app: executor roleRef: apiGroup: rbac.authorization.k8s.io + {{- if .Values.low_privilege }} + kind: Role + {{- else }} kind: ClusterRole + {{- end }} name: {{ .Release.Namespace }}-executor subjects: - kind: ServiceAccount - name: executor + name: {{ include "executor.serviceAccountName" . 
}} namespace: {{ .Release.Namespace }} -{{- end }} \ No newline at end of file +{{- end }} diff --git a/charts/dataplane/templates/nodeexecutor/webhook.yaml b/charts/dataplane/templates/nodeexecutor/webhook.yaml deleted file mode 100644 index 2eb44169..00000000 --- a/charts/dataplane/templates/nodeexecutor/webhook.yaml +++ /dev/null @@ -1,216 +0,0 @@ -{{ if .Values.executor.enabled }} -{{- if not .Values.flytepropeller.enabled }} # prevent duplicate from propeller -{{ if .Values.flytepropellerwebhook.enabled }} -# Create an empty secret that the first propeller pod will populate -apiVersion: v1 -kind: Secret -metadata: - name: {{ template "flyte-pod-webhook.name" . }} #prevent name collision with flyte oss - namespace: {{ .Release.Namespace }} -type: Opaque ---- -# Create the actual deployment -apiVersion: apps/v1 -kind: Deployment -metadata: - name: {{ template "flyte-pod-webhook.name" . }} - namespace: {{ .Release.Namespace }} - labels: - {{- include "flytepropellerwebhook.labels" . | nindent 4 }} -spec: - replicas: {{ .Values.flytepropellerwebhook.replicaCount }} - selector: - matchLabels: - {{- include "flytepropellerwebhook.selectorLabels" . | nindent 6 }} - template: - metadata: - labels: - {{- include "flytepropellerwebhook.podLabels" . | nindent 8 }} - annotations: - configChecksum: {{ include (print .Template.BasePath "/propeller/configmap.yaml") . | sha256sum | trunc 63 | quote }} - {{- with .Values.flytepropellerwebhook.podAnnotations }} - {{- toYaml . | nindent 8 }} - {{- end }} - {{- include "global.podAnnotations" . | nindent 8 }} - spec: - {{- with .Values.flytepropellerwebhook.securityContext }} - securityContext: - {{- tpl (toYaml .) $ | nindent 8 }} - {{- end }} - serviceAccountName: {{ template "flyte-pod-webhook.name" . 
}} - {{- if .Values.flytepropellerwebhook.priorityClassName }} - priorityClassName: {{ .Values.flytepropellerwebhook.priorityClassName }} - {{- end }} - initContainers: - - name: generate-secrets - image: "{{ .Values.image.union.repository }}:{{ .Values.image.union.tag | default .Chart.AppVersion }}" - imagePullPolicy: "{{ .Values.image.union.pullPolicy }}" - command: - - flytepropeller - args: - - webhook - - init-certs - - --config - - /etc/flyte/config/*.yaml - env: - {{- include "global.podEnvVars" . | nindent 10 }} - {{- with .Values.flytepropellerwebhook.podEnv -}} - {{- toYaml . | nindent 10 }} - {{- end }} - volumeMounts: - - name: config-volume - mountPath: /etc/flyte/config - - name: webhook-certs - mountPath: /etc/webhook/certs - resources: {{- toYaml .Values.flytepropellerwebhook.resources | nindent 12 }} - containers: - - name: webhook - image: "{{ .Values.image.union.repository }}:{{ .Values.image.union.tag | default .Chart.AppVersion }}" - imagePullPolicy: "{{ .Values.image.union.pullPolicy }}" - command: - - flytepropeller - args: - - webhook - - --config - - /etc/flyte/config/*.yaml - env: - {{- include "global.podEnvVars" . | nindent 10 }} - {{- if .Values.flytepropellerwebhook.podEnv -}} - {{- with .Values.flytepropellerwebhook.podEnv -}} - {{- toYaml . | nindent 10 }} - {{- end }} - {{- end }} - ports: - - containerPort: {{ .Values.flytepropellerwebhook.service.targetPort }} - - containerPort: {{ index .Values.config.core.propeller "prof-port" }} - {{- with .Values.flytepropellerwebhook.resources }} - resources: {{- toYaml . | nindent 12 }} - {{- end }} - volumeMounts: - - name: config-volume - mountPath: /etc/flyte/config - readOnly: true - - name: webhook-certs - mountPath: /etc/webhook/certs - readOnly: true - {{- with .Values.flytepropellerwebhook.additionalVolumeMounts -}} - {{ tpl (toYaml .) 
$ | nindent 12 }} - {{- end }} - volumes: - - name: config-volume - configMap: - name: flyte-propeller-webhook-config - - name: webhook-certs - emptyDir: {} - {{- with .Values.flytepropellerwebhook.additionalVolumes -}} - {{ tpl (toYaml .) $ | nindent 8 }} - {{- end }} - {{- include "flytepropellerwebhook.scheduling" . | nindent 6 }} - {{- include "additionalPodSpec" . | nindent 6 }} ---- - -apiVersion: v1 -kind: Service -metadata: - name: {{ template "flyte-pod-webhook.name" . }} - namespace: {{ .Release.Namespace }} - labels: - {{- include "flytepropellerwebhook.labels" . | nindent 4 }} - {{- with .Values.flytepropellerwebhook.service.annotations }} - annotations: - {{- tpl (toYaml .) $ | nindent 4 }} - {{- end }} -spec: - selector: - {{- include "flytepropellerwebhook.selectorLabels" . | nindent 4 }} - ports: - - name: https - protocol: TCP - port: {{ .Values.flytepropellerwebhook.service.port }} - targetPort: {{ .Values.flytepropellerwebhook.service.targetPort }} - - name: debug - protocol: TCP - port: 10254 - targetPort: 10254 - ---- -# Headless Service for cache invalidation — resolves to all pod IPs so that -# we can fan out invalidation requests to every webhook replica. -apiVersion: v1 -kind: Service -metadata: - name: {{ template "flyte-pod-webhook.name" . }}-headless - namespace: {{ .Release.Namespace }} - labels: - {{- include "flytepropellerwebhook.labels" . | nindent 4 }} -spec: - clusterIP: None - selector: - {{- include "flytepropellerwebhook.selectorLabels" . | nindent 4 }} - ports: - - name: cache-internal - protocol: TCP - port: {{ .Values.flytepropellerwebhook.service.targetPort }} - targetPort: {{ .Values.flytepropellerwebhook.service.targetPort }} ---- -{{- if .Values.low_privilege }} -kind: Role -{{- else }} -kind: ClusterRole -{{- end }} -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: {{ .Release.Namespace }}-{{- template "flyte-pod-webhook.name" . 
}} - namespace: {{ .Release.Namespace }} -rules: - - apiGroups: - - "*" - resources: - - mutatingwebhookconfigurations - - secrets - - pods - - replicasets/finalizers - verbs: - - get - - create - - update - - patch ---- -apiVersion: v1 -kind: ServiceAccount -metadata: - name: {{ template "flyte-pod-webhook.name" . }} - namespace: {{ .Release.Namespace }} - {{- with include "global.serviceAccountAnnotations" . }} - annotations: - {{- . | nindent 4 }} - {{- end }} -{{- with .Values.flytepropellerwebhook.serviceAccount.imagePullSecrets }} -imagePullSecrets: {{ tpl (toYaml .) $ | nindent 2 }} -{{- end }} ---- -# Create a binding from Role -> ServiceAccount -{{- if .Values.low_privilege }} -kind: RoleBinding -{{- else }} -kind: ClusterRoleBinding -{{- end }} -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: {{ .Release.Namespace }}-{{- template "flyte-pod-webhook.name" . }} - namespace: {{ .Release.Namespace }} -roleRef: - apiGroup: rbac.authorization.k8s.io - {{- if .Values.low_privilege }} - kind: Role - {{- else }} - kind: ClusterRole - {{- end }} - name: {{ .Release.Namespace }}-{{- template "flyte-pod-webhook.name" . }} -subjects: - - kind: ServiceAccount - name: {{ template "flyte-pod-webhook.name" . }} - namespace: {{ .Release.Namespace }} -{{- end }} -{{- end }} -{{- end }} \ No newline at end of file diff --git a/charts/dataplane/templates/operator/configmap.yaml b/charts/dataplane/templates/operator/configmap.yaml index a124af61..2b4032ce 100644 --- a/charts/dataplane/templates/operator/configmap.yaml +++ b/charts/dataplane/templates/operator/configmap.yaml @@ -27,14 +27,19 @@ data: operator: enabled: {{ .Values.config.operator.enabled }} enableTunnelService: {{ .Values.config.operator.enableTunnelService }} + # enableDepot: {{ include "operator.enableDepot" . 
| default "false" }} tunnel: enableDirectToAppIngress: {{ .Values.serving.enabled }} deploymentToRestart: union-operator-proxy - {{- with .Values.config.operator.limitNamespace }} - limitNamespace: {{ tpl (toYaml .) $ }} + {{- if .Values.config.operator.limitNamespace }} + limitNamespace: {{ tpl (toYaml .Values.config.operator.limitNamespace) $ }} + {{- else if include "singleNamespace" . }} + limitNamespace: {{ .Release.Namespace }} {{- end }} - {{- with .Values.config.operator.disableClusterPermissions }} - disableClusterPermissions: {{ tpl (toYaml .) $ }} + {{- if .Values.config.operator.disableClusterPermissions }} + disableClusterPermissions: {{ tpl (toYaml .Values.config.operator.disableClusterPermissions) $ }} + {{- else if .Values.low_privilege }} + disableClusterPermissions: true {{- end }} {{- with .Values.config.operator.apps }} apps: @@ -49,7 +54,11 @@ data: {{- tpl (toYaml .) $ | nindent 8 }} {{- end }} {{- tpl (include "operator.clusterData" .) $ | nindent 6 }} - {{- with .Values.config.operator.collectUsages }} + {{- $collectUsages := deepCopy (.Values.config.operator.collectUsages | default dict) }} + {{- if $.Values.low_privilege }} + {{- $_ := set $collectUsages "enabled" false }} + {{- end }} + {{- with $collectUsages }} collectUsages: {{- tpl (toYaml .) $ | nindent 8 }} {{- end }} @@ -76,6 +85,9 @@ data: {{- if and (not .Values.config.operator.org) .Values.namespace_mapping }} org: namespaceTemplate: {{ tpl (.Values.namespace_mapping.template | quote) $ }} + {{- else if include "singleNamespace" . }} + org: + namespaceTemplate: {{ .Release.Namespace }} {{- end }} {{- if .Values.imageBuilder.enabled }} imageBuilder: @@ -102,7 +114,11 @@ data: fast_registration_storage.yaml: | {{ tpl (include "fast-registration-storage" .) $ | nindent 4 }} {{- end }} {{- if .Values.imageBuilder.enabled }} +{{- if or .Values.imageBuilder.buildkitUri .Values.imageBuilder.buildkit.enabled }} image-builder.buildkit-uri: {{ (include "imagebuilder.buildkit.uri" .) 
| quote }} - image-builder.default-repository: {{ .Values.imageBuilder.defaultRepository | quote }} - image-builder.authentication-type: {{ .Values.imageBuilder.authenticationType | quote }} +{{- else }} + image-builder.buildkit-uri: "" +{{- end }} + image-builder.default-repository: {{ include "imagebuilder.defaultRepository" . | quote }} + image-builder.authentication-type: {{ include "imagebuilder.authenticationType" . | quote }} {{- end }} diff --git a/charts/dataplane/templates/operator/deployment-proxy.yaml b/charts/dataplane/templates/operator/deployment-proxy.yaml index 7e027ddc..7cfbdaa4 100644 --- a/charts/dataplane/templates/operator/deployment-proxy.yaml +++ b/charts/dataplane/templates/operator/deployment-proxy.yaml @@ -29,7 +29,7 @@ spec: sources: - configMap: name: {{ include "union-operator.fullname" . }} - {{- if .Values.clusterresourcesync.enabled }} + {{- if and .Values.clusterresourcesync.enabled (not (include "singleNamespace" .)) }} - configMap: name: union-clusterresourcesync-config {{- end }} diff --git a/charts/dataplane/templates/operator/serviceaccount-proxy.yaml b/charts/dataplane/templates/operator/serviceaccount-proxy.yaml index c04b8fef..8d09693a 100644 --- a/charts/dataplane/templates/operator/serviceaccount-proxy.yaml +++ b/charts/dataplane/templates/operator/serviceaccount-proxy.yaml @@ -1,8 +1,10 @@ {{- if .Values.proxy.serviceAccount.create -}} +{{- $roleName := default "proxy-system" .Values.proxy.serviceAccount.name -}} +{{- if not (include "useCommonServiceAccount" .) }} apiVersion: v1 kind: ServiceAccount metadata: - name: {{ include "proxy.serviceAccountName" . }} + name: {{ $roleName }} labels: {{- include "proxy.labels" . | nindent 4 }} {{- with include "proxy.serviceAccount.annotations" . }} @@ -10,6 +12,7 @@ metadata: {{- . 
| nindent 4 }} {{- end }} --- +{{- end }} apiVersion: rbac.authorization.k8s.io/v1 {{- if .Values.low_privilege }} kind: Role @@ -17,7 +20,7 @@ kind: Role kind: ClusterRole {{- end }} metadata: - name: {{ include "proxy.serviceAccountName" . }} + name: {{ $roleName }} labels: {{- include "proxy.labels" . | nindent 4 }} rules: @@ -42,7 +45,7 @@ kind: RoleBinding kind: ClusterRoleBinding {{- end }} metadata: - name: {{ include "proxy.serviceAccountName" . }} + name: {{ $roleName }} labels: {{- include "proxy.labels" . | nindent 4 }} roleRef: @@ -52,7 +55,7 @@ roleRef: {{- else }} kind: ClusterRole {{- end }} - name: {{ include "proxy.serviceAccountName" . }} + name: {{ $roleName }} subjects: - kind: ServiceAccount name: {{ include "proxy.serviceAccountName" . }} diff --git a/charts/dataplane/templates/operator/serviceaccount.yaml b/charts/dataplane/templates/operator/serviceaccount.yaml index a439b4ee..c14a0f15 100644 --- a/charts/dataplane/templates/operator/serviceaccount.yaml +++ b/charts/dataplane/templates/operator/serviceaccount.yaml @@ -1,8 +1,10 @@ {{- if .Values.operator.serviceAccount.create -}} +{{- $roleName := default "operator-system" .Values.operator.serviceAccount.name -}} +{{- if not (include "useCommonServiceAccount" .) }} apiVersion: v1 kind: ServiceAccount metadata: - name: {{ include "operator.serviceAccountName" . }} + name: {{ $roleName }} labels: {{- include "operator.labels" . | nindent 4 }} {{- with include "operator.serviceAccount.annotations" . }} @@ -10,10 +12,11 @@ metadata: {{- . | nindent 4 }} {{- end }} --- +{{- end }} apiVersion: rbac.authorization.k8s.io/v1 kind: Role metadata: - name: {{ include "operator.serviceAccountName" . }} + name: {{ $roleName }} labels: {{- include "operator.labels" . | nindent 4 }} rules: @@ -81,13 +84,13 @@ rules: apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding metadata: - name: {{ include "operator.serviceAccountName" . }} + name: {{ $roleName }} labels: {{- include "operator.labels" . 
| nindent 4 }} roleRef: apiGroup: rbac.authorization.k8s.io kind: Role - name: {{ include "operator.serviceAccountName" . }} + name: {{ $roleName }} subjects: - kind: ServiceAccount name: {{ include "operator.serviceAccountName" . }} @@ -98,7 +101,7 @@ subjects: apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: - name: {{ include "operator.serviceAccountName" . }} + name: {{ $roleName }} labels: {{- include "operator.labels" . | nindent 4 }} rules: @@ -171,13 +174,13 @@ rules: apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: - name: {{ include "operator.serviceAccountName" . }} + name: {{ $roleName }} labels: {{- include "operator.labels" . | nindent 4 }} roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole - name: {{ include "operator.serviceAccountName" . }} + name: {{ $roleName }} subjects: - kind: ServiceAccount name: {{ include "operator.serviceAccountName" . }} diff --git a/charts/dataplane/templates/prometheus/configmap.yaml b/charts/dataplane/templates/prometheus/configmap.yaml deleted file mode 100644 index b3691b82..00000000 --- a/charts/dataplane/templates/prometheus/configmap.yaml +++ /dev/null @@ -1,169 +0,0 @@ -apiVersion: v1 -kind: ConfigMap -metadata: - name: {{ include "union-operator.fullname" . }}-prometheus - namespace: {{ .Release.Namespace }} - labels: - {{- include "unionai-dataplane.labels" . 
| nindent 4 }} - app.kubernetes.io/component: prometheus -data: - prometheus.yml: | - global: - scrape_interval: 15s - evaluation_interval: 15s - alerting: - alertmanagers: - - static_configs: - - targets: - {{- if .Values.cost.enabled }} - rule_files: - - rules.yml - {{- end }} - scrape_configs: - # Self-monitoring - - job_name: prometheus - metrics_path: /prometheus/metrics - static_configs: - - targets: ['localhost:9090'] - metric_relabel_configs: - - source_labels: [__name__] - regex: prometheus_tsdb_head_chunks_storage_size_bytes|prometheus_tsdb_head_series|prometheus_tsdb_head_series_created_total|prometheus_tsdb_head_series_removed_total|prometheus_tsdb_head_samples_appended_total|prometheus_tsdb_storage_blocks_bytes|prometheus_tsdb_retention_limit_bytes|prometheus_tsdb_wal_(.*)|.+_(limit|failed|missed|failures)_total - action: keep - - # Kube state metrics for pod/node resource tracking and cost calculations - - job_name: kube-state-metrics - static_configs: - - targets: ['{{ .Release.Name }}-kube-state-metrics:8080'] - metric_relabel_configs: - - separator: ; - source_labels: [__name__] - regex: kube_pod_container_resource_(limits|requests)|kube_pod_status_phase|kube_node_(labels|status_allocatable|status_condition|status_capacity)|kube_namespace_labels|kube_pod_container_status_(waiting|terminated|last_terminated).*_reason|kube_daemonset_status_number_unavailable|kube_deployment_status_replicas_unavailable|kube_resourcequota|kube_pod_info|kube_node_info|kube_pod_container_status_restarts_total - action: keep - - separator: ; - source_labels: [__name__, phase] - regex: kube_pod_status_phase;(Succeeded|Failed) - action: drop - - source_labels: [node] - target_label: nodename - regex: '(.*)' - action: replace - - source_labels: [label_node_group_name] - action: replace - regex: (.+) - target_label: label_node_pool_name - - # cAdvisor container metrics for CPU and memory tracking - - job_name: kubernetes-cadvisor - metrics_path: /metrics - scheme: https - 
kubernetes_sd_configs: - - role: node - namespaces: - names: [] - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - insecure_skip_verify: false - metric_relabel_configs: - - separator: ; - source_labels: [__name__] - regex: container_cpu_usage_seconds_total|container_memory_working_set_bytes - action: keep - relabel_configs: - - separator: ; - regex: __meta_kubernetes_node_label_(.+) - replacement: $1 - action: labelmap - - separator: ; - regex: (.*) - target_label: __address__ - replacement: kubernetes.default.svc:443 - action: replace - - source_labels: [__meta_kubernetes_node_name] - separator: ; - regex: (.+) - target_label: __metrics_path__ - replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor - action: replace - - # Flyte propeller metrics for execution info and fast task duration - - job_name: flytepropeller - kubernetes_sd_configs: - - role: pod - namespaces: - names: - - {{ .Release.Namespace }} - selectors: - - role: pod - label: app.kubernetes.io/name=flytepropeller - metric_relabel_configs: - - source_labels: [__name__] - regex: "flyte:propeller:all:round:execution_info|flyte:propeller:all:node:fast_task:fast_task_execution_duration" - action: keep - relabel_configs: - - source_labels: [__meta_kubernetes_pod_container_name] - regex: flytepropeller - action: keep - - source_labels: [__meta_kubernetes_namespace] - action: replace - target_label: namespace - - source_labels: [__meta_kubernetes_pod_name] - action: replace - target_label: pod - - {{- if .Values.serving.enabled }} - # Kourier gateway (envoy) metrics for serving - - job_name: serving-envoy - kubernetes_sd_configs: - - role: pod - namespaces: - names: - - {{ .Release.Namespace }} - selectors: - - role: pod - label: app=3scale-kourier-gateway - metrics_path: /stats/prometheus - metric_relabel_configs: - - source_labels: [__name__] - regex: 
"envoy_cluster_upstream_rq_xx|envoy_cluster_upstream_rq_time_bucket" - action: keep - relabel_configs: - - source_labels: [__meta_kubernetes_pod_container_port_name] - regex: metrics - action: keep - - source_labels: [__meta_kubernetes_namespace] - action: replace - target_label: namespace - - source_labels: [__meta_kubernetes_pod_name] - action: replace - target_label: pod - {{- end }} - - {{- if .Values.opencost.enabled }} - # OpenCost metrics for cost tracking - - job_name: opencost - static_configs: - - targets: ['{{ .Release.Name }}-opencost:9003'] - metric_relabel_configs: - - source_labels: [__name__] - regex: "kube_node_labels|kube_pod_labels|node_total_hourly_cost|node_ram_hourly_cost|node_cpu_hourly_cost|node_gpu_hourly_cost" - action: keep - {{- end }} - - {{- if (index .Values "dcgm-exporter" "enabled" | default false) }} - # DCGM GPU metrics - - job_name: gpu-metrics - kubernetes_sd_configs: - - role: pod - namespaces: - names: - - kube-system - selectors: - - role: pod - label: app.kubernetes.io/name=dcgm-exporter - {{- end }} - - {{- if .Values.cost.enabled }} - rules.yml: | - {{ include "cost.recording-rules" . | nindent 4 }} - {{- end }} diff --git a/charts/dataplane/templates/prometheus/deployment.yaml b/charts/dataplane/templates/prometheus/deployment.yaml deleted file mode 100644 index ab7459e1..00000000 --- a/charts/dataplane/templates/prometheus/deployment.yaml +++ /dev/null @@ -1,69 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: {{ include "union-operator.fullname" . }}-prometheus - namespace: {{ .Release.Namespace }} - labels: - {{- include "unionai-dataplane.labels" . | nindent 4 }} - app.kubernetes.io/component: prometheus -spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/component: prometheus - {{- include "unionai-dataplane.selectorLabels" . | nindent 6 }} - template: - metadata: - annotations: - configChecksum: {{ include (print .Template.BasePath "/prometheus/configmap.yaml") . 
| sha256sum | trunc 63 | quote }} - labels: - app.kubernetes.io/component: prometheus - {{- include "unionai-dataplane.selectorLabels" . | nindent 8 }} - spec: - {{- with .Values.prometheus.priorityClassName }} - priorityClassName: {{ . }} - {{- end }} - serviceAccountName: {{ include "union-operator.fullname" . }}-prometheus - securityContext: - fsGroup: 65534 - runAsNonRoot: true - runAsUser: 65534 - {{- if ge (int (mustRegexFind "[0-9]+" .Capabilities.KubeVersion.Minor)) 23 }} - fsGroupChangePolicy: OnRootMismatch - {{- end }} - containers: - - name: prometheus - image: "{{ .Values.prometheus.image.repository }}:{{ .Values.prometheus.image.tag }}" - args: - - --config.file=/etc/prometheus/prometheus.yml - - --web.external-url={{ .Values.prometheus.routePrefix }} - - --web.route-prefix={{ .Values.prometheus.routePrefix }} - - --storage.tsdb.retention.time={{ .Values.prometheus.retention }} - ports: - - name: http - containerPort: 9090 - protocol: TCP - resources: - {{- toYaml .Values.prometheus.resources | nindent 12 }} - securityContext: - allowPrivilegeEscalation: false - terminationMessagePolicy: FallbackToLogsOnError - volumeMounts: - - mountPath: /etc/prometheus - name: prometheus-config - volumes: - - name: prometheus-config - configMap: - name: {{ include "union-operator.fullname" . }}-prometheus - {{- with .Values.prometheus.nodeSelector }} - nodeSelector: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.prometheus.affinity }} - affinity: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.prometheus.tolerations }} - tolerations: - {{- toYaml . 
| nindent 8 }} - {{- end }} diff --git a/charts/dataplane/templates/prometheus/rbac.yaml b/charts/dataplane/templates/prometheus/rbac.yaml deleted file mode 100644 index 168c5e5d..00000000 --- a/charts/dataplane/templates/prometheus/rbac.yaml +++ /dev/null @@ -1,42 +0,0 @@ -{{- if .Values.prometheus.serviceAccount.create }} -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: {{ include "union-operator.fullname" . }}-prometheus - labels: - {{- include "unionai-dataplane.labels" . | nindent 4 }} - app.kubernetes.io/component: prometheus -rules: - - apiGroups: [""] - resources: - - nodes - - nodes/proxy - - pods - - endpoints - - services - verbs: - - get - - list - - watch - - nonResourceURLs: - - /metrics - - /metrics/cadvisor - verbs: - - get ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: {{ include "union-operator.fullname" . }}-prometheus - labels: - {{- include "unionai-dataplane.labels" . | nindent 4 }} - app.kubernetes.io/component: prometheus -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: {{ include "union-operator.fullname" . }}-prometheus -subjects: - - kind: ServiceAccount - name: {{ include "union-operator.fullname" . }}-prometheus - namespace: {{ .Release.Namespace }} -{{- end }} diff --git a/charts/dataplane/templates/prometheus/service.yaml b/charts/dataplane/templates/prometheus/service.yaml deleted file mode 100644 index f15fd744..00000000 --- a/charts/dataplane/templates/prometheus/service.yaml +++ /dev/null @@ -1,18 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: {{ include "union-operator.fullname" . }}-prometheus - namespace: {{ .Release.Namespace }} - labels: - {{- include "unionai-dataplane.labels" . 
| nindent 4 }} - app.kubernetes.io/component: prometheus -spec: - type: ClusterIP - ports: - - port: 80 - targetPort: 9090 - protocol: TCP - name: http - selector: - app.kubernetes.io/component: prometheus - {{- include "unionai-dataplane.selectorLabels" . | nindent 4 }} diff --git a/charts/dataplane/templates/prometheus/serviceaccount.yaml b/charts/dataplane/templates/prometheus/serviceaccount.yaml deleted file mode 100644 index 28f803f4..00000000 --- a/charts/dataplane/templates/prometheus/serviceaccount.yaml +++ /dev/null @@ -1,14 +0,0 @@ -{{- if .Values.prometheus.serviceAccount.create }} -apiVersion: v1 -kind: ServiceAccount -metadata: - name: {{ include "union-operator.fullname" . }}-prometheus - namespace: {{ .Release.Namespace }} - labels: - {{- include "unionai-dataplane.labels" . | nindent 4 }} - app.kubernetes.io/component: prometheus - {{- with .Values.prometheus.serviceAccount.annotations }} - annotations: - {{- toYaml . | nindent 4 }} - {{- end }} -{{- end }} diff --git a/charts/dataplane/templates/propeller/configmap.yaml b/charts/dataplane/templates/propeller/configmap.yaml index 9a4262c7..fbc7d7bd 100644 --- a/charts/dataplane/templates/propeller/configmap.yaml +++ b/charts/dataplane/templates/propeller/configmap.yaml @@ -17,7 +17,18 @@ data: {{- with .Values.config.copilot }} copilot.yaml: | {{ tpl (toYaml .) $ | nindent 4 }} {{- end }} -{{- with .Values.config.core }} +{{- $core := deepCopy (.Values.config.core | default dict) }} +{{- if include "singleNamespace" $ }} +{{- $_ := set (index $core "propeller") "limit-namespace" .Release.Namespace }} +{{- end }} +{{- if .Values.low_privilege }} +{{- if not (hasKey $core "webhook") }} +{{- $_ := set $core "webhook" (dict "disableCreateMutatingWebhookConfig" true) }} +{{- else }} +{{- $_ := set (index $core "webhook") "disableCreateMutatingWebhookConfig" true }} +{{- end }} +{{- end }} +{{- with $core }} core.yaml: | {{ tpl (toYaml .) 
$ | nindent 4 }} {{- end }} {{- with .Values.config.enabled_plugins }} @@ -32,8 +43,12 @@ data: {{- with .Values.config.qubole }} qubole.yaml: | {{ tpl (toYaml .) $ | nindent 4 }} {{- end }} -{{- with .Values.config.namespace_config }} - namespace_config.yaml: | {{ tpl (toYaml .) $ | nindent 4 }} +{{- if .Values.config.namespace_config }} + namespace_config.yaml: | {{ tpl (toYaml .Values.config.namespace_config) $ | nindent 4 }} +{{- else if include "singleNamespace" . }} + namespace_config.yaml: | + namespace_mapping: + template: {{ .Release.Namespace }} {{- end }} {{- if and (not .Values.config.namespace_config) .Values.namespace_mapping }} namespace_config.yaml: | @@ -63,14 +78,4 @@ data: {{- end }} {{- end }} storage.yaml: | {{ tpl (include "storage" .) $ | nindent 4 }} -{{- end }} -{{- if and (not .Values.flytepropeller.enabled) .Values.flytepropellerwebhook.enabled }} -apiVersion: v1 -kind: ConfigMap -metadata: - name: flyte-propeller-webhook-config - namespace: {{ .Release.Namespace }} -data: - core.yaml: | -{{ include "propeller.webhookConfigMinimal" . | nindent 4 }} {{- end }} \ No newline at end of file diff --git a/charts/dataplane/templates/propeller/deployment.yaml b/charts/dataplane/templates/propeller/deployment.yaml index 8bf5f559..149f8bff 100644 --- a/charts/dataplane/templates/propeller/deployment.yaml +++ b/charts/dataplane/templates/propeller/deployment.yaml @@ -26,7 +26,7 @@ spec: {{- with .Values.flytepropeller.securityContext }} securityContext: {{ tpl (toYaml .) 
$ | nindent 8 }} {{- end }} - {{- if .Values.flytepropeller.priorityClassName }} + {{- if and .Values.flytepropeller.priorityClassName (not .Values.low_privilege) }} priorityClassName: {{ .Values.flytepropeller.priorityClassName }} {{- end }} containers: diff --git a/charts/dataplane/templates/serving/knative-serving.yaml b/charts/dataplane/templates/serving/knative-serving.yaml index a4f53b4d..c8f71496 100644 --- a/charts/dataplane/templates/serving/knative-serving.yaml +++ b/charts/dataplane/templates/serving/knative-serving.yaml @@ -24,6 +24,9 @@ spec: kubernetes.podspec-nodeselector: "enabled" kubernetes.podspec-tolerations: "enabled" kubernetes.podspec-fieldref: "enabled" + kubernetes.podspec-dnspolicy: "enabled" + kubernetes.podspec-schedulername: "enabled" + kubernetes.podspec-securitycontext: "enabled" {{- if .Values.serving.extraConfig.features }} {{- tpl (.Values.serving.extraConfig.features | toYaml) . | nindent 6 }} {{- end }} diff --git a/charts/dataplane/templates/webhook/certs.yaml b/charts/dataplane/templates/webhook/certs.yaml new file mode 100644 index 00000000..b660165d --- /dev/null +++ b/charts/dataplane/templates/webhook/certs.yaml @@ -0,0 +1,59 @@ +{{- /* + cert-manager managed certificates. + Creates a Certificate resource that will provision the secret automatically. + Optionally creates a self-signed Issuer if no issuerRef is provided. + + Note: For "helm" and "external" providers, the secret is created in + mutatingwebhookconfiguration.yaml to ensure cert consistency. +*/ -}} +{{- if eq .Values.flytepropellerwebhook.certificate.provider "certManager" }} +{{- $secretName := include "flytepropellerwebhook.secretName" . -}} +{{- $serviceName := include "flytepropellerwebhook.serviceName" . 
-}} +{{- $namespace := .Release.Namespace -}} +{{- $issuerRef := .Values.flytepropellerwebhook.certificate.certManager.issuerRef -}} +{{- if not $issuerRef }} +{{- /* Create a self-signed issuer if none provided */ -}} +--- +apiVersion: cert-manager.io/v1 +kind: Issuer +metadata: + name: {{ $secretName }}-selfsigned-issuer + namespace: {{ $namespace }} + labels: + {{- include "flytepropellerwebhook.labels" . | nindent 4 }} +spec: + selfSigned: {} +{{- end }} +--- +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: {{ $secretName }}-cert + namespace: {{ $namespace }} + labels: + {{- include "flytepropellerwebhook.labels" . | nindent 4 }} +spec: + secretName: {{ $secretName }} + duration: {{ .Values.flytepropellerwebhook.certificate.duration }} + renewBefore: {{ .Values.flytepropellerwebhook.certificate.renewBefore }} + {{- if $issuerRef }} + issuerRef: + {{- toYaml $issuerRef | nindent 4 }} + {{- else }} + issuerRef: + name: {{ $secretName }}-selfsigned-issuer + kind: Issuer + group: cert-manager.io + {{- end }} + commonName: {{ $serviceName }}.{{ $namespace }}.svc + dnsNames: + - {{ $serviceName }} + - {{ $serviceName }}.{{ $namespace }} + - {{ $serviceName }}.{{ $namespace }}.svc + - {{ $serviceName }}.{{ $namespace }}.svc.cluster.local + privateKey: + algorithm: ECDSA + size: 256 + usages: + - server auth +{{- end }} diff --git a/charts/dataplane/templates/webhook/configmap.yaml b/charts/dataplane/templates/webhook/configmap.yaml new file mode 100644 index 00000000..9cc077c0 --- /dev/null +++ b/charts/dataplane/templates/webhook/configmap.yaml @@ -0,0 +1,10 @@ +{{- if and (not .Values.flytepropeller.enabled) .Values.flytepropellerwebhook.enabled }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: union-pod-webhook-config + namespace: {{ .Release.Namespace }} +data: + core.yaml: | +{{ include "propeller.webhookConfigMinimal" . 
| nindent 4 }} +{{- end }} diff --git a/charts/dataplane/templates/propeller/deployment-webhook.yaml b/charts/dataplane/templates/webhook/deployment.yaml similarity index 78% rename from charts/dataplane/templates/propeller/deployment-webhook.yaml rename to charts/dataplane/templates/webhook/deployment.yaml index 20509e2f..246e2589 100644 --- a/charts/dataplane/templates/propeller/deployment-webhook.yaml +++ b/charts/dataplane/templates/webhook/deployment.yaml @@ -1,17 +1,10 @@ -{{ if .Values.flytepropeller.enabled }} -# Create an empty secret that the first propeller pod will populate -apiVersion: v1 -kind: Secret -metadata: - name: flyte-pod-webhook - namespace: {{ .Release.Namespace }} -type: Opaque ---- +{{- if .Values.flytepropellerwebhook.enabled }} +# Webhook deployment # Create the actual deployment apiVersion: apps/v1 kind: Deployment metadata: - name: flytepropeller-webhook + name: {{ template "flyte-pod-webhook.name" . }} namespace: {{ .Release.Namespace }} labels: {{- include "flytepropellerwebhook.labels" . | nindent 4 }} @@ -26,7 +19,11 @@ spec: platform.union.ai/zone: "dataplane" {{- include "flytepropellerwebhook.podLabels" . | nindent 8 }} annotations: + {{- if .Values.flytepropeller.enabled }} configChecksum: {{ include (print .Template.BasePath "/propeller/configmap.yaml") . | sha256sum | trunc 63 | quote }} + {{- else }} + configChecksum: {{ include (print .Template.BasePath "/webhook/configmap.yaml") . | sha256sum | trunc 63 | quote }} + {{- end }} {{- with .Values.flytepropellerwebhook.podAnnotations }} {{- toYaml . | nindent 8 }} {{- end }} @@ -36,10 +33,11 @@ spec: securityContext: {{- tpl (toYaml .) $ | nindent 8 }} {{- end }} - serviceAccountName: flytepropeller-webhook-system + serviceAccountName: {{ include "webhook.serviceAccountName" . 
}} {{- if .Values.flytepropellerwebhook.priorityClassName }} priorityClassName: {{ .Values.flytepropellerwebhook.priorityClassName }} {{- end }} + {{- if eq .Values.flytepropellerwebhook.certificate.provider "legacy" }} initContainers: - name: generate-secrets image: "{{ .Values.image.union.repository }}:{{ .Values.image.union.tag | default .Chart.AppVersion }}" @@ -52,14 +50,17 @@ spec: - --config - /etc/flyte/config/*.yaml env: - {{- include "global.podEnvVars" . | nindent 10 }} + {{- include "global.podEnvVars" . | nindent 12 }} {{- with .Values.flytepropellerwebhook.podEnv -}} - {{- toYaml . | nindent 10 }} + {{- toYaml . | nindent 12 }} {{- end }} volumeMounts: - name: config-volume mountPath: /etc/flyte/config - resources: {{- toYaml .Values.flytepropellerwebhook.resources | nindent 12 }} + {{- with .Values.flytepropellerwebhook.resources }} + resources: {{- toYaml . | nindent 12 }} + {{- end }} + {{- end }} containers: - name: webhook image: "{{ .Values.image.union.repository }}:{{ .Values.image.union.tag | default .Chart.AppVersion }}" @@ -96,13 +97,17 @@ spec: volumes: - name: config-volume configMap: + {{- if .Values.flytepropeller.enabled }} name: flyte-propeller-config + {{- else }} + name: union-pod-webhook-config + {{- end }} - name: webhook-certs secret: - secretName: flyte-pod-webhook + secretName: {{ include "flytepropellerwebhook.secretName" . }} {{- with .Values.flytepropellerwebhook.additionalVolumes -}} {{ tpl (toYaml .) $ | nindent 8 }} {{- end }} {{- include "flytepropellerwebhook.scheduling" . | nindent 6 }} {{- include "additionalPodSpec" . 
| nindent 6 }} -{{- end }} \ No newline at end of file +{{- end }} diff --git a/charts/dataplane/templates/propeller/hpa-webhook.yaml b/charts/dataplane/templates/webhook/hpa.yaml similarity index 69% rename from charts/dataplane/templates/propeller/hpa-webhook.yaml rename to charts/dataplane/templates/webhook/hpa.yaml index 899ccb68..d3e25868 100644 --- a/charts/dataplane/templates/propeller/hpa-webhook.yaml +++ b/charts/dataplane/templates/webhook/hpa.yaml @@ -1,15 +1,15 @@ -{{- if .Values.flytepropellerwebhook.autoscaling.enabled }} +{{- if and (.Values.flytepropellerwebhook.autoscaling.enabled) .Values.flytepropellerwebhook.enabled }} apiVersion: autoscaling/v2 kind: HorizontalPodAutoscaler metadata: - name: flytepropeller-webhook + name: union-webhook labels: - app: flytepropeller-webhook + app: union-webhook spec: scaleTargetRef: apiVersion: apps/v1 kind: Deployment - name: flytepropeller-webhook + name: union-pod-webhook minReplicas: {{ .Values.flytepropellerwebhook.autoscaling.minReplicas }} maxReplicas: {{ .Values.flytepropellerwebhook.autoscaling.maxReplicas }} metrics: diff --git a/charts/dataplane/templates/webhook/mutatingwebhookconfiguration.yaml b/charts/dataplane/templates/webhook/mutatingwebhookconfiguration.yaml new file mode 100644 index 00000000..d9598ba5 --- /dev/null +++ b/charts/dataplane/templates/webhook/mutatingwebhookconfiguration.yaml @@ -0,0 +1,132 @@ +{{- if .Values.flytepropellerwebhook.enabled }} +{{- $serviceName := include "flytepropellerwebhook.serviceName" . }} +{{- $secretName := include "flytepropellerwebhook.secretName" . }} +{{- $namespace := .Release.Namespace }} +{{- $useCertManager := include "flytepropellerwebhook.useCertManager" . 
}} +{{- $useLegacy := eq .Values.flytepropellerwebhook.certificate.provider "legacy" }} +{{- $lowPrivilege := .Values.low_privilege }} +{{- /* Generate certs once and reuse for both secret and webhook config */ -}} +{{- $certs := dict }} +{{- if eq .Values.flytepropellerwebhook.certificate.provider "helm" }} +{{- $certs = include "flytepropellerwebhook.generateCerts" . | fromYaml }} +{{- else if eq .Values.flytepropellerwebhook.certificate.provider "external" }} +{{- $_ := set $certs "caCert" .Values.flytepropellerwebhook.certificate.external.caCert }} +{{- $_ := set $certs "serverCert" .Values.flytepropellerwebhook.certificate.external.tlsCrt }} +{{- $_ := set $certs "serverKey" .Values.flytepropellerwebhook.certificate.external.tlsKey }} +{{- end }} +{{- /* Create the Secret (for helm, external, and legacy providers) */ -}} +{{- if ne .Values.flytepropellerwebhook.certificate.provider "certManager" }} +--- +apiVersion: v1 +kind: Secret +metadata: + name: {{ $secretName }} + namespace: {{ $namespace }} + labels: + {{- include "flytepropellerwebhook.labels" . | nindent 4 }} +type: Opaque +{{- if ne .Values.flytepropellerwebhook.certificate.provider "legacy" }} +data: + ca.crt: {{ $certs.caCert }} + tls.crt: {{ $certs.serverCert }} + tls.key: {{ $certs.serverKey }} +{{- end }} +{{- end }} +--- +{{- if .Values.flytepropellerwebhook.managedConfig }} +apiVersion: admissionregistration.k8s.io/v1 +kind: MutatingWebhookConfiguration +metadata: + name: {{ tpl .Values.flytepropellerwebhook.webhook.configurationName . }} + labels: + {{- include "flytepropellerwebhook.labels" . | nindent 4 }} + {{- if $useCertManager }} + annotations: + {{- $secretName := include "flytepropellerwebhook.secretName" . 
}} + cert-manager.io/inject-ca-from: {{ $namespace }}/{{ $secretName }}-cert + {{- end }} +webhooks: +{{- if .Values.flytepropellerwebhook.webhook.webhooks.secrets.enabled }} + - name: {{ .Values.flytepropellerwebhook.webhook.webhooks.secrets.name }} + admissionReviewVersions: + - v1 + - v1beta1 + clientConfig: + {{- if and (not $useCertManager) (not $useLegacy) }} + caBundle: {{ $certs.caCert }} + {{- end }} + service: + name: {{ $serviceName }} + namespace: {{ $namespace }} + path: {{ .Values.flytepropellerwebhook.webhook.webhooks.secrets.path }} + port: {{ .Values.flytepropellerwebhook.service.port }} + failurePolicy: {{ .Values.flytepropellerwebhook.webhook.failurePolicy }} + matchPolicy: Equivalent + {{- if $lowPrivilege }} + namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: {{ $namespace }} + {{- else }} + namespaceSelector: {} + {{- end }} + objectSelector: + {{- with .Values.flytepropellerwebhook.webhook.webhooks.secrets.objectSelector }} + {{- tpl (toYaml .) $ | nindent 6 }} + {{- end }} + reinvocationPolicy: {{ .Values.flytepropellerwebhook.webhook.reinvocationPolicy }} + rules: + - apiGroups: + - '*' + apiVersions: + - v1 + operations: + - CREATE + resources: + - pods + scope: '*' + sideEffects: NoneOnDryRun + timeoutSeconds: {{ .Values.flytepropellerwebhook.webhook.timeoutSeconds }} +{{- end }} +{{- if .Values.flytepropellerwebhook.webhook.webhooks.managedImage.enabled }} + - name: {{ .Values.flytepropellerwebhook.webhook.webhooks.managedImage.name }} + admissionReviewVersions: + - v1 + - v1beta1 + clientConfig: + {{- if and (not $useCertManager) (not $useLegacy) }} + caBundle: {{ $certs.caCert }} + {{- end }} + service: + name: {{ $serviceName }} + namespace: {{ $namespace }} + path: {{ .Values.flytepropellerwebhook.webhook.webhooks.managedImage.path }} + port: {{ .Values.flytepropellerwebhook.service.port }} + failurePolicy: {{ .Values.flytepropellerwebhook.webhook.failurePolicy }} + matchPolicy: Equivalent + {{- if $lowPrivilege }} + 
namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: {{ $namespace }} + {{- else }} + namespaceSelector: {} + {{- end }} + objectSelector: + {{- with .Values.flytepropellerwebhook.webhook.webhooks.managedImage.objectSelector }} + {{- tpl (toYaml .) $ | nindent 6 }} + {{- end }} + reinvocationPolicy: {{ .Values.flytepropellerwebhook.webhook.reinvocationPolicy }} + rules: + - apiGroups: + - '*' + apiVersions: + - v1 + operations: + - CREATE + resources: + - pods + scope: '*' + sideEffects: NoneOnDryRun + timeoutSeconds: {{ .Values.flytepropellerwebhook.webhook.timeoutSeconds }} +{{- end }} +{{- end }} +{{- end }} diff --git a/charts/dataplane/templates/propeller/service-webhook.yaml b/charts/dataplane/templates/webhook/service.yaml similarity index 88% rename from charts/dataplane/templates/propeller/service-webhook.yaml rename to charts/dataplane/templates/webhook/service.yaml index 50898000..45a41117 100644 --- a/charts/dataplane/templates/propeller/service-webhook.yaml +++ b/charts/dataplane/templates/webhook/service.yaml @@ -1,8 +1,8 @@ -{{ if .Values.flytepropeller.enabled }} +{{- if .Values.flytepropellerwebhook.enabled }} apiVersion: v1 kind: Service metadata: - name: flyte-pod-webhook + name: {{ template "flyte-pod-webhook.name" . }} namespace: {{ .Release.Namespace }} labels: {{- include "flytepropellerwebhook.labels" . | nindent 4 }} @@ -28,7 +28,7 @@ spec: apiVersion: v1 kind: Service metadata: - name: flyte-pod-webhook-headless + name: {{ template "flyte-pod-webhook.name" . }}-headless namespace: {{ .Release.Namespace }} labels: {{- include "flytepropellerwebhook.labels" . 
| nindent 4 }} @@ -41,4 +41,4 @@ spec: protocol: TCP port: {{ .Values.flytepropellerwebhook.service.targetPort }} targetPort: {{ .Values.flytepropellerwebhook.service.targetPort }} -{{- end }} \ No newline at end of file +{{- end }} diff --git a/charts/dataplane/templates/propeller/serviceaccount-webhook.yaml b/charts/dataplane/templates/webhook/serviceaccount.yaml similarity index 78% rename from charts/dataplane/templates/propeller/serviceaccount-webhook.yaml rename to charts/dataplane/templates/webhook/serviceaccount.yaml index bca9af15..7dbc6e77 100644 --- a/charts/dataplane/templates/propeller/serviceaccount-webhook.yaml +++ b/charts/dataplane/templates/webhook/serviceaccount.yaml @@ -1,5 +1,4 @@ -{{ if .Values.flytepropeller.enabled }} ---- +{{- if .Values.flytepropellerwebhook.enabled }} {{- if .Values.low_privilege }} kind: Role {{- else }} @@ -7,7 +6,7 @@ kind: ClusterRole {{- end }} apiVersion: rbac.authorization.k8s.io/v1 metadata: - name: flytepropeller-webhook-role + name: union-webhook-role namespace: {{ .Release.Namespace }} rules: - apiGroups: @@ -23,10 +22,11 @@ rules: - update - patch --- +{{- if not (include "useCommonServiceAccount" .) }} apiVersion: v1 kind: ServiceAccount metadata: - name: flytepropeller-webhook-system + name: {{ include "webhook.serviceAccountName" . }} namespace: {{ .Release.Namespace }} {{- with include "global.serviceAccountAnnotations" . }} annotations: @@ -36,6 +36,7 @@ metadata: imagePullSecrets: {{ tpl (toYaml .) 
$ | nindent 2 }} {{- end }} --- +{{- end }} # Create a binding from Role -> ServiceAccount {{- if .Values.low_privilege }} kind: RoleBinding @@ -44,7 +45,7 @@ kind: ClusterRoleBinding {{- end }} apiVersion: rbac.authorization.k8s.io/v1 metadata: - name: flytepropeller-webhook-binding + name: union-webhook-binding namespace: {{ .Release.Namespace }} roleRef: apiGroup: rbac.authorization.k8s.io @@ -53,9 +54,9 @@ roleRef: {{- else }} kind: ClusterRole {{- end }} - name: flytepropeller-webhook-role + name: union-webhook-role subjects: - kind: ServiceAccount - name: flytepropeller-webhook-system + name: {{ include "webhook.serviceAccountName" . }} namespace: {{ .Release.Namespace }} -{{- end }} \ No newline at end of file +{{- end }} diff --git a/charts/dataplane/values-legacy.yaml b/charts/dataplane/values-legacy.yaml new file mode 100644 index 00000000..68c6a466 --- /dev/null +++ b/charts/dataplane/values-legacy.yaml @@ -0,0 +1,185 @@ +# Restores legacy (pre-low-privilege, pre-v2) defaults. +# Use this file to opt back into the previous behavior: +# helm install -f values-legacy.yaml [...] +low_privilege: false +flytepropeller: + enabled: true +executor: + idl2Executor: false +flyteconnector: + enabled: false +clusterresourcesync: + enabled: true +opencost: + enabled: true +commonServiceAccount: + enabled: false +fluentbit: + serviceAccount: + create: true + name: fluentbit-system +serving: + enabled: false +knative-operator-crds: + enabled: false +knative-operator: + enabled: false +namespaces: + enabled: true +prometheus: + # Enable the chart's own ClusterRole for cluster-wide scraping (cAdvisor, node metrics). + rbac: + create: true + kube-state-metrics: + rbac: + create: true + useClusterRole: true + releaseNamespace: false + # Append cAdvisor scrape config (requires ClusterRole for nodes/proxy access). 
+ extraScrapeConfigs: | + - job_name: 'kube-state-metrics' + metrics_path: /metrics + scrape_interval: 1m + honor_labels: true + kubernetes_sd_configs: + - role: endpoints + namespaces: + own_namespace: true + relabel_configs: + - source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_name] + regex: kube-state-metrics + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + regex: http + action: keep + metric_relabel_configs: + - source_labels: [__name__] + regex: "kube_pod_container_resource_(limits|requests)|kube_pod_status_phase|kube_node_(labels|status_allocatable|status_condition|status_capacity)|kube_namespace_labels|kube_pod_container_status_(waiting|terminated|last_terminated).*_reason|kube_daemonset_status_number_unavailable|kube_deployment_status_replicas_unavailable|kube_resourcequota|kube_pod_info|kube_node_info|kube_pod_container_status_restarts_total" + action: keep + - source_labels: [__name__, phase] + separator: ";" + regex: "kube_pod_status_phase;(Succeeded|Failed)" + action: drop + - source_labels: [node] + target_label: nodename + regex: "(.*)" + action: replace + - source_labels: [label_node_group_name] + action: replace + regex: "(.+)" + target_label: label_node_pool_name + - job_name: 'opencost' + metrics_path: /metrics + scrape_interval: 1m + honor_labels: true + kubernetes_sd_configs: + - role: endpoints + namespaces: + own_namespace: true + relabel_configs: + - source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_name] + regex: opencost + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + regex: http + action: keep + metric_relabel_configs: + - source_labels: [__name__] + regex: "kube_node_labels|kube_pod_labels|node_total_hourly_cost|node_ram_hourly_cost|node_cpu_hourly_cost|node_gpu_hourly_cost" + action: keep + - job_name: 'union-services' + metrics_path: /metrics + scrape_interval: 1m + honor_labels: true + kubernetes_sd_configs: + - role: endpoints + namespaces: + 
own_namespace: true + relabel_configs: + - source_labels: [__meta_kubernetes_service_label_platform_union_ai_service_group] + regex: .+ + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + regex: debug + action: keep + - job_name: 'flytepropeller' + kubernetes_sd_configs: + - role: pod + namespaces: + own_namespace: true + selectors: + - role: pod + label: app.kubernetes.io/name=flytepropeller + metric_relabel_configs: + - source_labels: [__name__] + regex: "flyte:propeller:all:round:execution_info|flyte:propeller:all:node:fast_task:fast_task_execution_duration" + action: keep + relabel_configs: + - source_labels: [__meta_kubernetes_pod_container_name] + regex: flytepropeller + action: keep + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: pod + - job_name: 'serving-envoy' + kubernetes_sd_configs: + - role: pod + namespaces: + own_namespace: true + selectors: + - role: pod + label: app=3scale-kourier-gateway + metrics_path: /stats/prometheus + metric_relabel_configs: + - source_labels: [__name__] + regex: "envoy_cluster_upstream_rq_xx|envoy_cluster_upstream_rq_time_bucket" + action: keep + relabel_configs: + - source_labels: [__meta_kubernetes_pod_container_port_name] + regex: metrics + action: keep + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: pod + - job_name: 'gpu-metrics' + kubernetes_sd_configs: + - role: pod + namespaces: + names: + - kube-system + selectors: + - role: pod + label: app.kubernetes.io/name=dcgm-exporter + # cAdvisor container metrics for actual CPU and memory usage tracking. + # Requires cluster-level RBAC (nodes/proxy); not available in low-privilege mode. 
+ - job_name: 'kubernetes-cadvisor' + metrics_path: /metrics + scheme: https + kubernetes_sd_configs: + - role: node + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: false + metric_relabel_configs: + - source_labels: [__name__] + regex: container_cpu_usage_seconds_total|container_memory_working_set_bytes + action: keep + relabel_configs: + - regex: __meta_kubernetes_node_label_(.+) + replacement: $1 + action: labelmap + - target_label: __address__ + replacement: kubernetes.default.svc:443 + action: replace + - source_labels: [__meta_kubernetes_node_name] + regex: (.+) + target_label: __metrics_path__ + replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor + action: replace diff --git a/charts/dataplane/values-low-privilege.yaml b/charts/dataplane/values-low-privilege.yaml deleted file mode 100644 index 090d2cca..00000000 --- a/charts/dataplane/values-low-privilege.yaml +++ /dev/null @@ -1,54 +0,0 @@ -# This values file customizes the helm chart to install Union Dataplane into a single namespace and ensure -# no cluster-wide permissions are needed. -low_privilege: true -fluentbit: - enabled: false -config: - operator: - billableUsageCollector: - enabled: false - collectUsages: - enabled: false - serverlessCollectUsages: - enabled: false - dependenciesHeartbeat: - # -- Define the propeller health check endpoint. - propeller: - endpoint: '{{ include "propeller.health.url" . }}' - # -- Define the operator proxy health check endpoint. - proxy: - endpoint: '{{ include "proxy.health.url" . 
}}' - # -- This removes the prometheus health endpoint - prometheus: null - limitNamespace: "{{ .Release.Namespace }}" - disableClusterPermissions: true - core: - propeller: - limit-namespace: "{{ .Release.Namespace }}" - webhook: - disableCreateMutatingWebhookConfig: true - namespace_config: - namespace_mapping: - template: "{{ .Release.Namespace }}" -prometheus: - enabled: false -executor: - propeller: - limit-namespace: "{{ .Release.Namespace }}" - raw_config: - namespace_mapping: - template: "{{ .Release.Namespace }}" -clusterresourcesync: - enabled: false -opencost: - enabled: false -namespaces: - enabled: false -cost: - enabled: false -monitoring: - enabled: false -metrics-server: - enabled: false -flytepropeller: - priorityClassName: '' diff --git a/charts/dataplane/values-test-certs.yaml b/charts/dataplane/values-test-certs.yaml new file mode 100644 index 00000000..bdf17e16 --- /dev/null +++ b/charts/dataplane/values-test-certs.yaml @@ -0,0 +1,22 @@ +# Static test certificates for helm template tests. +# These are self-signed certificates used ONLY for testing purposes. +# DO NOT use these certificates in production. 
+# +# Generated with: +# openssl genCA for CA +# openssl genSignedCert for server cert with SANs: +# - flytepropeller-webhook +# - flytepropeller-webhook.union +# - flytepropeller-webhook.union.svc +# - flytepropeller-webhook.union.svc.cluster.local + +flytepropellerwebhook: + certificate: + provider: external + external: + # Base64-encoded CA certificate + caCert: "LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUROekNDQWgrZ0F3SUJBZ0lVSm9kQ09lem4vd1BHQTZjYkw1U3duSlpYT2hJd0RRWUpLb1pJaHZjTkFRRUwKQlFBd0t6RXBNQ2NHQTFVRUF3d2dabXg1ZEdWd2NtOXdaV3hzWlhJdGQyVmlhRzl2YXk1MWJtbHZiaTV6ZG1NdwpIaGNOTWpZd01USXpNREl6TmpVMVdoY05Nell3TVRJeE1ESXpOalUxV2pBck1Ta3dKd1lEVlFRRERDQm1iSGwwClpYQnliM0JsYkd4bGNpMTNaV0pvYjI5ckxuVnVhVzl1TG5OMll6Q0NBU0l3RFFZSktvWklodmNOQVFFQkJRQUQKZ2dFUEFEQ0NBUW9DZ2dFQkFJcjBSWDRGRjFNVTNYdFBmUHErMDUrMGI1Qm0xK3hsTVR5ZUp4bG5mWFJuRlgyegpPcHV6NmsreUpBQk4xRkxmNDVYRnlJclFlaW5DWWJtckZXalFRc3ZDWEloNkpPNGMwdmRMVVU5RERYcldWRFlsCkRNZytTUDlJVnZBMm5USkVFZ2RFREVsKzNWeThUbUJoNlNmcGthcXdHajY5SmhIdy9OeE9yZzM2bUY3TU5VZ1MKWXJVUkxqUFFzTUN6NElTVzhhQnEzS2IwRUFWUTA4V1lZQ0ZEbjY1aHBzVFdxR05rL3BKRnVTYjlsZll6RVdyYwpJeW5wNDhqTnIvNUdKS3NnWG9LeTUwZVo4UkppeTVyOUhENGtnSW15TVNQR2RDMXFZSVppRHRlZW8yOElxNk5nCmd1dWcyTGhlNndYYW9YZXFCQ2d3SHpwWFVXM1hJVzI5eFoxUkFRRUNBd0VBQWFOVE1GRXdIUVlEVlIwT0JCWUUKRktITDQ1RWhQaVg5bnlRN241QUk0YUZVNFlCL01COEdBMVVkSXdRWU1CYUFGS0hMNDVFaFBpWDlueVE3bjVBSQo0YUZVNFlCL01BOEdBMVVkRXdFQi93UUZNQU1CQWY4d0RRWUpLb1pJaHZjTkFRRUxCUUFEZ2dFQkFEeTRac21XCkNjZ3B5TmI4cU1ydzNYclBNejZLMTRlTVhqQVk1bERFYXM1NklLZExoZFM1bi9sWk95bENuUUtOZ3RHS0VrY3UKT05tMEE2Z0xNT3UzcE43dG9aNXh0WktOcnUvS0x6dm44QUNNNndCbUR0bG5oRERDa3ZUTHBVRzh6UTZSNmlaegpzTWlhVmpLSUJiWkJsZXNYSHZZQnQ2YlRTN2o1QWM2aTdYc1RPK3ZqQ2RUQjVYdCtEZ0x2QnBJRWh2ZGJzNXhmClFtc3ZzQmwxenJnR3JXemo4d0txcXNUMzZaOVU1TzdSK1RJMlQrUDVOR25SeHNQOGJ0R0RtMGZ5SW55WlJrSkcKVTdpVFJyMlZRWlRyaTNZQ2dlVjN1UjFtMnJjc2hyU1Avb082T0QyTXR5Q1FMRWtFK1pxVUVJVnpSYmVaNXl2RQptM29YMFV2OUtBUy9pWTA9Ci0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K" + # Base64-encoded TLS certificate + tlsCrt: 
"LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUQxakNDQXI2Z0F3SUJBZ0lVREtRN3A4MFpIdkQyUE5Ya3p3VGRLem9iUjg4d0RRWUpLb1pJaHZjTkFRRUwKQlFBd0t6RXBNQ2NHQTFVRUF3d2dabXg1ZEdWd2NtOXdaV3hzWlhJdGQyVmlhRzl2YXk1MWJtbHZiaTV6ZG1NdwpIaGNOTWpZd01USXpNREl6TmpVMVdoY05NamN3TVRJek1ESXpOalUxV2pBck1Ta3dKd1lEVlFRRERDQm1iSGwwClpYQnliM0JsYkd4bGNpMTNaV0pvYjI5ckxuVnVhVzl1TG5OMll6Q0NBU0l3RFFZSktvWklodmNOQVFFQkJRQUQKZ2dFUEFEQ0NBUW9DZ2dFQkFMNnMxUTRaZ3ZxVVdNdFp5VC9EKytEQWl6WmR0Q1lkVzArSGk4S3phQ1Vjbi9zZwo3a21NTEhDM2ZlZWtZK2ZUNENHUTd6K1l2WjRWWSt6WGljKzdrNndLbUZqdm1acEM1Rm96OWo1Q21NRk5HZmlWCnRobzVvSU9MUkIzNXRYMU1ROHUwNFVzT29QcjdMK016eEdxWXVmTFdtN1NVdjh0eEp3ZFMxZVlzMllNV1pMeU4KekI4akdWY01Qc2ZNVkdxdFhTTVljQ3hNYURKaTM1N0VEZSs3clM2U2ZtbnFtYjJxekdQM1BRRStCUm91Yk5nLwo4NCt4c253VXBLZUFpem95dFM2c2ZlOTdMUEtRTW9oVzd2TjZORFJrLy9xSmpHem9iZmZwUHU3endhWHVXMWtMCmllc2g2cVRrdlZLSTF3WFJ2WHArVU5GbjZzdDIwdEVlRks0ZzMwOENBd0VBQWFPQjhUQ0I3akFKQmdOVkhSTUUKQWpBQU1Bc0dBMVVkRHdRRUF3SUY0RENCa3dZRFZSMFJCSUdMTUlHSWdoWm1iSGwwWlhCeWIzQmxiR3hsY2kxMwpaV0pvYjI5cmdoeG1iSGwwWlhCeWIzQmxiR3hsY2kxM1pXSm9iMjlyTG5WdWFXOXVnaUJtYkhsMFpYQnliM0JsCmJHeGxjaTEzWldKb2IyOXJMblZ1YVc5dUxuTjJZNEl1Wm14NWRHVndjbTl3Wld4c1pYSXRkMlZpYUc5dmF5NTEKYm1sdmJpNXpkbU11WTJ4MWMzUmxjaTVzYjJOaGJEQWRCZ05WSFE0RUZnUVV1d25wcDZ1MTNKL0NsNnI3eDZ1Sgo5YmZ1RDhzd0h3WURWUjBqQkJnd0ZvQVVvY3Zqa1NFK0pmMmZKRHVma0FqaG9WVGhnSDh3RFFZSktvWklodmNOCkFRRUxCUUFEZ2dFQkFFMmlOa1MzczhGT3Z0cDhrVmJNZUN2amxIQytvYTVlZTFGNmowNlZ3bFJPOWdxOXJla1UKMGptWUhNWGNjWjcrZXJkNFRQRXgwK3ZqMzNhRTV3cDFmdVJJL2xIZHhuOUpJRHg2bTFQTHdVa3p1WG9EekYxLwozVXVIWU1xOGkrT2xsY0tqbmdVMUFUNmRWRCtIbFVWeHpMdjlNazdPb05SQUY5dTJHa1hiUHdCN0I1ZGRxdWxlCmtNcXBxbDhES2s4SVNVelVsTjNBb2Q5NmJUelIyV3RCVkxpTi9zRHZtTDZwZThFYm55RnBUb1BZTDlGeWZOcmIKT1ZPZEJwUkZFTG1KOHZaZWY5d1lxRmVXMTE3SGV5VDArZkxGZVNSemZUNVVWUlVraVFBcUVJdUhHUVRrOVFabgo1WnYvNUF0eEg3WE1rcXR0VE1zMFlSUG0xYjQzY3FBeFdhND0KLS0tLS1FTkQgQ0VSVElGSUNBVEUtLS0tLQo=" + # Base64-encoded TLS private key + tlsKey: 
"LS0tLS1CRUdJTiBQUklWQVRFIEtFWS0tLS0tCk1JSUV2UUlCQURBTkJna3Foa2lHOXcwQkFRRUZBQVNDQktjd2dnU2pBZ0VBQW9JQkFRQytyTlVPR1lMNmxGakwKV2NrL3cvdmd3SXMyWGJRbUhWdFBoNHZDczJnbEhKLzdJTzVKakN4d3QzM25wR1BuMCtBaGtPOC9tTDJlRldQcwoxNG5QdTVPc0NwaFk3NW1hUXVSYU0vWStRcGpCVFJuNGxiWWFPYUNEaTBRZCtiVjlURVBMdE9GTERxRDYreS9qCk04UnFtTG55MXB1MGxML0xjU2NIVXRYbUxObURGbVM4amN3Zkl4bFhERDdIekZScXJWMGpHSEFzVEdneVl0K2UKeEEzdnU2MHVrbjVwNnBtOXFzeGo5ejBCUGdVYUxtellQL09Qc2JKOEZLU25nSXM2TXJVdXJIM3ZleXp5a0RLSQpWdTd6ZWpRMFpQLzZpWXhzNkczMzZUN3U4OEdsN2x0WkM0bnJJZXFrNUwxU2lOY0YwYjE2ZmxEUlorckxkdExSCkhoU3VJTjlQQWdNQkFBRUNnZ0VBRjV2SnMzbTRMSE9DdlFic2NwOC9DQmgvQkNlOC92MGlpYW5VUmJLMkFlWlYKN3Arb3NXV1FPUktYSGIvT2VPMVVjb09DQkFOUzh3aGQrM3pDZlB5U0w3cU9HM0RyT1Z6djdqVkIxM3FpZEVpcQpId3ZXWk0vZXpuckhYOWpEdm5SYmJwVUNVaXRKQmxwa2x3S1pYc056UHB5UTRkNkxFdEw3VEo1V2lxM2g3cWQzCjhncmJoeWlWNDhZRFAyYkhpZGt0S2QvMHBNL3piSzZPTS9iMFpLUmRBWVpHQXlOcklMaFJ3eDNsQXE2SDVqL20KUHBuNlZEdVQ3Y1h6UFhmNTgxR0IyNjQxWFJVTXRNWGVEREtaU3JudnIrbG5tV2lsYXZmNkg3ZHp4NlRBMmk0RApybCtIRVdUN1RSQUZGMnZ5SVUrYSt5RzNPVFBKRCtmVTdCWWh4Zld4aFFLQmdRRGVvcXVCbzQxK1dKU255blZnCmNWMHZoUEkwZ0dyRGQzNFdIbCtGNTZuSmxuMVVlbUQ2SUQ0TUhnNVpHWmNmd3NkRlB0aHl1VnB0OVl1SW11SGQKRHE5ZDhpWnpHUzJVbStkSWlsWjhnZXJ5Q3dON3hIbFM3SzJrc0RmRGJwYmZZSXZ1LzRnSExZRHh3YzQ0OG54ZwpWNGpCUFY2QTdYTHRqQzZzdFE2OW9uTlIwd0tCZ1FEYlFBUTRGdWpnUjJUOXh2Y01MdXhMbm1YaHE4U2NoZlM5CkRoWndDUzZIeHB1YlBxblBjQ04vQUZjbDZ1cDRXVEVZdVY4bEpwcGwwRmpYQ2lMdkhuUU15cm5vRjlUOThpanQKZGVHYjF6ZWlwRUpndFpNVHkvOGdERWorZU1wSFBuYTZwQVpMOVpRVUdLR2dDNGQ0SzljdVNHMmRlb1dSU0ViSQpMVGgzeGpXVEZRS0JnQStJRWFidG5nVmVjS0IwQTFSREZGa29VUzFRZUNKQ3g4MExPV2JDRHBvOW9XaXZVT3lpCkt6SDFOdE1JY2Y2SlBCV2NtTVVJSVVMaWltVnhTS2gvU2NTb0MvNmpsd1p6Q2VPSm94YjBpVXR4Y1VERktDR2MKMlZCUDZ0UDdkeE1HVFR6VEhzNUJZbWw3TjhQSlJ0d0J2MHliMTJmdktNRmhzaS9pUWJFQkVFSjVBb0dBUm9NSApHRmJkM0V0NXdsZzcyYkk1a25SRnhkY3RLejIxb1J1bndhNWlSWTV0T3Zkak8zQ1FLZWNkSC9lMklyQmtwdFB2Ck1vNkF0MS9UUW8xakFNNGxlbnUwWUYxUnhiNGN4WW5VM2Y3UVNNRlZDNjg1dHZNemdNWVNyNngvT1h4d1NNUTUKdGpJcnhtN0poQ1JSRkNmZFUyZzl3SmpIM2hxRmtSbGlBTHRCUGFVQ2d
ZRUF2QkRHejQvYS9kRzRJT0VpREk4SgpBbkxaRWZaTFJrdnI0ejRyT0p0TGZtWlFjTW5DSUZmKzBaelhTME9XOWhvMjk5RUNtS1l4U2VUWk1WSDgzOFZCCnRvSkNGM3lzb01rSC9meEVXTUpicDlMblJwSFUzUElicHlhRG1nWW1YdmsvK3laL3Y1c0dDTWNtVGkzWVFLRGsKcStlK1o0SHZyaGUxVytUaHpwOCtoNGs9Ci0tLS0tRU5EIFBSSVZBVEUgS0VZLS0tLS0K" diff --git a/charts/dataplane/values.aws.eks-automode.yaml b/charts/dataplane/values.aws.eks-automode.yaml index 5144470c..05096775 100644 --- a/charts/dataplane/values.aws.eks-automode.yaml +++ b/charts/dataplane/values.aws.eks-automode.yaml @@ -56,12 +56,17 @@ global: # Note: Can be same as metadata bucket or separate FAST_REGISTRATION_BUCKET: "" - # 1. AWS_REGION - AWS region for S3 buckets + # 1. AWS_ACCOUNT_ID - AWS account ID (used for ECR registry URL) + # Format: 12-digit AWS account ID + # Example: "123456789012" + AWS_ACCOUNT_ID: "" + + # 2. AWS_REGION - AWS region for S3 buckets # Format: AWS region code # Example: "us-east-1", "us-west-2", "eu-west-1" AWS_REGION: "" - # 2. BACKEND_IAM_ROLE_ARN - IAM role for Union backend services + # 3. 
BACKEND_IAM_ROLE_ARN - IAM role for Union backend services # Format: Full ARN # Example: "arn:aws:iam::123456789012:role/union-backend-role" # Permissions: S3 access, ECR (if private images) diff --git a/charts/dataplane/values.aws.selfhosted-intracluster.yaml b/charts/dataplane/values.aws.selfhosted-intracluster.yaml index 42e690cc..b2a6654d 100644 --- a/charts/dataplane/values.aws.selfhosted-intracluster.yaml +++ b/charts/dataplane/values.aws.selfhosted-intracluster.yaml @@ -49,6 +49,11 @@ global: # Note: Can be same as metadata bucket or separate FAST_REGISTRATION_BUCKET: "" + # AWS_ACCOUNT_ID - AWS account ID (used for ECR registry URL) + # Format: 12-digit AWS account ID + # Example: "123456789012" + AWS_ACCOUNT_ID: "" + # AWS_REGION - AWS region for S3 buckets # Format: AWS region code # Example: "us-east-1", "us-west-2", "eu-west-1" diff --git a/charts/dataplane/values.aws.yaml b/charts/dataplane/values.aws.yaml index 6183d43d..667ea06a 100644 --- a/charts/dataplane/values.aws.yaml +++ b/charts/dataplane/values.aws.yaml @@ -55,12 +55,17 @@ global: # Note: Can be same as metadata bucket or separate FAST_REGISTRATION_BUCKET: "" - # 1. AWS_REGION - AWS region for S3 buckets + # 1. AWS_ACCOUNT_ID - AWS account ID (used for ECR registry URL) + # Format: 12-digit AWS account ID + # Example: "123456789012" + AWS_ACCOUNT_ID: "" + + # 2. AWS_REGION - AWS region for S3 buckets # Format: AWS region code # Example: "us-east-1", "us-west-2", "eu-west-1" AWS_REGION: "" - # 2. BACKEND_IAM_ROLE_ARN - IAM role for Union backend services + # 3. 
BACKEND_IAM_ROLE_ARN - IAM role for Union backend services # Format: Full ARN # Example: "arn:aws:iam::123456789012:role/union-backend-role" # Permissions: S3 access, ECR (if private images) diff --git a/charts/dataplane/values.yaml b/charts/dataplane/values.yaml index 04276a0d..ea4f894e 100644 --- a/charts/dataplane/values.yaml +++ b/charts/dataplane/values.yaml @@ -47,6 +47,22 @@ global: # Note: Can be same as metadata bucket or separate FAST_REGISTRATION_BUCKET: "" + # -- AWS account ID (required for ECR registry URL auto-generation). + # Example: "123456789012" + AWS_ACCOUNT_ID: "" + + # BACKEND_IAM_ROLE_ARN - IAM Role to use for union-system SA + # GCP Format: service-account-name@project-id.iam.gserviceaccount.com + # Example: "union-backend@my-project.iam.gserviceaccount.com" + # Permissions: GCS access, GCR/Artifact Registry (if private images) + BACKEND_IAM_ROLE_ARN: "" + + # WORKER_IAM_ROLE_ARN - IAM Role to use for union SA + # GCP Format: service-account-name@project-id.iam.gserviceaccount.com + # Example: "union-worker@my-project.iam.gserviceaccount.com" + # Permissions: GCS access, GCP services used by workflows + WORKER_IAM_ROLE_ARN: "" + # ---------------------------------------------------------------------------- # Additional Configuration # ---------------------------------------------------------------------------- @@ -80,6 +96,9 @@ additionalPodLabels: { } # -- Define additional PodSpec values for all of the Union pods. additionalPodSpec: { } +# -- Define additional serviceAccountAnnotations to be added to all system-created service accounts +additionalServiceAccountAnnotations: { } + # -- Global kubernetes scheduling constraints that will be applied to the # pods. Application specific constraints will always take precedence. scheduling: @@ -112,21 +131,23 @@ scheduling: # effect: "NoSchedule" nodeName: "" -# -- This is the annotation key that is added to service accounts. Used with GCP and AWS. 
+# -- This is the annotation key that is added to service accounts. userRoleAnnotationKey: "eks.amazonaws.com/role-arn" -# -- This is the value of the annotation key that is added to service accounts. Used with GCP and AWS. +# -- This is the value of the annotation key that is added to service accounts. userRoleAnnotationValue: "arn:aws:iam::ACCOUNT_ID:role/flyte_project_role" # -- clusterresourcesync contains the configuration information for the syncresources service. clusterresourcesync: # -- Enable or disable the syncresources service - enabled: true + enabled: false # -- Override service account values for the syncresources service serviceAccount: # -- Override the service account name for the syncresources service name: "" # -- Additional annotations for the syncresources service account annotations: {} + # -- Additional pod labels for the syncresources service + podLabels: { } # -- Additional pod annotations for the syncresources service podAnnotations: { } # -- Additional pod environment variables for the syncresources service @@ -334,11 +355,19 @@ config: cache-endpoint: 'dns:///{{ tpl .Values.host . }}' endpoint: 'dns:///{{ tpl .Values.host . }}' insecure: false - type: fallback + type: cacheservicev2 use-admin-auth: true + # -- Optional catalog cache configuration, rendered as catalog_cache.yaml for propeller. + catalog_cache: {} clusters: labelClusterMap: { } clusterConfigs: [ ] + # -- Optional namespace configuration, rendered as namespace_config.yaml. + namespace_config: {} + # -- Optional namespace mapping override (preferred over top-level namespace_mapping). + namespace_mapping: {} + # -- Optional Qubole plugin configuration. + qubole: {} # -- Override any configuration settings. 
configOverrides: cache: @@ -394,7 +423,7 @@ config: enabled: true webhook: certDir: /etc/webhook/certs - serviceName: flyte-pod-webhook + serviceName: union-pod-webhook servicePort: "{{ .Values.flytepropellerwebhook.service.port }}" listenPort: "{{ .Values.flytepropellerwebhook.service.targetPort }}" secretManagerTypes: @@ -444,6 +473,7 @@ config: plugins: # -- Configuration section for all K8s specific plugins [Configuration structure](https://pkg.go.dev/github.com/lyft/flyteplugins/go/tasks/pluginmachinery/flytek8s/config) k8s: + default-pod-template-name: "task-template" default-env-vars: [ ] default-cpus: 100m default-memory: 100Mi @@ -451,6 +481,14 @@ config: operator: # -- Enables the operator service enabled: true + # -- Restrict operator to a single namespace. Auto-set in singleNamespace mode. + limitNamespace: "" + # -- Disable cluster-wide permissions. Auto-set when low_privilege is true. + disableClusterPermissions: false + # -- Compute resource manager configuration. + computeResourceManager: {} + # -- Organization namespace template override. + org: {} # -- Enable app serving apps: enabled: "{{ .Values.serving.enabled }}" @@ -491,6 +529,9 @@ config: # -- Define the propeller health check endpoint. propeller: endpoint: '{{ include "propeller.health.url" . }}' + # -- Define the propeller health check endpoint. + executor: + endpoint: '{{ include "executor.health.url" . }}' # -- Define the operator proxy health check endpoint. proxy: endpoint: '{{ include "proxy.health.url" . 
}}' @@ -519,9 +560,14 @@ config: objectStore: prefix: persisted-logs pathTemplate: "namespace-{{`{{.KubernetesNamespace}}`}}.pod-{{`{{.KubernetesPodName}}`}}.cont-{{`{{.KubernetesContainerName}}`}}" - imageBuilderConfig: - authenticationType: "{{ .Values.imageBuilder.authenticationType }}" - defaultRepository: "{{ .Values.imageBuilder.defaultRepository }}" +# Temporarily disable imageBuilderConfig +# imageBuilderConfig: +# authenticationType: "{{ .Values.imageBuilder.authenticationType }}" +# defaultRepository: "{{ .Values.imageBuilder.defaultRepository }}" +# basicAuth: +# registry: "{{ .Values.imageBuilder.defaultRepository }}" +# username: "{{ .Values.imageBuilder.basicAuth.username }}" +# passwordSecretName: "{{ .Values.imageBuilder.basicAuth.passwordSecretName }}" # -- Resource manager configuration resource_manager: # -- resource manager configuration @@ -756,7 +802,7 @@ dcgm-exporter: opencost: # -- Enable or disable the opencost installation. - enabled: true + enabled: false opencost: metrics: @@ -807,9 +853,11 @@ fluentbit: tolerations: - operator: Exists serviceAccount: - name: fluentbit-system - # Set cloud-provider-specific annotations to grant bucket write access. - # See examples above. + create: false + # Must match commonServiceAccount.name when commonServiceAccount is enabled (default). + # When commonServiceAccount is disabled, override to a dedicated name + # (e.g. "fluentbit-system") and create the SA externally or set create: true. + name: union-system annotations: {} # If you would like to use static access keys instead of identity federation # (not recommended), uncomment the following lines: @@ -827,7 +875,7 @@ flyteagent: executor: enabled: true - idl2Executor: false + idl2Executor: true raw_config: {} config: organization: "{{ tpl .Values.orgName . 
}}" @@ -849,10 +897,15 @@ executor: memory: 1Gi podLabels: app: executor + app.kubernetes.io/name: executor + app.kubernetes.io/instance: "{{ .Release.Name }}" + selector: + matchLabels: + app: executor task_logs: plugins: logs: - kubernetes-enabled: true + kubernetes-enabled: false # -- One option is to enable cloudwatch logging for EKS, update the region and log group accordingly cloudwatch-enabled: false dynamic-log-links: @@ -920,7 +973,7 @@ executor: additional-worker-args: - "--last-ack-grace-period-seconds" - "120" - callback-uri: "http://unionai-dataplane-executor.{{.Release.Namespace}}.svc.cluster.local:15605" + callback-uri: "http://union-operator-executor.{{.Release.Namespace}}.svc.cluster.local:15605" grace-period-status-not-found: 2m k8s: disable-inject-owner-references: true @@ -943,7 +996,7 @@ executor: # -- Flytepropeller configuration flytepropeller: - enabled: true + enabled: false priorityClassName: "system-cluster-critical" # -- Replicas count for Flytepropeller deployment replicaCount: 1 @@ -967,6 +1020,8 @@ flytepropeller: annotations: { } # -- ImapgePullSecrets to automatically assign to the service account imagePullSecrets: [ ] + # -- Security context for the Flytepropeller container. + securityContext: {} # -- Labels for the Flytepropeller pods podLabels: { } # -- Annotations for Flytepropeller pods @@ -1048,8 +1103,7 @@ flytepropellerwebhook: runAsNonRoot: true runAsUser: 1001 fsGroupChangePolicy: "Always" - seLinuxOptions: - type: spc_t + seLinuxOptions: null resources: limits: @@ -1082,13 +1136,85 @@ flytepropellerwebhook: additionalVolumes: [] # -- Appends additional volume mounts to the main container's spec. May include template values. 
additionalVolumeMounts: [] - + # -- Enable Helm-managed MutatingWebhookConfiguration (if false, the webhook will create its own) + managedConfig: true + # -- Configuration for the webhook MutatingWebhookConfiguration and certificates + webhook: + # -- Name of the MutatingWebhookConfiguration resource + configurationName: "union-pod-webhook-{{ tpl .Values.orgName . }}" + # -- Failure policy for the webhook (Fail or Ignore) + failurePolicy: Fail + # -- Reinvocation policy for the webhook + reinvocationPolicy: Never + # -- Timeout in seconds for the webhook + timeoutSeconds: 30 + # -- Webhook configurations to create + webhooks: + # -- Secrets injection webhook configuration + secrets: + enabled: true + # -- Name of the webhook + name: union-pod-webhook.flyte.org + # -- Path for the webhook + path: /mutate--v1-pod/secrets + # -- Object selector for the webhook + objectSelector: + matchLabels: + inject-flyte-secrets: "true" + organization: "{{ tpl .Values.orgName . }}" + # -- Managed image webhook configuration (requires Union operator support) + managedImage: + enabled: false + # -- Name of the webhook + name: managed-image-webhook.union.ai + # -- Path for the webhook + path: /mutate--v1-pod/managed-image + # -- Object selector for the webhook (matchExpressions) + objectSelector: + matchLabels: + organization: "{{ tpl .Values.orgName . }}" + matchExpressions: + - key: organization + operator: Exists + - key: project + operator: Exists + - key: domain + operator: Exists + + # -- Configuration for webhook certificates + certificate: + # -- Certificate provider: + # -- "helm" - Generate self-signed certificates using Helm's crypto functions. Certs are preserved on upgrade if secret exists. + # -- "certManager" - Use cert-manager to provision and manage certificates. Requires cert-manager to be installed. + # -- "external" - Use externally provided certificates (set caCert, serverCert, serverKey below). 
+ # -- "legacy" - Let the webhook binary generate its own certificates using an init container. The init container runs + # -- `flytepropeller webhook init-certs` to populate an empty secret, then the webhook uses those certs. + provider: helm + # -- Duration of the certificate (only used with certManager provider) + duration: 8760h + # -- Renew before duration (only used with certManager provider) + renewBefore: 720h + # -- cert-manager configuration (only used when provider is "certManager") + certManager: + # -- Issuer reference for cert-manager. If not set, a self-signed issuer will be created. + issuerRef: {} + # name: selfsigned-issuer + # kind: ClusterIssuer + # group: cert-manager.io + # -- External certificate configuration (only used when provider is "external") + external: + # -- Base64-encoded CA certificate (PEM format) + caCert: "" + # -- Base64-encoded TLS certificate (PEM format) + tlsCrt: "" + # -- Base64-encoded TLS private key (PEM format) + tlsKey: "" # # FLYTE_CONNECTOR SETTINGS # flyteconnector: - enabled: false + enabled: true # -- Replicas count for flyteconnector deployment replicaCount: 2 image: @@ -1155,8 +1281,6 @@ flyteconnector: targetCPUUtilizationPercentage: 80 targetMemoryUtilizationPercentage: 80 - - # Container images image: # -- Image repository for the operator and union services @@ -1192,6 +1316,8 @@ nodeobserver: privileged: true capabilities: add: [ "SYS_ADMIN" ] + # -- Additional pod labels for the nodeobserver services + podLabels: { } # -- Additional pod annotations for the nodeobserver services podAnnotations: { } # -- Additional pod environment variables for the nodeobserver services @@ -1311,44 +1437,235 @@ ingress: tls: {} -# -- Union features Prometheus configuration. -# Deploys a static Prometheus instance (no Prometheus Operator required) for Union -# features like cost tracking and task-level monitoring. 
+# -- Prometheus configuration (CRD-free) +# Configures a standalone Prometheus deployment using the community prometheus chart. +# It does NOT install the Prometheus Operator or any CRDs (ServiceMonitor, PrometheusRule, etc.). +# Recording rules are delivered via a ConfigMap instead of PrometheusRule CRDs. +# Scrape targets are configured via static scrape configs instead of ServiceMonitor CRDs. +# NOTE: If you also enable dcgm-exporter, set dcgm-exporter.serviceMonitor.enabled=false +# to avoid creating ServiceMonitor CRDs. prometheus: - image: - repository: prom/prometheus - tag: v3.3.1 - # -- Data retention period. - retention: 3d - # -- Route prefix for Prometheus web UI and API. - routePrefix: /prometheus/ - # -- Resource limits and requests. - resources: - limits: - cpu: "3" - memory: "3500Mi" - requests: - cpu: "1" - memory: "1Gi" - # -- Service account configuration. - serviceAccount: - create: true - annotations: {} - # -- Priority class for the Prometheus pod. - priorityClassName: system-cluster-critical - # -- Node selector for the Prometheus pod. - nodeSelector: {} - # -- Tolerations for the Prometheus pod. - tolerations: [] - # -- Affinity rules for the Prometheus pod. - affinity: {} + enabled: true + # Disable the chart's own RBAC (which creates ClusterRoles). + # A namespace-scoped Role + RoleBinding is created by the dataplane chart instead. + rbac: + create: false + + # Match the same service name as kube-prometheus-stack mode so that + # health checks, opencost, and ingress work without changes. + # clusterRoleNameOverride avoids naming collisions with the kube-prometheus-stack ClusterRole. + # In low-privilege mode, set rbac.create=false and useExistingClusterRoleName to skip + # ClusterRole/ClusterRoleBinding creation; a namespace-scoped Role is created instead. 
+ server: + fullnameOverride: "union-operator-prometheus" + clusterRoleNameOverride: "union-operator-prometheus" + useExistingClusterRoleName: "" -# -- Standalone kube-state-metrics for Union features (cost tracking, pod resource metrics). -# Metric filtering is handled in the Prometheus static scrape config. -kube-state-metrics: {} + service: + servicePort: 80 + + prefixURL: "/prometheus" + baseURL: "/prometheus/" + + retention: "3d" + + persistentVolume: + enabled: false -# -- Scopes the deployment, permissions and actions created into a single namespace -low_privilege: false + emptyDir: + sizeLimit: "10Gi" + + startupProbe: + initialDelaySeconds: 120 + + resources: + limits: + cpu: "3" + memory: "3500Mi" + requests: + cpu: "1" + memory: "1Gi" + + # Mount the recording rules ConfigMap generated by this chart + extraConfigmapMounts: + - name: recording-rules + mountPath: /etc/config/recording + configMap: union-recording-rules + readonly: true + + # Scrape configs to replace ServiceMonitor CRDs. + # NOTE: This is a top-level key in the community prometheus chart, not nested under server. 
+ extraScrapeConfigs: | + - job_name: 'kube-state-metrics' + metrics_path: /metrics + scrape_interval: 1m + honor_labels: true + kubernetes_sd_configs: + - role: endpoints + namespaces: + own_namespace: true + relabel_configs: + - source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_name] + regex: kube-state-metrics + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + regex: http + action: keep + metric_relabel_configs: + - source_labels: [__name__] + regex: "kube_pod_container_resource_(limits|requests)|kube_pod_status_phase|kube_node_(labels|status_allocatable|status_condition|status_capacity)|kube_namespace_labels|kube_pod_container_status_(waiting|terminated|last_terminated).*_reason|kube_daemonset_status_number_unavailable|kube_deployment_status_replicas_unavailable|kube_resourcequota|kube_pod_info|kube_node_info|kube_pod_container_status_restarts_total" + action: keep + - source_labels: [__name__, phase] + separator: ";" + regex: "kube_pod_status_phase;(Succeeded|Failed)" + action: drop + - source_labels: [node] + target_label: nodename + regex: "(.*)" + action: replace + - source_labels: [label_node_group_name] + action: replace + regex: "(.+)" + target_label: label_node_pool_name + - job_name: 'opencost' + metrics_path: /metrics + scrape_interval: 1m + honor_labels: true + kubernetes_sd_configs: + - role: endpoints + namespaces: + own_namespace: true + relabel_configs: + - source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_name] + regex: opencost + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + regex: http + action: keep + metric_relabel_configs: + - source_labels: [__name__] + regex: "kube_node_labels|kube_pod_labels|node_total_hourly_cost|node_ram_hourly_cost|node_cpu_hourly_cost|node_gpu_hourly_cost" + action: keep + - job_name: 'union-services' + metrics_path: /metrics + scrape_interval: 1m + honor_labels: true + kubernetes_sd_configs: + - role: endpoints + namespaces: + 
own_namespace: true + relabel_configs: + - source_labels: [__meta_kubernetes_service_label_platform_union_ai_service_group] + regex: .+ + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + regex: debug + action: keep + # Flyte propeller metrics for execution info and fast task duration. + # No-op when propeller is not deployed (no pods match the label selector). + - job_name: 'flytepropeller' + kubernetes_sd_configs: + - role: pod + namespaces: + own_namespace: true + selectors: + - role: pod + label: app.kubernetes.io/name=flytepropeller + metric_relabel_configs: + - source_labels: [__name__] + regex: "flyte:propeller:all:round:execution_info|flyte:propeller:all:node:fast_task:fast_task_execution_duration" + action: keep + relabel_configs: + - source_labels: [__meta_kubernetes_pod_container_name] + regex: flytepropeller + action: keep + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: pod + # Kourier gateway (envoy) metrics for app serving. + # No-op when serving is not enabled (no pods match the label selector). + - job_name: 'serving-envoy' + kubernetes_sd_configs: + - role: pod + namespaces: + own_namespace: true + selectors: + - role: pod + label: app=3scale-kourier-gateway + metrics_path: /stats/prometheus + metric_relabel_configs: + - source_labels: [__name__] + regex: "envoy_cluster_upstream_rq_xx|envoy_cluster_upstream_rq_time_bucket" + action: keep + relabel_configs: + - source_labels: [__meta_kubernetes_pod_container_port_name] + regex: metrics + action: keep + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: pod + # DCGM GPU metrics. + # No-op when dcgm-exporter is not deployed (no pods match the label selector). 
+ - job_name: 'gpu-metrics' + kubernetes_sd_configs: + - role: pod + namespaces: + names: + - kube-system + selectors: + - role: pod + label: app.kubernetes.io/name=dcgm-exporter + + alertmanager: + enabled: false + # kube-state-metrics: use namespace-scoped Role (not ClusterRole) and only watch the release namespace. + # This provides pod-level metrics (kube_pod_*, kube_pod_container_*) but NOT node-level metrics + # (kube_node_*) since nodes are cluster-scoped resources. + kube-state-metrics: + enabled: true + rbac: + create: false + releaseNamespace: true + metricRelabelings: + - sourceLabels: [ "__name__" ] + separator: ";" + regex: "kube_pod_container_resource_(limits|requests)|kube_pod_status_phase|kube_node_(labels|status_allocatable|status_condition|status_capacity)|kube_namespace_labels|kube_pod_container_status_(waiting|terminated|last_terminated).*_reason|kube_daemonset_status_number_unavailable|kube_deployment_status_replicas_unavailable|kube_resourcequota|kube_pod_info|kube_node_info|kube_pod_container_status_restarts_total" + action: keep + - sourceLabels: [ "__name__", "phase" ] + separator: ";" + regex: "kube_pod_status_phase;(Succeeded|Failed)" + action: drop + - sourceLabels: [ "node" ] + targetLabel: "nodename" + regex: "(.*)" + action: replace + - sourceLabels: [ "label_node_group_name" ] + action: replace + regex: "(.+)" + targetLabel: "label_node_pool_name" + prometheus-node-exporter: + enabled: false + prometheus-pushgateway: + enabled: false + +# -- Scopes the deployment, permissions and actions created into a single namespace and avoids any deployments that would +# require additional permissions on the cluster. This limits the functionality though. +low_privilege: true + +# -- When enabled, creates a single shared ServiceAccount for all components +# (operator, executor, proxy, webhook, fluentbit) instead of individual ones. +# Automatically enabled when singleNamespace mode is active. 
+commonServiceAccount: + enabled: true + name: union-system + annotations: {} + imagePullSecrets: [] # -- Union operator proxy configuration proxy: serviceAccount: @@ -1474,13 +1791,15 @@ storage: # -- Configure app serving and knative. serving: # -- Enables the serving components. Installs Knative Serving. Knative-Operator must be running in the cluster for this to work. Enables app serving in operator. - enabled: false + enabled: true # -- The number of replicas to create for all components for high availability. replicas: 2 # -- Enables scraping of metrics from the serving component metrics: true # -- Additional configuration for Knative serving - extraConfig: {} + extraConfig: + deployment: + registries-skipping-tag-resolving: "managed.cr.union.ai" # -- Resources for serving components resources: 3scale-kourier-gateway: @@ -1506,12 +1825,18 @@ serving: # -- Disabling is common if not leveraging Union Cloud SSO. enabled: true +# -- Install Knative Operator CRDs (KnativeServing, KnativeEventing). +# -- Set to false if CRDs are managed externally or knative-operator is disabled. +knative-operator-crds: + enabled: true + # Enable the knative operator. Required for app serving. +# Note: The operator requires cluster-scoped RBAC (CRDs, ClusterRoles, namespaces, deployments). knative-operator: - enabled: false - - crds: - install: true + enabled: true + namespaceOverride: '{{ .Release.Namespace }}' + # Skip namespace creation — the parent chart creates the release namespace via --create-namespace + skipNamespaceCreation: true imageBuilder: enabled: true @@ -1525,18 +1850,37 @@ imageBuilder: # -- E.g. "tcp://buildkitd.buildkit.svc.cluster.local:1234" buildkitUri: "" - # -- The default repository to publish images to "registry" is not specified with imagespec. - # -- Note, the build-image task will fail unless "registry" is specified or a default repository is provided. + # -- The default container image repository for user-built images. 
+ # -- When empty, auto-generated from storage.provider, storage.region, storage.gcp.projectId, + # -- and imageBuilder.registryName: + # -- aws: .dkr.ecr..amazonaws.com/ + # -- gcp: -docker.pkg.dev// + # -- azure: .azurecr.io + # -- Set explicitly to override auto-detection (e.g. for Depot, GHCR, or custom registries). defaultRepository: "" + # -- Short name used to construct the default registry URL when defaultRepository is empty. + registryName: "union-dataplane" + # -- How build-image task and operator proxy will attempt to authenticate # -- Supported values are "noop", "google", "aws", "azure" # -- "noop" no authentication is attempted # -- "google" uses docker-credential-gcr to authenticate to the default registry # -- "aws" uses docker-credential-ecr-login to authenticate to the default registry # -- "azure" uses az acr login to authenticate to the default registry. Requires Azure Workload Identity to be enabled. + # -- "basic" uses basic docker username/password auth to authenticate to the registry. authenticationType: "noop" + # -- Basic auth credentials for registry authentication (e.g. Depot, GHCR). + # -- Used by the operator proxy to authenticate when checking image existence. + basicAuth: + # -- Username for registry authentication. For Depot use "x-token". + username: "x-token" + # -- Secret name for password lookup via secret manager (recommended for production). + passwordSecretName: "DEPOT_TOKEN" + # -- Plain text password or token (use passwordSecretName for production). + passwordPlainText: "" + buildkit: # -- Enable buildkit service within this release. @@ -1584,7 +1928,7 @@ imageBuilder: # -- Run buildkit in rootless mode (non-privileged). Uses the moby/buildkit rootless # -- image variant which bundles RootlessKit to set up user namespaces. Requires # -- kernel >= 5.11 with unprivileged user namespace support. 
- rootless: true + rootless: false # -- Enable debug logging log: @@ -1616,9 +1960,9 @@ imageBuilder: # -- Resource definitions resources: requests: - cpu: 1 - memory: 1Gi - ephemeral-storage: 20Gi + cpu: 4 + memory: 4Gi + ephemeral-storage: 50Gi # -- Node selector nodeSelector: {} @@ -1665,7 +2009,7 @@ extraObjects: [] ## -- Automatically create namespaces to deploy into namespaces: - enabled: true + enabled: false monitoring: enabled: false @@ -1764,6 +2108,9 @@ monitoring: kube-state-metrics: nameOverride: "monitoring-kube-state-metrics" fullnameOverride: "monitoring-kube-state-metrics" + # Explicit list type to avoid merge conflict with standalone kube-state-metrics subchart + # which defaults env to {} (map) while kube-prometheus-stack's nested chart expects [] (list). + env: [] # By default, install a separate Prometheus instance for monitoring. # This is the simplest, out of the box model, it is highly recommended that users look diff --git a/charts/knative-operator-crds/Chart.yaml b/charts/knative-operator-crds/Chart.yaml new file mode 100644 index 00000000..2cb8fac1 --- /dev/null +++ b/charts/knative-operator-crds/Chart.yaml @@ -0,0 +1,7 @@ +apiVersion: v2 +name: knative-operator-crds +description: Knative Operator CRDs (KnativeServing, KnativeEventing) +type: application +version: 2025.6.3 +appVersion: 1.16.0 +kubeVersion: ">= 1.28.0-0" diff --git a/charts/knative-operator/templates/knative-crds.yaml b/charts/knative-operator-crds/crds/knative-crds.yaml similarity index 99% rename from charts/knative-operator/templates/knative-crds.yaml rename to charts/knative-operator-crds/crds/knative-crds.yaml index 7e0ef8f0..40df1937 100644 --- a/charts/knative-operator/templates/knative-crds.yaml +++ b/charts/knative-operator-crds/crds/knative-crds.yaml @@ -1,4 +1,4 @@ -{{- if .Values.crds.install -}} + # Imported from https://github.com/knative/serving/releases/download/knative-v1.16.0/serving-crds.yaml # Copyright 2020 The Knative Authors @@ -8365,7 +8365,7 @@ 
spec: clientConfig: service: name: operator-webhook - namespace: {{ .Release.Namespace }} + namespace: operator-webhook-namespace path: /resource-conversion --- # Copyright 2021 The Knative Authors @@ -10074,6 +10074,6 @@ spec: clientConfig: service: name: operator-webhook - namespace: {{ .Release.Namespace }} + namespace: operator-webhook-namespace path: /resource-conversion - {{- end -}} + diff --git a/charts/knative-operator-crds/values.yaml b/charts/knative-operator-crds/values.yaml new file mode 100644 index 00000000..ed97d539 --- /dev/null +++ b/charts/knative-operator-crds/values.yaml @@ -0,0 +1 @@ +--- diff --git a/charts/knative-operator/templates/_helpers.tpl b/charts/knative-operator/templates/_helpers.tpl index 4e869f4c..4a2a6298 100644 --- a/charts/knative-operator/templates/_helpers.tpl +++ b/charts/knative-operator/templates/_helpers.tpl @@ -4,5 +4,5 @@ to install knative-operator in its own namespace, while still allowing other cha (ie: dataplane) to depend on it, and avoid installing in the Helm release namespace. */}} {{- define "knative-operator.namespace" -}} -{{- default "knative-operator" .Values.namespaceOverride | quote -}} +{{- tpl (default "knative-operator" .Values.namespaceOverride) . | quote -}} {{- end -}} diff --git a/charts/knative-operator/templates/knative-operator.yaml b/charts/knative-operator/templates/knative-operator.yaml index 8db2d0d4..1938e92e 100644 --- a/charts/knative-operator/templates/knative-operator.yaml +++ b/charts/knative-operator/templates/knative-operator.yaml @@ -13,6 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+ +{{- if not .Values.skipNamespaceCreation }} apiVersion: v1 kind: Namespace metadata: @@ -20,6 +22,7 @@ metadata: labels: app.kubernetes.io/name: knative-operator app.kubernetes.io/version: "1.16.0" +{{- end }} --- # Copyright 2022 The Knative Authors @@ -222,9 +225,6 @@ spec: # See the License for the specific language governing permissions and # limitations under the License. -{{- if .Values.single_namespace }} -{{- else }} - apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: @@ -304,12 +304,7 @@ rules: [] # Rules are automatically filled in by the controller manager. # See the License for the specific language governing permissions and # limitations under the License. -{{- end }} -{{- if .Values.single_namespace }} -kind: Role -{{- else }} kind: ClusterRole -{{- end }} apiVersion: rbac.authorization.k8s.io/v1 metadata: name: knative-serving-operator @@ -549,11 +544,7 @@ rules: verbs: - get --- -{{- if .Values.single_namespace }} -kind: Role -{{- else }} kind: ClusterRole -{{- end }} apiVersion: rbac.authorization.k8s.io/v1 metadata: name: knative-eventing-operator @@ -983,11 +974,7 @@ metadata: # TODO: Consider restriction of non-aggregated role to knativeservings namespaces. apiVersion: rbac.authorization.k8s.io/v1 -{{- if .Values.single_namespace }} -kind: RoleBinding -{{- else }} kind: ClusterRoleBinding -{{- end }} metadata: name: knative-serving-operator labels: @@ -995,19 +982,13 @@ metadata: app.kubernetes.io/name: knative-operator roleRef: apiGroup: rbac.authorization.k8s.io - {{- if .Values.single_namespace }} - kind: Role - {{- else }} kind: ClusterRole - {{- end }} name: knative-serving-operator subjects: - kind: ServiceAccount name: knative-operator namespace: {{ include "knative-operator.namespace" . }} --- -{{- if .Values.single_namespace }} -{{- else }} # TODO: Consider restriction of non-aggregated role to knativeeventing namespaces. 
apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding @@ -1024,7 +1005,6 @@ subjects: - kind: ServiceAccount name: knative-operator namespace: {{ include "knative-operator.namespace" . }} -{{- end }} --- # Copyright 2022 The Knative Authors # @@ -1063,11 +1043,7 @@ rules: - "patch" --- apiVersion: rbac.authorization.k8s.io/v1 -{{- if .Values.single_namespace }} -kind: Role -{{- else }} kind: ClusterRole -{{- end }} metadata: name: knative-operator-webhook labels: @@ -1198,11 +1174,7 @@ roleRef: apiGroup: rbac.authorization.k8s.io --- apiVersion: rbac.authorization.k8s.io/v1 -{{- if .Values.single_namespace }} -kind: RoleBinding -{{- else }} kind: ClusterRoleBinding -{{- end }} metadata: name: operator-webhook labels: @@ -1210,11 +1182,7 @@ metadata: app.kubernetes.io/name: knative-operator roleRef: apiGroup: rbac.authorization.k8s.io - {{- if .Values.single_namespace }} - kind: Role - {{- else }} kind: ClusterRole - {{- end }} name: knative-operator-webhook subjects: - kind: ServiceAccount @@ -1235,8 +1203,6 @@ subjects: # See the License for the specific language governing permissions and # limitations under the License. -{{- if .Values.single_namespace }} -{{- else }} apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: @@ -1300,7 +1266,6 @@ subjects: - kind: ServiceAccount name: knative-operator namespace: {{ include "knative-operator.namespace" . }} -{{- end }} --- # Copyright 2019 The Knative Authors # @@ -1324,45 +1289,7 @@ metadata: labels: app.kubernetes.io/version: "1.16.0" app.kubernetes.io/name: knative-operator -data: - _example: | - ################################ - # # - # EXAMPLE CONFIGURATION # - # # - ################################ - - # This block is not actually functional configuration, - # but serves to illustrate the available configuration - # options and document them in a way that is accessible - # to users that `kubectl edit` this config map. 
- # - # These sample configuration options may be copied out of - # this example block and unindented to be in the data block - # to actually change the configuration. - - # Common configuration for all Knative codebase - zap-logger-config: | - { - "level": "info", - "development": false, - "outputPaths": ["stdout"], - "errorOutputPaths": ["stderr"], - "encoding": "json", - "encoderConfig": { - "timeKey": "ts", - "levelKey": "level", - "nameKey": "logger", - "callerKey": "caller", - "messageKey": "msg", - "stacktraceKey": "stacktrace", - "lineEnding": "", - "levelEncoder": "", - "timeEncoder": "iso8601", - "durationEncoder": "", - "callerEncoder": "" - } - } +data: {} --- # Copyright 2019 The Knative Authors @@ -1387,56 +1314,7 @@ metadata: labels: app.kubernetes.io/version: "1.16.0" app.kubernetes.io/name: knative-operator -data: - _example: | - ################################ - # # - # EXAMPLE CONFIGURATION # - # # - ################################ - - # This block is not actually functional configuration, - # but serves to illustrate the available configuration - # options and document them in a way that is accessible - # to users that `kubectl edit` this config map. - # - # These sample configuration options may be copied out of - # this example block and unindented to be in the data block - # to actually change the configuration. - - # logging.enable-var-log-collection defaults to false. - # The fluentd daemon set will be set up to collect /var/log if - # this flag is true. - logging.enable-var-log-collection: false - - # logging.revision-url-template provides a template to use for producing the - # logging URL that is injected into the status of each Revision. - # This value is what you might use the the Knative monitoring bundle, and provides - # access to Kibana after setting up kubectl proxy. 
- logging.revision-url-template: | - http://localhost:8001/api/v1/namespaces/knative-monitoring/services/kibana-logging/proxy/app/kibana#/discover?_a=(query:(match:(kubernetes.labels.serving-knative-dev%2FrevisionUID:(query:'${REVISION_UID}',type:phrase)))) - - # metrics.backend-destination field specifies the system metrics destination. - # It supports either prometheus (the default) or stackdriver. - # Note: Using stackdriver will incur additional charges - metrics.backend-destination: prometheus - - # metrics.request-metrics-backend-destination specifies the request metrics - # destination. If non-empty, it enables queue proxy to send request metrics. - # Currently supported values: prometheus, stackdriver. - metrics.request-metrics-backend-destination: prometheus - - # metrics.stackdriver-project-id field specifies the stackdriver project ID. This - # field is optional. When running on GCE, application default credentials will be - # used if this field is not provided. - metrics.stackdriver-project-id: "" - - # metrics.allow-stackdriver-custom-metrics indicates whether it is allowed to send metrics to - # Stackdriver using "global" resource type and custom metric type if the - # metrics are not supported by "knative_revision" resource type. Setting this - # flag to "true" could cause extra Stackdriver charge. - # If metrics.backend-destination is not Stackdriver, this is ignored. - metrics.allow-stackdriver-custom-metrics: "false" +data: {} --- # Copyright 2020 The Knative Authors diff --git a/charts/knative-operator/values.yaml b/charts/knative-operator/values.yaml index 039b13d1..4d64d434 100644 --- a/charts/knative-operator/values.yaml +++ b/charts/knative-operator/values.yaml @@ -1,3 +1,4 @@ --- -crds: - install: true +# -- Skip creating the Namespace resource. Set to true when the namespace is +# managed by the parent chart (e.g. via --create-namespace or a separate template). 
+skipNamespaceCreation: false diff --git a/charts/sandbox/Chart.yaml b/charts/sandbox/Chart.yaml index 22be3d4b..83f47cce 100644 --- a/charts/sandbox/Chart.yaml +++ b/charts/sandbox/Chart.yaml @@ -3,6 +3,6 @@ name: sandbox description: Deploys extras for sandbox testing. type: application icon: https://i.ibb.co/JxfDQsL/Union-Symbol-yellow-2.png -version: 2026.4.0 +version: 2026.4.3 appVersion: 2026.3.6 kubeVersion: '>= 1.28.0' diff --git a/prometheus/templates/cm.yaml b/prometheus/templates/cm.yaml new file mode 100644 index 00000000..8713bd1e --- /dev/null +++ b/prometheus/templates/cm.yaml @@ -0,0 +1,103 @@ +{{- if (empty .Values.server.configMapOverrideName) -}} +apiVersion: v1 +kind: ConfigMap +metadata: +{{- with .Values.server.configMapAnnotations }} + annotations: + {{- toYaml . | nindent 4 }} +{{- end }} + labels: + {{- include "prometheus.server.labels" . | nindent 4 }} + {{- with .Values.server.extraConfigmapLabels }} + {{- toYaml . | nindent 4 }} + {{- end }} + name: {{ template "prometheus.server.fullname" . }} + namespace: {{ include "prometheus.namespace" . }} +data: + allow-snippet-annotations: "false" +{{- $root := . 
-}} +{{- range $key, $value := .Values.ruleFiles }} + {{ $key }}: {{- toYaml $value | indent 2 }} +{{- end }} +{{- range $key, $value := .Values.serverFiles }} + {{ $key }}: | +{{- if eq $key "prometheus.yml" }} + global: +{{ $root.Values.server.global | toYaml | trimSuffix "\n" | indent 6 }} +{{- if $root.Values.server.remoteWrite }} + remote_write: +{{- include "prometheus.server.remoteWrite" $root | nindent 4 }} +{{- end }} +{{- if $root.Values.server.remoteRead }} + remote_read: +{{- include "prometheus.server.remoteRead" $root | nindent 4 }} +{{- end }} +{{- if or $root.Values.server.tsdb $root.Values.server.exemplars }} + storage: +{{- if $root.Values.server.tsdb }} + tsdb: +{{ $root.Values.server.tsdb | toYaml | indent 8 }} +{{- end }} +{{- if $root.Values.server.exemplars }} + exemplars: +{{ $root.Values.server.exemplars | toYaml | indent 8 }} +{{- end }} +{{- end }} +{{- if $root.Values.scrapeConfigFiles }} + scrape_config_files: +{{ toYaml $root.Values.scrapeConfigFiles | indent 4 }} +{{- end }} +{{- end }} +{{- if eq $key "alerts" }} +{{- if and (not (empty $value)) (empty $value.groups) }} + groups: +{{- range $ruleKey, $ruleValue := $value }} + - name: {{ $ruleKey -}}.rules + rules: +{{ $ruleValue | toYaml | trimSuffix "\n" | indent 6 }} +{{- end }} +{{- else }} +{{ toYaml $value | indent 4 }} +{{- end }} +{{- else }} +{{ toYaml $value | default "{}" | indent 4 }} +{{- end }} +{{- if eq $key "prometheus.yml" -}} +{{- if $root.Values.extraScrapeConfigs }} +{{ tpl $root.Values.extraScrapeConfigs $root | indent 4 }} +{{- end -}} +{{- if or ($root.Values.alertmanager.enabled) ($root.Values.server.alertmanagers) }} + alerting: +{{- if $root.Values.alertRelabelConfigs }} +{{ $root.Values.alertRelabelConfigs | toYaml | trimSuffix "\n" | indent 6 }} +{{- end }} + alertmanagers: +{{- if $root.Values.server.alertmanagers }} +{{ toYaml $root.Values.server.alertmanagers | indent 8 }} +{{- else }} + - kubernetes_sd_configs: + - role: pod + tls_config: + ca_file: 
/var/run/secrets/kubernetes.io/serviceaccount/ca.crt + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + {{- if $root.Values.alertmanager.prefixURL }} + path_prefix: {{ $root.Values.alertmanager.prefixURL }} + {{- end }} + relabel_configs: + - source_labels: [__meta_kubernetes_namespace] + regex: {{ $root.Release.Namespace }} + action: keep + - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_instance] + regex: {{ $root.Release.Name }} + action: keep + - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name] + regex: {{ default "alertmanager" $root.Values.alertmanager.nameOverride | trunc 63 | trimSuffix "-" }} + action: keep + - source_labels: [__meta_kubernetes_pod_container_port_number] + regex: "9093" + action: keep +{{- end -}} +{{- end -}} +{{- end -}} +{{- end -}} +{{- end -}} diff --git a/tests/generated/controlplane.aws.billing-enable.yaml b/tests/generated/controlplane.aws.billing-enable.yaml index 68764b46..f5ced35a 100644 --- a/tests/generated/controlplane.aws.billing-enable.yaml +++ b/tests/generated/controlplane.aws.billing-enable.yaml @@ -37,7 +37,7 @@ kind: PodDisruptionBudget metadata: name: unionconsole labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: unionconsole app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -213,7 +213,7 @@ metadata: labels: app.kubernetes.io/name: cacheservice app.kubernetes.io/instance: release-name - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/managed-by: Helm --- # Source: controlplane/templates/console/serviceaccount.yaml @@ -222,7 +222,7 @@ kind: ServiceAccount metadata: name: unionconsole labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: unionconsole app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -235,7 +235,7 @@ kind: ServiceAccount 
metadata: name: authorizer labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: authorizer app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -247,7 +247,7 @@ kind: ServiceAccount metadata: name: cluster labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: cluster app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -259,7 +259,7 @@ kind: ServiceAccount metadata: name: dataproxy labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: dataproxy app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -271,7 +271,7 @@ kind: ServiceAccount metadata: name: executions labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: executions app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -283,7 +283,7 @@ kind: ServiceAccount metadata: name: queue labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: queue app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -295,7 +295,7 @@ kind: ServiceAccount metadata: name: run-scheduler labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: run-scheduler app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -307,12 +307,25 @@ kind: ServiceAccount metadata: name: usage labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: usage app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" app.kubernetes.io/managed-by: Helm --- +# Source: controlplane/templates/union-serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: union + namespace: union + 
labels: + helm.sh/chart: controlplane-2026.4.4 + app.kubernetes.io/name: union + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm +--- # Source: controlplane/charts/flyte/templates/admin/secret.yaml apiVersion: v1 kind: Secret @@ -529,7 +542,7 @@ metadata: labels: app.kubernetes.io/name: cacheservice app.kubernetes.io/instance: release-name - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/managed-by: Helm data: db.yaml: | @@ -600,7 +613,7 @@ kind: ConfigMap metadata: name: authorizer labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: authorizer app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -664,7 +677,7 @@ kind: ConfigMap metadata: name: cluster labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: cluster app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -739,7 +752,7 @@ kind: ConfigMap metadata: name: dataproxy labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: dataproxy app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -802,7 +815,7 @@ kind: ConfigMap metadata: name: executions labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: executions app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -886,7 +899,7 @@ kind: ConfigMap metadata: name: queue labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: queue app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -955,7 +968,7 @@ kind: ConfigMap metadata: name: run-scheduler labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 
app.kubernetes.io/name: run-scheduler app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -1024,7 +1037,7 @@ kind: ConfigMap metadata: name: usage labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: usage app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -5497,7 +5510,7 @@ metadata: labels: app.kubernetes.io/name: flyteadmin app.kubernetes.io/instance: release-name - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 #app.kubernetes.io/managed-by: Helm rules: - apiGroups: @@ -5528,7 +5541,7 @@ metadata: labels: app.kubernetes.io/name: flyteadmin app.kubernetes.io/instance: release-name - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 #app.kubernetes.io/managed-by: Helm roleRef: apiGroup: rbac.authorization.k8s.io @@ -5626,7 +5639,7 @@ metadata: platform.union.ai/prometheus-group: "union-services" app.kubernetes.io/name: cacheservice app.kubernetes.io/instance: release-name - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/managed-by: Helm spec: type: ClusterIP @@ -5653,7 +5666,7 @@ metadata: name: unionconsole labels: platform.union.ai/prometheus-group: "union-services" - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: unionconsole app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -5681,7 +5694,7 @@ metadata: name: authorizer labels: platform.union.ai/prometheus-group: "union-services" - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: authorizer app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -5720,7 +5733,7 @@ metadata: name: cluster labels: platform.union.ai/prometheus-group: "union-services" - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 
app.kubernetes.io/name: cluster app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -5759,7 +5772,7 @@ metadata: name: dataproxy labels: platform.union.ai/prometheus-group: "union-services" - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: dataproxy app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -5794,7 +5807,7 @@ metadata: name: executions labels: platform.union.ai/prometheus-group: "union-services" - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: executions app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -5829,7 +5842,7 @@ metadata: name: queue labels: platform.union.ai/prometheus-group: "union-services" - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: queue app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -5864,7 +5877,7 @@ metadata: name: run-scheduler labels: platform.union.ai/prometheus-group: "union-services" - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: run-scheduler app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -5899,7 +5912,7 @@ metadata: name: usage labels: platform.union.ai/prometheus-group: "union-services" - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: usage app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -6323,7 +6336,7 @@ metadata: labels: app.kubernetes.io/name: cacheservice app.kubernetes.io/instance: release-name - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/managed-by: Helm spec: replicas: 1 @@ -6334,7 +6347,7 @@ spec: template: metadata: annotations: - configChecksum: "90393bcce1a5d37998b4ff6af8ccb8f77098677625ecdfc4f09de9dc84e6d9e" 
+ configChecksum: "901f295ac8f6c4f6cbfaa6c64f065e0d8a7e38f6742449c52fe322c5cfd52e8" linkerd.io/inject: disabled prometheus.io/path: /metrics prometheus.io/port: "10254" @@ -6343,7 +6356,7 @@ spec: app.kubernetes.io/name: cacheservice app.kubernetes.io/instance: release-name - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/managed-by: Helm spec: securityContext: @@ -6435,7 +6448,7 @@ kind: Deployment metadata: name: unionconsole labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: unionconsole app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -6500,7 +6513,7 @@ kind: Deployment metadata: name: authorizer labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: authorizer app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -6613,7 +6626,7 @@ kind: Deployment metadata: name: cluster labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: cluster app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -6742,7 +6755,7 @@ kind: Deployment metadata: name: dataproxy labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: dataproxy app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -6852,7 +6865,7 @@ kind: Deployment metadata: name: executions labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: executions app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -6978,7 +6991,7 @@ kind: Deployment metadata: name: queue labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: queue app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ 
-7105,7 +7118,7 @@ kind: Deployment metadata: name: run-scheduler labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: run-scheduler app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -7232,7 +7245,7 @@ kind: Deployment metadata: name: usage labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: usage app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -7378,7 +7391,7 @@ kind: HorizontalPodAutoscaler metadata: name: unionconsole labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: unionconsole app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" diff --git a/tests/generated/controlplane.aws.yaml b/tests/generated/controlplane.aws.yaml index 3c594008..72217e79 100644 --- a/tests/generated/controlplane.aws.yaml +++ b/tests/generated/controlplane.aws.yaml @@ -37,7 +37,7 @@ kind: PodDisruptionBudget metadata: name: unionconsole labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: unionconsole app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -213,7 +213,7 @@ metadata: labels: app.kubernetes.io/name: cacheservice app.kubernetes.io/instance: release-name - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/managed-by: Helm --- # Source: controlplane/templates/console/serviceaccount.yaml @@ -222,7 +222,7 @@ kind: ServiceAccount metadata: name: unionconsole labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: unionconsole app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -235,7 +235,7 @@ kind: ServiceAccount metadata: name: authorizer labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: 
controlplane-2026.4.4 app.kubernetes.io/name: authorizer app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -247,7 +247,7 @@ kind: ServiceAccount metadata: name: cluster labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: cluster app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -259,7 +259,7 @@ kind: ServiceAccount metadata: name: dataproxy labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: dataproxy app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -271,7 +271,7 @@ kind: ServiceAccount metadata: name: executions labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: executions app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -283,7 +283,7 @@ kind: ServiceAccount metadata: name: queue labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: queue app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -295,7 +295,7 @@ kind: ServiceAccount metadata: name: run-scheduler labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: run-scheduler app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -307,12 +307,25 @@ kind: ServiceAccount metadata: name: usage labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: usage app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" app.kubernetes.io/managed-by: Helm --- +# Source: controlplane/templates/union-serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: union + namespace: union + labels: + helm.sh/chart: controlplane-2026.4.4 + app.kubernetes.io/name: union + 
app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm +--- # Source: controlplane/charts/flyte/templates/admin/secret.yaml apiVersion: v1 kind: Secret @@ -529,7 +542,7 @@ metadata: labels: app.kubernetes.io/name: cacheservice app.kubernetes.io/instance: release-name - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/managed-by: Helm data: db.yaml: | @@ -600,7 +613,7 @@ kind: ConfigMap metadata: name: authorizer labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: authorizer app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -664,7 +677,7 @@ kind: ConfigMap metadata: name: cluster labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: cluster app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -739,7 +752,7 @@ kind: ConfigMap metadata: name: dataproxy labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: dataproxy app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -802,7 +815,7 @@ kind: ConfigMap metadata: name: executions labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: executions app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -886,7 +899,7 @@ kind: ConfigMap metadata: name: queue labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: queue app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -955,7 +968,7 @@ kind: ConfigMap metadata: name: run-scheduler labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: run-scheduler app.kubernetes.io/instance: release-name 
app.kubernetes.io/version: "2026.4.5" @@ -1024,7 +1037,7 @@ kind: ConfigMap metadata: name: usage labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: usage app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -5497,7 +5510,7 @@ metadata: labels: app.kubernetes.io/name: flyteadmin app.kubernetes.io/instance: release-name - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 #app.kubernetes.io/managed-by: Helm rules: - apiGroups: @@ -5528,7 +5541,7 @@ metadata: labels: app.kubernetes.io/name: flyteadmin app.kubernetes.io/instance: release-name - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 #app.kubernetes.io/managed-by: Helm roleRef: apiGroup: rbac.authorization.k8s.io @@ -5626,7 +5639,7 @@ metadata: platform.union.ai/prometheus-group: "union-services" app.kubernetes.io/name: cacheservice app.kubernetes.io/instance: release-name - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/managed-by: Helm spec: type: ClusterIP @@ -5653,7 +5666,7 @@ metadata: name: unionconsole labels: platform.union.ai/prometheus-group: "union-services" - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: unionconsole app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -5681,7 +5694,7 @@ metadata: name: authorizer labels: platform.union.ai/prometheus-group: "union-services" - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: authorizer app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -5720,7 +5733,7 @@ metadata: name: cluster labels: platform.union.ai/prometheus-group: "union-services" - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: cluster app.kubernetes.io/instance: release-name 
app.kubernetes.io/version: "2026.4.5" @@ -5759,7 +5772,7 @@ metadata: name: dataproxy labels: platform.union.ai/prometheus-group: "union-services" - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: dataproxy app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -5794,7 +5807,7 @@ metadata: name: executions labels: platform.union.ai/prometheus-group: "union-services" - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: executions app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -5829,7 +5842,7 @@ metadata: name: queue labels: platform.union.ai/prometheus-group: "union-services" - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: queue app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -5864,7 +5877,7 @@ metadata: name: run-scheduler labels: platform.union.ai/prometheus-group: "union-services" - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: run-scheduler app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -5899,7 +5912,7 @@ metadata: name: usage labels: platform.union.ai/prometheus-group: "union-services" - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: usage app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -6323,7 +6336,7 @@ metadata: labels: app.kubernetes.io/name: cacheservice app.kubernetes.io/instance: release-name - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/managed-by: Helm spec: replicas: 1 @@ -6334,7 +6347,7 @@ spec: template: metadata: annotations: - configChecksum: "90393bcce1a5d37998b4ff6af8ccb8f77098677625ecdfc4f09de9dc84e6d9e" + configChecksum: 
"901f295ac8f6c4f6cbfaa6c64f065e0d8a7e38f6742449c52fe322c5cfd52e8" linkerd.io/inject: disabled prometheus.io/path: /metrics prometheus.io/port: "10254" @@ -6344,7 +6357,7 @@ spec: app.kubernetes.io/name: cacheservice app.kubernetes.io/instance: release-name - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/managed-by: Helm spec: securityContext: @@ -6436,7 +6449,7 @@ kind: Deployment metadata: name: unionconsole labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: unionconsole app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -6502,7 +6515,7 @@ kind: Deployment metadata: name: authorizer labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: authorizer app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -6616,7 +6629,7 @@ kind: Deployment metadata: name: cluster labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: cluster app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -6746,7 +6759,7 @@ kind: Deployment metadata: name: dataproxy labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: dataproxy app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -6857,7 +6870,7 @@ kind: Deployment metadata: name: executions labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: executions app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -6984,7 +6997,7 @@ kind: Deployment metadata: name: queue labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: queue app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -7112,7 +7125,7 @@ 
kind: Deployment metadata: name: run-scheduler labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: run-scheduler app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -7240,7 +7253,7 @@ kind: Deployment metadata: name: usage labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: usage app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -7381,7 +7394,7 @@ kind: HorizontalPodAutoscaler metadata: name: unionconsole labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: unionconsole app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" diff --git a/tests/generated/controlplane.external-authz.yaml b/tests/generated/controlplane.external-authz.yaml index 83d3df9e..7709ae58 100644 --- a/tests/generated/controlplane.external-authz.yaml +++ b/tests/generated/controlplane.external-authz.yaml @@ -37,7 +37,7 @@ kind: PodDisruptionBudget metadata: name: unionconsole labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: unionconsole app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -211,7 +211,7 @@ metadata: labels: app.kubernetes.io/name: cacheservice app.kubernetes.io/instance: release-name - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/managed-by: Helm --- # Source: controlplane/templates/console/serviceaccount.yaml @@ -220,7 +220,7 @@ kind: ServiceAccount metadata: name: unionconsole labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: unionconsole app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -233,7 +233,7 @@ kind: ServiceAccount metadata: name: authorizer labels: - helm.sh/chart: controlplane-2026.4.2 
+ helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: authorizer app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -245,7 +245,7 @@ kind: ServiceAccount metadata: name: cluster labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: cluster app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -257,7 +257,7 @@ kind: ServiceAccount metadata: name: dataproxy labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: dataproxy app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -269,7 +269,7 @@ kind: ServiceAccount metadata: name: executions labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: executions app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -281,7 +281,7 @@ kind: ServiceAccount metadata: name: queue labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: queue app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -293,7 +293,7 @@ kind: ServiceAccount metadata: name: run-scheduler labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: run-scheduler app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -305,12 +305,25 @@ kind: ServiceAccount metadata: name: usage labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: usage app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" app.kubernetes.io/managed-by: Helm --- +# Source: controlplane/templates/union-serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: union + namespace: union + labels: + helm.sh/chart: controlplane-2026.4.4 + app.kubernetes.io/name: 
union + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm +--- # Source: controlplane/charts/flyte/templates/admin/secret.yaml apiVersion: v1 kind: Secret @@ -530,7 +543,7 @@ metadata: labels: app.kubernetes.io/name: cacheservice app.kubernetes.io/instance: release-name - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/managed-by: Helm data: db.yaml: | @@ -601,7 +614,7 @@ kind: ConfigMap metadata: name: authorizer labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: authorizer app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -669,7 +682,7 @@ kind: ConfigMap metadata: name: cluster labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: cluster app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -744,7 +757,7 @@ kind: ConfigMap metadata: name: dataproxy labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: dataproxy app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -807,7 +820,7 @@ kind: ConfigMap metadata: name: executions labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: executions app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -891,7 +904,7 @@ kind: ConfigMap metadata: name: queue labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: queue app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -960,7 +973,7 @@ kind: ConfigMap metadata: name: run-scheduler labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: run-scheduler app.kubernetes.io/instance: release-name 
app.kubernetes.io/version: "2026.4.5" @@ -1029,7 +1042,7 @@ kind: ConfigMap metadata: name: usage labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: usage app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -5502,7 +5515,7 @@ metadata: labels: app.kubernetes.io/name: flyteadmin app.kubernetes.io/instance: release-name - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 #app.kubernetes.io/managed-by: Helm rules: - apiGroups: @@ -5533,7 +5546,7 @@ metadata: labels: app.kubernetes.io/name: flyteadmin app.kubernetes.io/instance: release-name - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 #app.kubernetes.io/managed-by: Helm roleRef: apiGroup: rbac.authorization.k8s.io @@ -5631,7 +5644,7 @@ metadata: platform.union.ai/prometheus-group: "union-services" app.kubernetes.io/name: cacheservice app.kubernetes.io/instance: release-name - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/managed-by: Helm spec: type: ClusterIP @@ -5658,7 +5671,7 @@ metadata: name: unionconsole labels: platform.union.ai/prometheus-group: "union-services" - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: unionconsole app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -5686,7 +5699,7 @@ metadata: name: authorizer labels: platform.union.ai/prometheus-group: "union-services" - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: authorizer app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -5725,7 +5738,7 @@ metadata: name: cluster labels: platform.union.ai/prometheus-group: "union-services" - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: cluster app.kubernetes.io/instance: release-name 
app.kubernetes.io/version: "2026.4.5" @@ -5764,7 +5777,7 @@ metadata: name: dataproxy labels: platform.union.ai/prometheus-group: "union-services" - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: dataproxy app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -5799,7 +5812,7 @@ metadata: name: executions labels: platform.union.ai/prometheus-group: "union-services" - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: executions app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -5834,7 +5847,7 @@ metadata: name: queue labels: platform.union.ai/prometheus-group: "union-services" - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: queue app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -5869,7 +5882,7 @@ metadata: name: run-scheduler labels: platform.union.ai/prometheus-group: "union-services" - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: run-scheduler app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -5904,7 +5917,7 @@ metadata: name: usage labels: platform.union.ai/prometheus-group: "union-services" - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: usage app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -6328,7 +6341,7 @@ metadata: labels: app.kubernetes.io/name: cacheservice app.kubernetes.io/instance: release-name - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/managed-by: Helm spec: replicas: 1 @@ -6339,7 +6352,7 @@ spec: template: metadata: annotations: - configChecksum: "390923c3be4a0a64d53eb00c02ada994ed2f8e2627648a41590292af9fe52c6" + configChecksum: 
"354a5b5874189f03212da83b39dd4658ac59d1514f2f03b03abf93633429bbd" linkerd.io/inject: disabled prometheus.io/path: /metrics prometheus.io/port: "10254" @@ -6348,7 +6361,7 @@ spec: app.kubernetes.io/name: cacheservice app.kubernetes.io/instance: release-name - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/managed-by: Helm spec: securityContext: @@ -6440,7 +6453,7 @@ kind: Deployment metadata: name: unionconsole labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: unionconsole app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -6505,7 +6518,7 @@ kind: Deployment metadata: name: authorizer labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: authorizer app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -6618,7 +6631,7 @@ kind: Deployment metadata: name: cluster labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: cluster app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -6747,7 +6760,7 @@ kind: Deployment metadata: name: dataproxy labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: dataproxy app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -6857,7 +6870,7 @@ kind: Deployment metadata: name: executions labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: executions app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -6983,7 +6996,7 @@ kind: Deployment metadata: name: queue labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: queue app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -7110,7 +7123,7 @@ 
kind: Deployment metadata: name: run-scheduler labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: run-scheduler app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -7237,7 +7250,7 @@ kind: Deployment metadata: name: usage labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: usage app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -7383,7 +7396,7 @@ kind: HorizontalPodAutoscaler metadata: name: unionconsole labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: unionconsole app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" diff --git a/tests/generated/controlplane.userclouds.yaml b/tests/generated/controlplane.userclouds.yaml index 55cab6f0..6d1860dc 100644 --- a/tests/generated/controlplane.userclouds.yaml +++ b/tests/generated/controlplane.userclouds.yaml @@ -37,7 +37,7 @@ kind: PodDisruptionBudget metadata: name: unionconsole labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: unionconsole app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -213,7 +213,7 @@ metadata: labels: app.kubernetes.io/name: cacheservice app.kubernetes.io/instance: release-name - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/managed-by: Helm --- # Source: controlplane/templates/console/serviceaccount.yaml @@ -222,7 +222,7 @@ kind: ServiceAccount metadata: name: unionconsole labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: unionconsole app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -235,7 +235,7 @@ kind: ServiceAccount metadata: name: authorizer labels: - helm.sh/chart: controlplane-2026.4.2 + 
helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: authorizer app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -247,7 +247,7 @@ kind: ServiceAccount metadata: name: cluster labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: cluster app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -259,7 +259,7 @@ kind: ServiceAccount metadata: name: dataproxy labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: dataproxy app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -271,7 +271,7 @@ kind: ServiceAccount metadata: name: executions labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: executions app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -283,7 +283,7 @@ kind: ServiceAccount metadata: name: queue labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: queue app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -295,7 +295,7 @@ kind: ServiceAccount metadata: name: run-scheduler labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: run-scheduler app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -307,12 +307,25 @@ kind: ServiceAccount metadata: name: usage labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: usage app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" app.kubernetes.io/managed-by: Helm --- +# Source: controlplane/templates/union-serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: union + namespace: union + labels: + helm.sh/chart: controlplane-2026.4.4 + app.kubernetes.io/name: 
union + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm +--- # Source: controlplane/charts/flyte/templates/admin/secret.yaml apiVersion: v1 kind: Secret @@ -529,7 +542,7 @@ metadata: labels: app.kubernetes.io/name: cacheservice app.kubernetes.io/instance: release-name - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/managed-by: Helm data: db.yaml: | @@ -600,7 +613,7 @@ kind: ConfigMap metadata: name: authorizer labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: authorizer app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -664,7 +677,7 @@ kind: ConfigMap metadata: name: cluster labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: cluster app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -739,7 +752,7 @@ kind: ConfigMap metadata: name: dataproxy labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: dataproxy app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -802,7 +815,7 @@ kind: ConfigMap metadata: name: executions labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: executions app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -886,7 +899,7 @@ kind: ConfigMap metadata: name: queue labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: queue app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -955,7 +968,7 @@ kind: ConfigMap metadata: name: run-scheduler labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: run-scheduler app.kubernetes.io/instance: release-name 
app.kubernetes.io/version: "2026.4.5" @@ -1024,7 +1037,7 @@ kind: ConfigMap metadata: name: usage labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: usage app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -5497,7 +5510,7 @@ metadata: labels: app.kubernetes.io/name: flyteadmin app.kubernetes.io/instance: release-name - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 #app.kubernetes.io/managed-by: Helm rules: - apiGroups: @@ -5528,7 +5541,7 @@ metadata: labels: app.kubernetes.io/name: flyteadmin app.kubernetes.io/instance: release-name - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 #app.kubernetes.io/managed-by: Helm roleRef: apiGroup: rbac.authorization.k8s.io @@ -5626,7 +5639,7 @@ metadata: platform.union.ai/prometheus-group: "union-services" app.kubernetes.io/name: cacheservice app.kubernetes.io/instance: release-name - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/managed-by: Helm spec: type: ClusterIP @@ -5653,7 +5666,7 @@ metadata: name: unionconsole labels: platform.union.ai/prometheus-group: "union-services" - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: unionconsole app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -5681,7 +5694,7 @@ metadata: name: authorizer labels: platform.union.ai/prometheus-group: "union-services" - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: authorizer app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -5720,7 +5733,7 @@ metadata: name: cluster labels: platform.union.ai/prometheus-group: "union-services" - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: cluster app.kubernetes.io/instance: release-name 
app.kubernetes.io/version: "2026.4.5" @@ -5759,7 +5772,7 @@ metadata: name: dataproxy labels: platform.union.ai/prometheus-group: "union-services" - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: dataproxy app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -5794,7 +5807,7 @@ metadata: name: executions labels: platform.union.ai/prometheus-group: "union-services" - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: executions app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -5829,7 +5842,7 @@ metadata: name: queue labels: platform.union.ai/prometheus-group: "union-services" - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: queue app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -5864,7 +5877,7 @@ metadata: name: run-scheduler labels: platform.union.ai/prometheus-group: "union-services" - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: run-scheduler app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -5899,7 +5912,7 @@ metadata: name: usage labels: platform.union.ai/prometheus-group: "union-services" - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: usage app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -6323,7 +6336,7 @@ metadata: labels: app.kubernetes.io/name: cacheservice app.kubernetes.io/instance: release-name - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/managed-by: Helm spec: replicas: 1 @@ -6334,7 +6347,7 @@ spec: template: metadata: annotations: - configChecksum: "90393bcce1a5d37998b4ff6af8ccb8f77098677625ecdfc4f09de9dc84e6d9e" + configChecksum: 
"901f295ac8f6c4f6cbfaa6c64f065e0d8a7e38f6742449c52fe322c5cfd52e8" linkerd.io/inject: disabled prometheus.io/path: /metrics prometheus.io/port: "10254" @@ -6343,7 +6356,7 @@ spec: app.kubernetes.io/name: cacheservice app.kubernetes.io/instance: release-name - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/managed-by: Helm spec: securityContext: @@ -6435,7 +6448,7 @@ kind: Deployment metadata: name: unionconsole labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: unionconsole app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -6500,7 +6513,7 @@ kind: Deployment metadata: name: authorizer labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: authorizer app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -6613,7 +6626,7 @@ kind: Deployment metadata: name: cluster labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: cluster app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -6742,7 +6755,7 @@ kind: Deployment metadata: name: dataproxy labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: dataproxy app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -6852,7 +6865,7 @@ kind: Deployment metadata: name: executions labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: executions app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -6978,7 +6991,7 @@ kind: Deployment metadata: name: queue labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: queue app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -7105,7 +7118,7 @@ 
kind: Deployment metadata: name: run-scheduler labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: run-scheduler app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -7232,7 +7245,7 @@ kind: Deployment metadata: name: usage labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: usage app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" @@ -7372,7 +7385,7 @@ kind: HorizontalPodAutoscaler metadata: name: unionconsole labels: - helm.sh/chart: controlplane-2026.4.2 + helm.sh/chart: controlplane-2026.4.4 app.kubernetes.io/name: unionconsole app.kubernetes.io/instance: release-name app.kubernetes.io/version: "2026.4.5" diff --git a/tests/generated/dataplane.additional-podlabels.yaml b/tests/generated/dataplane.additional-podlabels.yaml index e03f03d4..e4257c42 100644 --- a/tests/generated/dataplane.additional-podlabels.yaml +++ b/tests/generated/dataplane.additional-podlabels.yaml @@ -1,155 +1,139 @@ --- -# Source: dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: flytesnacks-development ---- -# Source: dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: flytesnacks-staging ---- -# Source: dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: flytesnacks-production ---- -# Source: dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: union-health-monitoring-development ---- -# Source: dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: union-health-monitoring-staging ---- -# Source: dataplane/templates/common/namespaces.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the 
"License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + apiVersion: v1 -kind: Namespace +kind: ServiceAccount metadata: - name: union-health-monitoring-production + name: knative-operator + namespace: "union" + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator --- -# Source: dataplane/charts/fluentbit/templates/serviceaccount.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ apiVersion: v1 kind: ServiceAccount metadata: - name: fluentbit-system - namespace: union + name: operator-webhook + namespace: "union" labels: - helm.sh/chart: fluentbit-0.48.9 - app.kubernetes.io/name: fluentbit - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "3.2.8" - app.kubernetes.io/managed-by: Helm + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator --- -# Source: dataplane/charts/kube-state-metrics/templates/serviceaccount.yaml +# Source: dataplane/charts/prometheus/charts/kube-state-metrics/templates/serviceaccount.yaml apiVersion: v1 kind: ServiceAccount automountServiceAccountToken: true metadata: labels: - helm.sh/chart: kube-state-metrics-5.30.1 + helm.sh/chart: kube-state-metrics-5.25.1 app.kubernetes.io/managed-by: Helm app.kubernetes.io/component: metrics app.kubernetes.io/part-of: kube-state-metrics app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" + app.kubernetes.io/version: "2.13.0" name: release-name-kube-state-metrics namespace: union --- -# Source: dataplane/charts/opencost/templates/serviceaccount.yaml +# Source: dataplane/charts/prometheus/templates/serviceaccount.yaml apiVersion: v1 kind: ServiceAccount metadata: - name: release-name-opencost - namespace: union labels: - helm.sh/chart: opencost-1.42.0 - app.kubernetes.io/name: opencost + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "1.111.0" - app.kubernetes.io/part-of: opencost + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 app.kubernetes.io/managed-by: Helm -automountServiceAccountToken: true ---- -# Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: union-clustersync-system + app.kubernetes.io/part-of: prometheus + name: union-operator-prometheus namespace: union + 
annotations: + {} --- -# Source: dataplane/templates/imagebuilder/serviceaccount.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: union-imagebuilder ---- -# Source: dataplane/templates/nodeexecutor/serviceaccount.yaml +# Source: dataplane/templates/common/system-serviceaccount.yaml apiVersion: v1 kind: ServiceAccount metadata: - name: executor + name: union-system namespace: union - labels: - app: executor ---- -# Source: dataplane/templates/operator/serviceaccount-proxy.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: proxy-system - labels: - app.kubernetes.io/name: operator-proxy - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm + annotations: --- -# Source: dataplane/templates/operator/serviceaccount.yaml +# Source: dataplane/templates/common/union-serviceaccount.yaml apiVersion: v1 kind: ServiceAccount metadata: - name: operator-system - labels: - app.kubernetes.io/name: union-operator - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm + name: union + namespace: union + annotations: + eks.amazonaws.com/role-arn: arn:aws:iam::ACCOUNT_ID:role/flyte_project_role +automountServiceAccountToken: true --- -# Source: dataplane/templates/prometheus/serviceaccount.yaml +# Source: dataplane/templates/flyteconnector/serviceaccount.yaml apiVersion: v1 kind: ServiceAccount metadata: - name: union-operator-prometheus + name: flyteconnector namespace: union - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane + labels: + app.kubernetes.io/name: flyteconnector app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus ---- -# Source: dataplane/templates/propeller/serviceaccount-webhook.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: 
flytepropeller-webhook-system - namespace: union --- -# Source: dataplane/templates/propeller/serviceaccount.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + apiVersion: v1 -kind: ServiceAccount +kind: Secret metadata: - name: flytepropeller-system - namespace: union + name: operator-webhook-certs + namespace: "union" + labels: + app.kubernetes.io/component: operator-webhook + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +# The data is populated at install time. 
--- # Source: dataplane/templates/common/auth-secret.yaml apiVersion: v1 @@ -172,151 +156,533 @@ type: Opaque data: cluster_name: dW5pb24tdGVzdA== --- -# Source: dataplane/templates/propeller/deployment-webhook.yaml -# Create an empty secret that the first propeller pod will populate +# Source: dataplane/templates/webhook/mutatingwebhookconfiguration.yaml apiVersion: v1 kind: Secret metadata: - name: flyte-pod-webhook + name: union-pod-webhook namespace: union + labels: + app.kubernetes.io/name: union-pod-webhook + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm type: Opaque +data: + ca.crt: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUROekNDQWgrZ0F3SUJBZ0lVSm9kQ09lem4vd1BHQTZjYkw1U3duSlpYT2hJd0RRWUpLb1pJaHZjTkFRRUwKQlFBd0t6RXBNQ2NHQTFVRUF3d2dabXg1ZEdWd2NtOXdaV3hzWlhJdGQyVmlhRzl2YXk1MWJtbHZiaTV6ZG1NdwpIaGNOTWpZd01USXpNREl6TmpVMVdoY05Nell3TVRJeE1ESXpOalUxV2pBck1Ta3dKd1lEVlFRRERDQm1iSGwwClpYQnliM0JsYkd4bGNpMTNaV0pvYjI5ckxuVnVhVzl1TG5OMll6Q0NBU0l3RFFZSktvWklodmNOQVFFQkJRQUQKZ2dFUEFEQ0NBUW9DZ2dFQkFJcjBSWDRGRjFNVTNYdFBmUHErMDUrMGI1Qm0xK3hsTVR5ZUp4bG5mWFJuRlgyegpPcHV6NmsreUpBQk4xRkxmNDVYRnlJclFlaW5DWWJtckZXalFRc3ZDWEloNkpPNGMwdmRMVVU5RERYcldWRFlsCkRNZytTUDlJVnZBMm5USkVFZ2RFREVsKzNWeThUbUJoNlNmcGthcXdHajY5SmhIdy9OeE9yZzM2bUY3TU5VZ1MKWXJVUkxqUFFzTUN6NElTVzhhQnEzS2IwRUFWUTA4V1lZQ0ZEbjY1aHBzVFdxR05rL3BKRnVTYjlsZll6RVdyYwpJeW5wNDhqTnIvNUdKS3NnWG9LeTUwZVo4UkppeTVyOUhENGtnSW15TVNQR2RDMXFZSVppRHRlZW8yOElxNk5nCmd1dWcyTGhlNndYYW9YZXFCQ2d3SHpwWFVXM1hJVzI5eFoxUkFRRUNBd0VBQWFOVE1GRXdIUVlEVlIwT0JCWUUKRktITDQ1RWhQaVg5bnlRN241QUk0YUZVNFlCL01COEdBMVVkSXdRWU1CYUFGS0hMNDVFaFBpWDlueVE3bjVBSQo0YUZVNFlCL01BOEdBMVVkRXdFQi93UUZNQU1CQWY4d0RRWUpLb1pJaHZjTkFRRUxCUUFEZ2dFQkFEeTRac21XCkNjZ3B5TmI4cU1ydzNYclBNejZLMTRlTVhqQVk1bERFYXM1NklLZExoZFM1bi9sWk95bENuUUtOZ3RHS0VrY3UKT05tMEE2Z0xNT3UzcE43dG9aNXh0WktOcnUvS0x6dm44QUNNNndCbUR0bG5oRERDa3ZUTHBVRzh6UTZSNmlaegpzTWlhVmpLSUJiWkJsZXNYSHZZQnQ2YlRTN2o1QWM2aTdYc1RPK3ZqQ2RUQjVYdCtEZ0x2QnB
JRWh2ZGJzNXhmClFtc3ZzQmwxenJnR3JXemo4d0txcXNUMzZaOVU1TzdSK1RJMlQrUDVOR25SeHNQOGJ0R0RtMGZ5SW55WlJrSkcKVTdpVFJyMlZRWlRyaTNZQ2dlVjN1UjFtMnJjc2hyU1Avb082T0QyTXR5Q1FMRWtFK1pxVUVJVnpSYmVaNXl2RQptM29YMFV2OUtBUy9pWTA9Ci0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K + tls.crt: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUQxakNDQXI2Z0F3SUJBZ0lVREtRN3A4MFpIdkQyUE5Ya3p3VGRLem9iUjg4d0RRWUpLb1pJaHZjTkFRRUwKQlFBd0t6RXBNQ2NHQTFVRUF3d2dabXg1ZEdWd2NtOXdaV3hzWlhJdGQyVmlhRzl2YXk1MWJtbHZiaTV6ZG1NdwpIaGNOTWpZd01USXpNREl6TmpVMVdoY05NamN3TVRJek1ESXpOalUxV2pBck1Ta3dKd1lEVlFRRERDQm1iSGwwClpYQnliM0JsYkd4bGNpMTNaV0pvYjI5ckxuVnVhVzl1TG5OMll6Q0NBU0l3RFFZSktvWklodmNOQVFFQkJRQUQKZ2dFUEFEQ0NBUW9DZ2dFQkFMNnMxUTRaZ3ZxVVdNdFp5VC9EKytEQWl6WmR0Q1lkVzArSGk4S3phQ1Vjbi9zZwo3a21NTEhDM2ZlZWtZK2ZUNENHUTd6K1l2WjRWWSt6WGljKzdrNndLbUZqdm1acEM1Rm96OWo1Q21NRk5HZmlWCnRobzVvSU9MUkIzNXRYMU1ROHUwNFVzT29QcjdMK016eEdxWXVmTFdtN1NVdjh0eEp3ZFMxZVlzMllNV1pMeU4KekI4akdWY01Qc2ZNVkdxdFhTTVljQ3hNYURKaTM1N0VEZSs3clM2U2ZtbnFtYjJxekdQM1BRRStCUm91Yk5nLwo4NCt4c253VXBLZUFpem95dFM2c2ZlOTdMUEtRTW9oVzd2TjZORFJrLy9xSmpHem9iZmZwUHU3endhWHVXMWtMCmllc2g2cVRrdlZLSTF3WFJ2WHArVU5GbjZzdDIwdEVlRks0ZzMwOENBd0VBQWFPQjhUQ0I3akFKQmdOVkhSTUUKQWpBQU1Bc0dBMVVkRHdRRUF3SUY0RENCa3dZRFZSMFJCSUdMTUlHSWdoWm1iSGwwWlhCeWIzQmxiR3hsY2kxMwpaV0pvYjI5cmdoeG1iSGwwWlhCeWIzQmxiR3hsY2kxM1pXSm9iMjlyTG5WdWFXOXVnaUJtYkhsMFpYQnliM0JsCmJHeGxjaTEzWldKb2IyOXJMblZ1YVc5dUxuTjJZNEl1Wm14NWRHVndjbTl3Wld4c1pYSXRkMlZpYUc5dmF5NTEKYm1sdmJpNXpkbU11WTJ4MWMzUmxjaTVzYjJOaGJEQWRCZ05WSFE0RUZnUVV1d25wcDZ1MTNKL0NsNnI3eDZ1Sgo5YmZ1RDhzd0h3WURWUjBqQkJnd0ZvQVVvY3Zqa1NFK0pmMmZKRHVma0FqaG9WVGhnSDh3RFFZSktvWklodmNOCkFRRUxCUUFEZ2dFQkFFMmlOa1MzczhGT3Z0cDhrVmJNZUN2amxIQytvYTVlZTFGNmowNlZ3bFJPOWdxOXJla1UKMGptWUhNWGNjWjcrZXJkNFRQRXgwK3ZqMzNhRTV3cDFmdVJJL2xIZHhuOUpJRHg2bTFQTHdVa3p1WG9EekYxLwozVXVIWU1xOGkrT2xsY0tqbmdVMUFUNmRWRCtIbFVWeHpMdjlNazdPb05SQUY5dTJHa1hiUHdCN0I1ZGRxdWxlCmtNcXBxbDhES2s4SVNVelVsTjNBb2Q5NmJUelIyV3RCVkxpTi9zRHZtTDZwZThFYm55RnBUb1BZTDlGeWZOcmIKT1ZPZEJwUkZFTG1KOHZaZWY5d1lxRmVXMTE3SGV5VDArZkxGZVNSemZUNVV
WUlVraVFBcUVJdUhHUVRrOVFabgo1WnYvNUF0eEg3WE1rcXR0VE1zMFlSUG0xYjQzY3FBeFdhND0KLS0tLS1FTkQgQ0VSVElGSUNBVEUtLS0tLQo= + tls.key: LS0tLS1CRUdJTiBQUklWQVRFIEtFWS0tLS0tCk1JSUV2UUlCQURBTkJna3Foa2lHOXcwQkFRRUZBQVNDQktjd2dnU2pBZ0VBQW9JQkFRQytyTlVPR1lMNmxGakwKV2NrL3cvdmd3SXMyWGJRbUhWdFBoNHZDczJnbEhKLzdJTzVKakN4d3QzM25wR1BuMCtBaGtPOC9tTDJlRldQcwoxNG5QdTVPc0NwaFk3NW1hUXVSYU0vWStRcGpCVFJuNGxiWWFPYUNEaTBRZCtiVjlURVBMdE9GTERxRDYreS9qCk04UnFtTG55MXB1MGxML0xjU2NIVXRYbUxObURGbVM4amN3Zkl4bFhERDdIekZScXJWMGpHSEFzVEdneVl0K2UKeEEzdnU2MHVrbjVwNnBtOXFzeGo5ejBCUGdVYUxtellQL09Qc2JKOEZLU25nSXM2TXJVdXJIM3ZleXp5a0RLSQpWdTd6ZWpRMFpQLzZpWXhzNkczMzZUN3U4OEdsN2x0WkM0bnJJZXFrNUwxU2lOY0YwYjE2ZmxEUlorckxkdExSCkhoU3VJTjlQQWdNQkFBRUNnZ0VBRjV2SnMzbTRMSE9DdlFic2NwOC9DQmgvQkNlOC92MGlpYW5VUmJLMkFlWlYKN3Arb3NXV1FPUktYSGIvT2VPMVVjb09DQkFOUzh3aGQrM3pDZlB5U0w3cU9HM0RyT1Z6djdqVkIxM3FpZEVpcQpId3ZXWk0vZXpuckhYOWpEdm5SYmJwVUNVaXRKQmxwa2x3S1pYc056UHB5UTRkNkxFdEw3VEo1V2lxM2g3cWQzCjhncmJoeWlWNDhZRFAyYkhpZGt0S2QvMHBNL3piSzZPTS9iMFpLUmRBWVpHQXlOcklMaFJ3eDNsQXE2SDVqL20KUHBuNlZEdVQ3Y1h6UFhmNTgxR0IyNjQxWFJVTXRNWGVEREtaU3JudnIrbG5tV2lsYXZmNkg3ZHp4NlRBMmk0RApybCtIRVdUN1RSQUZGMnZ5SVUrYSt5RzNPVFBKRCtmVTdCWWh4Zld4aFFLQmdRRGVvcXVCbzQxK1dKU255blZnCmNWMHZoUEkwZ0dyRGQzNFdIbCtGNTZuSmxuMVVlbUQ2SUQ0TUhnNVpHWmNmd3NkRlB0aHl1VnB0OVl1SW11SGQKRHE5ZDhpWnpHUzJVbStkSWlsWjhnZXJ5Q3dON3hIbFM3SzJrc0RmRGJwYmZZSXZ1LzRnSExZRHh3YzQ0OG54ZwpWNGpCUFY2QTdYTHRqQzZzdFE2OW9uTlIwd0tCZ1FEYlFBUTRGdWpnUjJUOXh2Y01MdXhMbm1YaHE4U2NoZlM5CkRoWndDUzZIeHB1YlBxblBjQ04vQUZjbDZ1cDRXVEVZdVY4bEpwcGwwRmpYQ2lMdkhuUU15cm5vRjlUOThpanQKZGVHYjF6ZWlwRUpndFpNVHkvOGdERWorZU1wSFBuYTZwQVpMOVpRVUdLR2dDNGQ0SzljdVNHMmRlb1dSU0ViSQpMVGgzeGpXVEZRS0JnQStJRWFidG5nVmVjS0IwQTFSREZGa29VUzFRZUNKQ3g4MExPV2JDRHBvOW9XaXZVT3lpCkt6SDFOdE1JY2Y2SlBCV2NtTVVJSVVMaWltVnhTS2gvU2NTb0MvNmpsd1p6Q2VPSm94YjBpVXR4Y1VERktDR2MKMlZCUDZ0UDdkeE1HVFR6VEhzNUJZbWw3TjhQSlJ0d0J2MHliMTJmdktNRmhzaS9pUWJFQkVFSjVBb0dBUm9NSApHRmJkM0V0NXdsZzcyYkk1a25SRnhkY3RLejIxb1J1bndhNWlSWTV0T3Zkak8zQ1FLZWNkSC9lMklyQmtwdFB2Ck1vNkF0MS9UUW8xakF
NNGxlbnUwWUYxUnhiNGN4WW5VM2Y3UVNNRlZDNjg1dHZNemdNWVNyNngvT1h4d1NNUTUKdGpJcnhtN0poQ1JSRkNmZFUyZzl3SmpIM2hxRmtSbGlBTHRCUGFVQ2dZRUF2QkRHejQvYS9kRzRJT0VpREk4SgpBbkxaRWZaTFJrdnI0ejRyT0p0TGZtWlFjTW5DSUZmKzBaelhTME9XOWhvMjk5RUNtS1l4U2VUWk1WSDgzOFZCCnRvSkNGM3lzb01rSC9meEVXTUpicDlMblJwSFUzUElicHlhRG1nWW1YdmsvK3laL3Y1c0dDTWNtVGkzWVFLRGsKcStlK1o0SHZyaGUxVytUaHpwOCtoNGs9Ci0tLS0tRU5EIFBSSVZBVEUgS0VZLS0tLS0K --- -# Source: dataplane/templates/clusterresourcesync/configmap.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2019 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ apiVersion: v1 kind: ConfigMap metadata: - name: union-clusterresourcesync-config - namespace: union + name: config-logging + namespace: "union" labels: - app.kubernetes.io/name: clusterresourcesync - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm -data: - cluster_resources.yaml: | - cluster_resources: - clusterName: 'union-test' - customData: - - production: - - projectQuotaCpu: - value: "4096" - - projectQuotaMemory: - value: 2Ti - - projectQuotaNvidiaGpu: - value: "256" - - defaultUserRoleKey: - value: 'eks.amazonaws.com/role-arn' - - defaultUserRoleValue: - value: 'arn:aws:iam::ACCOUNT_ID:role/flyte_project_role' - - staging: - - projectQuotaCpu: - value: "4096" - - projectQuotaMemory: - value: 2Ti - - projectQuotaNvidiaGpu: - value: "256" - - defaultUserRoleKey: - value: 'eks.amazonaws.com/role-arn' - - defaultUserRoleValue: - value: 'arn:aws:iam::ACCOUNT_ID:role/flyte_project_role' - - development: - - projectQuotaCpu: - value: "4096" - - projectQuotaMemory: - value: 2Ti - - projectQuotaNvidiaGpu: - value: "256" - - defaultUserRoleKey: - value: 'eks.amazonaws.com/role-arn' - - defaultUserRoleValue: - value: 'arn:aws:iam::ACCOUNT_ID:role/flyte_project_role' - refreshInterval: 5m - standaloneDeployment: true - templatePath: /etc/flyte/clusterresource/templates - clusterResourcesPrivate: - app: - isServerless: false - union: - auth: - authorizationMetadataKey: flyte-authorization - clientId: 'test-client' - clientSecretLocation: /etc/union/secret/client_secret - tokenRefreshWindow: 5m - type: ClientSecret - connection: - host: dns:///union.test.union.ai - admin.yaml: | - admin: - clientId: 'test-client' - clientSecretLocation: /etc/union/secret/client_secret - endpoint: dns:///union.test.union.ai - insecure: false - event: - capacity: 1000 - rate: 500 - type: admin - domain.yaml: | - domains: - - id: development - name: development - - id: staging - name: staging - - id: production 
- name: production - clusters.yaml: | - clusters: - clusterConfigs: [] - labelClusterMap: {} - logger.yaml: | - logger: - formatter: - type: json - level: 4 - show-source: true + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +data: {} +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2019 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v1 +kind: ConfigMap +metadata: + name: config-observability + namespace: "union" + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +data: {} --- -# Source: dataplane/templates/clusterresourcesync/configmap.yaml +# Source: dataplane/charts/prometheus/templates/cm.yaml apiVersion: v1 kind: ConfigMap metadata: - name: union-clusterresource-template - namespace: union labels: - app.kubernetes.io/name: clusterresourcesync + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: prometheus + name: union-operator-prometheus + namespace: union data: - a_namespace.yaml: | - apiVersion: v1 - kind: Namespace - metadata: - name: {{ namespace }} - labels: - union.ai/namespace-type: flyte - spec: - finalizers: - - kubernetes - - b_default_service_account.yaml: 
| - apiVersion: v1 - kind: ServiceAccount - metadata: - name: default - namespace: {{ namespace }} - annotations: - {{ defaultUserRoleKey }}: {{ defaultUserRoleValue }} + allow-snippet-annotations: "false" + alerting_rules.yml: | + {} + alerts: | + {} + prometheus.yml: | + global: + evaluation_interval: 1m + scrape_interval: 1m + scrape_timeout: 10s + rule_files: + - /etc/config/recording_rules.yml + - /etc/config/alerting_rules.yml + - /etc/config/rules + - /etc/config/alerts + scrape_configs: + - job_name: prometheus + static_configs: + - targets: + - localhost:9090 + - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + job_name: kubernetes-apiservers + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - action: keep + regex: default;kubernetes;https + source_labels: + - __meta_kubernetes_namespace + - __meta_kubernetes_service_name + - __meta_kubernetes_endpoint_port_name + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + job_name: kubernetes-nodes + kubernetes_sd_configs: + - role: node + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - replacement: kubernetes.default.svc:443 + target_label: __address__ + - regex: (.+) + replacement: /api/v1/nodes/$1/proxy/metrics + source_labels: + - __meta_kubernetes_node_name + target_label: __metrics_path__ + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + job_name: kubernetes-nodes-cadvisor + kubernetes_sd_configs: + - role: node + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - replacement: kubernetes.default.svc:443 + target_label: __address__ + - regex: (.+) + replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor + 
source_labels: + - __meta_kubernetes_node_name + target_label: __metrics_path__ + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + - honor_labels: true + job_name: kubernetes-service-endpoints + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scrape + - action: drop + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (.+?)(?::\d+)?;(\d+) + replacement: $1:$2 + source_labels: + - __address__ + - __meta_kubernetes_service_annotation_prometheus_io_port + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_service_name + target_label: service + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: node + - honor_labels: true + job_name: kubernetes-service-endpoints-slow + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) 
+ source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (.+?)(?::\d+)?;(\d+) + replacement: $1:$2 + source_labels: + - __address__ + - __meta_kubernetes_service_annotation_prometheus_io_port + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_service_name + target_label: service + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: node + scrape_interval: 5m + scrape_timeout: 30s + - honor_labels: true + job_name: prometheus-pushgateway + kubernetes_sd_configs: + - role: service + relabel_configs: + - action: keep + regex: pushgateway + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_probe + - honor_labels: true + job_name: kubernetes-services + kubernetes_sd_configs: + - role: service + metrics_path: /probe + params: + module: + - http_2xx + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_probe + - source_labels: + - __address__ + target_label: __param_target + - replacement: blackbox + target_label: __address__ + - source_labels: + - __param_target + target_label: instance + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - source_labels: + - __meta_kubernetes_service_name + target_label: service + - honor_labels: true + job_name: kubernetes-pods + kubernetes_sd_configs: + - role: pod + relabel_configs: + - action: 
keep + regex: true + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scrape + - action: drop + regex: true + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) + replacement: '[$2]:$1' + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: replace + regex: (\d+);((([0-9]+?)(\.|$)){4}) + replacement: $2:$1 + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_pod_name + target_label: pod + - action: drop + regex: Pending|Succeeded|Failed|Completed + source_labels: + - __meta_kubernetes_pod_phase + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: node + - honor_labels: true + job_name: kubernetes-pods-slow + kubernetes_sd_configs: + - role: pod + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) 
+ source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) + replacement: '[$2]:$1' + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: replace + regex: (\d+);((([0-9]+?)(\.|$)){4}) + replacement: $2:$1 + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_pod_name + target_label: pod + - action: drop + regex: Pending|Succeeded|Failed|Completed + source_labels: + - __meta_kubernetes_pod_phase + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: node + scrape_interval: 5m + scrape_timeout: 30s + - job_name: 'kube-state-metrics' + metrics_path: /metrics + scrape_interval: 1m + honor_labels: true + kubernetes_sd_configs: + - role: endpoints + namespaces: + own_namespace: true + relabel_configs: + - source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_name] + regex: kube-state-metrics + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + regex: http + action: keep + metric_relabel_configs: + - source_labels: [__name__] + regex: 
"kube_pod_container_resource_(limits|requests)|kube_pod_status_phase|kube_node_(labels|status_allocatable|status_condition|status_capacity)|kube_namespace_labels|kube_pod_container_status_(waiting|terminated|last_terminated).*_reason|kube_daemonset_status_number_unavailable|kube_deployment_status_replicas_unavailable|kube_resourcequota|kube_pod_info|kube_node_info|kube_pod_container_status_restarts_total" + action: keep + - source_labels: [__name__, phase] + separator: ";" + regex: "kube_pod_status_phase;(Succeeded|Failed)" + action: drop + - source_labels: [node] + target_label: nodename + regex: "(.*)" + action: replace + - source_labels: [label_node_group_name] + action: replace + regex: "(.+)" + target_label: label_node_pool_name + - job_name: 'opencost' + metrics_path: /metrics + scrape_interval: 1m + honor_labels: true + kubernetes_sd_configs: + - role: endpoints + namespaces: + own_namespace: true + relabel_configs: + - source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_name] + regex: opencost + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + regex: http + action: keep + metric_relabel_configs: + - source_labels: [__name__] + regex: "kube_node_labels|kube_pod_labels|node_total_hourly_cost|node_ram_hourly_cost|node_cpu_hourly_cost|node_gpu_hourly_cost" + action: keep + - job_name: 'union-services' + metrics_path: /metrics + scrape_interval: 1m + honor_labels: true + kubernetes_sd_configs: + - role: endpoints + namespaces: + own_namespace: true + relabel_configs: + - source_labels: [__meta_kubernetes_service_label_platform_union_ai_service_group] + regex: .+ + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + regex: debug + action: keep + # Flyte propeller metrics for execution info and fast task duration. + # No-op when propeller is not deployed (no pods match the label selector). 
+ - job_name: 'flytepropeller' + kubernetes_sd_configs: + - role: pod + namespaces: + own_namespace: true + selectors: + - role: pod + label: app.kubernetes.io/name=flytepropeller + metric_relabel_configs: + - source_labels: [__name__] + regex: "flyte:propeller:all:round:execution_info|flyte:propeller:all:node:fast_task:fast_task_execution_duration" + action: keep + relabel_configs: + - source_labels: [__meta_kubernetes_pod_container_name] + regex: flytepropeller + action: keep + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: pod + # Kourier gateway (envoy) metrics for app serving. + # No-op when serving is not enabled (no pods match the label selector). + - job_name: 'serving-envoy' + kubernetes_sd_configs: + - role: pod + namespaces: + own_namespace: true + selectors: + - role: pod + label: app=3scale-kourier-gateway + metrics_path: /stats/prometheus + metric_relabel_configs: + - source_labels: [__name__] + regex: "envoy_cluster_upstream_rq_xx|envoy_cluster_upstream_rq_time_bucket" + action: keep + relabel_configs: + - source_labels: [__meta_kubernetes_pod_container_port_name] + regex: metrics + action: keep + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: pod + # DCGM GPU metrics. + # No-op when dcgm-exporter is not deployed (no pods match the label selector). 
+ - job_name: 'gpu-metrics' + kubernetes_sd_configs: + - role: pod + namespaces: + names: + - kube-system + selectors: + - role: pod + label: app.kubernetes.io/name=dcgm-exporter - c_project_resource_quota.yaml: | - apiVersion: v1 - kind: ResourceQuota - metadata: - name: project-quota - namespace: {{ namespace }} - spec: - hard: - limits.cpu: {{ projectQuotaCpu }} - limits.memory: {{ projectQuotaMemory }} - requests.nvidia.com/gpu: {{ projectQuotaNvidiaGpu }} + recording_rules.yml: | + {} + rules: | + {} --- # Source: dataplane/templates/fluent-bit/configmap.yaml apiVersion: v1 @@ -364,6 +730,34 @@ data: region us-east-1 bucket test-bucket --- +# Source: dataplane/templates/imagebuilder/build-image-configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: build-image-config + labels: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +data: + storage.yaml: | + storage: + container: "test-bucket" + type: s3 + connection: + auth-type: iam + region: us-east-1 + enable-multicontainer: false + limits: + maxDownloadMBs: 1024 + cache: + max_size_mbs: 0 + target_gc_percent: 70 + buildkit-uri: "tcp://union-operator-buildkit.union.svc.cluster.local:1234" + default-repository: ".dkr.ecr.us-east-1.amazonaws.com/union-dataplane" + authentication-type: "aws" +--- # Source: dataplane/templates/imagebuilder/configmap.yaml apiVersion: v1 kind: ConfigMap @@ -2520,6 +2914,18 @@ data: "version": 1 } --- +# Source: dataplane/templates/monitoring/prometheusrule.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: union-recording-rules + namespace: union + labels: + release: release-name +data: + recording_rules.yml: | + groups: [] +--- # Source: dataplane/templates/nodeexecutor/configmap.yaml apiVersion: v1 kind: ConfigMap @@ -2574,8 +2980,13 @@ data: templateUris: - https://scale.neptune.ai/{{ .taskConfig.project }}/-/run/?customId={{ .taskConfig.id }} - 
kubernetes-enabled: true + kubernetes-enabled: false enabled_plugins.yaml: | + plugins: + connector-service: + defaultConnector: + defaultServiceConfig: '{"loadBalancingConfig": [{"round_robin":{}}]}' + endpoint: k8s:///flyteconnector.union:8000 tasks: task-plugins: default-for-task-types: @@ -2608,6 +3019,8 @@ data: cpu: 4096 gpu: 256 memory: 2Ti + namespace_mapping: + template: union union: connection: host: dns:///union.test.union.ai @@ -2628,7 +3041,7 @@ data: cache-endpoint: dns:///union.test.union.ai endpoint: dns:///union.test.union.ai insecure: false - type: fallback + type: cacheservicev2 use-admin-auth: true logger: formatter: @@ -2648,6 +3061,7 @@ data: secure: false useAuth: false propeller: + limit-namespace: union node-config: disable-input-file-writes: true plugins: @@ -2655,7 +3069,7 @@ data: additional-worker-args: - --last-ack-grace-period-seconds - "120" - callback-uri: http://unionai-dataplane-executor.union.svc.cluster.local:15605 + callback-uri: http://union-operator-executor.union.svc.cluster.local:15605 grace-period-status-not-found: 2m ioutils: remoteFileOutputPaths: @@ -2665,6 +3079,7 @@ data: default-cpus: 100m default-env-vars: [] default-memory: 100Mi + default-pod-template-name: task-template co-pilot: image: 'cr.flyte.org/flyteorg/flytecopilot:v1.14.1' name: flyte-copilot- @@ -2699,6 +3114,7 @@ data: default-cpus: 100m default-env-vars: [] default-memory: 100Mi + default-pod-template-name: task-template config.yaml: | union: connection: @@ -2718,11 +3134,14 @@ data: operator: enabled: true enableTunnelService: true + # enableDepot: false tunnel: - enableDirectToAppIngress: false + enableDirectToAppIngress: true deploymentToRestart: union-operator-proxy + limitNamespace: union + disableClusterPermissions: true apps: - enabled: 'false' + enabled: 'true' syncClusterConfig: enabled: false clusterId: @@ -2737,19 +3156,19 @@ data: userRole: 'arn:aws:iam::ACCOUNT_ID:role/flyte_project_role' userRoleKey: 'eks.amazonaws.com/role-arn' 
collectUsages: - enabled: true + enabled: false billing: model: Legacy dependenciesHeartbeat: - prometheus: - endpoint: 'http://union-operator-prometheus:80/-/healthy' - propeller: - endpoint: 'http://flytepropeller:10254' + executor: + endpoint: 'http://union-operator-executor:10254' proxy: endpoint: 'http://union-operator-proxy:10254' secretsWatcher: dryRun: true enabled: false + org: + namespaceTemplate: union imageBuilder: enabled: true executionNamespaceLabels: @@ -2757,9 +3176,6 @@ data: referenceConfigmapName: union-operator targetConfigMapName: "build-image-config" proxy: - imageBuilderConfig: - authenticationType: 'noop' - defaultRepository: '' persistedLogs: objectStore: pathTemplate: namespace-{{.KubernetesNamespace}}.pod-{{.KubernetesPodName}}.cont-{{.KubernetesContainerName}} @@ -2801,989 +3217,149 @@ data: auth-type: iam region: us-east-1 image-builder.buildkit-uri: "tcp://union-operator-buildkit.union.svc.cluster.local:1234" - image-builder.default-repository: "" - image-builder.authentication-type: "noop" + image-builder.default-repository: ".dkr.ecr.us-east-1.amazonaws.com/union-dataplane" + image-builder.authentication-type: "aws" --- -# Source: dataplane/templates/prometheus/configmap.yaml +# Source: dataplane/templates/serving/bootstrap-configmap.yaml +# We need to copy the default configmap here since knative-operator does not automatically update the +# address of the `net-kourier-controller` endpoint to use the release namespace. It assumes that the +# resources are being installed into the `knative-serving` namespace instead. 
apiVersion: v1 kind: ConfigMap metadata: - name: union-operator-prometheus + name: union-operator-serving-envoy-bootstrap namespace: union - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus data: - prometheus.yml: | - global: - scrape_interval: 15s - evaluation_interval: 15s - alerting: - alertmanagers: - - static_configs: - - targets: - rule_files: - - rules.yml - scrape_configs: - # Self-monitoring - - job_name: prometheus - metrics_path: /prometheus/metrics - static_configs: - - targets: ['localhost:9090'] - metric_relabel_configs: - - source_labels: [__name__] - regex: prometheus_tsdb_head_chunks_storage_size_bytes|prometheus_tsdb_head_series|prometheus_tsdb_head_series_created_total|prometheus_tsdb_head_series_removed_total|prometheus_tsdb_head_samples_appended_total|prometheus_tsdb_storage_blocks_bytes|prometheus_tsdb_retention_limit_bytes|prometheus_tsdb_wal_(.*)|.+_(limit|failed|missed|failures)_total - action: keep - - # Kube state metrics for pod/node resource tracking and cost calculations - - job_name: kube-state-metrics - static_configs: - - targets: ['release-name-kube-state-metrics:8080'] - metric_relabel_configs: - - separator: ; - source_labels: [__name__] - regex: kube_pod_container_resource_(limits|requests)|kube_pod_status_phase|kube_node_(labels|status_allocatable|status_condition|status_capacity)|kube_namespace_labels|kube_pod_container_status_(waiting|terminated|last_terminated).*_reason|kube_daemonset_status_number_unavailable|kube_deployment_status_replicas_unavailable|kube_resourcequota|kube_pod_info|kube_node_info|kube_pod_container_status_restarts_total - action: keep - - separator: ; - source_labels: [__name__, phase] - regex: kube_pod_status_phase;(Succeeded|Failed) - action: drop - - source_labels: [node] - target_label: nodename - 
regex: '(.*)' - action: replace - - source_labels: [label_node_group_name] - action: replace - regex: (.+) - target_label: label_node_pool_name + envoy-bootstrap.yaml: | + dynamic_resources: + ads_config: + transport_api_version: V3 + api_type: GRPC + rate_limit_settings: {} + grpc_services: + - envoy_grpc: {cluster_name: xds_cluster} + cds_config: + resource_api_version: V3 + ads: {} + lds_config: + resource_api_version: V3 + ads: {} + node: + cluster: kourier-knative + id: 3scale-kourier-gateway + static_resources: + listeners: + - name: stats_listener + address: + socket_address: + address: 0.0.0.0 + port_value: 9000 + filter_chains: + - filters: + - name: envoy.filters.network.http_connection_manager + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager + stat_prefix: stats_server + http_filters: + - name: envoy.filters.http.router + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router + route_config: + virtual_hosts: + - name: admin_interface + domains: + - "*" + routes: + - match: + safe_regex: + regex: '/(certs|stats(/prometheus)?|server_info|clusters|listeners|ready)?' + headers: + - name: ':method' + string_match: + exact: GET + route: + cluster: service_stats + - match: + safe_regex: + regex: '/drain_listeners' + headers: + - name: ':method' + string_match: + exact: POST + route: + cluster: service_stats + clusters: + - name: service_stats + connect_timeout: 0.250s + type: static + load_assignment: + cluster_name: service_stats + endpoints: + lb_endpoints: + endpoint: + address: + socket_address: + address: 127.0.0.1 + port_value: 9901 + - name: xds_cluster + # This keepalive is recommended by envoy docs. 
+ # https://www.envoyproxy.io/docs/envoy/latest/api-docs/xds_protocol + typed_extension_protocol_options: + envoy.extensions.upstreams.http.v3.HttpProtocolOptions: + "@type": type.googleapis.com/envoy.extensions.upstreams.http.v3.HttpProtocolOptions + explicit_http_config: + http2_protocol_options: + connection_keepalive: + interval: 30s + timeout: 5s + connect_timeout: 1s + load_assignment: + cluster_name: xds_cluster + endpoints: + lb_endpoints: + endpoint: + address: + socket_address: + address: "net-kourier-controller" + port_value: 18000 + type: STRICT_DNS + admin: + access_log: + - name: envoy.access_loggers.stdout + typed_config: + "@type": type.googleapis.com/envoy.extensions.access_loggers.stream.v3.StdoutAccessLog + address: + socket_address: + address: 127.0.0.1 + port_value: 9901 + stats_config: + stats_tags: + - tag_name: name + regex: '^.*?\.u/.*?n=(.*?)/.*?u\..*$' + - tag_name: domain + regex: '^.*?\.u/.*?d=(.*?)/.*?u\..*$' + - tag_name: org + regex: '^.*?\.u/.*?o=(.*?)/.*?u\..*$' + - tag_name: project + regex: '^.*?\.u/.*?p=(.*?)/.*?u\..*$' + - tag_name: target + regex: '^.*?\.u/.*?t=(.*?)/.*?u\..*$' + - tag_name: target_namespace + regex: '^.*?\.u/.*?tns=(.*?)/.*?u\..*$' +--- +# Source: dataplane/templates/webhook/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: union-pod-webhook-config + namespace: union +data: + core.yaml: | - # cAdvisor container metrics for CPU and memory tracking - - job_name: kubernetes-cadvisor - metrics_path: /metrics - scheme: https - kubernetes_sd_configs: - - role: node - namespaces: - names: [] - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - insecure_skip_verify: false - metric_relabel_configs: - - separator: ; - source_labels: [__name__] - regex: container_cpu_usage_seconds_total|container_memory_working_set_bytes - action: keep - relabel_configs: - - separator: ; - regex: 
__meta_kubernetes_node_label_(.+) - replacement: $1 - action: labelmap - - separator: ; - regex: (.*) - target_label: __address__ - replacement: kubernetes.default.svc:443 - action: replace - - source_labels: [__meta_kubernetes_node_name] - separator: ; - regex: (.+) - target_label: __metrics_path__ - replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor - action: replace - - # Flyte propeller metrics for execution info and fast task duration - - job_name: flytepropeller - kubernetes_sd_configs: - - role: pod - namespaces: - names: - - union - selectors: - - role: pod - label: app.kubernetes.io/name=flytepropeller - metric_relabel_configs: - - source_labels: [__name__] - regex: "flyte:propeller:all:round:execution_info|flyte:propeller:all:node:fast_task:fast_task_execution_duration" - action: keep - relabel_configs: - - source_labels: [__meta_kubernetes_pod_container_name] - regex: flytepropeller - action: keep - - source_labels: [__meta_kubernetes_namespace] - action: replace - target_label: namespace - - source_labels: [__meta_kubernetes_pod_name] - action: replace - target_label: pod - # OpenCost metrics for cost tracking - - job_name: opencost - static_configs: - - targets: ['release-name-opencost:9003'] - metric_relabel_configs: - - source_labels: [__name__] - regex: "kube_node_labels|kube_pod_labels|node_total_hourly_cost|node_ram_hourly_cost|node_cpu_hourly_cost|node_gpu_hourly_cost" - action: keep - rules.yml: | - groups: - - name: cost_calculations_15s - interval: 15s - rules: - - record: pod_gpu_allocation - expr: | - sum by (namespace, pod) (DCGM_FI_DEV_GPU_UTIL >= bool 0) * on (namespace, pod) group_left() (max by (namespace, pod) (kube_pod_status_phase{phase=~"Running|Pending"} == 1)) - - record: execution_info # A join metric to look up execution-level info. Used to disambiguate workflow/task executions, apps, and workspaces. 
- expr: | - max by (label_domain, label_project, label_entity_name, label_execution_id, label_entity_id)( - label_replace( - label_replace( - label_replace( - label_replace( - label_replace( - flyte:propeller:all:round:execution_info{domain!="", project!="", workflow_name!="", execution_id!=""}, # filter for workflow/task executions - "label_entity_id", "$1", "execution_id", "(.*)" # join key - ), "label_entity_name", "$1", "workflow_name", "(.*)" # set label_entity_name to the workflow/task name from the workflow_execution_id - ), - "label_execution_id", "$1", "execution_id", "(.*)" - ), - "label_project", "$1", "project", "(.*)" # project - ), - "label_domain", "$1", "domain", "(.*)" # domain - ) - ) - - record: app_info # A join metric to look up app-level info. Used to disambiguate workflow/task executions, apps, and workspaces. - expr: | - max by (label_domain, label_project, label_app_name, label_app_version, label_entity_id)( - label_replace( - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps - "label_app_name", "$1", "label_serving_unionai_dev_app_name", "(.*)" # rename to cleanup - ), - "label_app_version", "$1", "label_serving_knative_dev_revision", "(.*)" # the app_version is equivalent to an execution_id for workflows (lowest level of granularity) - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # join key - ) - ) - - record: workspace_info # A join metric to look up workspace info. Used to disambiguate workflow/task executions, apps, and workspaces. 
- expr: | - max by (label_domain, label_project, label_workspace_name, label_entity_id)( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # filter for workspaces - "label_entity_id", "$1", "label_node_id", "(.*)" # join key - ), "label_workspace_name", "$1", "label_node_id", "(.*)" # set label_workspace_name to the workspace name from the kube_pod_labels - ) - ) - - record: fast_task_execution_duration - expr: | - max by (label_domain, label_project, label_execution_id, label_entity_id, namespace, pod)( - label_replace( - label_replace( - label_replace( - label_replace( - label_replace( - label_replace( - flyte:propeller:all:node:fast_task:fast_task_execution_duration{domain!="", project!="", execution_id!="", exported_namespace!="", exported_pod!=""}, - "label_entity_id", "$1", "execution_id", "(.*)" # join key - ), - "label_execution_id", "$1", "execution_id", "(.*)" - ), - "label_project", "$1", "project", "(.*)" # project - ), - "label_domain", "$1", "domain", "(.*)" # domain - ), - "namespace", "$1", "exported_namespace", "(.*)" - ), - "pod", "$1", "exported_pod", "(.*)" - ) - ) - - record: fast_task_execution_duration_rate - expr: | - irate(fast_task_execution_duration[1m]) > 0 or absent(irate(fast_task_execution_duration[1m])) * fast_task_execution_duration - - record: entity_id:mem_usage_bytes_total_per_node:sum # Allocated memory (max(requested, consumed)) aggregated per node and entity, where entity is either a task/workflow execution or an app. 
- expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( # aggregate up to entity - # First, calculate the allocated memory for each pod - max by (namespace, pod) ( # this is the case where consumed (the memory working set) exceeds requested memory - ( - sum by (namespace, pod) ( - container_memory_working_set_bytes{namespace!="",pod!="",image!=""} - ) - > sum by (namespace, pod) ( - kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="memory"} - ) - ) - or sum by (namespace, pod) ( # this is the case where memory requests are <= consumed memory - kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="memory"} # needed to add node!="" to dedupe - ) - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). Note that this is repetitive but we do not want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - 
label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), - "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="",label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - # Now join in node identifiers which are used for subsequent overhead calculations - * on (namespace, pod) group_left(node) ( - max by (namespace, pod, node) (kube_pod_info{node!=""}) # needed to add node!="" to dedupe - ) - ) - - record: entity_id:cpu_usage_per_node:sum # Allocated cpu (max(requested, consumed)) aggregated per node and entity, where entity is either a task/workflow execution or an app. 
- expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( - # First, calculate the allocated cpu for each pod - max by (namespace, pod) ( # this is the case where consumed (the cpu usage seconds total) exceeds requested cpu - ( - sum by (namespace, pod) ( - irate(container_cpu_usage_seconds_total{namespace!="",pod!="",image!=""}[5m]) - ) - > sum by (namespace, pod) ( - kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="cpu"} - ) - ) - or sum by (namespace, pod) ( # this is the case where cpu requests are <= consumed cpu - kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="cpu"} - ) - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - 
label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), - "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - # Now join in node identifiers which are used for subsequent overhead calculations - * on (namespace, pod) group_left(node) ( - max by (namespace, pod, node) (kube_pod_info{node!=""}) # needed to add node!="" to dedupe - ) - ) - - record: entity_id:gpu_usage_per_node:sum # Allocated gpu aggregated per node and entity, where entity is either a task/workflow execution or an app. 
- expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( - # First, grab the allocated gpu for each pod (which is always either 1 or zero, since k8s can't split gpus the way it can with cpu/memory) - max by (namespace, pod) ( - pod_gpu_allocation - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, 
label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), - "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - # Now join in node identifiers which are used for subsequent overhead calculations - * on (namespace, pod) group_left(node) ( - max by (namespace, pod, node) (kube_pod_info{node!=""}) # needed to add node!="" to dedupe - ) - ) - - record: entity_id:used_mem_bytes:sum # the sum of used memory across all containers in an entity (numerator for aggregate utilization calculations) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # aggregate up to entity - # First, calculate the used memory for each pod - sum by (namespace, pod) ( - container_memory_working_set_bytes{namespace!="",pod!="",image!=""} - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but we do not want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - ) - - record: entity_id:allocated_mem_bytes:sum # the sum of allocated memory across all containers in an entity (denominator for aggregate utilization calculations) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # aggregate up to entity (remove node) - entity_id:mem_usage_bytes_total_per_node:sum - ) - - record: entity_id:used_cpu:sum # the sum of used cpu across all containers in an entity (numerator for aggregate utilization calculations) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( - sum by (namespace, pod) ( - irate(container_cpu_usage_seconds_total{namespace!="",pod!="",image!=""}[5m]) - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - ) - - record: entity_id:allocated_cpu:sum # the sum of allocated cpu across all containers in an entity (denominator for aggregate utilization calculations) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # aggregate up to entity (remove node) - entity_id:cpu_usage_per_node:sum - ) - - record: entity_id:sm_occupancy:avg # the simple average of SM occupancy (a good generic measure of GPU utilization) per entity - expr: | - avg by (label_entity_type, label_domain, label_project, label_entity_id) ( - # First, grab the SM occupancy for each pod - max by (namespace, pod) ( - DCGM_FI_PROF_SM_OCCUPANCY # SM occupancy is a good proxy for actual GPU usage - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - ) - - record: entity_id:gpu_count:sum # the count of running gpu pods per entity (need this to weight the gpu utilization when aggregating upwards - i.e. project-level) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( - # First, grab the allocated gpu for each pod (which is always either 1 or zero, since k8s can't split gpus the way it can with cpu/memory) - max by (namespace, pod) ( - pod_gpu_allocation - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - ) - - record: entity_id:weighted_sm_occupancy:sum # product of SM occupancy and allocated GPU count (something like "used memory", numerator of weighted calcs) - expr: | - entity_id:sm_occupancy:avg - * on (label_domain, label_project, label_entity_type, label_entity_id) entity_id:gpu_count:sum - - record: entity_id:allocated_mem_cost:sum # Allocated cost of memory for each workflow/task execution and app. - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, type) ( - entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB - * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type - ) - - record: entity_id:allocated_cpu_cost:sum # Allocated cost of cpu for each workflow/task execution and app. 
- expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, type)( - entity_id:cpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - - record: entity_id:allocated_gpu_cost:sum # Allocated cost of gpu for each workflow/task execution and app. - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, type)( - entity_id:gpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - - record: entity_id:allocated_cost:sum # Allocated cost of memory, cpu, and gpu for each workflow/task execution and app. - expr: | - label_replace( - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # for the sum to work, the labels need to be different on each "or" element (type label) - entity_id:allocated_mem_cost:sum - or - entity_id:allocated_cpu_cost:sum - or - entity_id:allocated_gpu_cost:sum - ), - "type", "allocated", "", "" # add type info - ) - - record: entity_id:overhead_cost:sum # The amount of overhead costs (node costs that we can't allocate with container resources) to allocate to each entity (workflow/task execution or app) - expr: | - label_replace( - sum by (label_entity_type, label_entity_id, label_domain, label_project)( # Aggregate the per-node metrics up to workflow/task execution or app (label_entity_id) - # Start with each execution's and app's allocated cost per node - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( # for the sum to work, the labels need to be different on each "or" element (type label) - entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB - * on (node) group_left(type) label_replace(avg by (node) 
(node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:cpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:gpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - # Then divide out the total allocated cost per node to get the proportion of allocated cost associated with each entity - / on (node) group_left()( - sum by (node) ( # for the sum to work, the labels need to be different on each "or" element (type label) - entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB - * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:cpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:gpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - > 0 # need to avoid dividing by zero, or gaps in the data can cause NaNs to proliferate, borking all charts - ) - # Then multiply by the overhead cost per node - * on (node) group_left() ( - # To calculate overhead, start with the true cost of running each node - avg by (node)(kube_node_labels{label_flyte_org_node_role="worker"}) # only look at worker nodes - * on (node) max by (node) ( - node_total_hourly_cost{instance_type!=""} # sometimes, the instance_type can be null, causing an 
unlabeled label to show up in the Compute Costs dashboard charts - ) * (15 / 3600) # convert hourly cost to 15-secondly cost - # Then subtract out the total allocated cost on each node - - on (node) group_left()( - sum by (node) ( # for the sum to work, the labels need to be different on each "or" element (type label) - entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB - * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:cpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:gpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - ) - ) - ), - "type", "overhead", "", "" # add type info - ) - - record: entity_id:total_cost:sum # Total cost of each entity (workflow/task execution or app), including allocated (from container resources) and overhead (proportion of unallocated node costs) - expr: | - label_replace( - sum by (label_domain, label_project, label_entity_id, label_entity_type) ( - entity_id:allocated_cost:sum - or - entity_id:overhead_cost:sum - ), - "type", "total", "", "" # add type info - ) - - record: node:total_cost:sum # Total cost of all nodes - expr: | - sum ( - avg by (node)(kube_node_labels{label_flyte_org_node_role="worker", label_node_kubernetes_io_instance_type!=""}) # only look at worker nodes - * on (node) group_left() node_total_hourly_cost{instance_type!=""} * (15 / 3600) # convert hourly cost to 15-secondly cost - ) - - record: node_type:total_cost:sum # Total cost of nodes grouped by node type - expr: | - sum by (node_type)( - avg by 
(node)(kube_node_labels{label_flyte_org_node_role="worker", label_node_kubernetes_io_instance_type!=""}) # only look at worker nodes - * on (node) group_left(node_type) label_replace(node_total_hourly_cost{instance_type!=""}, "node_type", "$1", "instance_type", "(.*)") * (15 / 3600) # convert hourly cost to 15-secondly cost and rename label - ) - - record: node_type:uptime_hours:sum # Total uptime of nodes grouped by node type - expr: | - sum by (node_type)( - avg by (node, node_type)( # dedupe - label_replace(kube_node_labels{label_flyte_org_node_role="worker", label_node_kubernetes_io_instance_type!=""}, "node_type", "$1", "label_node_kubernetes_io_instance_type", "(.*)") # relabel - ) - ) * (15 / 3600) # convert to number of hours per 15-second observation # Aggregate the above into visible metrics - - name: cost_rollup_15m - interval: 15m - rules: - - record: execution_info15m - expr: | - max_over_time(execution_info[15m:15s]) - - record: app_info15m - expr: | - max_over_time(app_info[15m:15s]) - - record: workspace_info15m - expr: | - max_over_time(workspace_info[15m:15s]) - - record: entity_id:allocated_mem_bytes:sum15m - expr: | - sum_over_time(entity_id:allocated_mem_bytes:sum[15m:15s]) - - record: entity_id:used_mem_bytes:sum15m - expr: | - sum_over_time(entity_id:used_mem_bytes:sum[15m:15s]) - - record: entity_id:allocated_cpu:sum15m - expr: | - sum_over_time(entity_id:allocated_cpu:sum[15m:15s]) - - record: entity_id:used_cpu:sum15m - expr: | - sum_over_time(entity_id:used_cpu:sum[15m:15s]) - - record: entity_id:weighted_sm_occupancy:sum15m - expr: | - sum_over_time(entity_id:weighted_sm_occupancy:sum[15m:15s]) - - record: entity_id:gpu_count:sum15m - expr: | - sum_over_time(entity_id:gpu_count:sum[15m:15s]) - - record: entity_id:allocated_mem_cost:sum15m - expr: | - sum_over_time(entity_id:allocated_mem_cost:sum[15m:15s]) - - record: entity_id:allocated_cpu_cost:sum15m - expr: | - sum_over_time(entity_id:allocated_cpu_cost:sum[15m:15s]) - - record: 
entity_id:allocated_gpu_cost:sum15m - expr: | - sum_over_time(entity_id:allocated_gpu_cost:sum[15m:15s]) - - record: entity_id:allocated_cost:sum15m - expr: | - sum_over_time(entity_id:allocated_cost:sum[15m:15s]) - - record: entity_id:overhead_cost:sum15m - expr: | - sum_over_time(entity_id:overhead_cost:sum[15m:15s]) - - record: entity_id:total_cost:sum15m - expr: | - sum_over_time(entity_id:total_cost:sum[15m:15s]) - - record: node:total_cost:sum15m - expr: | - sum_over_time(node:total_cost:sum[15m:15s]) - - record: node_type:total_cost:sum15m - expr: | - sum_over_time(node_type:total_cost:sum[15m:15s]) - - record: node_type:uptime_hours:sum15m - expr: | - sum_over_time(node_type:uptime_hours:sum[15m:15s]) ---- -# Source: dataplane/templates/propeller/configmap.yaml -apiVersion: v1 -kind: ConfigMap -metadata: - name: flyte-propeller-config - namespace: union -data: - admin.yaml: | - admin: - clientId: 'test-client' - clientSecretLocation: /etc/union/secret/client_secret - endpoint: dns:///union.test.union.ai - insecure: false - event: - capacity: 1000 - rate: 500 - type: admin - catalog.yaml: | - catalog-cache: - cache-endpoint: dns:///union.test.union.ai - endpoint: dns:///union.test.union.ai - insecure: false - type: fallback - use-admin-auth: true - copilot.yaml: | - plugins: - k8s: - co-pilot: - image: 'cr.flyte.org/flyteorg/flytecopilot:v1.14.1' - name: flyte-copilot- - start-timeout: 30s - core.yaml: | - propeller: - downstream-eval-duration: 30s - enable-admin-launcher: true - leader-election: - enabled: true - lease-duration: 15s - lock-config-map: - name: propeller-leader - namespace: 'union' - renew-deadline: 10s - retry-period: 2s - limit-namespace: all - literal-offloading-config: - enabled: true - max-workflow-retries: 30 - metadata-prefix: metadata/propeller - metrics-prefix: flyte - prof-port: 10254 - queue: - batch-size: -1 - batching-interval: 2s - queue: - base-delay: 5s - capacity: 1000 - max-delay: 120s - rate: 100 - type: maxof - sub-queue: 
- capacity: 100 - rate: 10 - type: bucket - type: batch - rawoutput-prefix: 's3://test-bucket' - workers: 4 - workflow-reeval-duration: 30s webhook: certDir: /etc/webhook/certs + disableCreateMutatingWebhookConfig: true embeddedSecretManagerConfig: imagePullSecrets: enabled: true @@ -3791,75 +3367,13 @@ data: namespace: 'union' type: 'K8s' listenPort: '9443' + localCert: true secretManagerTypes: - Embedded - K8s - serviceName: flyte-pod-webhook + secretName: union-pod-webhook + serviceName: union-pod-webhook servicePort: '443' - enabled_plugins.yaml: | - plugins: - connector-service: - defaultConnector: - defaultServiceConfig: '{"loadBalancingConfig": [{"round_robin":{}}]}' - endpoint: k8s:///flyteconnector.union:8000 - tasks: - task-plugins: - default-for-task-types: - actor: fast-task - container: container - container_array: k8s-array - sidecar: sidecar - enabled-plugins: - - container - - sidecar - - k8s-array - - echo - - fast-task - - connector-service - k8s.yaml: | - plugins: - k8s: - default-cpus: 100m - default-env-vars: [] - default-memory: 100Mi - logger.yaml: | - logger: - formatter: - type: json - level: 4 - show-source: true - resource_manager.yaml: | - propeller: - resourcemanager: - type: noop - task_logs.yaml: | - plugins: - logs: - cloudwatch-enabled: false - dynamic-log-links: - - vscode: - displayName: VS Code Debugger - templateUris: - - /dataplane/pod/v1/generated_name/task/{{.executionProject}}/{{.executionDomain}}/{{.executionName}}/{{.nodeID}}/{{.taskRetryAttempt}}/{{.taskProject}}/{{.taskDomain}}/{{.taskID}}/{{.taskVersion}}/ - kubernetes-enabled: false - templates: - - displayName: Task Logs - scheme: TaskExecution - templateUris: - - /console/projects/{{.executionProject}}/domains/{{.executionDomain}}/executions/{{.executionName}}/nodeId/{{.nodeID}}/taskId/{{.taskID}}/attempt/{{.taskRetryAttempt}}/view/logs?duration=all&fromExecutionNav=true - storage.yaml: | - storage: - container: "test-bucket" - type: s3 - connection: - auth-type: iam 
- region: us-east-1 - enable-multicontainer: false - limits: - maxDownloadMBs: 1024 - cache: - max_size_mbs: 0 - target_gc_percent: 70 --- # Source: dataplane/charts/fluentbit/templates/clusterrole.yaml apiVersion: rbac.authorization.k8s.io/v1 @@ -3883,275 +3397,1143 @@ rules: - list - watch --- -# Source: dataplane/charts/kube-state-metrics/templates/role.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: - labels: - helm.sh/chart: kube-state-metrics-5.30.1 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: kube-state-metrics - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" - name: release-name-kube-state-metrics -rules: - -- apiGroups: ["certificates.k8s.io"] - resources: - - certificatesigningrequests - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - configmaps - verbs: ["list", "watch"] - -- apiGroups: ["batch"] - resources: - - cronjobs - verbs: ["list", "watch"] - -- apiGroups: ["extensions", "apps"] - resources: - - daemonsets - verbs: ["list", "watch"] - -- apiGroups: ["extensions", "apps"] - resources: - - deployments - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - endpoints - verbs: ["list", "watch"] - -- apiGroups: ["autoscaling"] - resources: - - horizontalpodautoscalers - verbs: ["list", "watch"] - -- apiGroups: ["extensions", "networking.k8s.io"] - resources: - - ingresses - verbs: ["list", "watch"] - -- apiGroups: ["batch"] - resources: - - jobs - verbs: ["list", "watch"] - -- apiGroups: ["coordination.k8s.io"] - resources: - - leases - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - limitranges - verbs: ["list", "watch"] - -- apiGroups: ["admissionregistration.k8s.io"] - resources: - - mutatingwebhookconfigurations - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - namespaces - verbs: ["list", "watch"] - -- apiGroups: ["networking.k8s.io"] - resources: - - networkpolicies - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - nodes - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - persistentvolumeclaims - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - persistentvolumes - verbs: ["list", "watch"] - -- apiGroups: ["policy"] - resources: - - 
poddisruptionbudgets - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - pods - verbs: ["list", "watch"] - -- apiGroups: ["extensions", "apps"] - resources: - - replicasets - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - replicationcontrollers - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - resourcequotas - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - secrets - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - services - verbs: ["list", "watch"] - -- apiGroups: ["apps"] - resources: - - statefulsets - verbs: ["list", "watch"] + name: knative-serving-operator-aggregated + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +aggregationRule: + clusterRoleSelectors: + # This (along with escalate below) allows the Operator to pick up any + # roles that are provided to the admin of the cluster by knative serving + # automatically. + - matchExpressions: + - {key: serving.knative.dev/release, operator: Exists} +rules: [] # Rules are automatically filled in by the controller manager. +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: knative-serving-operator-aggregated-stable + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +aggregationRule: + clusterRoleSelectors: + # This (along with escalate below) allows the Operator to pick up any + # roles that are provided to the admin of the cluster by knative serving + # automatically. + - matchExpressions: + - {key: app.kubernetes.io/name, operator: In, values: ["knative-serving"]} +rules: [] # Rules are automatically filled in by the controller manager. 
+--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: knative-eventing-operator-aggregated + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +aggregationRule: + clusterRoleSelectors: + # This (along with escalate below) allows the Operator to pick up any + # roles that are provided to the admin of the cluster by knative eventing + # automatically. + - matchExpressions: + - {key: eventing.knative.dev/release, operator: Exists} +rules: [] # Rules are automatically filled in by the controller manager. +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: knative-eventing-operator-aggregated-stable + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +aggregationRule: + clusterRoleSelectors: + # This (along with escalate below) allows the Operator to pick up any + # roles that are provided to the admin of the cluster by knative eventing + # automatically. + - matchExpressions: + - {key: app.kubernetes.io/name, operator: In, values: ["knative-eventing"]} +rules: [] # Rules are automatically filled in by the controller manager. +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. -- apiGroups: ["storage.k8s.io"] - resources: - - storageclasses - verbs: ["list", "watch"] +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: knative-serving-operator + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +rules: + - apiGroups: + - operator.knative.dev + resources: + - '*' + verbs: + - '*' + # Bootstrapping permissions. + # Roles that are explicitly bound buch which are specified by this Operator + # MUST be specified here with 'get' and 'bind'. + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterroles + resourceNames: + - system:auth-delegator + verbs: + - bind + - get + - apiGroups: + - rbac.authorization.k8s.io + resources: + - roles + resourceNames: + - extension-apiserver-authentication-reader + verbs: + - bind + - get + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterroles + - roles + verbs: + - create + - delete + # Escalate is necessary in order to create a role using cluster role aggregation, + # and to allow the Operator to bootstrap itself into the necessary set of + # permissions, even as those continue to evolve upstream. + - escalate + - get + - list + - update + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterrolebindings + - rolebindings + verbs: + - create + - delete + - list + - get + - update + # Permissions required for Knative controller + # infra. 
+ - apiGroups: + - apiregistration.k8s.io + resources: + - apiservices + verbs: + - update + - apiGroups: + - "" + resources: + - services + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - caching.internal.knative.dev + resources: + - images + verbs: + - '*' + - apiGroups: + - "" + resources: + - namespaces + verbs: + - get + - update + - watch + - apiGroups: + - '' + resources: + - events + verbs: + - create + - update + - patch + - apiGroups: + - '' + resources: + - configmaps + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - security.istio.io + - apps + - policy + resources: + - poddisruptionbudgets + - peerauthentications + - deployments + - daemonsets + - replicasets + - statefulsets + verbs: + - create + - delete + - get + - list + - watch + - update + - apiGroups: + - apiregistration.k8s.io + resources: + - apiservices + verbs: + - create + - delete + - get + - list + - apiGroups: + - autoscaling + resources: + - horizontalpodautoscalers + verbs: + - create + - delete + - get + - list + - apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - '*' + - apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - '*' + # Old resources that need cleaning up that are not in the knative-serving + # namespace. 
+ - apiGroups: + - "" + resources: + - services + - deployments + - horizontalpodautoscalers + resourceNames: + - knative-ingressgateway + verbs: + - delete + - apiGroups: + - "" + resources: + - configmaps + resourceNames: + - config-controller + verbs: + - delete + - apiGroups: + - "" + resources: + - serviceaccounts + resourceNames: + - knative-serving-operator + verbs: + - delete + # for contour TLS + - apiGroups: + - projectcontour.io + resources: + - httpproxies + - tlscertificatedelegations + verbs: + - get + - list + - watch + - update + - create + - delete + - deletecollection + - patch + # for security-guard + - apiGroups: + - guard.security.knative.dev + resources: + - guardians + verbs: + - get + - list + - watch + - create + - update + - patch + - delete + - apiGroups: + - "" + resources: + - configmaps + verbs: + - get + - list + - watch + - create + - update + - patch + - delete + - apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + verbs: + - create + - apiGroups: + - "" + resources: + - pods + verbs: + - get +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: knative-eventing-operator + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +rules: + - apiGroups: + - operator.knative.dev + resources: + - '*' + verbs: + - '*' + # Bootstrapping permissions. + # Roles that are explicitly bound buch which are specified by this Operator + # MUST be specified here with 'get' and 'bind'. + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterroles + - roles + verbs: + - create + - delete + # Escalate is necessary in order to create a role using cluster role aggregation, + # and to allow the Operator to bootstrap itself into the necessary set of + # permissions, even as those continue to evolve upstream. 
+ - escalate + - get + - list + - update + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterrolebindings + - rolebindings + verbs: + - create + - delete + - list + - get + - update + # Permissions required for Knative controller + # infra. + - apiGroups: + - apiregistration.k8s.io + resources: + - apiservices + verbs: + - update + - apiGroups: + - "" + resources: + - services + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - caching.internal.knative.dev + resources: + - images + verbs: + - '*' + - apiGroups: + - "" + resources: + - namespaces + verbs: + - get + - update + - watch + - apiGroups: + - '' + resources: + - events + verbs: + - create + - update + - patch + - apiGroups: + - '' + resources: + - configmaps + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - apps + resources: + - deployments + - daemonsets + - replicasets + - statefulsets + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - apiregistration.k8s.io + resources: + - apiservices + verbs: + - create + - delete + - get + - list + - apiGroups: + - autoscaling + resources: + - horizontalpodautoscalers + verbs: + - create + - delete + - update + - get + - list + - apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - '*' + - apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - '*' + - apiGroups: + - batch + resources: + - jobs + verbs: + - create + - delete + - update + - get + - list + - watch + # Old resources that need cleaning up that are not in the knative-eventing + # namespace. 
+ - apiGroups: + - "" + resources: + - serviceaccounts + resourceNames: + - knative-eventing-operator + verbs: + - delete + # for RabbitMQ messaging topology objects + - apiGroups: + - rabbitmq.com + resources: + - rabbitmqclusters + verbs: + - "get" + - "list" + - "watch" + - apiGroups: + - rabbitmq.com + resources: + - bindings + - queues + - exchanges + verbs: + - create + - delete + - get + - list + - patch + - update + - watch + - apiGroups: + - rabbitmq.com + resources: + - bindings/status + - queues/status + - exchanges/status + verbs: + - get + # for Kafka eventing source + - apiGroups: + - keda.sh + resources: + - scaledobjects + - scaledobjects/finalizers + - scaledobjects/status + - triggerauthentications + - triggerauthentications/status + verbs: + - get + - list + - watch + - update + - create + - delete + # Internal APIs + - apiGroups: + - "internal.kafka.eventing.knative.dev" + resources: + - "consumers" + - "consumers/status" + - "consumergroups" + - "consumergroups/status" + verbs: + - create + - get + - list + - watch + - patch + - update + - delete + - apiGroups: + - "internal.kafka.eventing.knative.dev" + resources: + - "consumers/finalizers" + - "consumergroups/finalizers" + verbs: + - update + - delete + - apiGroups: + - apps + resources: + - statefulsets/scale + verbs: + - get + - list + - watch + - update + - patch + - create + - delete + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterrolebindings + verbs: + - watch + - apiGroups: + - "*" + resources: + - configmaps + verbs: + - delete + - apiGroups: + - "*" + resources: + - configmaps + - services + verbs: + - get + - list + - watch + - update + - create + - delete + - apiGroups: + - "*" + resources: + - pods + verbs: + - list + - update + - get + - watch + - apiGroups: + - "*" + resources: + - pods/finalizers + verbs: + - get + - list + - create + - update + - delete + - apiGroups: + - "*" + resources: + - events + verbs: + - patch + - create + - apiGroups: + - "*" + 
resources: + - secrets + verbs: + - get + - list + - watch + - update + - create + - delete + - apiGroups: + - "*" + resources: + - nodes + verbs: + - get + - list + - watch + - apiGroups: + - "*" + resources: + - serviceaccounts + verbs: + - get + - list + - watch + - update + - create + - delete + - apiGroups: + - "*" + resources: + - configmaps + resourceNames: + - kafka-channel-config + verbs: + - patch + - apiGroups: + - "*" + resources: + - horizontalpodautoscalers + resourceNames: + - kafka-webhook + verbs: + - delete + - apiGroups: + - "*" + resources: + - leases + verbs: + - delete + - apiGroups: + - "*" + resources: + - poddisruptionbudgets + resourceNames: + - kafka-webhook + verbs: + - delete + - apiGroups: + - "*" + resources: + - services + verbs: + - patch + - apiGroups: + - "apps" + resources: + - deployments + verbs: + - deletecollection + # Eventing TLS + - apiGroups: + - "cert-manager.io" + resources: + - certificates + - issuers + - clusterissuers + verbs: + - create + - delete + - update + - list + - get + - watch + - apiGroups: + - "trust.cert-manager.io" + resources: + - bundles + verbs: + - create + - delete + - update + - list + - get + - watch +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: knative-operator-webhook + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +rules: + # For watching logging configuration and getting certs. + - apiGroups: + - "" + resources: + - "configmaps" + verbs: + - "get" + - "list" + - "watch" + # For manipulating certs into secrets. 
+ - apiGroups: + - "" + resources: + - "namespaces" + verbs: + - "get" + - "create" + - "update" + - "list" + - "watch" + - "patch" + # finalizers are needed for the owner reference of the webhook + - apiGroups: + - "" + resources: + - "namespaces/finalizers" + verbs: + - "update" + # For getting our Deployment so we can decorate with ownerref. + - apiGroups: + - "apps" + resources: + - "deployments" + verbs: + - "get" + - apiGroups: + - "apps" + resources: + - "deployments/finalizers" + verbs: + - update + # For actually registering our webhook. + - apiGroups: + - "admissionregistration.k8s.io" + resources: + - "mutatingwebhookconfigurations" + - "validatingwebhookconfigurations" + verbs: &everything + - "get" + - "list" + - "create" + - "update" + - "delete" + - "patch" + - "watch" + # For leader election + - apiGroups: + - "coordination.k8s.io" + resources: + - "leases" + verbs: *everything + # Necessary for conversion webhook. These are copied from the serving + # TODO: Do we really need all these permissions? 
+ - apiGroups: ["apiextensions.k8s.io"] + resources: ["customresourcedefinitions"] + verbs: ["get", "list", "create", "update", "delete", "patch", "watch"] +--- +# Source: dataplane/charts/fluentbit/templates/clusterrolebinding.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: release-name-fluentbit + labels: + helm.sh/chart: fluentbit-0.48.9 + app.kubernetes.io/name: fluentbit + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "3.2.8" + app.kubernetes.io/managed-by: Helm +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: release-name-fluentbit +subjects: + - kind: ServiceAccount + name: union-system + namespace: union +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# TODO: Consider restriction of non-aggregated role to knativeservings namespaces. 
+apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: knative-serving-operator + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-serving-operator +subjects: + - kind: ServiceAccount + name: knative-operator + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# TODO: Consider restriction of non-aggregated role to knativeeventing namespaces. +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: knative-eventing-operator + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-eventing-operator +subjects: + - kind: ServiceAccount + name: knative-operator + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: operator-webhook + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-operator-webhook +subjects: + - kind: ServiceAccount + name: operator-webhook + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. -- apiGroups: ["admissionregistration.k8s.io"] - resources: - - validatingwebhookconfigurations - verbs: ["list", "watch"] +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: knative-serving-operator-aggregated + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-serving-operator-aggregated +subjects: + - kind: ServiceAccount + name: knative-operator + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: knative-serving-operator-aggregated-stable + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-serving-operator-aggregated-stable +subjects: + - kind: ServiceAccount + name: knative-operator + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: knative-eventing-operator-aggregated + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-eventing-operator-aggregated +subjects: + - kind: ServiceAccount + name: knative-operator + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: knative-eventing-operator-aggregated-stable + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: 
ClusterRole + name: knative-eventing-operator-aggregated-stable +subjects: + - kind: ServiceAccount + name: knative-operator + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. -- apiGroups: ["storage.k8s.io"] - resources: - - volumeattachments - verbs: ["list", "watch"] +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + namespace: "union" + name: knative-operator-webhook + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +rules: + # For manipulating certs into secrets. 
+ - apiGroups: + - "" + resources: + - "secrets" + verbs: + - "get" + - "create" + - "update" + - "list" + - "watch" + - "patch" --- -# Source: dataplane/charts/opencost/templates/clusterrole.yaml -# Cluster role giving opencost to get, list, watch required resources -# No write permissions are required +# Source: dataplane/templates/monitoring/prometheus-rbac.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole +kind: Role metadata: - name: release-name-opencost + name: union-operator-prometheus-rbac + namespace: union labels: - helm.sh/chart: opencost-1.42.0 - app.kubernetes.io/name: opencost - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "1.111.0" - app.kubernetes.io/part-of: opencost - app.kubernetes.io/managed-by: Helm + release: release-name rules: + # Prometheus server scrape permissions - apiGroups: [""] resources: - - configmaps - - deployments - nodes - - pods + - nodes/proxy + - nodes/metrics - services - - resourcequotas - - replicationcontrollers - - limitranges - - persistentvolumeclaims - - persistentvolumes - - namespaces - endpoints + - pods + - ingresses + - configmaps verbs: - get - list - watch - - apiGroups: - - extensions - resources: - - daemonsets - - deployments - - replicasets - verbs: - - get - - list - - watch - - apiGroups: - - apps - resources: - - statefulsets - - deployments - - daemonsets - - replicasets - verbs: - - list - - watch - - apiGroups: - - batch - resources: - - cronjobs - - jobs - verbs: - - get - - list - - watch - - apiGroups: - - autoscaling - resources: - - horizontalpodautoscalers - verbs: - - get - - list - - watch - - apiGroups: - - policy + - apiGroups: ["extensions", "networking.k8s.io"] resources: - - poddisruptionbudgets + - ingresses/status + - ingresses verbs: - get - list - watch - - apiGroups: - - storage.k8s.io + - apiGroups: ["discovery.k8s.io"] resources: - - storageclasses + - endpointslices verbs: - get - list - watch ---- -# Source: 
dataplane/templates/clusterresourcesync/serviceaccount.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: union-clustersync-resource -rules: - - apiGroups: - - "" - - rbac.authorization.k8s.io - resources: - - configmaps - - namespaces - - pods - - resourcequotas - - roles - - rolebindings - - secrets - - services - - serviceaccounts - - clusterrolebindings - - podtemplates - verbs: - - '*' + # kube-state-metrics permissions (rbac.create=false on the subchart; managed here instead) + - apiGroups: ["certificates.k8s.io"] + resources: ["certificatesigningrequests"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["configmaps"] + verbs: ["list", "watch"] + - apiGroups: ["batch"] + resources: ["cronjobs"] + verbs: ["list", "watch"] + - apiGroups: ["extensions", "apps"] + resources: ["daemonsets"] + verbs: ["list", "watch"] + - apiGroups: ["extensions", "apps"] + resources: ["deployments"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["endpoints"] + verbs: ["list", "watch"] + - apiGroups: ["discovery.k8s.io"] + resources: ["endpointslices"] + verbs: ["list", "watch"] + - apiGroups: ["autoscaling"] + resources: ["horizontalpodautoscalers"] + verbs: ["list", "watch"] + - apiGroups: ["extensions", "networking.k8s.io"] + resources: ["ingresses"] + verbs: ["list", "watch"] + - apiGroups: ["batch"] + resources: ["jobs"] + verbs: ["list", "watch"] + - apiGroups: ["coordination.k8s.io"] + resources: ["leases"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["limitranges"] + verbs: ["list", "watch"] + - apiGroups: ["admissionregistration.k8s.io"] + resources: ["mutatingwebhookconfigurations"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["namespaces"] + verbs: ["list", "watch"] + - apiGroups: ["networking.k8s.io"] + resources: ["networkpolicies"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["persistentvolumeclaims"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: 
["persistentvolumes"] + verbs: ["list", "watch"] + - apiGroups: ["policy"] + resources: ["poddisruptionbudgets"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["pods"] + verbs: ["list", "watch"] + - apiGroups: ["extensions", "apps"] + resources: ["replicasets"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["replicationcontrollers"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["resourcequotas"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["secrets"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["services"] + verbs: ["list", "watch"] + - apiGroups: ["apps"] + resources: ["statefulsets"] + verbs: ["list", "watch"] + - apiGroups: ["storage.k8s.io"] + resources: ["storageclasses"] + verbs: ["list", "watch"] + - apiGroups: ["admissionregistration.k8s.io"] + resources: ["validatingwebhookconfigurations"] + verbs: ["list", "watch"] + - apiGroups: ["storage.k8s.io"] + resources: ["volumeattachments"] + verbs: ["list", "watch"] --- # Source: dataplane/templates/nodeexecutor/serviceaccount.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole +kind: Role metadata: name: union-executor labels: @@ -4202,11 +4584,12 @@ rules: - delete - update --- -# Source: dataplane/templates/operator/serviceaccount-proxy.yaml +# Source: dataplane/templates/operator/serviceaccount-proxy-secret.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole +kind: Role metadata: - name: proxy-system + name: union-system-secret + namespace: union labels: app.kubernetes.io/name: operator-proxy app.kubernetes.io/instance: release-name @@ -4216,146 +4599,86 @@ rules: - apiGroups: - '*' resources: - - events - - flyteworkflows - - pods/log - - pods - - rayjobs - - resourcequotas - verbs: - - get - - list - - watch ---- -# Source: dataplane/templates/operator/serviceaccount.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: operator-system - labels: - app.kubernetes.io/name: 
union-operator - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm -rules: - # Allow Access to all resources under flyte.lyft.com - - apiGroups: - - flyte.lyft.com - resources: - - flyteworkflows - - flyteworkflows/finalizers - verbs: - - get - - list - - watch - - create - - update - - delete - - patch - - post - - deletecollection - - apiGroups: - - '*' - resources: - - resourcequotas - - pods - - configmaps - - podtemplates - secrets - - namespaces - - nodes verbs: - get - list - - watch - create - update - delete - - nonResourceURLs: - - /metrics - verbs: - - get --- -# Source: dataplane/templates/prometheus/rbac.yaml +# Source: dataplane/templates/operator/serviceaccount-proxy.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole +kind: Role metadata: - name: union-operator-prometheus + name: proxy-system labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/name: operator-proxy app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" + platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus rules: - - apiGroups: [""] + - apiGroups: + - '*' resources: - - nodes - - nodes/proxy + - events + - flyteworkflows + - pods/log - pods - - endpoints - - services + - rayjobs + - resourcequotas verbs: - get - list - watch - - nonResourceURLs: - - /metrics - - /metrics/cadvisor - verbs: - - get ---- -# Source: dataplane/templates/propeller/serviceaccount-webhook.yaml -kind: ClusterRole -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: flytepropeller-webhook-role - namespace: union -rules: - - apiGroups: - - "*" - resources: - - mutatingwebhookconfigurations - - secrets - - pods - - replicasets/finalizers - verbs: - - get - - create - - update - - patch --- -# Source: dataplane/templates/propeller/serviceaccount.yaml +# Source: 
dataplane/templates/operator/serviceaccount.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole +kind: Role metadata: - name: flytepropeller-role + name: operator-system + labels: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm rules: - # Allow RO access to PODS - apiGroups: - - "" + - '*' resources: - - pods + - secrets + - deployments verbs: - get - list - watch - # Allow Event recording access + - create + - update - apiGroups: - - "" + - flyte.lyft.com resources: - - events + - flyteworkflows + - flyteworkflows/finalizers verbs: + - get + - list + - watch - create - update - delete - patch - # Allow Access All plugin objects + - post + - deletecollection - apiGroups: - '*' resources: - - '*' + - resourcequotas + - pods + - configmaps + - podtemplates + - secrets + - namespaces + - nodes verbs: - get - list @@ -4363,148 +4686,129 @@ rules: - create - update - delete - - patch - # Allow Access to CRD - apiGroups: - - apiextensions.k8s.io + - serving.knative.dev resources: - - customresourcedefinitions + - revisions + - configurations + - services verbs: - get - list - watch - create - - delete - update - # Allow Access to all resources under flyte.lyft.com + - delete +--- +# Source: dataplane/templates/webhook/serviceaccount.yaml +kind: Role +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: union-webhook-role + namespace: union +rules: - apiGroups: - - flyte.lyft.com + - "*" resources: - - flyteworkflows - - flyteworkflows/finalizers + - mutatingwebhookconfigurations + - secrets + - pods + - replicasets/finalizers verbs: - get - - list - - watch - create - update - - delete - patch - - post - - deletecollection --- -# Source: dataplane/charts/fluentbit/templates/clusterrolebinding.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under 
the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding +kind: RoleBinding metadata: - name: release-name-fluentbit + namespace: "union" + name: operator-webhook labels: - helm.sh/chart: fluentbit-0.48.9 - app.kubernetes.io/name: fluentbit - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "3.2.8" - app.kubernetes.io/managed-by: Helm -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: release-name-fluentbit + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator subjects: - kind: ServiceAccount - name: fluentbit-system - namespace: union ---- -# Source: dataplane/charts/kube-state-metrics/templates/clusterrolebinding.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - labels: - helm.sh/chart: kube-state-metrics-5.30.1 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: kube-state-metrics - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" - name: release-name-kube-state-metrics + name: operator-webhook + namespace: "union" roleRef: + kind: Role + name: knative-operator-webhook apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: release-name-kube-state-metrics -subjects: -- kind: ServiceAccount - name: release-name-kube-state-metrics - namespace: union --- -# Source: 
dataplane/charts/opencost/templates/clusterrolebinding.yaml +# Source: dataplane/templates/monitoring/prometheus-rbac.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding +kind: RoleBinding metadata: - name: release-name-opencost + name: union-operator-prometheus-rbac + namespace: union labels: - helm.sh/chart: opencost-1.42.0 - app.kubernetes.io/name: opencost - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "1.111.0" - app.kubernetes.io/part-of: opencost - app.kubernetes.io/managed-by: Helm -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: release-name-opencost + release: release-name subjects: - kind: ServiceAccount - name: release-name-opencost + name: union-operator-prometheus namespace: union ---- -# Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: union-clustersync-resource roleRef: apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: union-clustersync-resource -subjects: - - kind: ServiceAccount - name: union-clustersync-system - namespace: union + kind: Role + name: union-operator-prometheus-rbac --- -# Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml +# Source: dataplane/templates/monitoring/prometheus-rbac.yaml +# kube-state-metrics service account RoleBinding apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding +kind: RoleBinding metadata: - name: union-clustersync-auth-delegator -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: system:auth-delegator + name: release-name-prometheus-kube-state-metrics + namespace: union + labels: + release: release-name subjects: - kind: ServiceAccount - name: union-clustersync-system + name: release-name-prometheus-kube-state-metrics namespace: union +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: union-operator-prometheus-rbac --- # Source: 
dataplane/templates/nodeexecutor/serviceaccount.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding +kind: RoleBinding metadata: name: union-executor labels: app: executor roleRef: apiGroup: rbac.authorization.k8s.io - kind: ClusterRole + kind: Role name: union-executor subjects: - kind: ServiceAccount - name: executor + name: union-system namespace: union --- -# Source: dataplane/templates/operator/serviceaccount-proxy.yaml +# Source: dataplane/templates/operator/serviceaccount-proxy-secret.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding +kind: RoleBinding metadata: - name: proxy-system + name: union-system-secret + namespace: union labels: app.kubernetes.io/name: operator-proxy app.kubernetes.io/instance: release-name @@ -4512,109 +4816,35 @@ metadata: app.kubernetes.io/managed-by: Helm roleRef: apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: proxy-system + kind: Role + name: union-system-secret subjects: - kind: ServiceAccount - name: proxy-system + name: union-system namespace: union --- -# Source: dataplane/templates/operator/serviceaccount.yaml +# Source: dataplane/templates/operator/serviceaccount-proxy.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding +kind: RoleBinding metadata: - name: operator-system + name: proxy-system labels: - app.kubernetes.io/name: union-operator + app.kubernetes.io/name: operator-proxy app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm roleRef: apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: operator-system -subjects: - - kind: ServiceAccount - name: operator-system - namespace: union ---- -# Source: dataplane/templates/prometheus/rbac.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: union-operator-prometheus - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane - 
app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: union-operator-prometheus -subjects: - - kind: ServiceAccount - name: union-operator-prometheus - namespace: union ---- -# Source: dataplane/templates/propeller/serviceaccount-webhook.yaml -# Create a binding from Role -> ServiceAccount -kind: ClusterRoleBinding -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: flytepropeller-webhook-binding - namespace: union -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: flytepropeller-webhook-role -subjects: - - kind: ServiceAccount - name: flytepropeller-webhook-system - namespace: union ---- -# Source: dataplane/templates/propeller/serviceaccount.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: flytepropeller-binding -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: flytepropeller-role + kind: Role + name: proxy-system subjects: - kind: ServiceAccount - name: flytepropeller-system + name: union-system namespace: union --- -# Source: dataplane/templates/operator/serviceaccount-proxy-secret.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - name: proxy-system-secret - namespace: union - labels: - app.kubernetes.io/name: operator-proxy - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm -rules: - - apiGroups: - - '*' - resources: - - secrets - verbs: - - get - - list - - create - - update - - delete ---- # Source: dataplane/templates/operator/serviceaccount.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: Role +kind: RoleBinding metadata: name: operator-system labels: @@ -4622,56 +4852,29 @@ metadata: app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name 
app.kubernetes.io/managed-by: Helm -rules: - - apiGroups: - - '*' - resources: - - secrets - - deployments - verbs: - - get - - list - - watch - - create - - update ---- -# Source: dataplane/templates/operator/serviceaccount-proxy-secret.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: proxy-system-secret - namespace: union - labels: - app.kubernetes.io/name: operator-proxy - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm roleRef: apiGroup: rbac.authorization.k8s.io kind: Role - name: proxy-system-secret + name: operator-system subjects: - kind: ServiceAccount - name: proxy-system + name: union-system namespace: union --- -# Source: dataplane/templates/operator/serviceaccount.yaml -apiVersion: rbac.authorization.k8s.io/v1 +# Source: dataplane/templates/webhook/serviceaccount.yaml +# Create a binding from Role -> ServiceAccount kind: RoleBinding +apiVersion: rbac.authorization.k8s.io/v1 metadata: - name: operator-system - labels: - app.kubernetes.io/name: union-operator - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm + name: union-webhook-binding + namespace: union roleRef: apiGroup: rbac.authorization.k8s.io kind: Role - name: operator-system + name: union-webhook-role subjects: - kind: ServiceAccount - name: operator-system + name: union-system namespace: union --- # Source: dataplane/charts/fluentbit/templates/service.yaml @@ -4697,20 +4900,60 @@ spec: app.kubernetes.io/name: fluentbit app.kubernetes.io/instance: release-name --- -# Source: dataplane/charts/kube-state-metrics/templates/service.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v1 +kind: Service +metadata: + labels: + role: operator-webhook + app.kubernetes.io/component: operator-webhook + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator + name: operator-webhook + namespace: "union" +spec: + ports: + # Define metrics and profiling for them to be accessible within service meshes. + - name: http-metrics + port: 9090 + targetPort: 9090 + - name: http-profiling + port: 8008 + targetPort: 8008 + - name: https-webhook + port: 443 + targetPort: 8443 + selector: + role: operator-webhook +--- +# Source: dataplane/charts/prometheus/charts/kube-state-metrics/templates/service.yaml apiVersion: v1 kind: Service metadata: name: release-name-kube-state-metrics namespace: union labels: - helm.sh/chart: kube-state-metrics-5.30.1 + helm.sh/chart: kube-state-metrics-5.25.1 app.kubernetes.io/managed-by: Helm app.kubernetes.io/component: metrics app.kubernetes.io/part-of: kube-state-metrics app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" + app.kubernetes.io/version: "2.13.0" annotations: prometheus.io/scrape: 'true' spec: @@ -4725,28 +4968,32 @@ spec: app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/instance: release-name --- -# Source: dataplane/charts/opencost/templates/service.yaml +# Source: dataplane/charts/prometheus/templates/service.yaml apiVersion: v1 kind: Service metadata: - name: release-name-opencost - namespace: union labels: - helm.sh/chart: opencost-1.42.0 - app.kubernetes.io/name: opencost + 
app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "1.111.0" - app.kubernetes.io/part-of: opencost + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: prometheus + name: union-operator-prometheus + namespace: union spec: + ports: + - name: http + port: 80 + protocol: TCP + targetPort: 9090 selector: - app.kubernetes.io/name: opencost + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name + sessionAffinity: None type: "ClusterIP" - ports: - - name: http - port: 9003 - targetPort: 9003 --- # Source: dataplane/templates/clusterresourcesync/service.yaml apiVersion: v1 @@ -4770,6 +5017,31 @@ spec: app.kubernetes.io/name: clusterresourcesync app.kubernetes.io/instance: release-name --- +# Source: dataplane/templates/flyteconnector/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: flyteconnector + namespace: union + labels: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + app.kubernetes.io/managed-by: Helm +spec: + clusterIP: None + ports: + - name: grpc + port: 8000 + protocol: TCP + targetPort: grpc + - name: metric + port: 9090 + protocol: TCP + targetPort: metric + selector: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name +--- # Source: dataplane/templates/imagebuilder/service.yaml apiVersion: v1 kind: Service @@ -4795,7 +5067,7 @@ spec: apiVersion: v1 kind: Service metadata: - name: release-name-dataplane-executor + name: union-operator-executor labels: platform.union.ai/prometheus-group: "union-services" app: executor @@ -4861,39 +5133,14 @@ spec: app.kubernetes.io/name: union-operator app.kubernetes.io/instance: release-name --- -# Source: dataplane/templates/prometheus/service.yaml -apiVersion: v1 -kind: Service -metadata: - name: 
union-operator-prometheus - namespace: union - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus -spec: - type: ClusterIP - ports: - - port: 80 - targetPort: 9090 - protocol: TCP - name: http - selector: - app.kubernetes.io/component: prometheus - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name ---- -# Source: dataplane/templates/propeller/service-webhook.yaml +# Source: dataplane/templates/webhook/service.yaml apiVersion: v1 kind: Service metadata: - name: flyte-pod-webhook + name: union-pod-webhook namespace: union labels: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm @@ -4901,7 +5148,7 @@ metadata: projectcontour.io/upstream-protocol.h2c: grpc spec: selector: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name ports: - name: https @@ -4913,23 +5160,23 @@ spec: port: 10254 targetPort: 10254 --- -# Source: dataplane/templates/propeller/service-webhook.yaml +# Source: dataplane/templates/webhook/service.yaml # Headless Service for cache invalidation — resolves to all pod IPs so that # we can fan out invalidation requests to every webhook replica. 
apiVersion: v1 kind: Service metadata: - name: flyte-pod-webhook-headless + name: union-pod-webhook-headless namespace: union labels: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm spec: clusterIP: None selector: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name ports: - name: cache-internal @@ -4937,32 +5184,6 @@ spec: port: 9443 targetPort: 9443 --- -# Source: dataplane/templates/propeller/service.yaml -apiVersion: v1 -kind: Service -metadata: - namespace: union - name: flytepropeller - labels: - platform.union.ai/prometheus-group: "union-services" - app.kubernetes.io/name: flytepropeller - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm -spec: - type: ClusterIP - ports: - - name: debug - protocol: TCP - port: 10254 - - name: fasttask - port: 15605 - protocol: TCP - targetPort: 15605 - selector: - app.kubernetes.io/name: flytepropeller - app.kubernetes.io/instance: release-name ---- # Source: dataplane/charts/fluentbit/templates/daemonset.yaml apiVersion: apps/v1 kind: DaemonSet @@ -4988,7 +5209,7 @@ spec: annotations: checksum/config: e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 spec: - serviceAccountName: fluentbit-system + serviceAccountName: union-system hostNetwork: false dnsPolicy: ClusterFirst containers: @@ -5040,20 +5261,217 @@ spec: tolerations: - operator: Exists --- -# Source: dataplane/charts/kube-state-metrics/templates/deployment.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: operator-webhook + namespace: "union" + labels: + app.kubernetes.io/component: operator-webhook + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +spec: + selector: + matchLabels: + app: operator-webhook + role: operator-webhook + template: + metadata: + labels: + app: operator-webhook + role: operator-webhook + app.kubernetes.io/component: operator-webhook + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator + sidecar.istio.io/inject: "false" + spec: + # To avoid node becoming SPOF, spread our replicas to different nodes. + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - podAffinityTerm: + labelSelector: + matchLabels: + app: operator-webhook + topologyKey: kubernetes.io/hostname + weight: 100 + serviceAccountName: operator-webhook + containers: + - name: operator-webhook + # This is the Go import path for the binary that is containerized + # and substituted here. 
+ image: gcr.io/knative-releases/knative.dev/operator/cmd/webhook@sha256:d3a7a3304629ccfcaacec620b50736e0b4c902a6328b83369b6cbaf7e94677c9 + resources: + requests: + cpu: 100m + memory: 100Mi + limits: + cpu: 500m + memory: 500Mi + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: SYSTEM_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: CONFIG_LOGGING_NAME + value: config-logging + - name: CONFIG_OBSERVABILITY_NAME + value: config-observability + - name: WEBHOOK_NAME + value: operator-webhook + - name: WEBHOOK_PORT + value: "8443" + - name: WEBHOOK_SECRET_NAME + value: operator-webhook-certs + - name: METRICS_DOMAIN + value: knative.dev/operator + - name: KUBERNETES_MIN_VERSION + value: "" + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + runAsNonRoot: true + capabilities: + drop: + - ALL + ports: + - name: metrics + containerPort: 9090 + - name: profiling + containerPort: 8008 + - name: https-webhook + containerPort: 8443 + readinessProbe: + periodSeconds: 1 + httpGet: + scheme: HTTPS + port: 8443 + httpHeaders: + - name: k-kubelet-probe + value: "webhook" + livenessProbe: + periodSeconds: 1 + httpGet: + scheme: HTTPS + port: 8443 + httpHeaders: + - name: k-kubelet-probe + value: "webhook" + failureThreshold: 6 + initialDelaySeconds: 120 + # Our webhook should gracefully terminate by lame ducking first, set this to a sufficiently + # high value that we respect whatever value it has configured for the lame duck grace period. + terminationGracePeriodSeconds: 300 +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: knative-operator + namespace: "union" + labels: + app.kubernetes.io/name: knative-operator + app.kubernetes.io/version: "1.16.0" +spec: + replicas: 1 + selector: + matchLabels: + name: knative-operator + template: + metadata: + labels: + name: knative-operator + app.kubernetes.io/name: knative-operator + app.kubernetes.io/version: "1.16.0" + sidecar.istio.io/inject: "false" + spec: + serviceAccountName: knative-operator + containers: + - name: knative-operator + image: gcr.io/knative-releases/knative.dev/operator/cmd/operator@sha256:0b5a3532417f9c8e7b6044e23f6f67ad932a74a40a4092ad965b6f173b2fd887 + resources: + requests: + cpu: 100m + memory: 100Mi + limits: + cpu: 1000m + memory: 1000Mi + imagePullPolicy: IfNotPresent + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: SYSTEM_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: METRICS_DOMAIN + value: knative.dev/operator + - name: CONFIG_LOGGING_NAME + value: config-logging + - name: CONFIG_OBSERVABILITY_NAME + value: config-observability + - name: KUBERNETES_MIN_VERSION + value: "" + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + runAsNonRoot: true + capabilities: + drop: + - ALL + ports: + - name: metrics + containerPort: 9090 +--- +# Source: dataplane/charts/prometheus/charts/kube-state-metrics/templates/deployment.yaml apiVersion: apps/v1 kind: Deployment metadata: name: release-name-kube-state-metrics namespace: union labels: - helm.sh/chart: 
kube-state-metrics-5.30.1 + helm.sh/chart: kube-state-metrics-5.25.1 app.kubernetes.io/managed-by: Helm app.kubernetes.io/component: metrics app.kubernetes.io/part-of: kube-state-metrics app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" + app.kubernetes.io/version: "2.13.0" spec: selector: matchLabels: @@ -5066,13 +5484,13 @@ spec: template: metadata: labels: - helm.sh/chart: kube-state-metrics-5.30.1 + helm.sh/chart: kube-state-metrics-5.25.1 app.kubernetes.io/managed-by: Helm app.kubernetes.io/component: metrics app.kubernetes.io/part-of: kube-state-metrics app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" + app.kubernetes.io/version: "2.13.0" spec: automountServiceAccountToken: true hostNetwork: false @@ -5089,8 +5507,9 @@ spec: args: - --port=8080 - --resources=certificatesigningrequests,configmaps,cronjobs,daemonsets,deployments,endpoints,horizontalpodautoscalers,ingresses,jobs,leases,limitranges,mutatingwebhookconfigurations,namespaces,networkpolicies,nodes,persistentvolumeclaims,persistentvolumes,poddisruptionbudgets,pods,replicasets,replicationcontrollers,resourcequotas,secrets,services,statefulsets,storageclasses,validatingwebhookconfigurations,volumeattachments + - --namespaces=union imagePullPolicy: IfNotPresent - image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.15.0 + image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.13.0 ports: - containerPort: 8080 name: "http" @@ -5110,7 +5529,7 @@ spec: httpGet: httpHeaders: path: /readyz - port: 8081 + port: 8080 scheme: HTTP initialDelaySeconds: 5 periodSeconds: 10 @@ -5125,203 +5544,204 @@ spec: - ALL readOnlyRootFilesystem: true --- -# Source: dataplane/charts/opencost/templates/deployment.yaml +# Source: dataplane/charts/prometheus/templates/deploy.yaml apiVersion: apps/v1 kind: Deployment metadata: - name: release-name-opencost - namespace: 
union labels: - helm.sh/chart: opencost-1.42.0 - app.kubernetes.io/name: opencost + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "1.111.0" - app.kubernetes.io/part-of: opencost + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: prometheus + name: union-operator-prometheus + namespace: union spec: - replicas: 1 selector: matchLabels: - app.kubernetes.io/name: opencost + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name - strategy: - rollingUpdate: - maxSurge: 1 - maxUnavailable: 1 - type: RollingUpdate + replicas: 1 + revisionHistoryLimit: 10 + strategy: + type: Recreate + rollingUpdate: null template: metadata: labels: - app.kubernetes.io/name: opencost + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: prometheus spec: - serviceAccountName: release-name-opencost + enableServiceLinks: true + serviceAccountName: union-operator-prometheus containers: - - name: release-name-opencost - image: ghcr.io/opencost/opencost:1.111.0@sha256:6aa68e52a24b14ba41f23db08d1b9db1429a1c0300f4c0381ecc2c61fc311a97 - imagePullPolicy: IfNotPresent + - name: prometheus-server-configmap-reload + image: "quay.io/prometheus-operator/prometheus-config-reloader:v0.76.0" + imagePullPolicy: "IfNotPresent" args: + - --watched-dir=/etc/config + - --listen-address=0.0.0.0:8080 + - --reload-url=http://127.0.0.1:9090/prometheus/-/reload ports: - - containerPort: 9003 - name: http - resources: - limits: - cpu: 1000m - memory: 4Gi - requests: - cpu: 500m - memory: 1Gi - startupProbe: - httpGet: - path: /healthz - port: 9003 - initialDelaySeconds: 10 - 
periodSeconds: 5 - failureThreshold: 30 + - containerPort: 8080 + name: metrics livenessProbe: httpGet: path: /healthz - port: 9003 - initialDelaySeconds: 10 - periodSeconds: 20 - failureThreshold: 3 + port: metrics + scheme: HTTP + initialDelaySeconds: 2 + periodSeconds: 10 readinessProbe: httpGet: path: /healthz - port: 9003 - initialDelaySeconds: 10 + port: metrics + scheme: HTTP periodSeconds: 10 + volumeMounts: + - name: config-volume + mountPath: /etc/config + readOnly: true + + - name: prometheus-server + image: "quay.io/prometheus/prometheus:v2.54.1" + imagePullPolicy: "IfNotPresent" + args: + - --storage.tsdb.retention.time=3d + - --config.file=/etc/config/prometheus.yml + - --storage.tsdb.path=/data + - --web.console.libraries=/etc/prometheus/console_libraries + - --web.console.templates=/etc/prometheus/consoles + - --web.enable-lifecycle + - --web.route-prefix=/prometheus + - --web.external-url=/prometheus/ + ports: + - containerPort: 9090 + readinessProbe: + httpGet: + path: /prometheus/-/ready + port: 9090 + scheme: HTTP + initialDelaySeconds: 30 + periodSeconds: 5 + timeoutSeconds: 4 failureThreshold: 3 - env: - - name: LOG_LEVEL - value: info - - name: CUSTOM_COST_ENABLED - value: "false" - - name: KUBECOST_NAMESPACE - value: union - - name: API_PORT - value: "9003" - - name: PROMETHEUS_SERVER_ENDPOINT - value: "http://union-operator-prometheus.union.svc:80/prometheus" - - name: CLUSTER_ID - value: "default-cluster" - - name: DATA_RETENTION_DAILY_RESOLUTION_DAYS - value: "15" - - name: CLOUD_COST_ENABLED - value: "false" - - name: CLOUD_COST_MONTH_TO_DATE_INTERVAL - value: "6" - - name: CLOUD_COST_REFRESH_RATE_HOURS - value: "6" - - name: CLOUD_COST_QUERY_WINDOW_DAYS - value: "7" - - name: CLOUD_COST_RUN_WINDOW_DAYS - value: "3" - # Add any additional provided variables + successThreshold: 1 + livenessProbe: + httpGet: + path: /prometheus/-/healthy + port: 9090 + scheme: HTTP + initialDelaySeconds: 30 + periodSeconds: 15 + timeoutSeconds: 10 + 
failureThreshold: 3 + successThreshold: 1 + resources: + limits: + cpu: "3" + memory: 3500Mi + requests: + cpu: "1" + memory: 1Gi + volumeMounts: + - name: config-volume + mountPath: /etc/config + - name: storage-volume + mountPath: /data + subPath: "" + - name: server-recording-rules + mountPath: /etc/config/recording + subPath: + readOnly: + dnsPolicy: ClusterFirst + securityContext: + fsGroup: 65534 + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + terminationGracePeriodSeconds: 300 + volumes: + - name: config-volume + configMap: + name: union-operator-prometheus + - name: server-recording-rules + configMap: + name: union-recording-rules + - name: storage-volume + emptyDir: + sizeLimit: 10Gi --- -# Source: dataplane/templates/clusterresourcesync/deployment.yaml +# Source: dataplane/templates/flyteconnector/deployment.yaml apiVersion: apps/v1 kind: Deployment metadata: - name: union-syncresources + name: flyteconnector namespace: union - labels: - app.kubernetes.io/name: clusterresourcesync + labels: + app.kubernetes.io/name: flyteconnector app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm spec: - replicas: 1 + replicas: 2 selector: - matchLabels: - app.kubernetes.io/name: clusterresourcesync - app.kubernetes.io/instance: release-name - template: - metadata: - annotations: - configChecksum: "bef188a6ef20bf390a95e33b16ee869128e28243ae661609aab81d95191b1d3" - prometheus.io/scrape: "true" - labels: - platform.union.ai/zone: "dataplane" - - azure.workload.identity/use: "true" - custom-label: custom-value - app.kubernetes.io/name: clusterresourcesync - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm - spec: - containers: - - command: - - clusterresource - - --config - - /etc/flyte/config/*.yaml - - clusterresource - - run - env: - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - 
name: POD_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: GOMEMLIMIT - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.memory - - name: GOMAXPROCS - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.cpu - - name: CLUSTER_NAME - valueFrom: - secretKeyRef: - name: operator-cluster-name - key: cluster_name - - name: DEPLOYMENT_NAME - value: operator - - name: PROXY_SERVICE_URL - value: http://union-operator-proxy:8080 - - name: PROMETHEUS_SERVICE_URL - value: http://union-operator-prometheus:80 - - name: KNATIVE_PROXY_SERVICE_URL - value: http://kourier-internal - image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" + matchLabels: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + template: + metadata: + annotations: + labels: + platform.union.ai/zone: "dataplane" + + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + app.kubernetes.io/managed-by: Helm + spec: + containers: + - command: + - c0 + image: "ghcr.io/flyteorg/flyte-connectors:py3.13-2.0.0b50.dev3-g695bb1db3.d20260122" imagePullPolicy: "IfNotPresent" - name: sync-cluster-resources + name: flyteconnector + readinessProbe: + failureThreshold: 3 + httpGet: + path: / + port: 9090 + scheme: HTTP + initialDelaySeconds: 5 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 5 + env: + - name: AIRFLOW_CONN_GOOGLE_CLOUD_DEFAULT + value: "google-cloud-platform://" + - name: LOG_LEVEL + value: "20" + ports: + - containerPort: 8000 + name: grpc + - containerPort: 9090 + name: metric resources: limits: - cpu: "1" - memory: 500Mi + cpu: "1.5" + ephemeral-storage: 100Mi + memory: 1500Mi requests: - cpu: 500m - memory: 100Mi - volumeMounts: - - name: auth - mountPath: /etc/union/secret - - name: resource-templates - mountPath: /etc/flyte/clusterresource/templates - - name: config-volume - mountPath: /etc/flyte/config - ports: - - name: debug - containerPort: 10254 - protocol: TCP - 
serviceAccountName: union-clustersync-system - volumes: - - configMap: - name: union-clusterresource-template - name: resource-templates - - configMap: - name: union-clusterresourcesync-config - name: config-volume - - name: auth - secret: - secretName: union-secret-auth + cpu: "1" + ephemeral-storage: 100Mi + memory: 1000Mi + serviceAccountName: flyteconnector --- # Source: dataplane/templates/imagebuilder/deployment.yaml apiVersion: apps/v1 @@ -5343,17 +5763,15 @@ spec: app.kubernetes.io/instance: release-name template: metadata: - annotations: - container.apparmor.security.beta.kubernetes.io/buildkit: unconfined labels: platform.union.ai/zone: "dataplane" app.kubernetes.io/name: imagebuilder-buildkit app.kubernetes.io/instance: release-name spec: - serviceAccountName: "union-imagebuilder" + serviceAccountName: "union-system" containers: - name: "buildkit" - image: "docker.io/moby/buildkit:buildx-stable-1-rootless" + image: "docker.io/moby/buildkit:buildx-stable-1" imagePullPolicy: IfNotPresent env: - name: POD_NAME @@ -5388,18 +5806,15 @@ spec: - name: KNATIVE_PROXY_SERVICE_URL value: http://kourier-internal volumeMounts: - - mountPath: /home/user/.local/share/buildkit - name: buildkitd - mountPath: /etc/buildkit name: buildkit-config args: - --config - /etc/buildkit/buildkitd.toml - --addr - - unix:///run/user/1000/buildkit/buildkitd.sock + - unix:///run//buildkit/buildkitd.sock - --addr - tcp://0.0.0.0:1234 - - --oci-worker-no-process-sandbox ports: - name: tcp containerPort: 1234 @@ -5421,18 +5836,13 @@ spec: initialDelaySeconds: 5 periodSeconds: 30 securityContext: - seccompProfile: # Needs Kubernetes >= 1.19 - type: Unconfined - runAsUser: 1000 - runAsGroup: 1000 + privileged: true resources: requests: - cpu: 1 - ephemeral-storage: 20Gi - memory: 1Gi + cpu: 4 + ephemeral-storage: 50Gi + memory: 4Gi volumes: - - name: buildkitd - emptyDir: {} - configMap: name: union-operator-buildkit name: buildkit-config @@ -5461,7 +5871,7 @@ spec: template: metadata: 
annotations: - configChecksum: "a1e4a396e83da22e5ffee35c34f73c3b54be6c6ba8bf8d81004e789023847e1" + configChecksum: "d5154bc26a2b8e52a22a16821031e24a380e12a15429af3c00ec1d91eae26ed" prometheus.io/scrape: "true" labels: platform.union.ai/zone: "dataplane" @@ -5469,10 +5879,12 @@ spec: azure.workload.identity/use: "true" custom-label: custom-value app: executor + app.kubernetes.io/instance: 'release-name' + app.kubernetes.io/name: executor spec: securityContext: fsGroup: 1337 - serviceAccountName: executor + serviceAccountName: union-system volumes: - name: config-volume configMap: @@ -5488,7 +5900,7 @@ spec: image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" imagePullPolicy: IfNotPresent command: - - executor + - executorv2 - serve - --config - /etc/config/*.yaml @@ -5566,7 +5978,7 @@ spec: template: metadata: annotations: - configChecksum: "4076d001f4e350fc48c9eb224e0f71482e5fb912fa4fcbe49f7eb2ee4ac44c3" + configChecksum: "9f173d6f4412c91858e16604240da14386ec2ff7f861d554d9b98d78ed9cc94" prometheus.io/scrape: "true" labels: @@ -5583,12 +5995,10 @@ spec: sources: - configMap: name: union-operator - - configMap: - name: union-clusterresourcesync-config - name: secret-volume secret: secretName: union-secret-auth - serviceAccountName: proxy-system + serviceAccountName: union-system securityContext: {} containers: @@ -5705,7 +6115,7 @@ spec: template: metadata: annotations: - configChecksum: "4076d001f4e350fc48c9eb224e0f71482e5fb912fa4fcbe49f7eb2ee4ac44c3" + configChecksum: "9f173d6f4412c91858e16604240da14386ec2ff7f861d554d9b98d78ed9cc94" prometheus.io/scrape: "true" labels: @@ -5716,7 +6126,7 @@ spec: platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm spec: - serviceAccountName: operator-system + serviceAccountName: union-system securityContext: {} volumes: @@ -5797,81 +6207,16 @@ spec: containerPort: 10254 protocol: TCP --- -# Source: dataplane/templates/prometheus/deployment.yaml -apiVersion: apps/v1 -kind: Deployment -metadata: - 
name: union-operator-prometheus - namespace: union - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus -spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/component: prometheus - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - template: - metadata: - annotations: - configChecksum: "72a39ca838f02fc499675b5708fa77e2a4938e316a99315b59e33ef7f2d31ae" - labels: - app.kubernetes.io/component: prometheus - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - spec: - priorityClassName: system-cluster-critical - serviceAccountName: union-operator-prometheus - securityContext: - fsGroup: 65534 - runAsNonRoot: true - runAsUser: 65534 - fsGroupChangePolicy: OnRootMismatch - containers: - - name: prometheus - image: "prom/prometheus:v3.3.1" - args: - - --config.file=/etc/prometheus/prometheus.yml - - --web.external-url=/prometheus/ - - --web.route-prefix=/prometheus/ - - --storage.tsdb.retention.time=3d - ports: - - name: http - containerPort: 9090 - protocol: TCP - resources: - limits: - cpu: "3" - memory: 3500Mi - requests: - cpu: "1" - memory: 1Gi - securityContext: - allowPrivilegeEscalation: false - terminationMessagePolicy: FallbackToLogsOnError - volumeMounts: - - mountPath: /etc/prometheus - name: prometheus-config - volumes: - - name: prometheus-config - configMap: - name: union-operator-prometheus ---- -# Source: dataplane/templates/propeller/deployment-webhook.yaml +# Source: dataplane/templates/webhook/deployment.yaml +# Webhook deployment # Create the actual deployment apiVersion: apps/v1 kind: Deployment metadata: - name: flytepropeller-webhook + name: union-pod-webhook namespace: union labels: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: 
union-pod-webhook app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm @@ -5879,7 +6224,7 @@ spec: replicas: 1 selector: matchLabels: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name template: metadata: @@ -5888,12 +6233,12 @@ spec: azure.workload.identity/use: "true" custom-label: custom-value - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm annotations: - configChecksum: "24e0f0c6dd0bf39ba1103d08dedaa1047108f9263429148d8f22e6025237ac3" + configChecksum: "77958aee0464eee4cb43c70036cf5ffa89f47f58c908845b9eeaecc7e32bba4" prometheus.io/scrape: "true" spec: securityContext: @@ -5901,64 +6246,8 @@ spec: fsGroupChangePolicy: Always runAsNonRoot: true runAsUser: 1001 - seLinuxOptions: - type: spc_t - serviceAccountName: flytepropeller-webhook-system - initContainers: - - name: generate-secrets - image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" - imagePullPolicy: "IfNotPresent" - command: - - flytepropeller - args: - - webhook - - init-certs - - --config - - /etc/flyte/config/*.yaml - env: - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: GOMEMLIMIT - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.memory - - name: GOMAXPROCS - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.cpu - - name: CLUSTER_NAME - valueFrom: - secretKeyRef: - name: operator-cluster-name - key: cluster_name - - name: DEPLOYMENT_NAME - value: operator - - name: PROXY_SERVICE_URL - value: http://union-operator-proxy:8080 - - name: PROMETHEUS_SERVICE_URL - value: http://union-operator-prometheus:80 - - name: KNATIVE_PROXY_SERVICE_URL - value: 
http://kourier-internal - volumeMounts: - - name: config-volume - mountPath: /etc/flyte/config - resources: - limits: - cpu: 1 - ephemeral-storage: 500Mi - memory: 500Mi - requests: - cpu: 200m - ephemeral-storage: 500Mi - memory: 500Mi + seLinuxOptions: null + serviceAccountName: union-system containers: - name: webhook image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" @@ -6023,109 +6312,301 @@ spec: volumes: - name: config-volume configMap: - name: flyte-propeller-config + name: union-pod-webhook-config - name: webhook-certs secret: - secretName: flyte-pod-webhook + secretName: union-pod-webhook --- -# Source: dataplane/templates/propeller/deployment.yaml -apiVersion: apps/v1 -kind: Deployment +# Source: dataplane/templates/flyteconnector/hpa.yaml +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler metadata: - namespace: union - name: flytepropeller + name: flyteconnector + labels: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + app.kubernetes.io/managed-by: Helm +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: flyteconnector + minReplicas: 2 + maxReplicas: 5 + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 80 + - type: Resource + resource: + name: memory + target: + type: Utilization + averageUtilization: 80 +--- +# Source: dataplane/templates/webhook/mutatingwebhookconfiguration.yaml +apiVersion: admissionregistration.k8s.io/v1 +kind: MutatingWebhookConfiguration +metadata: + name: union-pod-webhook-union labels: - app.kubernetes.io/name: flytepropeller + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm +webhooks: + - name: union-pod-webhook.flyte.org + admissionReviewVersions: + - v1 + - v1beta1 + clientConfig: + caBundle: 
LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUROekNDQWgrZ0F3SUJBZ0lVSm9kQ09lem4vd1BHQTZjYkw1U3duSlpYT2hJd0RRWUpLb1pJaHZjTkFRRUwKQlFBd0t6RXBNQ2NHQTFVRUF3d2dabXg1ZEdWd2NtOXdaV3hzWlhJdGQyVmlhRzl2YXk1MWJtbHZiaTV6ZG1NdwpIaGNOTWpZd01USXpNREl6TmpVMVdoY05Nell3TVRJeE1ESXpOalUxV2pBck1Ta3dKd1lEVlFRRERDQm1iSGwwClpYQnliM0JsYkd4bGNpMTNaV0pvYjI5ckxuVnVhVzl1TG5OMll6Q0NBU0l3RFFZSktvWklodmNOQVFFQkJRQUQKZ2dFUEFEQ0NBUW9DZ2dFQkFJcjBSWDRGRjFNVTNYdFBmUHErMDUrMGI1Qm0xK3hsTVR5ZUp4bG5mWFJuRlgyegpPcHV6NmsreUpBQk4xRkxmNDVYRnlJclFlaW5DWWJtckZXalFRc3ZDWEloNkpPNGMwdmRMVVU5RERYcldWRFlsCkRNZytTUDlJVnZBMm5USkVFZ2RFREVsKzNWeThUbUJoNlNmcGthcXdHajY5SmhIdy9OeE9yZzM2bUY3TU5VZ1MKWXJVUkxqUFFzTUN6NElTVzhhQnEzS2IwRUFWUTA4V1lZQ0ZEbjY1aHBzVFdxR05rL3BKRnVTYjlsZll6RVdyYwpJeW5wNDhqTnIvNUdKS3NnWG9LeTUwZVo4UkppeTVyOUhENGtnSW15TVNQR2RDMXFZSVppRHRlZW8yOElxNk5nCmd1dWcyTGhlNndYYW9YZXFCQ2d3SHpwWFVXM1hJVzI5eFoxUkFRRUNBd0VBQWFOVE1GRXdIUVlEVlIwT0JCWUUKRktITDQ1RWhQaVg5bnlRN241QUk0YUZVNFlCL01COEdBMVVkSXdRWU1CYUFGS0hMNDVFaFBpWDlueVE3bjVBSQo0YUZVNFlCL01BOEdBMVVkRXdFQi93UUZNQU1CQWY4d0RRWUpLb1pJaHZjTkFRRUxCUUFEZ2dFQkFEeTRac21XCkNjZ3B5TmI4cU1ydzNYclBNejZLMTRlTVhqQVk1bERFYXM1NklLZExoZFM1bi9sWk95bENuUUtOZ3RHS0VrY3UKT05tMEE2Z0xNT3UzcE43dG9aNXh0WktOcnUvS0x6dm44QUNNNndCbUR0bG5oRERDa3ZUTHBVRzh6UTZSNmlaegpzTWlhVmpLSUJiWkJsZXNYSHZZQnQ2YlRTN2o1QWM2aTdYc1RPK3ZqQ2RUQjVYdCtEZ0x2QnBJRWh2ZGJzNXhmClFtc3ZzQmwxenJnR3JXemo4d0txcXNUMzZaOVU1TzdSK1RJMlQrUDVOR25SeHNQOGJ0R0RtMGZ5SW55WlJrSkcKVTdpVFJyMlZRWlRyaTNZQ2dlVjN1UjFtMnJjc2hyU1Avb082T0QyTXR5Q1FMRWtFK1pxVUVJVnpSYmVaNXl2RQptM29YMFV2OUtBUy9pWTA9Ci0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K + service: + name: union-pod-webhook + namespace: union + path: /mutate--v1-pod/secrets + port: 443 + failurePolicy: Fail + matchPolicy: Equivalent + namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: union + objectSelector: + matchLabels: + inject-flyte-secrets: "true" + organization: 'union' + reinvocationPolicy: Never + rules: + - apiGroups: + - '*' + apiVersions: + - v1 + operations: + - CREATE + resources: + 
- pods + scope: '*' + sideEffects: NoneOnDryRun + timeoutSeconds: 30 +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Imported via https://github.com/knative/operator/releases/download/knative-v1.16.0/operator.yaml +# Copyright 2024 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +# Source: dataplane/templates/propeller/serviceaccount.yaml +--- +--- +# Source: dataplane/templates/serving/knative-serving.yaml +apiVersion: operator.knative.dev/v1beta1 +kind: KnativeServing +metadata: + name: union-operator-serving + namespace: union + labels: + helm.sh/chart: dataplane-2026.4.4 + app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: serving spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/name: flytepropeller + config: + deployment: + progress-deadline: "30m" + queue-sidecar-cpu-request: "25m" + queue-sidecar-cpu-limit: "1000m" + queue-sidecar-memory-request: "400Mi" + queue-sidecar-memory-limit: "800Mi" + queue-sidecar-ephemeral-storage-request: "512Mi" + queue-sidecar-ephemeral-storage-limit: "1024Mi" + registries-skipping-tag-resolving: managed.cr.union.ai + features: + kubernetes.podspec-affinity: "enabled" + kubernetes.podspec-nodeselector: "enabled" + kubernetes.podspec-tolerations: "enabled" + kubernetes.podspec-fieldref: "enabled" + 
kubernetes.podspec-dnspolicy: "enabled" + kubernetes.podspec-schedulername: "enabled" + kubernetes.podspec-securitycontext: "enabled" + network: + ingress-class: "kourier.ingress.networking.knative.dev" + high-availability: + replicas: 2 + ingress: + kourier: + enabled: true + bootstrap-configmap: "union-operator-serving-envoy-bootstrap" + service-type: ClusterIP + podDisruptionBudgets: + - name: 3scale-kourier-gateway-pdb + minAvailable: 50% + - name: activator-pdb + minAvailable: 50% + - name: webhook-pdb + minAvailable: 50% + registry: + override: + # TODO(jeev): Wire up Union fork of Envoy + 3scale-kourier-gateway/kourier-gateway: ghcr.io/unionai/envoy:456fed84d4ad9a9dfb186d117d9362e9dc0f7c1f + # TODO(jeev): Wire up Union fork of Kourier + net-kourier-controller/controller: ghcr.io/unionai/kourier@sha256:5804c348d15b3959604e3e3ceed216c3a1c7b32cbe254c7d3eb02a35e62ba9c4 + workloads: + - name: 3scale-kourier-gateway + labels: + helm.sh/chart: dataplane-2026.4.4 + app.kubernetes.io/name: release-name-dataplane app.kubernetes.io/instance: release-name - template: - metadata: - annotations: - configChecksum: "24e0f0c6dd0bf39ba1103d08dedaa1047108f9263429148d8f22e6025237ac3" - prometheus.io/scrape: "true" - labels: - platform.union.ai/zone: "dataplane" - - - azure.workload.identity/use: "true" - custom-label: custom-value - app.kubernetes.io/name: flytepropeller - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm - spec: - priorityClassName: system-cluster-critical - containers: - - command: - - flytepropeller - - --config - - /etc/flyte/config/*.yaml - - --propeller.cluster-id - - union-test - env: - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: GOMEMLIMIT - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.memory - - name: GOMAXPROCS - valueFrom: - resourceFieldRef: - 
divisor: 1 - resource: limits.cpu - - name: CLUSTER_NAME - valueFrom: - secretKeyRef: - name: operator-cluster-name - key: cluster_name - - name: DEPLOYMENT_NAME - value: operator - - name: PROXY_SERVICE_URL - value: http://union-operator-proxy:8080 - - name: PROMETHEUS_SERVICE_URL - value: http://union-operator-prometheus:80 - - name: KNATIVE_PROXY_SERVICE_URL - value: http://kourier-internal - image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" - imagePullPolicy: "IfNotPresent" - name: flytepropeller - ports: - - containerPort: 10254 - resources: - limits: - cpu: "3" - memory: 3Gi - requests: - cpu: "1" - memory: 1Gi - volumeMounts: - - name: config-volume - mountPath: /etc/flyte/config - - name: auth - mountPath: /etc/union/secret - serviceAccountName: flytepropeller-system - volumes: - - configMap: - name: flyte-propeller-config - name: config-volume - - name: auth - secret: - secretName: union-secret-auth + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: 3scale-kourier-gateway + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchLabels: + app.kubernetes.io/name: 3scale-kourier-gateway + app.kubernetes.io/instance: release-name + topologyKey: topology.kubernetes.io/zone + annotations: + checksum/bootstrap-config: 6469f6f592eebc9e0c676d8ccc359a05bc799c0988e474b2da942c4cc328f656 + env: + - container: kourier-gateway + envVars: + - name: UNION_AUTHZ_TENANTAUTHURL + value: "https://union.test.union.ai/me" + - name: UNION_AUTHZ_TENANTAUTHSIGNINURL + value: "https://union.test.union.ai/login" + - name: UNION_AUTHZ_TENANTCONTROLPLANEURL + value: "https://union.test.union.ai" + resources: + - container: kourier-gateway + limits: + cpu: "2" + memory: 2Gi + requests: + cpu: "1" + memory: 1Gi + - name: activator + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + 
podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - activator + topologyKey: topology.kubernetes.io/zone + - name: autoscaler + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - autoscaler + topologyKey: topology.kubernetes.io/zone + - name: autoscaler-hpa + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - autoscaler-hpa + topologyKey: topology.kubernetes.io/zone + - name: controller + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - controller + topologyKey: topology.kubernetes.io/zone + - name: net-kourier-controller + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - net-kourier-controller + topologyKey: topology.kubernetes.io/zone + env: + - container: controller + envVars: + - name: KOURIER_UNION_AUTHZ_ENABLED + value: "true" + resources: + - container: controller + limits: + cpu: "1" + memory: 1Gi + requests: + cpu: 500m + memory: 500Mi + - name: webhook + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - webhook + topologyKey: topology.kubernetes.io/zone +--- +# Source: dataplane/templates/common/task-podtemplate.yaml +apiVersion: v1 +kind: PodTemplate +metadata: + name: task-template + namespace: union +template: + spec: + serviceAccountName: union + containers: + - name: default + image: 
docker.io/rwgrim/docker-noop --- # Source: dataplane/charts/fluentbit/templates/tests/test-connection.yaml apiVersion: v1 diff --git a/tests/generated/dataplane.additional-templates.yaml b/tests/generated/dataplane.additional-templates.yaml index bddd10b4..3d54d186 100644 --- a/tests/generated/dataplane.additional-templates.yaml +++ b/tests/generated/dataplane.additional-templates.yaml @@ -1,155 +1,139 @@ --- -# Source: dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: flytesnacks-development ---- -# Source: dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: flytesnacks-staging ---- -# Source: dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: flytesnacks-production ---- -# Source: dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: union-health-monitoring-development ---- -# Source: dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: union-health-monitoring-staging ---- -# Source: dataplane/templates/common/namespaces.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ apiVersion: v1 -kind: Namespace +kind: ServiceAccount metadata: - name: union-health-monitoring-production + name: knative-operator + namespace: "union" + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator --- -# Source: dataplane/charts/fluentbit/templates/serviceaccount.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + apiVersion: v1 kind: ServiceAccount metadata: - name: fluentbit-system - namespace: union + name: operator-webhook + namespace: "union" labels: - helm.sh/chart: fluentbit-0.48.9 - app.kubernetes.io/name: fluentbit - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "3.2.8" - app.kubernetes.io/managed-by: Helm + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator --- -# Source: dataplane/charts/kube-state-metrics/templates/serviceaccount.yaml +# Source: dataplane/charts/prometheus/charts/kube-state-metrics/templates/serviceaccount.yaml apiVersion: v1 kind: ServiceAccount automountServiceAccountToken: true metadata: labels: - helm.sh/chart: kube-state-metrics-5.30.1 + helm.sh/chart: kube-state-metrics-5.25.1 app.kubernetes.io/managed-by: Helm app.kubernetes.io/component: metrics app.kubernetes.io/part-of: kube-state-metrics app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" + 
app.kubernetes.io/version: "2.13.0" name: release-name-kube-state-metrics namespace: union --- -# Source: dataplane/charts/opencost/templates/serviceaccount.yaml +# Source: dataplane/charts/prometheus/templates/serviceaccount.yaml apiVersion: v1 kind: ServiceAccount metadata: - name: release-name-opencost - namespace: union labels: - helm.sh/chart: opencost-1.42.0 - app.kubernetes.io/name: opencost + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "1.111.0" - app.kubernetes.io/part-of: opencost + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 app.kubernetes.io/managed-by: Helm -automountServiceAccountToken: true ---- -# Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: union-clustersync-system + app.kubernetes.io/part-of: prometheus + name: union-operator-prometheus namespace: union + annotations: + {} --- -# Source: dataplane/templates/imagebuilder/serviceaccount.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: union-imagebuilder ---- -# Source: dataplane/templates/nodeexecutor/serviceaccount.yaml +# Source: dataplane/templates/common/system-serviceaccount.yaml apiVersion: v1 kind: ServiceAccount metadata: - name: executor + name: union-system namespace: union - labels: - app: executor ---- -# Source: dataplane/templates/operator/serviceaccount-proxy.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: proxy-system - labels: - app.kubernetes.io/name: operator-proxy - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm + annotations: --- -# Source: dataplane/templates/operator/serviceaccount.yaml +# Source: dataplane/templates/common/union-serviceaccount.yaml apiVersion: v1 kind: ServiceAccount metadata: - name: operator-system - labels: - app.kubernetes.io/name: union-operator - 
app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm + name: union + namespace: union + annotations: + eks.amazonaws.com/role-arn: arn:aws:iam::ACCOUNT_ID:role/flyte_project_role +automountServiceAccountToken: true --- -# Source: dataplane/templates/prometheus/serviceaccount.yaml +# Source: dataplane/templates/flyteconnector/serviceaccount.yaml apiVersion: v1 kind: ServiceAccount metadata: - name: union-operator-prometheus + name: flyteconnector namespace: union - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane + labels: + app.kubernetes.io/name: flyteconnector app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus ---- -# Source: dataplane/templates/propeller/serviceaccount-webhook.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: flytepropeller-webhook-system - namespace: union --- -# Source: dataplane/templates/propeller/serviceaccount.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ apiVersion: v1 -kind: ServiceAccount +kind: Secret metadata: - name: flytepropeller-system - namespace: union + name: operator-webhook-certs + namespace: "union" + labels: + app.kubernetes.io/component: operator-webhook + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +# The data is populated at install time. --- # Source: dataplane/templates/common/auth-secret.yaml apiVersion: v1 @@ -172,178 +156,533 @@ type: Opaque data: cluster_name: --- -# Source: dataplane/templates/propeller/deployment-webhook.yaml -# Create an empty secret that the first propeller pod will populate +# Source: dataplane/templates/webhook/mutatingwebhookconfiguration.yaml apiVersion: v1 kind: Secret metadata: - name: flyte-pod-webhook + name: union-pod-webhook namespace: union + labels: + app.kubernetes.io/name: union-pod-webhook + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm type: Opaque +data: + ca.crt: 
LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUROekNDQWgrZ0F3SUJBZ0lVSm9kQ09lem4vd1BHQTZjYkw1U3duSlpYT2hJd0RRWUpLb1pJaHZjTkFRRUwKQlFBd0t6RXBNQ2NHQTFVRUF3d2dabXg1ZEdWd2NtOXdaV3hzWlhJdGQyVmlhRzl2YXk1MWJtbHZiaTV6ZG1NdwpIaGNOTWpZd01USXpNREl6TmpVMVdoY05Nell3TVRJeE1ESXpOalUxV2pBck1Ta3dKd1lEVlFRRERDQm1iSGwwClpYQnliM0JsYkd4bGNpMTNaV0pvYjI5ckxuVnVhVzl1TG5OMll6Q0NBU0l3RFFZSktvWklodmNOQVFFQkJRQUQKZ2dFUEFEQ0NBUW9DZ2dFQkFJcjBSWDRGRjFNVTNYdFBmUHErMDUrMGI1Qm0xK3hsTVR5ZUp4bG5mWFJuRlgyegpPcHV6NmsreUpBQk4xRkxmNDVYRnlJclFlaW5DWWJtckZXalFRc3ZDWEloNkpPNGMwdmRMVVU5RERYcldWRFlsCkRNZytTUDlJVnZBMm5USkVFZ2RFREVsKzNWeThUbUJoNlNmcGthcXdHajY5SmhIdy9OeE9yZzM2bUY3TU5VZ1MKWXJVUkxqUFFzTUN6NElTVzhhQnEzS2IwRUFWUTA4V1lZQ0ZEbjY1aHBzVFdxR05rL3BKRnVTYjlsZll6RVdyYwpJeW5wNDhqTnIvNUdKS3NnWG9LeTUwZVo4UkppeTVyOUhENGtnSW15TVNQR2RDMXFZSVppRHRlZW8yOElxNk5nCmd1dWcyTGhlNndYYW9YZXFCQ2d3SHpwWFVXM1hJVzI5eFoxUkFRRUNBd0VBQWFOVE1GRXdIUVlEVlIwT0JCWUUKRktITDQ1RWhQaVg5bnlRN241QUk0YUZVNFlCL01COEdBMVVkSXdRWU1CYUFGS0hMNDVFaFBpWDlueVE3bjVBSQo0YUZVNFlCL01BOEdBMVVkRXdFQi93UUZNQU1CQWY4d0RRWUpLb1pJaHZjTkFRRUxCUUFEZ2dFQkFEeTRac21XCkNjZ3B5TmI4cU1ydzNYclBNejZLMTRlTVhqQVk1bERFYXM1NklLZExoZFM1bi9sWk95bENuUUtOZ3RHS0VrY3UKT05tMEE2Z0xNT3UzcE43dG9aNXh0WktOcnUvS0x6dm44QUNNNndCbUR0bG5oRERDa3ZUTHBVRzh6UTZSNmlaegpzTWlhVmpLSUJiWkJsZXNYSHZZQnQ2YlRTN2o1QWM2aTdYc1RPK3ZqQ2RUQjVYdCtEZ0x2QnBJRWh2ZGJzNXhmClFtc3ZzQmwxenJnR3JXemo4d0txcXNUMzZaOVU1TzdSK1RJMlQrUDVOR25SeHNQOGJ0R0RtMGZ5SW55WlJrSkcKVTdpVFJyMlZRWlRyaTNZQ2dlVjN1UjFtMnJjc2hyU1Avb082T0QyTXR5Q1FMRWtFK1pxVUVJVnpSYmVaNXl2RQptM29YMFV2OUtBUy9pWTA9Ci0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K + tls.crt: 
LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUQxakNDQXI2Z0F3SUJBZ0lVREtRN3A4MFpIdkQyUE5Ya3p3VGRLem9iUjg4d0RRWUpLb1pJaHZjTkFRRUwKQlFBd0t6RXBNQ2NHQTFVRUF3d2dabXg1ZEdWd2NtOXdaV3hzWlhJdGQyVmlhRzl2YXk1MWJtbHZiaTV6ZG1NdwpIaGNOTWpZd01USXpNREl6TmpVMVdoY05NamN3TVRJek1ESXpOalUxV2pBck1Ta3dKd1lEVlFRRERDQm1iSGwwClpYQnliM0JsYkd4bGNpMTNaV0pvYjI5ckxuVnVhVzl1TG5OMll6Q0NBU0l3RFFZSktvWklodmNOQVFFQkJRQUQKZ2dFUEFEQ0NBUW9DZ2dFQkFMNnMxUTRaZ3ZxVVdNdFp5VC9EKytEQWl6WmR0Q1lkVzArSGk4S3phQ1Vjbi9zZwo3a21NTEhDM2ZlZWtZK2ZUNENHUTd6K1l2WjRWWSt6WGljKzdrNndLbUZqdm1acEM1Rm96OWo1Q21NRk5HZmlWCnRobzVvSU9MUkIzNXRYMU1ROHUwNFVzT29QcjdMK016eEdxWXVmTFdtN1NVdjh0eEp3ZFMxZVlzMllNV1pMeU4KekI4akdWY01Qc2ZNVkdxdFhTTVljQ3hNYURKaTM1N0VEZSs3clM2U2ZtbnFtYjJxekdQM1BRRStCUm91Yk5nLwo4NCt4c253VXBLZUFpem95dFM2c2ZlOTdMUEtRTW9oVzd2TjZORFJrLy9xSmpHem9iZmZwUHU3endhWHVXMWtMCmllc2g2cVRrdlZLSTF3WFJ2WHArVU5GbjZzdDIwdEVlRks0ZzMwOENBd0VBQWFPQjhUQ0I3akFKQmdOVkhSTUUKQWpBQU1Bc0dBMVVkRHdRRUF3SUY0RENCa3dZRFZSMFJCSUdMTUlHSWdoWm1iSGwwWlhCeWIzQmxiR3hsY2kxMwpaV0pvYjI5cmdoeG1iSGwwWlhCeWIzQmxiR3hsY2kxM1pXSm9iMjlyTG5WdWFXOXVnaUJtYkhsMFpYQnliM0JsCmJHeGxjaTEzWldKb2IyOXJMblZ1YVc5dUxuTjJZNEl1Wm14NWRHVndjbTl3Wld4c1pYSXRkMlZpYUc5dmF5NTEKYm1sdmJpNXpkbU11WTJ4MWMzUmxjaTVzYjJOaGJEQWRCZ05WSFE0RUZnUVV1d25wcDZ1MTNKL0NsNnI3eDZ1Sgo5YmZ1RDhzd0h3WURWUjBqQkJnd0ZvQVVvY3Zqa1NFK0pmMmZKRHVma0FqaG9WVGhnSDh3RFFZSktvWklodmNOCkFRRUxCUUFEZ2dFQkFFMmlOa1MzczhGT3Z0cDhrVmJNZUN2amxIQytvYTVlZTFGNmowNlZ3bFJPOWdxOXJla1UKMGptWUhNWGNjWjcrZXJkNFRQRXgwK3ZqMzNhRTV3cDFmdVJJL2xIZHhuOUpJRHg2bTFQTHdVa3p1WG9EekYxLwozVXVIWU1xOGkrT2xsY0tqbmdVMUFUNmRWRCtIbFVWeHpMdjlNazdPb05SQUY5dTJHa1hiUHdCN0I1ZGRxdWxlCmtNcXBxbDhES2s4SVNVelVsTjNBb2Q5NmJUelIyV3RCVkxpTi9zRHZtTDZwZThFYm55RnBUb1BZTDlGeWZOcmIKT1ZPZEJwUkZFTG1KOHZaZWY5d1lxRmVXMTE3SGV5VDArZkxGZVNSemZUNVVWUlVraVFBcUVJdUhHUVRrOVFabgo1WnYvNUF0eEg3WE1rcXR0VE1zMFlSUG0xYjQzY3FBeFdhND0KLS0tLS1FTkQgQ0VSVElGSUNBVEUtLS0tLQo= + tls.key: 
LS0tLS1CRUdJTiBQUklWQVRFIEtFWS0tLS0tCk1JSUV2UUlCQURBTkJna3Foa2lHOXcwQkFRRUZBQVNDQktjd2dnU2pBZ0VBQW9JQkFRQytyTlVPR1lMNmxGakwKV2NrL3cvdmd3SXMyWGJRbUhWdFBoNHZDczJnbEhKLzdJTzVKakN4d3QzM25wR1BuMCtBaGtPOC9tTDJlRldQcwoxNG5QdTVPc0NwaFk3NW1hUXVSYU0vWStRcGpCVFJuNGxiWWFPYUNEaTBRZCtiVjlURVBMdE9GTERxRDYreS9qCk04UnFtTG55MXB1MGxML0xjU2NIVXRYbUxObURGbVM4amN3Zkl4bFhERDdIekZScXJWMGpHSEFzVEdneVl0K2UKeEEzdnU2MHVrbjVwNnBtOXFzeGo5ejBCUGdVYUxtellQL09Qc2JKOEZLU25nSXM2TXJVdXJIM3ZleXp5a0RLSQpWdTd6ZWpRMFpQLzZpWXhzNkczMzZUN3U4OEdsN2x0WkM0bnJJZXFrNUwxU2lOY0YwYjE2ZmxEUlorckxkdExSCkhoU3VJTjlQQWdNQkFBRUNnZ0VBRjV2SnMzbTRMSE9DdlFic2NwOC9DQmgvQkNlOC92MGlpYW5VUmJLMkFlWlYKN3Arb3NXV1FPUktYSGIvT2VPMVVjb09DQkFOUzh3aGQrM3pDZlB5U0w3cU9HM0RyT1Z6djdqVkIxM3FpZEVpcQpId3ZXWk0vZXpuckhYOWpEdm5SYmJwVUNVaXRKQmxwa2x3S1pYc056UHB5UTRkNkxFdEw3VEo1V2lxM2g3cWQzCjhncmJoeWlWNDhZRFAyYkhpZGt0S2QvMHBNL3piSzZPTS9iMFpLUmRBWVpHQXlOcklMaFJ3eDNsQXE2SDVqL20KUHBuNlZEdVQ3Y1h6UFhmNTgxR0IyNjQxWFJVTXRNWGVEREtaU3JudnIrbG5tV2lsYXZmNkg3ZHp4NlRBMmk0RApybCtIRVdUN1RSQUZGMnZ5SVUrYSt5RzNPVFBKRCtmVTdCWWh4Zld4aFFLQmdRRGVvcXVCbzQxK1dKU255blZnCmNWMHZoUEkwZ0dyRGQzNFdIbCtGNTZuSmxuMVVlbUQ2SUQ0TUhnNVpHWmNmd3NkRlB0aHl1VnB0OVl1SW11SGQKRHE5ZDhpWnpHUzJVbStkSWlsWjhnZXJ5Q3dON3hIbFM3SzJrc0RmRGJwYmZZSXZ1LzRnSExZRHh3YzQ0OG54ZwpWNGpCUFY2QTdYTHRqQzZzdFE2OW9uTlIwd0tCZ1FEYlFBUTRGdWpnUjJUOXh2Y01MdXhMbm1YaHE4U2NoZlM5CkRoWndDUzZIeHB1YlBxblBjQ04vQUZjbDZ1cDRXVEVZdVY4bEpwcGwwRmpYQ2lMdkhuUU15cm5vRjlUOThpanQKZGVHYjF6ZWlwRUpndFpNVHkvOGdERWorZU1wSFBuYTZwQVpMOVpRVUdLR2dDNGQ0SzljdVNHMmRlb1dSU0ViSQpMVGgzeGpXVEZRS0JnQStJRWFidG5nVmVjS0IwQTFSREZGa29VUzFRZUNKQ3g4MExPV2JDRHBvOW9XaXZVT3lpCkt6SDFOdE1JY2Y2SlBCV2NtTVVJSVVMaWltVnhTS2gvU2NTb0MvNmpsd1p6Q2VPSm94YjBpVXR4Y1VERktDR2MKMlZCUDZ0UDdkeE1HVFR6VEhzNUJZbWw3TjhQSlJ0d0J2MHliMTJmdktNRmhzaS9pUWJFQkVFSjVBb0dBUm9NSApHRmJkM0V0NXdsZzcyYkk1a25SRnhkY3RLejIxb1J1bndhNWlSWTV0T3Zkak8zQ1FLZWNkSC9lMklyQmtwdFB2Ck1vNkF0MS9UUW8xakFNNGxlbnUwWUYxUnhiNGN4WW5VM2Y3UVNNRlZDNjg1dHZNemdNWVNyNngvT1h4d1NNUTUKdGpJcnhtN0poQ1JSRkNmZFUyZzl3SmpIM2hxRmtSbGlBTHRCUGFVQ2dZ
RUF2QkRHejQvYS9kRzRJT0VpREk4SgpBbkxaRWZaTFJrdnI0ejRyT0p0TGZtWlFjTW5DSUZmKzBaelhTME9XOWhvMjk5RUNtS1l4U2VUWk1WSDgzOFZCCnRvSkNGM3lzb01rSC9meEVXTUpicDlMblJwSFUzUElicHlhRG1nWW1YdmsvK3laL3Y1c0dDTWNtVGkzWVFLRGsKcStlK1o0SHZyaGUxVytUaHpwOCtoNGs9Ci0tLS0tRU5EIFBSSVZBVEUgS0VZLS0tLS0K --- -# Source: dataplane/templates/clusterresourcesync/configmap.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2019 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ apiVersion: v1 kind: ConfigMap metadata: - name: union-clusterresourcesync-config - namespace: union + name: config-logging + namespace: "union" labels: - app.kubernetes.io/name: clusterresourcesync - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm -data: - cluster_resources.yaml: | - cluster_resources: - clusterName: '' - customData: - - production: - - projectQuotaCpu: - value: "4096" - - projectQuotaMemory: - value: 2Ti - - projectQuotaNvidiaGpu: - value: "256" - - defaultUserRoleKey: - value: 'eks.amazonaws.com/role-arn' - - defaultUserRoleValue: - value: 'arn:aws:iam::ACCOUNT_ID:role/flyte_project_role' - - staging: - - projectQuotaCpu: - value: "4096" - - projectQuotaMemory: - value: 2Ti - - projectQuotaNvidiaGpu: - value: "256" - - defaultUserRoleKey: - value: 'eks.amazonaws.com/role-arn' - - defaultUserRoleValue: - value: 'arn:aws:iam::ACCOUNT_ID:role/flyte_project_role' - - development: - - projectQuotaCpu: - value: "4096" - - projectQuotaMemory: - value: 2Ti - - projectQuotaNvidiaGpu: - value: "256" - - defaultUserRoleKey: - value: 'eks.amazonaws.com/role-arn' - - defaultUserRoleValue: - value: 'arn:aws:iam::ACCOUNT_ID:role/flyte_project_role' - refreshInterval: 5m - standaloneDeployment: true - templatePath: /etc/flyte/clusterresource/templates - clusterResourcesPrivate: - app: - isServerless: false - union: - auth: - authorizationMetadataKey: flyte-authorization - clientId: 'test-client-id' - clientSecretLocation: /etc/union/secret/client_secret - tokenRefreshWindow: 5m - type: ClientSecret - connection: - host: dns:///test.example.com - admin.yaml: | - admin: - clientId: 'test-client-id' - clientSecretLocation: /etc/union/secret/client_secret - endpoint: dns:///test.example.com - insecure: false - event: - capacity: 1000 - rate: 500 - type: admin - domain.yaml: | - domains: - - id: development - name: development - - id: staging - name: staging - - id: production - name: 
production - clusters.yaml: | - clusters: - clusterConfigs: [] - labelClusterMap: {} - logger.yaml: | - logger: - formatter: - type: json - level: 4 - show-source: true - namespace_mapping.yaml: | - namespace_mapping: - template: '{{ project }}-{{ domain }}' + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +data: {} --- -# Source: dataplane/templates/clusterresourcesync/configmap.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2019 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ apiVersion: v1 kind: ConfigMap metadata: - name: union-clusterresource-template - namespace: union + name: config-observability + namespace: "union" labels: - app.kubernetes.io/name: clusterresourcesync + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +data: {} +--- +# Source: dataplane/charts/prometheus/templates/cm.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + labels: + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: prometheus + name: union-operator-prometheus + namespace: union data: - a_namespace.yaml: | - apiVersion: v1 - kind: Namespace - metadata: - name: {{ namespace }} - labels: - union.ai/namespace-type: flyte - spec: - finalizers: - - kubernetes - - b_default_service_account.yaml: | - apiVersion: v1 - kind: ServiceAccount - metadata: - name: default - namespace: {{ namespace }} - annotations: - {{ defaultUserRoleKey }}: {{ defaultUserRoleValue }} - - c_project_resource_quota.yaml: | - apiVersion: v1 - kind: ResourceQuota - metadata: - name: project-quota - namespace: {{ namespace }} - spec: - hard: - limits.cpu: {{ projectQuotaCpu }} - limits.memory: {{ projectQuotaMemory }} - requests.nvidia.com/gpu: {{ projectQuotaNvidiaGpu }} - - d_custom_networkpolicy.yaml: | - apiVersion: networking.k8s.io/v1 - kind: NetworkPolicy - metadata: - name: default-deny - namespace: {{ namespace }} - spec: - podSelector: {} - policyTypes: - - Ingress + allow-snippet-annotations: "false" + alerting_rules.yml: | + {} + alerts: | + {} + prometheus.yml: | + global: + evaluation_interval: 1m + scrape_interval: 1m + scrape_timeout: 10s + rule_files: + - /etc/config/recording_rules.yml + - /etc/config/alerting_rules.yml + - /etc/config/rules + - /etc/config/alerts + scrape_configs: + - 
job_name: prometheus + static_configs: + - targets: + - localhost:9090 + - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + job_name: kubernetes-apiservers + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - action: keep + regex: default;kubernetes;https + source_labels: + - __meta_kubernetes_namespace + - __meta_kubernetes_service_name + - __meta_kubernetes_endpoint_port_name + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + job_name: kubernetes-nodes + kubernetes_sd_configs: + - role: node + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - replacement: kubernetes.default.svc:443 + target_label: __address__ + - regex: (.+) + replacement: /api/v1/nodes/$1/proxy/metrics + source_labels: + - __meta_kubernetes_node_name + target_label: __metrics_path__ + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + job_name: kubernetes-nodes-cadvisor + kubernetes_sd_configs: + - role: node + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - replacement: kubernetes.default.svc:443 + target_label: __address__ + - regex: (.+) + replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor + source_labels: + - __meta_kubernetes_node_name + target_label: __metrics_path__ + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + - honor_labels: true + job_name: kubernetes-service-endpoints + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scrape + - action: drop + regex: true + source_labels: + - 
__meta_kubernetes_service_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (.+?)(?::\d+)?;(\d+) + replacement: $1:$2 + source_labels: + - __address__ + - __meta_kubernetes_service_annotation_prometheus_io_port + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_service_name + target_label: service + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: node + - honor_labels: true + job_name: kubernetes-service-endpoints-slow + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) 
+ source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (.+?)(?::\d+)?;(\d+) + replacement: $1:$2 + source_labels: + - __address__ + - __meta_kubernetes_service_annotation_prometheus_io_port + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_service_name + target_label: service + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: node + scrape_interval: 5m + scrape_timeout: 30s + - honor_labels: true + job_name: prometheus-pushgateway + kubernetes_sd_configs: + - role: service + relabel_configs: + - action: keep + regex: pushgateway + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_probe + - honor_labels: true + job_name: kubernetes-services + kubernetes_sd_configs: + - role: service + metrics_path: /probe + params: + module: + - http_2xx + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_probe + - source_labels: + - __address__ + target_label: __param_target + - replacement: blackbox + target_label: __address__ + - source_labels: + - __param_target + target_label: instance + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - source_labels: + - __meta_kubernetes_service_name + target_label: service + - honor_labels: true + job_name: kubernetes-pods + kubernetes_sd_configs: + - role: pod + relabel_configs: + - action: 
keep + regex: true + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scrape + - action: drop + regex: true + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) + replacement: '[$2]:$1' + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: replace + regex: (\d+);((([0-9]+?)(\.|$)){4}) + replacement: $2:$1 + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_pod_name + target_label: pod + - action: drop + regex: Pending|Succeeded|Failed|Completed + source_labels: + - __meta_kubernetes_pod_phase + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: node + - honor_labels: true + job_name: kubernetes-pods-slow + kubernetes_sd_configs: + - role: pod + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) 
+ source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) + replacement: '[$2]:$1' + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: replace + regex: (\d+);((([0-9]+?)(\.|$)){4}) + replacement: $2:$1 + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_pod_name + target_label: pod + - action: drop + regex: Pending|Succeeded|Failed|Completed + source_labels: + - __meta_kubernetes_pod_phase + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: node + scrape_interval: 5m + scrape_timeout: 30s + - job_name: 'kube-state-metrics' + metrics_path: /metrics + scrape_interval: 1m + honor_labels: true + kubernetes_sd_configs: + - role: endpoints + namespaces: + own_namespace: true + relabel_configs: + - source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_name] + regex: kube-state-metrics + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + regex: http + action: keep + metric_relabel_configs: + - source_labels: [__name__] + regex: 
"kube_pod_container_resource_(limits|requests)|kube_pod_status_phase|kube_node_(labels|status_allocatable|status_condition|status_capacity)|kube_namespace_labels|kube_pod_container_status_(waiting|terminated|last_terminated).*_reason|kube_daemonset_status_number_unavailable|kube_deployment_status_replicas_unavailable|kube_resourcequota|kube_pod_info|kube_node_info|kube_pod_container_status_restarts_total" + action: keep + - source_labels: [__name__, phase] + separator: ";" + regex: "kube_pod_status_phase;(Succeeded|Failed)" + action: drop + - source_labels: [node] + target_label: nodename + regex: "(.*)" + action: replace + - source_labels: [label_node_group_name] + action: replace + regex: "(.+)" + target_label: label_node_pool_name + - job_name: 'opencost' + metrics_path: /metrics + scrape_interval: 1m + honor_labels: true + kubernetes_sd_configs: + - role: endpoints + namespaces: + own_namespace: true + relabel_configs: + - source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_name] + regex: opencost + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + regex: http + action: keep + metric_relabel_configs: + - source_labels: [__name__] + regex: "kube_node_labels|kube_pod_labels|node_total_hourly_cost|node_ram_hourly_cost|node_cpu_hourly_cost|node_gpu_hourly_cost" + action: keep + - job_name: 'union-services' + metrics_path: /metrics + scrape_interval: 1m + honor_labels: true + kubernetes_sd_configs: + - role: endpoints + namespaces: + own_namespace: true + relabel_configs: + - source_labels: [__meta_kubernetes_service_label_platform_union_ai_service_group] + regex: .+ + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + regex: debug + action: keep + # Flyte propeller metrics for execution info and fast task duration. + # No-op when propeller is not deployed (no pods match the label selector). 
+ - job_name: 'flytepropeller' + kubernetes_sd_configs: + - role: pod + namespaces: + own_namespace: true + selectors: + - role: pod + label: app.kubernetes.io/name=flytepropeller + metric_relabel_configs: + - source_labels: [__name__] + regex: "flyte:propeller:all:round:execution_info|flyte:propeller:all:node:fast_task:fast_task_execution_duration" + action: keep + relabel_configs: + - source_labels: [__meta_kubernetes_pod_container_name] + regex: flytepropeller + action: keep + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: pod + # Kourier gateway (envoy) metrics for app serving. + # No-op when serving is not enabled (no pods match the label selector). + - job_name: 'serving-envoy' + kubernetes_sd_configs: + - role: pod + namespaces: + own_namespace: true + selectors: + - role: pod + label: app=3scale-kourier-gateway + metrics_path: /stats/prometheus + metric_relabel_configs: + - source_labels: [__name__] + regex: "envoy_cluster_upstream_rq_xx|envoy_cluster_upstream_rq_time_bucket" + action: keep + relabel_configs: + - source_labels: [__meta_kubernetes_pod_container_port_name] + regex: metrics + action: keep + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: pod + # DCGM GPU metrics. + # No-op when dcgm-exporter is not deployed (no pods match the label selector). 
+ - job_name: 'gpu-metrics' + kubernetes_sd_configs: + - role: pod + namespaces: + names: + - kube-system + selectors: + - role: pod + label: app.kubernetes.io/name=dcgm-exporter - e_custom_limitrange.yaml: | - apiVersion: v1 - kind: LimitRange - metadata: - name: default-limits - namespace: {{ namespace }} - spec: - limits: - - default: - cpu: "2" - memory: "1Gi" - type: Container + recording_rules.yml: | + {} + rules: | + {} --- # Source: dataplane/templates/fluent-bit/configmap.yaml apiVersion: v1 @@ -391,6 +730,40 @@ data: region us-east-1 bucket --- +# Source: dataplane/templates/imagebuilder/build-image-configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: build-image-config + labels: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +data: + storage.yaml: | + storage: + container: "" + type: stow + stow: + kind: s3 + config: + auth_type: accesskey + access_key_id: + secret_key: + disable_ssl: false + endpoint: + region: us-east-1 + enable-multicontainer: false + limits: + maxDownloadMBs: 1024 + cache: + max_size_mbs: 0 + target_gc_percent: 70 + buildkit-uri: "tcp://union-operator-buildkit.union.svc.cluster.local:1234" + default-repository: "union-dataplane" + authentication-type: "noop" +--- # Source: dataplane/templates/imagebuilder/configmap.yaml apiVersion: v1 kind: ConfigMap @@ -2547,6 +2920,18 @@ data: "version": 1 } --- +# Source: dataplane/templates/monitoring/prometheusrule.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: union-recording-rules + namespace: union + labels: + release: release-name +data: + recording_rules.yml: | + groups: [] +--- # Source: dataplane/templates/nodeexecutor/configmap.yaml apiVersion: v1 kind: ConfigMap @@ -2601,8 +2986,13 @@ data: templateUris: - https://scale.neptune.ai/{{ .taskConfig.project }}/-/run/?customId={{ .taskConfig.id }} - kubernetes-enabled: true + kubernetes-enabled: 
false enabled_plugins.yaml: | + plugins: + connector-service: + defaultConnector: + defaultServiceConfig: '{"loadBalancingConfig": [{"round_robin":{}}]}' + endpoint: k8s:///flyteconnector.union:8000 tasks: task-plugins: default-for-task-types: @@ -2635,6 +3025,8 @@ data: cpu: 4096 gpu: 256 memory: 2Ti + namespace_mapping: + template: union namespace_mapping: template: '{{ project }}-{{ domain }}' union: @@ -2657,7 +3049,7 @@ data: cache-endpoint: dns:///test.example.com endpoint: dns:///test.example.com insecure: false - type: fallback + type: cacheservicev2 use-admin-auth: true logger: formatter: @@ -2677,6 +3069,7 @@ data: secure: false useAuth: false propeller: + limit-namespace: union node-config: disable-input-file-writes: true plugins: @@ -2684,7 +3077,7 @@ data: additional-worker-args: - --last-ack-grace-period-seconds - "120" - callback-uri: http://unionai-dataplane-executor.union.svc.cluster.local:15605 + callback-uri: http://union-operator-executor.union.svc.cluster.local:15605 grace-period-status-not-found: 2m ioutils: remoteFileOutputPaths: @@ -2694,6 +3087,7 @@ data: default-cpus: 100m default-env-vars: [] default-memory: 100Mi + default-pod-template-name: task-template co-pilot: image: 'cr.flyte.org/flyteorg/flytecopilot:v1.14.1' name: flyte-copilot- @@ -2734,6 +3128,7 @@ data: default-cpus: 100m default-env-vars: [] default-memory: 100Mi + default-pod-template-name: task-template config.yaml: | union: connection: @@ -2753,11 +3148,14 @@ data: operator: enabled: true enableTunnelService: true + # enableDepot: false tunnel: - enableDirectToAppIngress: false + enableDirectToAppIngress: true deploymentToRestart: union-operator-proxy + limitNamespace: union + disableClusterPermissions: true apps: - enabled: 'false' + enabled: 'true' syncClusterConfig: enabled: false clusterId: @@ -2772,14 +3170,12 @@ data: userRole: 'arn:aws:iam::ACCOUNT_ID:role/flyte_project_role' userRoleKey: 'eks.amazonaws.com/role-arn' collectUsages: - enabled: true + enabled: false 
billing: model: Legacy dependenciesHeartbeat: - prometheus: - endpoint: 'http://union-operator-prometheus:80/-/healthy' - propeller: - endpoint: 'http://flytepropeller:10254' + executor: + endpoint: 'http://union-operator-executor:10254' proxy: endpoint: 'http://union-operator-proxy:10254' secretsWatcher: @@ -2794,9 +3190,6 @@ data: referenceConfigmapName: union-operator targetConfigMapName: "build-image-config" proxy: - imageBuilderConfig: - authenticationType: 'noop' - defaultRepository: '' persistedLogs: objectStore: pathTemplate: namespace-{{.KubernetesNamespace}}.pod-{{.KubernetesPodName}}.cont-{{.KubernetesContainerName}} @@ -2850,989 +3243,149 @@ data: endpoint: region: us-east-1 image-builder.buildkit-uri: "tcp://union-operator-buildkit.union.svc.cluster.local:1234" - image-builder.default-repository: "" + image-builder.default-repository: "union-dataplane" image-builder.authentication-type: "noop" --- -# Source: dataplane/templates/prometheus/configmap.yaml +# Source: dataplane/templates/serving/bootstrap-configmap.yaml +# We need to copy the default configmap here since knative-operator does not automatically update the +# address of the `net-kourier-controller` endpoint to use the release namespace. It assumes that the +# resources are being installed into the `knative-serving` namespace instead. 
apiVersion: v1 kind: ConfigMap metadata: - name: union-operator-prometheus + name: union-operator-serving-envoy-bootstrap namespace: union - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus data: - prometheus.yml: | - global: - scrape_interval: 15s - evaluation_interval: 15s - alerting: - alertmanagers: - - static_configs: - - targets: - rule_files: - - rules.yml - scrape_configs: - # Self-monitoring - - job_name: prometheus - metrics_path: /prometheus/metrics - static_configs: - - targets: ['localhost:9090'] - metric_relabel_configs: - - source_labels: [__name__] - regex: prometheus_tsdb_head_chunks_storage_size_bytes|prometheus_tsdb_head_series|prometheus_tsdb_head_series_created_total|prometheus_tsdb_head_series_removed_total|prometheus_tsdb_head_samples_appended_total|prometheus_tsdb_storage_blocks_bytes|prometheus_tsdb_retention_limit_bytes|prometheus_tsdb_wal_(.*)|.+_(limit|failed|missed|failures)_total - action: keep - - # Kube state metrics for pod/node resource tracking and cost calculations - - job_name: kube-state-metrics - static_configs: - - targets: ['release-name-kube-state-metrics:8080'] - metric_relabel_configs: - - separator: ; - source_labels: [__name__] - regex: kube_pod_container_resource_(limits|requests)|kube_pod_status_phase|kube_node_(labels|status_allocatable|status_condition|status_capacity)|kube_namespace_labels|kube_pod_container_status_(waiting|terminated|last_terminated).*_reason|kube_daemonset_status_number_unavailable|kube_deployment_status_replicas_unavailable|kube_resourcequota|kube_pod_info|kube_node_info|kube_pod_container_status_restarts_total - action: keep - - separator: ; - source_labels: [__name__, phase] - regex: kube_pod_status_phase;(Succeeded|Failed) - action: drop - - source_labels: [node] - target_label: nodename - 
regex: '(.*)' - action: replace - - source_labels: [label_node_group_name] - action: replace - regex: (.+) - target_label: label_node_pool_name - - # cAdvisor container metrics for CPU and memory tracking - - job_name: kubernetes-cadvisor - metrics_path: /metrics - scheme: https - kubernetes_sd_configs: - - role: node - namespaces: - names: [] - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - insecure_skip_verify: false - metric_relabel_configs: - - separator: ; - source_labels: [__name__] - regex: container_cpu_usage_seconds_total|container_memory_working_set_bytes - action: keep - relabel_configs: - - separator: ; - regex: __meta_kubernetes_node_label_(.+) - replacement: $1 - action: labelmap - - separator: ; - regex: (.*) - target_label: __address__ - replacement: kubernetes.default.svc:443 - action: replace - - source_labels: [__meta_kubernetes_node_name] - separator: ; - regex: (.+) - target_label: __metrics_path__ - replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor - action: replace - - # Flyte propeller metrics for execution info and fast task duration - - job_name: flytepropeller - kubernetes_sd_configs: - - role: pod - namespaces: - names: - - union - selectors: - - role: pod - label: app.kubernetes.io/name=flytepropeller - metric_relabel_configs: - - source_labels: [__name__] - regex: "flyte:propeller:all:round:execution_info|flyte:propeller:all:node:fast_task:fast_task_execution_duration" - action: keep - relabel_configs: - - source_labels: [__meta_kubernetes_pod_container_name] - regex: flytepropeller - action: keep - - source_labels: [__meta_kubernetes_namespace] - action: replace - target_label: namespace - - source_labels: [__meta_kubernetes_pod_name] - action: replace - target_label: pod - # OpenCost metrics for cost tracking - - job_name: opencost - static_configs: - - targets: ['release-name-opencost:9003'] - metric_relabel_configs: - - 
source_labels: [__name__] - regex: "kube_node_labels|kube_pod_labels|node_total_hourly_cost|node_ram_hourly_cost|node_cpu_hourly_cost|node_gpu_hourly_cost" - action: keep - rules.yml: | - - groups: - - name: cost_calculations_15s - interval: 15s - rules: - - record: pod_gpu_allocation - expr: | - sum by (namespace, pod) (DCGM_FI_DEV_GPU_UTIL >= bool 0) * on (namespace, pod) group_left() (max by (namespace, pod) (kube_pod_status_phase{phase=~"Running|Pending"} == 1)) - - record: execution_info # A join metric to look up execution-level info. Used to disambiguate workflow/task executions, apps, and workspaces. - expr: | - max by (label_domain, label_project, label_entity_name, label_execution_id, label_entity_id)( - label_replace( - label_replace( - label_replace( - label_replace( - label_replace( - flyte:propeller:all:round:execution_info{domain!="", project!="", workflow_name!="", execution_id!=""}, # filter for workflow/task executions - "label_entity_id", "$1", "execution_id", "(.*)" # join key - ), "label_entity_name", "$1", "workflow_name", "(.*)" # set label_entity_name to the workflow/task name from the workflow_execution_id - ), - "label_execution_id", "$1", "execution_id", "(.*)" - ), - "label_project", "$1", "project", "(.*)" # project - ), - "label_domain", "$1", "domain", "(.*)" # domain - ) - ) - - record: app_info # A join metric to look up app-level info. Used to disambiguate workflow/task executions, apps, and workspaces. 
- expr: | - max by (label_domain, label_project, label_app_name, label_app_version, label_entity_id)( - label_replace( - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps - "label_app_name", "$1", "label_serving_unionai_dev_app_name", "(.*)" # rename to cleanup - ), - "label_app_version", "$1", "label_serving_knative_dev_revision", "(.*)" # the app_version is equivalent to an execution_id for workflows (lowest level of granularity) - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # join key - ) - ) - - record: workspace_info # A join metric to look up workspace info. Used to disambiguate workflow/task executions, apps, and workspaces. - expr: | - max by (label_domain, label_project, label_workspace_name, label_entity_id)( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # filter for workspaces - "label_entity_id", "$1", "label_node_id", "(.*)" # join key - ), "label_workspace_name", "$1", "label_node_id", "(.*)" # set label_workspace_name to the workspace name from the kube_pod_labels - ) - ) - - record: fast_task_execution_duration - expr: | - max by (label_domain, label_project, label_execution_id, label_entity_id, namespace, pod)( - label_replace( - label_replace( - label_replace( - label_replace( - label_replace( - label_replace( - flyte:propeller:all:node:fast_task:fast_task_execution_duration{domain!="", project!="", execution_id!="", exported_namespace!="", exported_pod!=""}, - "label_entity_id", "$1", "execution_id", "(.*)" # join key - ), - "label_execution_id", "$1", "execution_id", "(.*)" - ), - "label_project", "$1", "project", "(.*)" # project - ), - "label_domain", "$1", "domain", "(.*)" # domain - ), - "namespace", "$1", "exported_namespace", "(.*)" - ), - "pod", "$1", "exported_pod", 
"(.*)" - ) - ) - - record: fast_task_execution_duration_rate - expr: | - irate(fast_task_execution_duration[1m]) > 0 or absent(irate(fast_task_execution_duration[1m])) * fast_task_execution_duration - - record: entity_id:mem_usage_bytes_total_per_node:sum # Allocated memory (max(requested, consumed)) aggregated per node and entity, where entity is either a task/workflow execution or an app. - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( # aggregate up to entity - # First, calculate the allocated memory for each pod - max by (namespace, pod) ( # this is the case where consumed (the memory working set) exceeds requested memory - ( - sum by (namespace, pod) ( - container_memory_working_set_bytes{namespace!="",pod!="",image!=""} - ) - > sum by (namespace, pod) ( - kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="memory"} - ) - ) - or sum by (namespace, pod) ( # this is the case where memory requests are <= consumed memory - kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="memory"} # needed to add node!="" to dedupe - ) - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but we do not want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="",label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - # Now join in node identifiers which are used for subsequent overhead calculations - * on (namespace, pod) group_left(node) ( - max by (namespace, pod, node) (kube_pod_info{node!=""}) # needed to add node!="" to dedupe - ) - ) - - record: entity_id:cpu_usage_per_node:sum # Allocated cpu (max(requested, consumed)) aggregated per node and entity, where entity is either a task/workflow execution or an app. - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( - # First, calculate the allocated cpu for each pod - max by (namespace, pod) ( # this is the case where consumed (the cpu usage seconds total) exceeds requested cpu - ( - sum by (namespace, pod) ( - irate(container_cpu_usage_seconds_total{namespace!="",pod!="",image!=""}[5m]) - ) - > sum by (namespace, pod) ( - kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="cpu"} - ) - ) - or sum by (namespace, pod) ( # this is the case where cpu requests are <= consumed cpu - kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="cpu"} - ) - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - # Now join in node identifiers which are used for subsequent overhead calculations - * on (namespace, pod) group_left(node) ( - max by (namespace, pod, node) (kube_pod_info{node!=""}) # needed to add node!="" to dedupe - ) - ) - - record: entity_id:gpu_usage_per_node:sum # Allocated gpu aggregated per node and entity, where entity is either a task/workflow execution or an app. - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( - # First, grab the allocated gpu for each pod (which is always either 1 or zero, since k8s can't split gpus the way it can with cpu/memory) - max by (namespace, pod) ( - pod_gpu_allocation - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - # Now join in node identifiers which are used for subsequent overhead calculations - * on (namespace, pod) group_left(node) ( - max by (namespace, pod, node) (kube_pod_info{node!=""}) # needed to add node!="" to dedupe - ) - ) - - record: entity_id:used_mem_bytes:sum # the sum of used memory across all containers in an entity (numerator for aggregate utilization calculations) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # aggregate up to entity - # First, calculate the used memory for each pod - sum by (namespace, pod) ( - container_memory_working_set_bytes{namespace!="",pod!="",image!=""} - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but we do not want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - ) - - record: entity_id:allocated_mem_bytes:sum # the sum of allocated memory across all containers in an entity (denominator for aggregate utilization calculations) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # aggregate up to entity (remove node) - entity_id:mem_usage_bytes_total_per_node:sum - ) - - record: entity_id:used_cpu:sum # the sum of used cpu across all containers in an entity (numerator for aggregate utilization calculations) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( - sum by (namespace, pod) ( - irate(container_cpu_usage_seconds_total{namespace!="",pod!="",image!=""}[5m]) - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - ) - - record: entity_id:allocated_cpu:sum # the sum of allocated cpu across all containers in an entity (denominator for aggregate utilization calculations) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # aggregate up to entity (remove node) - entity_id:cpu_usage_per_node:sum - ) - - record: entity_id:sm_occupancy:avg # the simple average of SM occupancy (a good generic measure of GPU utilization) per entity - expr: | - avg by (label_entity_type, label_domain, label_project, label_entity_id) ( - # First, grab the SM occupancy for each pod - max by (namespace, pod) ( - DCGM_FI_PROF_SM_OCCUPANCY # SM occupancy is a good proxy for actual GPU usage - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - ) - - record: entity_id:gpu_count:sum # the count of running gpu pods per entity (need this to weight the gpu utilization when aggregating upwards - i.e. project-level) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( - # First, grab the allocated gpu for each pod (which is always either 1 or zero, since k8s can't split gpus the way it can with cpu/memory) - max by (namespace, pod) ( - pod_gpu_allocation - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - ) - - record: entity_id:weighted_sm_occupancy:sum # product of SM occupancy and allocated GPU count (something like "used memory", numerator of weighted calcs) - expr: | - entity_id:sm_occupancy:avg - * on (label_domain, label_project, label_entity_type, label_entity_id) entity_id:gpu_count:sum - - record: entity_id:allocated_mem_cost:sum # Allocated cost of memory for each workflow/task execution and app. - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, type) ( - entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB - * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type - ) - - record: entity_id:allocated_cpu_cost:sum # Allocated cost of cpu for each workflow/task execution and app. 
- expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, type)( - entity_id:cpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - - record: entity_id:allocated_gpu_cost:sum # Allocated cost of gpu for each workflow/task execution and app. - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, type)( - entity_id:gpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - - record: entity_id:allocated_cost:sum # Allocated cost of memory, cpu, and gpu for each workflow/task execution and app. - expr: | - label_replace( - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # for the sum to work, the labels need to be different on each "or" element (type label) - entity_id:allocated_mem_cost:sum - or - entity_id:allocated_cpu_cost:sum - or - entity_id:allocated_gpu_cost:sum - ), - "type", "allocated", "", "" # add type info - ) - - record: entity_id:overhead_cost:sum # The amount of overhead costs (node costs that we can't allocate with container resources) to allocate to each entity (workflow/task execution or app) - expr: | - label_replace( - sum by (label_entity_type, label_entity_id, label_domain, label_project)( # Aggregate the per-node metrics up to workflow/task execution or app (label_entity_id) - # Start with each execution's and app's allocated cost per node - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( # for the sum to work, the labels need to be different on each "or" element (type label) - entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB - * on (node) group_left(type) label_replace(avg by (node) 
(node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:cpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:gpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - # Then divide out the total allocated cost per node to get the proportion of allocated cost associated with each entity - / on (node) group_left()( - sum by (node) ( # for the sum to work, the labels need to be different on each "or" element (type label) - entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB - * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:cpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:gpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - > 0 # need to avoid dividing by zero, or gaps in the data can cause NaNs to proliferate, borking all charts - ) - # Then multiply by the overhead cost per node - * on (node) group_left() ( - # To calculate overhead, start with the true cost of running each node - avg by (node)(kube_node_labels{label_flyte_org_node_role="worker"}) # only look at worker nodes - * on (node) max by (node) ( - node_total_hourly_cost{instance_type!=""} # sometimes, the instance_type can be null, causing an 
unlabeled label to show up in the Compute Costs dashboard charts - ) * (15 / 3600) # convert hourly cost to 15-secondly cost - # Then subtract out the total allocated cost on each node - - on (node) group_left()( - sum by (node) ( # for the sum to work, the labels need to be different on each "or" element (type label) - entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB - * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:cpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:gpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - ) - ) - ), - "type", "overhead", "", "" # add type info - ) - - record: entity_id:total_cost:sum # Total cost of each entity (workflow/task execution or app), including allocated (from container resources) and overhead (proportion of unallocated node costs) - expr: | - label_replace( - sum by (label_domain, label_project, label_entity_id, label_entity_type) ( - entity_id:allocated_cost:sum - or - entity_id:overhead_cost:sum - ), - "type", "total", "", "" # add type info - ) - - record: node:total_cost:sum # Total cost of all nodes - expr: | - sum ( - avg by (node)(kube_node_labels{label_flyte_org_node_role="worker", label_node_kubernetes_io_instance_type!=""}) # only look at worker nodes - * on (node) group_left() node_total_hourly_cost{instance_type!=""} * (15 / 3600) # convert hourly cost to 15-secondly cost - ) - - record: node_type:total_cost:sum # Total cost of nodes grouped by node type - expr: | - sum by (node_type)( - avg by 
(node)(kube_node_labels{label_flyte_org_node_role="worker", label_node_kubernetes_io_instance_type!=""}) # only look at worker nodes - * on (node) group_left(node_type) label_replace(node_total_hourly_cost{instance_type!=""}, "node_type", "$1", "instance_type", "(.*)") * (15 / 3600) # convert hourly cost to 15-secondly cost and rename label - ) - - record: node_type:uptime_hours:sum # Total uptime of nodes grouped by node type - expr: | - sum by (node_type)( - avg by (node, node_type)( # dedupe - label_replace(kube_node_labels{label_flyte_org_node_role="worker", label_node_kubernetes_io_instance_type!=""}, "node_type", "$1", "label_node_kubernetes_io_instance_type", "(.*)") # relabel - ) - ) * (15 / 3600) # convert to number of hours per 15-second observation # Aggregate the above into visible metrics - - name: cost_rollup_15m - interval: 15m - rules: - - record: execution_info15m - expr: | - max_over_time(execution_info[15m:15s]) - - record: app_info15m - expr: | - max_over_time(app_info[15m:15s]) - - record: workspace_info15m - expr: | - max_over_time(workspace_info[15m:15s]) - - record: entity_id:allocated_mem_bytes:sum15m - expr: | - sum_over_time(entity_id:allocated_mem_bytes:sum[15m:15s]) - - record: entity_id:used_mem_bytes:sum15m - expr: | - sum_over_time(entity_id:used_mem_bytes:sum[15m:15s]) - - record: entity_id:allocated_cpu:sum15m - expr: | - sum_over_time(entity_id:allocated_cpu:sum[15m:15s]) - - record: entity_id:used_cpu:sum15m - expr: | - sum_over_time(entity_id:used_cpu:sum[15m:15s]) - - record: entity_id:weighted_sm_occupancy:sum15m - expr: | - sum_over_time(entity_id:weighted_sm_occupancy:sum[15m:15s]) - - record: entity_id:gpu_count:sum15m - expr: | - sum_over_time(entity_id:gpu_count:sum[15m:15s]) - - record: entity_id:allocated_mem_cost:sum15m - expr: | - sum_over_time(entity_id:allocated_mem_cost:sum[15m:15s]) - - record: entity_id:allocated_cpu_cost:sum15m - expr: | - sum_over_time(entity_id:allocated_cpu_cost:sum[15m:15s]) - - record: 
entity_id:allocated_gpu_cost:sum15m - expr: | - sum_over_time(entity_id:allocated_gpu_cost:sum[15m:15s]) - - record: entity_id:allocated_cost:sum15m - expr: | - sum_over_time(entity_id:allocated_cost:sum[15m:15s]) - - record: entity_id:overhead_cost:sum15m - expr: | - sum_over_time(entity_id:overhead_cost:sum[15m:15s]) - - record: entity_id:total_cost:sum15m - expr: | - sum_over_time(entity_id:total_cost:sum[15m:15s]) - - record: node:total_cost:sum15m - expr: | - sum_over_time(node:total_cost:sum[15m:15s]) - - record: node_type:total_cost:sum15m - expr: | - sum_over_time(node_type:total_cost:sum[15m:15s]) - - record: node_type:uptime_hours:sum15m - expr: | - sum_over_time(node_type:uptime_hours:sum[15m:15s]) + envoy-bootstrap.yaml: | + dynamic_resources: + ads_config: + transport_api_version: V3 + api_type: GRPC + rate_limit_settings: {} + grpc_services: + - envoy_grpc: {cluster_name: xds_cluster} + cds_config: + resource_api_version: V3 + ads: {} + lds_config: + resource_api_version: V3 + ads: {} + node: + cluster: kourier-knative + id: 3scale-kourier-gateway + static_resources: + listeners: + - name: stats_listener + address: + socket_address: + address: 0.0.0.0 + port_value: 9000 + filter_chains: + - filters: + - name: envoy.filters.network.http_connection_manager + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager + stat_prefix: stats_server + http_filters: + - name: envoy.filters.http.router + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router + route_config: + virtual_hosts: + - name: admin_interface + domains: + - "*" + routes: + - match: + safe_regex: + regex: '/(certs|stats(/prometheus)?|server_info|clusters|listeners|ready)?' 
+ headers: + - name: ':method' + string_match: + exact: GET + route: + cluster: service_stats + - match: + safe_regex: + regex: '/drain_listeners' + headers: + - name: ':method' + string_match: + exact: POST + route: + cluster: service_stats + clusters: + - name: service_stats + connect_timeout: 0.250s + type: static + load_assignment: + cluster_name: service_stats + endpoints: + lb_endpoints: + endpoint: + address: + socket_address: + address: 127.0.0.1 + port_value: 9901 + - name: xds_cluster + # This keepalive is recommended by envoy docs. + # https://www.envoyproxy.io/docs/envoy/latest/api-docs/xds_protocol + typed_extension_protocol_options: + envoy.extensions.upstreams.http.v3.HttpProtocolOptions: + "@type": type.googleapis.com/envoy.extensions.upstreams.http.v3.HttpProtocolOptions + explicit_http_config: + http2_protocol_options: + connection_keepalive: + interval: 30s + timeout: 5s + connect_timeout: 1s + load_assignment: + cluster_name: xds_cluster + endpoints: + lb_endpoints: + endpoint: + address: + socket_address: + address: "net-kourier-controller" + port_value: 18000 + type: STRICT_DNS + admin: + access_log: + - name: envoy.access_loggers.stdout + typed_config: + "@type": type.googleapis.com/envoy.extensions.access_loggers.stream.v3.StdoutAccessLog + address: + socket_address: + address: 127.0.0.1 + port_value: 9901 + stats_config: + stats_tags: + - tag_name: name + regex: '^.*?\.u/.*?n=(.*?)/.*?u\..*$' + - tag_name: domain + regex: '^.*?\.u/.*?d=(.*?)/.*?u\..*$' + - tag_name: org + regex: '^.*?\.u/.*?o=(.*?)/.*?u\..*$' + - tag_name: project + regex: '^.*?\.u/.*?p=(.*?)/.*?u\..*$' + - tag_name: target + regex: '^.*?\.u/.*?t=(.*?)/.*?u\..*$' + - tag_name: target_namespace + regex: '^.*?\.u/.*?tns=(.*?)/.*?u\..*$' --- -# Source: dataplane/templates/propeller/configmap.yaml +# Source: dataplane/templates/webhook/configmap.yaml apiVersion: v1 kind: ConfigMap metadata: - name: flyte-propeller-config + name: union-pod-webhook-config namespace: union data: - 
admin.yaml: | - admin: - clientId: 'test-client-id' - clientSecretLocation: /etc/union/secret/client_secret - endpoint: dns:///test.example.com - insecure: false - event: - capacity: 1000 - rate: 500 - type: admin - catalog.yaml: | - catalog-cache: - cache-endpoint: dns:///test.example.com - endpoint: dns:///test.example.com - insecure: false - type: fallback - use-admin-auth: true - copilot.yaml: | - plugins: - k8s: - co-pilot: - image: 'cr.flyte.org/flyteorg/flytecopilot:v1.14.1' - name: flyte-copilot- - start-timeout: 30s - core.yaml: | - propeller: - downstream-eval-duration: 30s - enable-admin-launcher: true - leader-election: - enabled: true - lease-duration: 15s - lock-config-map: - name: propeller-leader - namespace: 'union' - renew-deadline: 10s - retry-period: 2s - limit-namespace: all - literal-offloading-config: - enabled: true - max-workflow-retries: 30 - metadata-prefix: metadata/propeller - metrics-prefix: flyte - prof-port: 10254 - queue: - batch-size: -1 - batching-interval: 2s - queue: - base-delay: 5s - capacity: 1000 - max-delay: 120s - rate: 100 - type: maxof - sub-queue: - capacity: 100 - rate: 10 - type: bucket - type: batch - rawoutput-prefix: 's3://' - workers: 4 - workflow-reeval-duration: 30s + core.yaml: | + + webhook: certDir: /etc/webhook/certs + disableCreateMutatingWebhookConfig: true embeddedSecretManagerConfig: imagePullSecrets: enabled: true @@ -3840,85 +3393,13 @@ data: namespace: 'union' type: 'K8s' listenPort: '9443' + localCert: true secretManagerTypes: - Embedded - K8s - serviceName: flyte-pod-webhook + secretName: union-pod-webhook + serviceName: union-pod-webhook servicePort: '443' - enabled_plugins.yaml: | - plugins: - connector-service: - defaultConnector: - defaultServiceConfig: '{"loadBalancingConfig": [{"round_robin":{}}]}' - endpoint: k8s:///flyteconnector.union:8000 - tasks: - task-plugins: - default-for-task-types: - actor: fast-task - container: container - container_array: k8s-array - sidecar: sidecar - 
enabled-plugins: - - container - - sidecar - - k8s-array - - echo - - fast-task - - connector-service - k8s.yaml: | - plugins: - k8s: - default-cpus: 100m - default-env-vars: [] - default-memory: 100Mi - logger.yaml: | - logger: - formatter: - type: json - level: 4 - show-source: true - namespace_config.yaml: | - namespace_config: - namespace_mapping: - template: '{{ project }}-{{ domain }}' - resource_manager.yaml: | - propeller: - resourcemanager: - type: noop - task_logs.yaml: | - plugins: - logs: - cloudwatch-enabled: false - dynamic-log-links: - - vscode: - displayName: VS Code Debugger - templateUris: - - /dataplane/pod/v1/generated_name/task/{{.executionProject}}/{{.executionDomain}}/{{.executionName}}/{{.nodeID}}/{{.taskRetryAttempt}}/{{.taskProject}}/{{.taskDomain}}/{{.taskID}}/{{.taskVersion}}/ - kubernetes-enabled: false - templates: - - displayName: Task Logs - scheme: TaskExecution - templateUris: - - /console/projects/{{.executionProject}}/domains/{{.executionDomain}}/executions/{{.executionName}}/nodeId/{{.nodeID}}/taskId/{{.taskID}}/attempt/{{.taskRetryAttempt}}/view/logs?duration=all&fromExecutionNav=true - storage.yaml: | - storage: - container: "" - type: stow - stow: - kind: s3 - config: - auth_type: accesskey - access_key_id: - secret_key: - disable_ssl: false - endpoint: - region: us-east-1 - enable-multicontainer: false - limits: - maxDownloadMBs: 1024 - cache: - max_size_mbs: 0 - target_gc_percent: 70 --- # Source: dataplane/charts/fluentbit/templates/clusterrole.yaml apiVersion: rbac.authorization.k8s.io/v1 @@ -3942,275 +3423,1143 @@ rules: - list - watch --- -# Source: dataplane/charts/kube-state-metrics/templates/role.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: knative-serving-operator-aggregated + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +aggregationRule: + clusterRoleSelectors: + # This (along with escalate below) allows the Operator to pick up any + # roles that are provided to the admin of the cluster by knative serving + # automatically. + - matchExpressions: + - {key: serving.knative.dev/release, operator: Exists} +rules: [] # Rules are automatically filled in by the controller manager. +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: knative-serving-operator-aggregated-stable + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +aggregationRule: + clusterRoleSelectors: + # This (along with escalate below) allows the Operator to pick up any + # roles that are provided to the admin of the cluster by knative serving + # automatically. + - matchExpressions: + - {key: app.kubernetes.io/name, operator: In, values: ["knative-serving"]} +rules: [] # Rules are automatically filled in by the controller manager. 
+--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: knative-eventing-operator-aggregated + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +aggregationRule: + clusterRoleSelectors: + # This (along with escalate below) allows the Operator to pick up any + # roles that are provided to the admin of the cluster by knative eventing + # automatically. + - matchExpressions: + - {key: eventing.knative.dev/release, operator: Exists} +rules: [] # Rules are automatically filled in by the controller manager. +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: knative-eventing-operator-aggregated-stable + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +aggregationRule: + clusterRoleSelectors: + # This (along with escalate below) allows the Operator to pick up any + # roles that are provided to the admin of the cluster by knative eventing + # automatically. + - matchExpressions: + - {key: app.kubernetes.io/name, operator: In, values: ["knative-eventing"]} +rules: [] # Rules are automatically filled in by the controller manager. +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: knative-serving-operator + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +rules: + - apiGroups: + - operator.knative.dev + resources: + - '*' + verbs: + - '*' + # Bootstrapping permissions. + # Roles that are explicitly bound but which are specified by this Operator + # MUST be specified here with 'get' and 'bind'. + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterroles + resourceNames: + - system:auth-delegator + verbs: + - bind + - get + - apiGroups: + - rbac.authorization.k8s.io + resources: + - roles + resourceNames: + - extension-apiserver-authentication-reader + verbs: + - bind + - get + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterroles + - roles + verbs: + - create + - delete + # Escalate is necessary in order to create a role using cluster role aggregation, + # and to allow the Operator to bootstrap itself into the necessary set of + # permissions, even as those continue to evolve upstream. + - escalate + - get + - list + - update + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterrolebindings + - rolebindings + verbs: + - create + - delete + - list + - get + - update + # Permissions required for Knative controller + # infra. 
+ - apiGroups: + - apiregistration.k8s.io + resources: + - apiservices + verbs: + - update + - apiGroups: + - "" + resources: + - services + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - caching.internal.knative.dev + resources: + - images + verbs: + - '*' + - apiGroups: + - "" + resources: + - namespaces + verbs: + - get + - update + - watch + - apiGroups: + - '' + resources: + - events + verbs: + - create + - update + - patch + - apiGroups: + - '' + resources: + - configmaps + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - security.istio.io + - apps + - policy + resources: + - poddisruptionbudgets + - peerauthentications + - deployments + - daemonsets + - replicasets + - statefulsets + verbs: + - create + - delete + - get + - list + - watch + - update + - apiGroups: + - apiregistration.k8s.io + resources: + - apiservices + verbs: + - create + - delete + - get + - list + - apiGroups: + - autoscaling + resources: + - horizontalpodautoscalers + verbs: + - create + - delete + - get + - list + - apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - '*' + - apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - '*' + # Old resources that need cleaning up that are not in the knative-serving + # namespace. 
+ - apiGroups: + - "" + resources: + - services + - deployments + - horizontalpodautoscalers + resourceNames: + - knative-ingressgateway + verbs: + - delete + - apiGroups: + - "" + resources: + - configmaps + resourceNames: + - config-controller + verbs: + - delete + - apiGroups: + - "" + resources: + - serviceaccounts + resourceNames: + - knative-serving-operator + verbs: + - delete + # for contour TLS + - apiGroups: + - projectcontour.io + resources: + - httpproxies + - tlscertificatedelegations + verbs: + - get + - list + - watch + - update + - create + - delete + - deletecollection + - patch + # for security-guard + - apiGroups: + - guard.security.knative.dev + resources: + - guardians + verbs: + - get + - list + - watch + - create + - update + - patch + - delete + - apiGroups: + - "" + resources: + - configmaps + verbs: + - get + - list + - watch + - create + - update + - patch + - delete + - apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + verbs: + - create + - apiGroups: + - "" + resources: + - pods + verbs: + - get +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: knative-eventing-operator + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +rules: + - apiGroups: + - operator.knative.dev + resources: + - '*' + verbs: + - '*' + # Bootstrapping permissions. + # Roles that are explicitly bound but which are specified by this Operator + # MUST be specified here with 'get' and 'bind'. + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterroles + - roles + verbs: + - create + - delete + # Escalate is necessary in order to create a role using cluster role aggregation, + # and to allow the Operator to bootstrap itself into the necessary set of + # permissions, even as those continue to evolve upstream. 
+ - escalate + - get + - list + - update + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterrolebindings + - rolebindings + verbs: + - create + - delete + - list + - get + - update + # Permissions required for Knative controller + # infra. + - apiGroups: + - apiregistration.k8s.io + resources: + - apiservices + verbs: + - update + - apiGroups: + - "" + resources: + - services + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - caching.internal.knative.dev + resources: + - images + verbs: + - '*' + - apiGroups: + - "" + resources: + - namespaces + verbs: + - get + - update + - watch + - apiGroups: + - '' + resources: + - events + verbs: + - create + - update + - patch + - apiGroups: + - '' + resources: + - configmaps + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - apps + resources: + - deployments + - daemonsets + - replicasets + - statefulsets + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - apiregistration.k8s.io + resources: + - apiservices + verbs: + - create + - delete + - get + - list + - apiGroups: + - autoscaling + resources: + - horizontalpodautoscalers + verbs: + - create + - delete + - update + - get + - list + - apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - '*' + - apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - '*' + - apiGroups: + - batch + resources: + - jobs + verbs: + - create + - delete + - update + - get + - list + - watch + # Old resources that need cleaning up that are not in the knative-eventing + # namespace. 
+ - apiGroups: + - "" + resources: + - serviceaccounts + resourceNames: + - knative-eventing-operator + verbs: + - delete + # for RabbitMQ messaging topology objects + - apiGroups: + - rabbitmq.com + resources: + - rabbitmqclusters + verbs: + - "get" + - "list" + - "watch" + - apiGroups: + - rabbitmq.com + resources: + - bindings + - queues + - exchanges + verbs: + - create + - delete + - get + - list + - patch + - update + - watch + - apiGroups: + - rabbitmq.com + resources: + - bindings/status + - queues/status + - exchanges/status + verbs: + - get + # for Kafka eventing source + - apiGroups: + - keda.sh + resources: + - scaledobjects + - scaledobjects/finalizers + - scaledobjects/status + - triggerauthentications + - triggerauthentications/status + verbs: + - get + - list + - watch + - update + - create + - delete + # Internal APIs + - apiGroups: + - "internal.kafka.eventing.knative.dev" + resources: + - "consumers" + - "consumers/status" + - "consumergroups" + - "consumergroups/status" + verbs: + - create + - get + - list + - watch + - patch + - update + - delete + - apiGroups: + - "internal.kafka.eventing.knative.dev" + resources: + - "consumers/finalizers" + - "consumergroups/finalizers" + verbs: + - update + - delete + - apiGroups: + - apps + resources: + - statefulsets/scale + verbs: + - get + - list + - watch + - update + - patch + - create + - delete + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterrolebindings + verbs: + - watch + - apiGroups: + - "*" + resources: + - configmaps + verbs: + - delete + - apiGroups: + - "*" + resources: + - configmaps + - services + verbs: + - get + - list + - watch + - update + - create + - delete + - apiGroups: + - "*" + resources: + - pods + verbs: + - list + - update + - get + - watch + - apiGroups: + - "*" + resources: + - pods/finalizers + verbs: + - get + - list + - create + - update + - delete + - apiGroups: + - "*" + resources: + - events + verbs: + - patch + - create + - apiGroups: + - "*" + 
resources: + - secrets + verbs: + - get + - list + - watch + - update + - create + - delete + - apiGroups: + - "*" + resources: + - nodes + verbs: + - get + - list + - watch + - apiGroups: + - "*" + resources: + - serviceaccounts + verbs: + - get + - list + - watch + - update + - create + - delete + - apiGroups: + - "*" + resources: + - configmaps + resourceNames: + - kafka-channel-config + verbs: + - patch + - apiGroups: + - "*" + resources: + - horizontalpodautoscalers + resourceNames: + - kafka-webhook + verbs: + - delete + - apiGroups: + - "*" + resources: + - leases + verbs: + - delete + - apiGroups: + - "*" + resources: + - poddisruptionbudgets + resourceNames: + - kafka-webhook + verbs: + - delete + - apiGroups: + - "*" + resources: + - services + verbs: + - patch + - apiGroups: + - "apps" + resources: + - deployments + verbs: + - deletecollection + # Eventing TLS + - apiGroups: + - "cert-manager.io" + resources: + - certificates + - issuers + - clusterissuers + verbs: + - create + - delete + - update + - list + - get + - watch + - apiGroups: + - "trust.cert-manager.io" + resources: + - bundles + verbs: + - create + - delete + - update + - list + - get + - watch +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: knative-operator-webhook + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +rules: + # For watching logging configuration and getting certs. + - apiGroups: + - "" + resources: + - "configmaps" + verbs: + - "get" + - "list" + - "watch" + # For manipulating certs into secrets. 
+ - apiGroups: + - "" + resources: + - "namespaces" + verbs: + - "get" + - "create" + - "update" + - "list" + - "watch" + - "patch" + # finalizers are needed for the owner reference of the webhook + - apiGroups: + - "" + resources: + - "namespaces/finalizers" + verbs: + - "update" + # For getting our Deployment so we can decorate with ownerref. + - apiGroups: + - "apps" + resources: + - "deployments" + verbs: + - "get" + - apiGroups: + - "apps" + resources: + - "deployments/finalizers" + verbs: + - update + # For actually registering our webhook. + - apiGroups: + - "admissionregistration.k8s.io" + resources: + - "mutatingwebhookconfigurations" + - "validatingwebhookconfigurations" + verbs: &everything + - "get" + - "list" + - "create" + - "update" + - "delete" + - "patch" + - "watch" + # For leader election + - apiGroups: + - "coordination.k8s.io" + resources: + - "leases" + verbs: *everything + # Necessary for conversion webhook. These are copied from the serving + # TODO: Do we really need all these permissions? 
+ - apiGroups: ["apiextensions.k8s.io"] + resources: ["customresourcedefinitions"] + verbs: ["get", "list", "create", "update", "delete", "patch", "watch"] +--- +# Source: dataplane/charts/fluentbit/templates/clusterrolebinding.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: release-name-fluentbit + labels: + helm.sh/chart: fluentbit-0.48.9 + app.kubernetes.io/name: fluentbit + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "3.2.8" + app.kubernetes.io/managed-by: Helm +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: release-name-fluentbit +subjects: + - kind: ServiceAccount + name: union-system + namespace: union +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# TODO: Consider restriction of non-aggregated role to knativeservings namespaces. 
+apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: knative-serving-operator + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-serving-operator +subjects: + - kind: ServiceAccount + name: knative-operator + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# TODO: Consider restriction of non-aggregated role to knativeeventing namespaces. +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: knative-eventing-operator + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-eventing-operator +subjects: + - kind: ServiceAccount + name: knative-operator + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: operator-webhook + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-operator-webhook +subjects: + - kind: ServiceAccount + name: operator-webhook + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: knative-serving-operator-aggregated + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-serving-operator-aggregated +subjects: + - kind: ServiceAccount + name: knative-operator + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: knative-serving-operator-aggregated-stable + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-serving-operator-aggregated-stable +subjects: + - kind: ServiceAccount + name: knative-operator + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: knative-eventing-operator-aggregated + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-eventing-operator-aggregated +subjects: + - kind: ServiceAccount + name: knative-operator + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: knative-eventing-operator-aggregated-stable + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-eventing-operator-aggregated-stable +subjects: + - kind: ServiceAccount + name: knative-operator + 
namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole +kind: Role metadata: - labels: - helm.sh/chart: kube-state-metrics-5.30.1 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: kube-state-metrics - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" - name: release-name-kube-state-metrics + namespace: "union" + name: knative-operator-webhook + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator rules: - -- apiGroups: ["certificates.k8s.io"] - resources: - - certificatesigningrequests - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - configmaps - verbs: ["list", "watch"] - -- apiGroups: ["batch"] - resources: - - cronjobs - verbs: ["list", "watch"] - -- apiGroups: ["extensions", "apps"] - resources: - - daemonsets - verbs: ["list", "watch"] - -- apiGroups: ["extensions", "apps"] - resources: - - deployments - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - endpoints - verbs: ["list", "watch"] - -- apiGroups: ["autoscaling"] - resources: - - horizontalpodautoscalers - verbs: ["list", "watch"] - -- apiGroups: ["extensions", "networking.k8s.io"] - resources: - - ingresses - verbs: 
["list", "watch"] - -- apiGroups: ["batch"] - resources: - - jobs - verbs: ["list", "watch"] - -- apiGroups: ["coordination.k8s.io"] - resources: - - leases - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - limitranges - verbs: ["list", "watch"] - -- apiGroups: ["admissionregistration.k8s.io"] - resources: - - mutatingwebhookconfigurations - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - namespaces - verbs: ["list", "watch"] - -- apiGroups: ["networking.k8s.io"] - resources: - - networkpolicies - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - nodes - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - persistentvolumeclaims - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - persistentvolumes - verbs: ["list", "watch"] - -- apiGroups: ["policy"] - resources: - - poddisruptionbudgets - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - pods - verbs: ["list", "watch"] - -- apiGroups: ["extensions", "apps"] - resources: - - replicasets - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - replicationcontrollers - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - resourcequotas - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - secrets - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - services - verbs: ["list", "watch"] - -- apiGroups: ["apps"] - resources: - - statefulsets - verbs: ["list", "watch"] - -- apiGroups: ["storage.k8s.io"] - resources: - - storageclasses - verbs: ["list", "watch"] - -- apiGroups: ["admissionregistration.k8s.io"] - resources: - - validatingwebhookconfigurations - verbs: ["list", "watch"] - -- apiGroups: ["storage.k8s.io"] - resources: - - volumeattachments - verbs: ["list", "watch"] + # For manipulating certs into secrets. 
+ - apiGroups: + - "" + resources: + - "secrets" + verbs: + - "get" + - "create" + - "update" + - "list" + - "watch" + - "patch" --- -# Source: dataplane/charts/opencost/templates/clusterrole.yaml -# Cluster role giving opencost to get, list, watch required resources -# No write permissions are required +# Source: dataplane/templates/monitoring/prometheus-rbac.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole +kind: Role metadata: - name: release-name-opencost + name: union-operator-prometheus-rbac + namespace: union labels: - helm.sh/chart: opencost-1.42.0 - app.kubernetes.io/name: opencost - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "1.111.0" - app.kubernetes.io/part-of: opencost - app.kubernetes.io/managed-by: Helm + release: release-name rules: + # Prometheus server scrape permissions - apiGroups: [""] resources: - - configmaps - - deployments - nodes - - pods + - nodes/proxy + - nodes/metrics - services - - resourcequotas - - replicationcontrollers - - limitranges - - persistentvolumeclaims - - persistentvolumes - - namespaces - endpoints + - pods + - ingresses + - configmaps verbs: - get - list - watch - - apiGroups: - - extensions - resources: - - daemonsets - - deployments - - replicasets - verbs: - - get - - list - - watch - - apiGroups: - - apps - resources: - - statefulsets - - deployments - - daemonsets - - replicasets - verbs: - - list - - watch - - apiGroups: - - batch - resources: - - cronjobs - - jobs - verbs: - - get - - list - - watch - - apiGroups: - - autoscaling - resources: - - horizontalpodautoscalers - verbs: - - get - - list - - watch - - apiGroups: - - policy + - apiGroups: ["extensions", "networking.k8s.io"] resources: - - poddisruptionbudgets + - ingresses/status + - ingresses verbs: - get - list - watch - - apiGroups: - - storage.k8s.io + - apiGroups: ["discovery.k8s.io"] resources: - - storageclasses + - endpointslices verbs: - get - list - watch ---- -# Source: 
dataplane/templates/clusterresourcesync/serviceaccount.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: union-clustersync-resource -rules: - - apiGroups: - - "" - - rbac.authorization.k8s.io - resources: - - configmaps - - namespaces - - pods - - resourcequotas - - roles - - rolebindings - - secrets - - services - - serviceaccounts - - clusterrolebindings - - podtemplates - verbs: - - '*' + # kube-state-metrics permissions (rbac.create=false on the subchart; managed here instead) + - apiGroups: ["certificates.k8s.io"] + resources: ["certificatesigningrequests"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["configmaps"] + verbs: ["list", "watch"] + - apiGroups: ["batch"] + resources: ["cronjobs"] + verbs: ["list", "watch"] + - apiGroups: ["extensions", "apps"] + resources: ["daemonsets"] + verbs: ["list", "watch"] + - apiGroups: ["extensions", "apps"] + resources: ["deployments"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["endpoints"] + verbs: ["list", "watch"] + - apiGroups: ["discovery.k8s.io"] + resources: ["endpointslices"] + verbs: ["list", "watch"] + - apiGroups: ["autoscaling"] + resources: ["horizontalpodautoscalers"] + verbs: ["list", "watch"] + - apiGroups: ["extensions", "networking.k8s.io"] + resources: ["ingresses"] + verbs: ["list", "watch"] + - apiGroups: ["batch"] + resources: ["jobs"] + verbs: ["list", "watch"] + - apiGroups: ["coordination.k8s.io"] + resources: ["leases"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["limitranges"] + verbs: ["list", "watch"] + - apiGroups: ["admissionregistration.k8s.io"] + resources: ["mutatingwebhookconfigurations"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["namespaces"] + verbs: ["list", "watch"] + - apiGroups: ["networking.k8s.io"] + resources: ["networkpolicies"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["persistentvolumeclaims"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: 
["persistentvolumes"] + verbs: ["list", "watch"] + - apiGroups: ["policy"] + resources: ["poddisruptionbudgets"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["pods"] + verbs: ["list", "watch"] + - apiGroups: ["extensions", "apps"] + resources: ["replicasets"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["replicationcontrollers"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["resourcequotas"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["secrets"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["services"] + verbs: ["list", "watch"] + - apiGroups: ["apps"] + resources: ["statefulsets"] + verbs: ["list", "watch"] + - apiGroups: ["storage.k8s.io"] + resources: ["storageclasses"] + verbs: ["list", "watch"] + - apiGroups: ["admissionregistration.k8s.io"] + resources: ["validatingwebhookconfigurations"] + verbs: ["list", "watch"] + - apiGroups: ["storage.k8s.io"] + resources: ["volumeattachments"] + verbs: ["list", "watch"] --- # Source: dataplane/templates/nodeexecutor/serviceaccount.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole +kind: Role metadata: name: union-executor labels: @@ -4261,11 +4610,12 @@ rules: - delete - update --- -# Source: dataplane/templates/operator/serviceaccount-proxy.yaml +# Source: dataplane/templates/operator/serviceaccount-proxy-secret.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole +kind: Role metadata: - name: proxy-system + name: union-system-secret + namespace: union labels: app.kubernetes.io/name: operator-proxy app.kubernetes.io/instance: release-name @@ -4275,146 +4625,86 @@ rules: - apiGroups: - '*' resources: - - events - - flyteworkflows - - pods/log - - pods - - rayjobs - - resourcequotas - verbs: - - get - - list - - watch ---- -# Source: dataplane/templates/operator/serviceaccount.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: operator-system - labels: - app.kubernetes.io/name: 
union-operator - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm -rules: - # Allow Access to all resources under flyte.lyft.com - - apiGroups: - - flyte.lyft.com - resources: - - flyteworkflows - - flyteworkflows/finalizers - verbs: - - get - - list - - watch - - create - - update - - delete - - patch - - post - - deletecollection - - apiGroups: - - '*' - resources: - - resourcequotas - - pods - - configmaps - - podtemplates - secrets - - namespaces - - nodes verbs: - get - list - - watch - create - update - delete - - nonResourceURLs: - - /metrics - verbs: - - get --- -# Source: dataplane/templates/prometheus/rbac.yaml +# Source: dataplane/templates/operator/serviceaccount-proxy.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole +kind: Role metadata: - name: union-operator-prometheus + name: proxy-system labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/name: operator-proxy app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" + platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus rules: - - apiGroups: [""] + - apiGroups: + - '*' resources: - - nodes - - nodes/proxy + - events + - flyteworkflows + - pods/log - pods - - endpoints - - services + - rayjobs + - resourcequotas verbs: - get - list - watch - - nonResourceURLs: - - /metrics - - /metrics/cadvisor - verbs: - - get ---- -# Source: dataplane/templates/propeller/serviceaccount-webhook.yaml -kind: ClusterRole -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: flytepropeller-webhook-role - namespace: union -rules: - - apiGroups: - - "*" - resources: - - mutatingwebhookconfigurations - - secrets - - pods - - replicasets/finalizers - verbs: - - get - - create - - update - - patch --- -# Source: dataplane/templates/propeller/serviceaccount.yaml +# Source: 
dataplane/templates/operator/serviceaccount.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole +kind: Role metadata: - name: flytepropeller-role + name: operator-system + labels: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm rules: - # Allow RO access to PODS - apiGroups: - - "" + - '*' resources: - - pods + - secrets + - deployments verbs: - get - list - watch - # Allow Event recording access + - create + - update - apiGroups: - - "" + - flyte.lyft.com resources: - - events + - flyteworkflows + - flyteworkflows/finalizers verbs: + - get + - list + - watch - create - update - delete - patch - # Allow Access All plugin objects + - post + - deletecollection - apiGroups: - '*' resources: - - '*' + - resourcequotas + - pods + - configmaps + - podtemplates + - secrets + - namespaces + - nodes verbs: - get - list @@ -4422,148 +4712,129 @@ rules: - create - update - delete - - patch - # Allow Access to CRD - apiGroups: - - apiextensions.k8s.io + - serving.knative.dev resources: - - customresourcedefinitions + - revisions + - configurations + - services verbs: - get - list - watch - create - - delete - update - # Allow Access to all resources under flyte.lyft.com + - delete +--- +# Source: dataplane/templates/webhook/serviceaccount.yaml +kind: Role +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: union-webhook-role + namespace: union +rules: - apiGroups: - - flyte.lyft.com + - "*" resources: - - flyteworkflows - - flyteworkflows/finalizers + - mutatingwebhookconfigurations + - secrets + - pods + - replicasets/finalizers verbs: - get - - list - - watch - create - update - - delete - patch - - post - - deletecollection --- -# Source: dataplane/charts/fluentbit/templates/clusterrolebinding.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under 
the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding +kind: RoleBinding metadata: - name: release-name-fluentbit + namespace: "union" + name: operator-webhook labels: - helm.sh/chart: fluentbit-0.48.9 - app.kubernetes.io/name: fluentbit - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "3.2.8" - app.kubernetes.io/managed-by: Helm -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: release-name-fluentbit + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator subjects: - kind: ServiceAccount - name: fluentbit-system - namespace: union ---- -# Source: dataplane/charts/kube-state-metrics/templates/clusterrolebinding.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - labels: - helm.sh/chart: kube-state-metrics-5.30.1 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: kube-state-metrics - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" - name: release-name-kube-state-metrics + name: operator-webhook + namespace: "union" roleRef: + kind: Role + name: knative-operator-webhook apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: release-name-kube-state-metrics -subjects: -- kind: ServiceAccount - name: release-name-kube-state-metrics - namespace: union --- -# Source: 
dataplane/charts/opencost/templates/clusterrolebinding.yaml +# Source: dataplane/templates/monitoring/prometheus-rbac.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding +kind: RoleBinding metadata: - name: release-name-opencost + name: union-operator-prometheus-rbac + namespace: union labels: - helm.sh/chart: opencost-1.42.0 - app.kubernetes.io/name: opencost - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "1.111.0" - app.kubernetes.io/part-of: opencost - app.kubernetes.io/managed-by: Helm -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: release-name-opencost + release: release-name subjects: - kind: ServiceAccount - name: release-name-opencost + name: union-operator-prometheus namespace: union ---- -# Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: union-clustersync-resource roleRef: apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: union-clustersync-resource -subjects: - - kind: ServiceAccount - name: union-clustersync-system - namespace: union + kind: Role + name: union-operator-prometheus-rbac --- -# Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml +# Source: dataplane/templates/monitoring/prometheus-rbac.yaml +# kube-state-metrics service account RoleBinding apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding +kind: RoleBinding metadata: - name: union-clustersync-auth-delegator -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: system:auth-delegator + name: release-name-prometheus-kube-state-metrics + namespace: union + labels: + release: release-name subjects: - kind: ServiceAccount - name: union-clustersync-system + name: release-name-prometheus-kube-state-metrics namespace: union +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: union-operator-prometheus-rbac --- # Source: 
dataplane/templates/nodeexecutor/serviceaccount.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding +kind: RoleBinding metadata: name: union-executor labels: app: executor roleRef: apiGroup: rbac.authorization.k8s.io - kind: ClusterRole + kind: Role name: union-executor subjects: - kind: ServiceAccount - name: executor + name: union-system namespace: union --- -# Source: dataplane/templates/operator/serviceaccount-proxy.yaml +# Source: dataplane/templates/operator/serviceaccount-proxy-secret.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding +kind: RoleBinding metadata: - name: proxy-system + name: union-system-secret + namespace: union labels: app.kubernetes.io/name: operator-proxy app.kubernetes.io/instance: release-name @@ -4571,109 +4842,35 @@ metadata: app.kubernetes.io/managed-by: Helm roleRef: apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: proxy-system + kind: Role + name: union-system-secret subjects: - kind: ServiceAccount - name: proxy-system + name: union-system namespace: union --- -# Source: dataplane/templates/operator/serviceaccount.yaml +# Source: dataplane/templates/operator/serviceaccount-proxy.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding +kind: RoleBinding metadata: - name: operator-system + name: proxy-system labels: - app.kubernetes.io/name: union-operator + app.kubernetes.io/name: operator-proxy app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm roleRef: apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: operator-system -subjects: - - kind: ServiceAccount - name: operator-system - namespace: union ---- -# Source: dataplane/templates/prometheus/rbac.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: union-operator-prometheus - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane - 
app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: union-operator-prometheus -subjects: - - kind: ServiceAccount - name: union-operator-prometheus - namespace: union ---- -# Source: dataplane/templates/propeller/serviceaccount-webhook.yaml -# Create a binding from Role -> ServiceAccount -kind: ClusterRoleBinding -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: flytepropeller-webhook-binding - namespace: union -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: flytepropeller-webhook-role -subjects: - - kind: ServiceAccount - name: flytepropeller-webhook-system - namespace: union ---- -# Source: dataplane/templates/propeller/serviceaccount.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: flytepropeller-binding -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: flytepropeller-role + kind: Role + name: proxy-system subjects: - kind: ServiceAccount - name: flytepropeller-system + name: union-system namespace: union --- -# Source: dataplane/templates/operator/serviceaccount-proxy-secret.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - name: proxy-system-secret - namespace: union - labels: - app.kubernetes.io/name: operator-proxy - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm -rules: - - apiGroups: - - '*' - resources: - - secrets - verbs: - - get - - list - - create - - update - - delete ---- # Source: dataplane/templates/operator/serviceaccount.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: Role +kind: RoleBinding metadata: name: operator-system labels: @@ -4681,56 +4878,29 @@ metadata: app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name 
app.kubernetes.io/managed-by: Helm -rules: - - apiGroups: - - '*' - resources: - - secrets - - deployments - verbs: - - get - - list - - watch - - create - - update ---- -# Source: dataplane/templates/operator/serviceaccount-proxy-secret.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: proxy-system-secret - namespace: union - labels: - app.kubernetes.io/name: operator-proxy - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm roleRef: apiGroup: rbac.authorization.k8s.io kind: Role - name: proxy-system-secret + name: operator-system subjects: - kind: ServiceAccount - name: proxy-system + name: union-system namespace: union --- -# Source: dataplane/templates/operator/serviceaccount.yaml -apiVersion: rbac.authorization.k8s.io/v1 +# Source: dataplane/templates/webhook/serviceaccount.yaml +# Create a binding from Role -> ServiceAccount kind: RoleBinding +apiVersion: rbac.authorization.k8s.io/v1 metadata: - name: operator-system - labels: - app.kubernetes.io/name: union-operator - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm + name: union-webhook-binding + namespace: union roleRef: apiGroup: rbac.authorization.k8s.io kind: Role - name: operator-system + name: union-webhook-role subjects: - kind: ServiceAccount - name: operator-system + name: union-system namespace: union --- # Source: dataplane/charts/fluentbit/templates/service.yaml @@ -4756,20 +4926,60 @@ spec: app.kubernetes.io/name: fluentbit app.kubernetes.io/instance: release-name --- -# Source: dataplane/charts/kube-state-metrics/templates/service.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v1 +kind: Service +metadata: + labels: + role: operator-webhook + app.kubernetes.io/component: operator-webhook + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator + name: operator-webhook + namespace: "union" +spec: + ports: + # Define metrics and profiling for them to be accessible within service meshes. + - name: http-metrics + port: 9090 + targetPort: 9090 + - name: http-profiling + port: 8008 + targetPort: 8008 + - name: https-webhook + port: 443 + targetPort: 8443 + selector: + role: operator-webhook +--- +# Source: dataplane/charts/prometheus/charts/kube-state-metrics/templates/service.yaml apiVersion: v1 kind: Service metadata: name: release-name-kube-state-metrics namespace: union labels: - helm.sh/chart: kube-state-metrics-5.30.1 + helm.sh/chart: kube-state-metrics-5.25.1 app.kubernetes.io/managed-by: Helm app.kubernetes.io/component: metrics app.kubernetes.io/part-of: kube-state-metrics app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" + app.kubernetes.io/version: "2.13.0" annotations: prometheus.io/scrape: 'true' spec: @@ -4784,28 +4994,32 @@ spec: app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/instance: release-name --- -# Source: dataplane/charts/opencost/templates/service.yaml +# Source: dataplane/charts/prometheus/templates/service.yaml apiVersion: v1 kind: Service metadata: - name: release-name-opencost - namespace: union labels: - helm.sh/chart: opencost-1.42.0 - app.kubernetes.io/name: opencost + 
app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "1.111.0" - app.kubernetes.io/part-of: opencost + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: prometheus + name: union-operator-prometheus + namespace: union spec: + ports: + - name: http + port: 80 + protocol: TCP + targetPort: 9090 selector: - app.kubernetes.io/name: opencost + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name + sessionAffinity: None type: "ClusterIP" - ports: - - name: http - port: 9003 - targetPort: 9003 --- # Source: dataplane/templates/clusterresourcesync/service.yaml apiVersion: v1 @@ -4829,6 +5043,31 @@ spec: app.kubernetes.io/name: clusterresourcesync app.kubernetes.io/instance: release-name --- +# Source: dataplane/templates/flyteconnector/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: flyteconnector + namespace: union + labels: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + app.kubernetes.io/managed-by: Helm +spec: + clusterIP: None + ports: + - name: grpc + port: 8000 + protocol: TCP + targetPort: grpc + - name: metric + port: 9090 + protocol: TCP + targetPort: metric + selector: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name +--- # Source: dataplane/templates/imagebuilder/service.yaml apiVersion: v1 kind: Service @@ -4854,7 +5093,7 @@ spec: apiVersion: v1 kind: Service metadata: - name: release-name-dataplane-executor + name: union-operator-executor labels: platform.union.ai/prometheus-group: "union-services" app: executor @@ -4920,39 +5159,14 @@ spec: app.kubernetes.io/name: union-operator app.kubernetes.io/instance: release-name --- -# Source: dataplane/templates/prometheus/service.yaml -apiVersion: v1 -kind: Service -metadata: - name: 
union-operator-prometheus - namespace: union - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus -spec: - type: ClusterIP - ports: - - port: 80 - targetPort: 9090 - protocol: TCP - name: http - selector: - app.kubernetes.io/component: prometheus - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name ---- -# Source: dataplane/templates/propeller/service-webhook.yaml +# Source: dataplane/templates/webhook/service.yaml apiVersion: v1 kind: Service metadata: - name: flyte-pod-webhook + name: union-pod-webhook namespace: union labels: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm @@ -4960,7 +5174,7 @@ metadata: projectcontour.io/upstream-protocol.h2c: grpc spec: selector: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name ports: - name: https @@ -4972,23 +5186,23 @@ spec: port: 10254 targetPort: 10254 --- -# Source: dataplane/templates/propeller/service-webhook.yaml +# Source: dataplane/templates/webhook/service.yaml # Headless Service for cache invalidation — resolves to all pod IPs so that # we can fan out invalidation requests to every webhook replica. 
apiVersion: v1 kind: Service metadata: - name: flyte-pod-webhook-headless + name: union-pod-webhook-headless namespace: union labels: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm spec: clusterIP: None selector: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name ports: - name: cache-internal @@ -4996,32 +5210,6 @@ spec: port: 9443 targetPort: 9443 --- -# Source: dataplane/templates/propeller/service.yaml -apiVersion: v1 -kind: Service -metadata: - namespace: union - name: flytepropeller - labels: - platform.union.ai/prometheus-group: "union-services" - app.kubernetes.io/name: flytepropeller - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm -spec: - type: ClusterIP - ports: - - name: debug - protocol: TCP - port: 10254 - - name: fasttask - port: 15605 - protocol: TCP - targetPort: 15605 - selector: - app.kubernetes.io/name: flytepropeller - app.kubernetes.io/instance: release-name ---- # Source: dataplane/charts/fluentbit/templates/daemonset.yaml apiVersion: apps/v1 kind: DaemonSet @@ -5047,7 +5235,7 @@ spec: annotations: checksum/config: e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 spec: - serviceAccountName: fluentbit-system + serviceAccountName: union-system hostNetwork: false dnsPolicy: ClusterFirst containers: @@ -5099,20 +5287,217 @@ spec: tolerations: - operator: Exists --- -# Source: dataplane/charts/kube-state-metrics/templates/deployment.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: operator-webhook + namespace: "union" + labels: + app.kubernetes.io/component: operator-webhook + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +spec: + selector: + matchLabels: + app: operator-webhook + role: operator-webhook + template: + metadata: + labels: + app: operator-webhook + role: operator-webhook + app.kubernetes.io/component: operator-webhook + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator + sidecar.istio.io/inject: "false" + spec: + # To avoid node becoming SPOF, spread our replicas to different nodes. + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - podAffinityTerm: + labelSelector: + matchLabels: + app: operator-webhook + topologyKey: kubernetes.io/hostname + weight: 100 + serviceAccountName: operator-webhook + containers: + - name: operator-webhook + # This is the Go import path for the binary that is containerized + # and substituted here. 
+ image: gcr.io/knative-releases/knative.dev/operator/cmd/webhook@sha256:d3a7a3304629ccfcaacec620b50736e0b4c902a6328b83369b6cbaf7e94677c9 + resources: + requests: + cpu: 100m + memory: 100Mi + limits: + cpu: 500m + memory: 500Mi + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: SYSTEM_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: CONFIG_LOGGING_NAME + value: config-logging + - name: CONFIG_OBSERVABILITY_NAME + value: config-observability + - name: WEBHOOK_NAME + value: operator-webhook + - name: WEBHOOK_PORT + value: "8443" + - name: WEBHOOK_SECRET_NAME + value: operator-webhook-certs + - name: METRICS_DOMAIN + value: knative.dev/operator + - name: KUBERNETES_MIN_VERSION + value: "" + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + runAsNonRoot: true + capabilities: + drop: + - ALL + ports: + - name: metrics + containerPort: 9090 + - name: profiling + containerPort: 8008 + - name: https-webhook + containerPort: 8443 + readinessProbe: + periodSeconds: 1 + httpGet: + scheme: HTTPS + port: 8443 + httpHeaders: + - name: k-kubelet-probe + value: "webhook" + livenessProbe: + periodSeconds: 1 + httpGet: + scheme: HTTPS + port: 8443 + httpHeaders: + - name: k-kubelet-probe + value: "webhook" + failureThreshold: 6 + initialDelaySeconds: 120 + # Our webhook should gracefully terminate by lame ducking first, set this to a sufficiently + # high value that we respect whatever value it has configured for the lame duck grace period. + terminationGracePeriodSeconds: 300 +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: knative-operator + namespace: "union" + labels: + app.kubernetes.io/name: knative-operator + app.kubernetes.io/version: "1.16.0" +spec: + replicas: 1 + selector: + matchLabels: + name: knative-operator + template: + metadata: + labels: + name: knative-operator + app.kubernetes.io/name: knative-operator + app.kubernetes.io/version: "1.16.0" + sidecar.istio.io/inject: "false" + spec: + serviceAccountName: knative-operator + containers: + - name: knative-operator + image: gcr.io/knative-releases/knative.dev/operator/cmd/operator@sha256:0b5a3532417f9c8e7b6044e23f6f67ad932a74a40a4092ad965b6f173b2fd887 + resources: + requests: + cpu: 100m + memory: 100Mi + limits: + cpu: 1000m + memory: 1000Mi + imagePullPolicy: IfNotPresent + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: SYSTEM_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: METRICS_DOMAIN + value: knative.dev/operator + - name: CONFIG_LOGGING_NAME + value: config-logging + - name: CONFIG_OBSERVABILITY_NAME + value: config-observability + - name: KUBERNETES_MIN_VERSION + value: "" + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + runAsNonRoot: true + capabilities: + drop: + - ALL + ports: + - name: metrics + containerPort: 9090 +--- +# Source: dataplane/charts/prometheus/charts/kube-state-metrics/templates/deployment.yaml apiVersion: apps/v1 kind: Deployment metadata: name: release-name-kube-state-metrics namespace: union labels: - helm.sh/chart: 
kube-state-metrics-5.30.1 + helm.sh/chart: kube-state-metrics-5.25.1 app.kubernetes.io/managed-by: Helm app.kubernetes.io/component: metrics app.kubernetes.io/part-of: kube-state-metrics app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" + app.kubernetes.io/version: "2.13.0" spec: selector: matchLabels: @@ -5125,13 +5510,13 @@ spec: template: metadata: labels: - helm.sh/chart: kube-state-metrics-5.30.1 + helm.sh/chart: kube-state-metrics-5.25.1 app.kubernetes.io/managed-by: Helm app.kubernetes.io/component: metrics app.kubernetes.io/part-of: kube-state-metrics app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" + app.kubernetes.io/version: "2.13.0" spec: automountServiceAccountToken: true hostNetwork: false @@ -5148,8 +5533,9 @@ spec: args: - --port=8080 - --resources=certificatesigningrequests,configmaps,cronjobs,daemonsets,deployments,endpoints,horizontalpodautoscalers,ingresses,jobs,leases,limitranges,mutatingwebhookconfigurations,namespaces,networkpolicies,nodes,persistentvolumeclaims,persistentvolumes,poddisruptionbudgets,pods,replicasets,replicationcontrollers,resourcequotas,secrets,services,statefulsets,storageclasses,validatingwebhookconfigurations,volumeattachments + - --namespaces=union imagePullPolicy: IfNotPresent - image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.15.0 + image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.13.0 ports: - containerPort: 8080 name: "http" @@ -5169,7 +5555,7 @@ spec: httpGet: httpHeaders: path: /readyz - port: 8081 + port: 8080 scheme: HTTP initialDelaySeconds: 5 periodSeconds: 10 @@ -5184,201 +5570,204 @@ spec: - ALL readOnlyRootFilesystem: true --- -# Source: dataplane/charts/opencost/templates/deployment.yaml +# Source: dataplane/charts/prometheus/templates/deploy.yaml apiVersion: apps/v1 kind: Deployment metadata: - name: release-name-opencost - namespace: 
union labels: - helm.sh/chart: opencost-1.42.0 - app.kubernetes.io/name: opencost + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "1.111.0" - app.kubernetes.io/part-of: opencost + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: prometheus + name: union-operator-prometheus + namespace: union spec: - replicas: 1 selector: matchLabels: - app.kubernetes.io/name: opencost + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name - strategy: - rollingUpdate: - maxSurge: 1 - maxUnavailable: 1 - type: RollingUpdate + replicas: 1 + revisionHistoryLimit: 10 + strategy: + type: Recreate + rollingUpdate: null template: metadata: labels: - app.kubernetes.io/name: opencost + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: prometheus spec: - serviceAccountName: release-name-opencost + enableServiceLinks: true + serviceAccountName: union-operator-prometheus containers: - - name: release-name-opencost - image: ghcr.io/opencost/opencost:1.111.0@sha256:6aa68e52a24b14ba41f23db08d1b9db1429a1c0300f4c0381ecc2c61fc311a97 - imagePullPolicy: IfNotPresent + - name: prometheus-server-configmap-reload + image: "quay.io/prometheus-operator/prometheus-config-reloader:v0.76.0" + imagePullPolicy: "IfNotPresent" args: + - --watched-dir=/etc/config + - --listen-address=0.0.0.0:8080 + - --reload-url=http://127.0.0.1:9090/prometheus/-/reload ports: - - containerPort: 9003 - name: http - resources: - limits: - cpu: 1000m - memory: 4Gi - requests: - cpu: 500m - memory: 1Gi - startupProbe: - httpGet: - path: /healthz - port: 9003 - initialDelaySeconds: 10 - 
periodSeconds: 5 - failureThreshold: 30 + - containerPort: 8080 + name: metrics livenessProbe: httpGet: path: /healthz - port: 9003 - initialDelaySeconds: 10 - periodSeconds: 20 - failureThreshold: 3 + port: metrics + scheme: HTTP + initialDelaySeconds: 2 + periodSeconds: 10 readinessProbe: httpGet: path: /healthz - port: 9003 - initialDelaySeconds: 10 + port: metrics + scheme: HTTP periodSeconds: 10 + volumeMounts: + - name: config-volume + mountPath: /etc/config + readOnly: true + + - name: prometheus-server + image: "quay.io/prometheus/prometheus:v2.54.1" + imagePullPolicy: "IfNotPresent" + args: + - --storage.tsdb.retention.time=3d + - --config.file=/etc/config/prometheus.yml + - --storage.tsdb.path=/data + - --web.console.libraries=/etc/prometheus/console_libraries + - --web.console.templates=/etc/prometheus/consoles + - --web.enable-lifecycle + - --web.route-prefix=/prometheus + - --web.external-url=/prometheus/ + ports: + - containerPort: 9090 + readinessProbe: + httpGet: + path: /prometheus/-/ready + port: 9090 + scheme: HTTP + initialDelaySeconds: 30 + periodSeconds: 5 + timeoutSeconds: 4 failureThreshold: 3 - env: - - name: LOG_LEVEL - value: info - - name: CUSTOM_COST_ENABLED - value: "false" - - name: KUBECOST_NAMESPACE - value: union - - name: API_PORT - value: "9003" - - name: PROMETHEUS_SERVER_ENDPOINT - value: "http://union-operator-prometheus.union.svc:80/prometheus" - - name: CLUSTER_ID - value: "default-cluster" - - name: DATA_RETENTION_DAILY_RESOLUTION_DAYS - value: "15" - - name: CLOUD_COST_ENABLED - value: "false" - - name: CLOUD_COST_MONTH_TO_DATE_INTERVAL - value: "6" - - name: CLOUD_COST_REFRESH_RATE_HOURS - value: "6" - - name: CLOUD_COST_QUERY_WINDOW_DAYS - value: "7" - - name: CLOUD_COST_RUN_WINDOW_DAYS - value: "3" - # Add any additional provided variables + successThreshold: 1 + livenessProbe: + httpGet: + path: /prometheus/-/healthy + port: 9090 + scheme: HTTP + initialDelaySeconds: 30 + periodSeconds: 15 + timeoutSeconds: 10 + 
failureThreshold: 3 + successThreshold: 1 + resources: + limits: + cpu: "3" + memory: 3500Mi + requests: + cpu: "1" + memory: 1Gi + volumeMounts: + - name: config-volume + mountPath: /etc/config + - name: storage-volume + mountPath: /data + subPath: "" + - name: server-recording-rules + mountPath: /etc/config/recording + subPath: + readOnly: + dnsPolicy: ClusterFirst + securityContext: + fsGroup: 65534 + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + terminationGracePeriodSeconds: 300 + volumes: + - name: config-volume + configMap: + name: union-operator-prometheus + - name: server-recording-rules + configMap: + name: union-recording-rules + - name: storage-volume + emptyDir: + sizeLimit: 10Gi --- -# Source: dataplane/templates/clusterresourcesync/deployment.yaml +# Source: dataplane/templates/flyteconnector/deployment.yaml apiVersion: apps/v1 kind: Deployment metadata: - name: union-syncresources + name: flyteconnector namespace: union - labels: - app.kubernetes.io/name: clusterresourcesync + labels: + app.kubernetes.io/name: flyteconnector app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/name: clusterresourcesync - app.kubernetes.io/instance: release-name - template: - metadata: - annotations: - configChecksum: "e8f0345934a4cc3c298aa5eef3ed39a162d412505166678ff0faf09c9d0afee" - - labels: - platform.union.ai/zone: "dataplane" - - app.kubernetes.io/name: clusterresourcesync - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm - spec: - containers: - - command: - - clusterresource - - --config - - /etc/flyte/config/*.yaml - - clusterresource - - run - env: - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: GOMEMLIMIT - valueFrom: - 
resourceFieldRef: - divisor: 1 - resource: limits.memory - - name: GOMAXPROCS - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.cpu - - name: CLUSTER_NAME - valueFrom: - secretKeyRef: - name: operator-cluster-name - key: cluster_name - - name: DEPLOYMENT_NAME - value: operator - - name: PROXY_SERVICE_URL - value: http://union-operator-proxy:8080 - - name: PROMETHEUS_SERVICE_URL - value: http://union-operator-prometheus:80 - - name: KNATIVE_PROXY_SERVICE_URL - value: http://kourier-internal - image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" + replicas: 2 + selector: + matchLabels: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + template: + metadata: + annotations: + labels: + platform.union.ai/zone: "dataplane" + + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + app.kubernetes.io/managed-by: Helm + spec: + containers: + - command: + - c0 + image: "ghcr.io/flyteorg/flyte-connectors:py3.13-2.0.0b50.dev3-g695bb1db3.d20260122" imagePullPolicy: "IfNotPresent" - name: sync-cluster-resources + name: flyteconnector + readinessProbe: + failureThreshold: 3 + httpGet: + path: / + port: 9090 + scheme: HTTP + initialDelaySeconds: 5 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 5 + env: + - name: AIRFLOW_CONN_GOOGLE_CLOUD_DEFAULT + value: "google-cloud-platform://" + - name: LOG_LEVEL + value: "20" + ports: + - containerPort: 8000 + name: grpc + - containerPort: 9090 + name: metric resources: limits: - cpu: "1" - memory: 500Mi + cpu: "1.5" + ephemeral-storage: 100Mi + memory: 1500Mi requests: - cpu: 500m - memory: 100Mi - volumeMounts: - - name: auth - mountPath: /etc/union/secret - - name: resource-templates - mountPath: /etc/flyte/clusterresource/templates - - name: config-volume - mountPath: /etc/flyte/config - ports: - - name: debug - containerPort: 10254 - protocol: TCP - serviceAccountName: union-clustersync-system - volumes: - - configMap: - name: 
union-clusterresource-template - name: resource-templates - - configMap: - name: union-clusterresourcesync-config - name: config-volume - - name: auth - secret: - secretName: union-secret-auth + cpu: "1" + ephemeral-storage: 100Mi + memory: 1000Mi + serviceAccountName: flyteconnector --- # Source: dataplane/templates/imagebuilder/deployment.yaml apiVersion: apps/v1 @@ -5400,17 +5789,15 @@ spec: app.kubernetes.io/instance: release-name template: metadata: - annotations: - container.apparmor.security.beta.kubernetes.io/buildkit: unconfined labels: platform.union.ai/zone: "dataplane" app.kubernetes.io/name: imagebuilder-buildkit app.kubernetes.io/instance: release-name spec: - serviceAccountName: "union-imagebuilder" + serviceAccountName: "union-system" containers: - name: "buildkit" - image: "docker.io/moby/buildkit:buildx-stable-1-rootless" + image: "docker.io/moby/buildkit:buildx-stable-1" imagePullPolicy: IfNotPresent env: - name: POD_NAME @@ -5445,18 +5832,15 @@ spec: - name: KNATIVE_PROXY_SERVICE_URL value: http://kourier-internal volumeMounts: - - mountPath: /home/user/.local/share/buildkit - name: buildkitd - mountPath: /etc/buildkit name: buildkit-config args: - --config - /etc/buildkit/buildkitd.toml - --addr - - unix:///run/user/1000/buildkit/buildkitd.sock + - unix:///run//buildkit/buildkitd.sock - --addr - tcp://0.0.0.0:1234 - - --oci-worker-no-process-sandbox ports: - name: tcp containerPort: 1234 @@ -5478,18 +5862,13 @@ spec: initialDelaySeconds: 5 periodSeconds: 30 securityContext: - seccompProfile: # Needs Kubernetes >= 1.19 - type: Unconfined - runAsUser: 1000 - runAsGroup: 1000 + privileged: true resources: requests: - cpu: 1 - ephemeral-storage: 20Gi - memory: 1Gi + cpu: 4 + ephemeral-storage: 50Gi + memory: 4Gi volumes: - - name: buildkitd - emptyDir: {} - configMap: name: union-operator-buildkit name: buildkit-config @@ -5518,16 +5897,18 @@ spec: template: metadata: annotations: - configChecksum: 
"8fb93781ca61cf36fec4151f853da41981c87a2295bc90caf045122030265e9" + configChecksum: "f84e128b0db9de6636d6239fdc94e10ca8ec03dfe7ebde09097673b80ca385a" labels: platform.union.ai/zone: "dataplane" app: executor + app.kubernetes.io/instance: 'release-name' + app.kubernetes.io/name: executor spec: securityContext: fsGroup: 1337 - serviceAccountName: executor + serviceAccountName: union-system volumes: - name: config-volume configMap: @@ -5543,7 +5924,7 @@ spec: image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" imagePullPolicy: IfNotPresent command: - - executor + - executorv2 - serve - --config - /etc/config/*.yaml @@ -5621,7 +6002,7 @@ spec: template: metadata: annotations: - configChecksum: "5ee14e6faf5726ce2b854531f04ce7cec5e963a2bbda2b5271a0a8b7f2a5219" + configChecksum: "a4359544543680d66ed71cdcd8fea9460540ea8b9d6a7d8de7cb28b48e4c200" labels: @@ -5636,12 +6017,10 @@ spec: sources: - configMap: name: union-operator - - configMap: - name: union-clusterresourcesync-config - name: secret-volume secret: secretName: union-secret-auth - serviceAccountName: proxy-system + serviceAccountName: union-system securityContext: {} containers: @@ -5758,7 +6137,7 @@ spec: template: metadata: annotations: - configChecksum: "5ee14e6faf5726ce2b854531f04ce7cec5e963a2bbda2b5271a0a8b7f2a5219" + configChecksum: "a4359544543680d66ed71cdcd8fea9460540ea8b9d6a7d8de7cb28b48e4c200" labels: @@ -5767,7 +6146,7 @@ spec: platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm spec: - serviceAccountName: operator-system + serviceAccountName: union-system securityContext: {} volumes: @@ -5848,81 +6227,16 @@ spec: containerPort: 10254 protocol: TCP --- -# Source: dataplane/templates/prometheus/deployment.yaml -apiVersion: apps/v1 -kind: Deployment -metadata: - name: union-operator-prometheus - namespace: union - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - 
app.kubernetes.io/version: "2026.4.5" - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus -spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/component: prometheus - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - template: - metadata: - annotations: - configChecksum: "72a39ca838f02fc499675b5708fa77e2a4938e316a99315b59e33ef7f2d31ae" - labels: - app.kubernetes.io/component: prometheus - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - spec: - priorityClassName: system-cluster-critical - serviceAccountName: union-operator-prometheus - securityContext: - fsGroup: 65534 - runAsNonRoot: true - runAsUser: 65534 - fsGroupChangePolicy: OnRootMismatch - containers: - - name: prometheus - image: "prom/prometheus:v3.3.1" - args: - - --config.file=/etc/prometheus/prometheus.yml - - --web.external-url=/prometheus/ - - --web.route-prefix=/prometheus/ - - --storage.tsdb.retention.time=3d - ports: - - name: http - containerPort: 9090 - protocol: TCP - resources: - limits: - cpu: "3" - memory: 3500Mi - requests: - cpu: "1" - memory: 1Gi - securityContext: - allowPrivilegeEscalation: false - terminationMessagePolicy: FallbackToLogsOnError - volumeMounts: - - mountPath: /etc/prometheus - name: prometheus-config - volumes: - - name: prometheus-config - configMap: - name: union-operator-prometheus ---- -# Source: dataplane/templates/propeller/deployment-webhook.yaml +# Source: dataplane/templates/webhook/deployment.yaml +# Webhook deployment # Create the actual deployment apiVersion: apps/v1 kind: Deployment metadata: - name: flytepropeller-webhook + name: union-pod-webhook namespace: union labels: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm @@ -5930,19 +6244,19 @@ spec: replicas: 1 selector: 
matchLabels: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name template: metadata: labels: platform.union.ai/zone: "dataplane" - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm annotations: - configChecksum: "54a4cba2f0b2935bab3d47d13782d2d7ea6d5fe154f434cf177d1a19dbf3dad" + configChecksum: "77958aee0464eee4cb43c70036cf5ffa89f47f58c908845b9eeaecc7e32bba4" spec: securityContext: @@ -5950,64 +6264,8 @@ spec: fsGroupChangePolicy: Always runAsNonRoot: true runAsUser: 1001 - seLinuxOptions: - type: spc_t - serviceAccountName: flytepropeller-webhook-system - initContainers: - - name: generate-secrets - image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" - imagePullPolicy: "IfNotPresent" - command: - - flytepropeller - args: - - webhook - - init-certs - - --config - - /etc/flyte/config/*.yaml - env: - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: GOMEMLIMIT - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.memory - - name: GOMAXPROCS - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.cpu - - name: CLUSTER_NAME - valueFrom: - secretKeyRef: - name: operator-cluster-name - key: cluster_name - - name: DEPLOYMENT_NAME - value: operator - - name: PROXY_SERVICE_URL - value: http://union-operator-proxy:8080 - - name: PROMETHEUS_SERVICE_URL - value: http://union-operator-prometheus:80 - - name: KNATIVE_PROXY_SERVICE_URL - value: http://kourier-internal - volumeMounts: - - name: config-volume - mountPath: /etc/flyte/config - resources: - limits: - cpu: 1 - ephemeral-storage: 500Mi - memory: 500Mi - requests: - cpu: 200m - ephemeral-storage: 500Mi - memory: 500Mi + seLinuxOptions: null + 
serviceAccountName: union-system containers: - name: webhook image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" @@ -6072,107 +6330,301 @@ spec: volumes: - name: config-volume configMap: - name: flyte-propeller-config + name: union-pod-webhook-config - name: webhook-certs secret: - secretName: flyte-pod-webhook + secretName: union-pod-webhook --- -# Source: dataplane/templates/propeller/deployment.yaml -apiVersion: apps/v1 -kind: Deployment +# Source: dataplane/templates/flyteconnector/hpa.yaml +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler metadata: - namespace: union - name: flytepropeller + name: flyteconnector + labels: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + app.kubernetes.io/managed-by: Helm +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: flyteconnector + minReplicas: 2 + maxReplicas: 5 + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 80 + - type: Resource + resource: + name: memory + target: + type: Utilization + averageUtilization: 80 +--- +# Source: dataplane/templates/webhook/mutatingwebhookconfiguration.yaml +apiVersion: admissionregistration.k8s.io/v1 +kind: MutatingWebhookConfiguration +metadata: + name: union-pod-webhook- labels: - app.kubernetes.io/name: flytepropeller + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm +webhooks: + - name: union-pod-webhook.flyte.org + admissionReviewVersions: + - v1 + - v1beta1 + clientConfig: + caBundle: 
LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUROekNDQWgrZ0F3SUJBZ0lVSm9kQ09lem4vd1BHQTZjYkw1U3duSlpYT2hJd0RRWUpLb1pJaHZjTkFRRUwKQlFBd0t6RXBNQ2NHQTFVRUF3d2dabXg1ZEdWd2NtOXdaV3hzWlhJdGQyVmlhRzl2YXk1MWJtbHZiaTV6ZG1NdwpIaGNOTWpZd01USXpNREl6TmpVMVdoY05Nell3TVRJeE1ESXpOalUxV2pBck1Ta3dKd1lEVlFRRERDQm1iSGwwClpYQnliM0JsYkd4bGNpMTNaV0pvYjI5ckxuVnVhVzl1TG5OMll6Q0NBU0l3RFFZSktvWklodmNOQVFFQkJRQUQKZ2dFUEFEQ0NBUW9DZ2dFQkFJcjBSWDRGRjFNVTNYdFBmUHErMDUrMGI1Qm0xK3hsTVR5ZUp4bG5mWFJuRlgyegpPcHV6NmsreUpBQk4xRkxmNDVYRnlJclFlaW5DWWJtckZXalFRc3ZDWEloNkpPNGMwdmRMVVU5RERYcldWRFlsCkRNZytTUDlJVnZBMm5USkVFZ2RFREVsKzNWeThUbUJoNlNmcGthcXdHajY5SmhIdy9OeE9yZzM2bUY3TU5VZ1MKWXJVUkxqUFFzTUN6NElTVzhhQnEzS2IwRUFWUTA4V1lZQ0ZEbjY1aHBzVFdxR05rL3BKRnVTYjlsZll6RVdyYwpJeW5wNDhqTnIvNUdKS3NnWG9LeTUwZVo4UkppeTVyOUhENGtnSW15TVNQR2RDMXFZSVppRHRlZW8yOElxNk5nCmd1dWcyTGhlNndYYW9YZXFCQ2d3SHpwWFVXM1hJVzI5eFoxUkFRRUNBd0VBQWFOVE1GRXdIUVlEVlIwT0JCWUUKRktITDQ1RWhQaVg5bnlRN241QUk0YUZVNFlCL01COEdBMVVkSXdRWU1CYUFGS0hMNDVFaFBpWDlueVE3bjVBSQo0YUZVNFlCL01BOEdBMVVkRXdFQi93UUZNQU1CQWY4d0RRWUpLb1pJaHZjTkFRRUxCUUFEZ2dFQkFEeTRac21XCkNjZ3B5TmI4cU1ydzNYclBNejZLMTRlTVhqQVk1bERFYXM1NklLZExoZFM1bi9sWk95bENuUUtOZ3RHS0VrY3UKT05tMEE2Z0xNT3UzcE43dG9aNXh0WktOcnUvS0x6dm44QUNNNndCbUR0bG5oRERDa3ZUTHBVRzh6UTZSNmlaegpzTWlhVmpLSUJiWkJsZXNYSHZZQnQ2YlRTN2o1QWM2aTdYc1RPK3ZqQ2RUQjVYdCtEZ0x2QnBJRWh2ZGJzNXhmClFtc3ZzQmwxenJnR3JXemo4d0txcXNUMzZaOVU1TzdSK1RJMlQrUDVOR25SeHNQOGJ0R0RtMGZ5SW55WlJrSkcKVTdpVFJyMlZRWlRyaTNZQ2dlVjN1UjFtMnJjc2hyU1Avb082T0QyTXR5Q1FMRWtFK1pxVUVJVnpSYmVaNXl2RQptM29YMFV2OUtBUy9pWTA9Ci0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K + service: + name: union-pod-webhook + namespace: union + path: /mutate--v1-pod/secrets + port: 443 + failurePolicy: Fail + matchPolicy: Equivalent + namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: union + objectSelector: + matchLabels: + inject-flyte-secrets: "true" + organization: '' + reinvocationPolicy: Never + rules: + - apiGroups: + - '*' + apiVersions: + - v1 + operations: + - CREATE + resources: + - 
pods + scope: '*' + sideEffects: NoneOnDryRun + timeoutSeconds: 30 +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Imported via https://github.com/knative/operator/releases/download/knative-v1.16.0/operator.yaml +# Copyright 2024 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +# Source: dataplane/templates/propeller/serviceaccount.yaml +--- +--- +# Source: dataplane/templates/serving/knative-serving.yaml +apiVersion: operator.knative.dev/v1beta1 +kind: KnativeServing +metadata: + name: union-operator-serving + namespace: union + labels: + helm.sh/chart: dataplane-2026.4.4 + app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: serving spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/name: flytepropeller + config: + deployment: + progress-deadline: "30m" + queue-sidecar-cpu-request: "25m" + queue-sidecar-cpu-limit: "1000m" + queue-sidecar-memory-request: "400Mi" + queue-sidecar-memory-limit: "800Mi" + queue-sidecar-ephemeral-storage-request: "512Mi" + queue-sidecar-ephemeral-storage-limit: "1024Mi" + registries-skipping-tag-resolving: managed.cr.union.ai + features: + kubernetes.podspec-affinity: "enabled" + kubernetes.podspec-nodeselector: "enabled" + kubernetes.podspec-tolerations: "enabled" + kubernetes.podspec-fieldref: "enabled" + 
kubernetes.podspec-dnspolicy: "enabled" + kubernetes.podspec-schedulername: "enabled" + kubernetes.podspec-securitycontext: "enabled" + network: + ingress-class: "kourier.ingress.networking.knative.dev" + high-availability: + replicas: 2 + ingress: + kourier: + enabled: true + bootstrap-configmap: "union-operator-serving-envoy-bootstrap" + service-type: ClusterIP + podDisruptionBudgets: + - name: 3scale-kourier-gateway-pdb + minAvailable: 50% + - name: activator-pdb + minAvailable: 50% + - name: webhook-pdb + minAvailable: 50% + registry: + override: + # TODO(jeev): Wire up Union fork of Envoy + 3scale-kourier-gateway/kourier-gateway: ghcr.io/unionai/envoy:456fed84d4ad9a9dfb186d117d9362e9dc0f7c1f + # TODO(jeev): Wire up Union fork of Kourier + net-kourier-controller/controller: ghcr.io/unionai/kourier@sha256:5804c348d15b3959604e3e3ceed216c3a1c7b32cbe254c7d3eb02a35e62ba9c4 + workloads: + - name: 3scale-kourier-gateway + labels: + helm.sh/chart: dataplane-2026.4.4 + app.kubernetes.io/name: release-name-dataplane app.kubernetes.io/instance: release-name - template: - metadata: - annotations: - configChecksum: "54a4cba2f0b2935bab3d47d13782d2d7ea6d5fe154f434cf177d1a19dbf3dad" - - labels: - platform.union.ai/zone: "dataplane" - - - app.kubernetes.io/name: flytepropeller - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm - spec: - priorityClassName: system-cluster-critical - containers: - - command: - - flytepropeller - - --config - - /etc/flyte/config/*.yaml - - --propeller.cluster-id - - '' - env: - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: GOMEMLIMIT - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.memory - - name: GOMAXPROCS - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.cpu - - name: CLUSTER_NAME - valueFrom: - secretKeyRef: - name: 
operator-cluster-name - key: cluster_name - - name: DEPLOYMENT_NAME - value: operator - - name: PROXY_SERVICE_URL - value: http://union-operator-proxy:8080 - - name: PROMETHEUS_SERVICE_URL - value: http://union-operator-prometheus:80 - - name: KNATIVE_PROXY_SERVICE_URL - value: http://kourier-internal - image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" - imagePullPolicy: "IfNotPresent" - name: flytepropeller - ports: - - containerPort: 10254 - resources: - limits: - cpu: "3" - memory: 3Gi - requests: - cpu: "1" - memory: 1Gi - volumeMounts: - - name: config-volume - mountPath: /etc/flyte/config - - name: auth - mountPath: /etc/union/secret - serviceAccountName: flytepropeller-system - volumes: - - configMap: - name: flyte-propeller-config - name: config-volume - - name: auth - secret: - secretName: union-secret-auth + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: 3scale-kourier-gateway + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchLabels: + app.kubernetes.io/name: 3scale-kourier-gateway + app.kubernetes.io/instance: release-name + topologyKey: topology.kubernetes.io/zone + annotations: + checksum/bootstrap-config: 6469f6f592eebc9e0c676d8ccc359a05bc799c0988e474b2da942c4cc328f656 + env: + - container: kourier-gateway + envVars: + - name: UNION_AUTHZ_TENANTAUTHURL + value: "https://test.example.com/me" + - name: UNION_AUTHZ_TENANTAUTHSIGNINURL + value: "https://test.example.com/login" + - name: UNION_AUTHZ_TENANTCONTROLPLANEURL + value: "https://test.example.com" + resources: + - container: kourier-gateway + limits: + cpu: "2" + memory: 2Gi + requests: + cpu: "1" + memory: 1Gi + - name: activator + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - activator + 
topologyKey: topology.kubernetes.io/zone + - name: autoscaler + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - autoscaler + topologyKey: topology.kubernetes.io/zone + - name: autoscaler-hpa + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - autoscaler-hpa + topologyKey: topology.kubernetes.io/zone + - name: controller + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - controller + topologyKey: topology.kubernetes.io/zone + - name: net-kourier-controller + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - net-kourier-controller + topologyKey: topology.kubernetes.io/zone + env: + - container: controller + envVars: + - name: KOURIER_UNION_AUTHZ_ENABLED + value: "true" + resources: + - container: controller + limits: + cpu: "1" + memory: 1Gi + requests: + cpu: 500m + memory: 500Mi + - name: webhook + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - webhook + topologyKey: topology.kubernetes.io/zone +--- +# Source: dataplane/templates/common/task-podtemplate.yaml +apiVersion: v1 +kind: PodTemplate +metadata: + name: task-template + namespace: union +template: + spec: + serviceAccountName: union + containers: + - name: default + image: docker.io/rwgrim/docker-noop --- # Source: dataplane/charts/fluentbit/templates/tests/test-connection.yaml apiVersion: v1 
diff --git a/tests/generated/dataplane.aws.eks-automode.yaml b/tests/generated/dataplane.aws.eks-automode.yaml index e8acd503..b77d25ef 100644 --- a/tests/generated/dataplane.aws.eks-automode.yaml +++ b/tests/generated/dataplane.aws.eks-automode.yaml @@ -1,40 +1,4 @@ --- -# Source: dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: flytesnacks-development ---- -# Source: dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: flytesnacks-staging ---- -# Source: dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: flytesnacks-production ---- -# Source: dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: union-health-monitoring-development ---- -# Source: dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: union-health-monitoring-staging ---- -# Source: dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: union-health-monitoring-production ---- # Source: dataplane/charts/dcgm-exporter/templates/serviceaccount.yaml # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. # @@ -64,133 +28,142 @@ metadata: app.kubernetes.io/managed-by: Helm automountServiceAccountToken: false --- -# Source: dataplane/charts/fluentbit/templates/serviceaccount.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + apiVersion: v1 kind: ServiceAccount metadata: - name: fluentbit-system - namespace: union + name: knative-operator + namespace: "union" labels: - helm.sh/chart: fluentbit-0.48.9 - app.kubernetes.io/name: fluentbit - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "3.2.8" - app.kubernetes.io/managed-by: Helm + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: operator-webhook + namespace: "union" + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator --- -# Source: dataplane/charts/kube-state-metrics/templates/serviceaccount.yaml +# Source: dataplane/charts/prometheus/charts/kube-state-metrics/templates/serviceaccount.yaml apiVersion: v1 kind: ServiceAccount automountServiceAccountToken: true metadata: labels: - helm.sh/chart: kube-state-metrics-5.30.1 + helm.sh/chart: kube-state-metrics-5.25.1 app.kubernetes.io/managed-by: Helm app.kubernetes.io/component: metrics app.kubernetes.io/part-of: kube-state-metrics app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" + app.kubernetes.io/version: "2.13.0" name: release-name-kube-state-metrics namespace: union --- -# Source: dataplane/charts/opencost/templates/serviceaccount.yaml +# Source: dataplane/charts/prometheus/templates/serviceaccount.yaml apiVersion: v1 kind: ServiceAccount metadata: - name: release-name-opencost - namespace: union labels: - helm.sh/chart: opencost-1.42.0 - app.kubernetes.io/name: opencost + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "1.111.0" - app.kubernetes.io/part-of: opencost + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 app.kubernetes.io/managed-by: Helm -automountServiceAccountToken: true ---- -# Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: union-clustersync-system + app.kubernetes.io/part-of: prometheus + name: union-operator-prometheus namespace: union annotations: - eks.amazonaws.com/role-arn: 'test-backend-iam-role-arn' ---- -# Source: dataplane/templates/imagebuilder/serviceaccount.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: union-imagebuilder + {} --- -# 
Source: dataplane/templates/nodeexecutor/serviceaccount.yaml +# Source: dataplane/templates/common/system-serviceaccount.yaml apiVersion: v1 kind: ServiceAccount metadata: - name: executor + name: union-system namespace: union - labels: - app: executor - annotations: - eks.amazonaws.com/role-arn: 'test-backend-iam-role-arn' ---- -# Source: dataplane/templates/operator/serviceaccount-proxy.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: proxy-system - labels: - app.kubernetes.io/name: operator-proxy - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm annotations: eks.amazonaws.com/role-arn: 'test-backend-iam-role-arn' --- -# Source: dataplane/templates/operator/serviceaccount.yaml +# Source: dataplane/templates/common/union-serviceaccount.yaml apiVersion: v1 kind: ServiceAccount metadata: - name: operator-system - labels: - app.kubernetes.io/name: union-operator - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm + name: union + namespace: union annotations: - eks.amazonaws.com/role-arn: 'test-backend-iam-role-arn' + eks.amazonaws.com/role-arn: test-worker-iam-role-arn +automountServiceAccountToken: true --- -# Source: dataplane/templates/prometheus/serviceaccount.yaml +# Source: dataplane/templates/flyteconnector/serviceaccount.yaml apiVersion: v1 kind: ServiceAccount metadata: - name: union-operator-prometheus + name: flyteconnector namespace: union - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane + labels: + app.kubernetes.io/name: flyteconnector app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus ---- -# Source: dataplane/templates/propeller/serviceaccount-webhook.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: 
flytepropeller-webhook-system - namespace: union - annotations: - eks.amazonaws.com/role-arn: 'test-backend-iam-role-arn' --- -# Source: dataplane/templates/propeller/serviceaccount.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + apiVersion: v1 -kind: ServiceAccount +kind: Secret metadata: - name: flytepropeller-system - namespace: union - annotations: - eks.amazonaws.com/role-arn: 'test-backend-iam-role-arn' + name: operator-webhook-certs + namespace: "union" + labels: + app.kubernetes.io/component: operator-webhook + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +# The data is populated at install time. 
--- # Source: dataplane/templates/common/auth-secret.yaml apiVersion: v1 @@ -213,14 +186,22 @@ type: Opaque data: cluster_name: dGVzdC1jbHVzdGVyLW5hbWU= --- -# Source: dataplane/templates/propeller/deployment-webhook.yaml -# Create an empty secret that the first propeller pod will populate +# Source: dataplane/templates/webhook/mutatingwebhookconfiguration.yaml apiVersion: v1 kind: Secret metadata: - name: flyte-pod-webhook + name: union-pod-webhook namespace: union + labels: + app.kubernetes.io/name: union-pod-webhook + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm type: Opaque +data: + ca.crt: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUROekNDQWgrZ0F3SUJBZ0lVSm9kQ09lem4vd1BHQTZjYkw1U3duSlpYT2hJd0RRWUpLb1pJaHZjTkFRRUwKQlFBd0t6RXBNQ2NHQTFVRUF3d2dabXg1ZEdWd2NtOXdaV3hzWlhJdGQyVmlhRzl2YXk1MWJtbHZiaTV6ZG1NdwpIaGNOTWpZd01USXpNREl6TmpVMVdoY05Nell3TVRJeE1ESXpOalUxV2pBck1Ta3dKd1lEVlFRRERDQm1iSGwwClpYQnliM0JsYkd4bGNpMTNaV0pvYjI5ckxuVnVhVzl1TG5OMll6Q0NBU0l3RFFZSktvWklodmNOQVFFQkJRQUQKZ2dFUEFEQ0NBUW9DZ2dFQkFJcjBSWDRGRjFNVTNYdFBmUHErMDUrMGI1Qm0xK3hsTVR5ZUp4bG5mWFJuRlgyegpPcHV6NmsreUpBQk4xRkxmNDVYRnlJclFlaW5DWWJtckZXalFRc3ZDWEloNkpPNGMwdmRMVVU5RERYcldWRFlsCkRNZytTUDlJVnZBMm5USkVFZ2RFREVsKzNWeThUbUJoNlNmcGthcXdHajY5SmhIdy9OeE9yZzM2bUY3TU5VZ1MKWXJVUkxqUFFzTUN6NElTVzhhQnEzS2IwRUFWUTA4V1lZQ0ZEbjY1aHBzVFdxR05rL3BKRnVTYjlsZll6RVdyYwpJeW5wNDhqTnIvNUdKS3NnWG9LeTUwZVo4UkppeTVyOUhENGtnSW15TVNQR2RDMXFZSVppRHRlZW8yOElxNk5nCmd1dWcyTGhlNndYYW9YZXFCQ2d3SHpwWFVXM1hJVzI5eFoxUkFRRUNBd0VBQWFOVE1GRXdIUVlEVlIwT0JCWUUKRktITDQ1RWhQaVg5bnlRN241QUk0YUZVNFlCL01COEdBMVVkSXdRWU1CYUFGS0hMNDVFaFBpWDlueVE3bjVBSQo0YUZVNFlCL01BOEdBMVVkRXdFQi93UUZNQU1CQWY4d0RRWUpLb1pJaHZjTkFRRUxCUUFEZ2dFQkFEeTRac21XCkNjZ3B5TmI4cU1ydzNYclBNejZLMTRlTVhqQVk1bERFYXM1NklLZExoZFM1bi9sWk95bENuUUtOZ3RHS0VrY3UKT05tMEE2Z0xNT3UzcE43dG9aNXh0WktOcnUvS0x6dm44QUNNNndCbUR0bG5oRERDa3ZUTHBVRzh6UTZSNmlaegpzTWlhVmpLSUJiWkJsZXNYSHZZQnQ2YlRTN2o1QWM2aTdYc1RPK3ZqQ2RUQjVYdCtEZ
0x2QnBJRWh2ZGJzNXhmClFtc3ZzQmwxenJnR3JXemo4d0txcXNUMzZaOVU1TzdSK1RJMlQrUDVOR25SeHNQOGJ0R0RtMGZ5SW55WlJrSkcKVTdpVFJyMlZRWlRyaTNZQ2dlVjN1UjFtMnJjc2hyU1Avb082T0QyTXR5Q1FMRWtFK1pxVUVJVnpSYmVaNXl2RQptM29YMFV2OUtBUy9pWTA9Ci0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K + tls.crt: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUQxakNDQXI2Z0F3SUJBZ0lVREtRN3A4MFpIdkQyUE5Ya3p3VGRLem9iUjg4d0RRWUpLb1pJaHZjTkFRRUwKQlFBd0t6RXBNQ2NHQTFVRUF3d2dabXg1ZEdWd2NtOXdaV3hzWlhJdGQyVmlhRzl2YXk1MWJtbHZiaTV6ZG1NdwpIaGNOTWpZd01USXpNREl6TmpVMVdoY05NamN3TVRJek1ESXpOalUxV2pBck1Ta3dKd1lEVlFRRERDQm1iSGwwClpYQnliM0JsYkd4bGNpMTNaV0pvYjI5ckxuVnVhVzl1TG5OMll6Q0NBU0l3RFFZSktvWklodmNOQVFFQkJRQUQKZ2dFUEFEQ0NBUW9DZ2dFQkFMNnMxUTRaZ3ZxVVdNdFp5VC9EKytEQWl6WmR0Q1lkVzArSGk4S3phQ1Vjbi9zZwo3a21NTEhDM2ZlZWtZK2ZUNENHUTd6K1l2WjRWWSt6WGljKzdrNndLbUZqdm1acEM1Rm96OWo1Q21NRk5HZmlWCnRobzVvSU9MUkIzNXRYMU1ROHUwNFVzT29QcjdMK016eEdxWXVmTFdtN1NVdjh0eEp3ZFMxZVlzMllNV1pMeU4KekI4akdWY01Qc2ZNVkdxdFhTTVljQ3hNYURKaTM1N0VEZSs3clM2U2ZtbnFtYjJxekdQM1BRRStCUm91Yk5nLwo4NCt4c253VXBLZUFpem95dFM2c2ZlOTdMUEtRTW9oVzd2TjZORFJrLy9xSmpHem9iZmZwUHU3endhWHVXMWtMCmllc2g2cVRrdlZLSTF3WFJ2WHArVU5GbjZzdDIwdEVlRks0ZzMwOENBd0VBQWFPQjhUQ0I3akFKQmdOVkhSTUUKQWpBQU1Bc0dBMVVkRHdRRUF3SUY0RENCa3dZRFZSMFJCSUdMTUlHSWdoWm1iSGwwWlhCeWIzQmxiR3hsY2kxMwpaV0pvYjI5cmdoeG1iSGwwWlhCeWIzQmxiR3hsY2kxM1pXSm9iMjlyTG5WdWFXOXVnaUJtYkhsMFpYQnliM0JsCmJHeGxjaTEzWldKb2IyOXJMblZ1YVc5dUxuTjJZNEl1Wm14NWRHVndjbTl3Wld4c1pYSXRkMlZpYUc5dmF5NTEKYm1sdmJpNXpkbU11WTJ4MWMzUmxjaTVzYjJOaGJEQWRCZ05WSFE0RUZnUVV1d25wcDZ1MTNKL0NsNnI3eDZ1Sgo5YmZ1RDhzd0h3WURWUjBqQkJnd0ZvQVVvY3Zqa1NFK0pmMmZKRHVma0FqaG9WVGhnSDh3RFFZSktvWklodmNOCkFRRUxCUUFEZ2dFQkFFMmlOa1MzczhGT3Z0cDhrVmJNZUN2amxIQytvYTVlZTFGNmowNlZ3bFJPOWdxOXJla1UKMGptWUhNWGNjWjcrZXJkNFRQRXgwK3ZqMzNhRTV3cDFmdVJJL2xIZHhuOUpJRHg2bTFQTHdVa3p1WG9EekYxLwozVXVIWU1xOGkrT2xsY0tqbmdVMUFUNmRWRCtIbFVWeHpMdjlNazdPb05SQUY5dTJHa1hiUHdCN0I1ZGRxdWxlCmtNcXBxbDhES2s4SVNVelVsTjNBb2Q5NmJUelIyV3RCVkxpTi9zRHZtTDZwZThFYm55RnBUb1BZTDlGeWZOcmIKT1ZPZEJwUkZFTG1KOHZaZWY5d1lxRmVXMTE3SGV5VDArZkxGZVNSe
mZUNVVWUlVraVFBcUVJdUhHUVRrOVFabgo1WnYvNUF0eEg3WE1rcXR0VE1zMFlSUG0xYjQzY3FBeFdhND0KLS0tLS1FTkQgQ0VSVElGSUNBVEUtLS0tLQo= + tls.key: LS0tLS1CRUdJTiBQUklWQVRFIEtFWS0tLS0tCk1JSUV2UUlCQURBTkJna3Foa2lHOXcwQkFRRUZBQVNDQktjd2dnU2pBZ0VBQW9JQkFRQytyTlVPR1lMNmxGakwKV2NrL3cvdmd3SXMyWGJRbUhWdFBoNHZDczJnbEhKLzdJTzVKakN4d3QzM25wR1BuMCtBaGtPOC9tTDJlRldQcwoxNG5QdTVPc0NwaFk3NW1hUXVSYU0vWStRcGpCVFJuNGxiWWFPYUNEaTBRZCtiVjlURVBMdE9GTERxRDYreS9qCk04UnFtTG55MXB1MGxML0xjU2NIVXRYbUxObURGbVM4amN3Zkl4bFhERDdIekZScXJWMGpHSEFzVEdneVl0K2UKeEEzdnU2MHVrbjVwNnBtOXFzeGo5ejBCUGdVYUxtellQL09Qc2JKOEZLU25nSXM2TXJVdXJIM3ZleXp5a0RLSQpWdTd6ZWpRMFpQLzZpWXhzNkczMzZUN3U4OEdsN2x0WkM0bnJJZXFrNUwxU2lOY0YwYjE2ZmxEUlorckxkdExSCkhoU3VJTjlQQWdNQkFBRUNnZ0VBRjV2SnMzbTRMSE9DdlFic2NwOC9DQmgvQkNlOC92MGlpYW5VUmJLMkFlWlYKN3Arb3NXV1FPUktYSGIvT2VPMVVjb09DQkFOUzh3aGQrM3pDZlB5U0w3cU9HM0RyT1Z6djdqVkIxM3FpZEVpcQpId3ZXWk0vZXpuckhYOWpEdm5SYmJwVUNVaXRKQmxwa2x3S1pYc056UHB5UTRkNkxFdEw3VEo1V2lxM2g3cWQzCjhncmJoeWlWNDhZRFAyYkhpZGt0S2QvMHBNL3piSzZPTS9iMFpLUmRBWVpHQXlOcklMaFJ3eDNsQXE2SDVqL20KUHBuNlZEdVQ3Y1h6UFhmNTgxR0IyNjQxWFJVTXRNWGVEREtaU3JudnIrbG5tV2lsYXZmNkg3ZHp4NlRBMmk0RApybCtIRVdUN1RSQUZGMnZ5SVUrYSt5RzNPVFBKRCtmVTdCWWh4Zld4aFFLQmdRRGVvcXVCbzQxK1dKU255blZnCmNWMHZoUEkwZ0dyRGQzNFdIbCtGNTZuSmxuMVVlbUQ2SUQ0TUhnNVpHWmNmd3NkRlB0aHl1VnB0OVl1SW11SGQKRHE5ZDhpWnpHUzJVbStkSWlsWjhnZXJ5Q3dON3hIbFM3SzJrc0RmRGJwYmZZSXZ1LzRnSExZRHh3YzQ0OG54ZwpWNGpCUFY2QTdYTHRqQzZzdFE2OW9uTlIwd0tCZ1FEYlFBUTRGdWpnUjJUOXh2Y01MdXhMbm1YaHE4U2NoZlM5CkRoWndDUzZIeHB1YlBxblBjQ04vQUZjbDZ1cDRXVEVZdVY4bEpwcGwwRmpYQ2lMdkhuUU15cm5vRjlUOThpanQKZGVHYjF6ZWlwRUpndFpNVHkvOGdERWorZU1wSFBuYTZwQVpMOVpRVUdLR2dDNGQ0SzljdVNHMmRlb1dSU0ViSQpMVGgzeGpXVEZRS0JnQStJRWFidG5nVmVjS0IwQTFSREZGa29VUzFRZUNKQ3g4MExPV2JDRHBvOW9XaXZVT3lpCkt6SDFOdE1JY2Y2SlBCV2NtTVVJSVVMaWltVnhTS2gvU2NTb0MvNmpsd1p6Q2VPSm94YjBpVXR4Y1VERktDR2MKMlZCUDZ0UDdkeE1HVFR6VEhzNUJZbWw3TjhQSlJ0d0J2MHliMTJmdktNRmhzaS9pUWJFQkVFSjVBb0dBUm9NSApHRmJkM0V0NXdsZzcyYkk1a25SRnhkY3RLejIxb1J1bndhNWlSWTV0T3Zkak8zQ1FLZWNkSC9lMklyQmtwdFB2Ck1vNkF0MS9UU
W8xakFNNGxlbnUwWUYxUnhiNGN4WW5VM2Y3UVNNRlZDNjg1dHZNemdNWVNyNngvT1h4d1NNUTUKdGpJcnhtN0poQ1JSRkNmZFUyZzl3SmpIM2hxRmtSbGlBTHRCUGFVQ2dZRUF2QkRHejQvYS9kRzRJT0VpREk4SgpBbkxaRWZaTFJrdnI0ejRyT0p0TGZtWlFjTW5DSUZmKzBaelhTME9XOWhvMjk5RUNtS1l4U2VUWk1WSDgzOFZCCnRvSkNGM3lzb01rSC9meEVXTUpicDlMblJwSFUzUElicHlhRG1nWW1YdmsvK3laL3Y1c0dDTWNtVGkzWVFLRGsKcStlK1o0SHZyaGUxVytUaHpwOCtoNGs9Ci0tLS0tRU5EIFBSSVZBVEUgS0VZLS0tLS0K --- # Source: dataplane/charts/dcgm-exporter/templates/metrics-configmap.yaml apiVersion: v1 @@ -316,142 +297,516 @@ data: DCGM_FI_PROF_PCIE_TX_BYTES, counter, The number of bytes of active pcie tx data including both header and payload. DCGM_FI_PROF_PCIE_RX_BYTES, counter, The number of bytes of active pcie rx data including both header and payload. --- -# Source: dataplane/templates/clusterresourcesync/configmap.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2019 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ apiVersion: v1 kind: ConfigMap metadata: - name: union-clusterresourcesync-config - namespace: union + name: config-logging + namespace: "union" labels: - app.kubernetes.io/name: clusterresourcesync - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm -data: - cluster_resources.yaml: | - cluster_resources: - clusterName: 'test-cluster-name' - customData: - - production: - - projectQuotaCpu: - value: "4096" - - projectQuotaMemory: - value: 2Ti - - projectQuotaNvidiaGpu: - value: "256" - - defaultUserRoleKey: - value: 'eks.amazonaws.com/role-arn' - - defaultUserRoleValue: - value: 'test-worker-iam-role-arn' - - staging: - - projectQuotaCpu: - value: "4096" - - projectQuotaMemory: - value: 2Ti - - projectQuotaNvidiaGpu: - value: "256" - - defaultUserRoleKey: - value: 'eks.amazonaws.com/role-arn' - - defaultUserRoleValue: - value: 'test-worker-iam-role-arn' - - development: - - projectQuotaCpu: - value: "4096" - - projectQuotaMemory: - value: 2Ti - - projectQuotaNvidiaGpu: - value: "256" - - defaultUserRoleKey: - value: 'eks.amazonaws.com/role-arn' - - defaultUserRoleValue: - value: 'test-worker-iam-role-arn' - refreshInterval: 5m - standaloneDeployment: true - templatePath: /etc/flyte/clusterresource/templates - clusterResourcesPrivate: - app: - isServerless: false - union: - auth: - authorizationMetadataKey: flyte-authorization - clientId: 'test-client-id' - clientSecretLocation: /etc/union/secret/client_secret - tokenRefreshWindow: 5m - type: ClientSecret - connection: - host: dns:///test-controlplane-host - admin.yaml: | - admin: - clientId: 'test-client-id' - clientSecretLocation: /etc/union/secret/client_secret - endpoint: dns:///test-controlplane-host - insecure: false - event: - capacity: 1000 - rate: 500 - type: admin - domain.yaml: | - domains: - - id: development - name: development - - id: staging - name: staging - - id: production - name: production - clusters.yaml: | - clusters: 
- clusterConfigs: [] - labelClusterMap: {} - logger.yaml: | - logger: - formatter: - type: json - level: 4 - show-source: true + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +data: {} +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2019 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v1 +kind: ConfigMap +metadata: + name: config-observability + namespace: "union" + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +data: {} --- -# Source: dataplane/templates/clusterresourcesync/configmap.yaml +# Source: dataplane/charts/prometheus/templates/cm.yaml apiVersion: v1 kind: ConfigMap metadata: - name: union-clusterresource-template - namespace: union labels: - app.kubernetes.io/name: clusterresourcesync + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: prometheus + name: union-operator-prometheus + namespace: union data: - a_namespace.yaml: | - apiVersion: v1 - kind: Namespace - metadata: - name: {{ namespace }} - labels: - union.ai/namespace-type: flyte - spec: - finalizers: - - kubernetes - - b_default_service_account.yaml: | - apiVersion: v1 - kind: ServiceAccount - 
metadata: - name: default - namespace: {{ namespace }} - annotations: - {{ defaultUserRoleKey }}: {{ defaultUserRoleValue }} + allow-snippet-annotations: "false" + alerting_rules.yml: | + {} + alerts: | + {} + prometheus.yml: | + global: + evaluation_interval: 1m + scrape_interval: 1m + scrape_timeout: 10s + rule_files: + - /etc/config/recording_rules.yml + - /etc/config/alerting_rules.yml + - /etc/config/rules + - /etc/config/alerts + scrape_configs: + - job_name: prometheus + static_configs: + - targets: + - localhost:9090 + - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + job_name: kubernetes-apiservers + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - action: keep + regex: default;kubernetes;https + source_labels: + - __meta_kubernetes_namespace + - __meta_kubernetes_service_name + - __meta_kubernetes_endpoint_port_name + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + job_name: kubernetes-nodes + kubernetes_sd_configs: + - role: node + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - replacement: kubernetes.default.svc:443 + target_label: __address__ + - regex: (.+) + replacement: /api/v1/nodes/$1/proxy/metrics + source_labels: + - __meta_kubernetes_node_name + target_label: __metrics_path__ + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + job_name: kubernetes-nodes-cadvisor + kubernetes_sd_configs: + - role: node + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - replacement: kubernetes.default.svc:443 + target_label: __address__ + - regex: (.+) + replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor + source_labels: + - __meta_kubernetes_node_name + 
target_label: __metrics_path__ + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + - honor_labels: true + job_name: kubernetes-service-endpoints + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scrape + - action: drop + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (.+?)(?::\d+)?;(\d+) + replacement: $1:$2 + source_labels: + - __address__ + - __meta_kubernetes_service_annotation_prometheus_io_port + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_service_name + target_label: service + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: node + - honor_labels: true + job_name: kubernetes-service-endpoints-slow + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) 
+ source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (.+?)(?::\d+)?;(\d+) + replacement: $1:$2 + source_labels: + - __address__ + - __meta_kubernetes_service_annotation_prometheus_io_port + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_service_name + target_label: service + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: node + scrape_interval: 5m + scrape_timeout: 30s + - honor_labels: true + job_name: prometheus-pushgateway + kubernetes_sd_configs: + - role: service + relabel_configs: + - action: keep + regex: pushgateway + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_probe + - honor_labels: true + job_name: kubernetes-services + kubernetes_sd_configs: + - role: service + metrics_path: /probe + params: + module: + - http_2xx + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_probe + - source_labels: + - __address__ + target_label: __param_target + - replacement: blackbox + target_label: __address__ + - source_labels: + - __param_target + target_label: instance + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - source_labels: + - __meta_kubernetes_service_name + target_label: service + - honor_labels: true + job_name: kubernetes-pods + kubernetes_sd_configs: + - role: pod + relabel_configs: + - action: 
keep + regex: true + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scrape + - action: drop + regex: true + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) + replacement: '[$2]:$1' + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: replace + regex: (\d+);((([0-9]+?)(\.|$)){4}) + replacement: $2:$1 + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_pod_name + target_label: pod + - action: drop + regex: Pending|Succeeded|Failed|Completed + source_labels: + - __meta_kubernetes_pod_phase + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: node + - honor_labels: true + job_name: kubernetes-pods-slow + kubernetes_sd_configs: + - role: pod + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) 
+ source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) + replacement: '[$2]:$1' + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: replace + regex: (\d+);((([0-9]+?)(\.|$)){4}) + replacement: $2:$1 + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_pod_name + target_label: pod + - action: drop + regex: Pending|Succeeded|Failed|Completed + source_labels: + - __meta_kubernetes_pod_phase + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: node + scrape_interval: 5m + scrape_timeout: 30s + - job_name: 'kube-state-metrics' + metrics_path: /metrics + scrape_interval: 1m + honor_labels: true + kubernetes_sd_configs: + - role: endpoints + namespaces: + own_namespace: true + relabel_configs: + - source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_name] + regex: kube-state-metrics + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + regex: http + action: keep + metric_relabel_configs: + - source_labels: [__name__] + regex: 
"kube_pod_container_resource_(limits|requests)|kube_pod_status_phase|kube_node_(labels|status_allocatable|status_condition|status_capacity)|kube_namespace_labels|kube_pod_container_status_(waiting|terminated|last_terminated).*_reason|kube_daemonset_status_number_unavailable|kube_deployment_status_replicas_unavailable|kube_resourcequota|kube_pod_info|kube_node_info|kube_pod_container_status_restarts_total" + action: keep + - source_labels: [__name__, phase] + separator: ";" + regex: "kube_pod_status_phase;(Succeeded|Failed)" + action: drop + - source_labels: [node] + target_label: nodename + regex: "(.*)" + action: replace + - source_labels: [label_node_group_name] + action: replace + regex: "(.+)" + target_label: label_node_pool_name + - job_name: 'opencost' + metrics_path: /metrics + scrape_interval: 1m + honor_labels: true + kubernetes_sd_configs: + - role: endpoints + namespaces: + own_namespace: true + relabel_configs: + - source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_name] + regex: opencost + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + regex: http + action: keep + metric_relabel_configs: + - source_labels: [__name__] + regex: "kube_node_labels|kube_pod_labels|node_total_hourly_cost|node_ram_hourly_cost|node_cpu_hourly_cost|node_gpu_hourly_cost" + action: keep + - job_name: 'union-services' + metrics_path: /metrics + scrape_interval: 1m + honor_labels: true + kubernetes_sd_configs: + - role: endpoints + namespaces: + own_namespace: true + relabel_configs: + - source_labels: [__meta_kubernetes_service_label_platform_union_ai_service_group] + regex: .+ + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + regex: debug + action: keep + # Flyte propeller metrics for execution info and fast task duration. + # No-op when propeller is not deployed (no pods match the label selector). 
+ - job_name: 'flytepropeller' + kubernetes_sd_configs: + - role: pod + namespaces: + own_namespace: true + selectors: + - role: pod + label: app.kubernetes.io/name=flytepropeller + metric_relabel_configs: + - source_labels: [__name__] + regex: "flyte:propeller:all:round:execution_info|flyte:propeller:all:node:fast_task:fast_task_execution_duration" + action: keep + relabel_configs: + - source_labels: [__meta_kubernetes_pod_container_name] + regex: flytepropeller + action: keep + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: pod + # Kourier gateway (envoy) metrics for app serving. + # No-op when serving is not enabled (no pods match the label selector). + - job_name: 'serving-envoy' + kubernetes_sd_configs: + - role: pod + namespaces: + own_namespace: true + selectors: + - role: pod + label: app=3scale-kourier-gateway + metrics_path: /stats/prometheus + metric_relabel_configs: + - source_labels: [__name__] + regex: "envoy_cluster_upstream_rq_xx|envoy_cluster_upstream_rq_time_bucket" + action: keep + relabel_configs: + - source_labels: [__meta_kubernetes_pod_container_port_name] + regex: metrics + action: keep + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: pod + # DCGM GPU metrics. + # No-op when dcgm-exporter is not deployed (no pods match the label selector). 
+ - job_name: 'gpu-metrics' + kubernetes_sd_configs: + - role: pod + namespaces: + names: + - kube-system + selectors: + - role: pod + label: app.kubernetes.io/name=dcgm-exporter - c_project_resource_quota.yaml: | - apiVersion: v1 - kind: ResourceQuota - metadata: - name: project-quota - namespace: {{ namespace }} - spec: - hard: - limits.cpu: {{ projectQuotaCpu }} - limits.memory: {{ projectQuotaMemory }} - requests.nvidia.com/gpu: {{ projectQuotaNvidiaGpu }} + recording_rules.yml: | + {} + rules: | + {} --- # Source: dataplane/templates/fluent-bit/configmap.yaml apiVersion: v1 @@ -499,13 +854,41 @@ data: region test-region bucket test-metadata-bucket --- -# Source: dataplane/templates/imagebuilder/configmap.yaml +# Source: dataplane/templates/imagebuilder/build-image-configmap.yaml apiVersion: v1 kind: ConfigMap metadata: - name : union-operator-buildkit -data: - buildkitd.toml: | + name: build-image-config + labels: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +data: + storage.yaml: | + storage: + container: "test-metadata-bucket" + type: s3 + connection: + auth-type: iam + region: test-region + enable-multicontainer: true + limits: + maxDownloadMBs: 1024 + cache: + max_size_mbs: 0 + target_gc_percent: 70 + buildkit-uri: "tcp://union-operator-buildkit.union.svc.cluster.local:1234" + default-repository: ".dkr.ecr.test-region.amazonaws.com/union-dataplane" + authentication-type: "aws" +--- +# Source: dataplane/templates/imagebuilder/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name : union-operator-buildkit +data: + buildkitd.toml: | debug = false [log] @@ -2655,6 +3038,18 @@ data: "version": 1 } --- +# Source: dataplane/templates/monitoring/prometheusrule.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: union-recording-rules + namespace: union + labels: + release: release-name +data: + recording_rules.yml: | + groups: 
[] +--- # Source: dataplane/templates/nodeexecutor/configmap.yaml apiVersion: v1 kind: ConfigMap @@ -2709,8 +3104,13 @@ data: templateUris: - https://scale.neptune.ai/{{ .taskConfig.project }}/-/run/?customId={{ .taskConfig.id }} - kubernetes-enabled: true + kubernetes-enabled: false enabled_plugins.yaml: | + plugins: + connector-service: + defaultConnector: + defaultServiceConfig: '{"loadBalancingConfig": [{"round_robin":{}}]}' + endpoint: k8s:///flyteconnector.union:8000 tasks: task-plugins: default-for-task-types: @@ -2743,6 +3143,8 @@ data: cpu: 4096 gpu: 256 memory: 2Ti + namespace_mapping: + template: union union: connection: host: dns:///test-controlplane-host @@ -2763,7 +3165,7 @@ data: cache-endpoint: dns:///test-controlplane-host endpoint: dns:///test-controlplane-host insecure: false - type: fallback + type: cacheservicev2 use-admin-auth: true logger: formatter: @@ -2783,6 +3185,7 @@ data: secure: false useAuth: false propeller: + limit-namespace: union node-config: disable-input-file-writes: true plugins: @@ -2790,7 +3193,7 @@ data: additional-worker-args: - --last-ack-grace-period-seconds - "120" - callback-uri: http://unionai-dataplane-executor.union.svc.cluster.local:15605 + callback-uri: http://union-operator-executor.union.svc.cluster.local:15605 grace-period-status-not-found: 2m ioutils: remoteFileOutputPaths: @@ -2825,6 +3228,7 @@ data: default-cpus: 100m default-env-vars: [] default-memory: 100Mi + default-pod-template-name: task-template co-pilot: image: 'cr.flyte.org/flyteorg/flytecopilot:v1.14.1' name: flyte-copilot- @@ -2884,6 +3288,7 @@ data: default-cpus: 100m default-env-vars: [] default-memory: 100Mi + default-pod-template-name: task-template config.yaml: | union: connection: @@ -2903,11 +3308,14 @@ data: operator: enabled: true enableTunnelService: true + # enableDepot: false tunnel: - enableDirectToAppIngress: false + enableDirectToAppIngress: true deploymentToRestart: union-operator-proxy + limitNamespace: union + 
disableClusterPermissions: true apps: - enabled: 'false' + enabled: 'true' syncClusterConfig: enabled: false clusterId: @@ -2922,19 +3330,19 @@ data: userRole: 'test-worker-iam-role-arn' userRoleKey: 'eks.amazonaws.com/role-arn' collectUsages: - enabled: true + enabled: false billing: model: Legacy dependenciesHeartbeat: - prometheus: - endpoint: 'http://union-operator-prometheus:80/-/healthy' - propeller: - endpoint: 'http://flytepropeller:10254' + executor: + endpoint: 'http://union-operator-executor:10254' proxy: endpoint: 'http://union-operator-proxy:10254' secretsWatcher: dryRun: true enabled: false + org: + namespaceTemplate: union imageBuilder: enabled: true executionNamespaceLabels: @@ -2942,9 +3350,6 @@ data: referenceConfigmapName: union-operator targetConfigMapName: "build-image-config" proxy: - imageBuilderConfig: - authenticationType: 'noop' - defaultRepository: '' persistedLogs: objectStore: pathTemplate: namespace-{{.KubernetesNamespace}}.pod-{{.KubernetesPodName}}.cont-{{.KubernetesContainerName}} @@ -2986,999 +3391,149 @@ data: auth-type: iam region: test-region image-builder.buildkit-uri: "tcp://union-operator-buildkit.union.svc.cluster.local:1234" - image-builder.default-repository: "" - image-builder.authentication-type: "noop" + image-builder.default-repository: ".dkr.ecr.test-region.amazonaws.com/union-dataplane" + image-builder.authentication-type: "aws" --- -# Source: dataplane/templates/prometheus/configmap.yaml +# Source: dataplane/templates/serving/bootstrap-configmap.yaml +# We need to copy the default configmap here since knative-operator does not automatically update the +# address of the `net-kourier-controller` endpoint to use the release namespace. It assumes that the +# resources are being installed into the `knative-serving` namespace instead. 
apiVersion: v1 kind: ConfigMap metadata: - name: union-operator-prometheus + name: union-operator-serving-envoy-bootstrap namespace: union - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus data: - prometheus.yml: | - global: - scrape_interval: 15s - evaluation_interval: 15s - alerting: - alertmanagers: - - static_configs: - - targets: - rule_files: - - rules.yml - scrape_configs: - # Self-monitoring - - job_name: prometheus - metrics_path: /prometheus/metrics - static_configs: - - targets: ['localhost:9090'] - metric_relabel_configs: - - source_labels: [__name__] - regex: prometheus_tsdb_head_chunks_storage_size_bytes|prometheus_tsdb_head_series|prometheus_tsdb_head_series_created_total|prometheus_tsdb_head_series_removed_total|prometheus_tsdb_head_samples_appended_total|prometheus_tsdb_storage_blocks_bytes|prometheus_tsdb_retention_limit_bytes|prometheus_tsdb_wal_(.*)|.+_(limit|failed|missed|failures)_total - action: keep - - # Kube state metrics for pod/node resource tracking and cost calculations - - job_name: kube-state-metrics - static_configs: - - targets: ['release-name-kube-state-metrics:8080'] - metric_relabel_configs: - - separator: ; - source_labels: [__name__] - regex: kube_pod_container_resource_(limits|requests)|kube_pod_status_phase|kube_node_(labels|status_allocatable|status_condition|status_capacity)|kube_namespace_labels|kube_pod_container_status_(waiting|terminated|last_terminated).*_reason|kube_daemonset_status_number_unavailable|kube_deployment_status_replicas_unavailable|kube_resourcequota|kube_pod_info|kube_node_info|kube_pod_container_status_restarts_total - action: keep - - separator: ; - source_labels: [__name__, phase] - regex: kube_pod_status_phase;(Succeeded|Failed) - action: drop - - source_labels: [node] - target_label: nodename - 
regex: '(.*)' - action: replace - - source_labels: [label_node_group_name] - action: replace - regex: (.+) - target_label: label_node_pool_name - - # cAdvisor container metrics for CPU and memory tracking - - job_name: kubernetes-cadvisor - metrics_path: /metrics - scheme: https - kubernetes_sd_configs: - - role: node - namespaces: - names: [] - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - insecure_skip_verify: false - metric_relabel_configs: - - separator: ; - source_labels: [__name__] - regex: container_cpu_usage_seconds_total|container_memory_working_set_bytes - action: keep - relabel_configs: - - separator: ; - regex: __meta_kubernetes_node_label_(.+) - replacement: $1 - action: labelmap - - separator: ; - regex: (.*) - target_label: __address__ - replacement: kubernetes.default.svc:443 - action: replace - - source_labels: [__meta_kubernetes_node_name] - separator: ; - regex: (.+) - target_label: __metrics_path__ - replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor - action: replace - - # Flyte propeller metrics for execution info and fast task duration - - job_name: flytepropeller - kubernetes_sd_configs: - - role: pod - namespaces: - names: - - union - selectors: - - role: pod - label: app.kubernetes.io/name=flytepropeller - metric_relabel_configs: - - source_labels: [__name__] - regex: "flyte:propeller:all:round:execution_info|flyte:propeller:all:node:fast_task:fast_task_execution_duration" - action: keep - relabel_configs: - - source_labels: [__meta_kubernetes_pod_container_name] - regex: flytepropeller - action: keep - - source_labels: [__meta_kubernetes_namespace] - action: replace - target_label: namespace - - source_labels: [__meta_kubernetes_pod_name] - action: replace - target_label: pod - # OpenCost metrics for cost tracking - - job_name: opencost - static_configs: - - targets: ['release-name-opencost:9003'] - metric_relabel_configs: - - 
source_labels: [__name__] - regex: "kube_node_labels|kube_pod_labels|node_total_hourly_cost|node_ram_hourly_cost|node_cpu_hourly_cost|node_gpu_hourly_cost" - action: keep - # DCGM GPU metrics - - job_name: gpu-metrics - kubernetes_sd_configs: - - role: pod - namespaces: - names: - - kube-system - selectors: - - role: pod - label: app.kubernetes.io/name=dcgm-exporter - rules.yml: | - - groups: - - name: cost_calculations_15s - interval: 15s - rules: - - record: pod_gpu_allocation - expr: | - sum by (namespace, pod) (DCGM_FI_DEV_GPU_UTIL >= bool 0) * on (namespace, pod) group_left() (max by (namespace, pod) (kube_pod_status_phase{phase=~"Running|Pending"} == 1)) - - record: execution_info # A join metric to look up execution-level info. Used to disambiguate workflow/task executions, apps, and workspaces. - expr: | - max by (label_domain, label_project, label_entity_name, label_execution_id, label_entity_id)( - label_replace( - label_replace( - label_replace( - label_replace( - label_replace( - flyte:propeller:all:round:execution_info{domain!="", project!="", workflow_name!="", execution_id!=""}, # filter for workflow/task executions - "label_entity_id", "$1", "execution_id", "(.*)" # join key - ), "label_entity_name", "$1", "workflow_name", "(.*)" # set label_entity_name to the workflow/task name from the workflow_execution_id - ), - "label_execution_id", "$1", "execution_id", "(.*)" - ), - "label_project", "$1", "project", "(.*)" # project - ), - "label_domain", "$1", "domain", "(.*)" # domain - ) - ) - - record: app_info # A join metric to look up app-level info. Used to disambiguate workflow/task executions, apps, and workspaces. 
- expr: | - max by (label_domain, label_project, label_app_name, label_app_version, label_entity_id)( - label_replace( - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps - "label_app_name", "$1", "label_serving_unionai_dev_app_name", "(.*)" # rename to cleanup - ), - "label_app_version", "$1", "label_serving_knative_dev_revision", "(.*)" # the app_version is equivalent to an execution_id for workflows (lowest level of granularity) - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # join key - ) - ) - - record: workspace_info # A join metric to look up workspace info. Used to disambiguate workflow/task executions, apps, and workspaces. - expr: | - max by (label_domain, label_project, label_workspace_name, label_entity_id)( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # filter for workspaces - "label_entity_id", "$1", "label_node_id", "(.*)" # join key - ), "label_workspace_name", "$1", "label_node_id", "(.*)" # set label_workspace_name to the workspace name from the kube_pod_labels - ) - ) - - record: fast_task_execution_duration - expr: | - max by (label_domain, label_project, label_execution_id, label_entity_id, namespace, pod)( - label_replace( - label_replace( - label_replace( - label_replace( - label_replace( - label_replace( - flyte:propeller:all:node:fast_task:fast_task_execution_duration{domain!="", project!="", execution_id!="", exported_namespace!="", exported_pod!=""}, - "label_entity_id", "$1", "execution_id", "(.*)" # join key - ), - "label_execution_id", "$1", "execution_id", "(.*)" - ), - "label_project", "$1", "project", "(.*)" # project - ), - "label_domain", "$1", "domain", "(.*)" # domain - ), - "namespace", "$1", "exported_namespace", "(.*)" - ), - "pod", "$1", "exported_pod", 
"(.*)" - ) - ) - - record: fast_task_execution_duration_rate - expr: | - irate(fast_task_execution_duration[1m]) > 0 or absent(irate(fast_task_execution_duration[1m])) * fast_task_execution_duration - - record: entity_id:mem_usage_bytes_total_per_node:sum # Allocated memory (max(requested, consumed)) aggregated per node and entity, where entity is either a task/workflow execution or an app. - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( # aggregate up to entity - # First, calculate the allocated memory for each pod - max by (namespace, pod) ( # this is the case where consumed (the memory working set) exceeds requested memory - ( - sum by (namespace, pod) ( - container_memory_working_set_bytes{namespace!="",pod!="",image!=""} - ) - > sum by (namespace, pod) ( - kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="memory"} - ) - ) - or sum by (namespace, pod) ( # this is the case where memory requests are <= consumed memory - kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="memory"} # needed to add node!="" to dedupe - ) - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but we do not want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="",label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - # Now join in node identifiers which are used for subsequent overhead calculations - * on (namespace, pod) group_left(node) ( - max by (namespace, pod, node) (kube_pod_info{node!=""}) # needed to add node!="" to dedupe - ) - ) - - record: entity_id:cpu_usage_per_node:sum # Allocated cpu (max(requested, consumed)) aggregated per node and entity, where entity is either a task/workflow execution or an app. - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( - # First, calculate the allocated cpu for each pod - max by (namespace, pod) ( # this is the case where consumed (the cpu usage seconds total) exceeds requested cpu - ( - sum by (namespace, pod) ( - irate(container_cpu_usage_seconds_total{namespace!="",pod!="",image!=""}[5m]) - ) - > sum by (namespace, pod) ( - kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="cpu"} - ) - ) - or sum by (namespace, pod) ( # this is the case where cpu requests are <= consumed cpu - kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="cpu"} - ) - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - # Now join in node identifiers which are used for subsequent overhead calculations - * on (namespace, pod) group_left(node) ( - max by (namespace, pod, node) (kube_pod_info{node!=""}) # needed to add node!="" to dedupe - ) - ) - - record: entity_id:gpu_usage_per_node:sum # Allocated gpu aggregated per node and entity, where entity is either a task/workflow execution or an app. - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( - # First, grab the allocated gpu for each pod (which is always either 1 or zero, since k8s can't split gpus the way it can with cpu/memory) - max by (namespace, pod) ( - pod_gpu_allocation - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - # Now join in node identifiers which are used for subsequent overhead calculations - * on (namespace, pod) group_left(node) ( - max by (namespace, pod, node) (kube_pod_info{node!=""}) # needed to add node!="" to dedupe - ) - ) - - record: entity_id:used_mem_bytes:sum # the sum of used memory across all containers in an entity (numerator for aggregate utilization calculations) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # aggregate up to entity - # First, calculate the used memory for each pod - sum by (namespace, pod) ( - container_memory_working_set_bytes{namespace!="",pod!="",image!=""} - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but we do not want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - ) - - record: entity_id:allocated_mem_bytes:sum # the sum of allocated memory across all containers in an entity (denominator for aggregate utilization calculations) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # aggregate up to entity (remove node) - entity_id:mem_usage_bytes_total_per_node:sum - ) - - record: entity_id:used_cpu:sum # the sum of used cpu across all containers in an entity (numerator for aggregate utilization calculations) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( - sum by (namespace, pod) ( - irate(container_cpu_usage_seconds_total{namespace!="",pod!="",image!=""}[5m]) - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - ) - - record: entity_id:allocated_cpu:sum # the sum of allocated cpu across all containers in an entity (denominator for aggregate utilization calculations) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # aggregate up to entity (remove node) - entity_id:cpu_usage_per_node:sum - ) - - record: entity_id:sm_occupancy:avg # the simple average of SM occupancy (a good generic measure of GPU utilization) per entity - expr: | - avg by (label_entity_type, label_domain, label_project, label_entity_id) ( - # First, grab the SM occupancy for each pod - max by (namespace, pod) ( - DCGM_FI_PROF_SM_OCCUPANCY # SM occupancy is a good proxy for actual GPU usage - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - ) - - record: entity_id:gpu_count:sum # the count of running gpu pods per entity (need this to weight the gpu utilization when aggregating upwards - i.e. project-level) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( - # First, grab the allocated gpu for each pod (which is always either 1 or zero, since k8s can't split gpus the way it can with cpu/memory) - max by (namespace, pod) ( - pod_gpu_allocation - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - ) - - record: entity_id:weighted_sm_occupancy:sum # product of SM occupancy and allocated GPU count (something like "used memory", numerator of weighted calcs) - expr: | - entity_id:sm_occupancy:avg - * on (label_domain, label_project, label_entity_type, label_entity_id) entity_id:gpu_count:sum - - record: entity_id:allocated_mem_cost:sum # Allocated cost of memory for each workflow/task execution and app. - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, type) ( - entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB - * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type - ) - - record: entity_id:allocated_cpu_cost:sum # Allocated cost of cpu for each workflow/task execution and app. 
- expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, type)( - entity_id:cpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - - record: entity_id:allocated_gpu_cost:sum # Allocated cost of gpu for each workflow/task execution and app. - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, type)( - entity_id:gpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - - record: entity_id:allocated_cost:sum # Allocated cost of memory, cpu, and gpu for each workflow/task execution and app. - expr: | - label_replace( - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # for the sum to work, the labels need to be different on each "or" element (type label) - entity_id:allocated_mem_cost:sum - or - entity_id:allocated_cpu_cost:sum - or - entity_id:allocated_gpu_cost:sum - ), - "type", "allocated", "", "" # add type info - ) - - record: entity_id:overhead_cost:sum # The amount of overhead costs (node costs that we can't allocate with container resources) to allocate to each entity (workflow/task execution or app) - expr: | - label_replace( - sum by (label_entity_type, label_entity_id, label_domain, label_project)( # Aggregate the per-node metrics up to workflow/task execution or app (label_entity_id) - # Start with each execution's and app's allocated cost per node - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( # for the sum to work, the labels need to be different on each "or" element (type label) - entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB - * on (node) group_left(type) label_replace(avg by (node) 
(node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:cpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:gpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - # Then divide out the total allocated cost per node to get the proportion of allocated cost associated with each entity - / on (node) group_left()( - sum by (node) ( # for the sum to work, the labels need to be different on each "or" element (type label) - entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB - * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:cpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:gpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - > 0 # need to avoid dividing by zero, or gaps in the data can cause NaNs to proliferate, borking all charts - ) - # Then multiply by the overhead cost per node - * on (node) group_left() ( - # To calculate overhead, start with the true cost of running each node - avg by (node)(kube_node_labels{label_flyte_org_node_role="worker"}) # only look at worker nodes - * on (node) max by (node) ( - node_total_hourly_cost{instance_type!=""} # sometimes, the instance_type can be null, causing an 
unlabeled label to show up in the Compute Costs dashboard charts - ) * (15 / 3600) # convert hourly cost to 15-secondly cost - # Then subtract out the total allocated cost on each node - - on (node) group_left()( - sum by (node) ( # for the sum to work, the labels need to be different on each "or" element (type label) - entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB - * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:cpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:gpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - ) - ) - ), - "type", "overhead", "", "" # add type info - ) - - record: entity_id:total_cost:sum # Total cost of each entity (workflow/task execution or app), including allocated (from container resources) and overhead (proportion of unallocated node costs) - expr: | - label_replace( - sum by (label_domain, label_project, label_entity_id, label_entity_type) ( - entity_id:allocated_cost:sum - or - entity_id:overhead_cost:sum - ), - "type", "total", "", "" # add type info - ) - - record: node:total_cost:sum # Total cost of all nodes - expr: | - sum ( - avg by (node)(kube_node_labels{label_flyte_org_node_role="worker", label_node_kubernetes_io_instance_type!=""}) # only look at worker nodes - * on (node) group_left() node_total_hourly_cost{instance_type!=""} * (15 / 3600) # convert hourly cost to 15-secondly cost - ) - - record: node_type:total_cost:sum # Total cost of nodes grouped by node type - expr: | - sum by (node_type)( - avg by 
(node)(kube_node_labels{label_flyte_org_node_role="worker", label_node_kubernetes_io_instance_type!=""}) # only look at worker nodes - * on (node) group_left(node_type) label_replace(node_total_hourly_cost{instance_type!=""}, "node_type", "$1", "instance_type", "(.*)") * (15 / 3600) # convert hourly cost to 15-secondly cost and rename label - ) - - record: node_type:uptime_hours:sum # Total uptime of nodes grouped by node type - expr: | - sum by (node_type)( - avg by (node, node_type)( # dedupe - label_replace(kube_node_labels{label_flyte_org_node_role="worker", label_node_kubernetes_io_instance_type!=""}, "node_type", "$1", "label_node_kubernetes_io_instance_type", "(.*)") # relabel - ) - ) * (15 / 3600) # convert to number of hours per 15-second observation # Aggregate the above into visible metrics - - name: cost_rollup_15m - interval: 15m - rules: - - record: execution_info15m - expr: | - max_over_time(execution_info[15m:15s]) - - record: app_info15m - expr: | - max_over_time(app_info[15m:15s]) - - record: workspace_info15m - expr: | - max_over_time(workspace_info[15m:15s]) - - record: entity_id:allocated_mem_bytes:sum15m - expr: | - sum_over_time(entity_id:allocated_mem_bytes:sum[15m:15s]) - - record: entity_id:used_mem_bytes:sum15m - expr: | - sum_over_time(entity_id:used_mem_bytes:sum[15m:15s]) - - record: entity_id:allocated_cpu:sum15m - expr: | - sum_over_time(entity_id:allocated_cpu:sum[15m:15s]) - - record: entity_id:used_cpu:sum15m - expr: | - sum_over_time(entity_id:used_cpu:sum[15m:15s]) - - record: entity_id:weighted_sm_occupancy:sum15m - expr: | - sum_over_time(entity_id:weighted_sm_occupancy:sum[15m:15s]) - - record: entity_id:gpu_count:sum15m - expr: | - sum_over_time(entity_id:gpu_count:sum[15m:15s]) - - record: entity_id:allocated_mem_cost:sum15m - expr: | - sum_over_time(entity_id:allocated_mem_cost:sum[15m:15s]) - - record: entity_id:allocated_cpu_cost:sum15m - expr: | - sum_over_time(entity_id:allocated_cpu_cost:sum[15m:15s]) - - record: 
entity_id:allocated_gpu_cost:sum15m - expr: | - sum_over_time(entity_id:allocated_gpu_cost:sum[15m:15s]) - - record: entity_id:allocated_cost:sum15m - expr: | - sum_over_time(entity_id:allocated_cost:sum[15m:15s]) - - record: entity_id:overhead_cost:sum15m - expr: | - sum_over_time(entity_id:overhead_cost:sum[15m:15s]) - - record: entity_id:total_cost:sum15m - expr: | - sum_over_time(entity_id:total_cost:sum[15m:15s]) - - record: node:total_cost:sum15m - expr: | - sum_over_time(node:total_cost:sum[15m:15s]) - - record: node_type:total_cost:sum15m - expr: | - sum_over_time(node_type:total_cost:sum[15m:15s]) - - record: node_type:uptime_hours:sum15m - expr: | - sum_over_time(node_type:uptime_hours:sum[15m:15s]) + envoy-bootstrap.yaml: | + dynamic_resources: + ads_config: + transport_api_version: V3 + api_type: GRPC + rate_limit_settings: {} + grpc_services: + - envoy_grpc: {cluster_name: xds_cluster} + cds_config: + resource_api_version: V3 + ads: {} + lds_config: + resource_api_version: V3 + ads: {} + node: + cluster: kourier-knative + id: 3scale-kourier-gateway + static_resources: + listeners: + - name: stats_listener + address: + socket_address: + address: 0.0.0.0 + port_value: 9000 + filter_chains: + - filters: + - name: envoy.filters.network.http_connection_manager + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager + stat_prefix: stats_server + http_filters: + - name: envoy.filters.http.router + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router + route_config: + virtual_hosts: + - name: admin_interface + domains: + - "*" + routes: + - match: + safe_regex: + regex: '/(certs|stats(/prometheus)?|server_info|clusters|listeners|ready)?' 
+ headers: + - name: ':method' + string_match: + exact: GET + route: + cluster: service_stats + - match: + safe_regex: + regex: '/drain_listeners' + headers: + - name: ':method' + string_match: + exact: POST + route: + cluster: service_stats + clusters: + - name: service_stats + connect_timeout: 0.250s + type: static + load_assignment: + cluster_name: service_stats + endpoints: + lb_endpoints: + endpoint: + address: + socket_address: + address: 127.0.0.1 + port_value: 9901 + - name: xds_cluster + # This keepalive is recommended by envoy docs. + # https://www.envoyproxy.io/docs/envoy/latest/api-docs/xds_protocol + typed_extension_protocol_options: + envoy.extensions.upstreams.http.v3.HttpProtocolOptions: + "@type": type.googleapis.com/envoy.extensions.upstreams.http.v3.HttpProtocolOptions + explicit_http_config: + http2_protocol_options: + connection_keepalive: + interval: 30s + timeout: 5s + connect_timeout: 1s + load_assignment: + cluster_name: xds_cluster + endpoints: + lb_endpoints: + endpoint: + address: + socket_address: + address: "net-kourier-controller" + port_value: 18000 + type: STRICT_DNS + admin: + access_log: + - name: envoy.access_loggers.stdout + typed_config: + "@type": type.googleapis.com/envoy.extensions.access_loggers.stream.v3.StdoutAccessLog + address: + socket_address: + address: 127.0.0.1 + port_value: 9901 + stats_config: + stats_tags: + - tag_name: name + regex: '^.*?\.u/.*?n=(.*?)/.*?u\..*$' + - tag_name: domain + regex: '^.*?\.u/.*?d=(.*?)/.*?u\..*$' + - tag_name: org + regex: '^.*?\.u/.*?o=(.*?)/.*?u\..*$' + - tag_name: project + regex: '^.*?\.u/.*?p=(.*?)/.*?u\..*$' + - tag_name: target + regex: '^.*?\.u/.*?t=(.*?)/.*?u\..*$' + - tag_name: target_namespace + regex: '^.*?\.u/.*?tns=(.*?)/.*?u\..*$' --- -# Source: dataplane/templates/propeller/configmap.yaml +# Source: dataplane/templates/webhook/configmap.yaml apiVersion: v1 kind: ConfigMap metadata: - name: flyte-propeller-config + name: union-pod-webhook-config namespace: union data: - 
admin.yaml: | - admin: - clientId: 'test-client-id' - clientSecretLocation: /etc/union/secret/client_secret - endpoint: dns:///test-controlplane-host - insecure: false - event: - capacity: 1000 - rate: 500 - type: admin - catalog.yaml: | - catalog-cache: - cache-endpoint: dns:///test-controlplane-host - endpoint: dns:///test-controlplane-host - insecure: false - type: fallback - use-admin-auth: true - copilot.yaml: | - plugins: - k8s: - co-pilot: - image: 'cr.flyte.org/flyteorg/flytecopilot:v1.14.1' - name: flyte-copilot- - start-timeout: 30s - core.yaml: | - propeller: - downstream-eval-duration: 30s - enable-admin-launcher: true - leader-election: - enabled: true - lease-duration: 15s - lock-config-map: - name: propeller-leader - namespace: 'union' - renew-deadline: 10s - retry-period: 2s - limit-namespace: all - literal-offloading-config: - enabled: true - max-workflow-retries: 30 - metadata-prefix: metadata/propeller - metrics-prefix: flyte - prof-port: 10254 - queue: - batch-size: -1 - batching-interval: 2s - queue: - base-delay: 5s - capacity: 1000 - max-delay: 120s - rate: 100 - type: maxof - sub-queue: - capacity: 100 - rate: 10 - type: bucket - type: batch - rawoutput-prefix: 's3://test-metadata-bucket' - workers: 4 - workflow-reeval-duration: 30s + core.yaml: | + + webhook: certDir: /etc/webhook/certs + disableCreateMutatingWebhookConfig: true embeddedSecretManagerConfig: imagePullSecrets: enabled: true @@ -3986,100 +3541,13 @@ data: namespace: 'union' type: 'K8s' listenPort: '9443' + localCert: true secretManagerTypes: - Embedded - K8s - serviceName: flyte-pod-webhook + secretName: union-pod-webhook + serviceName: union-pod-webhook servicePort: '443' - enabled_plugins.yaml: | - plugins: - connector-service: - defaultConnector: - defaultServiceConfig: '{"loadBalancingConfig": [{"round_robin":{}}]}' - endpoint: k8s:///flyteconnector.union:8000 - tasks: - task-plugins: - default-for-task-types: - actor: fast-task - container: container - container_array: 
k8s-array - sidecar: sidecar - enabled-plugins: - - container - - sidecar - - k8s-array - - echo - - fast-task - - connector-service - k8s.yaml: | - plugins: - k8s: - accelerator-device-classes: - - NVIDIA_GPU: - device-node-label: eks.amazonaws.com/instance-gpu-name - pod-template: - template: - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: eks.amazonaws.com/instance-gpu-manufacturer - operator: In - values: - - nvidia - resource-name: nvidia.com/gpu - accelerator-devices: - - NVIDIA-TESLA-T4: t4 - - NVIDIA-TESLA-V100: v100 - - NVIDIA-TESLA-A100: a100 - - NVIDIA-A10G: a10g - - NVIDIA-TESLA-K80: k80 - - NVIDIA-H100: h100 - - NVIDIA-L4: l4 - - NVIDIA-L40S: l40s - default-cpus: 100m - default-env-vars: [] - default-memory: 100Mi - logger.yaml: | - logger: - formatter: - type: json - level: 4 - show-source: true - resource_manager.yaml: | - propeller: - resourcemanager: - type: noop - task_logs.yaml: | - plugins: - logs: - cloudwatch-enabled: false - dynamic-log-links: - - vscode: - displayName: VS Code Debugger - templateUris: - - /dataplane/pod/v1/generated_name/task/{{.executionProject}}/{{.executionDomain}}/{{.executionName}}/{{.nodeID}}/{{.taskRetryAttempt}}/{{.taskProject}}/{{.taskDomain}}/{{.taskID}}/{{.taskVersion}}/ - kubernetes-enabled: false - templates: - - displayName: Task Logs - scheme: TaskExecution - templateUris: - - /console/projects/{{.executionProject}}/domains/{{.executionDomain}}/executions/{{.executionName}}/nodeId/{{.nodeID}}/taskId/{{.taskID}}/attempt/{{.taskRetryAttempt}}/view/logs?duration=all&fromExecutionNav=true - storage.yaml: | - storage: - container: "test-metadata-bucket" - type: s3 - connection: - auth-type: iam - region: test-region - enable-multicontainer: true - limits: - maxDownloadMBs: 1024 - cache: - max_size_mbs: 0 - target_gc_percent: 70 --- # Source: dataplane/charts/fluentbit/templates/clusterrole.yaml apiVersion: 
rbac.authorization.k8s.io/v1 @@ -4103,515 +3571,808 @@ rules: - list - watch --- -# Source: dataplane/charts/kube-state-metrics/templates/role.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: - labels: - helm.sh/chart: kube-state-metrics-5.30.1 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: kube-state-metrics - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" - name: release-name-kube-state-metrics -rules: - -- apiGroups: ["certificates.k8s.io"] - resources: - - certificatesigningrequests - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - configmaps - verbs: ["list", "watch"] - -- apiGroups: ["batch"] - resources: - - cronjobs - verbs: ["list", "watch"] - -- apiGroups: ["extensions", "apps"] - resources: - - daemonsets - verbs: ["list", "watch"] - -- apiGroups: ["extensions", "apps"] - resources: - - deployments - verbs: ["list", "watch"] + name: knative-serving-operator-aggregated + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +aggregationRule: + clusterRoleSelectors: + # This (along with escalate below) allows the Operator to pick up any + # roles that are provided to the admin of the cluster 
by knative serving + # automatically. + - matchExpressions: + - {key: serving.knative.dev/release, operator: Exists} +rules: [] # Rules are automatically filled in by the controller manager. +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: knative-serving-operator-aggregated-stable + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +aggregationRule: + clusterRoleSelectors: + # This (along with escalate below) allows the Operator to pick up any + # roles that are provided to the admin of the cluster by knative serving + # automatically. + - matchExpressions: + - {key: app.kubernetes.io/name, operator: In, values: ["knative-serving"]} +rules: [] # Rules are automatically filled in by the controller manager. +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: knative-eventing-operator-aggregated + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +aggregationRule: + clusterRoleSelectors: + # This (along with escalate below) allows the Operator to pick up any + # roles that are provided to the admin of the cluster by knative eventing + # automatically. + - matchExpressions: + - {key: eventing.knative.dev/release, operator: Exists} +rules: [] # Rules are automatically filled in by the controller manager. 
+--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: knative-eventing-operator-aggregated-stable + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +aggregationRule: + clusterRoleSelectors: + # This (along with escalate below) allows the Operator to pick up any + # roles that are provided to the admin of the cluster by knative eventing + # automatically. + - matchExpressions: + - {key: app.kubernetes.io/name, operator: In, values: ["knative-eventing"]} +rules: [] # Rules are automatically filled in by the controller manager. +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
-- apiGroups: [""] - resources: - - endpoints - verbs: ["list", "watch"] - -- apiGroups: ["autoscaling"] - resources: - - horizontalpodautoscalers - verbs: ["list", "watch"] - -- apiGroups: ["extensions", "networking.k8s.io"] - resources: - - ingresses - verbs: ["list", "watch"] - -- apiGroups: ["batch"] - resources: - - jobs - verbs: ["list", "watch"] - -- apiGroups: ["coordination.k8s.io"] - resources: - - leases - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - limitranges - verbs: ["list", "watch"] - -- apiGroups: ["admissionregistration.k8s.io"] - resources: - - mutatingwebhookconfigurations - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - namespaces - verbs: ["list", "watch"] - -- apiGroups: ["networking.k8s.io"] - resources: - - networkpolicies - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - nodes - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - persistentvolumeclaims - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - persistentvolumes - verbs: ["list", "watch"] - -- apiGroups: ["policy"] - resources: - - poddisruptionbudgets - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - pods - verbs: ["list", "watch"] - -- apiGroups: ["extensions", "apps"] - resources: - - replicasets - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - replicationcontrollers - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - resourcequotas - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - secrets - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - services - verbs: ["list", "watch"] - -- apiGroups: ["apps"] - resources: - - statefulsets - verbs: ["list", "watch"] - -- apiGroups: ["storage.k8s.io"] - resources: - - storageclasses - verbs: ["list", "watch"] - -- apiGroups: ["admissionregistration.k8s.io"] - resources: - - validatingwebhookconfigurations - verbs: ["list", "watch"] - -- apiGroups: ["storage.k8s.io"] - resources: - - 
volumeattachments - verbs: ["list", "watch"] ---- -# Source: dataplane/charts/opencost/templates/clusterrole.yaml -# Cluster role giving opencost to get, list, watch required resources -# No write permissions are required +kind: ClusterRole apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: knative-serving-operator + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +rules: + - apiGroups: + - operator.knative.dev + resources: + - '*' + verbs: + - '*' + # Bootstrapping permissions. + # Roles that are explicitly bound buch which are specified by this Operator + # MUST be specified here with 'get' and 'bind'. + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterroles + resourceNames: + - system:auth-delegator + verbs: + - bind + - get + - apiGroups: + - rbac.authorization.k8s.io + resources: + - roles + resourceNames: + - extension-apiserver-authentication-reader + verbs: + - bind + - get + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterroles + - roles + verbs: + - create + - delete + # Escalate is necessary in order to create a role using cluster role aggregation, + # and to allow the Operator to bootstrap itself into the necessary set of + # permissions, even as those continue to evolve upstream. + - escalate + - get + - list + - update + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterrolebindings + - rolebindings + verbs: + - create + - delete + - list + - get + - update + # Permissions required for Knative controller + # infra. 
+ - apiGroups: + - apiregistration.k8s.io + resources: + - apiservices + verbs: + - update + - apiGroups: + - "" + resources: + - services + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - caching.internal.knative.dev + resources: + - images + verbs: + - '*' + - apiGroups: + - "" + resources: + - namespaces + verbs: + - get + - update + - watch + - apiGroups: + - '' + resources: + - events + verbs: + - create + - update + - patch + - apiGroups: + - '' + resources: + - configmaps + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - security.istio.io + - apps + - policy + resources: + - poddisruptionbudgets + - peerauthentications + - deployments + - daemonsets + - replicasets + - statefulsets + verbs: + - create + - delete + - get + - list + - watch + - update + - apiGroups: + - apiregistration.k8s.io + resources: + - apiservices + verbs: + - create + - delete + - get + - list + - apiGroups: + - autoscaling + resources: + - horizontalpodautoscalers + verbs: + - create + - delete + - get + - list + - apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - '*' + - apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - '*' + # Old resources that need cleaning up that are not in the knative-serving + # namespace. 
+ - apiGroups: + - "" + resources: + - services + - deployments + - horizontalpodautoscalers + resourceNames: + - knative-ingressgateway + verbs: + - delete + - apiGroups: + - "" + resources: + - configmaps + resourceNames: + - config-controller + verbs: + - delete + - apiGroups: + - "" + resources: + - serviceaccounts + resourceNames: + - knative-serving-operator + verbs: + - delete + # for contour TLS + - apiGroups: + - projectcontour.io + resources: + - httpproxies + - tlscertificatedelegations + verbs: + - get + - list + - watch + - update + - create + - delete + - deletecollection + - patch + # for security-guard + - apiGroups: + - guard.security.knative.dev + resources: + - guardians + verbs: + - get + - list + - watch + - create + - update + - patch + - delete + - apiGroups: + - "" + resources: + - configmaps + verbs: + - get + - list + - watch + - create + - update + - patch + - delete + - apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + verbs: + - create + - apiGroups: + - "" + resources: + - pods + verbs: + - get +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 metadata: - name: release-name-opencost + name: knative-eventing-operator labels: - helm.sh/chart: opencost-1.42.0 - app.kubernetes.io/name: opencost - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "1.111.0" - app.kubernetes.io/part-of: opencost - app.kubernetes.io/managed-by: Helm + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator rules: - - apiGroups: [""] + - apiGroups: + - operator.knative.dev + resources: + - '*' + verbs: + - '*' + # Bootstrapping permissions. + # Roles that are explicitly bound buch which are specified by this Operator + # MUST be specified here with 'get' and 'bind'. 
+ - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterroles + - roles + verbs: + - create + - delete + # Escalate is necessary in order to create a role using cluster role aggregation, + # and to allow the Operator to bootstrap itself into the necessary set of + # permissions, even as those continue to evolve upstream. + - escalate + - get + - list + - update + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterrolebindings + - rolebindings + verbs: + - create + - delete + - list + - get + - update + # Permissions required for Knative controller + # infra. + - apiGroups: + - apiregistration.k8s.io + resources: + - apiservices + verbs: + - update + - apiGroups: + - "" + resources: + - services + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - caching.internal.knative.dev + resources: + - images + verbs: + - '*' + - apiGroups: + - "" + resources: + - namespaces + verbs: + - get + - update + - watch + - apiGroups: + - '' + resources: + - events + verbs: + - create + - update + - patch + - apiGroups: + - '' + resources: + - configmaps + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - apps + resources: + - deployments + - daemonsets + - replicasets + - statefulsets + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - apiregistration.k8s.io + resources: + - apiservices + verbs: + - create + - delete + - get + - list + - apiGroups: + - autoscaling + resources: + - horizontalpodautoscalers + verbs: + - create + - delete + - update + - get + - list + - apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - '*' + - apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - '*' + - apiGroups: + - batch + resources: + - jobs + verbs: + - create + - delete + - update + - get + - list + - watch + # Old resources that need cleaning up that are not in the knative-eventing + # namespace. 
+ - apiGroups: + - "" + resources: + - serviceaccounts + resourceNames: + - knative-eventing-operator + verbs: + - delete + # for RabbitMQ messaging topology objects + - apiGroups: + - rabbitmq.com + resources: + - rabbitmqclusters + verbs: + - "get" + - "list" + - "watch" + - apiGroups: + - rabbitmq.com + resources: + - bindings + - queues + - exchanges + verbs: + - create + - delete + - get + - list + - patch + - update + - watch + - apiGroups: + - rabbitmq.com + resources: + - bindings/status + - queues/status + - exchanges/status + verbs: + - get + # for Kafka eventing source + - apiGroups: + - keda.sh + resources: + - scaledobjects + - scaledobjects/finalizers + - scaledobjects/status + - triggerauthentications + - triggerauthentications/status + verbs: + - get + - list + - watch + - update + - create + - delete + # Internal APIs + - apiGroups: + - "internal.kafka.eventing.knative.dev" + resources: + - "consumers" + - "consumers/status" + - "consumergroups" + - "consumergroups/status" + verbs: + - create + - get + - list + - watch + - patch + - update + - delete + - apiGroups: + - "internal.kafka.eventing.knative.dev" + resources: + - "consumers/finalizers" + - "consumergroups/finalizers" + verbs: + - update + - delete + - apiGroups: + - apps + resources: + - statefulsets/scale + verbs: + - get + - list + - watch + - update + - patch + - create + - delete + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterrolebindings + verbs: + - watch + - apiGroups: + - "*" + resources: + - configmaps + verbs: + - delete + - apiGroups: + - "*" resources: - configmaps - - deployments - - nodes - - pods - services - - resourcequotas - - replicationcontrollers - - limitranges - - persistentvolumeclaims - - persistentvolumes - - namespaces - - endpoints verbs: - get - list - watch + - update + - create + - delete - apiGroups: - - extensions + - "*" resources: - - daemonsets - - deployments - - replicasets + - pods verbs: - - get - list + - update + - get - 
watch - apiGroups: - - apps + - "*" resources: - - statefulsets - - deployments - - daemonsets - - replicasets + - pods/finalizers verbs: + - get - list - - watch + - create + - update + - delete - apiGroups: - - batch + - "*" resources: - - cronjobs - - jobs + - events verbs: - - get - - list - - watch + - patch + - create - apiGroups: - - autoscaling + - "*" resources: - - horizontalpodautoscalers + - secrets verbs: - get - list - watch + - update + - create + - delete - apiGroups: - - policy + - "*" resources: - - poddisruptionbudgets + - nodes verbs: - get - list - watch - apiGroups: - - storage.k8s.io + - "*" resources: - - storageclasses + - serviceaccounts verbs: - get - list - watch ---- -# Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: union-clustersync-resource -rules: + - update + - create + - delete - apiGroups: - - "" - - rbac.authorization.k8s.io + - "*" resources: - configmaps - - namespaces - - pods - - resourcequotas - - roles - - rolebindings - - secrets - - services - - serviceaccounts - - clusterrolebindings - - podtemplates + resourceNames: + - kafka-channel-config verbs: - - '*' ---- -# Source: dataplane/templates/nodeexecutor/serviceaccount.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: union-executor - labels: - app: executor -rules: -# Allow RO access to PODS -- apiGroups: - - "" - resources: - - pods - verbs: - - get - - list - - watch -# Allow Event recording access -- apiGroups: - - "" - resources: - - events - verbs: - - create - - update - - delete - - patch -# Allow Access All plugin objects -- apiGroups: - - '*' - resources: - - '*' - verbs: - - get - - list - - watch - - create - - update - - delete - - patch -# Allow Access to CRD -- apiGroups: - - apiextensions.k8s.io - resources: - - customresourcedefinitions - verbs: - - get - - list - - watch - - create - - delete - - update ---- -# Source: 
dataplane/templates/operator/serviceaccount-proxy.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: proxy-system - labels: - app.kubernetes.io/name: operator-proxy - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm -rules: + - patch - apiGroups: - - '*' + - "*" resources: - - events - - flyteworkflows - - pods/log - - pods - - rayjobs - - resourcequotas + - horizontalpodautoscalers + resourceNames: + - kafka-webhook verbs: - - get - - list - - watch ---- -# Source: dataplane/templates/operator/serviceaccount.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: operator-system - labels: - app.kubernetes.io/name: union-operator - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm -rules: - # Allow Access to all resources under flyte.lyft.com + - delete - apiGroups: - - flyte.lyft.com + - "*" resources: - - flyteworkflows - - flyteworkflows/finalizers + - leases verbs: - - get - - list - - watch - - create - - update - delete - - patch - - post - - deletecollection - apiGroups: - - '*' + - "*" resources: - - resourcequotas - - pods - - configmaps - - podtemplates - - secrets - - namespaces - - nodes + - poddisruptionbudgets + resourceNames: + - kafka-webhook verbs: - - get - - list - - watch - - create - - update - delete - - nonResourceURLs: - - /metrics - verbs: - - get ---- -# Source: dataplane/templates/prometheus/rbac.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: union-operator-prometheus - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus -rules: - - apiGroups: [""] + - apiGroups: + - "*" resources: - - 
nodes - - nodes/proxy - - pods - - endpoints - services verbs: - - get - - list - - watch - - nonResourceURLs: - - /metrics - - /metrics/cadvisor + - patch + - apiGroups: + - "apps" + resources: + - deployments verbs: - - get ---- -# Source: dataplane/templates/propeller/serviceaccount-webhook.yaml -kind: ClusterRole -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: flytepropeller-webhook-role - namespace: union -rules: + - deletecollection + # Eventing TLS - apiGroups: - - "*" + - "cert-manager.io" resources: - - mutatingwebhookconfigurations - - secrets - - pods - - replicasets/finalizers + - certificates + - issuers + - clusterissuers verbs: + - create + - delete + - update + - list - get + - watch + - apiGroups: + - "trust.cert-manager.io" + resources: + - bundles + verbs: - create + - delete - update - - patch + - list + - get + - watch --- -# Source: dataplane/templates/propeller/serviceaccount.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: - name: flytepropeller-role + name: knative-operator-webhook + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator rules: - # Allow RO access to PODS + # For watching logging configuration and getting certs. - apiGroups: - "" resources: - - pods + - "configmaps" verbs: - - get - - list - - watch - # Allow Event recording access + - "get" + - "list" + - "watch" + # For manipulating certs into secrets. + - apiGroups: + - "" + resources: + - "namespaces" + verbs: + - "get" + - "create" + - "update" + - "list" + - "watch" + - "patch" + # finalizers are needed for the owner reference of the webhook - apiGroups: - "" resources: - - events + - "namespaces/finalizers" verbs: - - create - - update - - delete - - patch - # Allow Access All plugin objects + - "update" + # For getting our Deployment so we can decorate with ownerref. 
- apiGroups: - - '*' + - "apps" resources: - - '*' + - "deployments" verbs: - - get - - list - - watch - - create - - update - - delete - - patch - # Allow Access to CRD + - "get" - apiGroups: - - apiextensions.k8s.io + - "apps" resources: - - customresourcedefinitions + - "deployments/finalizers" verbs: - - get - - list - - watch - - create - - delete - update - # Allow Access to all resources under flyte.lyft.com + # For actually registering our webhook. - apiGroups: - - flyte.lyft.com + - "admissionregistration.k8s.io" resources: - - flyteworkflows - - flyteworkflows/finalizers - verbs: - - get - - list - - watch - - create - - update - - delete - - patch - - post - - deletecollection + - "mutatingwebhookconfigurations" + - "validatingwebhookconfigurations" + verbs: &everything + - "get" + - "list" + - "create" + - "update" + - "delete" + - "patch" + - "watch" + # For leader election + - apiGroups: + - "coordination.k8s.io" + resources: + - "leases" + verbs: *everything + # Necessary for conversion webhook. These are copied from the serving + # TODO: Do we really need all these permissions? + - apiGroups: ["apiextensions.k8s.io"] + resources: ["customresourcedefinitions"] + verbs: ["get", "list", "create", "update", "delete", "patch", "watch"] --- # Source: dataplane/charts/fluentbit/templates/clusterrolebinding.yaml apiVersion: rbac.authorization.k8s.io/v1 @@ -4633,207 +4394,417 @@ subjects: name: fluentbit-system namespace: union --- -# Source: dataplane/charts/kube-state-metrics/templates/clusterrolebinding.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# TODO: Consider restriction of non-aggregated role to knativeservings namespaces. apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: - labels: - helm.sh/chart: kube-state-metrics-5.30.1 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: kube-state-metrics - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" - name: release-name-kube-state-metrics + name: knative-serving-operator + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole - name: release-name-kube-state-metrics + name: knative-serving-operator subjects: -- kind: ServiceAccount - name: release-name-kube-state-metrics - namespace: union + - kind: ServiceAccount + name: knative-operator + namespace: "union" --- -# Source: dataplane/charts/opencost/templates/clusterrolebinding.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# TODO: Consider restriction of non-aggregated role to knativeeventing namespaces. 
apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: - name: release-name-opencost + name: knative-eventing-operator labels: - helm.sh/chart: opencost-1.42.0 - app.kubernetes.io/name: opencost - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "1.111.0" - app.kubernetes.io/part-of: opencost - app.kubernetes.io/managed-by: Helm + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole - name: release-name-opencost + name: knative-eventing-operator subjects: - kind: ServiceAccount - name: release-name-opencost - namespace: union + name: knative-operator + namespace: "union" --- -# Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: - name: union-clustersync-resource + name: operator-webhook + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole - name: union-clustersync-resource + name: knative-operator-webhook subjects: - kind: ServiceAccount - name: union-clustersync-system - namespace: union + name: operator-webhook + namespace: "union" --- -# Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: - name: union-clustersync-auth-delegator + name: knative-serving-operator-aggregated + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole - name: system:auth-delegator + name: knative-serving-operator-aggregated subjects: - kind: ServiceAccount - name: union-clustersync-system - namespace: union + name: knative-operator + namespace: "union" --- -# Source: dataplane/templates/nodeexecutor/serviceaccount.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: - name: union-executor + name: knative-serving-operator-aggregated-stable labels: - app: executor + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole - name: union-executor + name: knative-serving-operator-aggregated-stable subjects: -- kind: ServiceAccount - name: executor - namespace: union + - kind: ServiceAccount + name: knative-operator + namespace: "union" --- -# Source: dataplane/templates/operator/serviceaccount-proxy.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: - name: proxy-system + name: knative-eventing-operator-aggregated labels: - app.kubernetes.io/name: operator-proxy - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole - name: proxy-system + name: knative-eventing-operator-aggregated subjects: - kind: 
ServiceAccount - name: proxy-system - namespace: union + name: knative-operator + namespace: "union" --- -# Source: dataplane/templates/operator/serviceaccount.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: - name: operator-system + name: knative-eventing-operator-aggregated-stable labels: - app.kubernetes.io/name: union-operator - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole - name: operator-system + name: knative-eventing-operator-aggregated-stable subjects: - kind: ServiceAccount - name: operator-system - namespace: union + name: knative-operator + namespace: "union" --- -# Source: dataplane/templates/prometheus/rbac.yaml +# Source: dataplane/charts/dcgm-exporter/templates/role.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding +kind: Role metadata: - name: union-operator-prometheus + name: dcgm-exporter-read-cm + namespace: union labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane + helm.sh/chart: dcgm-exporter-4.7.1 + app.kubernetes.io/name: dcgm-exporter app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/component: dcgm-exporter + app.kubernetes.io/version: "4.7.1" app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: union-operator-prometheus -subjects: - - kind: ServiceAccount - name: union-operator-prometheus - namespace: union +rules: +- apiGroups: [""] + resources: ["configmaps"] + resourceNames: ["exporter-metrics-config-map"] + verbs: ["get"] --- -# Source: dataplane/templates/propeller/serviceaccount-webhook.yaml 
-# Create a binding from Role -> ServiceAccount -kind: ClusterRoleBinding +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + namespace: "union" + name: knative-operator-webhook + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +rules: + # For manipulating certs into secrets. 
+ - apiGroups: + - "" + resources: + - "secrets" + verbs: + - "get" + - "create" + - "update" + - "list" + - "watch" + - "patch" +--- +# Source: dataplane/templates/monitoring/prometheus-rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role metadata: - name: flytepropeller-webhook-binding + name: union-operator-prometheus-rbac namespace: union -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: flytepropeller-webhook-role -subjects: - - kind: ServiceAccount - name: flytepropeller-webhook-system - namespace: union + labels: + release: release-name +rules: + # Prometheus server scrape permissions + - apiGroups: [""] + resources: + - nodes + - nodes/proxy + - nodes/metrics + - services + - endpoints + - pods + - ingresses + - configmaps + verbs: + - get + - list + - watch + - apiGroups: ["extensions", "networking.k8s.io"] + resources: + - ingresses/status + - ingresses + verbs: + - get + - list + - watch + - apiGroups: ["discovery.k8s.io"] + resources: + - endpointslices + verbs: + - get + - list + - watch + # kube-state-metrics permissions (rbac.create=false on the subchart; managed here instead) + - apiGroups: ["certificates.k8s.io"] + resources: ["certificatesigningrequests"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["configmaps"] + verbs: ["list", "watch"] + - apiGroups: ["batch"] + resources: ["cronjobs"] + verbs: ["list", "watch"] + - apiGroups: ["extensions", "apps"] + resources: ["daemonsets"] + verbs: ["list", "watch"] + - apiGroups: ["extensions", "apps"] + resources: ["deployments"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["endpoints"] + verbs: ["list", "watch"] + - apiGroups: ["discovery.k8s.io"] + resources: ["endpointslices"] + verbs: ["list", "watch"] + - apiGroups: ["autoscaling"] + resources: ["horizontalpodautoscalers"] + verbs: ["list", "watch"] + - apiGroups: ["extensions", "networking.k8s.io"] + resources: ["ingresses"] + verbs: ["list", "watch"] + - apiGroups: ["batch"] + 
resources: ["jobs"] + verbs: ["list", "watch"] + - apiGroups: ["coordination.k8s.io"] + resources: ["leases"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["limitranges"] + verbs: ["list", "watch"] + - apiGroups: ["admissionregistration.k8s.io"] + resources: ["mutatingwebhookconfigurations"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["namespaces"] + verbs: ["list", "watch"] + - apiGroups: ["networking.k8s.io"] + resources: ["networkpolicies"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["persistentvolumeclaims"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["persistentvolumes"] + verbs: ["list", "watch"] + - apiGroups: ["policy"] + resources: ["poddisruptionbudgets"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["pods"] + verbs: ["list", "watch"] + - apiGroups: ["extensions", "apps"] + resources: ["replicasets"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["replicationcontrollers"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["resourcequotas"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["secrets"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["services"] + verbs: ["list", "watch"] + - apiGroups: ["apps"] + resources: ["statefulsets"] + verbs: ["list", "watch"] + - apiGroups: ["storage.k8s.io"] + resources: ["storageclasses"] + verbs: ["list", "watch"] + - apiGroups: ["admissionregistration.k8s.io"] + resources: ["validatingwebhookconfigurations"] + verbs: ["list", "watch"] + - apiGroups: ["storage.k8s.io"] + resources: ["volumeattachments"] + verbs: ["list", "watch"] --- -# Source: dataplane/templates/propeller/serviceaccount.yaml +# Source: dataplane/templates/nodeexecutor/serviceaccount.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding +kind: Role metadata: - name: flytepropeller-binding -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: flytepropeller-role -subjects: - - 
kind: ServiceAccount - name: flytepropeller-system - namespace: union + name: union-executor + labels: + app: executor +rules: +# Allow RO access to PODS +- apiGroups: + - "" + resources: + - pods + verbs: + - get + - list + - watch +# Allow Event recording access +- apiGroups: + - "" + resources: + - events + verbs: + - create + - update + - delete + - patch +# Allow Access All plugin objects +- apiGroups: + - '*' + resources: + - '*' + verbs: + - get + - list + - watch + - create + - update + - delete + - patch +# Allow Access to CRD +- apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - get + - list + - watch + - create + - delete + - update --- -# Source: dataplane/charts/dcgm-exporter/templates/role.yaml +# Source: dataplane/templates/operator/serviceaccount-proxy-secret.yaml apiVersion: rbac.authorization.k8s.io/v1 kind: Role metadata: - name: dcgm-exporter-read-cm + name: union-system-secret namespace: union labels: - helm.sh/chart: dcgm-exporter-4.7.1 - app.kubernetes.io/name: dcgm-exporter + app.kubernetes.io/name: operator-proxy app.kubernetes.io/instance: release-name - app.kubernetes.io/component: dcgm-exporter - app.kubernetes.io/version: "4.7.1" + platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm rules: -- apiGroups: [""] - resources: ["configmaps"] - resourceNames: ["exporter-metrics-config-map"] - verbs: ["get"] + - apiGroups: + - '*' + resources: + - secrets + verbs: + - get + - list + - create + - update + - delete --- -# Source: dataplane/templates/operator/serviceaccount-proxy-secret.yaml +# Source: dataplane/templates/operator/serviceaccount-proxy.yaml apiVersion: rbac.authorization.k8s.io/v1 kind: Role metadata: - name: proxy-system-secret - namespace: union + name: proxy-system labels: app.kubernetes.io/name: operator-proxy app.kubernetes.io/instance: release-name @@ -4843,13 +4814,16 @@ rules: - apiGroups: - '*' resources: - - secrets + - events + - flyteworkflows + - 
pods/log + - pods + - rayjobs + - resourcequotas verbs: - get - list - - create - - update - - delete + - watch --- # Source: dataplane/templates/operator/serviceaccount.yaml apiVersion: rbac.authorization.k8s.io/v1 @@ -4873,6 +4847,71 @@ rules: - watch - create - update + - apiGroups: + - flyte.lyft.com + resources: + - flyteworkflows + - flyteworkflows/finalizers + verbs: + - get + - list + - watch + - create + - update + - delete + - patch + - post + - deletecollection + - apiGroups: + - '*' + resources: + - resourcequotas + - pods + - configmaps + - podtemplates + - secrets + - namespaces + - nodes + verbs: + - get + - list + - watch + - create + - update + - delete + - apiGroups: + - serving.knative.dev + resources: + - revisions + - configurations + - services + verbs: + - get + - list + - watch + - create + - update + - delete +--- +# Source: dataplane/templates/webhook/serviceaccount.yaml +kind: Role +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: union-webhook-role + namespace: union +rules: + - apiGroups: + - "*" + resources: + - mutatingwebhookconfigurations + - secrets + - pods + - replicasets/finalizers + verbs: + - get + - create + - update + - patch --- # Source: dataplane/charts/dcgm-exporter/templates/rolebinding.yaml apiVersion: rbac.authorization.k8s.io/v1 @@ -4896,11 +4935,94 @@ roleRef: name: dcgm-exporter-read-cm apiGroup: rbac.authorization.k8s.io --- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + namespace: "union" + name: operator-webhook + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +subjects: + - kind: ServiceAccount + name: operator-webhook + namespace: "union" +roleRef: + kind: Role + name: knative-operator-webhook + apiGroup: rbac.authorization.k8s.io +--- +# Source: dataplane/templates/monitoring/prometheus-rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: union-operator-prometheus-rbac + namespace: union + labels: + release: release-name +subjects: + - kind: ServiceAccount + name: union-operator-prometheus + namespace: union +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: union-operator-prometheus-rbac +--- +# Source: dataplane/templates/monitoring/prometheus-rbac.yaml +# kube-state-metrics service account RoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: release-name-prometheus-kube-state-metrics + namespace: union + labels: + release: release-name +subjects: + - kind: ServiceAccount + name: release-name-prometheus-kube-state-metrics + namespace: union +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: union-operator-prometheus-rbac +--- +# Source: dataplane/templates/nodeexecutor/serviceaccount.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: union-executor + labels: + app: executor +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: union-executor +subjects: +- kind: ServiceAccount + name: union-system + namespace: union +--- # Source: dataplane/templates/operator/serviceaccount-proxy-secret.yaml apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding metadata: - name: proxy-system-secret + name: union-system-secret namespace: union labels: 
app.kubernetes.io/name: operator-proxy @@ -4910,10 +5032,29 @@ metadata: roleRef: apiGroup: rbac.authorization.k8s.io kind: Role - name: proxy-system-secret + name: union-system-secret +subjects: + - kind: ServiceAccount + name: union-system + namespace: union +--- +# Source: dataplane/templates/operator/serviceaccount-proxy.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: proxy-system + labels: + app.kubernetes.io/name: operator-proxy + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: proxy-system subjects: - kind: ServiceAccount - name: proxy-system + name: union-system namespace: union --- # Source: dataplane/templates/operator/serviceaccount.yaml @@ -4932,7 +5073,23 @@ roleRef: name: operator-system subjects: - kind: ServiceAccount - name: operator-system + name: union-system + namespace: union +--- +# Source: dataplane/templates/webhook/serviceaccount.yaml +# Create a binding from Role -> ServiceAccount +kind: RoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: union-webhook-binding + namespace: union +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: union-webhook-role +subjects: + - kind: ServiceAccount + name: union-system namespace: union --- # Source: dataplane/charts/dcgm-exporter/templates/service.yaml @@ -4998,20 +5155,60 @@ spec: app.kubernetes.io/name: fluentbit app.kubernetes.io/instance: release-name --- -# Source: dataplane/charts/kube-state-metrics/templates/service.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v1 +kind: Service +metadata: + labels: + role: operator-webhook + app.kubernetes.io/component: operator-webhook + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator + name: operator-webhook + namespace: "union" +spec: + ports: + # Define metrics and profiling for them to be accessible within service meshes. + - name: http-metrics + port: 9090 + targetPort: 9090 + - name: http-profiling + port: 8008 + targetPort: 8008 + - name: https-webhook + port: 443 + targetPort: 8443 + selector: + role: operator-webhook +--- +# Source: dataplane/charts/prometheus/charts/kube-state-metrics/templates/service.yaml apiVersion: v1 kind: Service metadata: name: release-name-kube-state-metrics namespace: union labels: - helm.sh/chart: kube-state-metrics-5.30.1 + helm.sh/chart: kube-state-metrics-5.25.1 app.kubernetes.io/managed-by: Helm app.kubernetes.io/component: metrics app.kubernetes.io/part-of: kube-state-metrics app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" + app.kubernetes.io/version: "2.13.0" annotations: prometheus.io/scrape: 'true' spec: @@ -5026,28 +5223,32 @@ spec: app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/instance: release-name --- -# Source: dataplane/charts/opencost/templates/service.yaml +# Source: dataplane/charts/prometheus/templates/service.yaml apiVersion: v1 kind: Service metadata: - name: release-name-opencost - namespace: union labels: - helm.sh/chart: opencost-1.42.0 - app.kubernetes.io/name: opencost + 
app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "1.111.0" - app.kubernetes.io/part-of: opencost + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: prometheus + name: union-operator-prometheus + namespace: union spec: + ports: + - name: http + port: 80 + protocol: TCP + targetPort: 9090 selector: - app.kubernetes.io/name: opencost + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name + sessionAffinity: None type: "ClusterIP" - ports: - - name: http - port: 9003 - targetPort: 9003 --- # Source: dataplane/templates/clusterresourcesync/service.yaml apiVersion: v1 @@ -5071,6 +5272,31 @@ spec: app.kubernetes.io/name: clusterresourcesync app.kubernetes.io/instance: release-name --- +# Source: dataplane/templates/flyteconnector/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: flyteconnector + namespace: union + labels: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + app.kubernetes.io/managed-by: Helm +spec: + clusterIP: None + ports: + - name: grpc + port: 8000 + protocol: TCP + targetPort: grpc + - name: metric + port: 9090 + protocol: TCP + targetPort: metric + selector: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name +--- # Source: dataplane/templates/imagebuilder/service.yaml apiVersion: v1 kind: Service @@ -5096,7 +5322,7 @@ spec: apiVersion: v1 kind: Service metadata: - name: release-name-dataplane-executor + name: union-operator-executor labels: platform.union.ai/prometheus-group: "union-services" app: executor @@ -5143,58 +5369,33 @@ spec: # Source: dataplane/templates/operator/service.yaml apiVersion: v1 kind: Service -metadata: - name: union-operator - labels: - platform.union.ai/prometheus-group: "union-services" - 
app.kubernetes.io/name: union-operator - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm -spec: - type: ClusterIP - ports: - - port: 10254 - targetPort: debug - protocol: TCP - name: debug - selector: - app.kubernetes.io/name: union-operator - app.kubernetes.io/instance: release-name ---- -# Source: dataplane/templates/prometheus/service.yaml -apiVersion: v1 -kind: Service -metadata: - name: union-operator-prometheus - namespace: union +metadata: + name: union-operator labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane + platform.union.ai/prometheus-group: "union-services" + app.kubernetes.io/name: union-operator app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" + platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus spec: type: ClusterIP ports: - - port: 80 - targetPort: 9090 + - port: 10254 + targetPort: debug protocol: TCP - name: http + name: debug selector: - app.kubernetes.io/component: prometheus - app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/name: union-operator app.kubernetes.io/instance: release-name --- -# Source: dataplane/templates/propeller/service-webhook.yaml +# Source: dataplane/templates/webhook/service.yaml apiVersion: v1 kind: Service metadata: - name: flyte-pod-webhook + name: union-pod-webhook namespace: union labels: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm @@ -5202,7 +5403,7 @@ metadata: projectcontour.io/upstream-protocol.h2c: grpc spec: selector: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name ports: - name: https @@ -5214,23 +5415,23 @@ spec: port: 10254 
targetPort: 10254 --- -# Source: dataplane/templates/propeller/service-webhook.yaml +# Source: dataplane/templates/webhook/service.yaml # Headless Service for cache invalidation — resolves to all pod IPs so that # we can fan out invalidation requests to every webhook replica. apiVersion: v1 kind: Service metadata: - name: flyte-pod-webhook-headless + name: union-pod-webhook-headless namespace: union labels: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm spec: clusterIP: None selector: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name ports: - name: cache-internal @@ -5238,32 +5439,6 @@ spec: port: 9443 targetPort: 9443 --- -# Source: dataplane/templates/propeller/service.yaml -apiVersion: v1 -kind: Service -metadata: - namespace: union - name: flytepropeller - labels: - platform.union.ai/prometheus-group: "union-services" - app.kubernetes.io/name: flytepropeller - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm -spec: - type: ClusterIP - ports: - - name: debug - protocol: TCP - port: 10254 - - name: fasttask - port: 15605 - protocol: TCP - targetPort: 15605 - selector: - app.kubernetes.io/name: flytepropeller - app.kubernetes.io/instance: release-name ---- # Source: dataplane/charts/dcgm-exporter/templates/daemonset.yaml # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 
# @@ -5594,20 +5769,217 @@ spec: tolerations: - operator: Exists --- -# Source: dataplane/charts/kube-state-metrics/templates/deployment.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: operator-webhook + namespace: "union" + labels: + app.kubernetes.io/component: operator-webhook + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +spec: + selector: + matchLabels: + app: operator-webhook + role: operator-webhook + template: + metadata: + labels: + app: operator-webhook + role: operator-webhook + app.kubernetes.io/component: operator-webhook + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator + sidecar.istio.io/inject: "false" + spec: + # To avoid node becoming SPOF, spread our replicas to different nodes. + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - podAffinityTerm: + labelSelector: + matchLabels: + app: operator-webhook + topologyKey: kubernetes.io/hostname + weight: 100 + serviceAccountName: operator-webhook + containers: + - name: operator-webhook + # This is the Go import path for the binary that is containerized + # and substituted here. 
+ image: gcr.io/knative-releases/knative.dev/operator/cmd/webhook@sha256:d3a7a3304629ccfcaacec620b50736e0b4c902a6328b83369b6cbaf7e94677c9 + resources: + requests: + cpu: 100m + memory: 100Mi + limits: + cpu: 500m + memory: 500Mi + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: SYSTEM_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: CONFIG_LOGGING_NAME + value: config-logging + - name: CONFIG_OBSERVABILITY_NAME + value: config-observability + - name: WEBHOOK_NAME + value: operator-webhook + - name: WEBHOOK_PORT + value: "8443" + - name: WEBHOOK_SECRET_NAME + value: operator-webhook-certs + - name: METRICS_DOMAIN + value: knative.dev/operator + - name: KUBERNETES_MIN_VERSION + value: "" + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + runAsNonRoot: true + capabilities: + drop: + - ALL + ports: + - name: metrics + containerPort: 9090 + - name: profiling + containerPort: 8008 + - name: https-webhook + containerPort: 8443 + readinessProbe: + periodSeconds: 1 + httpGet: + scheme: HTTPS + port: 8443 + httpHeaders: + - name: k-kubelet-probe + value: "webhook" + livenessProbe: + periodSeconds: 1 + httpGet: + scheme: HTTPS + port: 8443 + httpHeaders: + - name: k-kubelet-probe + value: "webhook" + failureThreshold: 6 + initialDelaySeconds: 120 + # Our webhook should gracefully terminate by lame ducking first, set this to a sufficiently + # high value that we respect whatever value it has configured for the lame duck grace period. + terminationGracePeriodSeconds: 300 +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: knative-operator + namespace: "union" + labels: + app.kubernetes.io/name: knative-operator + app.kubernetes.io/version: "1.16.0" +spec: + replicas: 1 + selector: + matchLabels: + name: knative-operator + template: + metadata: + labels: + name: knative-operator + app.kubernetes.io/name: knative-operator + app.kubernetes.io/version: "1.16.0" + sidecar.istio.io/inject: "false" + spec: + serviceAccountName: knative-operator + containers: + - name: knative-operator + image: gcr.io/knative-releases/knative.dev/operator/cmd/operator@sha256:0b5a3532417f9c8e7b6044e23f6f67ad932a74a40a4092ad965b6f173b2fd887 + resources: + requests: + cpu: 100m + memory: 100Mi + limits: + cpu: 1000m + memory: 1000Mi + imagePullPolicy: IfNotPresent + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: SYSTEM_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: METRICS_DOMAIN + value: knative.dev/operator + - name: CONFIG_LOGGING_NAME + value: config-logging + - name: CONFIG_OBSERVABILITY_NAME + value: config-observability + - name: KUBERNETES_MIN_VERSION + value: "" + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + runAsNonRoot: true + capabilities: + drop: + - ALL + ports: + - name: metrics + containerPort: 9090 +--- +# Source: dataplane/charts/prometheus/charts/kube-state-metrics/templates/deployment.yaml apiVersion: apps/v1 kind: Deployment metadata: name: release-name-kube-state-metrics namespace: union labels: - helm.sh/chart: 
kube-state-metrics-5.30.1 + helm.sh/chart: kube-state-metrics-5.25.1 app.kubernetes.io/managed-by: Helm app.kubernetes.io/component: metrics app.kubernetes.io/part-of: kube-state-metrics app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" + app.kubernetes.io/version: "2.13.0" spec: selector: matchLabels: @@ -5620,13 +5992,13 @@ spec: template: metadata: labels: - helm.sh/chart: kube-state-metrics-5.30.1 + helm.sh/chart: kube-state-metrics-5.25.1 app.kubernetes.io/managed-by: Helm app.kubernetes.io/component: metrics app.kubernetes.io/part-of: kube-state-metrics app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" + app.kubernetes.io/version: "2.13.0" spec: automountServiceAccountToken: true hostNetwork: false @@ -5643,8 +6015,9 @@ spec: args: - --port=8080 - --resources=certificatesigningrequests,configmaps,cronjobs,daemonsets,deployments,endpoints,horizontalpodautoscalers,ingresses,jobs,leases,limitranges,mutatingwebhookconfigurations,namespaces,networkpolicies,nodes,persistentvolumeclaims,persistentvolumes,poddisruptionbudgets,pods,replicasets,replicationcontrollers,resourcequotas,secrets,services,statefulsets,storageclasses,validatingwebhookconfigurations,volumeattachments + - --namespaces=union imagePullPolicy: IfNotPresent - image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.15.0 + image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.13.0 ports: - containerPort: 8080 name: "http" @@ -5664,7 +6037,7 @@ spec: httpGet: httpHeaders: path: /readyz - port: 8081 + port: 8080 scheme: HTTP initialDelaySeconds: 5 periodSeconds: 10 @@ -5679,201 +6052,204 @@ spec: - ALL readOnlyRootFilesystem: true --- -# Source: dataplane/charts/opencost/templates/deployment.yaml +# Source: dataplane/charts/prometheus/templates/deploy.yaml apiVersion: apps/v1 kind: Deployment metadata: - name: release-name-opencost - namespace: 
union labels: - helm.sh/chart: opencost-1.42.0 - app.kubernetes.io/name: opencost + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "1.111.0" - app.kubernetes.io/part-of: opencost + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: prometheus + name: union-operator-prometheus + namespace: union spec: - replicas: 1 selector: matchLabels: - app.kubernetes.io/name: opencost + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name - strategy: - rollingUpdate: - maxSurge: 1 - maxUnavailable: 1 - type: RollingUpdate + replicas: 1 + revisionHistoryLimit: 10 + strategy: + type: Recreate + rollingUpdate: null template: metadata: labels: - app.kubernetes.io/name: opencost + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: prometheus spec: - serviceAccountName: release-name-opencost + enableServiceLinks: true + serviceAccountName: union-operator-prometheus containers: - - name: release-name-opencost - image: ghcr.io/opencost/opencost:1.111.0@sha256:6aa68e52a24b14ba41f23db08d1b9db1429a1c0300f4c0381ecc2c61fc311a97 - imagePullPolicy: IfNotPresent + - name: prometheus-server-configmap-reload + image: "quay.io/prometheus-operator/prometheus-config-reloader:v0.76.0" + imagePullPolicy: "IfNotPresent" args: + - --watched-dir=/etc/config + - --listen-address=0.0.0.0:8080 + - --reload-url=http://127.0.0.1:9090/prometheus/-/reload ports: - - containerPort: 9003 - name: http - resources: - limits: - cpu: 1000m - memory: 4Gi - requests: - cpu: 500m - memory: 1Gi - startupProbe: - httpGet: - path: /healthz - port: 9003 - initialDelaySeconds: 10 - 
periodSeconds: 5 - failureThreshold: 30 + - containerPort: 8080 + name: metrics livenessProbe: httpGet: path: /healthz - port: 9003 - initialDelaySeconds: 10 - periodSeconds: 20 - failureThreshold: 3 + port: metrics + scheme: HTTP + initialDelaySeconds: 2 + periodSeconds: 10 readinessProbe: httpGet: path: /healthz - port: 9003 - initialDelaySeconds: 10 + port: metrics + scheme: HTTP periodSeconds: 10 + volumeMounts: + - name: config-volume + mountPath: /etc/config + readOnly: true + + - name: prometheus-server + image: "quay.io/prometheus/prometheus:v2.54.1" + imagePullPolicy: "IfNotPresent" + args: + - --storage.tsdb.retention.time=3d + - --config.file=/etc/config/prometheus.yml + - --storage.tsdb.path=/data + - --web.console.libraries=/etc/prometheus/console_libraries + - --web.console.templates=/etc/prometheus/consoles + - --web.enable-lifecycle + - --web.route-prefix=/prometheus + - --web.external-url=/prometheus/ + ports: + - containerPort: 9090 + readinessProbe: + httpGet: + path: /prometheus/-/ready + port: 9090 + scheme: HTTP + initialDelaySeconds: 30 + periodSeconds: 5 + timeoutSeconds: 4 failureThreshold: 3 - env: - - name: LOG_LEVEL - value: info - - name: CUSTOM_COST_ENABLED - value: "false" - - name: KUBECOST_NAMESPACE - value: union - - name: API_PORT - value: "9003" - - name: PROMETHEUS_SERVER_ENDPOINT - value: "http://union-operator-prometheus.union.svc:80/prometheus" - - name: CLUSTER_ID - value: "default-cluster" - - name: DATA_RETENTION_DAILY_RESOLUTION_DAYS - value: "15" - - name: CLOUD_COST_ENABLED - value: "false" - - name: CLOUD_COST_MONTH_TO_DATE_INTERVAL - value: "6" - - name: CLOUD_COST_REFRESH_RATE_HOURS - value: "6" - - name: CLOUD_COST_QUERY_WINDOW_DAYS - value: "7" - - name: CLOUD_COST_RUN_WINDOW_DAYS - value: "3" - # Add any additional provided variables + successThreshold: 1 + livenessProbe: + httpGet: + path: /prometheus/-/healthy + port: 9090 + scheme: HTTP + initialDelaySeconds: 30 + periodSeconds: 15 + timeoutSeconds: 10 + 
failureThreshold: 3 + successThreshold: 1 + resources: + limits: + cpu: "3" + memory: 3500Mi + requests: + cpu: "1" + memory: 1Gi + volumeMounts: + - name: config-volume + mountPath: /etc/config + - name: storage-volume + mountPath: /data + subPath: "" + - name: server-recording-rules + mountPath: /etc/config/recording + subPath: + readOnly: + dnsPolicy: ClusterFirst + securityContext: + fsGroup: 65534 + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + terminationGracePeriodSeconds: 300 + volumes: + - name: config-volume + configMap: + name: union-operator-prometheus + - name: server-recording-rules + configMap: + name: union-recording-rules + - name: storage-volume + emptyDir: + sizeLimit: 10Gi --- -# Source: dataplane/templates/clusterresourcesync/deployment.yaml +# Source: dataplane/templates/flyteconnector/deployment.yaml apiVersion: apps/v1 kind: Deployment metadata: - name: union-syncresources + name: flyteconnector namespace: union - labels: - app.kubernetes.io/name: clusterresourcesync + labels: + app.kubernetes.io/name: flyteconnector app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm spec: - replicas: 1 + replicas: 2 selector: - matchLabels: - app.kubernetes.io/name: clusterresourcesync + matchLabels: + app.kubernetes.io/name: flyteconnector app.kubernetes.io/instance: release-name template: metadata: annotations: - configChecksum: "50c23f588ae08100ced2921cd3daeb4d70435c050a32bbc24751761fe7fe89e" - labels: platform.union.ai/zone: "dataplane" - app.kubernetes.io/name: clusterresourcesync + app.kubernetes.io/name: flyteconnector app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm spec: containers: - command: - - clusterresource - - --config - - /etc/flyte/config/*.yaml - - clusterresource - - run - env: - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_NAMESPACE - 
valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: GOMEMLIMIT - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.memory - - name: GOMAXPROCS - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.cpu - - name: CLUSTER_NAME - valueFrom: - secretKeyRef: - name: operator-cluster-name - key: cluster_name - - name: DEPLOYMENT_NAME - value: operator - - name: PROXY_SERVICE_URL - value: http://union-operator-proxy:8080 - - name: PROMETHEUS_SERVICE_URL - value: http://union-operator-prometheus:80 - - name: KNATIVE_PROXY_SERVICE_URL - value: http://kourier-internal - image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" + - c0 + image: "ghcr.io/flyteorg/flyte-connectors:py3.13-2.0.0b50.dev3-g695bb1db3.d20260122" imagePullPolicy: "IfNotPresent" - name: sync-cluster-resources - resources: - limits: - cpu: "1" - memory: 500Mi - requests: - cpu: 500m - memory: 100Mi - volumeMounts: - - name: auth - mountPath: /etc/union/secret - - name: resource-templates - mountPath: /etc/flyte/clusterresource/templates - - name: config-volume - mountPath: /etc/flyte/config + name: flyteconnector + readinessProbe: + failureThreshold: 3 + httpGet: + path: / + port: 9090 + scheme: HTTP + initialDelaySeconds: 5 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 5 + env: + - name: AIRFLOW_CONN_GOOGLE_CLOUD_DEFAULT + value: "google-cloud-platform://" + - name: LOG_LEVEL + value: "20" ports: - - name: debug - containerPort: 10254 - protocol: TCP - serviceAccountName: union-clustersync-system - volumes: - - configMap: - name: union-clusterresource-template - name: resource-templates - - configMap: - name: union-clusterresourcesync-config - name: config-volume - - name: auth - secret: - secretName: union-secret-auth + - containerPort: 8000 + name: grpc + - containerPort: 9090 + name: metric + resources: + limits: + cpu: "1.5" + ephemeral-storage: 100Mi + memory: 1500Mi + requests: + cpu: "1" + ephemeral-storage: 100Mi + memory: 1000Mi + 
serviceAccountName: flyteconnector --- # Source: dataplane/templates/imagebuilder/deployment.yaml apiVersion: apps/v1 @@ -5895,17 +6271,15 @@ spec: app.kubernetes.io/instance: release-name template: metadata: - annotations: - container.apparmor.security.beta.kubernetes.io/buildkit: unconfined labels: platform.union.ai/zone: "dataplane" app.kubernetes.io/name: imagebuilder-buildkit app.kubernetes.io/instance: release-name spec: - serviceAccountName: "union-imagebuilder" + serviceAccountName: "union-system" containers: - name: "buildkit" - image: "docker.io/moby/buildkit:buildx-stable-1-rootless" + image: "docker.io/moby/buildkit:buildx-stable-1" imagePullPolicy: IfNotPresent env: - name: POD_NAME @@ -5940,18 +6314,15 @@ spec: - name: KNATIVE_PROXY_SERVICE_URL value: http://kourier-internal volumeMounts: - - mountPath: /home/user/.local/share/buildkit - name: buildkitd - mountPath: /etc/buildkit name: buildkit-config args: - --config - /etc/buildkit/buildkitd.toml - --addr - - unix:///run/user/1000/buildkit/buildkitd.sock + - unix:///run//buildkit/buildkitd.sock - --addr - tcp://0.0.0.0:1234 - - --oci-worker-no-process-sandbox ports: - name: tcp containerPort: 1234 @@ -5973,18 +6344,13 @@ spec: initialDelaySeconds: 5 periodSeconds: 30 securityContext: - seccompProfile: # Needs Kubernetes >= 1.19 - type: Unconfined - runAsUser: 1000 - runAsGroup: 1000 + privileged: true resources: requests: - cpu: 1 - ephemeral-storage: 20Gi - memory: 1Gi + cpu: 4 + ephemeral-storage: 50Gi + memory: 4Gi volumes: - - name: buildkitd - emptyDir: {} - configMap: name: union-operator-buildkit name: buildkit-config @@ -6013,16 +6379,18 @@ spec: template: metadata: annotations: - configChecksum: "19d5b93c874f9b21236bb2f5e2f27717fd6bbd6918af0867e806cc1044a9ced" + configChecksum: "8eb2b32d034fb0f8963dcdcb4544781f80d59220bd79e6f6b9d9beee2a9f3fb" labels: platform.union.ai/zone: "dataplane" app: executor + app.kubernetes.io/instance: 'release-name' + app.kubernetes.io/name: executor spec: 
securityContext: fsGroup: 1337 - serviceAccountName: executor + serviceAccountName: union-system volumes: - name: config-volume configMap: @@ -6038,7 +6406,7 @@ spec: image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" imagePullPolicy: IfNotPresent command: - - executor + - executorv2 - serve - --config - /etc/config/*.yaml @@ -6116,7 +6484,7 @@ spec: template: metadata: annotations: - configChecksum: "a5ffc86bfe0989da8a70dae84d7c2e240cadb8f9fc98d22f1d6dee38415670f" + configChecksum: "e3f97d434a07515ae9564430aa1a80b3a89374b92a7241fe2c2fdb64266e1ce" labels: @@ -6131,12 +6499,10 @@ spec: sources: - configMap: name: union-operator - - configMap: - name: union-clusterresourcesync-config - name: secret-volume secret: secretName: union-secret-auth - serviceAccountName: proxy-system + serviceAccountName: union-system securityContext: {} containers: @@ -6253,7 +6619,7 @@ spec: template: metadata: annotations: - configChecksum: "a5ffc86bfe0989da8a70dae84d7c2e240cadb8f9fc98d22f1d6dee38415670f" + configChecksum: "e3f97d434a07515ae9564430aa1a80b3a89374b92a7241fe2c2fdb64266e1ce" labels: @@ -6262,7 +6628,7 @@ spec: platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm spec: - serviceAccountName: operator-system + serviceAccountName: union-system securityContext: {} volumes: @@ -6343,81 +6709,16 @@ spec: containerPort: 10254 protocol: TCP --- -# Source: dataplane/templates/prometheus/deployment.yaml -apiVersion: apps/v1 -kind: Deployment -metadata: - name: union-operator-prometheus - namespace: union - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus -spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/component: prometheus - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - template: - 
metadata: - annotations: - configChecksum: "419ca35014526f736b956303f4160d73038d8a65f6d174286647884e21e6b4e" - labels: - app.kubernetes.io/component: prometheus - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - spec: - priorityClassName: system-cluster-critical - serviceAccountName: union-operator-prometheus - securityContext: - fsGroup: 65534 - runAsNonRoot: true - runAsUser: 65534 - fsGroupChangePolicy: OnRootMismatch - containers: - - name: prometheus - image: "prom/prometheus:v3.3.1" - args: - - --config.file=/etc/prometheus/prometheus.yml - - --web.external-url=/prometheus/ - - --web.route-prefix=/prometheus/ - - --storage.tsdb.retention.time=3d - ports: - - name: http - containerPort: 9090 - protocol: TCP - resources: - limits: - cpu: "3" - memory: 3500Mi - requests: - cpu: "1" - memory: 1Gi - securityContext: - allowPrivilegeEscalation: false - terminationMessagePolicy: FallbackToLogsOnError - volumeMounts: - - mountPath: /etc/prometheus - name: prometheus-config - volumes: - - name: prometheus-config - configMap: - name: union-operator-prometheus ---- -# Source: dataplane/templates/propeller/deployment-webhook.yaml +# Source: dataplane/templates/webhook/deployment.yaml +# Webhook deployment # Create the actual deployment apiVersion: apps/v1 kind: Deployment metadata: - name: flytepropeller-webhook + name: union-pod-webhook namespace: union labels: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm @@ -6425,19 +6726,19 @@ spec: replicas: 1 selector: matchLabels: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name template: metadata: labels: platform.union.ai/zone: "dataplane" - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook 
app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm annotations: - configChecksum: "6d56ed108e8a492245bac427b50799a1ec346824c1260acfae6306ea9ec1ce5" + configChecksum: "77958aee0464eee4cb43c70036cf5ffa89f47f58c908845b9eeaecc7e32bba4" spec: securityContext: @@ -6445,64 +6746,8 @@ spec: fsGroupChangePolicy: Always runAsNonRoot: true runAsUser: 1001 - seLinuxOptions: - type: spc_t - serviceAccountName: flytepropeller-webhook-system - initContainers: - - name: generate-secrets - image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" - imagePullPolicy: "IfNotPresent" - command: - - flytepropeller - args: - - webhook - - init-certs - - --config - - /etc/flyte/config/*.yaml - env: - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: GOMEMLIMIT - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.memory - - name: GOMAXPROCS - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.cpu - - name: CLUSTER_NAME - valueFrom: - secretKeyRef: - name: operator-cluster-name - key: cluster_name - - name: DEPLOYMENT_NAME - value: operator - - name: PROXY_SERVICE_URL - value: http://union-operator-proxy:8080 - - name: PROMETHEUS_SERVICE_URL - value: http://union-operator-prometheus:80 - - name: KNATIVE_PROXY_SERVICE_URL - value: http://kourier-internal - volumeMounts: - - name: config-volume - mountPath: /etc/flyte/config - resources: - limits: - cpu: 1 - ephemeral-storage: 500Mi - memory: 500Mi - requests: - cpu: 200m - ephemeral-storage: 500Mi - memory: 500Mi + seLinuxOptions: null + serviceAccountName: union-system containers: - name: webhook image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" @@ -6567,107 +6812,85 @@ spec: volumes: - name: config-volume configMap: - name: flyte-propeller-config + name: union-pod-webhook-config - name: webhook-certs secret: - secretName: 
flyte-pod-webhook + secretName: union-pod-webhook --- -# Source: dataplane/templates/propeller/deployment.yaml -apiVersion: apps/v1 -kind: Deployment +# Source: dataplane/templates/flyteconnector/hpa.yaml +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler metadata: - namespace: union - name: flytepropeller + name: flyteconnector labels: - app.kubernetes.io/name: flytepropeller + app.kubernetes.io/name: flyteconnector app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/name: flytepropeller - app.kubernetes.io/instance: release-name - template: - metadata: - annotations: - configChecksum: "6d56ed108e8a492245bac427b50799a1ec346824c1260acfae6306ea9ec1ce5" - - labels: - platform.union.ai/zone: "dataplane" - - - app.kubernetes.io/name: flytepropeller - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm - spec: - priorityClassName: system-cluster-critical - containers: - - command: - - flytepropeller - - --config - - /etc/flyte/config/*.yaml - - --propeller.cluster-id - - 'test-cluster-name' - env: - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: GOMEMLIMIT - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.memory - - name: GOMAXPROCS - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.cpu - - name: CLUSTER_NAME - valueFrom: - secretKeyRef: - name: operator-cluster-name - key: cluster_name - - name: DEPLOYMENT_NAME - value: operator - - name: PROXY_SERVICE_URL - value: http://union-operator-proxy:8080 - - name: PROMETHEUS_SERVICE_URL - value: http://union-operator-prometheus:80 - - name: KNATIVE_PROXY_SERVICE_URL - value: http://kourier-internal - image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" - 
imagePullPolicy: "IfNotPresent" - name: flytepropeller - ports: - - containerPort: 10254 - resources: - limits: - cpu: "3" - memory: 3Gi - requests: - cpu: "1" - memory: 1Gi - volumeMounts: - - name: config-volume - mountPath: /etc/flyte/config - - name: auth - mountPath: /etc/union/secret - serviceAccountName: flytepropeller-system - volumes: - - configMap: - name: flyte-propeller-config - name: config-volume - - name: auth - secret: - secretName: union-secret-auth + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: flyteconnector + minReplicas: 2 + maxReplicas: 5 + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 80 + - type: Resource + resource: + name: memory + target: + type: Utilization + averageUtilization: 80 +--- +# Source: dataplane/templates/webhook/mutatingwebhookconfiguration.yaml +apiVersion: admissionregistration.k8s.io/v1 +kind: MutatingWebhookConfiguration +metadata: + name: union-pod-webhook-test-org-name + labels: + app.kubernetes.io/name: union-pod-webhook + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +webhooks: + - name: union-pod-webhook.flyte.org + admissionReviewVersions: + - v1 + - v1beta1 + clientConfig: + caBundle: 
LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUROekNDQWgrZ0F3SUJBZ0lVSm9kQ09lem4vd1BHQTZjYkw1U3duSlpYT2hJd0RRWUpLb1pJaHZjTkFRRUwKQlFBd0t6RXBNQ2NHQTFVRUF3d2dabXg1ZEdWd2NtOXdaV3hzWlhJdGQyVmlhRzl2YXk1MWJtbHZiaTV6ZG1NdwpIaGNOTWpZd01USXpNREl6TmpVMVdoY05Nell3TVRJeE1ESXpOalUxV2pBck1Ta3dKd1lEVlFRRERDQm1iSGwwClpYQnliM0JsYkd4bGNpMTNaV0pvYjI5ckxuVnVhVzl1TG5OMll6Q0NBU0l3RFFZSktvWklodmNOQVFFQkJRQUQKZ2dFUEFEQ0NBUW9DZ2dFQkFJcjBSWDRGRjFNVTNYdFBmUHErMDUrMGI1Qm0xK3hsTVR5ZUp4bG5mWFJuRlgyegpPcHV6NmsreUpBQk4xRkxmNDVYRnlJclFlaW5DWWJtckZXalFRc3ZDWEloNkpPNGMwdmRMVVU5RERYcldWRFlsCkRNZytTUDlJVnZBMm5USkVFZ2RFREVsKzNWeThUbUJoNlNmcGthcXdHajY5SmhIdy9OeE9yZzM2bUY3TU5VZ1MKWXJVUkxqUFFzTUN6NElTVzhhQnEzS2IwRUFWUTA4V1lZQ0ZEbjY1aHBzVFdxR05rL3BKRnVTYjlsZll6RVdyYwpJeW5wNDhqTnIvNUdKS3NnWG9LeTUwZVo4UkppeTVyOUhENGtnSW15TVNQR2RDMXFZSVppRHRlZW8yOElxNk5nCmd1dWcyTGhlNndYYW9YZXFCQ2d3SHpwWFVXM1hJVzI5eFoxUkFRRUNBd0VBQWFOVE1GRXdIUVlEVlIwT0JCWUUKRktITDQ1RWhQaVg5bnlRN241QUk0YUZVNFlCL01COEdBMVVkSXdRWU1CYUFGS0hMNDVFaFBpWDlueVE3bjVBSQo0YUZVNFlCL01BOEdBMVVkRXdFQi93UUZNQU1CQWY4d0RRWUpLb1pJaHZjTkFRRUxCUUFEZ2dFQkFEeTRac21XCkNjZ3B5TmI4cU1ydzNYclBNejZLMTRlTVhqQVk1bERFYXM1NklLZExoZFM1bi9sWk95bENuUUtOZ3RHS0VrY3UKT05tMEE2Z0xNT3UzcE43dG9aNXh0WktOcnUvS0x6dm44QUNNNndCbUR0bG5oRERDa3ZUTHBVRzh6UTZSNmlaegpzTWlhVmpLSUJiWkJsZXNYSHZZQnQ2YlRTN2o1QWM2aTdYc1RPK3ZqQ2RUQjVYdCtEZ0x2QnBJRWh2ZGJzNXhmClFtc3ZzQmwxenJnR3JXemo4d0txcXNUMzZaOVU1TzdSK1RJMlQrUDVOR25SeHNQOGJ0R0RtMGZ5SW55WlJrSkcKVTdpVFJyMlZRWlRyaTNZQ2dlVjN1UjFtMnJjc2hyU1Avb082T0QyTXR5Q1FMRWtFK1pxVUVJVnpSYmVaNXl2RQptM29YMFV2OUtBUy9pWTA9Ci0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K + service: + name: union-pod-webhook + namespace: union + path: /mutate--v1-pod/secrets + port: 443 + failurePolicy: Fail + matchPolicy: Equivalent + namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: union + objectSelector: + matchLabels: + inject-flyte-secrets: "true" + organization: 'test-org-name' + reinvocationPolicy: Never + rules: + - apiGroups: + - '*' + apiVersions: + - v1 + operations: + - CREATE + 
resources: + - pods + scope: '*' + sideEffects: NoneOnDryRun + timeoutSeconds: 30 --- # Source: dataplane/charts/dcgm-exporter/templates/tls-secret.yaml # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. @@ -6699,6 +6922,209 @@ spec: # See the License for the specific language governing permissions and # limitations under the License. --- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Imported via https://github.com/knative/operator/releases/download/knative-v1.16.0/operator.yaml +# Copyright 2024 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +# Source: dataplane/templates/propeller/serviceaccount.yaml +--- +--- +# Source: dataplane/templates/serving/knative-serving.yaml +apiVersion: operator.knative.dev/v1beta1 +kind: KnativeServing +metadata: + name: union-operator-serving + namespace: union + labels: + helm.sh/chart: dataplane-2026.4.4 + app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: serving +spec: + config: + deployment: + progress-deadline: "30m" + queue-sidecar-cpu-request: "25m" + queue-sidecar-cpu-limit: "1000m" + queue-sidecar-memory-request: "400Mi" + queue-sidecar-memory-limit: "800Mi" + queue-sidecar-ephemeral-storage-request: "512Mi" + queue-sidecar-ephemeral-storage-limit: "1024Mi" + registries-skipping-tag-resolving: managed.cr.union.ai + features: + kubernetes.podspec-affinity: "enabled" + kubernetes.podspec-nodeselector: "enabled" + kubernetes.podspec-tolerations: "enabled" + kubernetes.podspec-fieldref: "enabled" + kubernetes.podspec-dnspolicy: "enabled" + kubernetes.podspec-schedulername: "enabled" + kubernetes.podspec-securitycontext: "enabled" + network: + ingress-class: "kourier.ingress.networking.knative.dev" + high-availability: + replicas: 2 + ingress: + kourier: + enabled: true + bootstrap-configmap: "union-operator-serving-envoy-bootstrap" + service-type: ClusterIP + podDisruptionBudgets: + - name: 3scale-kourier-gateway-pdb + minAvailable: 50% + - name: activator-pdb + minAvailable: 50% + - name: webhook-pdb + minAvailable: 50% + registry: + override: + # TODO(jeev): Wire up Union fork of Envoy + 3scale-kourier-gateway/kourier-gateway: ghcr.io/unionai/envoy:456fed84d4ad9a9dfb186d117d9362e9dc0f7c1f + # TODO(jeev): Wire up Union fork of Kourier + net-kourier-controller/controller: ghcr.io/unionai/kourier@sha256:5804c348d15b3959604e3e3ceed216c3a1c7b32cbe254c7d3eb02a35e62ba9c4 + workloads: + - name: 3scale-kourier-gateway + 
labels: + helm.sh/chart: dataplane-2026.4.4 + app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: 3scale-kourier-gateway + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchLabels: + app.kubernetes.io/name: 3scale-kourier-gateway + app.kubernetes.io/instance: release-name + topologyKey: topology.kubernetes.io/zone + annotations: + checksum/bootstrap-config: 6469f6f592eebc9e0c676d8ccc359a05bc799c0988e474b2da942c4cc328f656 + env: + - container: kourier-gateway + envVars: + - name: UNION_AUTHZ_TENANTAUTHURL + value: "https://test-controlplane-host/me" + - name: UNION_AUTHZ_TENANTAUTHSIGNINURL + value: "https://test-controlplane-host/login" + - name: UNION_AUTHZ_TENANTCONTROLPLANEURL + value: "https://test-controlplane-host" + resources: + - container: kourier-gateway + limits: + cpu: "2" + memory: 2Gi + requests: + cpu: "1" + memory: 1Gi + - name: activator + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - activator + topologyKey: topology.kubernetes.io/zone + - name: autoscaler + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - autoscaler + topologyKey: topology.kubernetes.io/zone + - name: autoscaler-hpa + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - autoscaler-hpa + topologyKey: topology.kubernetes.io/zone + - name: controller + affinity: + podAntiAffinity: + 
preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - controller + topologyKey: topology.kubernetes.io/zone + - name: net-kourier-controller + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - net-kourier-controller + topologyKey: topology.kubernetes.io/zone + env: + - container: controller + envVars: + - name: KOURIER_UNION_AUTHZ_ENABLED + value: "true" + resources: + - container: controller + limits: + cpu: "1" + memory: 1Gi + requests: + cpu: 500m + memory: 500Mi + - name: webhook + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - webhook + topologyKey: topology.kubernetes.io/zone +--- # Source: dataplane/templates/common/extra-manifests.yaml apiVersion: karpenter.sh/v1 kind: NodePool @@ -6756,6 +7182,19 @@ spec: value: "true" weight: 10 --- +# Source: dataplane/templates/common/task-podtemplate.yaml +apiVersion: v1 +kind: PodTemplate +metadata: + name: task-template + namespace: union +template: + spec: + serviceAccountName: union + containers: + - name: default + image: docker.io/rwgrim/docker-noop +--- # Source: dataplane/charts/fluentbit/templates/tests/test-connection.yaml apiVersion: v1 kind: Pod diff --git a/tests/generated/dataplane.aws.with-ingress.yaml b/tests/generated/dataplane.aws.with-ingress.yaml index e38af92e..8a9b7668 100644 --- a/tests/generated/dataplane.aws.with-ingress.yaml +++ b/tests/generated/dataplane.aws.with-ingress.yaml @@ -1,167 +1,140 @@ --- -# Source: dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: flytesnacks-development ---- -# Source: dataplane/templates/common/namespaces.yaml -apiVersion: 
v1 -kind: Namespace -metadata: - name: flytesnacks-staging ---- -# Source: dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: flytesnacks-production ---- -# Source: dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: union-health-monitoring-development ---- -# Source: dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: union-health-monitoring-staging ---- -# Source: dataplane/templates/common/namespaces.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + apiVersion: v1 -kind: Namespace +kind: ServiceAccount metadata: - name: union-health-monitoring-production + name: knative-operator + namespace: "union" + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator --- -# Source: dataplane/charts/fluentbit/templates/serviceaccount.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + apiVersion: v1 kind: ServiceAccount metadata: - name: fluentbit-system - namespace: union + name: operator-webhook + namespace: "union" labels: - helm.sh/chart: fluentbit-0.48.9 - app.kubernetes.io/name: fluentbit - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "3.2.8" - app.kubernetes.io/managed-by: Helm + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator --- -# Source: dataplane/charts/kube-state-metrics/templates/serviceaccount.yaml +# Source: dataplane/charts/prometheus/charts/kube-state-metrics/templates/serviceaccount.yaml apiVersion: v1 kind: ServiceAccount automountServiceAccountToken: true metadata: labels: - helm.sh/chart: kube-state-metrics-5.30.1 + helm.sh/chart: kube-state-metrics-5.25.1 app.kubernetes.io/managed-by: Helm app.kubernetes.io/component: metrics app.kubernetes.io/part-of: kube-state-metrics app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" + app.kubernetes.io/version: "2.13.0" name: release-name-kube-state-metrics namespace: union --- -# Source: dataplane/charts/opencost/templates/serviceaccount.yaml +# Source: dataplane/charts/prometheus/templates/serviceaccount.yaml apiVersion: v1 kind: ServiceAccount metadata: - name: release-name-opencost - namespace: union labels: - helm.sh/chart: opencost-1.42.0 - app.kubernetes.io/name: opencost + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "1.111.0" - 
app.kubernetes.io/part-of: opencost + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 app.kubernetes.io/managed-by: Helm -automountServiceAccountToken: true ---- -# Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: union-clustersync-system + app.kubernetes.io/part-of: prometheus + name: union-operator-prometheus namespace: union annotations: - eks.amazonaws.com/role-arn: arn:aws:iam::000000000000:role/union-flyte-role ---- -# Source: dataplane/templates/imagebuilder/serviceaccount.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: union-imagebuilder + {} --- -# Source: dataplane/templates/nodeexecutor/serviceaccount.yaml +# Source: dataplane/templates/common/system-serviceaccount.yaml apiVersion: v1 kind: ServiceAccount metadata: - name: executor + name: union-system namespace: union - labels: - app: executor - annotations: - eks.amazonaws.com/role-arn: arn:aws:iam::000000000000:role/union-flyte-role ---- -# Source: dataplane/templates/operator/serviceaccount-proxy.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: proxy-system - labels: - app.kubernetes.io/name: operator-proxy - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm annotations: eks.amazonaws.com/role-arn: arn:aws:iam::000000000000:role/union-flyte-role --- -# Source: dataplane/templates/operator/serviceaccount.yaml +# Source: dataplane/templates/common/union-serviceaccount.yaml apiVersion: v1 kind: ServiceAccount metadata: - name: operator-system - labels: - app.kubernetes.io/name: union-operator - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm + name: union + namespace: union annotations: eks.amazonaws.com/role-arn: arn:aws:iam::000000000000:role/union-flyte-role +automountServiceAccountToken: true --- -# Source: 
dataplane/templates/prometheus/serviceaccount.yaml +# Source: dataplane/templates/flyteconnector/serviceaccount.yaml apiVersion: v1 kind: ServiceAccount metadata: - name: union-operator-prometheus + name: flyteconnector namespace: union - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane + labels: + app.kubernetes.io/name: flyteconnector app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus ---- -# Source: dataplane/templates/propeller/serviceaccount-webhook.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: flytepropeller-webhook-system - namespace: union - annotations: - eks.amazonaws.com/role-arn: arn:aws:iam::000000000000:role/union-flyte-role --- -# Source: dataplane/templates/propeller/serviceaccount.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + apiVersion: v1 -kind: ServiceAccount +kind: Secret metadata: - name: flytepropeller-system - namespace: union - annotations: - eks.amazonaws.com/role-arn: arn:aws:iam::000000000000:role/union-flyte-role + name: operator-webhook-certs + namespace: "union" + labels: + app.kubernetes.io/component: operator-webhook + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +# The data is populated at install time. 
--- # Source: dataplane/templates/common/auth-secret.yaml apiVersion: v1 @@ -184,151 +157,533 @@ type: Opaque data: cluster_name: dW5pb24tYXdz --- -# Source: dataplane/templates/propeller/deployment-webhook.yaml -# Create an empty secret that the first propeller pod will populate +# Source: dataplane/templates/webhook/mutatingwebhookconfiguration.yaml apiVersion: v1 kind: Secret metadata: - name: flyte-pod-webhook + name: union-pod-webhook namespace: union + labels: + app.kubernetes.io/name: union-pod-webhook + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm type: Opaque +data: + ca.crt: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUROekNDQWgrZ0F3SUJBZ0lVSm9kQ09lem4vd1BHQTZjYkw1U3duSlpYT2hJd0RRWUpLb1pJaHZjTkFRRUwKQlFBd0t6RXBNQ2NHQTFVRUF3d2dabXg1ZEdWd2NtOXdaV3hzWlhJdGQyVmlhRzl2YXk1MWJtbHZiaTV6ZG1NdwpIaGNOTWpZd01USXpNREl6TmpVMVdoY05Nell3TVRJeE1ESXpOalUxV2pBck1Ta3dKd1lEVlFRRERDQm1iSGwwClpYQnliM0JsYkd4bGNpMTNaV0pvYjI5ckxuVnVhVzl1TG5OMll6Q0NBU0l3RFFZSktvWklodmNOQVFFQkJRQUQKZ2dFUEFEQ0NBUW9DZ2dFQkFJcjBSWDRGRjFNVTNYdFBmUHErMDUrMGI1Qm0xK3hsTVR5ZUp4bG5mWFJuRlgyegpPcHV6NmsreUpBQk4xRkxmNDVYRnlJclFlaW5DWWJtckZXalFRc3ZDWEloNkpPNGMwdmRMVVU5RERYcldWRFlsCkRNZytTUDlJVnZBMm5USkVFZ2RFREVsKzNWeThUbUJoNlNmcGthcXdHajY5SmhIdy9OeE9yZzM2bUY3TU5VZ1MKWXJVUkxqUFFzTUN6NElTVzhhQnEzS2IwRUFWUTA4V1lZQ0ZEbjY1aHBzVFdxR05rL3BKRnVTYjlsZll6RVdyYwpJeW5wNDhqTnIvNUdKS3NnWG9LeTUwZVo4UkppeTVyOUhENGtnSW15TVNQR2RDMXFZSVppRHRlZW8yOElxNk5nCmd1dWcyTGhlNndYYW9YZXFCQ2d3SHpwWFVXM1hJVzI5eFoxUkFRRUNBd0VBQWFOVE1GRXdIUVlEVlIwT0JCWUUKRktITDQ1RWhQaVg5bnlRN241QUk0YUZVNFlCL01COEdBMVVkSXdRWU1CYUFGS0hMNDVFaFBpWDlueVE3bjVBSQo0YUZVNFlCL01BOEdBMVVkRXdFQi93UUZNQU1CQWY4d0RRWUpLb1pJaHZjTkFRRUxCUUFEZ2dFQkFEeTRac21XCkNjZ3B5TmI4cU1ydzNYclBNejZLMTRlTVhqQVk1bERFYXM1NklLZExoZFM1bi9sWk95bENuUUtOZ3RHS0VrY3UKT05tMEE2Z0xNT3UzcE43dG9aNXh0WktOcnUvS0x6dm44QUNNNndCbUR0bG5oRERDa3ZUTHBVRzh6UTZSNmlaegpzTWlhVmpLSUJiWkJsZXNYSHZZQnQ2YlRTN2o1QWM2aTdYc1RPK3ZqQ2RUQjVYdCtEZ0x2QnBJRWh
2ZGJzNXhmClFtc3ZzQmwxenJnR3JXemo4d0txcXNUMzZaOVU1TzdSK1RJMlQrUDVOR25SeHNQOGJ0R0RtMGZ5SW55WlJrSkcKVTdpVFJyMlZRWlRyaTNZQ2dlVjN1UjFtMnJjc2hyU1Avb082T0QyTXR5Q1FMRWtFK1pxVUVJVnpSYmVaNXl2RQptM29YMFV2OUtBUy9pWTA9Ci0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K + tls.crt: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUQxakNDQXI2Z0F3SUJBZ0lVREtRN3A4MFpIdkQyUE5Ya3p3VGRLem9iUjg4d0RRWUpLb1pJaHZjTkFRRUwKQlFBd0t6RXBNQ2NHQTFVRUF3d2dabXg1ZEdWd2NtOXdaV3hzWlhJdGQyVmlhRzl2YXk1MWJtbHZiaTV6ZG1NdwpIaGNOTWpZd01USXpNREl6TmpVMVdoY05NamN3TVRJek1ESXpOalUxV2pBck1Ta3dKd1lEVlFRRERDQm1iSGwwClpYQnliM0JsYkd4bGNpMTNaV0pvYjI5ckxuVnVhVzl1TG5OMll6Q0NBU0l3RFFZSktvWklodmNOQVFFQkJRQUQKZ2dFUEFEQ0NBUW9DZ2dFQkFMNnMxUTRaZ3ZxVVdNdFp5VC9EKytEQWl6WmR0Q1lkVzArSGk4S3phQ1Vjbi9zZwo3a21NTEhDM2ZlZWtZK2ZUNENHUTd6K1l2WjRWWSt6WGljKzdrNndLbUZqdm1acEM1Rm96OWo1Q21NRk5HZmlWCnRobzVvSU9MUkIzNXRYMU1ROHUwNFVzT29QcjdMK016eEdxWXVmTFdtN1NVdjh0eEp3ZFMxZVlzMllNV1pMeU4KekI4akdWY01Qc2ZNVkdxdFhTTVljQ3hNYURKaTM1N0VEZSs3clM2U2ZtbnFtYjJxekdQM1BRRStCUm91Yk5nLwo4NCt4c253VXBLZUFpem95dFM2c2ZlOTdMUEtRTW9oVzd2TjZORFJrLy9xSmpHem9iZmZwUHU3endhWHVXMWtMCmllc2g2cVRrdlZLSTF3WFJ2WHArVU5GbjZzdDIwdEVlRks0ZzMwOENBd0VBQWFPQjhUQ0I3akFKQmdOVkhSTUUKQWpBQU1Bc0dBMVVkRHdRRUF3SUY0RENCa3dZRFZSMFJCSUdMTUlHSWdoWm1iSGwwWlhCeWIzQmxiR3hsY2kxMwpaV0pvYjI5cmdoeG1iSGwwWlhCeWIzQmxiR3hsY2kxM1pXSm9iMjlyTG5WdWFXOXVnaUJtYkhsMFpYQnliM0JsCmJHeGxjaTEzWldKb2IyOXJMblZ1YVc5dUxuTjJZNEl1Wm14NWRHVndjbTl3Wld4c1pYSXRkMlZpYUc5dmF5NTEKYm1sdmJpNXpkbU11WTJ4MWMzUmxjaTVzYjJOaGJEQWRCZ05WSFE0RUZnUVV1d25wcDZ1MTNKL0NsNnI3eDZ1Sgo5YmZ1RDhzd0h3WURWUjBqQkJnd0ZvQVVvY3Zqa1NFK0pmMmZKRHVma0FqaG9WVGhnSDh3RFFZSktvWklodmNOCkFRRUxCUUFEZ2dFQkFFMmlOa1MzczhGT3Z0cDhrVmJNZUN2amxIQytvYTVlZTFGNmowNlZ3bFJPOWdxOXJla1UKMGptWUhNWGNjWjcrZXJkNFRQRXgwK3ZqMzNhRTV3cDFmdVJJL2xIZHhuOUpJRHg2bTFQTHdVa3p1WG9EekYxLwozVXVIWU1xOGkrT2xsY0tqbmdVMUFUNmRWRCtIbFVWeHpMdjlNazdPb05SQUY5dTJHa1hiUHdCN0I1ZGRxdWxlCmtNcXBxbDhES2s4SVNVelVsTjNBb2Q5NmJUelIyV3RCVkxpTi9zRHZtTDZwZThFYm55RnBUb1BZTDlGeWZOcmIKT1ZPZEJwUkZFTG1KOHZaZWY5d1lxRmVXMTE3SGV5VDArZkxGZVNSemZUNVVWUlV
raVFBcUVJdUhHUVRrOVFabgo1WnYvNUF0eEg3WE1rcXR0VE1zMFlSUG0xYjQzY3FBeFdhND0KLS0tLS1FTkQgQ0VSVElGSUNBVEUtLS0tLQo= + tls.key: LS0tLS1CRUdJTiBQUklWQVRFIEtFWS0tLS0tCk1JSUV2UUlCQURBTkJna3Foa2lHOXcwQkFRRUZBQVNDQktjd2dnU2pBZ0VBQW9JQkFRQytyTlVPR1lMNmxGakwKV2NrL3cvdmd3SXMyWGJRbUhWdFBoNHZDczJnbEhKLzdJTzVKakN4d3QzM25wR1BuMCtBaGtPOC9tTDJlRldQcwoxNG5QdTVPc0NwaFk3NW1hUXVSYU0vWStRcGpCVFJuNGxiWWFPYUNEaTBRZCtiVjlURVBMdE9GTERxRDYreS9qCk04UnFtTG55MXB1MGxML0xjU2NIVXRYbUxObURGbVM4amN3Zkl4bFhERDdIekZScXJWMGpHSEFzVEdneVl0K2UKeEEzdnU2MHVrbjVwNnBtOXFzeGo5ejBCUGdVYUxtellQL09Qc2JKOEZLU25nSXM2TXJVdXJIM3ZleXp5a0RLSQpWdTd6ZWpRMFpQLzZpWXhzNkczMzZUN3U4OEdsN2x0WkM0bnJJZXFrNUwxU2lOY0YwYjE2ZmxEUlorckxkdExSCkhoU3VJTjlQQWdNQkFBRUNnZ0VBRjV2SnMzbTRMSE9DdlFic2NwOC9DQmgvQkNlOC92MGlpYW5VUmJLMkFlWlYKN3Arb3NXV1FPUktYSGIvT2VPMVVjb09DQkFOUzh3aGQrM3pDZlB5U0w3cU9HM0RyT1Z6djdqVkIxM3FpZEVpcQpId3ZXWk0vZXpuckhYOWpEdm5SYmJwVUNVaXRKQmxwa2x3S1pYc056UHB5UTRkNkxFdEw3VEo1V2lxM2g3cWQzCjhncmJoeWlWNDhZRFAyYkhpZGt0S2QvMHBNL3piSzZPTS9iMFpLUmRBWVpHQXlOcklMaFJ3eDNsQXE2SDVqL20KUHBuNlZEdVQ3Y1h6UFhmNTgxR0IyNjQxWFJVTXRNWGVEREtaU3JudnIrbG5tV2lsYXZmNkg3ZHp4NlRBMmk0RApybCtIRVdUN1RSQUZGMnZ5SVUrYSt5RzNPVFBKRCtmVTdCWWh4Zld4aFFLQmdRRGVvcXVCbzQxK1dKU255blZnCmNWMHZoUEkwZ0dyRGQzNFdIbCtGNTZuSmxuMVVlbUQ2SUQ0TUhnNVpHWmNmd3NkRlB0aHl1VnB0OVl1SW11SGQKRHE5ZDhpWnpHUzJVbStkSWlsWjhnZXJ5Q3dON3hIbFM3SzJrc0RmRGJwYmZZSXZ1LzRnSExZRHh3YzQ0OG54ZwpWNGpCUFY2QTdYTHRqQzZzdFE2OW9uTlIwd0tCZ1FEYlFBUTRGdWpnUjJUOXh2Y01MdXhMbm1YaHE4U2NoZlM5CkRoWndDUzZIeHB1YlBxblBjQ04vQUZjbDZ1cDRXVEVZdVY4bEpwcGwwRmpYQ2lMdkhuUU15cm5vRjlUOThpanQKZGVHYjF6ZWlwRUpndFpNVHkvOGdERWorZU1wSFBuYTZwQVpMOVpRVUdLR2dDNGQ0SzljdVNHMmRlb1dSU0ViSQpMVGgzeGpXVEZRS0JnQStJRWFidG5nVmVjS0IwQTFSREZGa29VUzFRZUNKQ3g4MExPV2JDRHBvOW9XaXZVT3lpCkt6SDFOdE1JY2Y2SlBCV2NtTVVJSVVMaWltVnhTS2gvU2NTb0MvNmpsd1p6Q2VPSm94YjBpVXR4Y1VERktDR2MKMlZCUDZ0UDdkeE1HVFR6VEhzNUJZbWw3TjhQSlJ0d0J2MHliMTJmdktNRmhzaS9pUWJFQkVFSjVBb0dBUm9NSApHRmJkM0V0NXdsZzcyYkk1a25SRnhkY3RLejIxb1J1bndhNWlSWTV0T3Zkak8zQ1FLZWNkSC9lMklyQmtwdFB2Ck1vNkF0MS9UUW8xakFNNGx
lbnUwWUYxUnhiNGN4WW5VM2Y3UVNNRlZDNjg1dHZNemdNWVNyNngvT1h4d1NNUTUKdGpJcnhtN0poQ1JSRkNmZFUyZzl3SmpIM2hxRmtSbGlBTHRCUGFVQ2dZRUF2QkRHejQvYS9kRzRJT0VpREk4SgpBbkxaRWZaTFJrdnI0ejRyT0p0TGZtWlFjTW5DSUZmKzBaelhTME9XOWhvMjk5RUNtS1l4U2VUWk1WSDgzOFZCCnRvSkNGM3lzb01rSC9meEVXTUpicDlMblJwSFUzUElicHlhRG1nWW1YdmsvK3laL3Y1c0dDTWNtVGkzWVFLRGsKcStlK1o0SHZyaGUxVytUaHpwOCtoNGs9Ci0tLS0tRU5EIFBSSVZBVEUgS0VZLS0tLS0K --- -# Source: dataplane/templates/clusterresourcesync/configmap.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2019 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ apiVersion: v1 kind: ConfigMap metadata: - name: union-clusterresourcesync-config - namespace: union + name: config-logging + namespace: "union" labels: - app.kubernetes.io/name: clusterresourcesync - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm -data: - cluster_resources.yaml: | - cluster_resources: - clusterName: 'union-aws' - customData: - - production: - - projectQuotaCpu: - value: "4096" - - projectQuotaMemory: - value: 2Ti - - projectQuotaNvidiaGpu: - value: "256" - - defaultUserRoleKey: - value: 'eks.amazonaws.com/role-arn' - - defaultUserRoleValue: - value: 'arn:aws:iam::000000000000:role/union-flyte-role' - - staging: - - projectQuotaCpu: - value: "4096" - - projectQuotaMemory: - value: 2Ti - - projectQuotaNvidiaGpu: - value: "256" - - defaultUserRoleKey: - value: 'eks.amazonaws.com/role-arn' - - defaultUserRoleValue: - value: 'arn:aws:iam::000000000000:role/union-flyte-role' - - development: - - projectQuotaCpu: - value: "4096" - - projectQuotaMemory: - value: 2Ti - - projectQuotaNvidiaGpu: - value: "256" - - defaultUserRoleKey: - value: 'eks.amazonaws.com/role-arn' - - defaultUserRoleValue: - value: 'arn:aws:iam::000000000000:role/union-flyte-role' - refreshInterval: 5m - standaloneDeployment: true - templatePath: /etc/flyte/clusterresource/templates - clusterResourcesPrivate: - app: - isServerless: false - union: - auth: - authorizationMetadataKey: flyte-authorization - clientId: 'clientId' - clientSecretLocation: /etc/union/secret/client_secret - tokenRefreshWindow: 5m - type: ClientSecret - connection: - host: dns:///union.us-west-2.union.ai - admin.yaml: | - admin: - clientId: 'clientId' - clientSecretLocation: /etc/union/secret/client_secret - endpoint: dns:///union.us-west-2.union.ai - insecure: false - event: - capacity: 1000 - rate: 500 - type: admin - domain.yaml: | - domains: - - id: development - name: development - - id: staging - name: staging - - id: 
production - name: production - clusters.yaml: | - clusters: - clusterConfigs: [] - labelClusterMap: {} - logger.yaml: | - logger: - formatter: - type: json - level: 4 - show-source: true + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +data: {} --- -# Source: dataplane/templates/clusterresourcesync/configmap.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2019 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ apiVersion: v1 kind: ConfigMap metadata: - name: union-clusterresource-template - namespace: union + name: config-observability + namespace: "union" labels: - app.kubernetes.io/name: clusterresourcesync + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +data: {} +--- +# Source: dataplane/charts/prometheus/templates/cm.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + labels: + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: prometheus + name: union-operator-prometheus + namespace: union data: - a_namespace.yaml: | - apiVersion: v1 - kind: Namespace - metadata: - name: {{ namespace }} - labels: - union.ai/namespace-type: flyte - spec: - finalizers: - - kubernetes - - b_default_service_account.yaml: | - apiVersion: v1 - kind: ServiceAccount - metadata: - name: default - namespace: {{ namespace }} - annotations: - {{ defaultUserRoleKey }}: {{ defaultUserRoleValue }} + allow-snippet-annotations: "false" + alerting_rules.yml: | + {} + alerts: | + {} + prometheus.yml: | + global: + evaluation_interval: 1m + scrape_interval: 1m + scrape_timeout: 10s + rule_files: + - /etc/config/recording_rules.yml + - /etc/config/alerting_rules.yml + - /etc/config/rules + - /etc/config/alerts + scrape_configs: + - job_name: prometheus + static_configs: + - targets: + - localhost:9090 + - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + job_name: kubernetes-apiservers + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - action: keep + regex: default;kubernetes;https + source_labels: + - __meta_kubernetes_namespace + - __meta_kubernetes_service_name + - __meta_kubernetes_endpoint_port_name + scheme: https + tls_config: + ca_file: 
/var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + job_name: kubernetes-nodes + kubernetes_sd_configs: + - role: node + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - replacement: kubernetes.default.svc:443 + target_label: __address__ + - regex: (.+) + replacement: /api/v1/nodes/$1/proxy/metrics + source_labels: + - __meta_kubernetes_node_name + target_label: __metrics_path__ + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + job_name: kubernetes-nodes-cadvisor + kubernetes_sd_configs: + - role: node + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - replacement: kubernetes.default.svc:443 + target_label: __address__ + - regex: (.+) + replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor + source_labels: + - __meta_kubernetes_node_name + target_label: __metrics_path__ + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + - honor_labels: true + job_name: kubernetes-service-endpoints + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scrape + - action: drop + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) 
+ source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (.+?)(?::\d+)?;(\d+) + replacement: $1:$2 + source_labels: + - __address__ + - __meta_kubernetes_service_annotation_prometheus_io_port + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_service_name + target_label: service + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: node + - honor_labels: true + job_name: kubernetes-service-endpoints-slow + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) 
+ source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (.+?)(?::\d+)?;(\d+) + replacement: $1:$2 + source_labels: + - __address__ + - __meta_kubernetes_service_annotation_prometheus_io_port + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_service_name + target_label: service + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: node + scrape_interval: 5m + scrape_timeout: 30s + - honor_labels: true + job_name: prometheus-pushgateway + kubernetes_sd_configs: + - role: service + relabel_configs: + - action: keep + regex: pushgateway + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_probe + - honor_labels: true + job_name: kubernetes-services + kubernetes_sd_configs: + - role: service + metrics_path: /probe + params: + module: + - http_2xx + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_probe + - source_labels: + - __address__ + target_label: __param_target + - replacement: blackbox + target_label: __address__ + - source_labels: + - __param_target + target_label: instance + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - source_labels: + - __meta_kubernetes_service_name + target_label: service + - honor_labels: true + job_name: kubernetes-pods + kubernetes_sd_configs: + - role: pod + relabel_configs: + - action: 
keep + regex: true + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scrape + - action: drop + regex: true + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) + replacement: '[$2]:$1' + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: replace + regex: (\d+);((([0-9]+?)(\.|$)){4}) + replacement: $2:$1 + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_pod_name + target_label: pod + - action: drop + regex: Pending|Succeeded|Failed|Completed + source_labels: + - __meta_kubernetes_pod_phase + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: node + - honor_labels: true + job_name: kubernetes-pods-slow + kubernetes_sd_configs: + - role: pod + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) 
+ source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) + replacement: '[$2]:$1' + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: replace + regex: (\d+);((([0-9]+?)(\.|$)){4}) + replacement: $2:$1 + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_pod_name + target_label: pod + - action: drop + regex: Pending|Succeeded|Failed|Completed + source_labels: + - __meta_kubernetes_pod_phase + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: node + scrape_interval: 5m + scrape_timeout: 30s + - job_name: 'kube-state-metrics' + metrics_path: /metrics + scrape_interval: 1m + honor_labels: true + kubernetes_sd_configs: + - role: endpoints + namespaces: + own_namespace: true + relabel_configs: + - source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_name] + regex: kube-state-metrics + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + regex: http + action: keep + metric_relabel_configs: + - source_labels: [__name__] + regex: 
"kube_pod_container_resource_(limits|requests)|kube_pod_status_phase|kube_node_(labels|status_allocatable|status_condition|status_capacity)|kube_namespace_labels|kube_pod_container_status_(waiting|terminated|last_terminated).*_reason|kube_daemonset_status_number_unavailable|kube_deployment_status_replicas_unavailable|kube_resourcequota|kube_pod_info|kube_node_info|kube_pod_container_status_restarts_total" + action: keep + - source_labels: [__name__, phase] + separator: ";" + regex: "kube_pod_status_phase;(Succeeded|Failed)" + action: drop + - source_labels: [node] + target_label: nodename + regex: "(.*)" + action: replace + - source_labels: [label_node_group_name] + action: replace + regex: "(.+)" + target_label: label_node_pool_name + - job_name: 'opencost' + metrics_path: /metrics + scrape_interval: 1m + honor_labels: true + kubernetes_sd_configs: + - role: endpoints + namespaces: + own_namespace: true + relabel_configs: + - source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_name] + regex: opencost + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + regex: http + action: keep + metric_relabel_configs: + - source_labels: [__name__] + regex: "kube_node_labels|kube_pod_labels|node_total_hourly_cost|node_ram_hourly_cost|node_cpu_hourly_cost|node_gpu_hourly_cost" + action: keep + - job_name: 'union-services' + metrics_path: /metrics + scrape_interval: 1m + honor_labels: true + kubernetes_sd_configs: + - role: endpoints + namespaces: + own_namespace: true + relabel_configs: + - source_labels: [__meta_kubernetes_service_label_platform_union_ai_service_group] + regex: .+ + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + regex: debug + action: keep + # Flyte propeller metrics for execution info and fast task duration. + # No-op when propeller is not deployed (no pods match the label selector). 
+ - job_name: 'flytepropeller' + kubernetes_sd_configs: + - role: pod + namespaces: + own_namespace: true + selectors: + - role: pod + label: app.kubernetes.io/name=flytepropeller + metric_relabel_configs: + - source_labels: [__name__] + regex: "flyte:propeller:all:round:execution_info|flyte:propeller:all:node:fast_task:fast_task_execution_duration" + action: keep + relabel_configs: + - source_labels: [__meta_kubernetes_pod_container_name] + regex: flytepropeller + action: keep + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: pod + # Kourier gateway (envoy) metrics for app serving. + # No-op when serving is not enabled (no pods match the label selector). + - job_name: 'serving-envoy' + kubernetes_sd_configs: + - role: pod + namespaces: + own_namespace: true + selectors: + - role: pod + label: app=3scale-kourier-gateway + metrics_path: /stats/prometheus + metric_relabel_configs: + - source_labels: [__name__] + regex: "envoy_cluster_upstream_rq_xx|envoy_cluster_upstream_rq_time_bucket" + action: keep + relabel_configs: + - source_labels: [__meta_kubernetes_pod_container_port_name] + regex: metrics + action: keep + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: pod + # DCGM GPU metrics. + # No-op when dcgm-exporter is not deployed (no pods match the label selector). 
+ - job_name: 'gpu-metrics' + kubernetes_sd_configs: + - role: pod + namespaces: + names: + - kube-system + selectors: + - role: pod + label: app.kubernetes.io/name=dcgm-exporter - c_project_resource_quota.yaml: | - apiVersion: v1 - kind: ResourceQuota - metadata: - name: project-quota - namespace: {{ namespace }} - spec: - hard: - limits.cpu: {{ projectQuotaCpu }} - limits.memory: {{ projectQuotaMemory }} - requests.nvidia.com/gpu: {{ projectQuotaNvidiaGpu }} + recording_rules.yml: | + {} + rules: | + {} --- # Source: dataplane/templates/fluent-bit/configmap.yaml apiVersion: v1 @@ -376,6 +731,34 @@ data: region us-east-2 bucket bucket --- +# Source: dataplane/templates/imagebuilder/build-image-configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: build-image-config + labels: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +data: + storage.yaml: | + storage: + container: "bucket" + type: s3 + connection: + auth-type: iam + region: us-east-2 + enable-multicontainer: true + limits: + maxDownloadMBs: 1024 + cache: + max_size_mbs: 0 + target_gc_percent: 70 + buildkit-uri: "tcp://union-operator-buildkit.union.svc.cluster.local:1234" + default-repository: ".dkr.ecr.us-east-2.amazonaws.com/union-dataplane" + authentication-type: "aws" +--- # Source: dataplane/templates/imagebuilder/configmap.yaml apiVersion: v1 kind: ConfigMap @@ -2532,6 +2915,18 @@ data: "version": 1 } --- +# Source: dataplane/templates/monitoring/prometheusrule.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: union-recording-rules + namespace: union + labels: + release: release-name +data: + recording_rules.yml: | + groups: [] +--- # Source: dataplane/templates/nodeexecutor/configmap.yaml apiVersion: v1 kind: ConfigMap @@ -2586,8 +2981,13 @@ data: templateUris: - https://scale.neptune.ai/{{ .taskConfig.project }}/-/run/?customId={{ .taskConfig.id }} - 
kubernetes-enabled: true + kubernetes-enabled: false enabled_plugins.yaml: | + plugins: + connector-service: + defaultConnector: + defaultServiceConfig: '{"loadBalancingConfig": [{"round_robin":{}}]}' + endpoint: k8s:///flyteconnector.union:8000 tasks: task-plugins: default-for-task-types: @@ -2620,6 +3020,8 @@ data: cpu: 4096 gpu: 256 memory: 2Ti + namespace_mapping: + template: union union: connection: host: dns:///union.us-west-2.union.ai @@ -2640,7 +3042,7 @@ data: cache-endpoint: dns:///union.us-west-2.union.ai endpoint: dns:///union.us-west-2.union.ai insecure: false - type: fallback + type: cacheservicev2 use-admin-auth: true logger: formatter: @@ -2660,6 +3062,7 @@ data: secure: false useAuth: false propeller: + limit-namespace: union node-config: disable-input-file-writes: true plugins: @@ -2667,7 +3070,7 @@ data: additional-worker-args: - --last-ack-grace-period-seconds - "120" - callback-uri: http://unionai-dataplane-executor.union.svc.cluster.local:15605 + callback-uri: http://union-operator-executor.union.svc.cluster.local:15605 grace-period-status-not-found: 2m ioutils: remoteFileOutputPaths: @@ -2677,6 +3080,7 @@ data: default-cpus: 100m default-env-vars: [] default-memory: 100Mi + default-pod-template-name: task-template co-pilot: image: 'cr.flyte.org/flyteorg/flytecopilot:v1.14.1' name: flyte-copilot- @@ -2711,6 +3115,7 @@ data: default-cpus: 100m default-env-vars: [] default-memory: 100Mi + default-pod-template-name: task-template config.yaml: | union: connection: @@ -2730,11 +3135,14 @@ data: operator: enabled: true enableTunnelService: true + # enableDepot: false tunnel: - enableDirectToAppIngress: false + enableDirectToAppIngress: true deploymentToRestart: union-operator-proxy + limitNamespace: union + disableClusterPermissions: true apps: - enabled: 'false' + enabled: 'true' syncClusterConfig: enabled: false clusterId: @@ -2749,19 +3157,19 @@ data: userRole: 'arn:aws:iam::000000000000:role/union-flyte-role' userRoleKey: 
'eks.amazonaws.com/role-arn' collectUsages: - enabled: true + enabled: false billing: model: Legacy dependenciesHeartbeat: - prometheus: - endpoint: 'http://union-operator-prometheus:80/-/healthy' - propeller: - endpoint: 'http://flytepropeller:10254' + executor: + endpoint: 'http://union-operator-executor:10254' proxy: endpoint: 'http://union-operator-proxy:10254' secretsWatcher: dryRun: true enabled: false + org: + namespaceTemplate: union imageBuilder: enabled: true executionNamespaceLabels: @@ -2769,9 +3177,6 @@ data: referenceConfigmapName: union-operator targetConfigMapName: "build-image-config" proxy: - imageBuilderConfig: - authenticationType: 'noop' - defaultRepository: '' persistedLogs: objectStore: pathTemplate: namespace-{{.KubernetesNamespace}}.pod-{{.KubernetesPodName}}.cont-{{.KubernetesContainerName}} @@ -2813,989 +3218,149 @@ data: auth-type: iam region: us-east-2 image-builder.buildkit-uri: "tcp://union-operator-buildkit.union.svc.cluster.local:1234" - image-builder.default-repository: "" - image-builder.authentication-type: "noop" + image-builder.default-repository: ".dkr.ecr.us-east-2.amazonaws.com/union-dataplane" + image-builder.authentication-type: "aws" --- -# Source: dataplane/templates/prometheus/configmap.yaml +# Source: dataplane/templates/serving/bootstrap-configmap.yaml +# We need to copy the default configmap here since knative-operator does not automatically update the +# address of the `net-kourier-controller` endpoint to use the release namespace. It assumes that the +# resources are being installed into the `knative-serving` namespace instead. 
apiVersion: v1 kind: ConfigMap metadata: - name: union-operator-prometheus + name: union-operator-serving-envoy-bootstrap namespace: union - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus data: - prometheus.yml: | - global: - scrape_interval: 15s - evaluation_interval: 15s - alerting: - alertmanagers: - - static_configs: - - targets: - rule_files: - - rules.yml - scrape_configs: - # Self-monitoring - - job_name: prometheus - metrics_path: /prometheus/metrics - static_configs: - - targets: ['localhost:9090'] - metric_relabel_configs: - - source_labels: [__name__] - regex: prometheus_tsdb_head_chunks_storage_size_bytes|prometheus_tsdb_head_series|prometheus_tsdb_head_series_created_total|prometheus_tsdb_head_series_removed_total|prometheus_tsdb_head_samples_appended_total|prometheus_tsdb_storage_blocks_bytes|prometheus_tsdb_retention_limit_bytes|prometheus_tsdb_wal_(.*)|.+_(limit|failed|missed|failures)_total - action: keep + envoy-bootstrap.yaml: | + dynamic_resources: + ads_config: + transport_api_version: V3 + api_type: GRPC + rate_limit_settings: {} + grpc_services: + - envoy_grpc: {cluster_name: xds_cluster} + cds_config: + resource_api_version: V3 + ads: {} + lds_config: + resource_api_version: V3 + ads: {} + node: + cluster: kourier-knative + id: 3scale-kourier-gateway + static_resources: + listeners: + - name: stats_listener + address: + socket_address: + address: 0.0.0.0 + port_value: 9000 + filter_chains: + - filters: + - name: envoy.filters.network.http_connection_manager + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager + stat_prefix: stats_server + http_filters: + - name: envoy.filters.http.router + typed_config: + "@type": 
type.googleapis.com/envoy.extensions.filters.http.router.v3.Router + route_config: + virtual_hosts: + - name: admin_interface + domains: + - "*" + routes: + - match: + safe_regex: + regex: '/(certs|stats(/prometheus)?|server_info|clusters|listeners|ready)?' + headers: + - name: ':method' + string_match: + exact: GET + route: + cluster: service_stats + - match: + safe_regex: + regex: '/drain_listeners' + headers: + - name: ':method' + string_match: + exact: POST + route: + cluster: service_stats + clusters: + - name: service_stats + connect_timeout: 0.250s + type: static + load_assignment: + cluster_name: service_stats + endpoints: + lb_endpoints: + endpoint: + address: + socket_address: + address: 127.0.0.1 + port_value: 9901 + - name: xds_cluster + # This keepalive is recommended by envoy docs. + # https://www.envoyproxy.io/docs/envoy/latest/api-docs/xds_protocol + typed_extension_protocol_options: + envoy.extensions.upstreams.http.v3.HttpProtocolOptions: + "@type": type.googleapis.com/envoy.extensions.upstreams.http.v3.HttpProtocolOptions + explicit_http_config: + http2_protocol_options: + connection_keepalive: + interval: 30s + timeout: 5s + connect_timeout: 1s + load_assignment: + cluster_name: xds_cluster + endpoints: + lb_endpoints: + endpoint: + address: + socket_address: + address: "net-kourier-controller" + port_value: 18000 + type: STRICT_DNS + admin: + access_log: + - name: envoy.access_loggers.stdout + typed_config: + "@type": type.googleapis.com/envoy.extensions.access_loggers.stream.v3.StdoutAccessLog + address: + socket_address: + address: 127.0.0.1 + port_value: 9901 + stats_config: + stats_tags: + - tag_name: name + regex: '^.*?\.u/.*?n=(.*?)/.*?u\..*$' + - tag_name: domain + regex: '^.*?\.u/.*?d=(.*?)/.*?u\..*$' + - tag_name: org + regex: '^.*?\.u/.*?o=(.*?)/.*?u\..*$' + - tag_name: project + regex: '^.*?\.u/.*?p=(.*?)/.*?u\..*$' + - tag_name: target + regex: '^.*?\.u/.*?t=(.*?)/.*?u\..*$' + - tag_name: target_namespace + regex: 
'^.*?\.u/.*?tns=(.*?)/.*?u\..*$' +--- +# Source: dataplane/templates/webhook/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: union-pod-webhook-config + namespace: union +data: + core.yaml: | - # Kube state metrics for pod/node resource tracking and cost calculations - - job_name: kube-state-metrics - static_configs: - - targets: ['release-name-kube-state-metrics:8080'] - metric_relabel_configs: - - separator: ; - source_labels: [__name__] - regex: kube_pod_container_resource_(limits|requests)|kube_pod_status_phase|kube_node_(labels|status_allocatable|status_condition|status_capacity)|kube_namespace_labels|kube_pod_container_status_(waiting|terminated|last_terminated).*_reason|kube_daemonset_status_number_unavailable|kube_deployment_status_replicas_unavailable|kube_resourcequota|kube_pod_info|kube_node_info|kube_pod_container_status_restarts_total - action: keep - - separator: ; - source_labels: [__name__, phase] - regex: kube_pod_status_phase;(Succeeded|Failed) - action: drop - - source_labels: [node] - target_label: nodename - regex: '(.*)' - action: replace - - source_labels: [label_node_group_name] - action: replace - regex: (.+) - target_label: label_node_pool_name - - # cAdvisor container metrics for CPU and memory tracking - - job_name: kubernetes-cadvisor - metrics_path: /metrics - scheme: https - kubernetes_sd_configs: - - role: node - namespaces: - names: [] - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - insecure_skip_verify: false - metric_relabel_configs: - - separator: ; - source_labels: [__name__] - regex: container_cpu_usage_seconds_total|container_memory_working_set_bytes - action: keep - relabel_configs: - - separator: ; - regex: __meta_kubernetes_node_label_(.+) - replacement: $1 - action: labelmap - - separator: ; - regex: (.*) - target_label: __address__ - replacement: kubernetes.default.svc:443 - action: replace - - 
source_labels: [__meta_kubernetes_node_name] - separator: ; - regex: (.+) - target_label: __metrics_path__ - replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor - action: replace - - # Flyte propeller metrics for execution info and fast task duration - - job_name: flytepropeller - kubernetes_sd_configs: - - role: pod - namespaces: - names: - - union - selectors: - - role: pod - label: app.kubernetes.io/name=flytepropeller - metric_relabel_configs: - - source_labels: [__name__] - regex: "flyte:propeller:all:round:execution_info|flyte:propeller:all:node:fast_task:fast_task_execution_duration" - action: keep - relabel_configs: - - source_labels: [__meta_kubernetes_pod_container_name] - regex: flytepropeller - action: keep - - source_labels: [__meta_kubernetes_namespace] - action: replace - target_label: namespace - - source_labels: [__meta_kubernetes_pod_name] - action: replace - target_label: pod - # OpenCost metrics for cost tracking - - job_name: opencost - static_configs: - - targets: ['release-name-opencost:9003'] - metric_relabel_configs: - - source_labels: [__name__] - regex: "kube_node_labels|kube_pod_labels|node_total_hourly_cost|node_ram_hourly_cost|node_cpu_hourly_cost|node_gpu_hourly_cost" - action: keep - rules.yml: | - groups: - - name: cost_calculations_15s - interval: 15s - rules: - - record: pod_gpu_allocation - expr: | - sum by (namespace, pod) (DCGM_FI_DEV_GPU_UTIL >= bool 0) * on (namespace, pod) group_left() (max by (namespace, pod) (kube_pod_status_phase{phase=~"Running|Pending"} == 1)) - - record: execution_info # A join metric to look up execution-level info. Used to disambiguate workflow/task executions, apps, and workspaces. 
- expr: | - max by (label_domain, label_project, label_entity_name, label_execution_id, label_entity_id)( - label_replace( - label_replace( - label_replace( - label_replace( - label_replace( - flyte:propeller:all:round:execution_info{domain!="", project!="", workflow_name!="", execution_id!=""}, # filter for workflow/task executions - "label_entity_id", "$1", "execution_id", "(.*)" # join key - ), "label_entity_name", "$1", "workflow_name", "(.*)" # set label_entity_name to the workflow/task name from the workflow_execution_id - ), - "label_execution_id", "$1", "execution_id", "(.*)" - ), - "label_project", "$1", "project", "(.*)" # project - ), - "label_domain", "$1", "domain", "(.*)" # domain - ) - ) - - record: app_info # A join metric to look up app-level info. Used to disambiguate workflow/task executions, apps, and workspaces. - expr: | - max by (label_domain, label_project, label_app_name, label_app_version, label_entity_id)( - label_replace( - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps - "label_app_name", "$1", "label_serving_unionai_dev_app_name", "(.*)" # rename to cleanup - ), - "label_app_version", "$1", "label_serving_knative_dev_revision", "(.*)" # the app_version is equivalent to an execution_id for workflows (lowest level of granularity) - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # join key - ) - ) - - record: workspace_info # A join metric to look up workspace info. Used to disambiguate workflow/task executions, apps, and workspaces. 
- expr: | - max by (label_domain, label_project, label_workspace_name, label_entity_id)( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # filter for workspaces - "label_entity_id", "$1", "label_node_id", "(.*)" # join key - ), "label_workspace_name", "$1", "label_node_id", "(.*)" # set label_workspace_name to the workspace name from the kube_pod_labels - ) - ) - - record: fast_task_execution_duration - expr: | - max by (label_domain, label_project, label_execution_id, label_entity_id, namespace, pod)( - label_replace( - label_replace( - label_replace( - label_replace( - label_replace( - label_replace( - flyte:propeller:all:node:fast_task:fast_task_execution_duration{domain!="", project!="", execution_id!="", exported_namespace!="", exported_pod!=""}, - "label_entity_id", "$1", "execution_id", "(.*)" # join key - ), - "label_execution_id", "$1", "execution_id", "(.*)" - ), - "label_project", "$1", "project", "(.*)" # project - ), - "label_domain", "$1", "domain", "(.*)" # domain - ), - "namespace", "$1", "exported_namespace", "(.*)" - ), - "pod", "$1", "exported_pod", "(.*)" - ) - ) - - record: fast_task_execution_duration_rate - expr: | - irate(fast_task_execution_duration[1m]) > 0 or absent(irate(fast_task_execution_duration[1m])) * fast_task_execution_duration - - record: entity_id:mem_usage_bytes_total_per_node:sum # Allocated memory (max(requested, consumed)) aggregated per node and entity, where entity is either a task/workflow execution or an app. 
- expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( # aggregate up to entity - # First, calculate the allocated memory for each pod - max by (namespace, pod) ( # this is the case where consumed (the memory working set) exceeds requested memory - ( - sum by (namespace, pod) ( - container_memory_working_set_bytes{namespace!="",pod!="",image!=""} - ) - > sum by (namespace, pod) ( - kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="memory"} - ) - ) - or sum by (namespace, pod) ( # this is the case where memory requests are <= consumed memory - kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="memory"} # needed to add node!="" to dedupe - ) - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). Note that this is repetitive but we do not want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - 
label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), - "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="",label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - # Now join in node identifiers which are used for subsequent overhead calculations - * on (namespace, pod) group_left(node) ( - max by (namespace, pod, node) (kube_pod_info{node!=""}) # needed to add node!="" to dedupe - ) - ) - - record: entity_id:cpu_usage_per_node:sum # Allocated cpu (max(requested, consumed)) aggregated per node and entity, where entity is either a task/workflow execution or an app. 
- expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( - # First, calculate the allocated cpu for each pod - max by (namespace, pod) ( # this is the case where consumed (the cpu usage seconds total) exceeds requested cpu - ( - sum by (namespace, pod) ( - irate(container_cpu_usage_seconds_total{namespace!="",pod!="",image!=""}[5m]) - ) - > sum by (namespace, pod) ( - kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="cpu"} - ) - ) - or sum by (namespace, pod) ( # this is the case where cpu requests are <= consumed cpu - kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="cpu"} - ) - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - 
label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), - "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - # Now join in node identifiers which are used for subsequent overhead calculations - * on (namespace, pod) group_left(node) ( - max by (namespace, pod, node) (kube_pod_info{node!=""}) # needed to add node!="" to dedupe - ) - ) - - record: entity_id:gpu_usage_per_node:sum # Allocated gpu aggregated per node and entity, where entity is either a task/workflow execution or an app. 
- expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( - # First, grab the allocated gpu for each pod (which is always either 1 or zero, since k8s can't split gpus the way it can with cpu/memory) - max by (namespace, pod) ( - pod_gpu_allocation - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, 
label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), - "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - # Now join in node identifiers which are used for subsequent overhead calculations - * on (namespace, pod) group_left(node) ( - max by (namespace, pod, node) (kube_pod_info{node!=""}) # needed to add node!="" to dedupe - ) - ) - - record: entity_id:used_mem_bytes:sum # the sum of used memory across all containers in an entity (numerator for aggregate utilization calculations) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # aggregate up to entity - # First, calculate the used memory for each pod - sum by (namespace, pod) ( - container_memory_working_set_bytes{namespace!="",pod!="",image!=""} - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but we do not want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - ) - - record: entity_id:allocated_mem_bytes:sum # the sum of allocated memory across all containers in an entity (denominator for aggregate utilization calculations) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # aggregate up to entity (remove node) - entity_id:mem_usage_bytes_total_per_node:sum - ) - - record: entity_id:used_cpu:sum # the sum of used cpu across all containers in an entity (numerator for aggregate utilization calculations) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( - sum by (namespace, pod) ( - irate(container_cpu_usage_seconds_total{namespace!="",pod!="",image!=""}[5m]) - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - ) - - record: entity_id:allocated_cpu:sum # the sum of allocated cpu across all containers in an entity (denominator for aggregate utilization calculations) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # aggregate up to entity (remove node) - entity_id:cpu_usage_per_node:sum - ) - - record: entity_id:sm_occupancy:avg # the simple average of SM occupancy (a good generic measure of GPU utilization) per entity - expr: | - avg by (label_entity_type, label_domain, label_project, label_entity_id) ( - # First, grab the SM occupancy for each pod - max by (namespace, pod) ( - DCGM_FI_PROF_SM_OCCUPANCY # SM occupancy is a good proxy for actual GPU usage - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - ) - - record: entity_id:gpu_count:sum # the count of running gpu pods per entity (need this to weight the gpu utilization when aggregating upwards - i.e. project-level) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( - # First, grab the allocated gpu for each pod (which is always either 1 or zero, since k8s can't split gpus the way it can with cpu/memory) - max by (namespace, pod) ( - pod_gpu_allocation - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - ) - - record: entity_id:weighted_sm_occupancy:sum # product of SM occupancy and allocated GPU count (something like "used memory", numerator of weighted calcs) - expr: | - entity_id:sm_occupancy:avg - * on (label_domain, label_project, label_entity_type, label_entity_id) entity_id:gpu_count:sum - - record: entity_id:allocated_mem_cost:sum # Allocated cost of memory for each workflow/task execution and app. - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, type) ( - entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB - * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type - ) - - record: entity_id:allocated_cpu_cost:sum # Allocated cost of cpu for each workflow/task execution and app. 
- expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, type)( - entity_id:cpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - - record: entity_id:allocated_gpu_cost:sum # Allocated cost of gpu for each workflow/task execution and app. - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, type)( - entity_id:gpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - - record: entity_id:allocated_cost:sum # Allocated cost of memory, cpu, and gpu for each workflow/task execution and app. - expr: | - label_replace( - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # for the sum to work, the labels need to be different on each "or" element (type label) - entity_id:allocated_mem_cost:sum - or - entity_id:allocated_cpu_cost:sum - or - entity_id:allocated_gpu_cost:sum - ), - "type", "allocated", "", "" # add type info - ) - - record: entity_id:overhead_cost:sum # The amount of overhead costs (node costs that we can't allocate with container resources) to allocate to each entity (workflow/task execution or app) - expr: | - label_replace( - sum by (label_entity_type, label_entity_id, label_domain, label_project)( # Aggregate the per-node metrics up to workflow/task execution or app (label_entity_id) - # Start with each execution's and app's allocated cost per node - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( # for the sum to work, the labels need to be different on each "or" element (type label) - entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB - * on (node) group_left(type) label_replace(avg by (node) 
(node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:cpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:gpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - # Then divide out the total allocated cost per node to get the proportion of allocated cost associated with each entity - / on (node) group_left()( - sum by (node) ( # for the sum to work, the labels need to be different on each "or" element (type label) - entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB - * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:cpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:gpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - > 0 # need to avoid dividing by zero, or gaps in the data can cause NaNs to proliferate, borking all charts - ) - # Then multiply by the overhead cost per node - * on (node) group_left() ( - # To calculate overhead, start with the true cost of running each node - avg by (node)(kube_node_labels{label_flyte_org_node_role="worker"}) # only look at worker nodes - * on (node) max by (node) ( - node_total_hourly_cost{instance_type!=""} # sometimes, the instance_type can be null, causing an 
unlabeled label to show up in the Compute Costs dashboard charts - ) * (15 / 3600) # convert hourly cost to 15-secondly cost - # Then subtract out the total allocated cost on each node - - on (node) group_left()( - sum by (node) ( # for the sum to work, the labels need to be different on each "or" element (type label) - entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB - * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:cpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:gpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - ) - ) - ), - "type", "overhead", "", "" # add type info - ) - - record: entity_id:total_cost:sum # Total cost of each entity (workflow/task execution or app), including allocated (from container resources) and overhead (proportion of unallocated node costs) - expr: | - label_replace( - sum by (label_domain, label_project, label_entity_id, label_entity_type) ( - entity_id:allocated_cost:sum - or - entity_id:overhead_cost:sum - ), - "type", "total", "", "" # add type info - ) - - record: node:total_cost:sum # Total cost of all nodes - expr: | - sum ( - avg by (node)(kube_node_labels{label_flyte_org_node_role="worker", label_node_kubernetes_io_instance_type!=""}) # only look at worker nodes - * on (node) group_left() node_total_hourly_cost{instance_type!=""} * (15 / 3600) # convert hourly cost to 15-secondly cost - ) - - record: node_type:total_cost:sum # Total cost of nodes grouped by node type - expr: | - sum by (node_type)( - avg by 
(node)(kube_node_labels{label_flyte_org_node_role="worker", label_node_kubernetes_io_instance_type!=""}) # only look at worker nodes - * on (node) group_left(node_type) label_replace(node_total_hourly_cost{instance_type!=""}, "node_type", "$1", "instance_type", "(.*)") * (15 / 3600) # convert hourly cost to 15-secondly cost and rename label - ) - - record: node_type:uptime_hours:sum # Total uptime of nodes grouped by node type - expr: | - sum by (node_type)( - avg by (node, node_type)( # dedupe - label_replace(kube_node_labels{label_flyte_org_node_role="worker", label_node_kubernetes_io_instance_type!=""}, "node_type", "$1", "label_node_kubernetes_io_instance_type", "(.*)") # relabel - ) - ) * (15 / 3600) # convert to number of hours per 15-second observation # Aggregate the above into visible metrics - - name: cost_rollup_15m - interval: 15m - rules: - - record: execution_info15m - expr: | - max_over_time(execution_info[15m:15s]) - - record: app_info15m - expr: | - max_over_time(app_info[15m:15s]) - - record: workspace_info15m - expr: | - max_over_time(workspace_info[15m:15s]) - - record: entity_id:allocated_mem_bytes:sum15m - expr: | - sum_over_time(entity_id:allocated_mem_bytes:sum[15m:15s]) - - record: entity_id:used_mem_bytes:sum15m - expr: | - sum_over_time(entity_id:used_mem_bytes:sum[15m:15s]) - - record: entity_id:allocated_cpu:sum15m - expr: | - sum_over_time(entity_id:allocated_cpu:sum[15m:15s]) - - record: entity_id:used_cpu:sum15m - expr: | - sum_over_time(entity_id:used_cpu:sum[15m:15s]) - - record: entity_id:weighted_sm_occupancy:sum15m - expr: | - sum_over_time(entity_id:weighted_sm_occupancy:sum[15m:15s]) - - record: entity_id:gpu_count:sum15m - expr: | - sum_over_time(entity_id:gpu_count:sum[15m:15s]) - - record: entity_id:allocated_mem_cost:sum15m - expr: | - sum_over_time(entity_id:allocated_mem_cost:sum[15m:15s]) - - record: entity_id:allocated_cpu_cost:sum15m - expr: | - sum_over_time(entity_id:allocated_cpu_cost:sum[15m:15s]) - - record: 
entity_id:allocated_gpu_cost:sum15m - expr: | - sum_over_time(entity_id:allocated_gpu_cost:sum[15m:15s]) - - record: entity_id:allocated_cost:sum15m - expr: | - sum_over_time(entity_id:allocated_cost:sum[15m:15s]) - - record: entity_id:overhead_cost:sum15m - expr: | - sum_over_time(entity_id:overhead_cost:sum[15m:15s]) - - record: entity_id:total_cost:sum15m - expr: | - sum_over_time(entity_id:total_cost:sum[15m:15s]) - - record: node:total_cost:sum15m - expr: | - sum_over_time(node:total_cost:sum[15m:15s]) - - record: node_type:total_cost:sum15m - expr: | - sum_over_time(node_type:total_cost:sum[15m:15s]) - - record: node_type:uptime_hours:sum15m - expr: | - sum_over_time(node_type:uptime_hours:sum[15m:15s]) ---- -# Source: dataplane/templates/propeller/configmap.yaml -apiVersion: v1 -kind: ConfigMap -metadata: - name: flyte-propeller-config - namespace: union -data: - admin.yaml: | - admin: - clientId: 'clientId' - clientSecretLocation: /etc/union/secret/client_secret - endpoint: dns:///union.us-west-2.union.ai - insecure: false - event: - capacity: 1000 - rate: 500 - type: admin - catalog.yaml: | - catalog-cache: - cache-endpoint: dns:///union.us-west-2.union.ai - endpoint: dns:///union.us-west-2.union.ai - insecure: false - type: fallback - use-admin-auth: true - copilot.yaml: | - plugins: - k8s: - co-pilot: - image: 'cr.flyte.org/flyteorg/flytecopilot:v1.14.1' - name: flyte-copilot- - start-timeout: 30s - core.yaml: | - propeller: - downstream-eval-duration: 30s - enable-admin-launcher: true - leader-election: - enabled: true - lease-duration: 15s - lock-config-map: - name: propeller-leader - namespace: 'union' - renew-deadline: 10s - retry-period: 2s - limit-namespace: all - literal-offloading-config: - enabled: true - max-workflow-retries: 30 - metadata-prefix: metadata/propeller - metrics-prefix: flyte - prof-port: 10254 - queue: - batch-size: -1 - batching-interval: 2s - queue: - base-delay: 5s - capacity: 1000 - max-delay: 120s - rate: 100 - type: maxof - 
sub-queue: - capacity: 100 - rate: 10 - type: bucket - type: batch - rawoutput-prefix: 's3://bucket' - workers: 4 - workflow-reeval-duration: 30s webhook: certDir: /etc/webhook/certs + disableCreateMutatingWebhookConfig: true embeddedSecretManagerConfig: imagePullSecrets: enabled: true @@ -3803,367 +3368,1173 @@ data: namespace: 'union' type: 'K8s' listenPort: '9443' + localCert: true secretManagerTypes: - Embedded - K8s - serviceName: flyte-pod-webhook + secretName: union-pod-webhook + serviceName: union-pod-webhook servicePort: '443' - enabled_plugins.yaml: | - plugins: - connector-service: - defaultConnector: - defaultServiceConfig: '{"loadBalancingConfig": [{"round_robin":{}}]}' - endpoint: k8s:///flyteconnector.union:8000 - tasks: - task-plugins: - default-for-task-types: - actor: fast-task - container: container - container_array: k8s-array - sidecar: sidecar - enabled-plugins: - - container - - sidecar - - k8s-array - - echo - - fast-task - - connector-service - k8s.yaml: | - plugins: - k8s: - default-cpus: 100m - default-env-vars: [] - default-memory: 100Mi - logger.yaml: | - logger: - formatter: - type: json - level: 4 - show-source: true - resource_manager.yaml: | - propeller: - resourcemanager: - type: noop - task_logs.yaml: | - plugins: - logs: - cloudwatch-enabled: false - dynamic-log-links: - - vscode: - displayName: VS Code Debugger - templateUris: - - /dataplane/pod/v1/generated_name/task/{{.executionProject}}/{{.executionDomain}}/{{.executionName}}/{{.nodeID}}/{{.taskRetryAttempt}}/{{.taskProject}}/{{.taskDomain}}/{{.taskID}}/{{.taskVersion}}/ - kubernetes-enabled: false - templates: - - displayName: Task Logs - scheme: TaskExecution - templateUris: - - /console/projects/{{.executionProject}}/domains/{{.executionDomain}}/executions/{{.executionName}}/nodeId/{{.nodeID}}/taskId/{{.taskID}}/attempt/{{.taskRetryAttempt}}/view/logs?duration=all&fromExecutionNav=true - storage.yaml: | - storage: - container: "bucket" - type: s3 - connection: - auth-type: 
iam - region: us-east-2 - enable-multicontainer: true - limits: - maxDownloadMBs: 1024 - cache: - max_size_mbs: 0 - target_gc_percent: 70 --- -# Source: dataplane/charts/fluentbit/templates/clusterrole.yaml +# Source: dataplane/charts/fluentbit/templates/clusterrole.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: release-name-fluentbit + labels: + helm.sh/chart: fluentbit-0.48.9 + app.kubernetes.io/name: fluentbit + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "3.2.8" + app.kubernetes.io/managed-by: Helm +rules: + - apiGroups: + - "" + resources: + - namespaces + - pods + verbs: + - get + - list + - watch +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: knative-serving-operator-aggregated + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +aggregationRule: + clusterRoleSelectors: + # This (along with escalate below) allows the Operator to pick up any + # roles that are provided to the admin of the cluster by knative serving + # automatically. + - matchExpressions: + - {key: serving.knative.dev/release, operator: Exists} +rules: [] # Rules are automatically filled in by the controller manager. 
+--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: knative-serving-operator-aggregated-stable + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +aggregationRule: + clusterRoleSelectors: + # This (along with escalate below) allows the Operator to pick up any + # roles that are provided to the admin of the cluster by knative serving + # automatically. + - matchExpressions: + - {key: app.kubernetes.io/name, operator: In, values: ["knative-serving"]} +rules: [] # Rules are automatically filled in by the controller manager. +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: knative-eventing-operator-aggregated + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +aggregationRule: + clusterRoleSelectors: + # This (along with escalate below) allows the Operator to pick up any + # roles that are provided to the admin of the cluster by knative eventing + # automatically. + - matchExpressions: + - {key: eventing.knative.dev/release, operator: Exists} +rules: [] # Rules are automatically filled in by the controller manager. +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: knative-eventing-operator-aggregated-stable + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +aggregationRule: + clusterRoleSelectors: + # This (along with escalate below) allows the Operator to pick up any + # roles that are provided to the admin of the cluster by knative eventing + # automatically. 
+ - matchExpressions: + - {key: app.kubernetes.io/name, operator: In, values: ["knative-eventing"]} +rules: [] # Rules are automatically filled in by the controller manager. +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: knative-serving-operator + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +rules: + - apiGroups: + - operator.knative.dev + resources: + - '*' + verbs: + - '*' + # Bootstrapping permissions. + # Roles that are explicitly bound buch which are specified by this Operator + # MUST be specified here with 'get' and 'bind'. + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterroles + resourceNames: + - system:auth-delegator + verbs: + - bind + - get + - apiGroups: + - rbac.authorization.k8s.io + resources: + - roles + resourceNames: + - extension-apiserver-authentication-reader + verbs: + - bind + - get + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterroles + - roles + verbs: + - create + - delete + # Escalate is necessary in order to create a role using cluster role aggregation, + # and to allow the Operator to bootstrap itself into the necessary set of + # permissions, even as those continue to evolve upstream. 
+ - escalate + - get + - list + - update + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterrolebindings + - rolebindings + verbs: + - create + - delete + - list + - get + - update + # Permissions required for Knative controller + # infra. + - apiGroups: + - apiregistration.k8s.io + resources: + - apiservices + verbs: + - update + - apiGroups: + - "" + resources: + - services + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - caching.internal.knative.dev + resources: + - images + verbs: + - '*' + - apiGroups: + - "" + resources: + - namespaces + verbs: + - get + - update + - watch + - apiGroups: + - '' + resources: + - events + verbs: + - create + - update + - patch + - apiGroups: + - '' + resources: + - configmaps + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - security.istio.io + - apps + - policy + resources: + - poddisruptionbudgets + - peerauthentications + - deployments + - daemonsets + - replicasets + - statefulsets + verbs: + - create + - delete + - get + - list + - watch + - update + - apiGroups: + - apiregistration.k8s.io + resources: + - apiservices + verbs: + - create + - delete + - get + - list + - apiGroups: + - autoscaling + resources: + - horizontalpodautoscalers + verbs: + - create + - delete + - get + - list + - apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - '*' + - apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - '*' + # Old resources that need cleaning up that are not in the knative-serving + # namespace. 
+ - apiGroups: + - "" + resources: + - services + - deployments + - horizontalpodautoscalers + resourceNames: + - knative-ingressgateway + verbs: + - delete + - apiGroups: + - "" + resources: + - configmaps + resourceNames: + - config-controller + verbs: + - delete + - apiGroups: + - "" + resources: + - serviceaccounts + resourceNames: + - knative-serving-operator + verbs: + - delete + # for contour TLS + - apiGroups: + - projectcontour.io + resources: + - httpproxies + - tlscertificatedelegations + verbs: + - get + - list + - watch + - update + - create + - delete + - deletecollection + - patch + # for security-guard + - apiGroups: + - guard.security.knative.dev + resources: + - guardians + verbs: + - get + - list + - watch + - create + - update + - patch + - delete + - apiGroups: + - "" + resources: + - configmaps + verbs: + - get + - list + - watch + - create + - update + - patch + - delete + - apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + verbs: + - create + - apiGroups: + - "" + resources: + - pods + verbs: + - get +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: knative-eventing-operator + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +rules: + - apiGroups: + - operator.knative.dev + resources: + - '*' + verbs: + - '*' + # Bootstrapping permissions. + # Roles that are explicitly bound buch which are specified by this Operator + # MUST be specified here with 'get' and 'bind'. + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterroles + - roles + verbs: + - create + - delete + # Escalate is necessary in order to create a role using cluster role aggregation, + # and to allow the Operator to bootstrap itself into the necessary set of + # permissions, even as those continue to evolve upstream. 
+ - escalate + - get + - list + - update + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterrolebindings + - rolebindings + verbs: + - create + - delete + - list + - get + - update + # Permissions required for Knative controller + # infra. + - apiGroups: + - apiregistration.k8s.io + resources: + - apiservices + verbs: + - update + - apiGroups: + - "" + resources: + - services + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - caching.internal.knative.dev + resources: + - images + verbs: + - '*' + - apiGroups: + - "" + resources: + - namespaces + verbs: + - get + - update + - watch + - apiGroups: + - '' + resources: + - events + verbs: + - create + - update + - patch + - apiGroups: + - '' + resources: + - configmaps + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - apps + resources: + - deployments + - daemonsets + - replicasets + - statefulsets + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - apiregistration.k8s.io + resources: + - apiservices + verbs: + - create + - delete + - get + - list + - apiGroups: + - autoscaling + resources: + - horizontalpodautoscalers + verbs: + - create + - delete + - update + - get + - list + - apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - '*' + - apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - '*' + - apiGroups: + - batch + resources: + - jobs + verbs: + - create + - delete + - update + - get + - list + - watch + # Old resources that need cleaning up that are not in the knative-eventing + # namespace. 
+ - apiGroups: + - "" + resources: + - serviceaccounts + resourceNames: + - knative-eventing-operator + verbs: + - delete + # for RabbitMQ messaging topology objects + - apiGroups: + - rabbitmq.com + resources: + - rabbitmqclusters + verbs: + - "get" + - "list" + - "watch" + - apiGroups: + - rabbitmq.com + resources: + - bindings + - queues + - exchanges + verbs: + - create + - delete + - get + - list + - patch + - update + - watch + - apiGroups: + - rabbitmq.com + resources: + - bindings/status + - queues/status + - exchanges/status + verbs: + - get + # for Kafka eventing source + - apiGroups: + - keda.sh + resources: + - scaledobjects + - scaledobjects/finalizers + - scaledobjects/status + - triggerauthentications + - triggerauthentications/status + verbs: + - get + - list + - watch + - update + - create + - delete + # Internal APIs + - apiGroups: + - "internal.kafka.eventing.knative.dev" + resources: + - "consumers" + - "consumers/status" + - "consumergroups" + - "consumergroups/status" + verbs: + - create + - get + - list + - watch + - patch + - update + - delete + - apiGroups: + - "internal.kafka.eventing.knative.dev" + resources: + - "consumers/finalizers" + - "consumergroups/finalizers" + verbs: + - update + - delete + - apiGroups: + - apps + resources: + - statefulsets/scale + verbs: + - get + - list + - watch + - update + - patch + - create + - delete + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterrolebindings + verbs: + - watch + - apiGroups: + - "*" + resources: + - configmaps + verbs: + - delete + - apiGroups: + - "*" + resources: + - configmaps + - services + verbs: + - get + - list + - watch + - update + - create + - delete + - apiGroups: + - "*" + resources: + - pods + verbs: + - list + - update + - get + - watch + - apiGroups: + - "*" + resources: + - pods/finalizers + verbs: + - get + - list + - create + - update + - delete + - apiGroups: + - "*" + resources: + - events + verbs: + - patch + - create + - apiGroups: + - "*" + 
resources: + - secrets + verbs: + - get + - list + - watch + - update + - create + - delete + - apiGroups: + - "*" + resources: + - nodes + verbs: + - get + - list + - watch + - apiGroups: + - "*" + resources: + - serviceaccounts + verbs: + - get + - list + - watch + - update + - create + - delete + - apiGroups: + - "*" + resources: + - configmaps + resourceNames: + - kafka-channel-config + verbs: + - patch + - apiGroups: + - "*" + resources: + - horizontalpodautoscalers + resourceNames: + - kafka-webhook + verbs: + - delete + - apiGroups: + - "*" + resources: + - leases + verbs: + - delete + - apiGroups: + - "*" + resources: + - poddisruptionbudgets + resourceNames: + - kafka-webhook + verbs: + - delete + - apiGroups: + - "*" + resources: + - services + verbs: + - patch + - apiGroups: + - "apps" + resources: + - deployments + verbs: + - deletecollection + # Eventing TLS + - apiGroups: + - "cert-manager.io" + resources: + - certificates + - issuers + - clusterissuers + verbs: + - create + - delete + - update + - list + - get + - watch + - apiGroups: + - "trust.cert-manager.io" + resources: + - bundles + verbs: + - create + - delete + - update + - list + - get + - watch +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: knative-operator-webhook + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +rules: + # For watching logging configuration and getting certs. + - apiGroups: + - "" + resources: + - "configmaps" + verbs: + - "get" + - "list" + - "watch" + # For manipulating certs into secrets. 
+ - apiGroups: + - "" + resources: + - "namespaces" + verbs: + - "get" + - "create" + - "update" + - "list" + - "watch" + - "patch" + # finalizers are needed for the owner reference of the webhook + - apiGroups: + - "" + resources: + - "namespaces/finalizers" + verbs: + - "update" + # For getting our Deployment so we can decorate with ownerref. + - apiGroups: + - "apps" + resources: + - "deployments" + verbs: + - "get" + - apiGroups: + - "apps" + resources: + - "deployments/finalizers" + verbs: + - update + # For actually registering our webhook. + - apiGroups: + - "admissionregistration.k8s.io" + resources: + - "mutatingwebhookconfigurations" + - "validatingwebhookconfigurations" + verbs: &everything + - "get" + - "list" + - "create" + - "update" + - "delete" + - "patch" + - "watch" + # For leader election + - apiGroups: + - "coordination.k8s.io" + resources: + - "leases" + verbs: *everything + # Necessary for conversion webhook. These are copied from the serving + # TODO: Do we really need all these permissions? 
+ - apiGroups: ["apiextensions.k8s.io"] + resources: ["customresourcedefinitions"] + verbs: ["get", "list", "create", "update", "delete", "patch", "watch"] +--- +# Source: dataplane/charts/fluentbit/templates/clusterrolebinding.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: release-name-fluentbit + labels: + helm.sh/chart: fluentbit-0.48.9 + app.kubernetes.io/name: fluentbit + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "3.2.8" + app.kubernetes.io/managed-by: Helm +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: release-name-fluentbit +subjects: + - kind: ServiceAccount + name: union-system + namespace: union +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# TODO: Consider restriction of non-aggregated role to knativeservings namespaces. 
+apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: knative-serving-operator + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-serving-operator +subjects: + - kind: ServiceAccount + name: knative-operator + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# TODO: Consider restriction of non-aggregated role to knativeeventing namespaces. +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: knative-eventing-operator + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-eventing-operator +subjects: + - kind: ServiceAccount + name: knative-operator + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: operator-webhook + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-operator-webhook +subjects: + - kind: ServiceAccount + name: operator-webhook + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: knative-serving-operator-aggregated + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-serving-operator-aggregated +subjects: + - kind: ServiceAccount + name: knative-operator + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: knative-serving-operator-aggregated-stable + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-serving-operator-aggregated-stable +subjects: + - kind: ServiceAccount + name: knative-operator + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: knative-eventing-operator-aggregated + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-eventing-operator-aggregated +subjects: + - kind: ServiceAccount + name: knative-operator + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: knative-eventing-operator-aggregated-stable + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-eventing-operator-aggregated-stable +subjects: + - kind: ServiceAccount + name: knative-operator + 
namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole +kind: Role metadata: - name: release-name-fluentbit + namespace: "union" + name: knative-operator-webhook labels: - helm.sh/chart: fluentbit-0.48.9 - app.kubernetes.io/name: fluentbit - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "3.2.8" - app.kubernetes.io/managed-by: Helm + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator rules: + # For manipulating certs into secrets. 
- apiGroups: - "" resources: - - namespaces - - pods + - "secrets" verbs: - - get - - list - - watch ---- -# Source: dataplane/charts/kube-state-metrics/templates/role.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - labels: - helm.sh/chart: kube-state-metrics-5.30.1 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: kube-state-metrics - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" - name: release-name-kube-state-metrics -rules: - -- apiGroups: ["certificates.k8s.io"] - resources: - - certificatesigningrequests - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - configmaps - verbs: ["list", "watch"] - -- apiGroups: ["batch"] - resources: - - cronjobs - verbs: ["list", "watch"] - -- apiGroups: ["extensions", "apps"] - resources: - - daemonsets - verbs: ["list", "watch"] - -- apiGroups: ["extensions", "apps"] - resources: - - deployments - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - endpoints - verbs: ["list", "watch"] - -- apiGroups: ["autoscaling"] - resources: - - horizontalpodautoscalers - verbs: ["list", "watch"] - -- apiGroups: ["extensions", "networking.k8s.io"] - resources: - - ingresses - verbs: ["list", "watch"] - -- apiGroups: ["batch"] - resources: - - jobs - verbs: ["list", "watch"] - -- apiGroups: ["coordination.k8s.io"] - resources: - - leases - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - limitranges - verbs: ["list", "watch"] - -- apiGroups: ["admissionregistration.k8s.io"] - resources: - - mutatingwebhookconfigurations - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - namespaces - verbs: ["list", "watch"] - -- apiGroups: ["networking.k8s.io"] - resources: - - networkpolicies - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - nodes - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - 
persistentvolumeclaims - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - persistentvolumes - verbs: ["list", "watch"] - -- apiGroups: ["policy"] - resources: - - poddisruptionbudgets - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - pods - verbs: ["list", "watch"] - -- apiGroups: ["extensions", "apps"] - resources: - - replicasets - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - replicationcontrollers - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - resourcequotas - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - secrets - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - services - verbs: ["list", "watch"] - -- apiGroups: ["apps"] - resources: - - statefulsets - verbs: ["list", "watch"] - -- apiGroups: ["storage.k8s.io"] - resources: - - storageclasses - verbs: ["list", "watch"] - -- apiGroups: ["admissionregistration.k8s.io"] - resources: - - validatingwebhookconfigurations - verbs: ["list", "watch"] - -- apiGroups: ["storage.k8s.io"] - resources: - - volumeattachments - verbs: ["list", "watch"] + - "get" + - "create" + - "update" + - "list" + - "watch" + - "patch" --- -# Source: dataplane/charts/opencost/templates/clusterrole.yaml -# Cluster role giving opencost to get, list, watch required resources -# No write permissions are required +# Source: dataplane/templates/monitoring/prometheus-rbac.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole +kind: Role metadata: - name: release-name-opencost + name: union-operator-prometheus-rbac + namespace: union labels: - helm.sh/chart: opencost-1.42.0 - app.kubernetes.io/name: opencost - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "1.111.0" - app.kubernetes.io/part-of: opencost - app.kubernetes.io/managed-by: Helm + release: release-name rules: + # Prometheus server scrape permissions - apiGroups: [""] resources: - - configmaps - - deployments - nodes - - pods + - nodes/proxy + - nodes/metrics 
- services - - resourcequotas - - replicationcontrollers - - limitranges - - persistentvolumeclaims - - persistentvolumes - - namespaces - endpoints + - pods + - ingresses + - configmaps verbs: - get - list - watch - - apiGroups: - - extensions - resources: - - daemonsets - - deployments - - replicasets - verbs: - - get - - list - - watch - - apiGroups: - - apps - resources: - - statefulsets - - deployments - - daemonsets - - replicasets - verbs: - - list - - watch - - apiGroups: - - batch - resources: - - cronjobs - - jobs - verbs: - - get - - list - - watch - - apiGroups: - - autoscaling - resources: - - horizontalpodautoscalers - verbs: - - get - - list - - watch - - apiGroups: - - policy + - apiGroups: ["extensions", "networking.k8s.io"] resources: - - poddisruptionbudgets + - ingresses/status + - ingresses verbs: - get - list - watch - - apiGroups: - - storage.k8s.io + - apiGroups: ["discovery.k8s.io"] resources: - - storageclasses + - endpointslices verbs: - get - list - watch ---- -# Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: union-clustersync-resource -rules: - - apiGroups: - - "" - - rbac.authorization.k8s.io - resources: - - configmaps - - namespaces - - pods - - resourcequotas - - roles - - rolebindings - - secrets - - services - - serviceaccounts - - clusterrolebindings - - podtemplates - verbs: - - '*' + # kube-state-metrics permissions (rbac.create=false on the subchart; managed here instead) + - apiGroups: ["certificates.k8s.io"] + resources: ["certificatesigningrequests"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["configmaps"] + verbs: ["list", "watch"] + - apiGroups: ["batch"] + resources: ["cronjobs"] + verbs: ["list", "watch"] + - apiGroups: ["extensions", "apps"] + resources: ["daemonsets"] + verbs: ["list", "watch"] + - apiGroups: ["extensions", "apps"] + resources: ["deployments"] + verbs: ["list", "watch"] + - apiGroups: 
[""] + resources: ["endpoints"] + verbs: ["list", "watch"] + - apiGroups: ["discovery.k8s.io"] + resources: ["endpointslices"] + verbs: ["list", "watch"] + - apiGroups: ["autoscaling"] + resources: ["horizontalpodautoscalers"] + verbs: ["list", "watch"] + - apiGroups: ["extensions", "networking.k8s.io"] + resources: ["ingresses"] + verbs: ["list", "watch"] + - apiGroups: ["batch"] + resources: ["jobs"] + verbs: ["list", "watch"] + - apiGroups: ["coordination.k8s.io"] + resources: ["leases"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["limitranges"] + verbs: ["list", "watch"] + - apiGroups: ["admissionregistration.k8s.io"] + resources: ["mutatingwebhookconfigurations"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["namespaces"] + verbs: ["list", "watch"] + - apiGroups: ["networking.k8s.io"] + resources: ["networkpolicies"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["persistentvolumeclaims"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["persistentvolumes"] + verbs: ["list", "watch"] + - apiGroups: ["policy"] + resources: ["poddisruptionbudgets"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["pods"] + verbs: ["list", "watch"] + - apiGroups: ["extensions", "apps"] + resources: ["replicasets"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["replicationcontrollers"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["resourcequotas"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["secrets"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["services"] + verbs: ["list", "watch"] + - apiGroups: ["apps"] + resources: ["statefulsets"] + verbs: ["list", "watch"] + - apiGroups: ["storage.k8s.io"] + resources: ["storageclasses"] + verbs: ["list", "watch"] + - apiGroups: ["admissionregistration.k8s.io"] + resources: ["validatingwebhookconfigurations"] + verbs: ["list", "watch"] + - apiGroups: ["storage.k8s.io"] + resources: ["volumeattachments"] + 
verbs: ["list", "watch"] --- # Source: dataplane/templates/nodeexecutor/serviceaccount.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole +kind: Role metadata: name: union-executor labels: @@ -4214,11 +4585,12 @@ rules: - delete - update --- -# Source: dataplane/templates/operator/serviceaccount-proxy.yaml +# Source: dataplane/templates/operator/serviceaccount-proxy-secret.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole +kind: Role metadata: - name: proxy-system + name: union-system-secret + namespace: union labels: app.kubernetes.io/name: operator-proxy app.kubernetes.io/instance: release-name @@ -4228,146 +4600,86 @@ rules: - apiGroups: - '*' resources: - - events - - flyteworkflows - - pods/log - - pods - - rayjobs - - resourcequotas - verbs: - - get - - list - - watch ---- -# Source: dataplane/templates/operator/serviceaccount.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: operator-system - labels: - app.kubernetes.io/name: union-operator - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm -rules: - # Allow Access to all resources under flyte.lyft.com - - apiGroups: - - flyte.lyft.com - resources: - - flyteworkflows - - flyteworkflows/finalizers - verbs: - - get - - list - - watch - - create - - update - - delete - - patch - - post - - deletecollection - - apiGroups: - - '*' - resources: - - resourcequotas - - pods - - configmaps - - podtemplates - secrets - - namespaces - - nodes verbs: - get - list - - watch - create - update - delete - - nonResourceURLs: - - /metrics - verbs: - - get --- -# Source: dataplane/templates/prometheus/rbac.yaml +# Source: dataplane/templates/operator/serviceaccount-proxy.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole +kind: Role metadata: - name: union-operator-prometheus + name: proxy-system labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: 
release-name-dataplane + app.kubernetes.io/name: operator-proxy app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" + platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus rules: - - apiGroups: [""] + - apiGroups: + - '*' resources: - - nodes - - nodes/proxy + - events + - flyteworkflows + - pods/log - pods - - endpoints - - services + - rayjobs + - resourcequotas verbs: - get - list - watch - - nonResourceURLs: - - /metrics - - /metrics/cadvisor - verbs: - - get --- -# Source: dataplane/templates/propeller/serviceaccount-webhook.yaml -kind: ClusterRole +# Source: dataplane/templates/operator/serviceaccount.yaml apiVersion: rbac.authorization.k8s.io/v1 +kind: Role metadata: - name: flytepropeller-webhook-role - namespace: union + name: operator-system + labels: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm rules: - apiGroups: - - "*" + - '*' resources: - - mutatingwebhookconfigurations - secrets - - pods - - replicasets/finalizers + - deployments verbs: - get + - list + - watch - create - update - - patch ---- -# Source: dataplane/templates/propeller/serviceaccount.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: flytepropeller-role -rules: - # Allow RO access to PODS - apiGroups: - - "" + - flyte.lyft.com resources: - - pods + - flyteworkflows + - flyteworkflows/finalizers verbs: - get - list - watch - # Allow Event recording access - - apiGroups: - - "" - resources: - - events - verbs: - create - update - delete - patch - # Allow Access All plugin objects + - post + - deletecollection - apiGroups: - '*' resources: - - '*' + - resourcequotas + - pods + - configmaps + - podtemplates + - secrets + - namespaces + - nodes verbs: - get - list @@ -4375,148 +4687,129 @@ rules: - create - update - delete - - patch - # 
Allow Access to CRD - apiGroups: - - apiextensions.k8s.io + - serving.knative.dev resources: - - customresourcedefinitions + - revisions + - configurations + - services verbs: - get - list - watch - create - - delete - update - # Allow Access to all resources under flyte.lyft.com + - delete +--- +# Source: dataplane/templates/webhook/serviceaccount.yaml +kind: Role +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: union-webhook-role + namespace: union +rules: - apiGroups: - - flyte.lyft.com + - "*" resources: - - flyteworkflows - - flyteworkflows/finalizers + - mutatingwebhookconfigurations + - secrets + - pods + - replicasets/finalizers verbs: - get - - list - - watch - create - update - - delete - patch - - post - - deletecollection --- -# Source: dataplane/charts/fluentbit/templates/clusterrolebinding.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding +kind: RoleBinding metadata: - name: release-name-fluentbit + namespace: "union" + name: operator-webhook labels: - helm.sh/chart: fluentbit-0.48.9 - app.kubernetes.io/name: fluentbit - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "3.2.8" - app.kubernetes.io/managed-by: Helm -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: release-name-fluentbit + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator subjects: - kind: ServiceAccount - name: fluentbit-system - namespace: union ---- -# Source: dataplane/charts/kube-state-metrics/templates/clusterrolebinding.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - labels: - helm.sh/chart: kube-state-metrics-5.30.1 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: kube-state-metrics - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" - name: release-name-kube-state-metrics + name: operator-webhook + namespace: "union" roleRef: + kind: Role + name: knative-operator-webhook apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: release-name-kube-state-metrics -subjects: -- kind: ServiceAccount - name: release-name-kube-state-metrics - namespace: union --- -# Source: dataplane/charts/opencost/templates/clusterrolebinding.yaml +# Source: dataplane/templates/monitoring/prometheus-rbac.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding +kind: RoleBinding metadata: - name: release-name-opencost + name: union-operator-prometheus-rbac + namespace: union labels: - helm.sh/chart: opencost-1.42.0 - app.kubernetes.io/name: opencost - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "1.111.0" - app.kubernetes.io/part-of: opencost - app.kubernetes.io/managed-by: Helm -roleRef: - 
apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: release-name-opencost + release: release-name subjects: - kind: ServiceAccount - name: release-name-opencost + name: union-operator-prometheus namespace: union ---- -# Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: union-clustersync-resource roleRef: apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: union-clustersync-resource -subjects: - - kind: ServiceAccount - name: union-clustersync-system - namespace: union + kind: Role + name: union-operator-prometheus-rbac --- -# Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml +# Source: dataplane/templates/monitoring/prometheus-rbac.yaml +# kube-state-metrics service account RoleBinding apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding +kind: RoleBinding metadata: - name: union-clustersync-auth-delegator -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: system:auth-delegator + name: release-name-prometheus-kube-state-metrics + namespace: union + labels: + release: release-name subjects: - kind: ServiceAccount - name: union-clustersync-system + name: release-name-prometheus-kube-state-metrics namespace: union +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: union-operator-prometheus-rbac --- # Source: dataplane/templates/nodeexecutor/serviceaccount.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding +kind: RoleBinding metadata: name: union-executor labels: app: executor roleRef: apiGroup: rbac.authorization.k8s.io - kind: ClusterRole + kind: Role name: union-executor subjects: - kind: ServiceAccount - name: executor + name: union-system namespace: union --- -# Source: dataplane/templates/operator/serviceaccount-proxy.yaml +# Source: dataplane/templates/operator/serviceaccount-proxy-secret.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: 
ClusterRoleBinding +kind: RoleBinding metadata: - name: proxy-system + name: union-system-secret + namespace: union labels: app.kubernetes.io/name: operator-proxy app.kubernetes.io/instance: release-name @@ -4524,109 +4817,35 @@ metadata: app.kubernetes.io/managed-by: Helm roleRef: apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: proxy-system + kind: Role + name: union-system-secret subjects: - kind: ServiceAccount - name: proxy-system + name: union-system namespace: union --- -# Source: dataplane/templates/operator/serviceaccount.yaml +# Source: dataplane/templates/operator/serviceaccount-proxy.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding +kind: RoleBinding metadata: - name: operator-system + name: proxy-system labels: - app.kubernetes.io/name: union-operator + app.kubernetes.io/name: operator-proxy app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm roleRef: apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: operator-system -subjects: - - kind: ServiceAccount - name: operator-system - namespace: union ---- -# Source: dataplane/templates/prometheus/rbac.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: union-operator-prometheus - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: union-operator-prometheus -subjects: - - kind: ServiceAccount - name: union-operator-prometheus - namespace: union ---- -# Source: dataplane/templates/propeller/serviceaccount-webhook.yaml -# Create a binding from Role -> ServiceAccount -kind: ClusterRoleBinding -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: 
flytepropeller-webhook-binding - namespace: union -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: flytepropeller-webhook-role -subjects: - - kind: ServiceAccount - name: flytepropeller-webhook-system - namespace: union ---- -# Source: dataplane/templates/propeller/serviceaccount.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: flytepropeller-binding -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: flytepropeller-role + kind: Role + name: proxy-system subjects: - kind: ServiceAccount - name: flytepropeller-system + name: union-system namespace: union --- -# Source: dataplane/templates/operator/serviceaccount-proxy-secret.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - name: proxy-system-secret - namespace: union - labels: - app.kubernetes.io/name: operator-proxy - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm -rules: - - apiGroups: - - '*' - resources: - - secrets - verbs: - - get - - list - - create - - update - - delete ---- # Source: dataplane/templates/operator/serviceaccount.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: Role +kind: RoleBinding metadata: name: operator-system labels: @@ -4634,56 +4853,29 @@ metadata: app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm -rules: - - apiGroups: - - '*' - resources: - - secrets - - deployments - verbs: - - get - - list - - watch - - create - - update ---- -# Source: dataplane/templates/operator/serviceaccount-proxy-secret.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: proxy-system-secret - namespace: union - labels: - app.kubernetes.io/name: operator-proxy - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm roleRef: apiGroup: 
rbac.authorization.k8s.io kind: Role - name: proxy-system-secret + name: operator-system subjects: - kind: ServiceAccount - name: proxy-system + name: union-system namespace: union --- -# Source: dataplane/templates/operator/serviceaccount.yaml -apiVersion: rbac.authorization.k8s.io/v1 +# Source: dataplane/templates/webhook/serviceaccount.yaml +# Create a binding from Role -> ServiceAccount kind: RoleBinding +apiVersion: rbac.authorization.k8s.io/v1 metadata: - name: operator-system - labels: - app.kubernetes.io/name: union-operator - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm + name: union-webhook-binding + namespace: union roleRef: apiGroup: rbac.authorization.k8s.io kind: Role - name: operator-system + name: union-webhook-role subjects: - kind: ServiceAccount - name: operator-system + name: union-system namespace: union --- # Source: dataplane/charts/fluentbit/templates/service.yaml @@ -4709,20 +4901,60 @@ spec: app.kubernetes.io/name: fluentbit app.kubernetes.io/instance: release-name --- -# Source: dataplane/charts/kube-state-metrics/templates/service.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: v1 +kind: Service +metadata: + labels: + role: operator-webhook + app.kubernetes.io/component: operator-webhook + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator + name: operator-webhook + namespace: "union" +spec: + ports: + # Define metrics and profiling for them to be accessible within service meshes. + - name: http-metrics + port: 9090 + targetPort: 9090 + - name: http-profiling + port: 8008 + targetPort: 8008 + - name: https-webhook + port: 443 + targetPort: 8443 + selector: + role: operator-webhook +--- +# Source: dataplane/charts/prometheus/charts/kube-state-metrics/templates/service.yaml apiVersion: v1 kind: Service metadata: name: release-name-kube-state-metrics namespace: union labels: - helm.sh/chart: kube-state-metrics-5.30.1 + helm.sh/chart: kube-state-metrics-5.25.1 app.kubernetes.io/managed-by: Helm app.kubernetes.io/component: metrics app.kubernetes.io/part-of: kube-state-metrics app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" + app.kubernetes.io/version: "2.13.0" annotations: prometheus.io/scrape: 'true' spec: @@ -4737,28 +4969,32 @@ spec: app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/instance: release-name --- -# Source: dataplane/charts/opencost/templates/service.yaml +# Source: dataplane/charts/prometheus/templates/service.yaml apiVersion: v1 kind: Service metadata: - name: release-name-opencost - namespace: union labels: - helm.sh/chart: opencost-1.42.0 - app.kubernetes.io/name: opencost + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "1.111.0" - app.kubernetes.io/part-of: opencost + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: prometheus + name: union-operator-prometheus + namespace: union spec: + ports: + - name: http + port: 
80 + protocol: TCP + targetPort: 9090 selector: - app.kubernetes.io/name: opencost + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name + sessionAffinity: None type: "ClusterIP" - ports: - - name: http - port: 9003 - targetPort: 9003 --- # Source: dataplane/templates/clusterresourcesync/service.yaml apiVersion: v1 @@ -4782,6 +5018,31 @@ spec: app.kubernetes.io/name: clusterresourcesync app.kubernetes.io/instance: release-name --- +# Source: dataplane/templates/flyteconnector/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: flyteconnector + namespace: union + labels: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + app.kubernetes.io/managed-by: Helm +spec: + clusterIP: None + ports: + - name: grpc + port: 8000 + protocol: TCP + targetPort: grpc + - name: metric + port: 9090 + protocol: TCP + targetPort: metric + selector: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name +--- # Source: dataplane/templates/imagebuilder/service.yaml apiVersion: v1 kind: Service @@ -4807,7 +5068,7 @@ spec: apiVersion: v1 kind: Service metadata: - name: release-name-dataplane-executor + name: union-operator-executor labels: platform.union.ai/prometheus-group: "union-services" app: executor @@ -4873,39 +5134,14 @@ spec: app.kubernetes.io/name: union-operator app.kubernetes.io/instance: release-name --- -# Source: dataplane/templates/prometheus/service.yaml -apiVersion: v1 -kind: Service -metadata: - name: union-operator-prometheus - namespace: union - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus -spec: - type: ClusterIP - ports: - - port: 80 - targetPort: 9090 - protocol: TCP - name: http - selector: - app.kubernetes.io/component: 
prometheus - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name ---- -# Source: dataplane/templates/propeller/service-webhook.yaml +# Source: dataplane/templates/webhook/service.yaml apiVersion: v1 kind: Service metadata: - name: flyte-pod-webhook + name: union-pod-webhook namespace: union labels: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm @@ -4913,7 +5149,7 @@ metadata: projectcontour.io/upstream-protocol.h2c: grpc spec: selector: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name ports: - name: https @@ -4925,23 +5161,23 @@ spec: port: 10254 targetPort: 10254 --- -# Source: dataplane/templates/propeller/service-webhook.yaml +# Source: dataplane/templates/webhook/service.yaml # Headless Service for cache invalidation — resolves to all pod IPs so that # we can fan out invalidation requests to every webhook replica. 
apiVersion: v1 kind: Service metadata: - name: flyte-pod-webhook-headless + name: union-pod-webhook-headless namespace: union labels: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm spec: clusterIP: None selector: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name ports: - name: cache-internal @@ -4949,32 +5185,6 @@ spec: port: 9443 targetPort: 9443 --- -# Source: dataplane/templates/propeller/service.yaml -apiVersion: v1 -kind: Service -metadata: - namespace: union - name: flytepropeller - labels: - platform.union.ai/prometheus-group: "union-services" - app.kubernetes.io/name: flytepropeller - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm -spec: - type: ClusterIP - ports: - - name: debug - protocol: TCP - port: 10254 - - name: fasttask - port: 15605 - protocol: TCP - targetPort: 15605 - selector: - app.kubernetes.io/name: flytepropeller - app.kubernetes.io/instance: release-name ---- # Source: dataplane/charts/fluentbit/templates/daemonset.yaml apiVersion: apps/v1 kind: DaemonSet @@ -5000,7 +5210,7 @@ spec: annotations: checksum/config: e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 spec: - serviceAccountName: fluentbit-system + serviceAccountName: union-system hostNetwork: false dnsPolicy: ClusterFirst containers: @@ -5052,20 +5262,217 @@ spec: tolerations: - operator: Exists --- -# Source: dataplane/charts/kube-state-metrics/templates/deployment.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: operator-webhook + namespace: "union" + labels: + app.kubernetes.io/component: operator-webhook + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +spec: + selector: + matchLabels: + app: operator-webhook + role: operator-webhook + template: + metadata: + labels: + app: operator-webhook + role: operator-webhook + app.kubernetes.io/component: operator-webhook + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator + sidecar.istio.io/inject: "false" + spec: + # To avoid node becoming SPOF, spread our replicas to different nodes. + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - podAffinityTerm: + labelSelector: + matchLabels: + app: operator-webhook + topologyKey: kubernetes.io/hostname + weight: 100 + serviceAccountName: operator-webhook + containers: + - name: operator-webhook + # This is the Go import path for the binary that is containerized + # and substituted here. 
+ image: gcr.io/knative-releases/knative.dev/operator/cmd/webhook@sha256:d3a7a3304629ccfcaacec620b50736e0b4c902a6328b83369b6cbaf7e94677c9 + resources: + requests: + cpu: 100m + memory: 100Mi + limits: + cpu: 500m + memory: 500Mi + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: SYSTEM_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: CONFIG_LOGGING_NAME + value: config-logging + - name: CONFIG_OBSERVABILITY_NAME + value: config-observability + - name: WEBHOOK_NAME + value: operator-webhook + - name: WEBHOOK_PORT + value: "8443" + - name: WEBHOOK_SECRET_NAME + value: operator-webhook-certs + - name: METRICS_DOMAIN + value: knative.dev/operator + - name: KUBERNETES_MIN_VERSION + value: "" + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + runAsNonRoot: true + capabilities: + drop: + - ALL + ports: + - name: metrics + containerPort: 9090 + - name: profiling + containerPort: 8008 + - name: https-webhook + containerPort: 8443 + readinessProbe: + periodSeconds: 1 + httpGet: + scheme: HTTPS + port: 8443 + httpHeaders: + - name: k-kubelet-probe + value: "webhook" + livenessProbe: + periodSeconds: 1 + httpGet: + scheme: HTTPS + port: 8443 + httpHeaders: + - name: k-kubelet-probe + value: "webhook" + failureThreshold: 6 + initialDelaySeconds: 120 + # Our webhook should gracefully terminate by lame ducking first, set this to a sufficiently + # high value that we respect whatever value it has configured for the lame duck grace period. + terminationGracePeriodSeconds: 300 +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: knative-operator + namespace: "union" + labels: + app.kubernetes.io/name: knative-operator + app.kubernetes.io/version: "1.16.0" +spec: + replicas: 1 + selector: + matchLabels: + name: knative-operator + template: + metadata: + labels: + name: knative-operator + app.kubernetes.io/name: knative-operator + app.kubernetes.io/version: "1.16.0" + sidecar.istio.io/inject: "false" + spec: + serviceAccountName: knative-operator + containers: + - name: knative-operator + image: gcr.io/knative-releases/knative.dev/operator/cmd/operator@sha256:0b5a3532417f9c8e7b6044e23f6f67ad932a74a40a4092ad965b6f173b2fd887 + resources: + requests: + cpu: 100m + memory: 100Mi + limits: + cpu: 1000m + memory: 1000Mi + imagePullPolicy: IfNotPresent + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: SYSTEM_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: METRICS_DOMAIN + value: knative.dev/operator + - name: CONFIG_LOGGING_NAME + value: config-logging + - name: CONFIG_OBSERVABILITY_NAME + value: config-observability + - name: KUBERNETES_MIN_VERSION + value: "" + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + runAsNonRoot: true + capabilities: + drop: + - ALL + ports: + - name: metrics + containerPort: 9090 +--- +# Source: dataplane/charts/prometheus/charts/kube-state-metrics/templates/deployment.yaml apiVersion: apps/v1 kind: Deployment metadata: name: release-name-kube-state-metrics namespace: union labels: - helm.sh/chart: 
kube-state-metrics-5.30.1 + helm.sh/chart: kube-state-metrics-5.25.1 app.kubernetes.io/managed-by: Helm app.kubernetes.io/component: metrics app.kubernetes.io/part-of: kube-state-metrics app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" + app.kubernetes.io/version: "2.13.0" spec: selector: matchLabels: @@ -5078,13 +5485,13 @@ spec: template: metadata: labels: - helm.sh/chart: kube-state-metrics-5.30.1 + helm.sh/chart: kube-state-metrics-5.25.1 app.kubernetes.io/managed-by: Helm app.kubernetes.io/component: metrics app.kubernetes.io/part-of: kube-state-metrics app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" + app.kubernetes.io/version: "2.13.0" spec: automountServiceAccountToken: true hostNetwork: false @@ -5101,8 +5508,9 @@ spec: args: - --port=8080 - --resources=certificatesigningrequests,configmaps,cronjobs,daemonsets,deployments,endpoints,horizontalpodautoscalers,ingresses,jobs,leases,limitranges,mutatingwebhookconfigurations,namespaces,networkpolicies,nodes,persistentvolumeclaims,persistentvolumes,poddisruptionbudgets,pods,replicasets,replicationcontrollers,resourcequotas,secrets,services,statefulsets,storageclasses,validatingwebhookconfigurations,volumeattachments + - --namespaces=union imagePullPolicy: IfNotPresent - image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.15.0 + image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.13.0 ports: - containerPort: 8080 name: "http" @@ -5122,7 +5530,7 @@ spec: httpGet: httpHeaders: path: /readyz - port: 8081 + port: 8080 scheme: HTTP initialDelaySeconds: 5 periodSeconds: 10 @@ -5137,201 +5545,204 @@ spec: - ALL readOnlyRootFilesystem: true --- -# Source: dataplane/charts/opencost/templates/deployment.yaml +# Source: dataplane/charts/prometheus/templates/deploy.yaml apiVersion: apps/v1 kind: Deployment metadata: - name: release-name-opencost - namespace: 
union labels: - helm.sh/chart: opencost-1.42.0 - app.kubernetes.io/name: opencost + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "1.111.0" - app.kubernetes.io/part-of: opencost + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: prometheus + name: union-operator-prometheus + namespace: union spec: - replicas: 1 selector: matchLabels: - app.kubernetes.io/name: opencost + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name - strategy: - rollingUpdate: - maxSurge: 1 - maxUnavailable: 1 - type: RollingUpdate + replicas: 1 + revisionHistoryLimit: 10 + strategy: + type: Recreate + rollingUpdate: null template: metadata: labels: - app.kubernetes.io/name: opencost + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: prometheus spec: - serviceAccountName: release-name-opencost + enableServiceLinks: true + serviceAccountName: union-operator-prometheus containers: - - name: release-name-opencost - image: ghcr.io/opencost/opencost:1.111.0@sha256:6aa68e52a24b14ba41f23db08d1b9db1429a1c0300f4c0381ecc2c61fc311a97 - imagePullPolicy: IfNotPresent + - name: prometheus-server-configmap-reload + image: "quay.io/prometheus-operator/prometheus-config-reloader:v0.76.0" + imagePullPolicy: "IfNotPresent" args: + - --watched-dir=/etc/config + - --listen-address=0.0.0.0:8080 + - --reload-url=http://127.0.0.1:9090/prometheus/-/reload ports: - - containerPort: 9003 - name: http - resources: - limits: - cpu: 1000m - memory: 4Gi - requests: - cpu: 500m - memory: 1Gi - startupProbe: - httpGet: - path: /healthz - port: 9003 - initialDelaySeconds: 10 - 
periodSeconds: 5 - failureThreshold: 30 + - containerPort: 8080 + name: metrics livenessProbe: httpGet: path: /healthz - port: 9003 - initialDelaySeconds: 10 - periodSeconds: 20 - failureThreshold: 3 + port: metrics + scheme: HTTP + initialDelaySeconds: 2 + periodSeconds: 10 readinessProbe: httpGet: path: /healthz - port: 9003 - initialDelaySeconds: 10 + port: metrics + scheme: HTTP periodSeconds: 10 + volumeMounts: + - name: config-volume + mountPath: /etc/config + readOnly: true + + - name: prometheus-server + image: "quay.io/prometheus/prometheus:v2.54.1" + imagePullPolicy: "IfNotPresent" + args: + - --storage.tsdb.retention.time=3d + - --config.file=/etc/config/prometheus.yml + - --storage.tsdb.path=/data + - --web.console.libraries=/etc/prometheus/console_libraries + - --web.console.templates=/etc/prometheus/consoles + - --web.enable-lifecycle + - --web.route-prefix=/prometheus + - --web.external-url=/prometheus/ + ports: + - containerPort: 9090 + readinessProbe: + httpGet: + path: /prometheus/-/ready + port: 9090 + scheme: HTTP + initialDelaySeconds: 30 + periodSeconds: 5 + timeoutSeconds: 4 failureThreshold: 3 - env: - - name: LOG_LEVEL - value: info - - name: CUSTOM_COST_ENABLED - value: "false" - - name: KUBECOST_NAMESPACE - value: union - - name: API_PORT - value: "9003" - - name: PROMETHEUS_SERVER_ENDPOINT - value: "http://union-operator-prometheus.union.svc:80/prometheus" - - name: CLUSTER_ID - value: "default-cluster" - - name: DATA_RETENTION_DAILY_RESOLUTION_DAYS - value: "15" - - name: CLOUD_COST_ENABLED - value: "false" - - name: CLOUD_COST_MONTH_TO_DATE_INTERVAL - value: "6" - - name: CLOUD_COST_REFRESH_RATE_HOURS - value: "6" - - name: CLOUD_COST_QUERY_WINDOW_DAYS - value: "7" - - name: CLOUD_COST_RUN_WINDOW_DAYS - value: "3" - # Add any additional provided variables + successThreshold: 1 + livenessProbe: + httpGet: + path: /prometheus/-/healthy + port: 9090 + scheme: HTTP + initialDelaySeconds: 30 + periodSeconds: 15 + timeoutSeconds: 10 + 
failureThreshold: 3 + successThreshold: 1 + resources: + limits: + cpu: "3" + memory: 3500Mi + requests: + cpu: "1" + memory: 1Gi + volumeMounts: + - name: config-volume + mountPath: /etc/config + - name: storage-volume + mountPath: /data + subPath: "" + - name: server-recording-rules + mountPath: /etc/config/recording + subPath: "" + readOnly: false + dnsPolicy: ClusterFirst + securityContext: + fsGroup: 65534 + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + terminationGracePeriodSeconds: 300 + volumes: + - name: config-volume + configMap: + name: union-operator-prometheus + - name: server-recording-rules + configMap: + name: union-recording-rules + - name: storage-volume + emptyDir: + sizeLimit: 10Gi --- -# Source: dataplane/templates/clusterresourcesync/deployment.yaml +# Source: dataplane/templates/flyteconnector/deployment.yaml apiVersion: apps/v1 kind: Deployment metadata: - name: union-syncresources + name: flyteconnector namespace: union - labels: - app.kubernetes.io/name: clusterresourcesync + labels: + app.kubernetes.io/name: flyteconnector app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm spec: - replicas: 1 + replicas: 2 selector: - matchLabels: - app.kubernetes.io/name: clusterresourcesync + matchLabels: + app.kubernetes.io/name: flyteconnector app.kubernetes.io/instance: release-name template: metadata: annotations: - configChecksum: "8b8cf98723ebace833b6b652bcb56bba6be768a8bc199ed5c1e2b7c0ea97e86" - labels: platform.union.ai/zone: "dataplane" - app.kubernetes.io/name: clusterresourcesync + app.kubernetes.io/name: flyteconnector app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm spec: containers: - command: - - clusterresource - - --config - - /etc/flyte/config/*.yaml - - clusterresource - - run - env: - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_NAMESPACE - 
valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: GOMEMLIMIT - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.memory - - name: GOMAXPROCS - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.cpu - - name: CLUSTER_NAME - valueFrom: - secretKeyRef: - name: operator-cluster-name - key: cluster_name - - name: DEPLOYMENT_NAME - value: operator - - name: PROXY_SERVICE_URL - value: http://union-operator-proxy:8080 - - name: PROMETHEUS_SERVICE_URL - value: http://union-operator-prometheus:80 - - name: KNATIVE_PROXY_SERVICE_URL - value: http://kourier-internal - image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" + - c0 + image: "ghcr.io/flyteorg/flyte-connectors:py3.13-2.0.0b50.dev3-g695bb1db3.d20260122" imagePullPolicy: "IfNotPresent" - name: sync-cluster-resources + name: flyteconnector + readinessProbe: + failureThreshold: 3 + httpGet: + path: / + port: 9090 + scheme: HTTP + initialDelaySeconds: 5 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 5 + env: + - name: AIRFLOW_CONN_GOOGLE_CLOUD_DEFAULT + value: "google-cloud-platform://" + - name: LOG_LEVEL + value: "20" + ports: + - containerPort: 8000 + name: grpc + - containerPort: 9090 + name: metric resources: limits: - cpu: "1" - memory: 500Mi + cpu: "1.5" + ephemeral-storage: 100Mi + memory: 1500Mi requests: - cpu: 500m - memory: 100Mi - volumeMounts: - - name: auth - mountPath: /etc/union/secret - - name: resource-templates - mountPath: /etc/flyte/clusterresource/templates - - name: config-volume - mountPath: /etc/flyte/config - ports: - - name: debug - containerPort: 10254 - protocol: TCP - serviceAccountName: union-clustersync-system - volumes: - - configMap: - name: union-clusterresource-template - name: resource-templates - - configMap: - name: union-clusterresourcesync-config - name: config-volume - - name: auth - secret: - secretName: union-secret-auth + cpu: "1" + ephemeral-storage: 100Mi + memory: 1000Mi + serviceAccountName: flyteconnector --- # 
Source: dataplane/templates/imagebuilder/deployment.yaml apiVersion: apps/v1 @@ -5353,17 +5764,15 @@ spec: app.kubernetes.io/instance: release-name template: metadata: - annotations: - container.apparmor.security.beta.kubernetes.io/buildkit: unconfined labels: platform.union.ai/zone: "dataplane" app.kubernetes.io/name: imagebuilder-buildkit app.kubernetes.io/instance: release-name spec: - serviceAccountName: "union-imagebuilder" + serviceAccountName: "union-system" containers: - name: "buildkit" - image: "docker.io/moby/buildkit:buildx-stable-1-rootless" + image: "docker.io/moby/buildkit:buildx-stable-1" imagePullPolicy: IfNotPresent env: - name: POD_NAME @@ -5398,18 +5807,15 @@ spec: - name: KNATIVE_PROXY_SERVICE_URL value: http://kourier-internal volumeMounts: - - mountPath: /home/user/.local/share/buildkit - name: buildkitd - mountPath: /etc/buildkit name: buildkit-config args: - --config - /etc/buildkit/buildkitd.toml - --addr - - unix:///run/user/1000/buildkit/buildkitd.sock + - unix:///run/buildkit/buildkitd.sock - --addr - tcp://0.0.0.0:1234 - - --oci-worker-no-process-sandbox ports: - name: tcp containerPort: 1234 @@ -5431,18 +5837,13 @@ spec: initialDelaySeconds: 5 periodSeconds: 30 securityContext: - seccompProfile: # Needs Kubernetes >= 1.19 - type: Unconfined - runAsUser: 1000 - runAsGroup: 1000 + privileged: true resources: requests: - cpu: 1 - ephemeral-storage: 20Gi - memory: 1Gi + cpu: 4 + ephemeral-storage: 50Gi + memory: 4Gi volumes: - - name: buildkitd - emptyDir: {} - configMap: name: union-operator-buildkit name: buildkit-config @@ -5471,16 +5872,18 @@ spec: template: metadata: annotations: - configChecksum: "ccc125c725a6f943c9c1d07ac09b644fd70df133dc169446431878603c5ac76" + configChecksum: "6dae4c4d3fa72268a682f8c18bb4d25980489f85ce450122568726253e508b7" labels: platform.union.ai/zone: "dataplane" app: executor + app.kubernetes.io/instance: 'release-name' + app.kubernetes.io/name: executor spec: securityContext: fsGroup: 1337 - 
serviceAccountName: executor + serviceAccountName: union-system volumes: - name: config-volume configMap: @@ -5496,7 +5899,7 @@ spec: image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" imagePullPolicy: IfNotPresent command: - - executor + - executorv2 - serve - --config - /etc/config/*.yaml @@ -5574,7 +5977,7 @@ spec: template: metadata: annotations: - configChecksum: "12ef062dcf32a3ad1014f5e1e803e2672bd3a1c636ae04bb803571712ec200f" + configChecksum: "a354659ec6780321ee929320da221164a5a26a636d8d421e4b9418e542b31fd" labels: @@ -5589,12 +5992,10 @@ spec: sources: - configMap: name: union-operator - - configMap: - name: union-clusterresourcesync-config - name: secret-volume secret: secretName: union-secret-auth - serviceAccountName: proxy-system + serviceAccountName: union-system securityContext: {} containers: @@ -5711,7 +6112,7 @@ spec: template: metadata: annotations: - configChecksum: "12ef062dcf32a3ad1014f5e1e803e2672bd3a1c636ae04bb803571712ec200f" + configChecksum: "a354659ec6780321ee929320da221164a5a26a636d8d421e4b9418e542b31fd" labels: @@ -5720,7 +6121,7 @@ spec: platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm spec: - serviceAccountName: operator-system + serviceAccountName: union-system securityContext: {} volumes: @@ -5801,81 +6202,16 @@ spec: containerPort: 10254 protocol: TCP --- -# Source: dataplane/templates/prometheus/deployment.yaml -apiVersion: apps/v1 -kind: Deployment -metadata: - name: union-operator-prometheus - namespace: union - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus -spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/component: prometheus - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - template: - metadata: - annotations: - 
configChecksum: "72a39ca838f02fc499675b5708fa77e2a4938e316a99315b59e33ef7f2d31ae" - labels: - app.kubernetes.io/component: prometheus - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - spec: - priorityClassName: system-cluster-critical - serviceAccountName: union-operator-prometheus - securityContext: - fsGroup: 65534 - runAsNonRoot: true - runAsUser: 65534 - fsGroupChangePolicy: OnRootMismatch - containers: - - name: prometheus - image: "prom/prometheus:v3.3.1" - args: - - --config.file=/etc/prometheus/prometheus.yml - - --web.external-url=/prometheus/ - - --web.route-prefix=/prometheus/ - - --storage.tsdb.retention.time=3d - ports: - - name: http - containerPort: 9090 - protocol: TCP - resources: - limits: - cpu: "3" - memory: 3500Mi - requests: - cpu: "1" - memory: 1Gi - securityContext: - allowPrivilegeEscalation: false - terminationMessagePolicy: FallbackToLogsOnError - volumeMounts: - - mountPath: /etc/prometheus - name: prometheus-config - volumes: - - name: prometheus-config - configMap: - name: union-operator-prometheus ---- -# Source: dataplane/templates/propeller/deployment-webhook.yaml +# Source: dataplane/templates/webhook/deployment.yaml +# Webhook deployment # Create the actual deployment apiVersion: apps/v1 kind: Deployment metadata: - name: flytepropeller-webhook + name: union-pod-webhook namespace: union labels: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm @@ -5883,19 +6219,19 @@ spec: replicas: 1 selector: matchLabels: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name template: metadata: labels: platform.union.ai/zone: "dataplane" - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name 
platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm annotations: - configChecksum: "d883634d70922473a7c6b721668e7a46bbd8bcab808203581628e616d007390" + configChecksum: "77958aee0464eee4cb43c70036cf5ffa89f47f58c908845b9eeaecc7e32bba4" spec: securityContext: @@ -5903,64 +6239,8 @@ spec: fsGroupChangePolicy: Always runAsNonRoot: true runAsUser: 1001 - seLinuxOptions: - type: spc_t - serviceAccountName: flytepropeller-webhook-system - initContainers: - - name: generate-secrets - image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" - imagePullPolicy: "IfNotPresent" - command: - - flytepropeller - args: - - webhook - - init-certs - - --config - - /etc/flyte/config/*.yaml - env: - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: GOMEMLIMIT - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.memory - - name: GOMAXPROCS - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.cpu - - name: CLUSTER_NAME - valueFrom: - secretKeyRef: - name: operator-cluster-name - key: cluster_name - - name: DEPLOYMENT_NAME - value: operator - - name: PROXY_SERVICE_URL - value: http://union-operator-proxy:8080 - - name: PROMETHEUS_SERVICE_URL - value: http://union-operator-prometheus:80 - - name: KNATIVE_PROXY_SERVICE_URL - value: http://kourier-internal - volumeMounts: - - name: config-volume - mountPath: /etc/flyte/config - resources: - limits: - cpu: 1 - ephemeral-storage: 500Mi - memory: 500Mi - requests: - cpu: 200m - ephemeral-storage: 500Mi - memory: 500Mi + seLinuxOptions: null + serviceAccountName: union-system containers: - name: webhook image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" @@ -6025,107 +6305,62 @@ spec: volumes: - name: config-volume configMap: - name: flyte-propeller-config + name: union-pod-webhook-config - name: webhook-certs secret: - secretName: flyte-pod-webhook + secretName: union-pod-webhook --- 
-# Source: dataplane/templates/propeller/deployment.yaml -apiVersion: apps/v1 -kind: Deployment +# Source: dataplane/templates/flyteconnector/hpa.yaml +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler metadata: - namespace: union - name: flytepropeller + name: flyteconnector labels: - app.kubernetes.io/name: flytepropeller + app.kubernetes.io/name: flyteconnector app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/name: flytepropeller - app.kubernetes.io/instance: release-name - template: - metadata: - annotations: - configChecksum: "d883634d70922473a7c6b721668e7a46bbd8bcab808203581628e616d007390" - - labels: - platform.union.ai/zone: "dataplane" - - - app.kubernetes.io/name: flytepropeller - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm - spec: - priorityClassName: system-cluster-critical - containers: - - command: - - flytepropeller - - --config - - /etc/flyte/config/*.yaml - - --propeller.cluster-id - - union-aws - env: - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: GOMEMLIMIT - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.memory - - name: GOMAXPROCS - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.cpu - - name: CLUSTER_NAME - valueFrom: - secretKeyRef: - name: operator-cluster-name - key: cluster_name - - name: DEPLOYMENT_NAME - value: operator - - name: PROXY_SERVICE_URL - value: http://union-operator-proxy:8080 - - name: PROMETHEUS_SERVICE_URL - value: http://union-operator-prometheus:80 - - name: KNATIVE_PROXY_SERVICE_URL - value: http://kourier-internal - image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" - imagePullPolicy: "IfNotPresent" - name: flytepropeller - ports: - - 
containerPort: 10254 - resources: - limits: - cpu: "3" - memory: 3Gi - requests: - cpu: "1" - memory: 1Gi - volumeMounts: - - name: config-volume - mountPath: /etc/flyte/config - - name: auth - mountPath: /etc/union/secret - serviceAccountName: flytepropeller-system - volumes: - - configMap: - name: flyte-propeller-config - name: config-volume - - name: auth - secret: - secretName: union-secret-auth + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: flyteconnector + minReplicas: 2 + maxReplicas: 5 + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 80 + - type: Resource + resource: + name: memory + target: + type: Utilization + averageUtilization: 80 +--- +# Source: dataplane/templates/common/ingress.yaml +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: release-name-dataplane-serving + namespace: union + labels: + app.kubernetes.io/name: dataplane + app.kubernetes.io/instance: release-name +spec: + rules: + - http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: kourier-internal + port: + number: 80 + host: "*.apps." 
--- # Source: dataplane/templates/common/ingress.yaml apiVersion: networking.k8s.io/v1 @@ -6225,6 +6460,267 @@ spec: port: number: 8080 --- +# Source: dataplane/templates/webhook/mutatingwebhookconfiguration.yaml +apiVersion: admissionregistration.k8s.io/v1 +kind: MutatingWebhookConfiguration +metadata: + name: union-pod-webhook-union + labels: + app.kubernetes.io/name: union-pod-webhook + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +webhooks: + - name: union-pod-webhook.flyte.org + admissionReviewVersions: + - v1 + - v1beta1 + clientConfig: + caBundle: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUROekNDQWgrZ0F3SUJBZ0lVSm9kQ09lem4vd1BHQTZjYkw1U3duSlpYT2hJd0RRWUpLb1pJaHZjTkFRRUwKQlFBd0t6RXBNQ2NHQTFVRUF3d2dabXg1ZEdWd2NtOXdaV3hzWlhJdGQyVmlhRzl2YXk1MWJtbHZiaTV6ZG1NdwpIaGNOTWpZd01USXpNREl6TmpVMVdoY05Nell3TVRJeE1ESXpOalUxV2pBck1Ta3dKd1lEVlFRRERDQm1iSGwwClpYQnliM0JsYkd4bGNpMTNaV0pvYjI5ckxuVnVhVzl1TG5OMll6Q0NBU0l3RFFZSktvWklodmNOQVFFQkJRQUQKZ2dFUEFEQ0NBUW9DZ2dFQkFJcjBSWDRGRjFNVTNYdFBmUHErMDUrMGI1Qm0xK3hsTVR5ZUp4bG5mWFJuRlgyegpPcHV6NmsreUpBQk4xRkxmNDVYRnlJclFlaW5DWWJtckZXalFRc3ZDWEloNkpPNGMwdmRMVVU5RERYcldWRFlsCkRNZytTUDlJVnZBMm5USkVFZ2RFREVsKzNWeThUbUJoNlNmcGthcXdHajY5SmhIdy9OeE9yZzM2bUY3TU5VZ1MKWXJVUkxqUFFzTUN6NElTVzhhQnEzS2IwRUFWUTA4V1lZQ0ZEbjY1aHBzVFdxR05rL3BKRnVTYjlsZll6RVdyYwpJeW5wNDhqTnIvNUdKS3NnWG9LeTUwZVo4UkppeTVyOUhENGtnSW15TVNQR2RDMXFZSVppRHRlZW8yOElxNk5nCmd1dWcyTGhlNndYYW9YZXFCQ2d3SHpwWFVXM1hJVzI5eFoxUkFRRUNBd0VBQWFOVE1GRXdIUVlEVlIwT0JCWUUKRktITDQ1RWhQaVg5bnlRN241QUk0YUZVNFlCL01COEdBMVVkSXdRWU1CYUFGS0hMNDVFaFBpWDlueVE3bjVBSQo0YUZVNFlCL01BOEdBMVVkRXdFQi93UUZNQU1CQWY4d0RRWUpLb1pJaHZjTkFRRUxCUUFEZ2dFQkFEeTRac21XCkNjZ3B5TmI4cU1ydzNYclBNejZLMTRlTVhqQVk1bERFYXM1NklLZExoZFM1bi9sWk95bENuUUtOZ3RHS0VrY3UKT05tMEE2Z0xNT3UzcE43dG9aNXh0WktOcnUvS0x6dm44QUNNNndCbUR0bG5oRERDa3ZUTHBVRzh6UTZSNmlaegpzTWlhVmpLSUJiWkJsZXNYSHZZQnQ2YlRTN2o1QWM2aTdYc1RPK3ZqQ2RUQjVYdCtEZ0x2QnBJRWh2ZGJzNXhmClFtc3ZzQmwxenJnR3JXem
o4d0txcXNUMzZaOVU1TzdSK1RJMlQrUDVOR25SeHNQOGJ0R0RtMGZ5SW55WlJrSkcKVTdpVFJyMlZRWlRyaTNZQ2dlVjN1UjFtMnJjc2hyU1Avb082T0QyTXR5Q1FMRWtFK1pxVUVJVnpSYmVaNXl2RQptM29YMFV2OUtBUy9pWTA9Ci0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K + service: + name: union-pod-webhook + namespace: union + path: /mutate--v1-pod/secrets + port: 443 + failurePolicy: Fail + matchPolicy: Equivalent + namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: union + objectSelector: + matchLabels: + inject-flyte-secrets: "true" + organization: 'union' + reinvocationPolicy: Never + rules: + - apiGroups: + - '*' + apiVersions: + - v1 + operations: + - CREATE + resources: + - pods + scope: '*' + sideEffects: NoneOnDryRun + timeoutSeconds: 30 +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Imported via https://github.com/knative/operator/releases/download/knative-v1.16.0/operator.yaml +# Copyright 2024 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +# Source: dataplane/templates/propeller/serviceaccount.yaml +--- +--- +# Source: dataplane/templates/serving/knative-serving.yaml +apiVersion: operator.knative.dev/v1beta1 +kind: KnativeServing +metadata: + name: union-operator-serving + namespace: union + labels: + helm.sh/chart: dataplane-2026.4.4 + app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: serving +spec: + config: + deployment: + progress-deadline: "30m" + queue-sidecar-cpu-request: "25m" + queue-sidecar-cpu-limit: "1000m" + queue-sidecar-memory-request: "400Mi" + queue-sidecar-memory-limit: "800Mi" + queue-sidecar-ephemeral-storage-request: "512Mi" + queue-sidecar-ephemeral-storage-limit: "1024Mi" + registries-skipping-tag-resolving: managed.cr.union.ai + features: + kubernetes.podspec-affinity: "enabled" + kubernetes.podspec-nodeselector: "enabled" + kubernetes.podspec-tolerations: "enabled" + kubernetes.podspec-fieldref: "enabled" + kubernetes.podspec-dnspolicy: "enabled" + kubernetes.podspec-schedulername: "enabled" + kubernetes.podspec-securitycontext: "enabled" + network: + ingress-class: "kourier.ingress.networking.knative.dev" + high-availability: + replicas: 2 + ingress: + kourier: + enabled: true + bootstrap-configmap: "union-operator-serving-envoy-bootstrap" + service-type: ClusterIP + podDisruptionBudgets: + - name: 3scale-kourier-gateway-pdb + minAvailable: 50% + - name: activator-pdb + minAvailable: 50% + - name: webhook-pdb + minAvailable: 50% + registry: + override: + # TODO(jeev): Wire up Union fork of Envoy + 3scale-kourier-gateway/kourier-gateway: ghcr.io/unionai/envoy:456fed84d4ad9a9dfb186d117d9362e9dc0f7c1f + # TODO(jeev): Wire up Union fork of Kourier + net-kourier-controller/controller: ghcr.io/unionai/kourier@sha256:5804c348d15b3959604e3e3ceed216c3a1c7b32cbe254c7d3eb02a35e62ba9c4 + workloads: + - name: 3scale-kourier-gateway + 
labels: + helm.sh/chart: dataplane-2026.4.4 + app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: 3scale-kourier-gateway + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchLabels: + app.kubernetes.io/name: 3scale-kourier-gateway + app.kubernetes.io/instance: release-name + topologyKey: topology.kubernetes.io/zone + annotations: + checksum/bootstrap-config: 6469f6f592eebc9e0c676d8ccc359a05bc799c0988e474b2da942c4cc328f656 + env: + - container: kourier-gateway + envVars: + - name: UNION_AUTHZ_TENANTAUTHURL + value: "https://union.us-west-2.union.ai/me" + - name: UNION_AUTHZ_TENANTAUTHSIGNINURL + value: "https://union.us-west-2.union.ai/login" + - name: UNION_AUTHZ_TENANTCONTROLPLANEURL + value: "https://union.us-west-2.union.ai" + resources: + - container: kourier-gateway + limits: + cpu: "2" + memory: 2Gi + requests: + cpu: "1" + memory: 1Gi + - name: activator + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - activator + topologyKey: topology.kubernetes.io/zone + - name: autoscaler + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - autoscaler + topologyKey: topology.kubernetes.io/zone + - name: autoscaler-hpa + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - autoscaler-hpa + topologyKey: topology.kubernetes.io/zone + - name: controller + affinity: + podAntiAffinity: + 
preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - controller + topologyKey: topology.kubernetes.io/zone + - name: net-kourier-controller + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - net-kourier-controller + topologyKey: topology.kubernetes.io/zone + env: + - container: controller + envVars: + - name: KOURIER_UNION_AUTHZ_ENABLED + value: "true" + resources: + - container: controller + limits: + cpu: "1" + memory: 1Gi + requests: + cpu: 500m + memory: 500Mi + - name: webhook + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - webhook + topologyKey: topology.kubernetes.io/zone +--- +# Source: dataplane/templates/common/task-podtemplate.yaml +apiVersion: v1 +kind: PodTemplate +metadata: + name: task-template + namespace: union +template: + spec: + serviceAccountName: union + containers: + - name: default + image: docker.io/rwgrim/docker-noop +--- # Source: dataplane/charts/fluentbit/templates/tests/test-connection.yaml apiVersion: v1 kind: Pod diff --git a/tests/generated/dataplane.aws.yaml b/tests/generated/dataplane.aws.yaml index 245ca514..0b0c2369 100644 --- a/tests/generated/dataplane.aws.yaml +++ b/tests/generated/dataplane.aws.yaml @@ -1,40 +1,4 @@ --- -# Source: dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: flytesnacks-development ---- -# Source: dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: flytesnacks-staging ---- -# Source: dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: flytesnacks-production ---- -# Source: 
dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: union-health-monitoring-development ---- -# Source: dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: union-health-monitoring-staging ---- -# Source: dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: union-health-monitoring-production ---- # Source: dataplane/charts/dcgm-exporter/templates/serviceaccount.yaml # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. # @@ -64,133 +28,142 @@ metadata: app.kubernetes.io/managed-by: Helm automountServiceAccountToken: false --- -# Source: dataplane/charts/fluentbit/templates/serviceaccount.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ apiVersion: v1 kind: ServiceAccount metadata: - name: fluentbit-system - namespace: union + name: knative-operator + namespace: "union" labels: - helm.sh/chart: fluentbit-0.48.9 - app.kubernetes.io/name: fluentbit - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "3.2.8" - app.kubernetes.io/managed-by: Helm + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: operator-webhook + namespace: "union" + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator --- -# Source: dataplane/charts/kube-state-metrics/templates/serviceaccount.yaml +# Source: dataplane/charts/prometheus/charts/kube-state-metrics/templates/serviceaccount.yaml apiVersion: v1 kind: ServiceAccount automountServiceAccountToken: true metadata: labels: - helm.sh/chart: kube-state-metrics-5.30.1 + helm.sh/chart: kube-state-metrics-5.25.1 app.kubernetes.io/managed-by: Helm app.kubernetes.io/component: metrics app.kubernetes.io/part-of: kube-state-metrics app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" + app.kubernetes.io/version: "2.13.0" name: release-name-kube-state-metrics namespace: union --- -# Source: dataplane/charts/opencost/templates/serviceaccount.yaml +# Source: dataplane/charts/prometheus/templates/serviceaccount.yaml apiVersion: v1 kind: ServiceAccount metadata: - name: release-name-opencost - namespace: union labels: - helm.sh/chart: opencost-1.42.0 - app.kubernetes.io/name: opencost + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "1.111.0" - app.kubernetes.io/part-of: opencost + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 app.kubernetes.io/managed-by: Helm -automountServiceAccountToken: true ---- -# Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: union-clustersync-system + app.kubernetes.io/part-of: prometheus + name: union-operator-prometheus namespace: union annotations: - eks.amazonaws.com/role-arn: 'test-backend-iam-role-arn' ---- -# Source: dataplane/templates/imagebuilder/serviceaccount.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: union-imagebuilder + {} --- -# 
Source: dataplane/templates/nodeexecutor/serviceaccount.yaml +# Source: dataplane/templates/common/system-serviceaccount.yaml apiVersion: v1 kind: ServiceAccount metadata: - name: executor + name: union-system namespace: union - labels: - app: executor - annotations: - eks.amazonaws.com/role-arn: 'test-backend-iam-role-arn' ---- -# Source: dataplane/templates/operator/serviceaccount-proxy.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: proxy-system - labels: - app.kubernetes.io/name: operator-proxy - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm annotations: eks.amazonaws.com/role-arn: 'test-backend-iam-role-arn' --- -# Source: dataplane/templates/operator/serviceaccount.yaml +# Source: dataplane/templates/common/union-serviceaccount.yaml apiVersion: v1 kind: ServiceAccount metadata: - name: operator-system - labels: - app.kubernetes.io/name: union-operator - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm + name: union + namespace: union annotations: - eks.amazonaws.com/role-arn: 'test-backend-iam-role-arn' + eks.amazonaws.com/role-arn: test-worker-iam-role-arn +automountServiceAccountToken: true --- -# Source: dataplane/templates/prometheus/serviceaccount.yaml +# Source: dataplane/templates/flyteconnector/serviceaccount.yaml apiVersion: v1 kind: ServiceAccount metadata: - name: union-operator-prometheus + name: flyteconnector namespace: union - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane + labels: + app.kubernetes.io/name: flyteconnector app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus ---- -# Source: dataplane/templates/propeller/serviceaccount-webhook.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: 
flytepropeller-webhook-system - namespace: union - annotations: - eks.amazonaws.com/role-arn: 'test-backend-iam-role-arn' --- -# Source: dataplane/templates/propeller/serviceaccount.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + apiVersion: v1 -kind: ServiceAccount +kind: Secret metadata: - name: flytepropeller-system - namespace: union - annotations: - eks.amazonaws.com/role-arn: 'test-backend-iam-role-arn' + name: operator-webhook-certs + namespace: "union" + labels: + app.kubernetes.io/component: operator-webhook + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +# The data is populated at install time. 
--- # Source: dataplane/templates/common/auth-secret.yaml apiVersion: v1 @@ -213,14 +186,22 @@ type: Opaque data: cluster_name: dGVzdC1jbHVzdGVyLW5hbWU= --- -# Source: dataplane/templates/propeller/deployment-webhook.yaml -# Create an empty secret that the first propeller pod will populate +# Source: dataplane/templates/webhook/mutatingwebhookconfiguration.yaml apiVersion: v1 kind: Secret metadata: - name: flyte-pod-webhook + name: union-pod-webhook namespace: union + labels: + app.kubernetes.io/name: union-pod-webhook + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm type: Opaque +data: + ca.crt: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUROekNDQWgrZ0F3SUJBZ0lVSm9kQ09lem4vd1BHQTZjYkw1U3duSlpYT2hJd0RRWUpLb1pJaHZjTkFRRUwKQlFBd0t6RXBNQ2NHQTFVRUF3d2dabXg1ZEdWd2NtOXdaV3hzWlhJdGQyVmlhRzl2YXk1MWJtbHZiaTV6ZG1NdwpIaGNOTWpZd01USXpNREl6TmpVMVdoY05Nell3TVRJeE1ESXpOalUxV2pBck1Ta3dKd1lEVlFRRERDQm1iSGwwClpYQnliM0JsYkd4bGNpMTNaV0pvYjI5ckxuVnVhVzl1TG5OMll6Q0NBU0l3RFFZSktvWklodmNOQVFFQkJRQUQKZ2dFUEFEQ0NBUW9DZ2dFQkFJcjBSWDRGRjFNVTNYdFBmUHErMDUrMGI1Qm0xK3hsTVR5ZUp4bG5mWFJuRlgyegpPcHV6NmsreUpBQk4xRkxmNDVYRnlJclFlaW5DWWJtckZXalFRc3ZDWEloNkpPNGMwdmRMVVU5RERYcldWRFlsCkRNZytTUDlJVnZBMm5USkVFZ2RFREVsKzNWeThUbUJoNlNmcGthcXdHajY5SmhIdy9OeE9yZzM2bUY3TU5VZ1MKWXJVUkxqUFFzTUN6NElTVzhhQnEzS2IwRUFWUTA4V1lZQ0ZEbjY1aHBzVFdxR05rL3BKRnVTYjlsZll6RVdyYwpJeW5wNDhqTnIvNUdKS3NnWG9LeTUwZVo4UkppeTVyOUhENGtnSW15TVNQR2RDMXFZSVppRHRlZW8yOElxNk5nCmd1dWcyTGhlNndYYW9YZXFCQ2d3SHpwWFVXM1hJVzI5eFoxUkFRRUNBd0VBQWFOVE1GRXdIUVlEVlIwT0JCWUUKRktITDQ1RWhQaVg5bnlRN241QUk0YUZVNFlCL01COEdBMVVkSXdRWU1CYUFGS0hMNDVFaFBpWDlueVE3bjVBSQo0YUZVNFlCL01BOEdBMVVkRXdFQi93UUZNQU1CQWY4d0RRWUpLb1pJaHZjTkFRRUxCUUFEZ2dFQkFEeTRac21XCkNjZ3B5TmI4cU1ydzNYclBNejZLMTRlTVhqQVk1bERFYXM1NklLZExoZFM1bi9sWk95bENuUUtOZ3RHS0VrY3UKT05tMEE2Z0xNT3UzcE43dG9aNXh0WktOcnUvS0x6dm44QUNNNndCbUR0bG5oRERDa3ZUTHBVRzh6UTZSNmlaegpzTWlhVmpLSUJiWkJsZXNYSHZZQnQ2YlRTN2o1QWM2aTdYc1RPK3ZqQ2RUQjVYdCtEZ
0x2QnBJRWh2ZGJzNXhmClFtc3ZzQmwxenJnR3JXemo4d0txcXNUMzZaOVU1TzdSK1RJMlQrUDVOR25SeHNQOGJ0R0RtMGZ5SW55WlJrSkcKVTdpVFJyMlZRWlRyaTNZQ2dlVjN1UjFtMnJjc2hyU1Avb082T0QyTXR5Q1FMRWtFK1pxVUVJVnpSYmVaNXl2RQptM29YMFV2OUtBUy9pWTA9Ci0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K + tls.crt: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUQxakNDQXI2Z0F3SUJBZ0lVREtRN3A4MFpIdkQyUE5Ya3p3VGRLem9iUjg4d0RRWUpLb1pJaHZjTkFRRUwKQlFBd0t6RXBNQ2NHQTFVRUF3d2dabXg1ZEdWd2NtOXdaV3hzWlhJdGQyVmlhRzl2YXk1MWJtbHZiaTV6ZG1NdwpIaGNOTWpZd01USXpNREl6TmpVMVdoY05NamN3TVRJek1ESXpOalUxV2pBck1Ta3dKd1lEVlFRRERDQm1iSGwwClpYQnliM0JsYkd4bGNpMTNaV0pvYjI5ckxuVnVhVzl1TG5OMll6Q0NBU0l3RFFZSktvWklodmNOQVFFQkJRQUQKZ2dFUEFEQ0NBUW9DZ2dFQkFMNnMxUTRaZ3ZxVVdNdFp5VC9EKytEQWl6WmR0Q1lkVzArSGk4S3phQ1Vjbi9zZwo3a21NTEhDM2ZlZWtZK2ZUNENHUTd6K1l2WjRWWSt6WGljKzdrNndLbUZqdm1acEM1Rm96OWo1Q21NRk5HZmlWCnRobzVvSU9MUkIzNXRYMU1ROHUwNFVzT29QcjdMK016eEdxWXVmTFdtN1NVdjh0eEp3ZFMxZVlzMllNV1pMeU4KekI4akdWY01Qc2ZNVkdxdFhTTVljQ3hNYURKaTM1N0VEZSs3clM2U2ZtbnFtYjJxekdQM1BRRStCUm91Yk5nLwo4NCt4c253VXBLZUFpem95dFM2c2ZlOTdMUEtRTW9oVzd2TjZORFJrLy9xSmpHem9iZmZwUHU3endhWHVXMWtMCmllc2g2cVRrdlZLSTF3WFJ2WHArVU5GbjZzdDIwdEVlRks0ZzMwOENBd0VBQWFPQjhUQ0I3akFKQmdOVkhSTUUKQWpBQU1Bc0dBMVVkRHdRRUF3SUY0RENCa3dZRFZSMFJCSUdMTUlHSWdoWm1iSGwwWlhCeWIzQmxiR3hsY2kxMwpaV0pvYjI5cmdoeG1iSGwwWlhCeWIzQmxiR3hsY2kxM1pXSm9iMjlyTG5WdWFXOXVnaUJtYkhsMFpYQnliM0JsCmJHeGxjaTEzWldKb2IyOXJMblZ1YVc5dUxuTjJZNEl1Wm14NWRHVndjbTl3Wld4c1pYSXRkMlZpYUc5dmF5NTEKYm1sdmJpNXpkbU11WTJ4MWMzUmxjaTVzYjJOaGJEQWRCZ05WSFE0RUZnUVV1d25wcDZ1MTNKL0NsNnI3eDZ1Sgo5YmZ1RDhzd0h3WURWUjBqQkJnd0ZvQVVvY3Zqa1NFK0pmMmZKRHVma0FqaG9WVGhnSDh3RFFZSktvWklodmNOCkFRRUxCUUFEZ2dFQkFFMmlOa1MzczhGT3Z0cDhrVmJNZUN2amxIQytvYTVlZTFGNmowNlZ3bFJPOWdxOXJla1UKMGptWUhNWGNjWjcrZXJkNFRQRXgwK3ZqMzNhRTV3cDFmdVJJL2xIZHhuOUpJRHg2bTFQTHdVa3p1WG9EekYxLwozVXVIWU1xOGkrT2xsY0tqbmdVMUFUNmRWRCtIbFVWeHpMdjlNazdPb05SQUY5dTJHa1hiUHdCN0I1ZGRxdWxlCmtNcXBxbDhES2s4SVNVelVsTjNBb2Q5NmJUelIyV3RCVkxpTi9zRHZtTDZwZThFYm55RnBUb1BZTDlGeWZOcmIKT1ZPZEJwUkZFTG1KOHZaZWY5d1lxRmVXMTE3SGV5VDArZkxGZVNSe
mZUNVVWUlVraVFBcUVJdUhHUVRrOVFabgo1WnYvNUF0eEg3WE1rcXR0VE1zMFlSUG0xYjQzY3FBeFdhND0KLS0tLS1FTkQgQ0VSVElGSUNBVEUtLS0tLQo= + tls.key: LS0tLS1CRUdJTiBQUklWQVRFIEtFWS0tLS0tCk1JSUV2UUlCQURBTkJna3Foa2lHOXcwQkFRRUZBQVNDQktjd2dnU2pBZ0VBQW9JQkFRQytyTlVPR1lMNmxGakwKV2NrL3cvdmd3SXMyWGJRbUhWdFBoNHZDczJnbEhKLzdJTzVKakN4d3QzM25wR1BuMCtBaGtPOC9tTDJlRldQcwoxNG5QdTVPc0NwaFk3NW1hUXVSYU0vWStRcGpCVFJuNGxiWWFPYUNEaTBRZCtiVjlURVBMdE9GTERxRDYreS9qCk04UnFtTG55MXB1MGxML0xjU2NIVXRYbUxObURGbVM4amN3Zkl4bFhERDdIekZScXJWMGpHSEFzVEdneVl0K2UKeEEzdnU2MHVrbjVwNnBtOXFzeGo5ejBCUGdVYUxtellQL09Qc2JKOEZLU25nSXM2TXJVdXJIM3ZleXp5a0RLSQpWdTd6ZWpRMFpQLzZpWXhzNkczMzZUN3U4OEdsN2x0WkM0bnJJZXFrNUwxU2lOY0YwYjE2ZmxEUlorckxkdExSCkhoU3VJTjlQQWdNQkFBRUNnZ0VBRjV2SnMzbTRMSE9DdlFic2NwOC9DQmgvQkNlOC92MGlpYW5VUmJLMkFlWlYKN3Arb3NXV1FPUktYSGIvT2VPMVVjb09DQkFOUzh3aGQrM3pDZlB5U0w3cU9HM0RyT1Z6djdqVkIxM3FpZEVpcQpId3ZXWk0vZXpuckhYOWpEdm5SYmJwVUNVaXRKQmxwa2x3S1pYc056UHB5UTRkNkxFdEw3VEo1V2lxM2g3cWQzCjhncmJoeWlWNDhZRFAyYkhpZGt0S2QvMHBNL3piSzZPTS9iMFpLUmRBWVpHQXlOcklMaFJ3eDNsQXE2SDVqL20KUHBuNlZEdVQ3Y1h6UFhmNTgxR0IyNjQxWFJVTXRNWGVEREtaU3JudnIrbG5tV2lsYXZmNkg3ZHp4NlRBMmk0RApybCtIRVdUN1RSQUZGMnZ5SVUrYSt5RzNPVFBKRCtmVTdCWWh4Zld4aFFLQmdRRGVvcXVCbzQxK1dKU255blZnCmNWMHZoUEkwZ0dyRGQzNFdIbCtGNTZuSmxuMVVlbUQ2SUQ0TUhnNVpHWmNmd3NkRlB0aHl1VnB0OVl1SW11SGQKRHE5ZDhpWnpHUzJVbStkSWlsWjhnZXJ5Q3dON3hIbFM3SzJrc0RmRGJwYmZZSXZ1LzRnSExZRHh3YzQ0OG54ZwpWNGpCUFY2QTdYTHRqQzZzdFE2OW9uTlIwd0tCZ1FEYlFBUTRGdWpnUjJUOXh2Y01MdXhMbm1YaHE4U2NoZlM5CkRoWndDUzZIeHB1YlBxblBjQ04vQUZjbDZ1cDRXVEVZdVY4bEpwcGwwRmpYQ2lMdkhuUU15cm5vRjlUOThpanQKZGVHYjF6ZWlwRUpndFpNVHkvOGdERWorZU1wSFBuYTZwQVpMOVpRVUdLR2dDNGQ0SzljdVNHMmRlb1dSU0ViSQpMVGgzeGpXVEZRS0JnQStJRWFidG5nVmVjS0IwQTFSREZGa29VUzFRZUNKQ3g4MExPV2JDRHBvOW9XaXZVT3lpCkt6SDFOdE1JY2Y2SlBCV2NtTVVJSVVMaWltVnhTS2gvU2NTb0MvNmpsd1p6Q2VPSm94YjBpVXR4Y1VERktDR2MKMlZCUDZ0UDdkeE1HVFR6VEhzNUJZbWw3TjhQSlJ0d0J2MHliMTJmdktNRmhzaS9pUWJFQkVFSjVBb0dBUm9NSApHRmJkM0V0NXdsZzcyYkk1a25SRnhkY3RLejIxb1J1bndhNWlSWTV0T3Zkak8zQ1FLZWNkSC9lMklyQmtwdFB2Ck1vNkF0MS9UU
W8xakFNNGxlbnUwWUYxUnhiNGN4WW5VM2Y3UVNNRlZDNjg1dHZNemdNWVNyNngvT1h4d1NNUTUKdGpJcnhtN0poQ1JSRkNmZFUyZzl3SmpIM2hxRmtSbGlBTHRCUGFVQ2dZRUF2QkRHejQvYS9kRzRJT0VpREk4SgpBbkxaRWZaTFJrdnI0ejRyT0p0TGZtWlFjTW5DSUZmKzBaelhTME9XOWhvMjk5RUNtS1l4U2VUWk1WSDgzOFZCCnRvSkNGM3lzb01rSC9meEVXTUpicDlMblJwSFUzUElicHlhRG1nWW1YdmsvK3laL3Y1c0dDTWNtVGkzWVFLRGsKcStlK1o0SHZyaGUxVytUaHpwOCtoNGs9Ci0tLS0tRU5EIFBSSVZBVEUgS0VZLS0tLS0K --- # Source: dataplane/charts/dcgm-exporter/templates/metrics-configmap.yaml apiVersion: v1 @@ -316,145 +297,516 @@ data: DCGM_FI_PROF_PCIE_TX_BYTES, counter, The number of bytes of active pcie tx data including both header and payload. DCGM_FI_PROF_PCIE_RX_BYTES, counter, The number of bytes of active pcie rx data including both header and payload. --- -# Source: dataplane/templates/clusterresourcesync/configmap.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2019 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ apiVersion: v1 kind: ConfigMap metadata: - name: union-clusterresourcesync-config - namespace: union + name: config-logging + namespace: "union" labels: - app.kubernetes.io/name: clusterresourcesync - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm -data: - cluster_resources.yaml: | - cluster_resources: - clusterName: 'test-cluster-name' - customData: - - production: - - projectQuotaCpu: - value: "4096" - - projectQuotaMemory: - value: 2Ti - - projectQuotaNvidiaGpu: - value: "256" - - defaultUserRoleKey: - value: 'eks.amazonaws.com/role-arn' - - defaultUserRoleValue: - value: 'test-worker-iam-role-arn' - - staging: - - projectQuotaCpu: - value: "4096" - - projectQuotaMemory: - value: 2Ti - - projectQuotaNvidiaGpu: - value: "256" - - defaultUserRoleKey: - value: 'eks.amazonaws.com/role-arn' - - defaultUserRoleValue: - value: 'test-worker-iam-role-arn' - - development: - - projectQuotaCpu: - value: "4096" - - projectQuotaMemory: - value: 2Ti - - projectQuotaNvidiaGpu: - value: "256" - - defaultUserRoleKey: - value: 'eks.amazonaws.com/role-arn' - - defaultUserRoleValue: - value: 'test-worker-iam-role-arn' - refreshInterval: 5m - standaloneDeployment: true - templatePath: /etc/flyte/clusterresource/templates - clusterResourcesPrivate: - app: - isServerless: false - union: - auth: - authorizationMetadataKey: flyte-authorization - clientId: 'test-client-id' - clientSecretLocation: /etc/union/secret/client_secret - tokenRefreshWindow: 5m - type: ClientSecret - connection: - host: dns:///test-controlplane-host - admin.yaml: | - admin: - clientId: 'test-client-id' - clientSecretLocation: /etc/union/secret/client_secret - endpoint: dns:///test-controlplane-host - insecure: false - event: - capacity: 1000 - rate: 500 - type: admin - domain.yaml: | - domains: - - id: development - name: development - - id: staging - name: staging - - id: production - name: production - clusters.yaml: | - clusters: 
- clusterConfigs: [] - labelClusterMap: {} - logger.yaml: | - logger: - formatter: - type: json - level: 4 - show-source: true - namespace_mapping.yaml: | - namespace_mapping: - template: '{{ project }}-{{ domain }}' + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +data: {} +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2019 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v1 +kind: ConfigMap +metadata: + name: config-observability + namespace: "union" + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +data: {} --- -# Source: dataplane/templates/clusterresourcesync/configmap.yaml +# Source: dataplane/charts/prometheus/templates/cm.yaml apiVersion: v1 kind: ConfigMap metadata: - name: union-clusterresource-template - namespace: union labels: - app.kubernetes.io/name: clusterresourcesync + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: prometheus + name: union-operator-prometheus + namespace: union data: - a_namespace.yaml: | - apiVersion: v1 - kind: Namespace - metadata: - name: {{ namespace }} - labels: - union.ai/namespace-type: flyte - spec: - finalizers: - - 
kubernetes - - b_default_service_account.yaml: | - apiVersion: v1 - kind: ServiceAccount - metadata: - name: default - namespace: {{ namespace }} - annotations: - {{ defaultUserRoleKey }}: {{ defaultUserRoleValue }} + allow-snippet-annotations: "false" + alerting_rules.yml: | + {} + alerts: | + {} + prometheus.yml: | + global: + evaluation_interval: 1m + scrape_interval: 1m + scrape_timeout: 10s + rule_files: + - /etc/config/recording_rules.yml + - /etc/config/alerting_rules.yml + - /etc/config/rules + - /etc/config/alerts + scrape_configs: + - job_name: prometheus + static_configs: + - targets: + - localhost:9090 + - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + job_name: kubernetes-apiservers + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - action: keep + regex: default;kubernetes;https + source_labels: + - __meta_kubernetes_namespace + - __meta_kubernetes_service_name + - __meta_kubernetes_endpoint_port_name + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + job_name: kubernetes-nodes + kubernetes_sd_configs: + - role: node + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - replacement: kubernetes.default.svc:443 + target_label: __address__ + - regex: (.+) + replacement: /api/v1/nodes/$1/proxy/metrics + source_labels: + - __meta_kubernetes_node_name + target_label: __metrics_path__ + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + job_name: kubernetes-nodes-cadvisor + kubernetes_sd_configs: + - role: node + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - replacement: kubernetes.default.svc:443 + target_label: __address__ + - regex: (.+) + replacement: 
/api/v1/nodes/$1/proxy/metrics/cadvisor + source_labels: + - __meta_kubernetes_node_name + target_label: __metrics_path__ + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + - honor_labels: true + job_name: kubernetes-service-endpoints + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scrape + - action: drop + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (.+?)(?::\d+)?;(\d+) + replacement: $1:$2 + source_labels: + - __address__ + - __meta_kubernetes_service_annotation_prometheus_io_port + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_service_name + target_label: service + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: node + - honor_labels: true + job_name: kubernetes-service-endpoints-slow + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) 
+ source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (.+?)(?::\d+)?;(\d+) + replacement: $1:$2 + source_labels: + - __address__ + - __meta_kubernetes_service_annotation_prometheus_io_port + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_service_name + target_label: service + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: node + scrape_interval: 5m + scrape_timeout: 30s + - honor_labels: true + job_name: prometheus-pushgateway + kubernetes_sd_configs: + - role: service + relabel_configs: + - action: keep + regex: pushgateway + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_probe + - honor_labels: true + job_name: kubernetes-services + kubernetes_sd_configs: + - role: service + metrics_path: /probe + params: + module: + - http_2xx + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_probe + - source_labels: + - __address__ + target_label: __param_target + - replacement: blackbox + target_label: __address__ + - source_labels: + - __param_target + target_label: instance + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - source_labels: + - __meta_kubernetes_service_name + target_label: service + - honor_labels: true + job_name: kubernetes-pods + kubernetes_sd_configs: + - role: pod + relabel_configs: + - action: 
keep + regex: true + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scrape + - action: drop + regex: true + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) + replacement: '[$2]:$1' + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: replace + regex: (\d+);((([0-9]+?)(\.|$)){4}) + replacement: $2:$1 + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_pod_name + target_label: pod + - action: drop + regex: Pending|Succeeded|Failed|Completed + source_labels: + - __meta_kubernetes_pod_phase + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: node + - honor_labels: true + job_name: kubernetes-pods-slow + kubernetes_sd_configs: + - role: pod + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) 
+ source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) + replacement: '[$2]:$1' + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: replace + regex: (\d+);((([0-9]+?)(\.|$)){4}) + replacement: $2:$1 + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_pod_name + target_label: pod + - action: drop + regex: Pending|Succeeded|Failed|Completed + source_labels: + - __meta_kubernetes_pod_phase + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: node + scrape_interval: 5m + scrape_timeout: 30s + - job_name: 'kube-state-metrics' + metrics_path: /metrics + scrape_interval: 1m + honor_labels: true + kubernetes_sd_configs: + - role: endpoints + namespaces: + own_namespace: true + relabel_configs: + - source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_name] + regex: kube-state-metrics + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + regex: http + action: keep + metric_relabel_configs: + - source_labels: [__name__] + regex: 
"kube_pod_container_resource_(limits|requests)|kube_pod_status_phase|kube_node_(labels|status_allocatable|status_condition|status_capacity)|kube_namespace_labels|kube_pod_container_status_(waiting|terminated|last_terminated).*_reason|kube_daemonset_status_number_unavailable|kube_deployment_status_replicas_unavailable|kube_resourcequota|kube_pod_info|kube_node_info|kube_pod_container_status_restarts_total" + action: keep + - source_labels: [__name__, phase] + separator: ";" + regex: "kube_pod_status_phase;(Succeeded|Failed)" + action: drop + - source_labels: [node] + target_label: nodename + regex: "(.*)" + action: replace + - source_labels: [label_node_group_name] + action: replace + regex: "(.+)" + target_label: label_node_pool_name + - job_name: 'opencost' + metrics_path: /metrics + scrape_interval: 1m + honor_labels: true + kubernetes_sd_configs: + - role: endpoints + namespaces: + own_namespace: true + relabel_configs: + - source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_name] + regex: opencost + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + regex: http + action: keep + metric_relabel_configs: + - source_labels: [__name__] + regex: "kube_node_labels|kube_pod_labels|node_total_hourly_cost|node_ram_hourly_cost|node_cpu_hourly_cost|node_gpu_hourly_cost" + action: keep + - job_name: 'union-services' + metrics_path: /metrics + scrape_interval: 1m + honor_labels: true + kubernetes_sd_configs: + - role: endpoints + namespaces: + own_namespace: true + relabel_configs: + - source_labels: [__meta_kubernetes_service_label_platform_union_ai_service_group] + regex: .+ + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + regex: debug + action: keep + # Flyte propeller metrics for execution info and fast task duration. + # No-op when propeller is not deployed (no pods match the label selector). 
+ - job_name: 'flytepropeller' + kubernetes_sd_configs: + - role: pod + namespaces: + own_namespace: true + selectors: + - role: pod + label: app.kubernetes.io/name=flytepropeller + metric_relabel_configs: + - source_labels: [__name__] + regex: "flyte:propeller:all:round:execution_info|flyte:propeller:all:node:fast_task:fast_task_execution_duration" + action: keep + relabel_configs: + - source_labels: [__meta_kubernetes_pod_container_name] + regex: flytepropeller + action: keep + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: pod + # Kourier gateway (envoy) metrics for app serving. + # No-op when serving is not enabled (no pods match the label selector). + - job_name: 'serving-envoy' + kubernetes_sd_configs: + - role: pod + namespaces: + own_namespace: true + selectors: + - role: pod + label: app=3scale-kourier-gateway + metrics_path: /stats/prometheus + metric_relabel_configs: + - source_labels: [__name__] + regex: "envoy_cluster_upstream_rq_xx|envoy_cluster_upstream_rq_time_bucket" + action: keep + relabel_configs: + - source_labels: [__meta_kubernetes_pod_container_port_name] + regex: metrics + action: keep + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: pod + # DCGM GPU metrics. + # No-op when dcgm-exporter is not deployed (no pods match the label selector). 
+ - job_name: 'gpu-metrics' + kubernetes_sd_configs: + - role: pod + namespaces: + names: + - kube-system + selectors: + - role: pod + label: app.kubernetes.io/name=dcgm-exporter - c_project_resource_quota.yaml: | - apiVersion: v1 - kind: ResourceQuota - metadata: - name: project-quota - namespace: {{ namespace }} - spec: - hard: - limits.cpu: {{ projectQuotaCpu }} - limits.memory: {{ projectQuotaMemory }} - requests.nvidia.com/gpu: {{ projectQuotaNvidiaGpu }} + recording_rules.yml: | + {} + rules: | + {} --- # Source: dataplane/templates/fluent-bit/configmap.yaml apiVersion: v1 @@ -502,11 +854,39 @@ data: region test-region bucket test-metadata-bucket --- -# Source: dataplane/templates/imagebuilder/configmap.yaml +# Source: dataplane/templates/imagebuilder/build-image-configmap.yaml apiVersion: v1 kind: ConfigMap metadata: - name : union-operator-buildkit + name: build-image-config + labels: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +data: + storage.yaml: | + storage: + container: "test-metadata-bucket" + type: s3 + connection: + auth-type: iam + region: test-region + enable-multicontainer: true + limits: + maxDownloadMBs: 1024 + cache: + max_size_mbs: 0 + target_gc_percent: 70 + buildkit-uri: "tcp://union-operator-buildkit.union.svc.cluster.local:1234" + default-repository: ".dkr.ecr.test-region.amazonaws.com/union-dataplane" + authentication-type: "aws" +--- +# Source: dataplane/templates/imagebuilder/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name : union-operator-buildkit data: buildkitd.toml: | debug = false @@ -2658,6 +3038,18 @@ data: "version": 1 } --- +# Source: dataplane/templates/monitoring/prometheusrule.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: union-recording-rules + namespace: union + labels: + release: release-name +data: + recording_rules.yml: | + groups: [] +--- # Source: 
dataplane/templates/nodeexecutor/configmap.yaml apiVersion: v1 kind: ConfigMap @@ -2712,8 +3104,13 @@ data: templateUris: - https://scale.neptune.ai/{{ .taskConfig.project }}/-/run/?customId={{ .taskConfig.id }} - kubernetes-enabled: true + kubernetes-enabled: false enabled_plugins.yaml: | + plugins: + connector-service: + defaultConnector: + defaultServiceConfig: '{"loadBalancingConfig": [{"round_robin":{}}]}' + endpoint: k8s:///flyteconnector.union:8000 tasks: task-plugins: default-for-task-types: @@ -2746,6 +3143,8 @@ data: cpu: 4096 gpu: 256 memory: 2Ti + namespace_mapping: + template: union namespace_mapping: template: '{{ project }}-{{ domain }}' union: @@ -2768,7 +3167,7 @@ data: cache-endpoint: dns:///test-controlplane-host endpoint: dns:///test-controlplane-host insecure: false - type: fallback + type: cacheservicev2 use-admin-auth: true logger: formatter: @@ -2788,6 +3187,7 @@ data: secure: false useAuth: false propeller: + limit-namespace: union node-config: disable-input-file-writes: true plugins: @@ -2795,7 +3195,7 @@ data: additional-worker-args: - --last-ack-grace-period-seconds - "120" - callback-uri: http://unionai-dataplane-executor.union.svc.cluster.local:15605 + callback-uri: http://union-operator-executor.union.svc.cluster.local:15605 grace-period-status-not-found: 2m ioutils: remoteFileOutputPaths: @@ -2805,6 +3205,7 @@ data: default-cpus: 100m default-env-vars: [] default-memory: 100Mi + default-pod-template-name: task-template co-pilot: image: 'cr.flyte.org/flyteorg/flytecopilot:v1.14.1' name: flyte-copilot- @@ -2839,6 +3240,7 @@ data: default-cpus: 100m default-env-vars: [] default-memory: 100Mi + default-pod-template-name: task-template config.yaml: | union: connection: @@ -2858,11 +3260,14 @@ data: operator: enabled: true enableTunnelService: true + # enableDepot: false tunnel: - enableDirectToAppIngress: false + enableDirectToAppIngress: true deploymentToRestart: union-operator-proxy + limitNamespace: union + disableClusterPermissions: 
true apps: - enabled: 'false' + enabled: 'true' syncClusterConfig: enabled: false clusterId: @@ -2877,14 +3282,12 @@ data: userRole: 'test-worker-iam-role-arn' userRoleKey: 'eks.amazonaws.com/role-arn' collectUsages: - enabled: true + enabled: false billing: model: Legacy dependenciesHeartbeat: - prometheus: - endpoint: 'http://union-operator-prometheus:80/-/healthy' - propeller: - endpoint: 'http://flytepropeller:10254' + executor: + endpoint: 'http://union-operator-executor:10254' proxy: endpoint: 'http://union-operator-proxy:10254' secretsWatcher: @@ -2899,9 +3302,6 @@ data: referenceConfigmapName: union-operator targetConfigMapName: "build-image-config" proxy: - imageBuilderConfig: - authenticationType: 'noop' - defaultRepository: '' persistedLogs: objectStore: pathTemplate: namespace-{{.KubernetesNamespace}}.pod-{{.KubernetesPodName}}.cont-{{.KubernetesContainerName}} @@ -2943,999 +3343,149 @@ data: auth-type: iam region: test-region image-builder.buildkit-uri: "tcp://union-operator-buildkit.union.svc.cluster.local:1234" - image-builder.default-repository: "" - image-builder.authentication-type: "noop" + image-builder.default-repository: ".dkr.ecr.test-region.amazonaws.com/union-dataplane" + image-builder.authentication-type: "aws" --- -# Source: dataplane/templates/prometheus/configmap.yaml +# Source: dataplane/templates/serving/bootstrap-configmap.yaml +# We need to copy the default configmap here since knative-operator does not automatically update the +# address of the `net-kourier-controller` endpoint to use the release namespace. It assumes that the +# resources are being installed into the `knative-serving` namespace instead. 
apiVersion: v1 kind: ConfigMap metadata: - name: union-operator-prometheus + name: union-operator-serving-envoy-bootstrap namespace: union - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus data: - prometheus.yml: | - global: - scrape_interval: 15s - evaluation_interval: 15s - alerting: - alertmanagers: - - static_configs: - - targets: - rule_files: - - rules.yml - scrape_configs: - # Self-monitoring - - job_name: prometheus - metrics_path: /prometheus/metrics - static_configs: - - targets: ['localhost:9090'] - metric_relabel_configs: - - source_labels: [__name__] - regex: prometheus_tsdb_head_chunks_storage_size_bytes|prometheus_tsdb_head_series|prometheus_tsdb_head_series_created_total|prometheus_tsdb_head_series_removed_total|prometheus_tsdb_head_samples_appended_total|prometheus_tsdb_storage_blocks_bytes|prometheus_tsdb_retention_limit_bytes|prometheus_tsdb_wal_(.*)|.+_(limit|failed|missed|failures)_total - action: keep - - # Kube state metrics for pod/node resource tracking and cost calculations - - job_name: kube-state-metrics - static_configs: - - targets: ['release-name-kube-state-metrics:8080'] - metric_relabel_configs: - - separator: ; - source_labels: [__name__] - regex: kube_pod_container_resource_(limits|requests)|kube_pod_status_phase|kube_node_(labels|status_allocatable|status_condition|status_capacity)|kube_namespace_labels|kube_pod_container_status_(waiting|terminated|last_terminated).*_reason|kube_daemonset_status_number_unavailable|kube_deployment_status_replicas_unavailable|kube_resourcequota|kube_pod_info|kube_node_info|kube_pod_container_status_restarts_total - action: keep - - separator: ; - source_labels: [__name__, phase] - regex: kube_pod_status_phase;(Succeeded|Failed) - action: drop - - source_labels: [node] - target_label: nodename - 
regex: '(.*)' - action: replace - - source_labels: [label_node_group_name] - action: replace - regex: (.+) - target_label: label_node_pool_name - - # cAdvisor container metrics for CPU and memory tracking - - job_name: kubernetes-cadvisor - metrics_path: /metrics - scheme: https - kubernetes_sd_configs: - - role: node - namespaces: - names: [] - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - insecure_skip_verify: false - metric_relabel_configs: - - separator: ; - source_labels: [__name__] - regex: container_cpu_usage_seconds_total|container_memory_working_set_bytes - action: keep - relabel_configs: - - separator: ; - regex: __meta_kubernetes_node_label_(.+) - replacement: $1 - action: labelmap - - separator: ; - regex: (.*) - target_label: __address__ - replacement: kubernetes.default.svc:443 - action: replace - - source_labels: [__meta_kubernetes_node_name] - separator: ; - regex: (.+) - target_label: __metrics_path__ - replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor - action: replace - - # Flyte propeller metrics for execution info and fast task duration - - job_name: flytepropeller - kubernetes_sd_configs: - - role: pod - namespaces: - names: - - union - selectors: - - role: pod - label: app.kubernetes.io/name=flytepropeller - metric_relabel_configs: - - source_labels: [__name__] - regex: "flyte:propeller:all:round:execution_info|flyte:propeller:all:node:fast_task:fast_task_execution_duration" - action: keep - relabel_configs: - - source_labels: [__meta_kubernetes_pod_container_name] - regex: flytepropeller - action: keep - - source_labels: [__meta_kubernetes_namespace] - action: replace - target_label: namespace - - source_labels: [__meta_kubernetes_pod_name] - action: replace - target_label: pod - # OpenCost metrics for cost tracking - - job_name: opencost - static_configs: - - targets: ['release-name-opencost:9003'] - metric_relabel_configs: - - 
source_labels: [__name__] - regex: "kube_node_labels|kube_pod_labels|node_total_hourly_cost|node_ram_hourly_cost|node_cpu_hourly_cost|node_gpu_hourly_cost" - action: keep - # DCGM GPU metrics - - job_name: gpu-metrics - kubernetes_sd_configs: - - role: pod - namespaces: - names: - - kube-system - selectors: - - role: pod - label: app.kubernetes.io/name=dcgm-exporter - rules.yml: | - - groups: - - name: cost_calculations_15s - interval: 15s - rules: - - record: pod_gpu_allocation - expr: | - sum by (namespace, pod) (DCGM_FI_DEV_GPU_UTIL >= bool 0) * on (namespace, pod) group_left() (max by (namespace, pod) (kube_pod_status_phase{phase=~"Running|Pending"} == 1)) - - record: execution_info # A join metric to look up execution-level info. Used to disambiguate workflow/task executions, apps, and workspaces. - expr: | - max by (label_domain, label_project, label_entity_name, label_execution_id, label_entity_id)( - label_replace( - label_replace( - label_replace( - label_replace( - label_replace( - flyte:propeller:all:round:execution_info{domain!="", project!="", workflow_name!="", execution_id!=""}, # filter for workflow/task executions - "label_entity_id", "$1", "execution_id", "(.*)" # join key - ), "label_entity_name", "$1", "workflow_name", "(.*)" # set label_entity_name to the workflow/task name from the workflow_execution_id - ), - "label_execution_id", "$1", "execution_id", "(.*)" - ), - "label_project", "$1", "project", "(.*)" # project - ), - "label_domain", "$1", "domain", "(.*)" # domain - ) - ) - - record: app_info # A join metric to look up app-level info. Used to disambiguate workflow/task executions, apps, and workspaces. 
- expr: | - max by (label_domain, label_project, label_app_name, label_app_version, label_entity_id)( - label_replace( - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps - "label_app_name", "$1", "label_serving_unionai_dev_app_name", "(.*)" # rename to cleanup - ), - "label_app_version", "$1", "label_serving_knative_dev_revision", "(.*)" # the app_version is equivalent to an execution_id for workflows (lowest level of granularity) - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # join key - ) - ) - - record: workspace_info # A join metric to look up workspace info. Used to disambiguate workflow/task executions, apps, and workspaces. - expr: | - max by (label_domain, label_project, label_workspace_name, label_entity_id)( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # filter for workspaces - "label_entity_id", "$1", "label_node_id", "(.*)" # join key - ), "label_workspace_name", "$1", "label_node_id", "(.*)" # set label_workspace_name to the workspace name from the kube_pod_labels - ) - ) - - record: fast_task_execution_duration - expr: | - max by (label_domain, label_project, label_execution_id, label_entity_id, namespace, pod)( - label_replace( - label_replace( - label_replace( - label_replace( - label_replace( - label_replace( - flyte:propeller:all:node:fast_task:fast_task_execution_duration{domain!="", project!="", execution_id!="", exported_namespace!="", exported_pod!=""}, - "label_entity_id", "$1", "execution_id", "(.*)" # join key - ), - "label_execution_id", "$1", "execution_id", "(.*)" - ), - "label_project", "$1", "project", "(.*)" # project - ), - "label_domain", "$1", "domain", "(.*)" # domain - ), - "namespace", "$1", "exported_namespace", "(.*)" - ), - "pod", "$1", "exported_pod", 
"(.*)" - ) - ) - - record: fast_task_execution_duration_rate - expr: | - irate(fast_task_execution_duration[1m]) > 0 or absent(irate(fast_task_execution_duration[1m])) * fast_task_execution_duration - - record: entity_id:mem_usage_bytes_total_per_node:sum # Allocated memory (max(requested, consumed)) aggregated per node and entity, where entity is either a task/workflow execution or an app. - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( # aggregate up to entity - # First, calculate the allocated memory for each pod - max by (namespace, pod) ( # this is the case where consumed (the memory working set) exceeds requested memory - ( - sum by (namespace, pod) ( - container_memory_working_set_bytes{namespace!="",pod!="",image!=""} - ) - > sum by (namespace, pod) ( - kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="memory"} - ) - ) - or sum by (namespace, pod) ( # this is the case where memory requests are <= consumed memory - kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="memory"} # needed to add node!="" to dedupe - ) - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but we do not want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="",label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - # Now join in node identifiers which are used for subsequent overhead calculations - * on (namespace, pod) group_left(node) ( - max by (namespace, pod, node) (kube_pod_info{node!=""}) # needed to add node!="" to dedupe - ) - ) - - record: entity_id:cpu_usage_per_node:sum # Allocated cpu (max(requested, consumed)) aggregated per node and entity, where entity is either a task/workflow execution or an app. - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( - # First, calculate the allocated cpu for each pod - max by (namespace, pod) ( # this is the case where consumed (the cpu usage seconds total) exceeds requested cpu - ( - sum by (namespace, pod) ( - irate(container_cpu_usage_seconds_total{namespace!="",pod!="",image!=""}[5m]) - ) - > sum by (namespace, pod) ( - kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="cpu"} - ) - ) - or sum by (namespace, pod) ( # this is the case where cpu requests are <= consumed cpu - kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="cpu"} - ) - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - # Now join in node identifiers which are used for subsequent overhead calculations - * on (namespace, pod) group_left(node) ( - max by (namespace, pod, node) (kube_pod_info{node!=""}) # needed to add node!="" to dedupe - ) - ) - - record: entity_id:gpu_usage_per_node:sum # Allocated gpu aggregated per node and entity, where entity is either a task/workflow execution or an app. - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( - # First, grab the allocated gpu for each pod (which is always either 1 or zero, since k8s can't split gpus the way it can with cpu/memory) - max by (namespace, pod) ( - pod_gpu_allocation - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - # Now join in node identifiers which are used for subsequent overhead calculations - * on (namespace, pod) group_left(node) ( - max by (namespace, pod, node) (kube_pod_info{node!=""}) # needed to add node!="" to dedupe - ) - ) - - record: entity_id:used_mem_bytes:sum # the sum of used memory across all containers in an entity (numerator for aggregate utilization calculations) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # aggregate up to entity - # First, calculate the used memory for each pod - sum by (namespace, pod) ( - container_memory_working_set_bytes{namespace!="",pod!="",image!=""} - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but we do not want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - ) - - record: entity_id:allocated_mem_bytes:sum # the sum of allocated memory across all containers in an entity (denominator for aggregate utilization calculations) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # aggregate up to entity (remove node) - entity_id:mem_usage_bytes_total_per_node:sum - ) - - record: entity_id:used_cpu:sum # the sum of used cpu across all containers in an entity (numerator for aggregate utilization calculations) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( - sum by (namespace, pod) ( - irate(container_cpu_usage_seconds_total{namespace!="",pod!="",image!=""}[5m]) - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - ) - - record: entity_id:allocated_cpu:sum # the sum of allocated cpu across all containers in an entity (denominator for aggregate utilization calculations) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # aggregate up to entity (remove node) - entity_id:cpu_usage_per_node:sum - ) - - record: entity_id:sm_occupancy:avg # the simple average of SM occupancy (a good generic measure of GPU utilization) per entity - expr: | - avg by (label_entity_type, label_domain, label_project, label_entity_id) ( - # First, grab the SM occupancy for each pod - max by (namespace, pod) ( - DCGM_FI_PROF_SM_OCCUPANCY # SM occupancy is a good proxy for actual GPU usage - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - ) - - record: entity_id:gpu_count:sum # the count of running gpu pods per entity (need this to weight the gpu utilization when aggregating upwards - i.e. project-level) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( - # First, grab the allocated gpu for each pod (which is always either 1 or zero, since k8s can't split gpus the way it can with cpu/memory) - max by (namespace, pod) ( - pod_gpu_allocation - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - ) - - record: entity_id:weighted_sm_occupancy:sum # product of SM occupancy and allocated GPU count (something like "used memory", numerator of weighted calcs) - expr: | - entity_id:sm_occupancy:avg - * on (label_domain, label_project, label_entity_type, label_entity_id) entity_id:gpu_count:sum - - record: entity_id:allocated_mem_cost:sum # Allocated cost of memory for each workflow/task execution and app. - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, type) ( - entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB - * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type - ) - - record: entity_id:allocated_cpu_cost:sum # Allocated cost of cpu for each workflow/task execution and app. 
- expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, type)( - entity_id:cpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - - record: entity_id:allocated_gpu_cost:sum # Allocated cost of gpu for each workflow/task execution and app. - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, type)( - entity_id:gpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - - record: entity_id:allocated_cost:sum # Allocated cost of memory, cpu, and gpu for each workflow/task execution and app. - expr: | - label_replace( - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # for the sum to work, the labels need to be different on each "or" element (type label) - entity_id:allocated_mem_cost:sum - or - entity_id:allocated_cpu_cost:sum - or - entity_id:allocated_gpu_cost:sum - ), - "type", "allocated", "", "" # add type info - ) - - record: entity_id:overhead_cost:sum # The amount of overhead costs (node costs that we can't allocate with container resources) to allocate to each entity (workflow/task execution or app) - expr: | - label_replace( - sum by (label_entity_type, label_entity_id, label_domain, label_project)( # Aggregate the per-node metrics up to workflow/task execution or app (label_entity_id) - # Start with each execution's and app's allocated cost per node - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( # for the sum to work, the labels need to be different on each "or" element (type label) - entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB - * on (node) group_left(type) label_replace(avg by (node) 
(node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:cpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:gpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - # Then divide out the total allocated cost per node to get the proportion of allocated cost associated with each entity - / on (node) group_left()( - sum by (node) ( # for the sum to work, the labels need to be different on each "or" element (type label) - entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB - * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:cpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:gpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - > 0 # need to avoid dividing by zero, or gaps in the data can cause NaNs to proliferate, borking all charts - ) - # Then multiply by the overhead cost per node - * on (node) group_left() ( - # To calculate overhead, start with the true cost of running each node - avg by (node)(kube_node_labels{label_flyte_org_node_role="worker"}) # only look at worker nodes - * on (node) max by (node) ( - node_total_hourly_cost{instance_type!=""} # sometimes, the instance_type can be null, causing an 
unlabeled label to show up in the Compute Costs dashboard charts - ) * (15 / 3600) # convert hourly cost to 15-secondly cost - # Then subtract out the total allocated cost on each node - - on (node) group_left()( - sum by (node) ( # for the sum to work, the labels need to be different on each "or" element (type label) - entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB - * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:cpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:gpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - ) - ) - ), - "type", "overhead", "", "" # add type info - ) - - record: entity_id:total_cost:sum # Total cost of each entity (workflow/task execution or app), including allocated (from container resources) and overhead (proportion of unallocated node costs) - expr: | - label_replace( - sum by (label_domain, label_project, label_entity_id, label_entity_type) ( - entity_id:allocated_cost:sum - or - entity_id:overhead_cost:sum - ), - "type", "total", "", "" # add type info - ) - - record: node:total_cost:sum # Total cost of all nodes - expr: | - sum ( - avg by (node)(kube_node_labels{label_flyte_org_node_role="worker", label_node_kubernetes_io_instance_type!=""}) # only look at worker nodes - * on (node) group_left() node_total_hourly_cost{instance_type!=""} * (15 / 3600) # convert hourly cost to 15-secondly cost - ) - - record: node_type:total_cost:sum # Total cost of nodes grouped by node type - expr: | - sum by (node_type)( - avg by 
(node)(kube_node_labels{label_flyte_org_node_role="worker", label_node_kubernetes_io_instance_type!=""}) # only look at worker nodes - * on (node) group_left(node_type) label_replace(node_total_hourly_cost{instance_type!=""}, "node_type", "$1", "instance_type", "(.*)") * (15 / 3600) # convert hourly cost to 15-secondly cost and rename label - ) - - record: node_type:uptime_hours:sum # Total uptime of nodes grouped by node type - expr: | - sum by (node_type)( - avg by (node, node_type)( # dedupe - label_replace(kube_node_labels{label_flyte_org_node_role="worker", label_node_kubernetes_io_instance_type!=""}, "node_type", "$1", "label_node_kubernetes_io_instance_type", "(.*)") # relabel - ) - ) * (15 / 3600) # convert to number of hours per 15-second observation # Aggregate the above into visible metrics - - name: cost_rollup_15m - interval: 15m - rules: - - record: execution_info15m - expr: | - max_over_time(execution_info[15m:15s]) - - record: app_info15m - expr: | - max_over_time(app_info[15m:15s]) - - record: workspace_info15m - expr: | - max_over_time(workspace_info[15m:15s]) - - record: entity_id:allocated_mem_bytes:sum15m - expr: | - sum_over_time(entity_id:allocated_mem_bytes:sum[15m:15s]) - - record: entity_id:used_mem_bytes:sum15m - expr: | - sum_over_time(entity_id:used_mem_bytes:sum[15m:15s]) - - record: entity_id:allocated_cpu:sum15m - expr: | - sum_over_time(entity_id:allocated_cpu:sum[15m:15s]) - - record: entity_id:used_cpu:sum15m - expr: | - sum_over_time(entity_id:used_cpu:sum[15m:15s]) - - record: entity_id:weighted_sm_occupancy:sum15m - expr: | - sum_over_time(entity_id:weighted_sm_occupancy:sum[15m:15s]) - - record: entity_id:gpu_count:sum15m - expr: | - sum_over_time(entity_id:gpu_count:sum[15m:15s]) - - record: entity_id:allocated_mem_cost:sum15m - expr: | - sum_over_time(entity_id:allocated_mem_cost:sum[15m:15s]) - - record: entity_id:allocated_cpu_cost:sum15m - expr: | - sum_over_time(entity_id:allocated_cpu_cost:sum[15m:15s]) - - record: 
entity_id:allocated_gpu_cost:sum15m - expr: | - sum_over_time(entity_id:allocated_gpu_cost:sum[15m:15s]) - - record: entity_id:allocated_cost:sum15m - expr: | - sum_over_time(entity_id:allocated_cost:sum[15m:15s]) - - record: entity_id:overhead_cost:sum15m - expr: | - sum_over_time(entity_id:overhead_cost:sum[15m:15s]) - - record: entity_id:total_cost:sum15m - expr: | - sum_over_time(entity_id:total_cost:sum[15m:15s]) - - record: node:total_cost:sum15m - expr: | - sum_over_time(node:total_cost:sum[15m:15s]) - - record: node_type:total_cost:sum15m - expr: | - sum_over_time(node_type:total_cost:sum[15m:15s]) - - record: node_type:uptime_hours:sum15m - expr: | - sum_over_time(node_type:uptime_hours:sum[15m:15s]) + envoy-bootstrap.yaml: | + dynamic_resources: + ads_config: + transport_api_version: V3 + api_type: GRPC + rate_limit_settings: {} + grpc_services: + - envoy_grpc: {cluster_name: xds_cluster} + cds_config: + resource_api_version: V3 + ads: {} + lds_config: + resource_api_version: V3 + ads: {} + node: + cluster: kourier-knative + id: 3scale-kourier-gateway + static_resources: + listeners: + - name: stats_listener + address: + socket_address: + address: 0.0.0.0 + port_value: 9000 + filter_chains: + - filters: + - name: envoy.filters.network.http_connection_manager + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager + stat_prefix: stats_server + http_filters: + - name: envoy.filters.http.router + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router + route_config: + virtual_hosts: + - name: admin_interface + domains: + - "*" + routes: + - match: + safe_regex: + regex: '/(certs|stats(/prometheus)?|server_info|clusters|listeners|ready)?' 
+ headers: + - name: ':method' + string_match: + exact: GET + route: + cluster: service_stats + - match: + safe_regex: + regex: '/drain_listeners' + headers: + - name: ':method' + string_match: + exact: POST + route: + cluster: service_stats + clusters: + - name: service_stats + connect_timeout: 0.250s + type: static + load_assignment: + cluster_name: service_stats + endpoints: + lb_endpoints: + endpoint: + address: + socket_address: + address: 127.0.0.1 + port_value: 9901 + - name: xds_cluster + # This keepalive is recommended by envoy docs. + # https://www.envoyproxy.io/docs/envoy/latest/api-docs/xds_protocol + typed_extension_protocol_options: + envoy.extensions.upstreams.http.v3.HttpProtocolOptions: + "@type": type.googleapis.com/envoy.extensions.upstreams.http.v3.HttpProtocolOptions + explicit_http_config: + http2_protocol_options: + connection_keepalive: + interval: 30s + timeout: 5s + connect_timeout: 1s + load_assignment: + cluster_name: xds_cluster + endpoints: + lb_endpoints: + endpoint: + address: + socket_address: + address: "net-kourier-controller" + port_value: 18000 + type: STRICT_DNS + admin: + access_log: + - name: envoy.access_loggers.stdout + typed_config: + "@type": type.googleapis.com/envoy.extensions.access_loggers.stream.v3.StdoutAccessLog + address: + socket_address: + address: 127.0.0.1 + port_value: 9901 + stats_config: + stats_tags: + - tag_name: name + regex: '^.*?\.u/.*?n=(.*?)/.*?u\..*$' + - tag_name: domain + regex: '^.*?\.u/.*?d=(.*?)/.*?u\..*$' + - tag_name: org + regex: '^.*?\.u/.*?o=(.*?)/.*?u\..*$' + - tag_name: project + regex: '^.*?\.u/.*?p=(.*?)/.*?u\..*$' + - tag_name: target + regex: '^.*?\.u/.*?t=(.*?)/.*?u\..*$' + - tag_name: target_namespace + regex: '^.*?\.u/.*?tns=(.*?)/.*?u\..*$' --- -# Source: dataplane/templates/propeller/configmap.yaml +# Source: dataplane/templates/webhook/configmap.yaml apiVersion: v1 kind: ConfigMap metadata: - name: flyte-propeller-config + name: union-pod-webhook-config namespace: union data: - 
admin.yaml: | - admin: - clientId: 'test-client-id' - clientSecretLocation: /etc/union/secret/client_secret - endpoint: dns:///test-controlplane-host - insecure: false - event: - capacity: 1000 - rate: 500 - type: admin - catalog.yaml: | - catalog-cache: - cache-endpoint: dns:///test-controlplane-host - endpoint: dns:///test-controlplane-host - insecure: false - type: fallback - use-admin-auth: true - copilot.yaml: | - plugins: - k8s: - co-pilot: - image: 'cr.flyte.org/flyteorg/flytecopilot:v1.14.1' - name: flyte-copilot- - start-timeout: 30s - core.yaml: | - propeller: - downstream-eval-duration: 30s - enable-admin-launcher: true - leader-election: - enabled: true - lease-duration: 15s - lock-config-map: - name: propeller-leader - namespace: 'union' - renew-deadline: 10s - retry-period: 2s - limit-namespace: all - literal-offloading-config: - enabled: true - max-workflow-retries: 30 - metadata-prefix: metadata/propeller - metrics-prefix: flyte - prof-port: 10254 - queue: - batch-size: -1 - batching-interval: 2s - queue: - base-delay: 5s - capacity: 1000 - max-delay: 120s - rate: 100 - type: maxof - sub-queue: - capacity: 100 - rate: 10 - type: bucket - type: batch - rawoutput-prefix: 's3://test-metadata-bucket' - workers: 4 - workflow-reeval-duration: 30s + core.yaml: | + + webhook: certDir: /etc/webhook/certs + disableCreateMutatingWebhookConfig: true embeddedSecretManagerConfig: imagePullSecrets: enabled: true @@ -3943,79 +3493,13 @@ data: namespace: 'union' type: 'K8s' listenPort: '9443' + localCert: true secretManagerTypes: - Embedded - K8s - serviceName: flyte-pod-webhook + secretName: union-pod-webhook + serviceName: union-pod-webhook servicePort: '443' - enabled_plugins.yaml: | - plugins: - connector-service: - defaultConnector: - defaultServiceConfig: '{"loadBalancingConfig": [{"round_robin":{}}]}' - endpoint: k8s:///flyteconnector.union:8000 - tasks: - task-plugins: - default-for-task-types: - actor: fast-task - container: container - container_array: 
k8s-array - sidecar: sidecar - enabled-plugins: - - container - - sidecar - - k8s-array - - echo - - fast-task - - connector-service - k8s.yaml: | - plugins: - k8s: - default-cpus: 100m - default-env-vars: [] - default-memory: 100Mi - logger.yaml: | - logger: - formatter: - type: json - level: 4 - show-source: true - namespace_config.yaml: | - namespace_config: - namespace_mapping: - template: '{{ project }}-{{ domain }}' - resource_manager.yaml: | - propeller: - resourcemanager: - type: noop - task_logs.yaml: | - plugins: - logs: - cloudwatch-enabled: false - dynamic-log-links: - - vscode: - displayName: VS Code Debugger - templateUris: - - /dataplane/pod/v1/generated_name/task/{{.executionProject}}/{{.executionDomain}}/{{.executionName}}/{{.nodeID}}/{{.taskRetryAttempt}}/{{.taskProject}}/{{.taskDomain}}/{{.taskID}}/{{.taskVersion}}/ - kubernetes-enabled: false - templates: - - displayName: Task Logs - scheme: TaskExecution - templateUris: - - /console/projects/{{.executionProject}}/domains/{{.executionDomain}}/executions/{{.executionName}}/nodeId/{{.nodeID}}/taskId/{{.taskID}}/attempt/{{.taskRetryAttempt}}/view/logs?duration=all&fromExecutionNav=true - storage.yaml: | - storage: - container: "test-metadata-bucket" - type: s3 - connection: - auth-type: iam - region: test-region - enable-multicontainer: true - limits: - maxDownloadMBs: 1024 - cache: - max_size_mbs: 0 - target_gc_percent: 70 --- # Source: dataplane/charts/fluentbit/templates/clusterrole.yaml apiVersion: rbac.authorization.k8s.io/v1 @@ -4039,515 +3523,808 @@ rules: - list - watch --- -# Source: dataplane/charts/kube-state-metrics/templates/role.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: - labels: - helm.sh/chart: kube-state-metrics-5.30.1 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: kube-state-metrics - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" - name: release-name-kube-state-metrics -rules: - -- apiGroups: ["certificates.k8s.io"] - resources: - - certificatesigningrequests - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - configmaps - verbs: ["list", "watch"] - -- apiGroups: ["batch"] - resources: - - cronjobs - verbs: ["list", "watch"] - -- apiGroups: ["extensions", "apps"] - resources: - - daemonsets - verbs: ["list", "watch"] - -- apiGroups: ["extensions", "apps"] - resources: - - deployments - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - endpoints - verbs: ["list", "watch"] - -- apiGroups: ["autoscaling"] - resources: - - horizontalpodautoscalers - verbs: ["list", "watch"] - -- apiGroups: ["extensions", "networking.k8s.io"] - resources: - - ingresses - verbs: ["list", "watch"] - -- apiGroups: ["batch"] - resources: - - jobs - verbs: ["list", "watch"] - -- apiGroups: ["coordination.k8s.io"] - resources: - - leases - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - limitranges - verbs: ["list", "watch"] - -- apiGroups: ["admissionregistration.k8s.io"] - resources: - - mutatingwebhookconfigurations - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - namespaces - verbs: 
["list", "watch"] - -- apiGroups: ["networking.k8s.io"] - resources: - - networkpolicies - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - nodes - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - persistentvolumeclaims - verbs: ["list", "watch"] + name: knative-serving-operator-aggregated + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +aggregationRule: + clusterRoleSelectors: + # This (along with escalate below) allows the Operator to pick up any + # roles that are provided to the admin of the cluster by knative serving + # automatically. + - matchExpressions: + - {key: serving.knative.dev/release, operator: Exists} +rules: [] # Rules are automatically filled in by the controller manager. +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: knative-serving-operator-aggregated-stable + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +aggregationRule: + clusterRoleSelectors: + # This (along with escalate below) allows the Operator to pick up any + # roles that are provided to the admin of the cluster by knative serving + # automatically. + - matchExpressions: + - {key: app.kubernetes.io/name, operator: In, values: ["knative-serving"]} +rules: [] # Rules are automatically filled in by the controller manager. +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: knative-eventing-operator-aggregated + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +aggregationRule: + clusterRoleSelectors: + # This (along with escalate below) allows the Operator to pick up any + # roles that are provided to the admin of the cluster by knative eventing + # automatically. 
+ - matchExpressions: + - {key: eventing.knative.dev/release, operator: Exists} +rules: [] # Rules are automatically filled in by the controller manager. +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: knative-eventing-operator-aggregated-stable + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +aggregationRule: + clusterRoleSelectors: + # This (along with escalate below) allows the Operator to pick up any + # roles that are provided to the admin of the cluster by knative eventing + # automatically. + - matchExpressions: + - {key: app.kubernetes.io/name, operator: In, values: ["knative-eventing"]} +rules: [] # Rules are automatically filled in by the controller manager. +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
-- apiGroups: [""] - resources: - - persistentvolumes - verbs: ["list", "watch"] - -- apiGroups: ["policy"] - resources: - - poddisruptionbudgets - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - pods - verbs: ["list", "watch"] - -- apiGroups: ["extensions", "apps"] - resources: - - replicasets - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - replicationcontrollers - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - resourcequotas - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - secrets - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - services - verbs: ["list", "watch"] - -- apiGroups: ["apps"] - resources: - - statefulsets - verbs: ["list", "watch"] - -- apiGroups: ["storage.k8s.io"] - resources: - - storageclasses - verbs: ["list", "watch"] - -- apiGroups: ["admissionregistration.k8s.io"] - resources: - - validatingwebhookconfigurations - verbs: ["list", "watch"] - -- apiGroups: ["storage.k8s.io"] - resources: - - volumeattachments - verbs: ["list", "watch"] ---- -# Source: dataplane/charts/opencost/templates/clusterrole.yaml -# Cluster role giving opencost to get, list, watch required resources -# No write permissions are required -apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 metadata: - name: release-name-opencost + name: knative-serving-operator labels: - helm.sh/chart: opencost-1.42.0 - app.kubernetes.io/name: opencost - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "1.111.0" - app.kubernetes.io/part-of: opencost - app.kubernetes.io/managed-by: Helm + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator rules: - - apiGroups: [""] + - apiGroups: + - operator.knative.dev + resources: + - '*' + verbs: + - '*' + # Bootstrapping permissions. + # Roles that are explicitly bound buch which are specified by this Operator + # MUST be specified here with 'get' and 'bind'. 
+ - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterroles + resourceNames: + - system:auth-delegator + verbs: + - bind + - get + - apiGroups: + - rbac.authorization.k8s.io + resources: + - roles + resourceNames: + - extension-apiserver-authentication-reader + verbs: + - bind + - get + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterroles + - roles + verbs: + - create + - delete + # Escalate is necessary in order to create a role using cluster role aggregation, + # and to allow the Operator to bootstrap itself into the necessary set of + # permissions, even as those continue to evolve upstream. + - escalate + - get + - list + - update + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterrolebindings + - rolebindings + verbs: + - create + - delete + - list + - get + - update + # Permissions required for Knative controller + # infra. + - apiGroups: + - apiregistration.k8s.io + resources: + - apiservices + verbs: + - update + - apiGroups: + - "" resources: - - configmaps - - deployments - - nodes - - pods - services - - resourcequotas - - replicationcontrollers - - limitranges - - persistentvolumeclaims - - persistentvolumes + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - caching.internal.knative.dev + resources: + - images + verbs: + - '*' + - apiGroups: + - "" + resources: - namespaces - - endpoints verbs: + - get + - update + - watch + - apiGroups: + - '' + resources: + - events + verbs: + - create + - update + - patch + - apiGroups: + - '' + resources: + - configmaps + verbs: + - create + - delete - get - list - watch - apiGroups: - - extensions + - security.istio.io + - apps + - policy resources: - - daemonsets + - poddisruptionbudgets + - peerauthentications - deployments + - daemonsets - replicasets + - statefulsets + verbs: + - create + - delete + - get + - list + - watch + - update + - apiGroups: + - apiregistration.k8s.io + resources: + - apiservices + verbs: + - create + - 
delete + - get + - list + - apiGroups: + - autoscaling + resources: + - horizontalpodautoscalers + verbs: + - create + - delete + - get + - list + - apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - '*' + - apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - '*' + # Old resources that need cleaning up that are not in the knative-serving + # namespace. + - apiGroups: + - "" + resources: + - services + - deployments + - horizontalpodautoscalers + resourceNames: + - knative-ingressgateway + verbs: + - delete + - apiGroups: + - "" + resources: + - configmaps + resourceNames: + - config-controller + verbs: + - delete + - apiGroups: + - "" + resources: + - serviceaccounts + resourceNames: + - knative-serving-operator + verbs: + - delete + # for contour TLS + - apiGroups: + - projectcontour.io + resources: + - httpproxies + - tlscertificatedelegations verbs: - get - list - watch + - update + - create + - delete + - deletecollection + - patch + # for security-guard + - apiGroups: + - guard.security.knative.dev + resources: + - guardians + verbs: + - get + - list + - watch + - create + - update + - patch + - delete + - apiGroups: + - "" + resources: + - configmaps + verbs: + - get + - list + - watch + - create + - update + - patch + - delete + - apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + verbs: + - create + - apiGroups: + - "" + resources: + - pods + verbs: + - get +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: knative-eventing-operator + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +rules: + - apiGroups: + - operator.knative.dev + resources: + - '*' + verbs: + - '*' + # Bootstrapping permissions. + # Roles that are explicitly bound buch which are specified by this Operator + # MUST be specified here with 'get' and 'bind'. 
+ - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterroles + - roles + verbs: + - create + - delete + # Escalate is necessary in order to create a role using cluster role aggregation, + # and to allow the Operator to bootstrap itself into the necessary set of + # permissions, even as those continue to evolve upstream. + - escalate + - get + - list + - update + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterrolebindings + - rolebindings + verbs: + - create + - delete + - list + - get + - update + # Permissions required for Knative controller + # infra. + - apiGroups: + - apiregistration.k8s.io + resources: + - apiservices + verbs: + - update + - apiGroups: + - "" + resources: + - services + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - caching.internal.knative.dev + resources: + - images + verbs: + - '*' + - apiGroups: + - "" + resources: + - namespaces + verbs: + - get + - update + - watch + - apiGroups: + - '' + resources: + - events + verbs: + - create + - update + - patch + - apiGroups: + - '' + resources: + - configmaps + verbs: + - create + - delete + - get + - list + - watch - apiGroups: - apps resources: - - statefulsets - deployments - daemonsets - replicasets + - statefulsets verbs: + - create + - delete + - get - list - watch + - apiGroups: + - apiregistration.k8s.io + resources: + - apiservices + verbs: + - create + - delete + - get + - list + - apiGroups: + - autoscaling + resources: + - horizontalpodautoscalers + verbs: + - create + - delete + - update + - get + - list + - apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - '*' + - apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - '*' - apiGroups: - batch resources: - - cronjobs - jobs + verbs: + - create + - delete + - update + - get + - list + - watch + # Old resources that need cleaning up that are not in the knative-eventing + # namespace. 
+ - apiGroups: + - "" + resources: + - serviceaccounts + resourceNames: + - knative-eventing-operator + verbs: + - delete + # for RabbitMQ messaging topology objects + - apiGroups: + - rabbitmq.com + resources: + - rabbitmqclusters + verbs: + - "get" + - "list" + - "watch" + - apiGroups: + - rabbitmq.com + resources: + - bindings + - queues + - exchanges + verbs: + - create + - delete + - get + - list + - patch + - update + - watch + - apiGroups: + - rabbitmq.com + resources: + - bindings/status + - queues/status + - exchanges/status + verbs: + - get + # for Kafka eventing source + - apiGroups: + - keda.sh + resources: + - scaledobjects + - scaledobjects/finalizers + - scaledobjects/status + - triggerauthentications + - triggerauthentications/status verbs: - get - list - watch + - update + - create + - delete + # Internal APIs - apiGroups: - - autoscaling + - "internal.kafka.eventing.knative.dev" resources: - - horizontalpodautoscalers + - "consumers" + - "consumers/status" + - "consumergroups" + - "consumergroups/status" verbs: + - create - get - list - watch + - patch + - update + - delete - apiGroups: - - policy + - "internal.kafka.eventing.knative.dev" resources: - - poddisruptionbudgets + - "consumers/finalizers" + - "consumergroups/finalizers" verbs: - - get - - list - - watch + - update + - delete - apiGroups: - - storage.k8s.io + - apps resources: - - storageclasses + - statefulsets/scale verbs: - get - list - watch ---- -# Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: union-clustersync-resource -rules: + - update + - patch + - create + - delete - apiGroups: - - "" - rbac.authorization.k8s.io + resources: + - clusterrolebindings + verbs: + - watch + - apiGroups: + - "*" + resources: + - configmaps + verbs: + - delete + - apiGroups: + - "*" resources: - configmaps - - namespaces - - pods - - resourcequotas - - roles - - rolebindings - - secrets - services - 
- serviceaccounts - - clusterrolebindings - - podtemplates verbs: - - '*' ---- -# Source: dataplane/templates/nodeexecutor/serviceaccount.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: union-executor - labels: - app: executor -rules: -# Allow RO access to PODS -- apiGroups: - - "" - resources: - - pods - verbs: - - get - - list - - watch -# Allow Event recording access -- apiGroups: - - "" - resources: - - events - verbs: - - create - - update - - delete - - patch -# Allow Access All plugin objects -- apiGroups: - - '*' - resources: - - '*' - verbs: - - get - - list - - watch - - create - - update - - delete - - patch -# Allow Access to CRD -- apiGroups: - - apiextensions.k8s.io - resources: - - customresourcedefinitions - verbs: - - get - - list - - watch - - create - - delete - - update ---- -# Source: dataplane/templates/operator/serviceaccount-proxy.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: proxy-system - labels: - app.kubernetes.io/name: operator-proxy - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm -rules: + - get + - list + - watch + - update + - create + - delete - apiGroups: - - '*' + - "*" resources: - - events - - flyteworkflows - - pods/log - pods - - rayjobs - - resourcequotas verbs: - - get - list + - update + - get - watch ---- -# Source: dataplane/templates/operator/serviceaccount.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: operator-system - labels: - app.kubernetes.io/name: union-operator - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm -rules: - # Allow Access to all resources under flyte.lyft.com - apiGroups: - - flyte.lyft.com + - "*" resources: - - flyteworkflows - - flyteworkflows/finalizers + - pods/finalizers verbs: - get - list - - watch - create - update - delete 
+ - apiGroups: + - "*" + resources: + - events + verbs: - patch - - post - - deletecollection + - create - apiGroups: - - '*' + - "*" resources: - - resourcequotas - - pods - - configmaps - - podtemplates - secrets - - namespaces - - nodes verbs: - get - list - watch - - create - update + - create - delete - - nonResourceURLs: - - /metrics - verbs: - - get ---- -# Source: dataplane/templates/prometheus/rbac.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: union-operator-prometheus - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus -rules: - - apiGroups: [""] + - apiGroups: + - "*" resources: - nodes - - nodes/proxy - - pods - - endpoints - - services verbs: - get - list - watch - - nonResourceURLs: - - /metrics - - /metrics/cadvisor + - apiGroups: + - "*" + resources: + - serviceaccounts verbs: - get ---- -# Source: dataplane/templates/propeller/serviceaccount-webhook.yaml -kind: ClusterRole -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: flytepropeller-webhook-role - namespace: union -rules: + - list + - watch + - update + - create + - delete - apiGroups: - "*" resources: - - mutatingwebhookconfigurations - - secrets - - pods - - replicasets/finalizers + - configmaps + resourceNames: + - kafka-channel-config + verbs: + - patch + - apiGroups: + - "*" + resources: + - horizontalpodautoscalers + resourceNames: + - kafka-webhook + verbs: + - delete + - apiGroups: + - "*" + resources: + - leases + verbs: + - delete + - apiGroups: + - "*" + resources: + - poddisruptionbudgets + resourceNames: + - kafka-webhook + verbs: + - delete + - apiGroups: + - "*" + resources: + - services + verbs: + - patch + - apiGroups: + - "apps" + resources: + - deployments + verbs: + - deletecollection + # Eventing TLS + - apiGroups: + - 
"cert-manager.io" + resources: + - certificates + - issuers + - clusterissuers verbs: + - create + - delete + - update + - list - get + - watch + - apiGroups: + - "trust.cert-manager.io" + resources: + - bundles + verbs: - create + - delete - update - - patch + - list + - get + - watch --- -# Source: dataplane/templates/propeller/serviceaccount.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: - name: flytepropeller-role + name: knative-operator-webhook + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator rules: - # Allow RO access to PODS + # For watching logging configuration and getting certs. + - apiGroups: + - "" + resources: + - "configmaps" + verbs: + - "get" + - "list" + - "watch" + # For manipulating certs into secrets. - apiGroups: - "" resources: - - pods + - "namespaces" verbs: - - get - - list - - watch - # Allow Event recording access + - "get" + - "create" + - "update" + - "list" + - "watch" + - "patch" + # finalizers are needed for the owner reference of the webhook - apiGroups: - "" resources: - - events + - "namespaces/finalizers" verbs: - - create - - update - - delete - - patch - # Allow Access All plugin objects + - "update" + # For getting our Deployment so we can decorate with ownerref. - apiGroups: - - '*' + - "apps" resources: - - '*' + - "deployments" verbs: - - get - - list - - watch - - create - - update - - delete - - patch - # Allow Access to CRD + - "get" - apiGroups: - - apiextensions.k8s.io + - "apps" resources: - - customresourcedefinitions + - "deployments/finalizers" verbs: - - get - - list - - watch - - create - - delete - update - # Allow Access to all resources under flyte.lyft.com + # For actually registering our webhook. 
- apiGroups: - - flyte.lyft.com + - "admissionregistration.k8s.io" resources: - - flyteworkflows - - flyteworkflows/finalizers - verbs: - - get - - list - - watch - - create - - update - - delete - - patch - - post - - deletecollection + - "mutatingwebhookconfigurations" + - "validatingwebhookconfigurations" + verbs: &everything + - "get" + - "list" + - "create" + - "update" + - "delete" + - "patch" + - "watch" + # For leader election + - apiGroups: + - "coordination.k8s.io" + resources: + - "leases" + verbs: *everything + # Necessary for conversion webhook. These are copied from the serving + # TODO: Do we really need all these permissions? + - apiGroups: ["apiextensions.k8s.io"] + resources: ["customresourcedefinitions"] + verbs: ["get", "list", "create", "update", "delete", "patch", "watch"] --- # Source: dataplane/charts/fluentbit/templates/clusterrolebinding.yaml apiVersion: rbac.authorization.k8s.io/v1 @@ -4569,207 +4346,417 @@ subjects: name: fluentbit-system namespace: union --- -# Source: dataplane/charts/kube-state-metrics/templates/clusterrolebinding.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# TODO: Consider restriction of non-aggregated role to knativeservings namespaces. 
apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: - labels: - helm.sh/chart: kube-state-metrics-5.30.1 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: kube-state-metrics - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" - name: release-name-kube-state-metrics + name: knative-serving-operator + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole - name: release-name-kube-state-metrics + name: knative-serving-operator subjects: -- kind: ServiceAccount - name: release-name-kube-state-metrics - namespace: union + - kind: ServiceAccount + name: knative-operator + namespace: "union" --- -# Source: dataplane/charts/opencost/templates/clusterrolebinding.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# TODO: Consider restriction of non-aggregated role to knativeeventing namespaces. 
apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: - name: release-name-opencost + name: knative-eventing-operator labels: - helm.sh/chart: opencost-1.42.0 - app.kubernetes.io/name: opencost - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "1.111.0" - app.kubernetes.io/part-of: opencost - app.kubernetes.io/managed-by: Helm + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole - name: release-name-opencost + name: knative-eventing-operator subjects: - kind: ServiceAccount - name: release-name-opencost - namespace: union + name: knative-operator + namespace: "union" --- -# Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: - name: union-clustersync-resource + name: operator-webhook + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole - name: union-clustersync-resource + name: knative-operator-webhook subjects: - kind: ServiceAccount - name: union-clustersync-system - namespace: union + name: operator-webhook + namespace: "union" --- -# Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: - name: union-clustersync-auth-delegator + name: knative-serving-operator-aggregated + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole - name: system:auth-delegator + name: knative-serving-operator-aggregated subjects: - kind: ServiceAccount - name: union-clustersync-system - namespace: union + name: knative-operator + namespace: "union" --- -# Source: dataplane/templates/nodeexecutor/serviceaccount.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: - name: union-executor + name: knative-serving-operator-aggregated-stable labels: - app: executor + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole - name: union-executor + name: knative-serving-operator-aggregated-stable subjects: -- kind: ServiceAccount - name: executor - namespace: union + - kind: ServiceAccount + name: knative-operator + namespace: "union" --- -# Source: dataplane/templates/operator/serviceaccount-proxy.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: - name: proxy-system + name: knative-eventing-operator-aggregated labels: - app.kubernetes.io/name: operator-proxy - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole - name: proxy-system + name: knative-eventing-operator-aggregated subjects: - kind: 
ServiceAccount - name: proxy-system - namespace: union + name: knative-operator + namespace: "union" --- -# Source: dataplane/templates/operator/serviceaccount.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: - name: operator-system + name: knative-eventing-operator-aggregated-stable labels: - app.kubernetes.io/name: union-operator - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole - name: operator-system + name: knative-eventing-operator-aggregated-stable subjects: - kind: ServiceAccount - name: operator-system - namespace: union + name: knative-operator + namespace: "union" --- -# Source: dataplane/templates/prometheus/rbac.yaml +# Source: dataplane/charts/dcgm-exporter/templates/role.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding +kind: Role metadata: - name: union-operator-prometheus + name: dcgm-exporter-read-cm + namespace: union labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane + helm.sh/chart: dcgm-exporter-4.7.1 + app.kubernetes.io/name: dcgm-exporter app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/component: dcgm-exporter + app.kubernetes.io/version: "4.7.1" app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: union-operator-prometheus -subjects: - - kind: ServiceAccount - name: union-operator-prometheus - namespace: union +rules: +- apiGroups: [""] + resources: ["configmaps"] + resourceNames: ["exporter-metrics-config-map"] + verbs: ["get"] --- -# Source: dataplane/templates/propeller/serviceaccount-webhook.yaml 
-# Create a binding from Role -> ServiceAccount -kind: ClusterRoleBinding +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + namespace: "union" + name: knative-operator-webhook + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +rules: + # For manipulating certs into secrets. 
+ - apiGroups: + - "" + resources: + - "secrets" + verbs: + - "get" + - "create" + - "update" + - "list" + - "watch" + - "patch" +--- +# Source: dataplane/templates/monitoring/prometheus-rbac.yaml apiVersion: rbac.authorization.k8s.io/v1 +kind: Role metadata: - name: flytepropeller-webhook-binding + name: union-operator-prometheus-rbac namespace: union -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: flytepropeller-webhook-role -subjects: - - kind: ServiceAccount - name: flytepropeller-webhook-system - namespace: union + labels: + release: release-name +rules: + # Prometheus server scrape permissions + - apiGroups: [""] + resources: + - nodes + - nodes/proxy + - nodes/metrics + - services + - endpoints + - pods + - ingresses + - configmaps + verbs: + - get + - list + - watch + - apiGroups: ["extensions", "networking.k8s.io"] + resources: + - ingresses/status + - ingresses + verbs: + - get + - list + - watch + - apiGroups: ["discovery.k8s.io"] + resources: + - endpointslices + verbs: + - get + - list + - watch + # kube-state-metrics permissions (rbac.create=false on the subchart; managed here instead) + - apiGroups: ["certificates.k8s.io"] + resources: ["certificatesigningrequests"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["configmaps"] + verbs: ["list", "watch"] + - apiGroups: ["batch"] + resources: ["cronjobs"] + verbs: ["list", "watch"] + - apiGroups: ["extensions", "apps"] + resources: ["daemonsets"] + verbs: ["list", "watch"] + - apiGroups: ["extensions", "apps"] + resources: ["deployments"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["endpoints"] + verbs: ["list", "watch"] + - apiGroups: ["discovery.k8s.io"] + resources: ["endpointslices"] + verbs: ["list", "watch"] + - apiGroups: ["autoscaling"] + resources: ["horizontalpodautoscalers"] + verbs: ["list", "watch"] + - apiGroups: ["extensions", "networking.k8s.io"] + resources: ["ingresses"] + verbs: ["list", "watch"] + - apiGroups: ["batch"] + 
resources: ["jobs"] + verbs: ["list", "watch"] + - apiGroups: ["coordination.k8s.io"] + resources: ["leases"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["limitranges"] + verbs: ["list", "watch"] + - apiGroups: ["admissionregistration.k8s.io"] + resources: ["mutatingwebhookconfigurations"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["namespaces"] + verbs: ["list", "watch"] + - apiGroups: ["networking.k8s.io"] + resources: ["networkpolicies"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["persistentvolumeclaims"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["persistentvolumes"] + verbs: ["list", "watch"] + - apiGroups: ["policy"] + resources: ["poddisruptionbudgets"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["pods"] + verbs: ["list", "watch"] + - apiGroups: ["extensions", "apps"] + resources: ["replicasets"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["replicationcontrollers"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["resourcequotas"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["secrets"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["services"] + verbs: ["list", "watch"] + - apiGroups: ["apps"] + resources: ["statefulsets"] + verbs: ["list", "watch"] + - apiGroups: ["storage.k8s.io"] + resources: ["storageclasses"] + verbs: ["list", "watch"] + - apiGroups: ["admissionregistration.k8s.io"] + resources: ["validatingwebhookconfigurations"] + verbs: ["list", "watch"] + - apiGroups: ["storage.k8s.io"] + resources: ["volumeattachments"] + verbs: ["list", "watch"] --- -# Source: dataplane/templates/propeller/serviceaccount.yaml +# Source: dataplane/templates/nodeexecutor/serviceaccount.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding +kind: Role metadata: - name: flytepropeller-binding -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: flytepropeller-role -subjects: - - 
kind: ServiceAccount - name: flytepropeller-system - namespace: union + name: union-executor + labels: + app: executor +rules: +# Allow RO access to PODS +- apiGroups: + - "" + resources: + - pods + verbs: + - get + - list + - watch +# Allow Event recording access +- apiGroups: + - "" + resources: + - events + verbs: + - create + - update + - delete + - patch +# Allow Access All plugin objects +- apiGroups: + - '*' + resources: + - '*' + verbs: + - get + - list + - watch + - create + - update + - delete + - patch +# Allow Access to CRD +- apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - get + - list + - watch + - create + - delete + - update --- -# Source: dataplane/charts/dcgm-exporter/templates/role.yaml +# Source: dataplane/templates/operator/serviceaccount-proxy-secret.yaml apiVersion: rbac.authorization.k8s.io/v1 kind: Role metadata: - name: dcgm-exporter-read-cm + name: union-system-secret namespace: union labels: - helm.sh/chart: dcgm-exporter-4.7.1 - app.kubernetes.io/name: dcgm-exporter + app.kubernetes.io/name: operator-proxy app.kubernetes.io/instance: release-name - app.kubernetes.io/component: dcgm-exporter - app.kubernetes.io/version: "4.7.1" + platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm rules: -- apiGroups: [""] - resources: ["configmaps"] - resourceNames: ["exporter-metrics-config-map"] - verbs: ["get"] + - apiGroups: + - '*' + resources: + - secrets + verbs: + - get + - list + - create + - update + - delete --- -# Source: dataplane/templates/operator/serviceaccount-proxy-secret.yaml +# Source: dataplane/templates/operator/serviceaccount-proxy.yaml apiVersion: rbac.authorization.k8s.io/v1 kind: Role metadata: - name: proxy-system-secret - namespace: union + name: proxy-system labels: app.kubernetes.io/name: operator-proxy app.kubernetes.io/instance: release-name @@ -4779,13 +4766,16 @@ rules: - apiGroups: - '*' resources: - - secrets + - events + - flyteworkflows + - 
pods/log + - pods + - rayjobs + - resourcequotas verbs: - get - list - - create - - update - - delete + - watch --- # Source: dataplane/templates/operator/serviceaccount.yaml apiVersion: rbac.authorization.k8s.io/v1 @@ -4809,6 +4799,71 @@ rules: - watch - create - update + - apiGroups: + - flyte.lyft.com + resources: + - flyteworkflows + - flyteworkflows/finalizers + verbs: + - get + - list + - watch + - create + - update + - delete + - patch + - post + - deletecollection + - apiGroups: + - '*' + resources: + - resourcequotas + - pods + - configmaps + - podtemplates + - secrets + - namespaces + - nodes + verbs: + - get + - list + - watch + - create + - update + - delete + - apiGroups: + - serving.knative.dev + resources: + - revisions + - configurations + - services + verbs: + - get + - list + - watch + - create + - update + - delete +--- +# Source: dataplane/templates/webhook/serviceaccount.yaml +kind: Role +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: union-webhook-role + namespace: union +rules: + - apiGroups: + - "*" + resources: + - mutatingwebhookconfigurations + - secrets + - pods + - replicasets/finalizers + verbs: + - get + - create + - update + - patch --- # Source: dataplane/charts/dcgm-exporter/templates/rolebinding.yaml apiVersion: rbac.authorization.k8s.io/v1 @@ -4832,11 +4887,94 @@ roleRef: name: dcgm-exporter-read-cm apiGroup: rbac.authorization.k8s.io --- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + namespace: "union" + name: operator-webhook + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +subjects: + - kind: ServiceAccount + name: operator-webhook + namespace: "union" +roleRef: + kind: Role + name: knative-operator-webhook + apiGroup: rbac.authorization.k8s.io +--- +# Source: dataplane/templates/monitoring/prometheus-rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: union-operator-prometheus-rbac + namespace: union + labels: + release: release-name +subjects: + - kind: ServiceAccount + name: union-operator-prometheus + namespace: union +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: union-operator-prometheus-rbac +--- +# Source: dataplane/templates/monitoring/prometheus-rbac.yaml +# kube-state-metrics service account RoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: release-name-prometheus-kube-state-metrics + namespace: union + labels: + release: release-name +subjects: + - kind: ServiceAccount + name: release-name-prometheus-kube-state-metrics + namespace: union +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: union-operator-prometheus-rbac +--- +# Source: dataplane/templates/nodeexecutor/serviceaccount.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: union-executor + labels: + app: executor +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: union-executor +subjects: +- kind: ServiceAccount + name: union-system + namespace: union +--- # Source: dataplane/templates/operator/serviceaccount-proxy-secret.yaml apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding metadata: - name: proxy-system-secret + name: union-system-secret namespace: union labels: 
app.kubernetes.io/name: operator-proxy @@ -4846,10 +4984,29 @@ metadata: roleRef: apiGroup: rbac.authorization.k8s.io kind: Role - name: proxy-system-secret + name: union-system-secret +subjects: + - kind: ServiceAccount + name: union-system + namespace: union +--- +# Source: dataplane/templates/operator/serviceaccount-proxy.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: proxy-system + labels: + app.kubernetes.io/name: operator-proxy + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: proxy-system subjects: - kind: ServiceAccount - name: proxy-system + name: union-system namespace: union --- # Source: dataplane/templates/operator/serviceaccount.yaml @@ -4868,7 +5025,23 @@ roleRef: name: operator-system subjects: - kind: ServiceAccount - name: operator-system + name: union-system + namespace: union +--- +# Source: dataplane/templates/webhook/serviceaccount.yaml +# Create a binding from Role -> ServiceAccount +kind: RoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: union-webhook-binding + namespace: union +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: union-webhook-role +subjects: + - kind: ServiceAccount + name: union-system namespace: union --- # Source: dataplane/charts/dcgm-exporter/templates/service.yaml @@ -4934,20 +5107,60 @@ spec: app.kubernetes.io/name: fluentbit app.kubernetes.io/instance: release-name --- -# Source: dataplane/charts/kube-state-metrics/templates/service.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v1 +kind: Service +metadata: + labels: + role: operator-webhook + app.kubernetes.io/component: operator-webhook + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator + name: operator-webhook + namespace: "union" +spec: + ports: + # Define metrics and profiling for them to be accessible within service meshes. + - name: http-metrics + port: 9090 + targetPort: 9090 + - name: http-profiling + port: 8008 + targetPort: 8008 + - name: https-webhook + port: 443 + targetPort: 8443 + selector: + role: operator-webhook +--- +# Source: dataplane/charts/prometheus/charts/kube-state-metrics/templates/service.yaml apiVersion: v1 kind: Service metadata: name: release-name-kube-state-metrics namespace: union labels: - helm.sh/chart: kube-state-metrics-5.30.1 + helm.sh/chart: kube-state-metrics-5.25.1 app.kubernetes.io/managed-by: Helm app.kubernetes.io/component: metrics app.kubernetes.io/part-of: kube-state-metrics app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" + app.kubernetes.io/version: "2.13.0" annotations: prometheus.io/scrape: 'true' spec: @@ -4962,28 +5175,32 @@ spec: app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/instance: release-name --- -# Source: dataplane/charts/opencost/templates/service.yaml +# Source: dataplane/charts/prometheus/templates/service.yaml apiVersion: v1 kind: Service metadata: - name: release-name-opencost - namespace: union labels: - helm.sh/chart: opencost-1.42.0 - app.kubernetes.io/name: opencost + 
app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "1.111.0" - app.kubernetes.io/part-of: opencost + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: prometheus + name: union-operator-prometheus + namespace: union spec: + ports: + - name: http + port: 80 + protocol: TCP + targetPort: 9090 selector: - app.kubernetes.io/name: opencost + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name + sessionAffinity: None type: "ClusterIP" - ports: - - name: http - port: 9003 - targetPort: 9003 --- # Source: dataplane/templates/clusterresourcesync/service.yaml apiVersion: v1 @@ -5007,6 +5224,31 @@ spec: app.kubernetes.io/name: clusterresourcesync app.kubernetes.io/instance: release-name --- +# Source: dataplane/templates/flyteconnector/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: flyteconnector + namespace: union + labels: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + app.kubernetes.io/managed-by: Helm +spec: + clusterIP: None + ports: + - name: grpc + port: 8000 + protocol: TCP + targetPort: grpc + - name: metric + port: 9090 + protocol: TCP + targetPort: metric + selector: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name +--- # Source: dataplane/templates/imagebuilder/service.yaml apiVersion: v1 kind: Service @@ -5032,7 +5274,7 @@ spec: apiVersion: v1 kind: Service metadata: - name: release-name-dataplane-executor + name: union-operator-executor labels: platform.union.ai/prometheus-group: "union-services" app: executor @@ -5080,57 +5322,32 @@ spec: apiVersion: v1 kind: Service metadata: - name: union-operator - labels: - platform.union.ai/prometheus-group: "union-services" - app.kubernetes.io/name: union-operator - app.kubernetes.io/instance: 
release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm -spec: - type: ClusterIP - ports: - - port: 10254 - targetPort: debug - protocol: TCP - name: debug - selector: - app.kubernetes.io/name: union-operator - app.kubernetes.io/instance: release-name ---- -# Source: dataplane/templates/prometheus/service.yaml -apiVersion: v1 -kind: Service -metadata: - name: union-operator-prometheus - namespace: union + name: union-operator labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane + platform.union.ai/prometheus-group: "union-services" + app.kubernetes.io/name: union-operator app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" + platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus spec: type: ClusterIP ports: - - port: 80 - targetPort: 9090 + - port: 10254 + targetPort: debug protocol: TCP - name: http + name: debug selector: - app.kubernetes.io/component: prometheus - app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/name: union-operator app.kubernetes.io/instance: release-name --- -# Source: dataplane/templates/propeller/service-webhook.yaml +# Source: dataplane/templates/webhook/service.yaml apiVersion: v1 kind: Service metadata: - name: flyte-pod-webhook + name: union-pod-webhook namespace: union labels: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm @@ -5138,7 +5355,7 @@ metadata: projectcontour.io/upstream-protocol.h2c: grpc spec: selector: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name ports: - name: https @@ -5150,23 +5367,23 @@ spec: port: 10254 targetPort: 10254 --- -# Source: 
dataplane/templates/propeller/service-webhook.yaml +# Source: dataplane/templates/webhook/service.yaml # Headless Service for cache invalidation — resolves to all pod IPs so that # we can fan out invalidation requests to every webhook replica. apiVersion: v1 kind: Service metadata: - name: flyte-pod-webhook-headless + name: union-pod-webhook-headless namespace: union labels: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm spec: clusterIP: None selector: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name ports: - name: cache-internal @@ -5174,32 +5391,6 @@ spec: port: 9443 targetPort: 9443 --- -# Source: dataplane/templates/propeller/service.yaml -apiVersion: v1 -kind: Service -metadata: - namespace: union - name: flytepropeller - labels: - platform.union.ai/prometheus-group: "union-services" - app.kubernetes.io/name: flytepropeller - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm -spec: - type: ClusterIP - ports: - - name: debug - protocol: TCP - port: 10254 - - name: fasttask - port: 15605 - protocol: TCP - targetPort: 15605 - selector: - app.kubernetes.io/name: flytepropeller - app.kubernetes.io/instance: release-name ---- # Source: dataplane/charts/dcgm-exporter/templates/daemonset.yaml # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. # @@ -5530,20 +5721,217 @@ spec: tolerations: - operator: Exists --- -# Source: dataplane/charts/kube-state-metrics/templates/deployment.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: operator-webhook + namespace: "union" + labels: + app.kubernetes.io/component: operator-webhook + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +spec: + selector: + matchLabels: + app: operator-webhook + role: operator-webhook + template: + metadata: + labels: + app: operator-webhook + role: operator-webhook + app.kubernetes.io/component: operator-webhook + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator + sidecar.istio.io/inject: "false" + spec: + # To avoid node becoming SPOF, spread our replicas to different nodes. + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - podAffinityTerm: + labelSelector: + matchLabels: + app: operator-webhook + topologyKey: kubernetes.io/hostname + weight: 100 + serviceAccountName: operator-webhook + containers: + - name: operator-webhook + # This is the Go import path for the binary that is containerized + # and substituted here. 
+ image: gcr.io/knative-releases/knative.dev/operator/cmd/webhook@sha256:d3a7a3304629ccfcaacec620b50736e0b4c902a6328b83369b6cbaf7e94677c9 + resources: + requests: + cpu: 100m + memory: 100Mi + limits: + cpu: 500m + memory: 500Mi + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: SYSTEM_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: CONFIG_LOGGING_NAME + value: config-logging + - name: CONFIG_OBSERVABILITY_NAME + value: config-observability + - name: WEBHOOK_NAME + value: operator-webhook + - name: WEBHOOK_PORT + value: "8443" + - name: WEBHOOK_SECRET_NAME + value: operator-webhook-certs + - name: METRICS_DOMAIN + value: knative.dev/operator + - name: KUBERNETES_MIN_VERSION + value: "" + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + runAsNonRoot: true + capabilities: + drop: + - ALL + ports: + - name: metrics + containerPort: 9090 + - name: profiling + containerPort: 8008 + - name: https-webhook + containerPort: 8443 + readinessProbe: + periodSeconds: 1 + httpGet: + scheme: HTTPS + port: 8443 + httpHeaders: + - name: k-kubelet-probe + value: "webhook" + livenessProbe: + periodSeconds: 1 + httpGet: + scheme: HTTPS + port: 8443 + httpHeaders: + - name: k-kubelet-probe + value: "webhook" + failureThreshold: 6 + initialDelaySeconds: 120 + # Our webhook should gracefully terminate by lame ducking first, set this to a sufficiently + # high value that we respect whatever value it has configured for the lame duck grace period. + terminationGracePeriodSeconds: 300 +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: knative-operator + namespace: "union" + labels: + app.kubernetes.io/name: knative-operator + app.kubernetes.io/version: "1.16.0" +spec: + replicas: 1 + selector: + matchLabels: + name: knative-operator + template: + metadata: + labels: + name: knative-operator + app.kubernetes.io/name: knative-operator + app.kubernetes.io/version: "1.16.0" + sidecar.istio.io/inject: "false" + spec: + serviceAccountName: knative-operator + containers: + - name: knative-operator + image: gcr.io/knative-releases/knative.dev/operator/cmd/operator@sha256:0b5a3532417f9c8e7b6044e23f6f67ad932a74a40a4092ad965b6f173b2fd887 + resources: + requests: + cpu: 100m + memory: 100Mi + limits: + cpu: 1000m + memory: 1000Mi + imagePullPolicy: IfNotPresent + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: SYSTEM_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: METRICS_DOMAIN + value: knative.dev/operator + - name: CONFIG_LOGGING_NAME + value: config-logging + - name: CONFIG_OBSERVABILITY_NAME + value: config-observability + - name: KUBERNETES_MIN_VERSION + value: "" + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + runAsNonRoot: true + capabilities: + drop: + - ALL + ports: + - name: metrics + containerPort: 9090 +--- +# Source: dataplane/charts/prometheus/charts/kube-state-metrics/templates/deployment.yaml apiVersion: apps/v1 kind: Deployment metadata: name: release-name-kube-state-metrics namespace: union labels: - helm.sh/chart: 
kube-state-metrics-5.30.1 + helm.sh/chart: kube-state-metrics-5.25.1 app.kubernetes.io/managed-by: Helm app.kubernetes.io/component: metrics app.kubernetes.io/part-of: kube-state-metrics app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" + app.kubernetes.io/version: "2.13.0" spec: selector: matchLabels: @@ -5556,13 +5944,13 @@ spec: template: metadata: labels: - helm.sh/chart: kube-state-metrics-5.30.1 + helm.sh/chart: kube-state-metrics-5.25.1 app.kubernetes.io/managed-by: Helm app.kubernetes.io/component: metrics app.kubernetes.io/part-of: kube-state-metrics app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" + app.kubernetes.io/version: "2.13.0" spec: automountServiceAccountToken: true hostNetwork: false @@ -5579,8 +5967,9 @@ spec: args: - --port=8080 - --resources=certificatesigningrequests,configmaps,cronjobs,daemonsets,deployments,endpoints,horizontalpodautoscalers,ingresses,jobs,leases,limitranges,mutatingwebhookconfigurations,namespaces,networkpolicies,nodes,persistentvolumeclaims,persistentvolumes,poddisruptionbudgets,pods,replicasets,replicationcontrollers,resourcequotas,secrets,services,statefulsets,storageclasses,validatingwebhookconfigurations,volumeattachments + - --namespaces=union imagePullPolicy: IfNotPresent - image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.15.0 + image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.13.0 ports: - containerPort: 8080 name: "http" @@ -5600,7 +5989,7 @@ spec: httpGet: httpHeaders: path: /readyz - port: 8081 + port: 8080 scheme: HTTP initialDelaySeconds: 5 periodSeconds: 10 @@ -5615,201 +6004,204 @@ spec: - ALL readOnlyRootFilesystem: true --- -# Source: dataplane/charts/opencost/templates/deployment.yaml +# Source: dataplane/charts/prometheus/templates/deploy.yaml apiVersion: apps/v1 kind: Deployment metadata: - name: release-name-opencost - namespace: 
union labels: - helm.sh/chart: opencost-1.42.0 - app.kubernetes.io/name: opencost + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "1.111.0" - app.kubernetes.io/part-of: opencost + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: prometheus + name: union-operator-prometheus + namespace: union spec: - replicas: 1 selector: matchLabels: - app.kubernetes.io/name: opencost + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name - strategy: - rollingUpdate: - maxSurge: 1 - maxUnavailable: 1 - type: RollingUpdate + replicas: 1 + revisionHistoryLimit: 10 + strategy: + type: Recreate + rollingUpdate: null template: metadata: labels: - app.kubernetes.io/name: opencost + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: prometheus spec: - serviceAccountName: release-name-opencost + enableServiceLinks: true + serviceAccountName: union-operator-prometheus containers: - - name: release-name-opencost - image: ghcr.io/opencost/opencost:1.111.0@sha256:6aa68e52a24b14ba41f23db08d1b9db1429a1c0300f4c0381ecc2c61fc311a97 - imagePullPolicy: IfNotPresent + - name: prometheus-server-configmap-reload + image: "quay.io/prometheus-operator/prometheus-config-reloader:v0.76.0" + imagePullPolicy: "IfNotPresent" args: + - --watched-dir=/etc/config + - --listen-address=0.0.0.0:8080 + - --reload-url=http://127.0.0.1:9090/prometheus/-/reload ports: - - containerPort: 9003 - name: http - resources: - limits: - cpu: 1000m - memory: 4Gi - requests: - cpu: 500m - memory: 1Gi - startupProbe: - httpGet: - path: /healthz - port: 9003 - initialDelaySeconds: 10 - 
periodSeconds: 5 - failureThreshold: 30 + - containerPort: 8080 + name: metrics livenessProbe: httpGet: path: /healthz - port: 9003 - initialDelaySeconds: 10 - periodSeconds: 20 - failureThreshold: 3 + port: metrics + scheme: HTTP + initialDelaySeconds: 2 + periodSeconds: 10 readinessProbe: httpGet: path: /healthz - port: 9003 - initialDelaySeconds: 10 + port: metrics + scheme: HTTP periodSeconds: 10 + volumeMounts: + - name: config-volume + mountPath: /etc/config + readOnly: true + + - name: prometheus-server + image: "quay.io/prometheus/prometheus:v2.54.1" + imagePullPolicy: "IfNotPresent" + args: + - --storage.tsdb.retention.time=3d + - --config.file=/etc/config/prometheus.yml + - --storage.tsdb.path=/data + - --web.console.libraries=/etc/prometheus/console_libraries + - --web.console.templates=/etc/prometheus/consoles + - --web.enable-lifecycle + - --web.route-prefix=/prometheus + - --web.external-url=/prometheus/ + ports: + - containerPort: 9090 + readinessProbe: + httpGet: + path: /prometheus/-/ready + port: 9090 + scheme: HTTP + initialDelaySeconds: 30 + periodSeconds: 5 + timeoutSeconds: 4 failureThreshold: 3 - env: - - name: LOG_LEVEL - value: info - - name: CUSTOM_COST_ENABLED - value: "false" - - name: KUBECOST_NAMESPACE - value: union - - name: API_PORT - value: "9003" - - name: PROMETHEUS_SERVER_ENDPOINT - value: "http://union-operator-prometheus.union.svc:80/prometheus" - - name: CLUSTER_ID - value: "default-cluster" - - name: DATA_RETENTION_DAILY_RESOLUTION_DAYS - value: "15" - - name: CLOUD_COST_ENABLED - value: "false" - - name: CLOUD_COST_MONTH_TO_DATE_INTERVAL - value: "6" - - name: CLOUD_COST_REFRESH_RATE_HOURS - value: "6" - - name: CLOUD_COST_QUERY_WINDOW_DAYS - value: "7" - - name: CLOUD_COST_RUN_WINDOW_DAYS - value: "3" - # Add any additional provided variables + successThreshold: 1 + livenessProbe: + httpGet: + path: /prometheus/-/healthy + port: 9090 + scheme: HTTP + initialDelaySeconds: 30 + periodSeconds: 15 + timeoutSeconds: 10 + 
failureThreshold: 3 + successThreshold: 1 + resources: + limits: + cpu: "3" + memory: 3500Mi + requests: + cpu: "1" + memory: 1Gi + volumeMounts: + - name: config-volume + mountPath: /etc/config + - name: storage-volume + mountPath: /data + subPath: "" + - name: server-recording-rules + mountPath: /etc/config/recording + subPath: + readOnly: + dnsPolicy: ClusterFirst + securityContext: + fsGroup: 65534 + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + terminationGracePeriodSeconds: 300 + volumes: + - name: config-volume + configMap: + name: union-operator-prometheus + - name: server-recording-rules + configMap: + name: union-recording-rules + - name: storage-volume + emptyDir: + sizeLimit: 10Gi --- -# Source: dataplane/templates/clusterresourcesync/deployment.yaml +# Source: dataplane/templates/flyteconnector/deployment.yaml apiVersion: apps/v1 kind: Deployment metadata: - name: union-syncresources + name: flyteconnector namespace: union - labels: - app.kubernetes.io/name: clusterresourcesync + labels: + app.kubernetes.io/name: flyteconnector app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm spec: - replicas: 1 + replicas: 2 selector: - matchLabels: - app.kubernetes.io/name: clusterresourcesync + matchLabels: + app.kubernetes.io/name: flyteconnector app.kubernetes.io/instance: release-name template: metadata: annotations: - configChecksum: "c4724efa4c7236a0f6d5b124721f726008c901cf543e57636813146b2119713" - labels: platform.union.ai/zone: "dataplane" - app.kubernetes.io/name: clusterresourcesync + app.kubernetes.io/name: flyteconnector app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm spec: containers: - command: - - clusterresource - - --config - - /etc/flyte/config/*.yaml - - clusterresource - - run - env: - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_NAMESPACE - 
valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: GOMEMLIMIT - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.memory - - name: GOMAXPROCS - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.cpu - - name: CLUSTER_NAME - valueFrom: - secretKeyRef: - name: operator-cluster-name - key: cluster_name - - name: DEPLOYMENT_NAME - value: operator - - name: PROXY_SERVICE_URL - value: http://union-operator-proxy:8080 - - name: PROMETHEUS_SERVICE_URL - value: http://union-operator-prometheus:80 - - name: KNATIVE_PROXY_SERVICE_URL - value: http://kourier-internal - image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" + - c0 + image: "ghcr.io/flyteorg/flyte-connectors:py3.13-2.0.0b50.dev3-g695bb1db3.d20260122" imagePullPolicy: "IfNotPresent" - name: sync-cluster-resources - resources: - limits: - cpu: "1" - memory: 500Mi - requests: - cpu: 500m - memory: 100Mi - volumeMounts: - - name: auth - mountPath: /etc/union/secret - - name: resource-templates - mountPath: /etc/flyte/clusterresource/templates - - name: config-volume - mountPath: /etc/flyte/config + name: flyteconnector + readinessProbe: + failureThreshold: 3 + httpGet: + path: / + port: 9090 + scheme: HTTP + initialDelaySeconds: 5 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 5 + env: + - name: AIRFLOW_CONN_GOOGLE_CLOUD_DEFAULT + value: "google-cloud-platform://" + - name: LOG_LEVEL + value: "20" ports: - - name: debug - containerPort: 10254 - protocol: TCP - serviceAccountName: union-clustersync-system - volumes: - - configMap: - name: union-clusterresource-template - name: resource-templates - - configMap: - name: union-clusterresourcesync-config - name: config-volume - - name: auth - secret: - secretName: union-secret-auth + - containerPort: 8000 + name: grpc + - containerPort: 9090 + name: metric + resources: + limits: + cpu: "1.5" + ephemeral-storage: 100Mi + memory: 1500Mi + requests: + cpu: "1" + ephemeral-storage: 100Mi + memory: 1000Mi + 
serviceAccountName: flyteconnector --- # Source: dataplane/templates/imagebuilder/deployment.yaml apiVersion: apps/v1 @@ -5831,17 +6223,15 @@ spec: app.kubernetes.io/instance: release-name template: metadata: - annotations: - container.apparmor.security.beta.kubernetes.io/buildkit: unconfined labels: platform.union.ai/zone: "dataplane" app.kubernetes.io/name: imagebuilder-buildkit app.kubernetes.io/instance: release-name spec: - serviceAccountName: "union-imagebuilder" + serviceAccountName: "union-system" containers: - name: "buildkit" - image: "docker.io/moby/buildkit:buildx-stable-1-rootless" + image: "docker.io/moby/buildkit:buildx-stable-1" imagePullPolicy: IfNotPresent env: - name: POD_NAME @@ -5876,18 +6266,15 @@ spec: - name: KNATIVE_PROXY_SERVICE_URL value: http://kourier-internal volumeMounts: - - mountPath: /home/user/.local/share/buildkit - name: buildkitd - mountPath: /etc/buildkit name: buildkit-config args: - --config - /etc/buildkit/buildkitd.toml - --addr - - unix:///run/user/1000/buildkit/buildkitd.sock + - unix:///run//buildkit/buildkitd.sock - --addr - tcp://0.0.0.0:1234 - - --oci-worker-no-process-sandbox ports: - name: tcp containerPort: 1234 @@ -5909,18 +6296,13 @@ spec: initialDelaySeconds: 5 periodSeconds: 30 securityContext: - seccompProfile: # Needs Kubernetes >= 1.19 - type: Unconfined - runAsUser: 1000 - runAsGroup: 1000 + privileged: true resources: requests: - cpu: 1 - ephemeral-storage: 20Gi - memory: 1Gi + cpu: 4 + ephemeral-storage: 50Gi + memory: 4Gi volumes: - - name: buildkitd - emptyDir: {} - configMap: name: union-operator-buildkit name: buildkit-config @@ -5949,16 +6331,18 @@ spec: template: metadata: annotations: - configChecksum: "80028a87d6450e2f927cf0995d0e9dccd3df17b1ccd1dbb085e52947baeec82" + configChecksum: "5b4caae6721fc6eb2d30673d2acc929faf173a7de2681af42f781abbe227452" labels: platform.union.ai/zone: "dataplane" app: executor + app.kubernetes.io/instance: 'release-name' + app.kubernetes.io/name: executor spec: 
securityContext: fsGroup: 1337 - serviceAccountName: executor + serviceAccountName: union-system volumes: - name: config-volume configMap: @@ -5974,7 +6358,7 @@ spec: image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" imagePullPolicy: IfNotPresent command: - - executor + - executorv2 - serve - --config - /etc/config/*.yaml @@ -6052,7 +6436,7 @@ spec: template: metadata: annotations: - configChecksum: "f62821d85fbda1c246ee1291b8106d9f51b7b33b6b890d707fc2ca31b84350e" + configChecksum: "b7464b41c6e640059db085f351b8de1b9426fd12ccfcc95146b5f338b1a6338" labels: @@ -6067,12 +6451,10 @@ spec: sources: - configMap: name: union-operator - - configMap: - name: union-clusterresourcesync-config - name: secret-volume secret: secretName: union-secret-auth - serviceAccountName: proxy-system + serviceAccountName: union-system securityContext: {} containers: @@ -6189,7 +6571,7 @@ spec: template: metadata: annotations: - configChecksum: "f62821d85fbda1c246ee1291b8106d9f51b7b33b6b890d707fc2ca31b84350e" + configChecksum: "b7464b41c6e640059db085f351b8de1b9426fd12ccfcc95146b5f338b1a6338" labels: @@ -6198,7 +6580,7 @@ spec: platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm spec: - serviceAccountName: operator-system + serviceAccountName: union-system securityContext: {} volumes: @@ -6279,81 +6661,16 @@ spec: containerPort: 10254 protocol: TCP --- -# Source: dataplane/templates/prometheus/deployment.yaml -apiVersion: apps/v1 -kind: Deployment -metadata: - name: union-operator-prometheus - namespace: union - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus -spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/component: prometheus - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - template: - 
metadata: - annotations: - configChecksum: "419ca35014526f736b956303f4160d73038d8a65f6d174286647884e21e6b4e" - labels: - app.kubernetes.io/component: prometheus - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - spec: - priorityClassName: system-cluster-critical - serviceAccountName: union-operator-prometheus - securityContext: - fsGroup: 65534 - runAsNonRoot: true - runAsUser: 65534 - fsGroupChangePolicy: OnRootMismatch - containers: - - name: prometheus - image: "prom/prometheus:v3.3.1" - args: - - --config.file=/etc/prometheus/prometheus.yml - - --web.external-url=/prometheus/ - - --web.route-prefix=/prometheus/ - - --storage.tsdb.retention.time=3d - ports: - - name: http - containerPort: 9090 - protocol: TCP - resources: - limits: - cpu: "3" - memory: 3500Mi - requests: - cpu: "1" - memory: 1Gi - securityContext: - allowPrivilegeEscalation: false - terminationMessagePolicy: FallbackToLogsOnError - volumeMounts: - - mountPath: /etc/prometheus - name: prometheus-config - volumes: - - name: prometheus-config - configMap: - name: union-operator-prometheus ---- -# Source: dataplane/templates/propeller/deployment-webhook.yaml +# Source: dataplane/templates/webhook/deployment.yaml +# Webhook deployment # Create the actual deployment apiVersion: apps/v1 kind: Deployment metadata: - name: flytepropeller-webhook + name: union-pod-webhook namespace: union labels: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm @@ -6361,19 +6678,19 @@ spec: replicas: 1 selector: matchLabels: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name template: metadata: labels: platform.union.ai/zone: "dataplane" - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook 
app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm annotations: - configChecksum: "045ec67f3450b2e8ed72becbc5587d9a49b6ea4a7e6d905e089999a096eec0d" + configChecksum: "77958aee0464eee4cb43c70036cf5ffa89f47f58c908845b9eeaecc7e32bba4" spec: securityContext: @@ -6381,64 +6698,8 @@ spec: fsGroupChangePolicy: Always runAsNonRoot: true runAsUser: 1001 - seLinuxOptions: - type: spc_t - serviceAccountName: flytepropeller-webhook-system - initContainers: - - name: generate-secrets - image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" - imagePullPolicy: "IfNotPresent" - command: - - flytepropeller - args: - - webhook - - init-certs - - --config - - /etc/flyte/config/*.yaml - env: - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: GOMEMLIMIT - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.memory - - name: GOMAXPROCS - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.cpu - - name: CLUSTER_NAME - valueFrom: - secretKeyRef: - name: operator-cluster-name - key: cluster_name - - name: DEPLOYMENT_NAME - value: operator - - name: PROXY_SERVICE_URL - value: http://union-operator-proxy:8080 - - name: PROMETHEUS_SERVICE_URL - value: http://union-operator-prometheus:80 - - name: KNATIVE_PROXY_SERVICE_URL - value: http://kourier-internal - volumeMounts: - - name: config-volume - mountPath: /etc/flyte/config - resources: - limits: - cpu: 1 - ephemeral-storage: 500Mi - memory: 500Mi - requests: - cpu: 200m - ephemeral-storage: 500Mi - memory: 500Mi + seLinuxOptions: null + serviceAccountName: union-system containers: - name: webhook image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" @@ -6503,107 +6764,85 @@ spec: volumes: - name: config-volume configMap: - name: flyte-propeller-config + name: union-pod-webhook-config - name: webhook-certs secret: - secretName: 
flyte-pod-webhook + secretName: union-pod-webhook --- -# Source: dataplane/templates/propeller/deployment.yaml -apiVersion: apps/v1 -kind: Deployment +# Source: dataplane/templates/flyteconnector/hpa.yaml +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler metadata: - namespace: union - name: flytepropeller + name: flyteconnector labels: - app.kubernetes.io/name: flytepropeller + app.kubernetes.io/name: flyteconnector app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/name: flytepropeller - app.kubernetes.io/instance: release-name - template: - metadata: - annotations: - configChecksum: "045ec67f3450b2e8ed72becbc5587d9a49b6ea4a7e6d905e089999a096eec0d" - - labels: - platform.union.ai/zone: "dataplane" - - - app.kubernetes.io/name: flytepropeller - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm - spec: - priorityClassName: system-cluster-critical - containers: - - command: - - flytepropeller - - --config - - /etc/flyte/config/*.yaml - - --propeller.cluster-id - - 'test-cluster-name' - env: - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: GOMEMLIMIT - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.memory - - name: GOMAXPROCS - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.cpu - - name: CLUSTER_NAME - valueFrom: - secretKeyRef: - name: operator-cluster-name - key: cluster_name - - name: DEPLOYMENT_NAME - value: operator - - name: PROXY_SERVICE_URL - value: http://union-operator-proxy:8080 - - name: PROMETHEUS_SERVICE_URL - value: http://union-operator-prometheus:80 - - name: KNATIVE_PROXY_SERVICE_URL - value: http://kourier-internal - image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" - 
imagePullPolicy: "IfNotPresent" - name: flytepropeller - ports: - - containerPort: 10254 - resources: - limits: - cpu: "3" - memory: 3Gi - requests: - cpu: "1" - memory: 1Gi - volumeMounts: - - name: config-volume - mountPath: /etc/flyte/config - - name: auth - mountPath: /etc/union/secret - serviceAccountName: flytepropeller-system - volumes: - - configMap: - name: flyte-propeller-config - name: config-volume - - name: auth - secret: - secretName: union-secret-auth + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: flyteconnector + minReplicas: 2 + maxReplicas: 5 + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 80 + - type: Resource + resource: + name: memory + target: + type: Utilization + averageUtilization: 80 +--- +# Source: dataplane/templates/webhook/mutatingwebhookconfiguration.yaml +apiVersion: admissionregistration.k8s.io/v1 +kind: MutatingWebhookConfiguration +metadata: + name: union-pod-webhook-test-org-name + labels: + app.kubernetes.io/name: union-pod-webhook + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +webhooks: + - name: union-pod-webhook.flyte.org + admissionReviewVersions: + - v1 + - v1beta1 + clientConfig: + caBundle: 
LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUROekNDQWgrZ0F3SUJBZ0lVSm9kQ09lem4vd1BHQTZjYkw1U3duSlpYT2hJd0RRWUpLb1pJaHZjTkFRRUwKQlFBd0t6RXBNQ2NHQTFVRUF3d2dabXg1ZEdWd2NtOXdaV3hzWlhJdGQyVmlhRzl2YXk1MWJtbHZiaTV6ZG1NdwpIaGNOTWpZd01USXpNREl6TmpVMVdoY05Nell3TVRJeE1ESXpOalUxV2pBck1Ta3dKd1lEVlFRRERDQm1iSGwwClpYQnliM0JsYkd4bGNpMTNaV0pvYjI5ckxuVnVhVzl1TG5OMll6Q0NBU0l3RFFZSktvWklodmNOQVFFQkJRQUQKZ2dFUEFEQ0NBUW9DZ2dFQkFJcjBSWDRGRjFNVTNYdFBmUHErMDUrMGI1Qm0xK3hsTVR5ZUp4bG5mWFJuRlgyegpPcHV6NmsreUpBQk4xRkxmNDVYRnlJclFlaW5DWWJtckZXalFRc3ZDWEloNkpPNGMwdmRMVVU5RERYcldWRFlsCkRNZytTUDlJVnZBMm5USkVFZ2RFREVsKzNWeThUbUJoNlNmcGthcXdHajY5SmhIdy9OeE9yZzM2bUY3TU5VZ1MKWXJVUkxqUFFzTUN6NElTVzhhQnEzS2IwRUFWUTA4V1lZQ0ZEbjY1aHBzVFdxR05rL3BKRnVTYjlsZll6RVdyYwpJeW5wNDhqTnIvNUdKS3NnWG9LeTUwZVo4UkppeTVyOUhENGtnSW15TVNQR2RDMXFZSVppRHRlZW8yOElxNk5nCmd1dWcyTGhlNndYYW9YZXFCQ2d3SHpwWFVXM1hJVzI5eFoxUkFRRUNBd0VBQWFOVE1GRXdIUVlEVlIwT0JCWUUKRktITDQ1RWhQaVg5bnlRN241QUk0YUZVNFlCL01COEdBMVVkSXdRWU1CYUFGS0hMNDVFaFBpWDlueVE3bjVBSQo0YUZVNFlCL01BOEdBMVVkRXdFQi93UUZNQU1CQWY4d0RRWUpLb1pJaHZjTkFRRUxCUUFEZ2dFQkFEeTRac21XCkNjZ3B5TmI4cU1ydzNYclBNejZLMTRlTVhqQVk1bERFYXM1NklLZExoZFM1bi9sWk95bENuUUtOZ3RHS0VrY3UKT05tMEE2Z0xNT3UzcE43dG9aNXh0WktOcnUvS0x6dm44QUNNNndCbUR0bG5oRERDa3ZUTHBVRzh6UTZSNmlaegpzTWlhVmpLSUJiWkJsZXNYSHZZQnQ2YlRTN2o1QWM2aTdYc1RPK3ZqQ2RUQjVYdCtEZ0x2QnBJRWh2ZGJzNXhmClFtc3ZzQmwxenJnR3JXemo4d0txcXNUMzZaOVU1TzdSK1RJMlQrUDVOR25SeHNQOGJ0R0RtMGZ5SW55WlJrSkcKVTdpVFJyMlZRWlRyaTNZQ2dlVjN1UjFtMnJjc2hyU1Avb082T0QyTXR5Q1FMRWtFK1pxVUVJVnpSYmVaNXl2RQptM29YMFV2OUtBUy9pWTA9Ci0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K + service: + name: union-pod-webhook + namespace: union + path: /mutate--v1-pod/secrets + port: 443 + failurePolicy: Fail + matchPolicy: Equivalent + namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: union + objectSelector: + matchLabels: + inject-flyte-secrets: "true" + organization: 'test-org-name' + reinvocationPolicy: Never + rules: + - apiGroups: + - '*' + apiVersions: + - v1 + operations: + - CREATE + 
resources: + - pods + scope: '*' + sideEffects: NoneOnDryRun + timeoutSeconds: 30 --- # Source: dataplane/charts/dcgm-exporter/templates/tls-secret.yaml # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. @@ -6635,6 +6874,222 @@ spec: # See the License for the specific language governing permissions and # limitations under the License. --- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Imported via https://github.com/knative/operator/releases/download/knative-v1.16.0/operator.yaml +# Copyright 2024 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +# Source: dataplane/templates/propeller/serviceaccount.yaml +--- +--- +# Source: dataplane/templates/serving/knative-serving.yaml +apiVersion: operator.knative.dev/v1beta1 +kind: KnativeServing +metadata: + name: union-operator-serving + namespace: union + labels: + helm.sh/chart: dataplane-2026.4.4 + app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: serving +spec: + config: + deployment: + progress-deadline: "30m" + queue-sidecar-cpu-request: "25m" + queue-sidecar-cpu-limit: "1000m" + queue-sidecar-memory-request: "400Mi" + queue-sidecar-memory-limit: "800Mi" + queue-sidecar-ephemeral-storage-request: "512Mi" + queue-sidecar-ephemeral-storage-limit: "1024Mi" + registries-skipping-tag-resolving: managed.cr.union.ai + features: + kubernetes.podspec-affinity: "enabled" + kubernetes.podspec-nodeselector: "enabled" + kubernetes.podspec-tolerations: "enabled" + kubernetes.podspec-fieldref: "enabled" + kubernetes.podspec-dnspolicy: "enabled" + kubernetes.podspec-schedulername: "enabled" + kubernetes.podspec-securitycontext: "enabled" + network: + ingress-class: "kourier.ingress.networking.knative.dev" + high-availability: + replicas: 2 + ingress: + kourier: + enabled: true + bootstrap-configmap: "union-operator-serving-envoy-bootstrap" + service-type: ClusterIP + podDisruptionBudgets: + - name: 3scale-kourier-gateway-pdb + minAvailable: 50% + - name: activator-pdb + minAvailable: 50% + - name: webhook-pdb + minAvailable: 50% + registry: + override: + # TODO(jeev): Wire up Union fork of Envoy + 3scale-kourier-gateway/kourier-gateway: ghcr.io/unionai/envoy:456fed84d4ad9a9dfb186d117d9362e9dc0f7c1f + # TODO(jeev): Wire up Union fork of Kourier + net-kourier-controller/controller: ghcr.io/unionai/kourier@sha256:5804c348d15b3959604e3e3ceed216c3a1c7b32cbe254c7d3eb02a35e62ba9c4 + workloads: + - name: 3scale-kourier-gateway + 
labels: + helm.sh/chart: dataplane-2026.4.4 + app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: 3scale-kourier-gateway + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchLabels: + app.kubernetes.io/name: 3scale-kourier-gateway + app.kubernetes.io/instance: release-name + topologyKey: topology.kubernetes.io/zone + annotations: + checksum/bootstrap-config: 6469f6f592eebc9e0c676d8ccc359a05bc799c0988e474b2da942c4cc328f656 + env: + - container: kourier-gateway + envVars: + - name: UNION_AUTHZ_TENANTAUTHURL + value: "https://test-controlplane-host/me" + - name: UNION_AUTHZ_TENANTAUTHSIGNINURL + value: "https://test-controlplane-host/login" + - name: UNION_AUTHZ_TENANTCONTROLPLANEURL + value: "https://test-controlplane-host" + resources: + - container: kourier-gateway + limits: + cpu: "2" + memory: 2Gi + requests: + cpu: "1" + memory: 1Gi + - name: activator + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - activator + topologyKey: topology.kubernetes.io/zone + - name: autoscaler + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - autoscaler + topologyKey: topology.kubernetes.io/zone + - name: autoscaler-hpa + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - autoscaler-hpa + topologyKey: topology.kubernetes.io/zone + - name: controller + affinity: + podAntiAffinity: + 
preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - controller + topologyKey: topology.kubernetes.io/zone + - name: net-kourier-controller + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - net-kourier-controller + topologyKey: topology.kubernetes.io/zone + env: + - container: controller + envVars: + - name: KOURIER_UNION_AUTHZ_ENABLED + value: "true" + resources: + - container: controller + limits: + cpu: "1" + memory: 1Gi + requests: + cpu: 500m + memory: 500Mi + - name: webhook + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - webhook + topologyKey: topology.kubernetes.io/zone +--- +# Source: dataplane/templates/common/task-podtemplate.yaml +apiVersion: v1 +kind: PodTemplate +metadata: + name: task-template + namespace: union +template: + spec: + serviceAccountName: union + containers: + - name: default + image: docker.io/rwgrim/docker-noop +--- # Source: dataplane/charts/fluentbit/templates/tests/test-connection.yaml apiVersion: v1 kind: Pod diff --git a/tests/generated/dataplane.azure-custom-storage-prefix.yaml b/tests/generated/dataplane.azure-custom-storage-prefix.yaml index c4f27ce9..aa41433d 100644 --- a/tests/generated/dataplane.azure-custom-storage-prefix.yaml +++ b/tests/generated/dataplane.azure-custom-storage-prefix.yaml @@ -1,167 +1,140 @@ --- -# Source: dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: flytesnacks-development ---- -# Source: dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: flytesnacks-staging ---- -# Source: 
dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: flytesnacks-production ---- -# Source: dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: union-health-monitoring-development ---- -# Source: dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: union-health-monitoring-staging ---- -# Source: dataplane/templates/common/namespaces.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + apiVersion: v1 -kind: Namespace +kind: ServiceAccount metadata: - name: union-health-monitoring-production + name: knative-operator + namespace: "union" + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator --- -# Source: dataplane/charts/fluentbit/templates/serviceaccount.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + apiVersion: v1 kind: ServiceAccount metadata: - name: fluentbit-system - namespace: union + name: operator-webhook + namespace: "union" labels: - helm.sh/chart: fluentbit-0.48.9 - app.kubernetes.io/name: fluentbit - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "3.2.8" - app.kubernetes.io/managed-by: Helm + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator --- -# Source: dataplane/charts/kube-state-metrics/templates/serviceaccount.yaml +# Source: dataplane/charts/prometheus/charts/kube-state-metrics/templates/serviceaccount.yaml apiVersion: v1 kind: ServiceAccount automountServiceAccountToken: true metadata: labels: - helm.sh/chart: kube-state-metrics-5.30.1 + helm.sh/chart: kube-state-metrics-5.25.1 app.kubernetes.io/managed-by: Helm app.kubernetes.io/component: metrics app.kubernetes.io/part-of: kube-state-metrics app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" + app.kubernetes.io/version: "2.13.0" name: release-name-kube-state-metrics namespace: union --- -# Source: dataplane/charts/opencost/templates/serviceaccount.yaml +# Source: dataplane/charts/prometheus/templates/serviceaccount.yaml apiVersion: v1 kind: ServiceAccount metadata: - name: release-name-opencost - namespace: union labels: - helm.sh/chart: opencost-1.42.0 - app.kubernetes.io/name: opencost + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "1.111.0" - 
app.kubernetes.io/part-of: opencost + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 app.kubernetes.io/managed-by: Helm -automountServiceAccountToken: true ---- -# Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: union-clustersync-system + app.kubernetes.io/part-of: prometheus + name: union-operator-prometheus namespace: union annotations: - azure.workload.identity/client-id: 'test-backend-client-id' ---- -# Source: dataplane/templates/imagebuilder/serviceaccount.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: union-imagebuilder + {} --- -# Source: dataplane/templates/nodeexecutor/serviceaccount.yaml +# Source: dataplane/templates/common/system-serviceaccount.yaml apiVersion: v1 kind: ServiceAccount metadata: - name: executor + name: union-system namespace: union - labels: - app: executor - annotations: - azure.workload.identity/client-id: 'test-backend-client-id' ---- -# Source: dataplane/templates/operator/serviceaccount-proxy.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: proxy-system - labels: - app.kubernetes.io/name: operator-proxy - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm annotations: azure.workload.identity/client-id: 'test-backend-client-id' --- -# Source: dataplane/templates/operator/serviceaccount.yaml +# Source: dataplane/templates/common/union-serviceaccount.yaml apiVersion: v1 kind: ServiceAccount metadata: - name: operator-system - labels: - app.kubernetes.io/name: union-operator - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm + name: union + namespace: union annotations: - azure.workload.identity/client-id: 'test-backend-client-id' + azure.workload.identity/client-id: test-worker-client-id +automountServiceAccountToken: true --- -# Source: 
dataplane/templates/prometheus/serviceaccount.yaml +# Source: dataplane/templates/flyteconnector/serviceaccount.yaml apiVersion: v1 kind: ServiceAccount metadata: - name: union-operator-prometheus + name: flyteconnector namespace: union - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane + labels: + app.kubernetes.io/name: flyteconnector app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus ---- -# Source: dataplane/templates/propeller/serviceaccount-webhook.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: flytepropeller-webhook-system - namespace: union - annotations: - azure.workload.identity/client-id: 'test-backend-client-id' --- -# Source: dataplane/templates/propeller/serviceaccount.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + apiVersion: v1 -kind: ServiceAccount +kind: Secret metadata: - name: flytepropeller-system - namespace: union - annotations: - azure.workload.identity/client-id: 'test-backend-client-id' + name: operator-webhook-certs + namespace: "union" + labels: + app.kubernetes.io/component: operator-webhook + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +# The data is populated at install time. 
--- # Source: dataplane/templates/common/auth-secret.yaml apiVersion: v1 @@ -184,154 +157,533 @@ type: Opaque data: cluster_name: dGVzdC1henVyZS1jbHVzdGVy --- -# Source: dataplane/templates/propeller/deployment-webhook.yaml -# Create an empty secret that the first propeller pod will populate +# Source: dataplane/templates/webhook/mutatingwebhookconfiguration.yaml apiVersion: v1 kind: Secret metadata: - name: flyte-pod-webhook + name: union-pod-webhook namespace: union + labels: + app.kubernetes.io/name: union-pod-webhook + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm type: Opaque +data: + ca.crt: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUROekNDQWgrZ0F3SUJBZ0lVSm9kQ09lem4vd1BHQTZjYkw1U3duSlpYT2hJd0RRWUpLb1pJaHZjTkFRRUwKQlFBd0t6RXBNQ2NHQTFVRUF3d2dabXg1ZEdWd2NtOXdaV3hzWlhJdGQyVmlhRzl2YXk1MWJtbHZiaTV6ZG1NdwpIaGNOTWpZd01USXpNREl6TmpVMVdoY05Nell3TVRJeE1ESXpOalUxV2pBck1Ta3dKd1lEVlFRRERDQm1iSGwwClpYQnliM0JsYkd4bGNpMTNaV0pvYjI5ckxuVnVhVzl1TG5OMll6Q0NBU0l3RFFZSktvWklodmNOQVFFQkJRQUQKZ2dFUEFEQ0NBUW9DZ2dFQkFJcjBSWDRGRjFNVTNYdFBmUHErMDUrMGI1Qm0xK3hsTVR5ZUp4bG5mWFJuRlgyegpPcHV6NmsreUpBQk4xRkxmNDVYRnlJclFlaW5DWWJtckZXalFRc3ZDWEloNkpPNGMwdmRMVVU5RERYcldWRFlsCkRNZytTUDlJVnZBMm5USkVFZ2RFREVsKzNWeThUbUJoNlNmcGthcXdHajY5SmhIdy9OeE9yZzM2bUY3TU5VZ1MKWXJVUkxqUFFzTUN6NElTVzhhQnEzS2IwRUFWUTA4V1lZQ0ZEbjY1aHBzVFdxR05rL3BKRnVTYjlsZll6RVdyYwpJeW5wNDhqTnIvNUdKS3NnWG9LeTUwZVo4UkppeTVyOUhENGtnSW15TVNQR2RDMXFZSVppRHRlZW8yOElxNk5nCmd1dWcyTGhlNndYYW9YZXFCQ2d3SHpwWFVXM1hJVzI5eFoxUkFRRUNBd0VBQWFOVE1GRXdIUVlEVlIwT0JCWUUKRktITDQ1RWhQaVg5bnlRN241QUk0YUZVNFlCL01COEdBMVVkSXdRWU1CYUFGS0hMNDVFaFBpWDlueVE3bjVBSQo0YUZVNFlCL01BOEdBMVVkRXdFQi93UUZNQU1CQWY4d0RRWUpLb1pJaHZjTkFRRUxCUUFEZ2dFQkFEeTRac21XCkNjZ3B5TmI4cU1ydzNYclBNejZLMTRlTVhqQVk1bERFYXM1NklLZExoZFM1bi9sWk95bENuUUtOZ3RHS0VrY3UKT05tMEE2Z0xNT3UzcE43dG9aNXh0WktOcnUvS0x6dm44QUNNNndCbUR0bG5oRERDa3ZUTHBVRzh6UTZSNmlaegpzTWlhVmpLSUJiWkJsZXNYSHZZQnQ2YlRTN2o1QWM2aTdYc1RPK3ZqQ2RUQjVYdCt
EZ0x2QnBJRWh2ZGJzNXhmClFtc3ZzQmwxenJnR3JXemo4d0txcXNUMzZaOVU1TzdSK1RJMlQrUDVOR25SeHNQOGJ0R0RtMGZ5SW55WlJrSkcKVTdpVFJyMlZRWlRyaTNZQ2dlVjN1UjFtMnJjc2hyU1Avb082T0QyTXR5Q1FMRWtFK1pxVUVJVnpSYmVaNXl2RQptM29YMFV2OUtBUy9pWTA9Ci0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K + tls.crt: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUQxakNDQXI2Z0F3SUJBZ0lVREtRN3A4MFpIdkQyUE5Ya3p3VGRLem9iUjg4d0RRWUpLb1pJaHZjTkFRRUwKQlFBd0t6RXBNQ2NHQTFVRUF3d2dabXg1ZEdWd2NtOXdaV3hzWlhJdGQyVmlhRzl2YXk1MWJtbHZiaTV6ZG1NdwpIaGNOTWpZd01USXpNREl6TmpVMVdoY05NamN3TVRJek1ESXpOalUxV2pBck1Ta3dKd1lEVlFRRERDQm1iSGwwClpYQnliM0JsYkd4bGNpMTNaV0pvYjI5ckxuVnVhVzl1TG5OMll6Q0NBU0l3RFFZSktvWklodmNOQVFFQkJRQUQKZ2dFUEFEQ0NBUW9DZ2dFQkFMNnMxUTRaZ3ZxVVdNdFp5VC9EKytEQWl6WmR0Q1lkVzArSGk4S3phQ1Vjbi9zZwo3a21NTEhDM2ZlZWtZK2ZUNENHUTd6K1l2WjRWWSt6WGljKzdrNndLbUZqdm1acEM1Rm96OWo1Q21NRk5HZmlWCnRobzVvSU9MUkIzNXRYMU1ROHUwNFVzT29QcjdMK016eEdxWXVmTFdtN1NVdjh0eEp3ZFMxZVlzMllNV1pMeU4KekI4akdWY01Qc2ZNVkdxdFhTTVljQ3hNYURKaTM1N0VEZSs3clM2U2ZtbnFtYjJxekdQM1BRRStCUm91Yk5nLwo4NCt4c253VXBLZUFpem95dFM2c2ZlOTdMUEtRTW9oVzd2TjZORFJrLy9xSmpHem9iZmZwUHU3endhWHVXMWtMCmllc2g2cVRrdlZLSTF3WFJ2WHArVU5GbjZzdDIwdEVlRks0ZzMwOENBd0VBQWFPQjhUQ0I3akFKQmdOVkhSTUUKQWpBQU1Bc0dBMVVkRHdRRUF3SUY0RENCa3dZRFZSMFJCSUdMTUlHSWdoWm1iSGwwWlhCeWIzQmxiR3hsY2kxMwpaV0pvYjI5cmdoeG1iSGwwWlhCeWIzQmxiR3hsY2kxM1pXSm9iMjlyTG5WdWFXOXVnaUJtYkhsMFpYQnliM0JsCmJHeGxjaTEzWldKb2IyOXJMblZ1YVc5dUxuTjJZNEl1Wm14NWRHVndjbTl3Wld4c1pYSXRkMlZpYUc5dmF5NTEKYm1sdmJpNXpkbU11WTJ4MWMzUmxjaTVzYjJOaGJEQWRCZ05WSFE0RUZnUVV1d25wcDZ1MTNKL0NsNnI3eDZ1Sgo5YmZ1RDhzd0h3WURWUjBqQkJnd0ZvQVVvY3Zqa1NFK0pmMmZKRHVma0FqaG9WVGhnSDh3RFFZSktvWklodmNOCkFRRUxCUUFEZ2dFQkFFMmlOa1MzczhGT3Z0cDhrVmJNZUN2amxIQytvYTVlZTFGNmowNlZ3bFJPOWdxOXJla1UKMGptWUhNWGNjWjcrZXJkNFRQRXgwK3ZqMzNhRTV3cDFmdVJJL2xIZHhuOUpJRHg2bTFQTHdVa3p1WG9EekYxLwozVXVIWU1xOGkrT2xsY0tqbmdVMUFUNmRWRCtIbFVWeHpMdjlNazdPb05SQUY5dTJHa1hiUHdCN0I1ZGRxdWxlCmtNcXBxbDhES2s4SVNVelVsTjNBb2Q5NmJUelIyV3RCVkxpTi9zRHZtTDZwZThFYm55RnBUb1BZTDlGeWZOcmIKT1ZPZEJwUkZFTG1KOHZaZWY5d1lxRmVXMTE3SGV5VDArZkxGZVN
SemZUNVVWUlVraVFBcUVJdUhHUVRrOVFabgo1WnYvNUF0eEg3WE1rcXR0VE1zMFlSUG0xYjQzY3FBeFdhND0KLS0tLS1FTkQgQ0VSVElGSUNBVEUtLS0tLQo= + tls.key: LS0tLS1CRUdJTiBQUklWQVRFIEtFWS0tLS0tCk1JSUV2UUlCQURBTkJna3Foa2lHOXcwQkFRRUZBQVNDQktjd2dnU2pBZ0VBQW9JQkFRQytyTlVPR1lMNmxGakwKV2NrL3cvdmd3SXMyWGJRbUhWdFBoNHZDczJnbEhKLzdJTzVKakN4d3QzM25wR1BuMCtBaGtPOC9tTDJlRldQcwoxNG5QdTVPc0NwaFk3NW1hUXVSYU0vWStRcGpCVFJuNGxiWWFPYUNEaTBRZCtiVjlURVBMdE9GTERxRDYreS9qCk04UnFtTG55MXB1MGxML0xjU2NIVXRYbUxObURGbVM4amN3Zkl4bFhERDdIekZScXJWMGpHSEFzVEdneVl0K2UKeEEzdnU2MHVrbjVwNnBtOXFzeGo5ejBCUGdVYUxtellQL09Qc2JKOEZLU25nSXM2TXJVdXJIM3ZleXp5a0RLSQpWdTd6ZWpRMFpQLzZpWXhzNkczMzZUN3U4OEdsN2x0WkM0bnJJZXFrNUwxU2lOY0YwYjE2ZmxEUlorckxkdExSCkhoU3VJTjlQQWdNQkFBRUNnZ0VBRjV2SnMzbTRMSE9DdlFic2NwOC9DQmgvQkNlOC92MGlpYW5VUmJLMkFlWlYKN3Arb3NXV1FPUktYSGIvT2VPMVVjb09DQkFOUzh3aGQrM3pDZlB5U0w3cU9HM0RyT1Z6djdqVkIxM3FpZEVpcQpId3ZXWk0vZXpuckhYOWpEdm5SYmJwVUNVaXRKQmxwa2x3S1pYc056UHB5UTRkNkxFdEw3VEo1V2lxM2g3cWQzCjhncmJoeWlWNDhZRFAyYkhpZGt0S2QvMHBNL3piSzZPTS9iMFpLUmRBWVpHQXlOcklMaFJ3eDNsQXE2SDVqL20KUHBuNlZEdVQ3Y1h6UFhmNTgxR0IyNjQxWFJVTXRNWGVEREtaU3JudnIrbG5tV2lsYXZmNkg3ZHp4NlRBMmk0RApybCtIRVdUN1RSQUZGMnZ5SVUrYSt5RzNPVFBKRCtmVTdCWWh4Zld4aFFLQmdRRGVvcXVCbzQxK1dKU255blZnCmNWMHZoUEkwZ0dyRGQzNFdIbCtGNTZuSmxuMVVlbUQ2SUQ0TUhnNVpHWmNmd3NkRlB0aHl1VnB0OVl1SW11SGQKRHE5ZDhpWnpHUzJVbStkSWlsWjhnZXJ5Q3dON3hIbFM3SzJrc0RmRGJwYmZZSXZ1LzRnSExZRHh3YzQ0OG54ZwpWNGpCUFY2QTdYTHRqQzZzdFE2OW9uTlIwd0tCZ1FEYlFBUTRGdWpnUjJUOXh2Y01MdXhMbm1YaHE4U2NoZlM5CkRoWndDUzZIeHB1YlBxblBjQ04vQUZjbDZ1cDRXVEVZdVY4bEpwcGwwRmpYQ2lMdkhuUU15cm5vRjlUOThpanQKZGVHYjF6ZWlwRUpndFpNVHkvOGdERWorZU1wSFBuYTZwQVpMOVpRVUdLR2dDNGQ0SzljdVNHMmRlb1dSU0ViSQpMVGgzeGpXVEZRS0JnQStJRWFidG5nVmVjS0IwQTFSREZGa29VUzFRZUNKQ3g4MExPV2JDRHBvOW9XaXZVT3lpCkt6SDFOdE1JY2Y2SlBCV2NtTVVJSVVMaWltVnhTS2gvU2NTb0MvNmpsd1p6Q2VPSm94YjBpVXR4Y1VERktDR2MKMlZCUDZ0UDdkeE1HVFR6VEhzNUJZbWw3TjhQSlJ0d0J2MHliMTJmdktNRmhzaS9pUWJFQkVFSjVBb0dBUm9NSApHRmJkM0V0NXdsZzcyYkk1a25SRnhkY3RLejIxb1J1bndhNWlSWTV0T3Zkak8zQ1FLZWNkSC9lMklyQmtwdFB2Ck1vNkF0MS9
UUW8xakFNNGxlbnUwWUYxUnhiNGN4WW5VM2Y3UVNNRlZDNjg1dHZNemdNWVNyNngvT1h4d1NNUTUKdGpJcnhtN0poQ1JSRkNmZFUyZzl3SmpIM2hxRmtSbGlBTHRCUGFVQ2dZRUF2QkRHejQvYS9kRzRJT0VpREk4SgpBbkxaRWZaTFJrdnI0ejRyT0p0TGZtWlFjTW5DSUZmKzBaelhTME9XOWhvMjk5RUNtS1l4U2VUWk1WSDgzOFZCCnRvSkNGM3lzb01rSC9meEVXTUpicDlMblJwSFUzUElicHlhRG1nWW1YdmsvK3laL3Y1c0dDTWNtVGkzWVFLRGsKcStlK1o0SHZyaGUxVytUaHpwOCtoNGs9Ci0tLS0tRU5EIFBSSVZBVEUgS0VZLS0tLS0K --- -# Source: dataplane/templates/clusterresourcesync/configmap.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2019 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ apiVersion: v1 kind: ConfigMap metadata: - name: union-clusterresourcesync-config - namespace: union + name: config-logging + namespace: "union" labels: - app.kubernetes.io/name: clusterresourcesync - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm -data: - cluster_resources.yaml: | - cluster_resources: - clusterName: 'test-azure-cluster' - customData: - - production: - - projectQuotaCpu: - value: "4096" - - projectQuotaMemory: - value: 2Ti - - projectQuotaNvidiaGpu: - value: "256" - - defaultUserRoleKey: - value: 'azure.workload.identity/client-id' - - defaultUserRoleValue: - value: 'test-worker-client-id' - - staging: - - projectQuotaCpu: - value: "4096" - - projectQuotaMemory: - value: 2Ti - - projectQuotaNvidiaGpu: - value: "256" - - defaultUserRoleKey: - value: 'azure.workload.identity/client-id' - - defaultUserRoleValue: - value: 'test-worker-client-id' - - development: - - projectQuotaCpu: - value: "4096" - - projectQuotaMemory: - value: 2Ti - - projectQuotaNvidiaGpu: - value: "256" - - defaultUserRoleKey: - value: 'azure.workload.identity/client-id' - - defaultUserRoleValue: - value: 'test-worker-client-id' - refreshInterval: 5m - standaloneDeployment: true - templatePath: /etc/flyte/clusterresource/templates - clusterResourcesPrivate: - app: - isServerless: false - union: - auth: - authorizationMetadataKey: flyte-authorization - clientId: 'test-client-id' - clientSecretLocation: /etc/union/secret/client_secret - tokenRefreshWindow: 5m - type: ClientSecret - connection: - host: dns:///test.dataplane.union.ai - admin.yaml: | - admin: - clientId: 'test-client-id' - clientSecretLocation: /etc/union/secret/client_secret - endpoint: dns:///test.dataplane.union.ai - insecure: false - event: - capacity: 1000 - rate: 500 - type: admin - domain.yaml: | - domains: - - id: development - name: development - - id: staging - name: staging - - id: production - name: production - 
namespace_config.yaml: | - namespace_mapping: - template: '{{`{{ domain }}`}}' - clusters.yaml: | - clusters: - clusterConfigs: [] - labelClusterMap: {} - logger.yaml: | - logger: - formatter: - type: json - level: 4 - show-source: true + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +data: {} --- -# Source: dataplane/templates/clusterresourcesync/configmap.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2019 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ apiVersion: v1 kind: ConfigMap metadata: - name: union-clusterresource-template - namespace: union + name: config-observability + namespace: "union" labels: - app.kubernetes.io/name: clusterresourcesync + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +data: {} +--- +# Source: dataplane/charts/prometheus/templates/cm.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + labels: + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: prometheus + name: union-operator-prometheus + namespace: union data: - a_namespace.yaml: | - apiVersion: v1 - kind: Namespace - metadata: - name: {{ namespace }} - labels: - union.ai/namespace-type: flyte - spec: - finalizers: - - kubernetes - - b_default_service_account.yaml: | - apiVersion: v1 - kind: ServiceAccount - metadata: - name: default - namespace: {{ namespace }} - annotations: - {{ defaultUserRoleKey }}: {{ defaultUserRoleValue }} + allow-snippet-annotations: "false" + alerting_rules.yml: | + {} + alerts: | + {} + prometheus.yml: | + global: + evaluation_interval: 1m + scrape_interval: 1m + scrape_timeout: 10s + rule_files: + - /etc/config/recording_rules.yml + - /etc/config/alerting_rules.yml + - /etc/config/rules + - /etc/config/alerts + scrape_configs: + - job_name: prometheus + static_configs: + - targets: + - localhost:9090 + - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + job_name: kubernetes-apiservers + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - action: keep + regex: default;kubernetes;https + source_labels: + - __meta_kubernetes_namespace + - __meta_kubernetes_service_name + - __meta_kubernetes_endpoint_port_name + scheme: https + tls_config: + ca_file: 
/var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + job_name: kubernetes-nodes + kubernetes_sd_configs: + - role: node + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - replacement: kubernetes.default.svc:443 + target_label: __address__ + - regex: (.+) + replacement: /api/v1/nodes/$1/proxy/metrics + source_labels: + - __meta_kubernetes_node_name + target_label: __metrics_path__ + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + job_name: kubernetes-nodes-cadvisor + kubernetes_sd_configs: + - role: node + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - replacement: kubernetes.default.svc:443 + target_label: __address__ + - regex: (.+) + replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor + source_labels: + - __meta_kubernetes_node_name + target_label: __metrics_path__ + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + - honor_labels: true + job_name: kubernetes-service-endpoints + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scrape + - action: drop + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) 
+ source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (.+?)(?::\d+)?;(\d+) + replacement: $1:$2 + source_labels: + - __address__ + - __meta_kubernetes_service_annotation_prometheus_io_port + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_service_name + target_label: service + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: node + - honor_labels: true + job_name: kubernetes-service-endpoints-slow + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) 
+ source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (.+?)(?::\d+)?;(\d+) + replacement: $1:$2 + source_labels: + - __address__ + - __meta_kubernetes_service_annotation_prometheus_io_port + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_service_name + target_label: service + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: node + scrape_interval: 5m + scrape_timeout: 30s + - honor_labels: true + job_name: prometheus-pushgateway + kubernetes_sd_configs: + - role: service + relabel_configs: + - action: keep + regex: pushgateway + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_probe + - honor_labels: true + job_name: kubernetes-services + kubernetes_sd_configs: + - role: service + metrics_path: /probe + params: + module: + - http_2xx + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_probe + - source_labels: + - __address__ + target_label: __param_target + - replacement: blackbox + target_label: __address__ + - source_labels: + - __param_target + target_label: instance + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - source_labels: + - __meta_kubernetes_service_name + target_label: service + - honor_labels: true + job_name: kubernetes-pods + kubernetes_sd_configs: + - role: pod + relabel_configs: + - action: 
keep + regex: true + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scrape + - action: drop + regex: true + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) + replacement: '[$2]:$1' + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: replace + regex: (\d+);((([0-9]+?)(\.|$)){4}) + replacement: $2:$1 + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_pod_name + target_label: pod + - action: drop + regex: Pending|Succeeded|Failed|Completed + source_labels: + - __meta_kubernetes_pod_phase + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: node + - honor_labels: true + job_name: kubernetes-pods-slow + kubernetes_sd_configs: + - role: pod + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) 
+ source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) + replacement: '[$2]:$1' + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: replace + regex: (\d+);((([0-9]+?)(\.|$)){4}) + replacement: $2:$1 + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_pod_name + target_label: pod + - action: drop + regex: Pending|Succeeded|Failed|Completed + source_labels: + - __meta_kubernetes_pod_phase + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: node + scrape_interval: 5m + scrape_timeout: 30s + - job_name: 'kube-state-metrics' + metrics_path: /metrics + scrape_interval: 1m + honor_labels: true + kubernetes_sd_configs: + - role: endpoints + namespaces: + own_namespace: true + relabel_configs: + - source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_name] + regex: kube-state-metrics + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + regex: http + action: keep + metric_relabel_configs: + - source_labels: [__name__] + regex: 
"kube_pod_container_resource_(limits|requests)|kube_pod_status_phase|kube_node_(labels|status_allocatable|status_condition|status_capacity)|kube_namespace_labels|kube_pod_container_status_(waiting|terminated|last_terminated).*_reason|kube_daemonset_status_number_unavailable|kube_deployment_status_replicas_unavailable|kube_resourcequota|kube_pod_info|kube_node_info|kube_pod_container_status_restarts_total" + action: keep + - source_labels: [__name__, phase] + separator: ";" + regex: "kube_pod_status_phase;(Succeeded|Failed)" + action: drop + - source_labels: [node] + target_label: nodename + regex: "(.*)" + action: replace + - source_labels: [label_node_group_name] + action: replace + regex: "(.+)" + target_label: label_node_pool_name + - job_name: 'opencost' + metrics_path: /metrics + scrape_interval: 1m + honor_labels: true + kubernetes_sd_configs: + - role: endpoints + namespaces: + own_namespace: true + relabel_configs: + - source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_name] + regex: opencost + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + regex: http + action: keep + metric_relabel_configs: + - source_labels: [__name__] + regex: "kube_node_labels|kube_pod_labels|node_total_hourly_cost|node_ram_hourly_cost|node_cpu_hourly_cost|node_gpu_hourly_cost" + action: keep + - job_name: 'union-services' + metrics_path: /metrics + scrape_interval: 1m + honor_labels: true + kubernetes_sd_configs: + - role: endpoints + namespaces: + own_namespace: true + relabel_configs: + - source_labels: [__meta_kubernetes_service_label_platform_union_ai_service_group] + regex: .+ + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + regex: debug + action: keep + # Flyte propeller metrics for execution info and fast task duration. + # No-op when propeller is not deployed (no pods match the label selector). 
+ - job_name: 'flytepropeller' + kubernetes_sd_configs: + - role: pod + namespaces: + own_namespace: true + selectors: + - role: pod + label: app.kubernetes.io/name=flytepropeller + metric_relabel_configs: + - source_labels: [__name__] + regex: "flyte:propeller:all:round:execution_info|flyte:propeller:all:node:fast_task:fast_task_execution_duration" + action: keep + relabel_configs: + - source_labels: [__meta_kubernetes_pod_container_name] + regex: flytepropeller + action: keep + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: pod + # Kourier gateway (envoy) metrics for app serving. + # No-op when serving is not enabled (no pods match the label selector). + - job_name: 'serving-envoy' + kubernetes_sd_configs: + - role: pod + namespaces: + own_namespace: true + selectors: + - role: pod + label: app=3scale-kourier-gateway + metrics_path: /stats/prometheus + metric_relabel_configs: + - source_labels: [__name__] + regex: "envoy_cluster_upstream_rq_xx|envoy_cluster_upstream_rq_time_bucket" + action: keep + relabel_configs: + - source_labels: [__meta_kubernetes_pod_container_port_name] + regex: metrics + action: keep + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: pod + # DCGM GPU metrics. + # No-op when dcgm-exporter is not deployed (no pods match the label selector). 
+ - job_name: 'gpu-metrics' + kubernetes_sd_configs: + - role: pod + namespaces: + names: + - kube-system + selectors: + - role: pod + label: app.kubernetes.io/name=dcgm-exporter - c_project_resource_quota.yaml: | - apiVersion: v1 - kind: ResourceQuota - metadata: - name: project-quota - namespace: {{ namespace }} - spec: - hard: - limits.cpu: {{ projectQuotaCpu }} - limits.memory: {{ projectQuotaMemory }} - requests.nvidia.com/gpu: {{ projectQuotaNvidiaGpu }} + recording_rules.yml: | + {} + rules: | + {} --- # Source: dataplane/templates/fluent-bit/configmap.yaml apiVersion: v1 @@ -368,6 +720,36 @@ data: Skip_Long_Lines On Refresh_Interval 10 --- +# Source: dataplane/templates/imagebuilder/build-image-configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: build-image-config + labels: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +data: + storage.yaml: | + storage: + container: "test-metadata-container" + container: 'test-metadata-container' + stow: + config: + account: 'teststorageaccount' + kind: azure + type: stow + enable-multicontainer: true + limits: + maxDownloadMBs: 1024 + cache: + max_size_mbs: 0 + target_gc_percent: 70 + buildkit-uri: "tcp://union-operator-buildkit.union.svc.cluster.local:1234" + default-repository: "union-dataplane.azurecr.io" + authentication-type: "azure" +--- # Source: dataplane/templates/imagebuilder/configmap.yaml apiVersion: v1 kind: ConfigMap @@ -2524,6 +2906,18 @@ data: "version": 1 } --- +# Source: dataplane/templates/monitoring/prometheusrule.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: union-recording-rules + namespace: union + labels: + release: release-name +data: + recording_rules.yml: | + groups: [] +--- # Source: dataplane/templates/nodeexecutor/configmap.yaml apiVersion: v1 kind: ConfigMap @@ -2578,8 +2972,13 @@ data: templateUris: - https://scale.neptune.ai/{{ 
.taskConfig.project }}/-/run/?customId={{ .taskConfig.id }} - kubernetes-enabled: true + kubernetes-enabled: false enabled_plugins.yaml: | + plugins: + connector-service: + defaultConnector: + defaultServiceConfig: '{"loadBalancingConfig": [{"round_robin":{}}]}' + endpoint: k8s:///flyteconnector.union:8000 tasks: task-plugins: default-for-task-types: @@ -2612,6 +3011,8 @@ data: cpu: 4096 gpu: 256 memory: 2Ti + namespace_mapping: + template: union union: connection: host: dns:///test.dataplane.union.ai @@ -2632,7 +3033,7 @@ data: cache-endpoint: dns:///test.dataplane.union.ai endpoint: dns:///test.dataplane.union.ai insecure: false - type: fallback + type: cacheservicev2 use-admin-auth: true logger: formatter: @@ -2654,6 +3055,7 @@ data: secure: false useAuth: false propeller: + limit-namespace: union node-config: disable-input-file-writes: true plugins: @@ -2661,7 +3063,7 @@ data: additional-worker-args: - --last-ack-grace-period-seconds - "120" - callback-uri: http://unionai-dataplane-executor.union.svc.cluster.local:15605 + callback-uri: http://union-operator-executor.union.svc.cluster.local:15605 grace-period-status-not-found: 2m ioutils: remoteFileOutputPaths: @@ -2674,6 +3076,7 @@ data: default-labels: - azure.workload.identity/use: "true" default-memory: 100Mi + default-pod-template-name: task-template interruptible-node-selector-requirement: key: kubernetes.azure.com/scalesetpriority operator: In @@ -2726,6 +3129,7 @@ data: default-labels: - azure.workload.identity/use: "true" default-memory: 100Mi + default-pod-template-name: task-template interruptible-node-selector-requirement: key: kubernetes.azure.com/scalesetpriority operator: In @@ -2758,11 +3162,14 @@ data: operator: enabled: true enableTunnelService: true + # enableDepot: false tunnel: - enableDirectToAppIngress: false + enableDirectToAppIngress: true deploymentToRestart: union-operator-proxy + limitNamespace: union + disableClusterPermissions: true apps: - enabled: 'false' + enabled: 'true' 
syncClusterConfig: enabled: false clusterId: @@ -2786,14 +3193,12 @@ data: kind: azure type: stow collectUsages: - enabled: true + enabled: false billing: model: Legacy dependenciesHeartbeat: - prometheus: - endpoint: 'http://union-operator-prometheus:80/-/healthy' - propeller: - endpoint: 'http://flytepropeller:10254' + executor: + endpoint: 'http://union-operator-executor:10254' proxy: endpoint: 'http://union-operator-proxy:10254' org: @@ -2801,6 +3206,8 @@ data: secretsWatcher: dryRun: true enabled: false + org: + namespaceTemplate: union imageBuilder: enabled: true executionNamespaceLabels: @@ -2808,9 +3215,6 @@ data: referenceConfigmapName: union-operator targetConfigMapName: "build-image-config" proxy: - imageBuilderConfig: - authenticationType: 'noop' - defaultRepository: '' persistedLogs: azureLogAnalytics: logAnalyticsWorkspaceResourceIdTemplate: /subscriptions/test-subscription-id/resourceGroups/test-resource-group/providers/Microsoft.OperationalInsights/workspaces/union-test-org @@ -2860,989 +3264,149 @@ data: kind: azure type: stow image-builder.buildkit-uri: "tcp://union-operator-buildkit.union.svc.cluster.local:1234" - image-builder.default-repository: "" - image-builder.authentication-type: "noop" + image-builder.default-repository: "union-dataplane.azurecr.io" + image-builder.authentication-type: "azure" --- -# Source: dataplane/templates/prometheus/configmap.yaml +# Source: dataplane/templates/serving/bootstrap-configmap.yaml +# We need to copy the default configmap here since knative-operator does not automatically update the +# address of the `net-kourier-controller` endpoint to use the release namespace. It assumes that the +# resources are being installed into the `knative-serving` namespace instead. 
apiVersion: v1 kind: ConfigMap metadata: - name: union-operator-prometheus + name: union-operator-serving-envoy-bootstrap namespace: union - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus data: - prometheus.yml: | - global: - scrape_interval: 15s - evaluation_interval: 15s - alerting: - alertmanagers: - - static_configs: - - targets: - rule_files: - - rules.yml - scrape_configs: - # Self-monitoring - - job_name: prometheus - metrics_path: /prometheus/metrics - static_configs: - - targets: ['localhost:9090'] - metric_relabel_configs: - - source_labels: [__name__] - regex: prometheus_tsdb_head_chunks_storage_size_bytes|prometheus_tsdb_head_series|prometheus_tsdb_head_series_created_total|prometheus_tsdb_head_series_removed_total|prometheus_tsdb_head_samples_appended_total|prometheus_tsdb_storage_blocks_bytes|prometheus_tsdb_retention_limit_bytes|prometheus_tsdb_wal_(.*)|.+_(limit|failed|missed|failures)_total - action: keep + envoy-bootstrap.yaml: | + dynamic_resources: + ads_config: + transport_api_version: V3 + api_type: GRPC + rate_limit_settings: {} + grpc_services: + - envoy_grpc: {cluster_name: xds_cluster} + cds_config: + resource_api_version: V3 + ads: {} + lds_config: + resource_api_version: V3 + ads: {} + node: + cluster: kourier-knative + id: 3scale-kourier-gateway + static_resources: + listeners: + - name: stats_listener + address: + socket_address: + address: 0.0.0.0 + port_value: 9000 + filter_chains: + - filters: + - name: envoy.filters.network.http_connection_manager + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager + stat_prefix: stats_server + http_filters: + - name: envoy.filters.http.router + typed_config: + "@type": 
type.googleapis.com/envoy.extensions.filters.http.router.v3.Router + route_config: + virtual_hosts: + - name: admin_interface + domains: + - "*" + routes: + - match: + safe_regex: + regex: '/(certs|stats(/prometheus)?|server_info|clusters|listeners|ready)?' + headers: + - name: ':method' + string_match: + exact: GET + route: + cluster: service_stats + - match: + safe_regex: + regex: '/drain_listeners' + headers: + - name: ':method' + string_match: + exact: POST + route: + cluster: service_stats + clusters: + - name: service_stats + connect_timeout: 0.250s + type: static + load_assignment: + cluster_name: service_stats + endpoints: + lb_endpoints: + endpoint: + address: + socket_address: + address: 127.0.0.1 + port_value: 9901 + - name: xds_cluster + # This keepalive is recommended by envoy docs. + # https://www.envoyproxy.io/docs/envoy/latest/api-docs/xds_protocol + typed_extension_protocol_options: + envoy.extensions.upstreams.http.v3.HttpProtocolOptions: + "@type": type.googleapis.com/envoy.extensions.upstreams.http.v3.HttpProtocolOptions + explicit_http_config: + http2_protocol_options: + connection_keepalive: + interval: 30s + timeout: 5s + connect_timeout: 1s + load_assignment: + cluster_name: xds_cluster + endpoints: + lb_endpoints: + endpoint: + address: + socket_address: + address: "net-kourier-controller" + port_value: 18000 + type: STRICT_DNS + admin: + access_log: + - name: envoy.access_loggers.stdout + typed_config: + "@type": type.googleapis.com/envoy.extensions.access_loggers.stream.v3.StdoutAccessLog + address: + socket_address: + address: 127.0.0.1 + port_value: 9901 + stats_config: + stats_tags: + - tag_name: name + regex: '^.*?\.u/.*?n=(.*?)/.*?u\..*$' + - tag_name: domain + regex: '^.*?\.u/.*?d=(.*?)/.*?u\..*$' + - tag_name: org + regex: '^.*?\.u/.*?o=(.*?)/.*?u\..*$' + - tag_name: project + regex: '^.*?\.u/.*?p=(.*?)/.*?u\..*$' + - tag_name: target + regex: '^.*?\.u/.*?t=(.*?)/.*?u\..*$' + - tag_name: target_namespace + regex: 
'^.*?\.u/.*?tns=(.*?)/.*?u\..*$' +--- +# Source: dataplane/templates/webhook/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: union-pod-webhook-config + namespace: union +data: + core.yaml: | - # Kube state metrics for pod/node resource tracking and cost calculations - - job_name: kube-state-metrics - static_configs: - - targets: ['release-name-kube-state-metrics:8080'] - metric_relabel_configs: - - separator: ; - source_labels: [__name__] - regex: kube_pod_container_resource_(limits|requests)|kube_pod_status_phase|kube_node_(labels|status_allocatable|status_condition|status_capacity)|kube_namespace_labels|kube_pod_container_status_(waiting|terminated|last_terminated).*_reason|kube_daemonset_status_number_unavailable|kube_deployment_status_replicas_unavailable|kube_resourcequota|kube_pod_info|kube_node_info|kube_pod_container_status_restarts_total - action: keep - - separator: ; - source_labels: [__name__, phase] - regex: kube_pod_status_phase;(Succeeded|Failed) - action: drop - - source_labels: [node] - target_label: nodename - regex: '(.*)' - action: replace - - source_labels: [label_node_group_name] - action: replace - regex: (.+) - target_label: label_node_pool_name - - # cAdvisor container metrics for CPU and memory tracking - - job_name: kubernetes-cadvisor - metrics_path: /metrics - scheme: https - kubernetes_sd_configs: - - role: node - namespaces: - names: [] - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - insecure_skip_verify: false - metric_relabel_configs: - - separator: ; - source_labels: [__name__] - regex: container_cpu_usage_seconds_total|container_memory_working_set_bytes - action: keep - relabel_configs: - - separator: ; - regex: __meta_kubernetes_node_label_(.+) - replacement: $1 - action: labelmap - - separator: ; - regex: (.*) - target_label: __address__ - replacement: kubernetes.default.svc:443 - action: replace - - 
source_labels: [__meta_kubernetes_node_name] - separator: ; - regex: (.+) - target_label: __metrics_path__ - replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor - action: replace - - # Flyte propeller metrics for execution info and fast task duration - - job_name: flytepropeller - kubernetes_sd_configs: - - role: pod - namespaces: - names: - - union - selectors: - - role: pod - label: app.kubernetes.io/name=flytepropeller - metric_relabel_configs: - - source_labels: [__name__] - regex: "flyte:propeller:all:round:execution_info|flyte:propeller:all:node:fast_task:fast_task_execution_duration" - action: keep - relabel_configs: - - source_labels: [__meta_kubernetes_pod_container_name] - regex: flytepropeller - action: keep - - source_labels: [__meta_kubernetes_namespace] - action: replace - target_label: namespace - - source_labels: [__meta_kubernetes_pod_name] - action: replace - target_label: pod - # OpenCost metrics for cost tracking - - job_name: opencost - static_configs: - - targets: ['release-name-opencost:9003'] - metric_relabel_configs: - - source_labels: [__name__] - regex: "kube_node_labels|kube_pod_labels|node_total_hourly_cost|node_ram_hourly_cost|node_cpu_hourly_cost|node_gpu_hourly_cost" - action: keep - rules.yml: | - groups: - - name: cost_calculations_15s - interval: 15s - rules: - - record: pod_gpu_allocation - expr: | - sum by (namespace, pod) (DCGM_FI_DEV_GPU_UTIL >= bool 0) * on (namespace, pod) group_left() (max by (namespace, pod) (kube_pod_status_phase{phase=~"Running|Pending"} == 1)) - - record: execution_info # A join metric to look up execution-level info. Used to disambiguate workflow/task executions, apps, and workspaces. 
- expr: | - max by (label_domain, label_project, label_entity_name, label_execution_id, label_entity_id)( - label_replace( - label_replace( - label_replace( - label_replace( - label_replace( - flyte:propeller:all:round:execution_info{domain!="", project!="", workflow_name!="", execution_id!=""}, # filter for workflow/task executions - "label_entity_id", "$1", "execution_id", "(.*)" # join key - ), "label_entity_name", "$1", "workflow_name", "(.*)" # set label_entity_name to the workflow/task name from the workflow_execution_id - ), - "label_execution_id", "$1", "execution_id", "(.*)" - ), - "label_project", "$1", "project", "(.*)" # project - ), - "label_domain", "$1", "domain", "(.*)" # domain - ) - ) - - record: app_info # A join metric to look up app-level info. Used to disambiguate workflow/task executions, apps, and workspaces. - expr: | - max by (label_domain, label_project, label_app_name, label_app_version, label_entity_id)( - label_replace( - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps - "label_app_name", "$1", "label_serving_unionai_dev_app_name", "(.*)" # rename to cleanup - ), - "label_app_version", "$1", "label_serving_knative_dev_revision", "(.*)" # the app_version is equivalent to an execution_id for workflows (lowest level of granularity) - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # join key - ) - ) - - record: workspace_info # A join metric to look up workspace info. Used to disambiguate workflow/task executions, apps, and workspaces. 
- expr: | - max by (label_domain, label_project, label_workspace_name, label_entity_id)( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # filter for workspaces - "label_entity_id", "$1", "label_node_id", "(.*)" # join key - ), "label_workspace_name", "$1", "label_node_id", "(.*)" # set label_workspace_name to the workspace name from the kube_pod_labels - ) - ) - - record: fast_task_execution_duration - expr: | - max by (label_domain, label_project, label_execution_id, label_entity_id, namespace, pod)( - label_replace( - label_replace( - label_replace( - label_replace( - label_replace( - label_replace( - flyte:propeller:all:node:fast_task:fast_task_execution_duration{domain!="", project!="", execution_id!="", exported_namespace!="", exported_pod!=""}, - "label_entity_id", "$1", "execution_id", "(.*)" # join key - ), - "label_execution_id", "$1", "execution_id", "(.*)" - ), - "label_project", "$1", "project", "(.*)" # project - ), - "label_domain", "$1", "domain", "(.*)" # domain - ), - "namespace", "$1", "exported_namespace", "(.*)" - ), - "pod", "$1", "exported_pod", "(.*)" - ) - ) - - record: fast_task_execution_duration_rate - expr: | - irate(fast_task_execution_duration[1m]) > 0 or absent(irate(fast_task_execution_duration[1m])) * fast_task_execution_duration - - record: entity_id:mem_usage_bytes_total_per_node:sum # Allocated memory (max(requested, consumed)) aggregated per node and entity, where entity is either a task/workflow execution or an app. 
- expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( # aggregate up to entity - # First, calculate the allocated memory for each pod - max by (namespace, pod) ( # this is the case where consumed (the memory working set) exceeds requested memory - ( - sum by (namespace, pod) ( - container_memory_working_set_bytes{namespace!="",pod!="",image!=""} - ) - > sum by (namespace, pod) ( - kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="memory"} - ) - ) - or sum by (namespace, pod) ( # this is the case where memory requests are <= consumed memory - kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="memory"} # needed to add node!="" to dedupe - ) - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). Note that this is repetitive but we do not want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - 
label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), - "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="",label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - # Now join in node identifiers which are used for subsequent overhead calculations - * on (namespace, pod) group_left(node) ( - max by (namespace, pod, node) (kube_pod_info{node!=""}) # needed to add node!="" to dedupe - ) - ) - - record: entity_id:cpu_usage_per_node:sum # Allocated cpu (max(requested, consumed)) aggregated per node and entity, where entity is either a task/workflow execution or an app. 
- expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( - # First, calculate the allocated cpu for each pod - max by (namespace, pod) ( # this is the case where consumed (the cpu usage seconds total) exceeds requested cpu - ( - sum by (namespace, pod) ( - irate(container_cpu_usage_seconds_total{namespace!="",pod!="",image!=""}[5m]) - ) - > sum by (namespace, pod) ( - kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="cpu"} - ) - ) - or sum by (namespace, pod) ( # this is the case where cpu requests are <= consumed cpu - kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="cpu"} - ) - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - 
label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), - "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - # Now join in node identifiers which are used for subsequent overhead calculations - * on (namespace, pod) group_left(node) ( - max by (namespace, pod, node) (kube_pod_info{node!=""}) # needed to add node!="" to dedupe - ) - ) - - record: entity_id:gpu_usage_per_node:sum # Allocated gpu aggregated per node and entity, where entity is either a task/workflow execution or an app. 
- expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( - # First, grab the allocated gpu for each pod (which is always either 1 or zero, since k8s can't split gpus the way it can with cpu/memory) - max by (namespace, pod) ( - pod_gpu_allocation - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, 
label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), - "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - # Now join in node identifiers which are used for subsequent overhead calculations - * on (namespace, pod) group_left(node) ( - max by (namespace, pod, node) (kube_pod_info{node!=""}) # needed to add node!="" to dedupe - ) - ) - - record: entity_id:used_mem_bytes:sum # the sum of used memory across all containers in an entity (numerator for aggregate utilization calculations) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # aggregate up to entity - # First, calculate the used memory for each pod - sum by (namespace, pod) ( - container_memory_working_set_bytes{namespace!="",pod!="",image!=""} - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but we do not want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - ) - - record: entity_id:allocated_mem_bytes:sum # the sum of allocated memory across all containers in an entity (denominator for aggregate utilization calculations) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # aggregate up to entity (remove node) - entity_id:mem_usage_bytes_total_per_node:sum - ) - - record: entity_id:used_cpu:sum # the sum of used cpu across all containers in an entity (numerator for aggregate utilization calculations) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( - sum by (namespace, pod) ( - irate(container_cpu_usage_seconds_total{namespace!="",pod!="",image!=""}[5m]) - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - ) - - record: entity_id:allocated_cpu:sum # the sum of allocated cpu across all containers in an entity (denominator for aggregate utilization calculations) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # aggregate up to entity (remove node) - entity_id:cpu_usage_per_node:sum - ) - - record: entity_id:sm_occupancy:avg # the simple average of SM occupancy (a good generic measure of GPU utilization) per entity - expr: | - avg by (label_entity_type, label_domain, label_project, label_entity_id) ( - # First, grab the SM occupancy for each pod - max by (namespace, pod) ( - DCGM_FI_PROF_SM_OCCUPANCY # SM occupancy is a good proxy for actual GPU usage - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - ) - - record: entity_id:gpu_count:sum # the count of running gpu pods per entity (need this to weight the gpu utilization when aggregating upwards - i.e. project-level) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( - # First, grab the allocated gpu for each pod (which is always either 1 or zero, since k8s can't split gpus the way it can with cpu/memory) - max by (namespace, pod) ( - pod_gpu_allocation - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - ) - - record: entity_id:weighted_sm_occupancy:sum # product of SM occupancy and allocated GPU count (something like "used memory", numerator of weighted calcs) - expr: | - entity_id:sm_occupancy:avg - * on (label_domain, label_project, label_entity_type, label_entity_id) entity_id:gpu_count:sum - - record: entity_id:allocated_mem_cost:sum # Allocated cost of memory for each workflow/task execution and app. - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, type) ( - entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB - * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type - ) - - record: entity_id:allocated_cpu_cost:sum # Allocated cost of cpu for each workflow/task execution and app. 
- expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, type)( - entity_id:cpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - - record: entity_id:allocated_gpu_cost:sum # Allocated cost of gpu for each workflow/task execution and app. - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, type)( - entity_id:gpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - - record: entity_id:allocated_cost:sum # Allocated cost of memory, cpu, and gpu for each workflow/task execution and app. - expr: | - label_replace( - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # for the sum to work, the labels need to be different on each "or" element (type label) - entity_id:allocated_mem_cost:sum - or - entity_id:allocated_cpu_cost:sum - or - entity_id:allocated_gpu_cost:sum - ), - "type", "allocated", "", "" # add type info - ) - - record: entity_id:overhead_cost:sum # The amount of overhead costs (node costs that we can't allocate with container resources) to allocate to each entity (workflow/task execution or app) - expr: | - label_replace( - sum by (label_entity_type, label_entity_id, label_domain, label_project)( # Aggregate the per-node metrics up to workflow/task execution or app (label_entity_id) - # Start with each execution's and app's allocated cost per node - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( # for the sum to work, the labels need to be different on each "or" element (type label) - entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB - * on (node) group_left(type) label_replace(avg by (node) 
(node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:cpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:gpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - # Then divide out the total allocated cost per node to get the proportion of allocated cost associated with each entity - / on (node) group_left()( - sum by (node) ( # for the sum to work, the labels need to be different on each "or" element (type label) - entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB - * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:cpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:gpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - > 0 # need to avoid dividing by zero, or gaps in the data can cause NaNs to proliferate, borking all charts - ) - # Then multiply by the overhead cost per node - * on (node) group_left() ( - # To calculate overhead, start with the true cost of running each node - avg by (node)(kube_node_labels{label_flyte_org_node_role="worker"}) # only look at worker nodes - * on (node) max by (node) ( - node_total_hourly_cost{instance_type!=""} # sometimes, the instance_type can be null, causing an 
unlabeled label to show up in the Compute Costs dashboard charts - ) * (15 / 3600) # convert hourly cost to 15-secondly cost - # Then subtract out the total allocated cost on each node - - on (node) group_left()( - sum by (node) ( # for the sum to work, the labels need to be different on each "or" element (type label) - entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB - * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:cpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:gpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - ) - ) - ), - "type", "overhead", "", "" # add type info - ) - - record: entity_id:total_cost:sum # Total cost of each entity (workflow/task execution or app), including allocated (from container resources) and overhead (proportion of unallocated node costs) - expr: | - label_replace( - sum by (label_domain, label_project, label_entity_id, label_entity_type) ( - entity_id:allocated_cost:sum - or - entity_id:overhead_cost:sum - ), - "type", "total", "", "" # add type info - ) - - record: node:total_cost:sum # Total cost of all nodes - expr: | - sum ( - avg by (node)(kube_node_labels{label_flyte_org_node_role="worker", label_node_kubernetes_io_instance_type!=""}) # only look at worker nodes - * on (node) group_left() node_total_hourly_cost{instance_type!=""} * (15 / 3600) # convert hourly cost to 15-secondly cost - ) - - record: node_type:total_cost:sum # Total cost of nodes grouped by node type - expr: | - sum by (node_type)( - avg by 
(node)(kube_node_labels{label_flyte_org_node_role="worker", label_node_kubernetes_io_instance_type!=""}) # only look at worker nodes - * on (node) group_left(node_type) label_replace(node_total_hourly_cost{instance_type!=""}, "node_type", "$1", "instance_type", "(.*)") * (15 / 3600) # convert hourly cost to 15-secondly cost and rename label - ) - - record: node_type:uptime_hours:sum # Total uptime of nodes grouped by node type - expr: | - sum by (node_type)( - avg by (node, node_type)( # dedupe - label_replace(kube_node_labels{label_flyte_org_node_role="worker", label_node_kubernetes_io_instance_type!=""}, "node_type", "$1", "label_node_kubernetes_io_instance_type", "(.*)") # relabel - ) - ) * (15 / 3600) # convert to number of hours per 15-second observation # Aggregate the above into visible metrics - - name: cost_rollup_15m - interval: 15m - rules: - - record: execution_info15m - expr: | - max_over_time(execution_info[15m:15s]) - - record: app_info15m - expr: | - max_over_time(app_info[15m:15s]) - - record: workspace_info15m - expr: | - max_over_time(workspace_info[15m:15s]) - - record: entity_id:allocated_mem_bytes:sum15m - expr: | - sum_over_time(entity_id:allocated_mem_bytes:sum[15m:15s]) - - record: entity_id:used_mem_bytes:sum15m - expr: | - sum_over_time(entity_id:used_mem_bytes:sum[15m:15s]) - - record: entity_id:allocated_cpu:sum15m - expr: | - sum_over_time(entity_id:allocated_cpu:sum[15m:15s]) - - record: entity_id:used_cpu:sum15m - expr: | - sum_over_time(entity_id:used_cpu:sum[15m:15s]) - - record: entity_id:weighted_sm_occupancy:sum15m - expr: | - sum_over_time(entity_id:weighted_sm_occupancy:sum[15m:15s]) - - record: entity_id:gpu_count:sum15m - expr: | - sum_over_time(entity_id:gpu_count:sum[15m:15s]) - - record: entity_id:allocated_mem_cost:sum15m - expr: | - sum_over_time(entity_id:allocated_mem_cost:sum[15m:15s]) - - record: entity_id:allocated_cpu_cost:sum15m - expr: | - sum_over_time(entity_id:allocated_cpu_cost:sum[15m:15s]) - - record: 
entity_id:allocated_gpu_cost:sum15m - expr: | - sum_over_time(entity_id:allocated_gpu_cost:sum[15m:15s]) - - record: entity_id:allocated_cost:sum15m - expr: | - sum_over_time(entity_id:allocated_cost:sum[15m:15s]) - - record: entity_id:overhead_cost:sum15m - expr: | - sum_over_time(entity_id:overhead_cost:sum[15m:15s]) - - record: entity_id:total_cost:sum15m - expr: | - sum_over_time(entity_id:total_cost:sum[15m:15s]) - - record: node:total_cost:sum15m - expr: | - sum_over_time(node:total_cost:sum[15m:15s]) - - record: node_type:total_cost:sum15m - expr: | - sum_over_time(node_type:total_cost:sum[15m:15s]) - - record: node_type:uptime_hours:sum15m - expr: | - sum_over_time(node_type:uptime_hours:sum[15m:15s]) ---- -# Source: dataplane/templates/propeller/configmap.yaml -apiVersion: v1 -kind: ConfigMap -metadata: - name: flyte-propeller-config - namespace: union -data: - admin.yaml: | - admin: - clientId: 'test-client-id' - clientSecretLocation: /etc/union/secret/client_secret - endpoint: dns:///test.dataplane.union.ai - insecure: false - event: - capacity: 1000 - rate: 500 - type: admin - catalog.yaml: | - catalog-cache: - cache-endpoint: dns:///test.dataplane.union.ai - endpoint: dns:///test.dataplane.union.ai - insecure: false - type: fallback - use-admin-auth: true - copilot.yaml: | - plugins: - k8s: - co-pilot: - image: 'cr.flyte.org/flyteorg/flytecopilot:v1.14.1' - name: flyte-copilot- - start-timeout: 30s - core.yaml: | - propeller: - downstream-eval-duration: 30s - enable-admin-launcher: true - leader-election: - enabled: true - lease-duration: 15s - lock-config-map: - name: propeller-leader - namespace: 'union' - renew-deadline: 10s - retry-period: 2s - limit-namespace: all - literal-offloading-config: - enabled: true - max-workflow-retries: 30 - metadata-prefix: metadata/propeller - metrics-prefix: flyte - prof-port: 10254 - queue: - batch-size: -1 - batching-interval: 2s - queue: - base-delay: 5s - capacity: 1000 - max-delay: 120s - rate: 100 - type: 
maxof - sub-queue: - capacity: 100 - rate: 10 - type: bucket - type: batch - rawoutput-prefix: 'abfs://test-metadata-container@teststorageaccount.dfs.core.windows.net' - workers: 4 - workflow-reeval-duration: 30s webhook: certDir: /etc/webhook/certs + disableCreateMutatingWebhookConfig: true embeddedSecretManagerConfig: azureConfig: vaultURI: 'test-azure-key-vault-uri' @@ -3853,120 +3417,13 @@ data: namespace: 'union' type: Azure listenPort: '9443' + localCert: true secretManagerTypes: - Azure - Embedded - serviceName: flyte-pod-webhook + secretName: union-pod-webhook + serviceName: union-pod-webhook servicePort: '443' - enabled_plugins.yaml: | - plugins: - connector-service: - defaultConnector: - defaultServiceConfig: '{"loadBalancingConfig": [{"round_robin":{}}]}' - endpoint: k8s:///flyteconnector.union:8000 - tasks: - task-plugins: - default-for-task-types: - actor: fast-task - container: container - container_array: k8s-array - sidecar: sidecar - enabled-plugins: - - container - - sidecar - - k8s-array - - echo - - fast-task - - connector-service - k8s.yaml: | - plugins: - k8s: - default-cpus: 100m - default-env-vars: - - AZURE_STORAGE_ACCOUNT_NAME: teststorageaccount - default-labels: - - azure.workload.identity/use: "true" - default-memory: 100Mi - interruptible-node-selector-requirement: - key: kubernetes.azure.com/scalesetpriority - operator: In - values: - - spot - interruptible-tolerations: - - effect: NoSchedule - key: kubernetes.azure.com/scalesetpriority - operator: Equal - value: spot - non-interruptible-node-selector-requirement: - key: kubernetes.azure.com/scalesetpriority - operator: DoesNotExist - logger.yaml: | - logger: - formatter: - type: json - level: 4 - show-source: true - namespace_config.yaml: | - namespace_mapping: - template: '{{ domain }}' - resource_manager.yaml: | - propeller: - resourcemanager: - type: noop - task_logs.yaml: | - plugins: - fasttask: - logs: - azure-log-templates: - - displayName: Azure Logs - templateUris: - - 
https://portal.azure.com#@test-tenant-id/blade/Microsoft_OperationsManagementSuite_Workspace/Logs.ReactView/resourceId/%2Fsubscriptions%2Ftest-subscription-id%2FresourceGroups%2Ftest-resource-group/source/LogsBlade.AnalyticsShareLinkToQuery/q/ - cloudwatch-enabled: false - kubernetes-enabled: false - stackdriver-enabled: false - k8s-array: - logs: - config: - azure-log-templates: - - displayName: Azure Logs - templateUris: - - https://portal.azure.com#@test-tenant-id/blade/Microsoft_OperationsManagementSuite_Workspace/Logs.ReactView/resourceId/%%2Fsubscriptions%%2Ftest-subscription-id%%2FresourceGroups%%2Ftest-resource-group/source/LogsBlade.AnalyticsShareLinkToQuery/q/ - cloudwatch-enabled: false - kubernetes-enabled: false - stackdriver-enabled: false - logs: - azure-log-templates: - - displayName: Azure Logs - templateUris: - - https://portal.azure.com#@test-tenant-id/blade/Microsoft_OperationsManagementSuite_Workspace/Logs.ReactView/resourceId/%2Fsubscriptions%2Ftest-subscription-id%2FresourceGroups%2Ftest-resource-group/source/LogsBlade.AnalyticsShareLinkToQuery/q/ - cloudwatch-enabled: false - dynamic-log-links: - - vscode: - displayName: VS Code Debugger - templateUris: - - /dataplane/pod/v1/generated_name/task/{{.executionProject}}/{{.executionDomain}}/{{.executionName}}/{{.nodeID}}/{{.taskRetryAttempt}}/{{.taskProject}}/{{.taskDomain}}/{{.taskID}}/{{.taskVersion}}/ - kubernetes-enabled: false - stackdriver-enabled: false - templates: - - displayName: Task Logs - scheme: TaskExecution - templateUris: - - /console/projects/{{.executionProject}}/domains/{{.executionDomain}}/executions/{{.executionName}}/nodeId/{{.nodeID}}/taskId/{{.taskID}}/attempt/{{.taskRetryAttempt}}/view/logs?duration=all&fromExecutionNav=true - storage.yaml: | - storage: - container: "test-metadata-container" - container: 'test-metadata-container' - stow: - config: - account: 'teststorageaccount' - kind: azure - type: stow - enable-multicontainer: true - limits: - maxDownloadMBs: 1024 - 
cache: - max_size_mbs: 0 - target_gc_percent: 70 --- # Source: dataplane/charts/fluentbit/templates/clusterrole.yaml apiVersion: rbac.authorization.k8s.io/v1 @@ -3990,275 +3447,1143 @@ rules: - list - watch --- -# Source: dataplane/charts/kube-state-metrics/templates/role.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: - labels: - helm.sh/chart: kube-state-metrics-5.30.1 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: kube-state-metrics - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" - name: release-name-kube-state-metrics -rules: - -- apiGroups: ["certificates.k8s.io"] - resources: - - certificatesigningrequests - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - configmaps - verbs: ["list", "watch"] - -- apiGroups: ["batch"] - resources: - - cronjobs - verbs: ["list", "watch"] - -- apiGroups: ["extensions", "apps"] - resources: - - daemonsets - verbs: ["list", "watch"] - -- apiGroups: ["extensions", "apps"] - resources: - - deployments - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - endpoints - verbs: ["list", "watch"] - -- apiGroups: ["autoscaling"] - resources: - - horizontalpodautoscalers - verbs: ["list", 
"watch"] - -- apiGroups: ["extensions", "networking.k8s.io"] - resources: - - ingresses - verbs: ["list", "watch"] - -- apiGroups: ["batch"] - resources: - - jobs - verbs: ["list", "watch"] - -- apiGroups: ["coordination.k8s.io"] - resources: - - leases - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - limitranges - verbs: ["list", "watch"] - -- apiGroups: ["admissionregistration.k8s.io"] - resources: - - mutatingwebhookconfigurations - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - namespaces - verbs: ["list", "watch"] - -- apiGroups: ["networking.k8s.io"] - resources: - - networkpolicies - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - nodes - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - persistentvolumeclaims - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - persistentvolumes - verbs: ["list", "watch"] - -- apiGroups: ["policy"] - resources: - - poddisruptionbudgets - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - pods - verbs: ["list", "watch"] - -- apiGroups: ["extensions", "apps"] - resources: - - replicasets - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - replicationcontrollers - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - resourcequotas - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - secrets - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - services - verbs: ["list", "watch"] - -- apiGroups: ["apps"] - resources: - - statefulsets - verbs: ["list", "watch"] + name: knative-serving-operator-aggregated + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +aggregationRule: + clusterRoleSelectors: + # This (along with escalate below) allows the Operator to pick up any + # roles that are provided to the admin of the cluster by knative serving + # automatically. 
+ - matchExpressions: + - {key: serving.knative.dev/release, operator: Exists} +rules: [] # Rules are automatically filled in by the controller manager. +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: knative-serving-operator-aggregated-stable + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +aggregationRule: + clusterRoleSelectors: + # This (along with escalate below) allows the Operator to pick up any + # roles that are provided to the admin of the cluster by knative serving + # automatically. + - matchExpressions: + - {key: app.kubernetes.io/name, operator: In, values: ["knative-serving"]} +rules: [] # Rules are automatically filled in by the controller manager. +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: knative-eventing-operator-aggregated + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +aggregationRule: + clusterRoleSelectors: + # This (along with escalate below) allows the Operator to pick up any + # roles that are provided to the admin of the cluster by knative eventing + # automatically. + - matchExpressions: + - {key: eventing.knative.dev/release, operator: Exists} +rules: [] # Rules are automatically filled in by the controller manager. +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: knative-eventing-operator-aggregated-stable + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +aggregationRule: + clusterRoleSelectors: + # This (along with escalate below) allows the Operator to pick up any + # roles that are provided to the admin of the cluster by knative eventing + # automatically. 
+ - matchExpressions: + - {key: app.kubernetes.io/name, operator: In, values: ["knative-eventing"]} +rules: [] # Rules are automatically filled in by the controller manager. +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. -- apiGroups: ["storage.k8s.io"] - resources: - - storageclasses - verbs: ["list", "watch"] +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: knative-serving-operator + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +rules: + - apiGroups: + - operator.knative.dev + resources: + - '*' + verbs: + - '*' + # Bootstrapping permissions. + # Roles that are explicitly bound buch which are specified by this Operator + # MUST be specified here with 'get' and 'bind'. 
+ - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterroles + resourceNames: + - system:auth-delegator + verbs: + - bind + - get + - apiGroups: + - rbac.authorization.k8s.io + resources: + - roles + resourceNames: + - extension-apiserver-authentication-reader + verbs: + - bind + - get + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterroles + - roles + verbs: + - create + - delete + # Escalate is necessary in order to create a role using cluster role aggregation, + # and to allow the Operator to bootstrap itself into the necessary set of + # permissions, even as those continue to evolve upstream. + - escalate + - get + - list + - update + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterrolebindings + - rolebindings + verbs: + - create + - delete + - list + - get + - update + # Permissions required for Knative controller + # infra. + - apiGroups: + - apiregistration.k8s.io + resources: + - apiservices + verbs: + - update + - apiGroups: + - "" + resources: + - services + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - caching.internal.knative.dev + resources: + - images + verbs: + - '*' + - apiGroups: + - "" + resources: + - namespaces + verbs: + - get + - update + - watch + - apiGroups: + - '' + resources: + - events + verbs: + - create + - update + - patch + - apiGroups: + - '' + resources: + - configmaps + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - security.istio.io + - apps + - policy + resources: + - poddisruptionbudgets + - peerauthentications + - deployments + - daemonsets + - replicasets + - statefulsets + verbs: + - create + - delete + - get + - list + - watch + - update + - apiGroups: + - apiregistration.k8s.io + resources: + - apiservices + verbs: + - create + - delete + - get + - list + - apiGroups: + - autoscaling + resources: + - horizontalpodautoscalers + verbs: + - create + - delete + - get + - list + - apiGroups: + - coordination.k8s.io + 
resources: + - leases + verbs: + - '*' + - apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - '*' + # Old resources that need cleaning up that are not in the knative-serving + # namespace. + - apiGroups: + - "" + resources: + - services + - deployments + - horizontalpodautoscalers + resourceNames: + - knative-ingressgateway + verbs: + - delete + - apiGroups: + - "" + resources: + - configmaps + resourceNames: + - config-controller + verbs: + - delete + - apiGroups: + - "" + resources: + - serviceaccounts + resourceNames: + - knative-serving-operator + verbs: + - delete + # for contour TLS + - apiGroups: + - projectcontour.io + resources: + - httpproxies + - tlscertificatedelegations + verbs: + - get + - list + - watch + - update + - create + - delete + - deletecollection + - patch + # for security-guard + - apiGroups: + - guard.security.knative.dev + resources: + - guardians + verbs: + - get + - list + - watch + - create + - update + - patch + - delete + - apiGroups: + - "" + resources: + - configmaps + verbs: + - get + - list + - watch + - create + - update + - patch + - delete + - apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + verbs: + - create + - apiGroups: + - "" + resources: + - pods + verbs: + - get +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: knative-eventing-operator + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +rules: + - apiGroups: + - operator.knative.dev + resources: + - '*' + verbs: + - '*' + # Bootstrapping permissions. + # Roles that are explicitly bound buch which are specified by this Operator + # MUST be specified here with 'get' and 'bind'. 
+ - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterroles + - roles + verbs: + - create + - delete + # Escalate is necessary in order to create a role using cluster role aggregation, + # and to allow the Operator to bootstrap itself into the necessary set of + # permissions, even as those continue to evolve upstream. + - escalate + - get + - list + - update + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterrolebindings + - rolebindings + verbs: + - create + - delete + - list + - get + - update + # Permissions required for Knative controller + # infra. + - apiGroups: + - apiregistration.k8s.io + resources: + - apiservices + verbs: + - update + - apiGroups: + - "" + resources: + - services + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - caching.internal.knative.dev + resources: + - images + verbs: + - '*' + - apiGroups: + - "" + resources: + - namespaces + verbs: + - get + - update + - watch + - apiGroups: + - '' + resources: + - events + verbs: + - create + - update + - patch + - apiGroups: + - '' + resources: + - configmaps + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - apps + resources: + - deployments + - daemonsets + - replicasets + - statefulsets + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - apiregistration.k8s.io + resources: + - apiservices + verbs: + - create + - delete + - get + - list + - apiGroups: + - autoscaling + resources: + - horizontalpodautoscalers + verbs: + - create + - delete + - update + - get + - list + - apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - '*' + - apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - '*' + - apiGroups: + - batch + resources: + - jobs + verbs: + - create + - delete + - update + - get + - list + - watch + # Old resources that need cleaning up that are not in the knative-eventing + # namespace. 
+ - apiGroups: + - "" + resources: + - serviceaccounts + resourceNames: + - knative-eventing-operator + verbs: + - delete + # for RabbitMQ messaging topology objects + - apiGroups: + - rabbitmq.com + resources: + - rabbitmqclusters + verbs: + - "get" + - "list" + - "watch" + - apiGroups: + - rabbitmq.com + resources: + - bindings + - queues + - exchanges + verbs: + - create + - delete + - get + - list + - patch + - update + - watch + - apiGroups: + - rabbitmq.com + resources: + - bindings/status + - queues/status + - exchanges/status + verbs: + - get + # for Kafka eventing source + - apiGroups: + - keda.sh + resources: + - scaledobjects + - scaledobjects/finalizers + - scaledobjects/status + - triggerauthentications + - triggerauthentications/status + verbs: + - get + - list + - watch + - update + - create + - delete + # Internal APIs + - apiGroups: + - "internal.kafka.eventing.knative.dev" + resources: + - "consumers" + - "consumers/status" + - "consumergroups" + - "consumergroups/status" + verbs: + - create + - get + - list + - watch + - patch + - update + - delete + - apiGroups: + - "internal.kafka.eventing.knative.dev" + resources: + - "consumers/finalizers" + - "consumergroups/finalizers" + verbs: + - update + - delete + - apiGroups: + - apps + resources: + - statefulsets/scale + verbs: + - get + - list + - watch + - update + - patch + - create + - delete + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterrolebindings + verbs: + - watch + - apiGroups: + - "*" + resources: + - configmaps + verbs: + - delete + - apiGroups: + - "*" + resources: + - configmaps + - services + verbs: + - get + - list + - watch + - update + - create + - delete + - apiGroups: + - "*" + resources: + - pods + verbs: + - list + - update + - get + - watch + - apiGroups: + - "*" + resources: + - pods/finalizers + verbs: + - get + - list + - create + - update + - delete + - apiGroups: + - "*" + resources: + - events + verbs: + - patch + - create + - apiGroups: + - "*" + 
resources: + - secrets + verbs: + - get + - list + - watch + - update + - create + - delete + - apiGroups: + - "*" + resources: + - nodes + verbs: + - get + - list + - watch + - apiGroups: + - "*" + resources: + - serviceaccounts + verbs: + - get + - list + - watch + - update + - create + - delete + - apiGroups: + - "*" + resources: + - configmaps + resourceNames: + - kafka-channel-config + verbs: + - patch + - apiGroups: + - "*" + resources: + - horizontalpodautoscalers + resourceNames: + - kafka-webhook + verbs: + - delete + - apiGroups: + - "*" + resources: + - leases + verbs: + - delete + - apiGroups: + - "*" + resources: + - poddisruptionbudgets + resourceNames: + - kafka-webhook + verbs: + - delete + - apiGroups: + - "*" + resources: + - services + verbs: + - patch + - apiGroups: + - "apps" + resources: + - deployments + verbs: + - deletecollection + # Eventing TLS + - apiGroups: + - "cert-manager.io" + resources: + - certificates + - issuers + - clusterissuers + verbs: + - create + - delete + - update + - list + - get + - watch + - apiGroups: + - "trust.cert-manager.io" + resources: + - bundles + verbs: + - create + - delete + - update + - list + - get + - watch +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: knative-operator-webhook + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +rules: + # For watching logging configuration and getting certs. + - apiGroups: + - "" + resources: + - "configmaps" + verbs: + - "get" + - "list" + - "watch" + # For manipulating certs into secrets. 
+ - apiGroups: + - "" + resources: + - "namespaces" + verbs: + - "get" + - "create" + - "update" + - "list" + - "watch" + - "patch" + # finalizers are needed for the owner reference of the webhook + - apiGroups: + - "" + resources: + - "namespaces/finalizers" + verbs: + - "update" + # For getting our Deployment so we can decorate with ownerref. + - apiGroups: + - "apps" + resources: + - "deployments" + verbs: + - "get" + - apiGroups: + - "apps" + resources: + - "deployments/finalizers" + verbs: + - update + # For actually registering our webhook. + - apiGroups: + - "admissionregistration.k8s.io" + resources: + - "mutatingwebhookconfigurations" + - "validatingwebhookconfigurations" + verbs: &everything + - "get" + - "list" + - "create" + - "update" + - "delete" + - "patch" + - "watch" + # For leader election + - apiGroups: + - "coordination.k8s.io" + resources: + - "leases" + verbs: *everything + # Necessary for conversion webhook. These are copied from the serving + # TODO: Do we really need all these permissions? 
+ - apiGroups: ["apiextensions.k8s.io"] + resources: ["customresourcedefinitions"] + verbs: ["get", "list", "create", "update", "delete", "patch", "watch"] +--- +# Source: dataplane/charts/fluentbit/templates/clusterrolebinding.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: release-name-fluentbit + labels: + helm.sh/chart: fluentbit-0.48.9 + app.kubernetes.io/name: fluentbit + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "3.2.8" + app.kubernetes.io/managed-by: Helm +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: release-name-fluentbit +subjects: + - kind: ServiceAccount + name: union-system + namespace: union +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# TODO: Consider restriction of non-aggregated role to knativeservings namespaces. 
+apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: knative-serving-operator + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-serving-operator +subjects: + - kind: ServiceAccount + name: knative-operator + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# TODO: Consider restriction of non-aggregated role to knativeeventing namespaces. +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: knative-eventing-operator + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-eventing-operator +subjects: + - kind: ServiceAccount + name: knative-operator + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: operator-webhook + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-operator-webhook +subjects: + - kind: ServiceAccount + name: operator-webhook + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. -- apiGroups: ["admissionregistration.k8s.io"] - resources: - - validatingwebhookconfigurations - verbs: ["list", "watch"] +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: knative-serving-operator-aggregated + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-serving-operator-aggregated +subjects: + - kind: ServiceAccount + name: knative-operator + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: knative-serving-operator-aggregated-stable + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-serving-operator-aggregated-stable +subjects: + - kind: ServiceAccount + name: knative-operator + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: knative-eventing-operator-aggregated + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-eventing-operator-aggregated +subjects: + - kind: ServiceAccount + name: knative-operator + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: knative-eventing-operator-aggregated-stable + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: 
ClusterRole + name: knative-eventing-operator-aggregated-stable +subjects: + - kind: ServiceAccount + name: knative-operator + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. -- apiGroups: ["storage.k8s.io"] - resources: - - volumeattachments - verbs: ["list", "watch"] +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + namespace: "union" + name: knative-operator-webhook + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +rules: + # For manipulating certs into secrets. 
+ - apiGroups: + - "" + resources: + - "secrets" + verbs: + - "get" + - "create" + - "update" + - "list" + - "watch" + - "patch" --- -# Source: dataplane/charts/opencost/templates/clusterrole.yaml -# Cluster role giving opencost to get, list, watch required resources -# No write permissions are required +# Source: dataplane/templates/monitoring/prometheus-rbac.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole +kind: Role metadata: - name: release-name-opencost + name: union-operator-prometheus-rbac + namespace: union labels: - helm.sh/chart: opencost-1.42.0 - app.kubernetes.io/name: opencost - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "1.111.0" - app.kubernetes.io/part-of: opencost - app.kubernetes.io/managed-by: Helm + release: release-name rules: + # Prometheus server scrape permissions - apiGroups: [""] resources: - - configmaps - - deployments - nodes - - pods + - nodes/proxy + - nodes/metrics - services - - resourcequotas - - replicationcontrollers - - limitranges - - persistentvolumeclaims - - persistentvolumes - - namespaces - endpoints + - pods + - ingresses + - configmaps verbs: - get - list - watch - - apiGroups: - - extensions - resources: - - daemonsets - - deployments - - replicasets - verbs: - - get - - list - - watch - - apiGroups: - - apps - resources: - - statefulsets - - deployments - - daemonsets - - replicasets - verbs: - - list - - watch - - apiGroups: - - batch - resources: - - cronjobs - - jobs - verbs: - - get - - list - - watch - - apiGroups: - - autoscaling - resources: - - horizontalpodautoscalers - verbs: - - get - - list - - watch - - apiGroups: - - policy + - apiGroups: ["extensions", "networking.k8s.io"] resources: - - poddisruptionbudgets + - ingresses/status + - ingresses verbs: - get - list - watch - - apiGroups: - - storage.k8s.io + - apiGroups: ["discovery.k8s.io"] resources: - - storageclasses + - endpointslices verbs: - get - list - watch ---- -# Source: 
dataplane/templates/clusterresourcesync/serviceaccount.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: union-clustersync-resource -rules: - - apiGroups: - - "" - - rbac.authorization.k8s.io - resources: - - configmaps - - namespaces - - pods - - resourcequotas - - roles - - rolebindings - - secrets - - services - - serviceaccounts - - clusterrolebindings - - podtemplates - verbs: - - '*' + # kube-state-metrics permissions (rbac.create=false on the subchart; managed here instead) + - apiGroups: ["certificates.k8s.io"] + resources: ["certificatesigningrequests"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["configmaps"] + verbs: ["list", "watch"] + - apiGroups: ["batch"] + resources: ["cronjobs"] + verbs: ["list", "watch"] + - apiGroups: ["extensions", "apps"] + resources: ["daemonsets"] + verbs: ["list", "watch"] + - apiGroups: ["extensions", "apps"] + resources: ["deployments"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["endpoints"] + verbs: ["list", "watch"] + - apiGroups: ["discovery.k8s.io"] + resources: ["endpointslices"] + verbs: ["list", "watch"] + - apiGroups: ["autoscaling"] + resources: ["horizontalpodautoscalers"] + verbs: ["list", "watch"] + - apiGroups: ["extensions", "networking.k8s.io"] + resources: ["ingresses"] + verbs: ["list", "watch"] + - apiGroups: ["batch"] + resources: ["jobs"] + verbs: ["list", "watch"] + - apiGroups: ["coordination.k8s.io"] + resources: ["leases"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["limitranges"] + verbs: ["list", "watch"] + - apiGroups: ["admissionregistration.k8s.io"] + resources: ["mutatingwebhookconfigurations"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["namespaces"] + verbs: ["list", "watch"] + - apiGroups: ["networking.k8s.io"] + resources: ["networkpolicies"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["persistentvolumeclaims"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: 
["persistentvolumes"] + verbs: ["list", "watch"] + - apiGroups: ["policy"] + resources: ["poddisruptionbudgets"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["pods"] + verbs: ["list", "watch"] + - apiGroups: ["extensions", "apps"] + resources: ["replicasets"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["replicationcontrollers"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["resourcequotas"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["secrets"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["services"] + verbs: ["list", "watch"] + - apiGroups: ["apps"] + resources: ["statefulsets"] + verbs: ["list", "watch"] + - apiGroups: ["storage.k8s.io"] + resources: ["storageclasses"] + verbs: ["list", "watch"] + - apiGroups: ["admissionregistration.k8s.io"] + resources: ["validatingwebhookconfigurations"] + verbs: ["list", "watch"] + - apiGroups: ["storage.k8s.io"] + resources: ["volumeattachments"] + verbs: ["list", "watch"] --- # Source: dataplane/templates/nodeexecutor/serviceaccount.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole +kind: Role metadata: name: union-executor labels: @@ -4309,11 +4634,12 @@ rules: - delete - update --- -# Source: dataplane/templates/operator/serviceaccount-proxy.yaml +# Source: dataplane/templates/operator/serviceaccount-proxy-secret.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole +kind: Role metadata: - name: proxy-system + name: union-system-secret + namespace: union labels: app.kubernetes.io/name: operator-proxy app.kubernetes.io/instance: release-name @@ -4323,146 +4649,86 @@ rules: - apiGroups: - '*' resources: - - events - - flyteworkflows - - pods/log - - pods - - rayjobs - - resourcequotas - verbs: - - get - - list - - watch ---- -# Source: dataplane/templates/operator/serviceaccount.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: operator-system - labels: - app.kubernetes.io/name: 
union-operator - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm -rules: - # Allow Access to all resources under flyte.lyft.com - - apiGroups: - - flyte.lyft.com - resources: - - flyteworkflows - - flyteworkflows/finalizers - verbs: - - get - - list - - watch - - create - - update - - delete - - patch - - post - - deletecollection - - apiGroups: - - '*' - resources: - - resourcequotas - - pods - - configmaps - - podtemplates - secrets - - namespaces - - nodes verbs: - get - list - - watch - create - update - delete - - nonResourceURLs: - - /metrics - verbs: - - get --- -# Source: dataplane/templates/prometheus/rbac.yaml +# Source: dataplane/templates/operator/serviceaccount-proxy.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole +kind: Role metadata: - name: union-operator-prometheus + name: proxy-system labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/name: operator-proxy app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" + platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus rules: - - apiGroups: [""] + - apiGroups: + - '*' resources: - - nodes - - nodes/proxy + - events + - flyteworkflows + - pods/log - pods - - endpoints - - services + - rayjobs + - resourcequotas verbs: - get - list - watch - - nonResourceURLs: - - /metrics - - /metrics/cadvisor - verbs: - - get ---- -# Source: dataplane/templates/propeller/serviceaccount-webhook.yaml -kind: ClusterRole -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: flytepropeller-webhook-role - namespace: union -rules: - - apiGroups: - - "*" - resources: - - mutatingwebhookconfigurations - - secrets - - pods - - replicasets/finalizers - verbs: - - get - - create - - update - - patch --- -# Source: dataplane/templates/propeller/serviceaccount.yaml +# Source: 
dataplane/templates/operator/serviceaccount.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole +kind: Role metadata: - name: flytepropeller-role + name: operator-system + labels: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm rules: - # Allow RO access to PODS - apiGroups: - - "" + - '*' resources: - - pods + - secrets + - deployments verbs: - get - list - watch - # Allow Event recording access + - create + - update - apiGroups: - - "" + - flyte.lyft.com resources: - - events + - flyteworkflows + - flyteworkflows/finalizers verbs: + - get + - list + - watch - create - update - delete - patch - # Allow Access All plugin objects + - post + - deletecollection - apiGroups: - '*' resources: - - '*' + - resourcequotas + - pods + - configmaps + - podtemplates + - secrets + - namespaces + - nodes verbs: - get - list @@ -4470,148 +4736,129 @@ rules: - create - update - delete - - patch - # Allow Access to CRD - apiGroups: - - apiextensions.k8s.io + - serving.knative.dev resources: - - customresourcedefinitions + - revisions + - configurations + - services verbs: - get - list - watch - create - - delete - update - # Allow Access to all resources under flyte.lyft.com + - delete +--- +# Source: dataplane/templates/webhook/serviceaccount.yaml +kind: Role +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: union-webhook-role + namespace: union +rules: - apiGroups: - - flyte.lyft.com + - "*" resources: - - flyteworkflows - - flyteworkflows/finalizers + - mutatingwebhookconfigurations + - secrets + - pods + - replicasets/finalizers verbs: - get - - list - - watch - create - update - - delete - patch - - post - - deletecollection --- -# Source: dataplane/charts/fluentbit/templates/clusterrolebinding.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under 
the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding +kind: RoleBinding metadata: - name: release-name-fluentbit + namespace: "union" + name: operator-webhook labels: - helm.sh/chart: fluentbit-0.48.9 - app.kubernetes.io/name: fluentbit - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "3.2.8" - app.kubernetes.io/managed-by: Helm -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: release-name-fluentbit + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator subjects: - kind: ServiceAccount - name: fluentbit-system - namespace: union ---- -# Source: dataplane/charts/kube-state-metrics/templates/clusterrolebinding.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - labels: - helm.sh/chart: kube-state-metrics-5.30.1 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: kube-state-metrics - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" - name: release-name-kube-state-metrics + name: operator-webhook + namespace: "union" roleRef: + kind: Role + name: knative-operator-webhook apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: release-name-kube-state-metrics -subjects: -- kind: ServiceAccount - name: release-name-kube-state-metrics - namespace: union --- -# Source: 
dataplane/charts/opencost/templates/clusterrolebinding.yaml +# Source: dataplane/templates/monitoring/prometheus-rbac.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding +kind: RoleBinding metadata: - name: release-name-opencost + name: union-operator-prometheus-rbac + namespace: union labels: - helm.sh/chart: opencost-1.42.0 - app.kubernetes.io/name: opencost - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "1.111.0" - app.kubernetes.io/part-of: opencost - app.kubernetes.io/managed-by: Helm -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: release-name-opencost + release: release-name subjects: - kind: ServiceAccount - name: release-name-opencost + name: union-operator-prometheus namespace: union ---- -# Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: union-clustersync-resource roleRef: apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: union-clustersync-resource -subjects: - - kind: ServiceAccount - name: union-clustersync-system - namespace: union + kind: Role + name: union-operator-prometheus-rbac --- -# Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml +# Source: dataplane/templates/monitoring/prometheus-rbac.yaml +# kube-state-metrics service account RoleBinding apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding +kind: RoleBinding metadata: - name: union-clustersync-auth-delegator -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: system:auth-delegator + name: release-name-prometheus-kube-state-metrics + namespace: union + labels: + release: release-name subjects: - kind: ServiceAccount - name: union-clustersync-system + name: release-name-prometheus-kube-state-metrics namespace: union +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: union-operator-prometheus-rbac --- # Source: 
dataplane/templates/nodeexecutor/serviceaccount.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding +kind: RoleBinding metadata: name: union-executor labels: app: executor roleRef: apiGroup: rbac.authorization.k8s.io - kind: ClusterRole + kind: Role name: union-executor subjects: - kind: ServiceAccount - name: executor + name: union-system namespace: union --- -# Source: dataplane/templates/operator/serviceaccount-proxy.yaml +# Source: dataplane/templates/operator/serviceaccount-proxy-secret.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding +kind: RoleBinding metadata: - name: proxy-system + name: union-system-secret + namespace: union labels: app.kubernetes.io/name: operator-proxy app.kubernetes.io/instance: release-name @@ -4619,109 +4866,35 @@ metadata: app.kubernetes.io/managed-by: Helm roleRef: apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: proxy-system + kind: Role + name: union-system-secret subjects: - kind: ServiceAccount - name: proxy-system + name: union-system namespace: union --- -# Source: dataplane/templates/operator/serviceaccount.yaml +# Source: dataplane/templates/operator/serviceaccount-proxy.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding +kind: RoleBinding metadata: - name: operator-system + name: proxy-system labels: - app.kubernetes.io/name: union-operator + app.kubernetes.io/name: operator-proxy app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm roleRef: apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: operator-system -subjects: - - kind: ServiceAccount - name: operator-system - namespace: union ---- -# Source: dataplane/templates/prometheus/rbac.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: union-operator-prometheus - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane - 
app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: union-operator-prometheus -subjects: - - kind: ServiceAccount - name: union-operator-prometheus - namespace: union ---- -# Source: dataplane/templates/propeller/serviceaccount-webhook.yaml -# Create a binding from Role -> ServiceAccount -kind: ClusterRoleBinding -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: flytepropeller-webhook-binding - namespace: union -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: flytepropeller-webhook-role -subjects: - - kind: ServiceAccount - name: flytepropeller-webhook-system - namespace: union ---- -# Source: dataplane/templates/propeller/serviceaccount.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: flytepropeller-binding -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: flytepropeller-role + kind: Role + name: proxy-system subjects: - kind: ServiceAccount - name: flytepropeller-system + name: union-system namespace: union --- -# Source: dataplane/templates/operator/serviceaccount-proxy-secret.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - name: proxy-system-secret - namespace: union - labels: - app.kubernetes.io/name: operator-proxy - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm -rules: - - apiGroups: - - '*' - resources: - - secrets - verbs: - - get - - list - - create - - update - - delete ---- # Source: dataplane/templates/operator/serviceaccount.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: Role +kind: RoleBinding metadata: name: operator-system labels: @@ -4729,56 +4902,29 @@ metadata: app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name 
app.kubernetes.io/managed-by: Helm -rules: - - apiGroups: - - '*' - resources: - - secrets - - deployments - verbs: - - get - - list - - watch - - create - - update ---- -# Source: dataplane/templates/operator/serviceaccount-proxy-secret.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: proxy-system-secret - namespace: union - labels: - app.kubernetes.io/name: operator-proxy - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm roleRef: apiGroup: rbac.authorization.k8s.io kind: Role - name: proxy-system-secret + name: operator-system subjects: - kind: ServiceAccount - name: proxy-system + name: union-system namespace: union --- -# Source: dataplane/templates/operator/serviceaccount.yaml -apiVersion: rbac.authorization.k8s.io/v1 +# Source: dataplane/templates/webhook/serviceaccount.yaml +# Create a binding from Role -> ServiceAccount kind: RoleBinding +apiVersion: rbac.authorization.k8s.io/v1 metadata: - name: operator-system - labels: - app.kubernetes.io/name: union-operator - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm + name: union-webhook-binding + namespace: union roleRef: apiGroup: rbac.authorization.k8s.io kind: Role - name: operator-system + name: union-webhook-role subjects: - kind: ServiceAccount - name: operator-system + name: union-system namespace: union --- # Source: dataplane/charts/fluentbit/templates/service.yaml @@ -4804,20 +4950,60 @@ spec: app.kubernetes.io/name: fluentbit app.kubernetes.io/instance: release-name --- -# Source: dataplane/charts/kube-state-metrics/templates/service.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v1 +kind: Service +metadata: + labels: + role: operator-webhook + app.kubernetes.io/component: operator-webhook + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator + name: operator-webhook + namespace: "union" +spec: + ports: + # Define metrics and profiling for them to be accessible within service meshes. + - name: http-metrics + port: 9090 + targetPort: 9090 + - name: http-profiling + port: 8008 + targetPort: 8008 + - name: https-webhook + port: 443 + targetPort: 8443 + selector: + role: operator-webhook +--- +# Source: dataplane/charts/prometheus/charts/kube-state-metrics/templates/service.yaml apiVersion: v1 kind: Service metadata: name: release-name-kube-state-metrics namespace: union labels: - helm.sh/chart: kube-state-metrics-5.30.1 + helm.sh/chart: kube-state-metrics-5.25.1 app.kubernetes.io/managed-by: Helm app.kubernetes.io/component: metrics app.kubernetes.io/part-of: kube-state-metrics app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" + app.kubernetes.io/version: "2.13.0" annotations: prometheus.io/scrape: 'true' spec: @@ -4832,28 +5018,32 @@ spec: app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/instance: release-name --- -# Source: dataplane/charts/opencost/templates/service.yaml +# Source: dataplane/charts/prometheus/templates/service.yaml apiVersion: v1 kind: Service metadata: - name: release-name-opencost - namespace: union labels: - helm.sh/chart: opencost-1.42.0 - app.kubernetes.io/name: opencost + 
app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "1.111.0" - app.kubernetes.io/part-of: opencost + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: prometheus + name: union-operator-prometheus + namespace: union spec: + ports: + - name: http + port: 80 + protocol: TCP + targetPort: 9090 selector: - app.kubernetes.io/name: opencost + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name + sessionAffinity: None type: "ClusterIP" - ports: - - name: http - port: 9003 - targetPort: 9003 --- # Source: dataplane/templates/clusterresourcesync/service.yaml apiVersion: v1 @@ -4877,6 +5067,31 @@ spec: app.kubernetes.io/name: clusterresourcesync app.kubernetes.io/instance: release-name --- +# Source: dataplane/templates/flyteconnector/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: flyteconnector + namespace: union + labels: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + app.kubernetes.io/managed-by: Helm +spec: + clusterIP: None + ports: + - name: grpc + port: 8000 + protocol: TCP + targetPort: grpc + - name: metric + port: 9090 + protocol: TCP + targetPort: metric + selector: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name +--- # Source: dataplane/templates/imagebuilder/service.yaml apiVersion: v1 kind: Service @@ -4902,7 +5117,7 @@ spec: apiVersion: v1 kind: Service metadata: - name: release-name-dataplane-executor + name: union-operator-executor labels: platform.union.ai/prometheus-group: "union-services" app: executor @@ -4968,39 +5183,14 @@ spec: app.kubernetes.io/name: union-operator app.kubernetes.io/instance: release-name --- -# Source: dataplane/templates/prometheus/service.yaml -apiVersion: v1 -kind: Service -metadata: - name: 
union-operator-prometheus - namespace: union - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus -spec: - type: ClusterIP - ports: - - port: 80 - targetPort: 9090 - protocol: TCP - name: http - selector: - app.kubernetes.io/component: prometheus - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name ---- -# Source: dataplane/templates/propeller/service-webhook.yaml +# Source: dataplane/templates/webhook/service.yaml apiVersion: v1 kind: Service metadata: - name: flyte-pod-webhook + name: union-pod-webhook namespace: union labels: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm @@ -5008,7 +5198,7 @@ metadata: projectcontour.io/upstream-protocol.h2c: grpc spec: selector: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name ports: - name: https @@ -5020,23 +5210,23 @@ spec: port: 10254 targetPort: 10254 --- -# Source: dataplane/templates/propeller/service-webhook.yaml +# Source: dataplane/templates/webhook/service.yaml # Headless Service for cache invalidation — resolves to all pod IPs so that # we can fan out invalidation requests to every webhook replica. 
apiVersion: v1 kind: Service metadata: - name: flyte-pod-webhook-headless + name: union-pod-webhook-headless namespace: union labels: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm spec: clusterIP: None selector: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name ports: - name: cache-internal @@ -5044,32 +5234,6 @@ spec: port: 9443 targetPort: 9443 --- -# Source: dataplane/templates/propeller/service.yaml -apiVersion: v1 -kind: Service -metadata: - namespace: union - name: flytepropeller - labels: - platform.union.ai/prometheus-group: "union-services" - app.kubernetes.io/name: flytepropeller - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm -spec: - type: ClusterIP - ports: - - name: debug - protocol: TCP - port: 10254 - - name: fasttask - port: 15605 - protocol: TCP - targetPort: 15605 - selector: - app.kubernetes.io/name: flytepropeller - app.kubernetes.io/instance: release-name ---- # Source: dataplane/charts/fluentbit/templates/daemonset.yaml apiVersion: apps/v1 kind: DaemonSet @@ -5095,7 +5259,7 @@ spec: annotations: checksum/config: e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 spec: - serviceAccountName: fluentbit-system + serviceAccountName: union-system hostNetwork: false dnsPolicy: ClusterFirst containers: @@ -5147,20 +5311,217 @@ spec: tolerations: - operator: Exists --- -# Source: dataplane/charts/kube-state-metrics/templates/deployment.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: operator-webhook + namespace: "union" + labels: + app.kubernetes.io/component: operator-webhook + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +spec: + selector: + matchLabels: + app: operator-webhook + role: operator-webhook + template: + metadata: + labels: + app: operator-webhook + role: operator-webhook + app.kubernetes.io/component: operator-webhook + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator + sidecar.istio.io/inject: "false" + spec: + # To avoid node becoming SPOF, spread our replicas to different nodes. + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - podAffinityTerm: + labelSelector: + matchLabels: + app: operator-webhook + topologyKey: kubernetes.io/hostname + weight: 100 + serviceAccountName: operator-webhook + containers: + - name: operator-webhook + # This is the Go import path for the binary that is containerized + # and substituted here. 
+ image: gcr.io/knative-releases/knative.dev/operator/cmd/webhook@sha256:d3a7a3304629ccfcaacec620b50736e0b4c902a6328b83369b6cbaf7e94677c9 + resources: + requests: + cpu: 100m + memory: 100Mi + limits: + cpu: 500m + memory: 500Mi + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: SYSTEM_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: CONFIG_LOGGING_NAME + value: config-logging + - name: CONFIG_OBSERVABILITY_NAME + value: config-observability + - name: WEBHOOK_NAME + value: operator-webhook + - name: WEBHOOK_PORT + value: "8443" + - name: WEBHOOK_SECRET_NAME + value: operator-webhook-certs + - name: METRICS_DOMAIN + value: knative.dev/operator + - name: KUBERNETES_MIN_VERSION + value: "" + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + runAsNonRoot: true + capabilities: + drop: + - ALL + ports: + - name: metrics + containerPort: 9090 + - name: profiling + containerPort: 8008 + - name: https-webhook + containerPort: 8443 + readinessProbe: + periodSeconds: 1 + httpGet: + scheme: HTTPS + port: 8443 + httpHeaders: + - name: k-kubelet-probe + value: "webhook" + livenessProbe: + periodSeconds: 1 + httpGet: + scheme: HTTPS + port: 8443 + httpHeaders: + - name: k-kubelet-probe + value: "webhook" + failureThreshold: 6 + initialDelaySeconds: 120 + # Our webhook should gracefully terminate by lame ducking first, set this to a sufficiently + # high value that we respect whatever value it has configured for the lame duck grace period. + terminationGracePeriodSeconds: 300 +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: knative-operator + namespace: "union" + labels: + app.kubernetes.io/name: knative-operator + app.kubernetes.io/version: "1.16.0" +spec: + replicas: 1 + selector: + matchLabels: + name: knative-operator + template: + metadata: + labels: + name: knative-operator + app.kubernetes.io/name: knative-operator + app.kubernetes.io/version: "1.16.0" + sidecar.istio.io/inject: "false" + spec: + serviceAccountName: knative-operator + containers: + - name: knative-operator + image: gcr.io/knative-releases/knative.dev/operator/cmd/operator@sha256:0b5a3532417f9c8e7b6044e23f6f67ad932a74a40a4092ad965b6f173b2fd887 + resources: + requests: + cpu: 100m + memory: 100Mi + limits: + cpu: 1000m + memory: 1000Mi + imagePullPolicy: IfNotPresent + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: SYSTEM_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: METRICS_DOMAIN + value: knative.dev/operator + - name: CONFIG_LOGGING_NAME + value: config-logging + - name: CONFIG_OBSERVABILITY_NAME + value: config-observability + - name: KUBERNETES_MIN_VERSION + value: "" + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + runAsNonRoot: true + capabilities: + drop: + - ALL + ports: + - name: metrics + containerPort: 9090 +--- +# Source: dataplane/charts/prometheus/charts/kube-state-metrics/templates/deployment.yaml apiVersion: apps/v1 kind: Deployment metadata: name: release-name-kube-state-metrics namespace: union labels: - helm.sh/chart: 
kube-state-metrics-5.30.1 + helm.sh/chart: kube-state-metrics-5.25.1 app.kubernetes.io/managed-by: Helm app.kubernetes.io/component: metrics app.kubernetes.io/part-of: kube-state-metrics app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" + app.kubernetes.io/version: "2.13.0" spec: selector: matchLabels: @@ -5173,13 +5534,13 @@ spec: template: metadata: labels: - helm.sh/chart: kube-state-metrics-5.30.1 + helm.sh/chart: kube-state-metrics-5.25.1 app.kubernetes.io/managed-by: Helm app.kubernetes.io/component: metrics app.kubernetes.io/part-of: kube-state-metrics app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" + app.kubernetes.io/version: "2.13.0" spec: automountServiceAccountToken: true hostNetwork: false @@ -5196,8 +5557,9 @@ spec: args: - --port=8080 - --resources=certificatesigningrequests,configmaps,cronjobs,daemonsets,deployments,endpoints,horizontalpodautoscalers,ingresses,jobs,leases,limitranges,mutatingwebhookconfigurations,namespaces,networkpolicies,nodes,persistentvolumeclaims,persistentvolumes,poddisruptionbudgets,pods,replicasets,replicationcontrollers,resourcequotas,secrets,services,statefulsets,storageclasses,validatingwebhookconfigurations,volumeattachments + - --namespaces=union imagePullPolicy: IfNotPresent - image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.15.0 + image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.13.0 ports: - containerPort: 8080 name: "http" @@ -5217,7 +5579,7 @@ spec: httpGet: httpHeaders: path: /readyz - port: 8081 + port: 8080 scheme: HTTP initialDelaySeconds: 5 periodSeconds: 10 @@ -5232,202 +5594,204 @@ spec: - ALL readOnlyRootFilesystem: true --- -# Source: dataplane/charts/opencost/templates/deployment.yaml +# Source: dataplane/charts/prometheus/templates/deploy.yaml apiVersion: apps/v1 kind: Deployment metadata: - name: release-name-opencost - namespace: 
union labels: - helm.sh/chart: opencost-1.42.0 - app.kubernetes.io/name: opencost + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "1.111.0" - app.kubernetes.io/part-of: opencost + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: prometheus + name: union-operator-prometheus + namespace: union spec: - replicas: 1 selector: matchLabels: - app.kubernetes.io/name: opencost + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name - strategy: - rollingUpdate: - maxSurge: 1 - maxUnavailable: 1 - type: RollingUpdate + replicas: 1 + revisionHistoryLimit: 10 + strategy: + type: Recreate + rollingUpdate: null template: metadata: labels: - app.kubernetes.io/name: opencost + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: prometheus spec: - serviceAccountName: release-name-opencost + enableServiceLinks: true + serviceAccountName: union-operator-prometheus containers: - - name: release-name-opencost - image: ghcr.io/opencost/opencost:1.111.0@sha256:6aa68e52a24b14ba41f23db08d1b9db1429a1c0300f4c0381ecc2c61fc311a97 - imagePullPolicy: IfNotPresent + - name: prometheus-server-configmap-reload + image: "quay.io/prometheus-operator/prometheus-config-reloader:v0.76.0" + imagePullPolicy: "IfNotPresent" args: + - --watched-dir=/etc/config + - --listen-address=0.0.0.0:8080 + - --reload-url=http://127.0.0.1:9090/prometheus/-/reload ports: - - containerPort: 9003 - name: http - resources: - limits: - cpu: 1000m - memory: 4Gi - requests: - cpu: 500m - memory: 1Gi - startupProbe: - httpGet: - path: /healthz - port: 9003 - initialDelaySeconds: 10 - 
periodSeconds: 5 - failureThreshold: 30 + - containerPort: 8080 + name: metrics livenessProbe: httpGet: path: /healthz - port: 9003 - initialDelaySeconds: 10 - periodSeconds: 20 - failureThreshold: 3 + port: metrics + scheme: HTTP + initialDelaySeconds: 2 + periodSeconds: 10 readinessProbe: httpGet: path: /healthz - port: 9003 - initialDelaySeconds: 10 + port: metrics + scheme: HTTP periodSeconds: 10 + volumeMounts: + - name: config-volume + mountPath: /etc/config + readOnly: true + + - name: prometheus-server + image: "quay.io/prometheus/prometheus:v2.54.1" + imagePullPolicy: "IfNotPresent" + args: + - --storage.tsdb.retention.time=3d + - --config.file=/etc/config/prometheus.yml + - --storage.tsdb.path=/data + - --web.console.libraries=/etc/prometheus/console_libraries + - --web.console.templates=/etc/prometheus/consoles + - --web.enable-lifecycle + - --web.route-prefix=/prometheus + - --web.external-url=/prometheus/ + ports: + - containerPort: 9090 + readinessProbe: + httpGet: + path: /prometheus/-/ready + port: 9090 + scheme: HTTP + initialDelaySeconds: 30 + periodSeconds: 5 + timeoutSeconds: 4 failureThreshold: 3 - env: - - name: LOG_LEVEL - value: info - - name: CUSTOM_COST_ENABLED - value: "false" - - name: KUBECOST_NAMESPACE - value: union - - name: API_PORT - value: "9003" - - name: PROMETHEUS_SERVER_ENDPOINT - value: "http://union-operator-prometheus.union.svc:80/prometheus" - - name: CLUSTER_ID - value: "default-cluster" - - name: DATA_RETENTION_DAILY_RESOLUTION_DAYS - value: "15" - - name: CLOUD_COST_ENABLED - value: "false" - - name: CLOUD_COST_MONTH_TO_DATE_INTERVAL - value: "6" - - name: CLOUD_COST_REFRESH_RATE_HOURS - value: "6" - - name: CLOUD_COST_QUERY_WINDOW_DAYS - value: "7" - - name: CLOUD_COST_RUN_WINDOW_DAYS - value: "3" - # Add any additional provided variables + successThreshold: 1 + livenessProbe: + httpGet: + path: /prometheus/-/healthy + port: 9090 + scheme: HTTP + initialDelaySeconds: 30 + periodSeconds: 15 + timeoutSeconds: 10 + 
failureThreshold: 3 + successThreshold: 1 + resources: + limits: + cpu: "3" + memory: 3500Mi + requests: + cpu: "1" + memory: 1Gi + volumeMounts: + - name: config-volume + mountPath: /etc/config + - name: storage-volume + mountPath: /data + subPath: "" + - name: server-recording-rules + mountPath: /etc/config/recording + subPath: + readOnly: + dnsPolicy: ClusterFirst + securityContext: + fsGroup: 65534 + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + terminationGracePeriodSeconds: 300 + volumes: + - name: config-volume + configMap: + name: union-operator-prometheus + - name: server-recording-rules + configMap: + name: union-recording-rules + - name: storage-volume + emptyDir: + sizeLimit: 10Gi --- -# Source: dataplane/templates/clusterresourcesync/deployment.yaml +# Source: dataplane/templates/flyteconnector/deployment.yaml apiVersion: apps/v1 kind: Deployment metadata: - name: union-syncresources + name: flyteconnector namespace: union - labels: - app.kubernetes.io/name: clusterresourcesync + labels: + app.kubernetes.io/name: flyteconnector app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/name: clusterresourcesync - app.kubernetes.io/instance: release-name - template: - metadata: - annotations: - configChecksum: "c13a554b9a39b142cd7fe140c69cc4a74b036b91594b6a122650fa05f1e1bce" - - labels: - platform.union.ai/zone: "dataplane" - - azure.workload.identity/use: "true" - app.kubernetes.io/name: clusterresourcesync - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm - spec: - containers: - - command: - - clusterresource - - --config - - /etc/flyte/config/*.yaml - - clusterresource - - run - env: - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_NAMESPACE - valueFrom: - fieldRef: - fieldPath: 
metadata.namespace - - name: GOMEMLIMIT - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.memory - - name: GOMAXPROCS - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.cpu - - name: CLUSTER_NAME - valueFrom: - secretKeyRef: - name: operator-cluster-name - key: cluster_name - - name: DEPLOYMENT_NAME - value: operator - - name: PROXY_SERVICE_URL - value: http://union-operator-proxy:8080 - - name: PROMETHEUS_SERVICE_URL - value: http://union-operator-prometheus:80 - - name: KNATIVE_PROXY_SERVICE_URL - value: http://kourier-internal - image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" + replicas: 2 + selector: + matchLabels: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + template: + metadata: + annotations: + labels: + platform.union.ai/zone: "dataplane" + + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + app.kubernetes.io/managed-by: Helm + spec: + containers: + - command: + - c0 + image: "ghcr.io/flyteorg/flyte-connectors:py3.13-2.0.0b50.dev3-g695bb1db3.d20260122" imagePullPolicy: "IfNotPresent" - name: sync-cluster-resources + name: flyteconnector + readinessProbe: + failureThreshold: 3 + httpGet: + path: / + port: 9090 + scheme: HTTP + initialDelaySeconds: 5 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 5 + env: + - name: AIRFLOW_CONN_GOOGLE_CLOUD_DEFAULT + value: "google-cloud-platform://" + - name: LOG_LEVEL + value: "20" + ports: + - containerPort: 8000 + name: grpc + - containerPort: 9090 + name: metric resources: limits: - cpu: "1" - memory: 500Mi + cpu: "1.5" + ephemeral-storage: 100Mi + memory: 1500Mi requests: - cpu: 500m - memory: 100Mi - volumeMounts: - - name: auth - mountPath: /etc/union/secret - - name: resource-templates - mountPath: /etc/flyte/clusterresource/templates - - name: config-volume - mountPath: /etc/flyte/config - ports: - - name: debug - containerPort: 10254 - protocol: TCP - serviceAccountName: 
union-clustersync-system - volumes: - - configMap: - name: union-clusterresource-template - name: resource-templates - - configMap: - name: union-clusterresourcesync-config - name: config-volume - - name: auth - secret: - secretName: union-secret-auth + cpu: "1" + ephemeral-storage: 100Mi + memory: 1000Mi + serviceAccountName: flyteconnector --- # Source: dataplane/templates/imagebuilder/deployment.yaml apiVersion: apps/v1 @@ -5449,17 +5813,15 @@ spec: app.kubernetes.io/instance: release-name template: metadata: - annotations: - container.apparmor.security.beta.kubernetes.io/buildkit: unconfined labels: platform.union.ai/zone: "dataplane" app.kubernetes.io/name: imagebuilder-buildkit app.kubernetes.io/instance: release-name spec: - serviceAccountName: "union-imagebuilder" + serviceAccountName: "union-system" containers: - name: "buildkit" - image: "docker.io/moby/buildkit:buildx-stable-1-rootless" + image: "docker.io/moby/buildkit:buildx-stable-1" imagePullPolicy: IfNotPresent env: - name: POD_NAME @@ -5494,18 +5856,15 @@ spec: - name: KNATIVE_PROXY_SERVICE_URL value: http://kourier-internal volumeMounts: - - mountPath: /home/user/.local/share/buildkit - name: buildkitd - mountPath: /etc/buildkit name: buildkit-config args: - --config - /etc/buildkit/buildkitd.toml - --addr - - unix:///run/user/1000/buildkit/buildkitd.sock + - unix:///run//buildkit/buildkitd.sock - --addr - tcp://0.0.0.0:1234 - - --oci-worker-no-process-sandbox ports: - name: tcp containerPort: 1234 @@ -5527,18 +5886,13 @@ spec: initialDelaySeconds: 5 periodSeconds: 30 securityContext: - seccompProfile: # Needs Kubernetes >= 1.19 - type: Unconfined - runAsUser: 1000 - runAsGroup: 1000 + privileged: true resources: requests: - cpu: 1 - ephemeral-storage: 20Gi - memory: 1Gi + cpu: 4 + ephemeral-storage: 50Gi + memory: 4Gi volumes: - - name: buildkitd - emptyDir: {} - configMap: name: union-operator-buildkit name: buildkit-config @@ -5567,17 +5921,19 @@ spec: template: metadata: annotations: - 
configChecksum: "e767f4bbc6e2c812d5e85c1626f16976591e906cd50a63d0a05936dc4bc6713" + configChecksum: "1fda2c79bb8d0fd2311d527f35a4e24d37b94822a3711895e9bc13e41169df7" labels: platform.union.ai/zone: "dataplane" azure.workload.identity/use: "true" app: executor + app.kubernetes.io/instance: 'release-name' + app.kubernetes.io/name: executor spec: securityContext: fsGroup: 1337 - serviceAccountName: executor + serviceAccountName: union-system volumes: - name: config-volume configMap: @@ -5593,7 +5949,7 @@ spec: image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" imagePullPolicy: IfNotPresent command: - - executor + - executorv2 - serve - --config - /etc/config/*.yaml @@ -5671,7 +6027,7 @@ spec: template: metadata: annotations: - configChecksum: "e957bf97e34bc517eb848291a4518edd796c12a83ea838157152d802194bf58" + configChecksum: "7dce40cadaab9d761e4b05c6e57bfb4d664999f4293ef014176d1b8fe03e595" labels: @@ -5687,12 +6043,10 @@ spec: sources: - configMap: name: union-operator - - configMap: - name: union-clusterresourcesync-config - name: secret-volume secret: secretName: union-secret-auth - serviceAccountName: proxy-system + serviceAccountName: union-system securityContext: {} containers: @@ -5809,7 +6163,7 @@ spec: template: metadata: annotations: - configChecksum: "e957bf97e34bc517eb848291a4518edd796c12a83ea838157152d802194bf58" + configChecksum: "7dce40cadaab9d761e4b05c6e57bfb4d664999f4293ef014176d1b8fe03e595" labels: @@ -5819,7 +6173,7 @@ spec: platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm spec: - serviceAccountName: operator-system + serviceAccountName: union-system securityContext: {} volumes: @@ -5900,81 +6254,16 @@ spec: containerPort: 10254 protocol: TCP --- -# Source: dataplane/templates/prometheus/deployment.yaml -apiVersion: apps/v1 -kind: Deployment -metadata: - name: union-operator-prometheus - namespace: union - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane - 
app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus -spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/component: prometheus - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - template: - metadata: - annotations: - configChecksum: "72a39ca838f02fc499675b5708fa77e2a4938e316a99315b59e33ef7f2d31ae" - labels: - app.kubernetes.io/component: prometheus - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - spec: - priorityClassName: system-cluster-critical - serviceAccountName: union-operator-prometheus - securityContext: - fsGroup: 65534 - runAsNonRoot: true - runAsUser: 65534 - fsGroupChangePolicy: OnRootMismatch - containers: - - name: prometheus - image: "prom/prometheus:v3.3.1" - args: - - --config.file=/etc/prometheus/prometheus.yml - - --web.external-url=/prometheus/ - - --web.route-prefix=/prometheus/ - - --storage.tsdb.retention.time=3d - ports: - - name: http - containerPort: 9090 - protocol: TCP - resources: - limits: - cpu: "3" - memory: 3500Mi - requests: - cpu: "1" - memory: 1Gi - securityContext: - allowPrivilegeEscalation: false - terminationMessagePolicy: FallbackToLogsOnError - volumeMounts: - - mountPath: /etc/prometheus - name: prometheus-config - volumes: - - name: prometheus-config - configMap: - name: union-operator-prometheus ---- -# Source: dataplane/templates/propeller/deployment-webhook.yaml +# Source: dataplane/templates/webhook/deployment.yaml +# Webhook deployment # Create the actual deployment apiVersion: apps/v1 kind: Deployment metadata: - name: flytepropeller-webhook + name: union-pod-webhook namespace: union labels: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm @@ -5982,7 
+6271,7 @@ spec: replicas: 1 selector: matchLabels: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name template: metadata: @@ -5990,12 +6279,12 @@ spec: platform.union.ai/zone: "dataplane" azure.workload.identity/use: "true" - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm annotations: - configChecksum: "933b7ca10215c494cba6ea2776ebc555d5ec87de998392bd86a452aa81abe90" + configChecksum: "47b0ca630b85f3918c4871e21cdd2c7cd448b05b7ef3eca9abc7e1490808e1e" spec: securityContext: @@ -6003,64 +6292,8 @@ spec: fsGroupChangePolicy: Always runAsNonRoot: true runAsUser: 1001 - seLinuxOptions: - type: spc_t - serviceAccountName: flytepropeller-webhook-system - initContainers: - - name: generate-secrets - image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" - imagePullPolicy: "IfNotPresent" - command: - - flytepropeller - args: - - webhook - - init-certs - - --config - - /etc/flyte/config/*.yaml - env: - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: GOMEMLIMIT - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.memory - - name: GOMAXPROCS - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.cpu - - name: CLUSTER_NAME - valueFrom: - secretKeyRef: - name: operator-cluster-name - key: cluster_name - - name: DEPLOYMENT_NAME - value: operator - - name: PROXY_SERVICE_URL - value: http://union-operator-proxy:8080 - - name: PROMETHEUS_SERVICE_URL - value: http://union-operator-prometheus:80 - - name: KNATIVE_PROXY_SERVICE_URL - value: http://kourier-internal - volumeMounts: - - name: config-volume - mountPath: /etc/flyte/config - resources: - limits: - cpu: 1 - ephemeral-storage: 500Mi - memory: 500Mi - requests: 
- cpu: 200m - ephemeral-storage: 500Mi - memory: 500Mi + seLinuxOptions: null + serviceAccountName: union-system containers: - name: webhook image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" @@ -6125,108 +6358,301 @@ spec: volumes: - name: config-volume configMap: - name: flyte-propeller-config + name: union-pod-webhook-config - name: webhook-certs secret: - secretName: flyte-pod-webhook + secretName: union-pod-webhook --- -# Source: dataplane/templates/propeller/deployment.yaml -apiVersion: apps/v1 -kind: Deployment +# Source: dataplane/templates/flyteconnector/hpa.yaml +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler metadata: - namespace: union - name: flytepropeller + name: flyteconnector + labels: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + app.kubernetes.io/managed-by: Helm +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: flyteconnector + minReplicas: 2 + maxReplicas: 5 + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 80 + - type: Resource + resource: + name: memory + target: + type: Utilization + averageUtilization: 80 +--- +# Source: dataplane/templates/webhook/mutatingwebhookconfiguration.yaml +apiVersion: admissionregistration.k8s.io/v1 +kind: MutatingWebhookConfiguration +metadata: + name: union-pod-webhook-test-org labels: - app.kubernetes.io/name: flytepropeller + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm +webhooks: + - name: union-pod-webhook.flyte.org + admissionReviewVersions: + - v1 + - v1beta1 + clientConfig: + caBundle: 
LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUROekNDQWgrZ0F3SUJBZ0lVSm9kQ09lem4vd1BHQTZjYkw1U3duSlpYT2hJd0RRWUpLb1pJaHZjTkFRRUwKQlFBd0t6RXBNQ2NHQTFVRUF3d2dabXg1ZEdWd2NtOXdaV3hzWlhJdGQyVmlhRzl2YXk1MWJtbHZiaTV6ZG1NdwpIaGNOTWpZd01USXpNREl6TmpVMVdoY05Nell3TVRJeE1ESXpOalUxV2pBck1Ta3dKd1lEVlFRRERDQm1iSGwwClpYQnliM0JsYkd4bGNpMTNaV0pvYjI5ckxuVnVhVzl1TG5OMll6Q0NBU0l3RFFZSktvWklodmNOQVFFQkJRQUQKZ2dFUEFEQ0NBUW9DZ2dFQkFJcjBSWDRGRjFNVTNYdFBmUHErMDUrMGI1Qm0xK3hsTVR5ZUp4bG5mWFJuRlgyegpPcHV6NmsreUpBQk4xRkxmNDVYRnlJclFlaW5DWWJtckZXalFRc3ZDWEloNkpPNGMwdmRMVVU5RERYcldWRFlsCkRNZytTUDlJVnZBMm5USkVFZ2RFREVsKzNWeThUbUJoNlNmcGthcXdHajY5SmhIdy9OeE9yZzM2bUY3TU5VZ1MKWXJVUkxqUFFzTUN6NElTVzhhQnEzS2IwRUFWUTA4V1lZQ0ZEbjY1aHBzVFdxR05rL3BKRnVTYjlsZll6RVdyYwpJeW5wNDhqTnIvNUdKS3NnWG9LeTUwZVo4UkppeTVyOUhENGtnSW15TVNQR2RDMXFZSVppRHRlZW8yOElxNk5nCmd1dWcyTGhlNndYYW9YZXFCQ2d3SHpwWFVXM1hJVzI5eFoxUkFRRUNBd0VBQWFOVE1GRXdIUVlEVlIwT0JCWUUKRktITDQ1RWhQaVg5bnlRN241QUk0YUZVNFlCL01COEdBMVVkSXdRWU1CYUFGS0hMNDVFaFBpWDlueVE3bjVBSQo0YUZVNFlCL01BOEdBMVVkRXdFQi93UUZNQU1CQWY4d0RRWUpLb1pJaHZjTkFRRUxCUUFEZ2dFQkFEeTRac21XCkNjZ3B5TmI4cU1ydzNYclBNejZLMTRlTVhqQVk1bERFYXM1NklLZExoZFM1bi9sWk95bENuUUtOZ3RHS0VrY3UKT05tMEE2Z0xNT3UzcE43dG9aNXh0WktOcnUvS0x6dm44QUNNNndCbUR0bG5oRERDa3ZUTHBVRzh6UTZSNmlaegpzTWlhVmpLSUJiWkJsZXNYSHZZQnQ2YlRTN2o1QWM2aTdYc1RPK3ZqQ2RUQjVYdCtEZ0x2QnBJRWh2ZGJzNXhmClFtc3ZzQmwxenJnR3JXemo4d0txcXNUMzZaOVU1TzdSK1RJMlQrUDVOR25SeHNQOGJ0R0RtMGZ5SW55WlJrSkcKVTdpVFJyMlZRWlRyaTNZQ2dlVjN1UjFtMnJjc2hyU1Avb082T0QyTXR5Q1FMRWtFK1pxVUVJVnpSYmVaNXl2RQptM29YMFV2OUtBUy9pWTA9Ci0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K + service: + name: union-pod-webhook + namespace: union + path: /mutate--v1-pod/secrets + port: 443 + failurePolicy: Fail + matchPolicy: Equivalent + namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: union + objectSelector: + matchLabels: + inject-flyte-secrets: "true" + organization: 'test-org' + reinvocationPolicy: Never + rules: + - apiGroups: + - '*' + apiVersions: + - v1 + operations: + - CREATE + 
resources: + - pods + scope: '*' + sideEffects: NoneOnDryRun + timeoutSeconds: 30 +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Imported via https://github.com/knative/operator/releases/download/knative-v1.16.0/operator.yaml +# Copyright 2024 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +# Source: dataplane/templates/propeller/serviceaccount.yaml +--- +--- +# Source: dataplane/templates/serving/knative-serving.yaml +apiVersion: operator.knative.dev/v1beta1 +kind: KnativeServing +metadata: + name: union-operator-serving + namespace: union + labels: + helm.sh/chart: dataplane-2026.4.4 + app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: serving spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/name: flytepropeller + config: + deployment: + progress-deadline: "30m" + queue-sidecar-cpu-request: "25m" + queue-sidecar-cpu-limit: "1000m" + queue-sidecar-memory-request: "400Mi" + queue-sidecar-memory-limit: "800Mi" + queue-sidecar-ephemeral-storage-request: "512Mi" + queue-sidecar-ephemeral-storage-limit: "1024Mi" + registries-skipping-tag-resolving: managed.cr.union.ai + features: + kubernetes.podspec-affinity: "enabled" + kubernetes.podspec-nodeselector: "enabled" + kubernetes.podspec-tolerations: "enabled" + kubernetes.podspec-fieldref: "enabled" 
+ kubernetes.podspec-dnspolicy: "enabled" + kubernetes.podspec-schedulername: "enabled" + kubernetes.podspec-securitycontext: "enabled" + network: + ingress-class: "kourier.ingress.networking.knative.dev" + high-availability: + replicas: 2 + ingress: + kourier: + enabled: true + bootstrap-configmap: "union-operator-serving-envoy-bootstrap" + service-type: ClusterIP + podDisruptionBudgets: + - name: 3scale-kourier-gateway-pdb + minAvailable: 50% + - name: activator-pdb + minAvailable: 50% + - name: webhook-pdb + minAvailable: 50% + registry: + override: + # TODO(jeev): Wire up Union fork of Envoy + 3scale-kourier-gateway/kourier-gateway: ghcr.io/unionai/envoy:456fed84d4ad9a9dfb186d117d9362e9dc0f7c1f + # TODO(jeev): Wire up Union fork of Kourier + net-kourier-controller/controller: ghcr.io/unionai/kourier@sha256:5804c348d15b3959604e3e3ceed216c3a1c7b32cbe254c7d3eb02a35e62ba9c4 + workloads: + - name: 3scale-kourier-gateway + labels: + helm.sh/chart: dataplane-2026.4.4 + app.kubernetes.io/name: release-name-dataplane app.kubernetes.io/instance: release-name - template: - metadata: - annotations: - configChecksum: "933b7ca10215c494cba6ea2776ebc555d5ec87de998392bd86a452aa81abe90" - - labels: - platform.union.ai/zone: "dataplane" - - - azure.workload.identity/use: "true" - app.kubernetes.io/name: flytepropeller - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm - spec: - priorityClassName: system-cluster-critical - containers: - - command: - - flytepropeller - - --config - - /etc/flyte/config/*.yaml - - --propeller.cluster-id - - 'test-azure-cluster' - env: - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: GOMEMLIMIT - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.memory - - name: GOMAXPROCS - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.cpu - - name: 
CLUSTER_NAME - valueFrom: - secretKeyRef: - name: operator-cluster-name - key: cluster_name - - name: DEPLOYMENT_NAME - value: operator - - name: PROXY_SERVICE_URL - value: http://union-operator-proxy:8080 - - name: PROMETHEUS_SERVICE_URL - value: http://union-operator-prometheus:80 - - name: KNATIVE_PROXY_SERVICE_URL - value: http://kourier-internal - image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" - imagePullPolicy: "IfNotPresent" - name: flytepropeller - ports: - - containerPort: 10254 - resources: - limits: - cpu: "3" - memory: 3Gi - requests: - cpu: "1" - memory: 1Gi - volumeMounts: - - name: config-volume - mountPath: /etc/flyte/config - - name: auth - mountPath: /etc/union/secret - serviceAccountName: flytepropeller-system - volumes: - - configMap: - name: flyte-propeller-config - name: config-volume - - name: auth - secret: - secretName: union-secret-auth + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: 3scale-kourier-gateway + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchLabels: + app.kubernetes.io/name: 3scale-kourier-gateway + app.kubernetes.io/instance: release-name + topologyKey: topology.kubernetes.io/zone + annotations: + checksum/bootstrap-config: 6469f6f592eebc9e0c676d8ccc359a05bc799c0988e474b2da942c4cc328f656 + env: + - container: kourier-gateway + envVars: + - name: UNION_AUTHZ_TENANTAUTHURL + value: "https://test.dataplane.union.ai/me" + - name: UNION_AUTHZ_TENANTAUTHSIGNINURL + value: "https://test.dataplane.union.ai/login" + - name: UNION_AUTHZ_TENANTCONTROLPLANEURL + value: "https://test.dataplane.union.ai" + resources: + - container: kourier-gateway + limits: + cpu: "2" + memory: 2Gi + requests: + cpu: "1" + memory: 1Gi + - name: activator + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + 
matchExpressions: + - key: app + operator: In + values: + - activator + topologyKey: topology.kubernetes.io/zone + - name: autoscaler + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - autoscaler + topologyKey: topology.kubernetes.io/zone + - name: autoscaler-hpa + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - autoscaler-hpa + topologyKey: topology.kubernetes.io/zone + - name: controller + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - controller + topologyKey: topology.kubernetes.io/zone + - name: net-kourier-controller + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - net-kourier-controller + topologyKey: topology.kubernetes.io/zone + env: + - container: controller + envVars: + - name: KOURIER_UNION_AUTHZ_ENABLED + value: "true" + resources: + - container: controller + limits: + cpu: "1" + memory: 1Gi + requests: + cpu: 500m + memory: 500Mi + - name: webhook + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - webhook + topologyKey: topology.kubernetes.io/zone +--- +# Source: dataplane/templates/common/task-podtemplate.yaml +apiVersion: v1 +kind: PodTemplate +metadata: + name: task-template + namespace: union +template: + spec: + serviceAccountName: union + containers: + - name: default + image: docker.io/rwgrim/docker-noop --- # Source: 
dataplane/charts/fluentbit/templates/tests/test-connection.yaml apiVersion: v1 diff --git a/tests/generated/dataplane.azure.yaml b/tests/generated/dataplane.azure.yaml index db7955e3..4ce1a186 100644 --- a/tests/generated/dataplane.azure.yaml +++ b/tests/generated/dataplane.azure.yaml @@ -1,169 +1,140 @@ --- -# Source: dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: flytesnacks-development ---- -# Source: dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: flytesnacks-staging ---- -# Source: dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: flytesnacks-production ---- -# Source: dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: union-health-monitoring-development ---- -# Source: dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: union-health-monitoring-staging ---- -# Source: dataplane/templates/common/namespaces.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ apiVersion: v1 -kind: Namespace +kind: ServiceAccount metadata: - name: union-health-monitoring-production + name: knative-operator + namespace: "union" + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator --- -# Source: dataplane/charts/fluentbit/templates/serviceaccount.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + apiVersion: v1 kind: ServiceAccount metadata: - name: fluentbit-system - namespace: union + name: operator-webhook + namespace: "union" labels: - helm.sh/chart: fluentbit-0.48.9 - app.kubernetes.io/name: fluentbit - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "3.2.8" - app.kubernetes.io/managed-by: Helm - annotations: - azure.workload.identity/client-id: test-backend-client-id + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator --- -# Source: dataplane/charts/kube-state-metrics/templates/serviceaccount.yaml +# Source: dataplane/charts/prometheus/charts/kube-state-metrics/templates/serviceaccount.yaml apiVersion: v1 kind: ServiceAccount automountServiceAccountToken: true metadata: labels: - helm.sh/chart: kube-state-metrics-5.30.1 + helm.sh/chart: kube-state-metrics-5.25.1 app.kubernetes.io/managed-by: Helm app.kubernetes.io/component: metrics app.kubernetes.io/part-of: kube-state-metrics app.kubernetes.io/name: kube-state-metrics 
app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" + app.kubernetes.io/version: "2.13.0" name: release-name-kube-state-metrics namespace: union --- -# Source: dataplane/charts/opencost/templates/serviceaccount.yaml +# Source: dataplane/charts/prometheus/templates/serviceaccount.yaml apiVersion: v1 kind: ServiceAccount metadata: - name: release-name-opencost - namespace: union labels: - helm.sh/chart: opencost-1.42.0 - app.kubernetes.io/name: opencost + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "1.111.0" - app.kubernetes.io/part-of: opencost + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 app.kubernetes.io/managed-by: Helm -automountServiceAccountToken: true ---- -# Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: union-clustersync-system + app.kubernetes.io/part-of: prometheus + name: union-operator-prometheus namespace: union annotations: - azure.workload.identity/client-id: 'test-backend-client-id' ---- -# Source: dataplane/templates/imagebuilder/serviceaccount.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: union-imagebuilder + {} --- -# Source: dataplane/templates/nodeexecutor/serviceaccount.yaml +# Source: dataplane/templates/common/system-serviceaccount.yaml apiVersion: v1 kind: ServiceAccount metadata: - name: executor + name: union-system namespace: union - labels: - app: executor - annotations: - azure.workload.identity/client-id: 'test-backend-client-id' ---- -# Source: dataplane/templates/operator/serviceaccount-proxy.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: proxy-system - labels: - app.kubernetes.io/name: operator-proxy - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm annotations: azure.workload.identity/client-id: 
'test-backend-client-id' --- -# Source: dataplane/templates/operator/serviceaccount.yaml +# Source: dataplane/templates/common/union-serviceaccount.yaml apiVersion: v1 kind: ServiceAccount metadata: - name: operator-system - labels: - app.kubernetes.io/name: union-operator - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm + name: union + namespace: union annotations: - azure.workload.identity/client-id: 'test-backend-client-id' + azure.workload.identity/client-id: test-worker-client-id +automountServiceAccountToken: true --- -# Source: dataplane/templates/prometheus/serviceaccount.yaml +# Source: dataplane/templates/flyteconnector/serviceaccount.yaml apiVersion: v1 kind: ServiceAccount metadata: - name: union-operator-prometheus + name: flyteconnector namespace: union - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane + labels: + app.kubernetes.io/name: flyteconnector app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus ---- -# Source: dataplane/templates/propeller/serviceaccount-webhook.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: flytepropeller-webhook-system - namespace: union - annotations: - azure.workload.identity/client-id: 'test-backend-client-id' --- -# Source: dataplane/templates/propeller/serviceaccount.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + apiVersion: v1 -kind: ServiceAccount +kind: Secret metadata: - name: flytepropeller-system - namespace: union - annotations: - azure.workload.identity/client-id: 'test-backend-client-id' + name: operator-webhook-certs + namespace: "union" + labels: + app.kubernetes.io/component: operator-webhook + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +# The data is populated at install time. --- # Source: dataplane/templates/common/auth-secret.yaml apiVersion: v1 @@ -186,154 +157,533 @@ type: Opaque data: cluster_name: dGVzdC1henVyZS1jbHVzdGVy --- -# Source: dataplane/templates/propeller/deployment-webhook.yaml -# Create an empty secret that the first propeller pod will populate +# Source: dataplane/templates/webhook/mutatingwebhookconfiguration.yaml apiVersion: v1 kind: Secret metadata: - name: flyte-pod-webhook + name: union-pod-webhook namespace: union + labels: + app.kubernetes.io/name: union-pod-webhook + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm type: Opaque +data: + ca.crt: 
LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUROekNDQWgrZ0F3SUJBZ0lVSm9kQ09lem4vd1BHQTZjYkw1U3duSlpYT2hJd0RRWUpLb1pJaHZjTkFRRUwKQlFBd0t6RXBNQ2NHQTFVRUF3d2dabXg1ZEdWd2NtOXdaV3hzWlhJdGQyVmlhRzl2YXk1MWJtbHZiaTV6ZG1NdwpIaGNOTWpZd01USXpNREl6TmpVMVdoY05Nell3TVRJeE1ESXpOalUxV2pBck1Ta3dKd1lEVlFRRERDQm1iSGwwClpYQnliM0JsYkd4bGNpMTNaV0pvYjI5ckxuVnVhVzl1TG5OMll6Q0NBU0l3RFFZSktvWklodmNOQVFFQkJRQUQKZ2dFUEFEQ0NBUW9DZ2dFQkFJcjBSWDRGRjFNVTNYdFBmUHErMDUrMGI1Qm0xK3hsTVR5ZUp4bG5mWFJuRlgyegpPcHV6NmsreUpBQk4xRkxmNDVYRnlJclFlaW5DWWJtckZXalFRc3ZDWEloNkpPNGMwdmRMVVU5RERYcldWRFlsCkRNZytTUDlJVnZBMm5USkVFZ2RFREVsKzNWeThUbUJoNlNmcGthcXdHajY5SmhIdy9OeE9yZzM2bUY3TU5VZ1MKWXJVUkxqUFFzTUN6NElTVzhhQnEzS2IwRUFWUTA4V1lZQ0ZEbjY1aHBzVFdxR05rL3BKRnVTYjlsZll6RVdyYwpJeW5wNDhqTnIvNUdKS3NnWG9LeTUwZVo4UkppeTVyOUhENGtnSW15TVNQR2RDMXFZSVppRHRlZW8yOElxNk5nCmd1dWcyTGhlNndYYW9YZXFCQ2d3SHpwWFVXM1hJVzI5eFoxUkFRRUNBd0VBQWFOVE1GRXdIUVlEVlIwT0JCWUUKRktITDQ1RWhQaVg5bnlRN241QUk0YUZVNFlCL01COEdBMVVkSXdRWU1CYUFGS0hMNDVFaFBpWDlueVE3bjVBSQo0YUZVNFlCL01BOEdBMVVkRXdFQi93UUZNQU1CQWY4d0RRWUpLb1pJaHZjTkFRRUxCUUFEZ2dFQkFEeTRac21XCkNjZ3B5TmI4cU1ydzNYclBNejZLMTRlTVhqQVk1bERFYXM1NklLZExoZFM1bi9sWk95bENuUUtOZ3RHS0VrY3UKT05tMEE2Z0xNT3UzcE43dG9aNXh0WktOcnUvS0x6dm44QUNNNndCbUR0bG5oRERDa3ZUTHBVRzh6UTZSNmlaegpzTWlhVmpLSUJiWkJsZXNYSHZZQnQ2YlRTN2o1QWM2aTdYc1RPK3ZqQ2RUQjVYdCtEZ0x2QnBJRWh2ZGJzNXhmClFtc3ZzQmwxenJnR3JXemo4d0txcXNUMzZaOVU1TzdSK1RJMlQrUDVOR25SeHNQOGJ0R0RtMGZ5SW55WlJrSkcKVTdpVFJyMlZRWlRyaTNZQ2dlVjN1UjFtMnJjc2hyU1Avb082T0QyTXR5Q1FMRWtFK1pxVUVJVnpSYmVaNXl2RQptM29YMFV2OUtBUy9pWTA9Ci0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K + tls.crt: 
LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUQxakNDQXI2Z0F3SUJBZ0lVREtRN3A4MFpIdkQyUE5Ya3p3VGRLem9iUjg4d0RRWUpLb1pJaHZjTkFRRUwKQlFBd0t6RXBNQ2NHQTFVRUF3d2dabXg1ZEdWd2NtOXdaV3hzWlhJdGQyVmlhRzl2YXk1MWJtbHZiaTV6ZG1NdwpIaGNOTWpZd01USXpNREl6TmpVMVdoY05NamN3TVRJek1ESXpOalUxV2pBck1Ta3dKd1lEVlFRRERDQm1iSGwwClpYQnliM0JsYkd4bGNpMTNaV0pvYjI5ckxuVnVhVzl1TG5OMll6Q0NBU0l3RFFZSktvWklodmNOQVFFQkJRQUQKZ2dFUEFEQ0NBUW9DZ2dFQkFMNnMxUTRaZ3ZxVVdNdFp5VC9EKytEQWl6WmR0Q1lkVzArSGk4S3phQ1Vjbi9zZwo3a21NTEhDM2ZlZWtZK2ZUNENHUTd6K1l2WjRWWSt6WGljKzdrNndLbUZqdm1acEM1Rm96OWo1Q21NRk5HZmlWCnRobzVvSU9MUkIzNXRYMU1ROHUwNFVzT29QcjdMK016eEdxWXVmTFdtN1NVdjh0eEp3ZFMxZVlzMllNV1pMeU4KekI4akdWY01Qc2ZNVkdxdFhTTVljQ3hNYURKaTM1N0VEZSs3clM2U2ZtbnFtYjJxekdQM1BRRStCUm91Yk5nLwo4NCt4c253VXBLZUFpem95dFM2c2ZlOTdMUEtRTW9oVzd2TjZORFJrLy9xSmpHem9iZmZwUHU3endhWHVXMWtMCmllc2g2cVRrdlZLSTF3WFJ2WHArVU5GbjZzdDIwdEVlRks0ZzMwOENBd0VBQWFPQjhUQ0I3akFKQmdOVkhSTUUKQWpBQU1Bc0dBMVVkRHdRRUF3SUY0RENCa3dZRFZSMFJCSUdMTUlHSWdoWm1iSGwwWlhCeWIzQmxiR3hsY2kxMwpaV0pvYjI5cmdoeG1iSGwwWlhCeWIzQmxiR3hsY2kxM1pXSm9iMjlyTG5WdWFXOXVnaUJtYkhsMFpYQnliM0JsCmJHeGxjaTEzWldKb2IyOXJMblZ1YVc5dUxuTjJZNEl1Wm14NWRHVndjbTl3Wld4c1pYSXRkMlZpYUc5dmF5NTEKYm1sdmJpNXpkbU11WTJ4MWMzUmxjaTVzYjJOaGJEQWRCZ05WSFE0RUZnUVV1d25wcDZ1MTNKL0NsNnI3eDZ1Sgo5YmZ1RDhzd0h3WURWUjBqQkJnd0ZvQVVvY3Zqa1NFK0pmMmZKRHVma0FqaG9WVGhnSDh3RFFZSktvWklodmNOCkFRRUxCUUFEZ2dFQkFFMmlOa1MzczhGT3Z0cDhrVmJNZUN2amxIQytvYTVlZTFGNmowNlZ3bFJPOWdxOXJla1UKMGptWUhNWGNjWjcrZXJkNFRQRXgwK3ZqMzNhRTV3cDFmdVJJL2xIZHhuOUpJRHg2bTFQTHdVa3p1WG9EekYxLwozVXVIWU1xOGkrT2xsY0tqbmdVMUFUNmRWRCtIbFVWeHpMdjlNazdPb05SQUY5dTJHa1hiUHdCN0I1ZGRxdWxlCmtNcXBxbDhES2s4SVNVelVsTjNBb2Q5NmJUelIyV3RCVkxpTi9zRHZtTDZwZThFYm55RnBUb1BZTDlGeWZOcmIKT1ZPZEJwUkZFTG1KOHZaZWY5d1lxRmVXMTE3SGV5VDArZkxGZVNSemZUNVVWUlVraVFBcUVJdUhHUVRrOVFabgo1WnYvNUF0eEg3WE1rcXR0VE1zMFlSUG0xYjQzY3FBeFdhND0KLS0tLS1FTkQgQ0VSVElGSUNBVEUtLS0tLQo= + tls.key: 
LS0tLS1CRUdJTiBQUklWQVRFIEtFWS0tLS0tCk1JSUV2UUlCQURBTkJna3Foa2lHOXcwQkFRRUZBQVNDQktjd2dnU2pBZ0VBQW9JQkFRQytyTlVPR1lMNmxGakwKV2NrL3cvdmd3SXMyWGJRbUhWdFBoNHZDczJnbEhKLzdJTzVKakN4d3QzM25wR1BuMCtBaGtPOC9tTDJlRldQcwoxNG5QdTVPc0NwaFk3NW1hUXVSYU0vWStRcGpCVFJuNGxiWWFPYUNEaTBRZCtiVjlURVBMdE9GTERxRDYreS9qCk04UnFtTG55MXB1MGxML0xjU2NIVXRYbUxObURGbVM4amN3Zkl4bFhERDdIekZScXJWMGpHSEFzVEdneVl0K2UKeEEzdnU2MHVrbjVwNnBtOXFzeGo5ejBCUGdVYUxtellQL09Qc2JKOEZLU25nSXM2TXJVdXJIM3ZleXp5a0RLSQpWdTd6ZWpRMFpQLzZpWXhzNkczMzZUN3U4OEdsN2x0WkM0bnJJZXFrNUwxU2lOY0YwYjE2ZmxEUlorckxkdExSCkhoU3VJTjlQQWdNQkFBRUNnZ0VBRjV2SnMzbTRMSE9DdlFic2NwOC9DQmgvQkNlOC92MGlpYW5VUmJLMkFlWlYKN3Arb3NXV1FPUktYSGIvT2VPMVVjb09DQkFOUzh3aGQrM3pDZlB5U0w3cU9HM0RyT1Z6djdqVkIxM3FpZEVpcQpId3ZXWk0vZXpuckhYOWpEdm5SYmJwVUNVaXRKQmxwa2x3S1pYc056UHB5UTRkNkxFdEw3VEo1V2lxM2g3cWQzCjhncmJoeWlWNDhZRFAyYkhpZGt0S2QvMHBNL3piSzZPTS9iMFpLUmRBWVpHQXlOcklMaFJ3eDNsQXE2SDVqL20KUHBuNlZEdVQ3Y1h6UFhmNTgxR0IyNjQxWFJVTXRNWGVEREtaU3JudnIrbG5tV2lsYXZmNkg3ZHp4NlRBMmk0RApybCtIRVdUN1RSQUZGMnZ5SVUrYSt5RzNPVFBKRCtmVTdCWWh4Zld4aFFLQmdRRGVvcXVCbzQxK1dKU255blZnCmNWMHZoUEkwZ0dyRGQzNFdIbCtGNTZuSmxuMVVlbUQ2SUQ0TUhnNVpHWmNmd3NkRlB0aHl1VnB0OVl1SW11SGQKRHE5ZDhpWnpHUzJVbStkSWlsWjhnZXJ5Q3dON3hIbFM3SzJrc0RmRGJwYmZZSXZ1LzRnSExZRHh3YzQ0OG54ZwpWNGpCUFY2QTdYTHRqQzZzdFE2OW9uTlIwd0tCZ1FEYlFBUTRGdWpnUjJUOXh2Y01MdXhMbm1YaHE4U2NoZlM5CkRoWndDUzZIeHB1YlBxblBjQ04vQUZjbDZ1cDRXVEVZdVY4bEpwcGwwRmpYQ2lMdkhuUU15cm5vRjlUOThpanQKZGVHYjF6ZWlwRUpndFpNVHkvOGdERWorZU1wSFBuYTZwQVpMOVpRVUdLR2dDNGQ0SzljdVNHMmRlb1dSU0ViSQpMVGgzeGpXVEZRS0JnQStJRWFidG5nVmVjS0IwQTFSREZGa29VUzFRZUNKQ3g4MExPV2JDRHBvOW9XaXZVT3lpCkt6SDFOdE1JY2Y2SlBCV2NtTVVJSVVMaWltVnhTS2gvU2NTb0MvNmpsd1p6Q2VPSm94YjBpVXR4Y1VERktDR2MKMlZCUDZ0UDdkeE1HVFR6VEhzNUJZbWw3TjhQSlJ0d0J2MHliMTJmdktNRmhzaS9pUWJFQkVFSjVBb0dBUm9NSApHRmJkM0V0NXdsZzcyYkk1a25SRnhkY3RLejIxb1J1bndhNWlSWTV0T3Zkak8zQ1FLZWNkSC9lMklyQmtwdFB2Ck1vNkF0MS9UUW8xakFNNGxlbnUwWUYxUnhiNGN4WW5VM2Y3UVNNRlZDNjg1dHZNemdNWVNyNngvT1h4d1NNUTUKdGpJcnhtN0poQ1JSRkNmZFUyZzl3SmpIM2hxRmtSbGlBTHRCUGFVQ2dZ
RUF2QkRHejQvYS9kRzRJT0VpREk4SgpBbkxaRWZaTFJrdnI0ejRyT0p0TGZtWlFjTW5DSUZmKzBaelhTME9XOWhvMjk5RUNtS1l4U2VUWk1WSDgzOFZCCnRvSkNGM3lzb01rSC9meEVXTUpicDlMblJwSFUzUElicHlhRG1nWW1YdmsvK3laL3Y1c0dDTWNtVGkzWVFLRGsKcStlK1o0SHZyaGUxVytUaHpwOCtoNGs9Ci0tLS0tRU5EIFBSSVZBVEUgS0VZLS0tLS0K --- -# Source: dataplane/templates/clusterresourcesync/configmap.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2019 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ apiVersion: v1 kind: ConfigMap metadata: - name: union-clusterresourcesync-config - namespace: union + name: config-logging + namespace: "union" labels: - app.kubernetes.io/name: clusterresourcesync - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm -data: - cluster_resources.yaml: | - cluster_resources: - clusterName: 'test-azure-cluster' - customData: - - production: - - projectQuotaCpu: - value: "4096" - - projectQuotaMemory: - value: 2Ti - - projectQuotaNvidiaGpu: - value: "256" - - defaultUserRoleKey: - value: 'azure.workload.identity/client-id' - - defaultUserRoleValue: - value: 'test-worker-client-id' - - staging: - - projectQuotaCpu: - value: "4096" - - projectQuotaMemory: - value: 2Ti - - projectQuotaNvidiaGpu: - value: "256" - - defaultUserRoleKey: - value: 'azure.workload.identity/client-id' - - defaultUserRoleValue: - value: 'test-worker-client-id' - - development: - - projectQuotaCpu: - value: "4096" - - projectQuotaMemory: - value: 2Ti - - projectQuotaNvidiaGpu: - value: "256" - - defaultUserRoleKey: - value: 'azure.workload.identity/client-id' - - defaultUserRoleValue: - value: 'test-worker-client-id' - refreshInterval: 5m - standaloneDeployment: true - templatePath: /etc/flyte/clusterresource/templates - clusterResourcesPrivate: - app: - isServerless: false - union: - auth: - authorizationMetadataKey: flyte-authorization - clientId: 'test-client-id' - clientSecretLocation: /etc/union/secret/client_secret - tokenRefreshWindow: 5m - type: ClientSecret - connection: - host: dns:///test.dataplane.union.ai - admin.yaml: | - admin: - clientId: 'test-client-id' - clientSecretLocation: /etc/union/secret/client_secret - endpoint: dns:///test.dataplane.union.ai - insecure: false - event: - capacity: 1000 - rate: 500 - type: admin - domain.yaml: | - domains: - - id: development - name: development - - id: staging - name: staging - - id: production - name: production - 
namespace_config.yaml: | - namespace_mapping: - template: '{{`{{ domain }}`}}' - clusters.yaml: | - clusters: - clusterConfigs: [] - labelClusterMap: {} - logger.yaml: | - logger: - formatter: - type: json - level: 4 - show-source: true + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +data: {} --- -# Source: dataplane/templates/clusterresourcesync/configmap.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2019 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ apiVersion: v1 kind: ConfigMap metadata: - name: union-clusterresource-template - namespace: union + name: config-observability + namespace: "union" labels: - app.kubernetes.io/name: clusterresourcesync + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +data: {} +--- +# Source: dataplane/charts/prometheus/templates/cm.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + labels: + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: prometheus + name: union-operator-prometheus + namespace: union data: - a_namespace.yaml: | - apiVersion: v1 - kind: Namespace - metadata: - name: {{ namespace }} - labels: - union.ai/namespace-type: flyte - spec: - finalizers: - - kubernetes - - b_default_service_account.yaml: | - apiVersion: v1 - kind: ServiceAccount - metadata: - name: default - namespace: {{ namespace }} - annotations: - {{ defaultUserRoleKey }}: {{ defaultUserRoleValue }} + allow-snippet-annotations: "false" + alerting_rules.yml: | + {} + alerts: | + {} + prometheus.yml: | + global: + evaluation_interval: 1m + scrape_interval: 1m + scrape_timeout: 10s + rule_files: + - /etc/config/recording_rules.yml + - /etc/config/alerting_rules.yml + - /etc/config/rules + - /etc/config/alerts + scrape_configs: + - job_name: prometheus + static_configs: + - targets: + - localhost:9090 + - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + job_name: kubernetes-apiservers + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - action: keep + regex: default;kubernetes;https + source_labels: + - __meta_kubernetes_namespace + - __meta_kubernetes_service_name + - __meta_kubernetes_endpoint_port_name + scheme: https + tls_config: + ca_file: 
/var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + job_name: kubernetes-nodes + kubernetes_sd_configs: + - role: node + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - replacement: kubernetes.default.svc:443 + target_label: __address__ + - regex: (.+) + replacement: /api/v1/nodes/$1/proxy/metrics + source_labels: + - __meta_kubernetes_node_name + target_label: __metrics_path__ + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + job_name: kubernetes-nodes-cadvisor + kubernetes_sd_configs: + - role: node + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - replacement: kubernetes.default.svc:443 + target_label: __address__ + - regex: (.+) + replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor + source_labels: + - __meta_kubernetes_node_name + target_label: __metrics_path__ + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + - honor_labels: true + job_name: kubernetes-service-endpoints + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scrape + - action: drop + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) 
+ source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (.+?)(?::\d+)?;(\d+) + replacement: $1:$2 + source_labels: + - __address__ + - __meta_kubernetes_service_annotation_prometheus_io_port + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_service_name + target_label: service + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: node + - honor_labels: true + job_name: kubernetes-service-endpoints-slow + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) 
+ source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (.+?)(?::\d+)?;(\d+) + replacement: $1:$2 + source_labels: + - __address__ + - __meta_kubernetes_service_annotation_prometheus_io_port + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_service_name + target_label: service + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: node + scrape_interval: 5m + scrape_timeout: 30s + - honor_labels: true + job_name: prometheus-pushgateway + kubernetes_sd_configs: + - role: service + relabel_configs: + - action: keep + regex: pushgateway + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_probe + - honor_labels: true + job_name: kubernetes-services + kubernetes_sd_configs: + - role: service + metrics_path: /probe + params: + module: + - http_2xx + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_probe + - source_labels: + - __address__ + target_label: __param_target + - replacement: blackbox + target_label: __address__ + - source_labels: + - __param_target + target_label: instance + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - source_labels: + - __meta_kubernetes_service_name + target_label: service + - honor_labels: true + job_name: kubernetes-pods + kubernetes_sd_configs: + - role: pod + relabel_configs: + - action: 
keep + regex: true + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scrape + - action: drop + regex: true + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) + replacement: '[$2]:$1' + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: replace + regex: (\d+);((([0-9]+?)(\.|$)){4}) + replacement: $2:$1 + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_pod_name + target_label: pod + - action: drop + regex: Pending|Succeeded|Failed|Completed + source_labels: + - __meta_kubernetes_pod_phase + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: node + - honor_labels: true + job_name: kubernetes-pods-slow + kubernetes_sd_configs: + - role: pod + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) 
+ source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) + replacement: '[$2]:$1' + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: replace + regex: (\d+);((([0-9]+?)(\.|$)){4}) + replacement: $2:$1 + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_pod_name + target_label: pod + - action: drop + regex: Pending|Succeeded|Failed|Completed + source_labels: + - __meta_kubernetes_pod_phase + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: node + scrape_interval: 5m + scrape_timeout: 30s + - job_name: 'kube-state-metrics' + metrics_path: /metrics + scrape_interval: 1m + honor_labels: true + kubernetes_sd_configs: + - role: endpoints + namespaces: + own_namespace: true + relabel_configs: + - source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_name] + regex: kube-state-metrics + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + regex: http + action: keep + metric_relabel_configs: + - source_labels: [__name__] + regex: 
"kube_pod_container_resource_(limits|requests)|kube_pod_status_phase|kube_node_(labels|status_allocatable|status_condition|status_capacity)|kube_namespace_labels|kube_pod_container_status_(waiting|terminated|last_terminated).*_reason|kube_daemonset_status_number_unavailable|kube_deployment_status_replicas_unavailable|kube_resourcequota|kube_pod_info|kube_node_info|kube_pod_container_status_restarts_total" + action: keep + - source_labels: [__name__, phase] + separator: ";" + regex: "kube_pod_status_phase;(Succeeded|Failed)" + action: drop + - source_labels: [node] + target_label: nodename + regex: "(.*)" + action: replace + - source_labels: [label_node_group_name] + action: replace + regex: "(.+)" + target_label: label_node_pool_name + - job_name: 'opencost' + metrics_path: /metrics + scrape_interval: 1m + honor_labels: true + kubernetes_sd_configs: + - role: endpoints + namespaces: + own_namespace: true + relabel_configs: + - source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_name] + regex: opencost + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + regex: http + action: keep + metric_relabel_configs: + - source_labels: [__name__] + regex: "kube_node_labels|kube_pod_labels|node_total_hourly_cost|node_ram_hourly_cost|node_cpu_hourly_cost|node_gpu_hourly_cost" + action: keep + - job_name: 'union-services' + metrics_path: /metrics + scrape_interval: 1m + honor_labels: true + kubernetes_sd_configs: + - role: endpoints + namespaces: + own_namespace: true + relabel_configs: + - source_labels: [__meta_kubernetes_service_label_platform_union_ai_service_group] + regex: .+ + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + regex: debug + action: keep + # Flyte propeller metrics for execution info and fast task duration. + # No-op when propeller is not deployed (no pods match the label selector). 
+ - job_name: 'flytepropeller' + kubernetes_sd_configs: + - role: pod + namespaces: + own_namespace: true + selectors: + - role: pod + label: app.kubernetes.io/name=flytepropeller + metric_relabel_configs: + - source_labels: [__name__] + regex: "flyte:propeller:all:round:execution_info|flyte:propeller:all:node:fast_task:fast_task_execution_duration" + action: keep + relabel_configs: + - source_labels: [__meta_kubernetes_pod_container_name] + regex: flytepropeller + action: keep + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: pod + # Kourier gateway (envoy) metrics for app serving. + # No-op when serving is not enabled (no pods match the label selector). + - job_name: 'serving-envoy' + kubernetes_sd_configs: + - role: pod + namespaces: + own_namespace: true + selectors: + - role: pod + label: app=3scale-kourier-gateway + metrics_path: /stats/prometheus + metric_relabel_configs: + - source_labels: [__name__] + regex: "envoy_cluster_upstream_rq_xx|envoy_cluster_upstream_rq_time_bucket" + action: keep + relabel_configs: + - source_labels: [__meta_kubernetes_pod_container_port_name] + regex: metrics + action: keep + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: pod + # DCGM GPU metrics. + # No-op when dcgm-exporter is not deployed (no pods match the label selector). 
+ - job_name: 'gpu-metrics' + kubernetes_sd_configs: + - role: pod + namespaces: + names: + - kube-system + selectors: + - role: pod + label: app.kubernetes.io/name=dcgm-exporter - c_project_resource_quota.yaml: | - apiVersion: v1 - kind: ResourceQuota - metadata: - name: project-quota - namespace: {{ namespace }} - spec: - hard: - limits.cpu: {{ projectQuotaCpu }} - limits.memory: {{ projectQuotaMemory }} - requests.nvidia.com/gpu: {{ projectQuotaNvidiaGpu }} + recording_rules.yml: | + {} + rules: | + {} --- # Source: dataplane/templates/fluent-bit/configmap.yaml apiVersion: v1 @@ -370,6 +720,36 @@ data: Skip_Long_Lines On Refresh_Interval 10 --- +# Source: dataplane/templates/imagebuilder/build-image-configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: build-image-config + labels: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +data: + storage.yaml: | + storage: + container: "" + container: 'test-metadata-container' + stow: + config: + account: 'test-storage-account' + kind: azure + type: stow + enable-multicontainer: true + limits: + maxDownloadMBs: 1024 + cache: + max_size_mbs: 0 + target_gc_percent: 70 + buildkit-uri: "tcp://union-operator-buildkit.union.svc.cluster.local:1234" + default-repository: "union-dataplane.azurecr.io" + authentication-type: "azure" +--- # Source: dataplane/templates/imagebuilder/configmap.yaml apiVersion: v1 kind: ConfigMap @@ -2526,6 +2906,18 @@ data: "version": 1 } --- +# Source: dataplane/templates/monitoring/prometheusrule.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: union-recording-rules + namespace: union + labels: + release: release-name +data: + recording_rules.yml: | + groups: [] +--- # Source: dataplane/templates/nodeexecutor/configmap.yaml apiVersion: v1 kind: ConfigMap @@ -2580,8 +2972,13 @@ data: templateUris: - https://scale.neptune.ai/{{ .taskConfig.project 
}}/-/run/?customId={{ .taskConfig.id }} - kubernetes-enabled: true + kubernetes-enabled: false enabled_plugins.yaml: | + plugins: + connector-service: + defaultConnector: + defaultServiceConfig: '{"loadBalancingConfig": [{"round_robin":{}}]}' + endpoint: k8s:///flyteconnector.union:8000 tasks: task-plugins: default-for-task-types: @@ -2614,6 +3011,8 @@ data: cpu: 4096 gpu: 256 memory: 2Ti + namespace_mapping: + template: union union: connection: host: dns:///test.dataplane.union.ai @@ -2634,7 +3033,7 @@ data: cache-endpoint: dns:///test.dataplane.union.ai endpoint: dns:///test.dataplane.union.ai insecure: false - type: fallback + type: cacheservicev2 use-admin-auth: true logger: formatter: @@ -2656,6 +3055,7 @@ data: secure: false useAuth: false propeller: + limit-namespace: union node-config: disable-input-file-writes: true plugins: @@ -2663,7 +3063,7 @@ data: additional-worker-args: - --last-ack-grace-period-seconds - "120" - callback-uri: http://unionai-dataplane-executor.union.svc.cluster.local:15605 + callback-uri: http://union-operator-executor.union.svc.cluster.local:15605 grace-period-status-not-found: 2m ioutils: remoteFileOutputPaths: @@ -2676,6 +3076,7 @@ data: default-labels: - azure.workload.identity/use: "true" default-memory: 100Mi + default-pod-template-name: task-template interruptible-node-selector-requirement: key: kubernetes.azure.com/scalesetpriority operator: In @@ -2728,6 +3129,7 @@ data: default-labels: - azure.workload.identity/use: "true" default-memory: 100Mi + default-pod-template-name: task-template interruptible-node-selector-requirement: key: kubernetes.azure.com/scalesetpriority operator: In @@ -2760,11 +3162,14 @@ data: operator: enabled: true enableTunnelService: true + # enableDepot: false tunnel: - enableDirectToAppIngress: false + enableDirectToAppIngress: true deploymentToRestart: union-operator-proxy + limitNamespace: union + disableClusterPermissions: true apps: - enabled: 'false' + enabled: 'true' syncClusterConfig: enabled: 
false clusterId: @@ -2788,14 +3193,12 @@ data: kind: azure type: stow collectUsages: - enabled: true + enabled: false billing: model: Legacy dependenciesHeartbeat: - prometheus: - endpoint: 'http://union-operator-prometheus:80/-/healthy' - propeller: - endpoint: 'http://flytepropeller:10254' + executor: + endpoint: 'http://union-operator-executor:10254' proxy: endpoint: 'http://union-operator-proxy:10254' org: @@ -2803,6 +3206,8 @@ data: secretsWatcher: dryRun: true enabled: false + org: + namespaceTemplate: union imageBuilder: enabled: true executionNamespaceLabels: @@ -2810,9 +3215,6 @@ data: referenceConfigmapName: union-operator targetConfigMapName: "build-image-config" proxy: - imageBuilderConfig: - authenticationType: 'noop' - defaultRepository: '' persistedLogs: azureLogAnalytics: logAnalyticsWorkspaceResourceIdTemplate: /subscriptions/test-subscription-id/resourceGroups/test-resource-group/providers/Microsoft.OperationalInsights/workspaces/union-test-org @@ -2862,989 +3264,149 @@ data: kind: azure type: stow image-builder.buildkit-uri: "tcp://union-operator-buildkit.union.svc.cluster.local:1234" - image-builder.default-repository: "" - image-builder.authentication-type: "noop" + image-builder.default-repository: "union-dataplane.azurecr.io" + image-builder.authentication-type: "azure" --- -# Source: dataplane/templates/prometheus/configmap.yaml +# Source: dataplane/templates/serving/bootstrap-configmap.yaml +# We need to copy the default configmap here since knative-operator does not automatically update the +# address of the `net-kourier-controller` endpoint to use the release namespace. It assumes that the +# resources are being installed into the `knative-serving` namespace instead. 
apiVersion: v1 kind: ConfigMap metadata: - name: union-operator-prometheus + name: union-operator-serving-envoy-bootstrap namespace: union - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus data: - prometheus.yml: | - global: - scrape_interval: 15s - evaluation_interval: 15s - alerting: - alertmanagers: - - static_configs: - - targets: - rule_files: - - rules.yml - scrape_configs: - # Self-monitoring - - job_name: prometheus - metrics_path: /prometheus/metrics - static_configs: - - targets: ['localhost:9090'] - metric_relabel_configs: - - source_labels: [__name__] - regex: prometheus_tsdb_head_chunks_storage_size_bytes|prometheus_tsdb_head_series|prometheus_tsdb_head_series_created_total|prometheus_tsdb_head_series_removed_total|prometheus_tsdb_head_samples_appended_total|prometheus_tsdb_storage_blocks_bytes|prometheus_tsdb_retention_limit_bytes|prometheus_tsdb_wal_(.*)|.+_(limit|failed|missed|failures)_total - action: keep + envoy-bootstrap.yaml: | + dynamic_resources: + ads_config: + transport_api_version: V3 + api_type: GRPC + rate_limit_settings: {} + grpc_services: + - envoy_grpc: {cluster_name: xds_cluster} + cds_config: + resource_api_version: V3 + ads: {} + lds_config: + resource_api_version: V3 + ads: {} + node: + cluster: kourier-knative + id: 3scale-kourier-gateway + static_resources: + listeners: + - name: stats_listener + address: + socket_address: + address: 0.0.0.0 + port_value: 9000 + filter_chains: + - filters: + - name: envoy.filters.network.http_connection_manager + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager + stat_prefix: stats_server + http_filters: + - name: envoy.filters.http.router + typed_config: + "@type": 
type.googleapis.com/envoy.extensions.filters.http.router.v3.Router + route_config: + virtual_hosts: + - name: admin_interface + domains: + - "*" + routes: + - match: + safe_regex: + regex: '/(certs|stats(/prometheus)?|server_info|clusters|listeners|ready)?' + headers: + - name: ':method' + string_match: + exact: GET + route: + cluster: service_stats + - match: + safe_regex: + regex: '/drain_listeners' + headers: + - name: ':method' + string_match: + exact: POST + route: + cluster: service_stats + clusters: + - name: service_stats + connect_timeout: 0.250s + type: static + load_assignment: + cluster_name: service_stats + endpoints: + lb_endpoints: + endpoint: + address: + socket_address: + address: 127.0.0.1 + port_value: 9901 + - name: xds_cluster + # This keepalive is recommended by envoy docs. + # https://www.envoyproxy.io/docs/envoy/latest/api-docs/xds_protocol + typed_extension_protocol_options: + envoy.extensions.upstreams.http.v3.HttpProtocolOptions: + "@type": type.googleapis.com/envoy.extensions.upstreams.http.v3.HttpProtocolOptions + explicit_http_config: + http2_protocol_options: + connection_keepalive: + interval: 30s + timeout: 5s + connect_timeout: 1s + load_assignment: + cluster_name: xds_cluster + endpoints: + lb_endpoints: + endpoint: + address: + socket_address: + address: "net-kourier-controller" + port_value: 18000 + type: STRICT_DNS + admin: + access_log: + - name: envoy.access_loggers.stdout + typed_config: + "@type": type.googleapis.com/envoy.extensions.access_loggers.stream.v3.StdoutAccessLog + address: + socket_address: + address: 127.0.0.1 + port_value: 9901 + stats_config: + stats_tags: + - tag_name: name + regex: '^.*?\.u/.*?n=(.*?)/.*?u\..*$' + - tag_name: domain + regex: '^.*?\.u/.*?d=(.*?)/.*?u\..*$' + - tag_name: org + regex: '^.*?\.u/.*?o=(.*?)/.*?u\..*$' + - tag_name: project + regex: '^.*?\.u/.*?p=(.*?)/.*?u\..*$' + - tag_name: target + regex: '^.*?\.u/.*?t=(.*?)/.*?u\..*$' + - tag_name: target_namespace + regex: 
'^.*?\.u/.*?tns=(.*?)/.*?u\..*$' +--- +# Source: dataplane/templates/webhook/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: union-pod-webhook-config + namespace: union +data: + core.yaml: | - # Kube state metrics for pod/node resource tracking and cost calculations - - job_name: kube-state-metrics - static_configs: - - targets: ['release-name-kube-state-metrics:8080'] - metric_relabel_configs: - - separator: ; - source_labels: [__name__] - regex: kube_pod_container_resource_(limits|requests)|kube_pod_status_phase|kube_node_(labels|status_allocatable|status_condition|status_capacity)|kube_namespace_labels|kube_pod_container_status_(waiting|terminated|last_terminated).*_reason|kube_daemonset_status_number_unavailable|kube_deployment_status_replicas_unavailable|kube_resourcequota|kube_pod_info|kube_node_info|kube_pod_container_status_restarts_total - action: keep - - separator: ; - source_labels: [__name__, phase] - regex: kube_pod_status_phase;(Succeeded|Failed) - action: drop - - source_labels: [node] - target_label: nodename - regex: '(.*)' - action: replace - - source_labels: [label_node_group_name] - action: replace - regex: (.+) - target_label: label_node_pool_name - - # cAdvisor container metrics for CPU and memory tracking - - job_name: kubernetes-cadvisor - metrics_path: /metrics - scheme: https - kubernetes_sd_configs: - - role: node - namespaces: - names: [] - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - insecure_skip_verify: false - metric_relabel_configs: - - separator: ; - source_labels: [__name__] - regex: container_cpu_usage_seconds_total|container_memory_working_set_bytes - action: keep - relabel_configs: - - separator: ; - regex: __meta_kubernetes_node_label_(.+) - replacement: $1 - action: labelmap - - separator: ; - regex: (.*) - target_label: __address__ - replacement: kubernetes.default.svc:443 - action: replace - - 
source_labels: [__meta_kubernetes_node_name] - separator: ; - regex: (.+) - target_label: __metrics_path__ - replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor - action: replace - - # Flyte propeller metrics for execution info and fast task duration - - job_name: flytepropeller - kubernetes_sd_configs: - - role: pod - namespaces: - names: - - union - selectors: - - role: pod - label: app.kubernetes.io/name=flytepropeller - metric_relabel_configs: - - source_labels: [__name__] - regex: "flyte:propeller:all:round:execution_info|flyte:propeller:all:node:fast_task:fast_task_execution_duration" - action: keep - relabel_configs: - - source_labels: [__meta_kubernetes_pod_container_name] - regex: flytepropeller - action: keep - - source_labels: [__meta_kubernetes_namespace] - action: replace - target_label: namespace - - source_labels: [__meta_kubernetes_pod_name] - action: replace - target_label: pod - # OpenCost metrics for cost tracking - - job_name: opencost - static_configs: - - targets: ['release-name-opencost:9003'] - metric_relabel_configs: - - source_labels: [__name__] - regex: "kube_node_labels|kube_pod_labels|node_total_hourly_cost|node_ram_hourly_cost|node_cpu_hourly_cost|node_gpu_hourly_cost" - action: keep - rules.yml: | - groups: - - name: cost_calculations_15s - interval: 15s - rules: - - record: pod_gpu_allocation - expr: | - sum by (namespace, pod) (DCGM_FI_DEV_GPU_UTIL >= bool 0) * on (namespace, pod) group_left() (max by (namespace, pod) (kube_pod_status_phase{phase=~"Running|Pending"} == 1)) - - record: execution_info # A join metric to look up execution-level info. Used to disambiguate workflow/task executions, apps, and workspaces. 
- expr: | - max by (label_domain, label_project, label_entity_name, label_execution_id, label_entity_id)( - label_replace( - label_replace( - label_replace( - label_replace( - label_replace( - flyte:propeller:all:round:execution_info{domain!="", project!="", workflow_name!="", execution_id!=""}, # filter for workflow/task executions - "label_entity_id", "$1", "execution_id", "(.*)" # join key - ), "label_entity_name", "$1", "workflow_name", "(.*)" # set label_entity_name to the workflow/task name from the workflow_execution_id - ), - "label_execution_id", "$1", "execution_id", "(.*)" - ), - "label_project", "$1", "project", "(.*)" # project - ), - "label_domain", "$1", "domain", "(.*)" # domain - ) - ) - - record: app_info # A join metric to look up app-level info. Used to disambiguate workflow/task executions, apps, and workspaces. - expr: | - max by (label_domain, label_project, label_app_name, label_app_version, label_entity_id)( - label_replace( - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps - "label_app_name", "$1", "label_serving_unionai_dev_app_name", "(.*)" # rename to cleanup - ), - "label_app_version", "$1", "label_serving_knative_dev_revision", "(.*)" # the app_version is equivalent to an execution_id for workflows (lowest level of granularity) - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # join key - ) - ) - - record: workspace_info # A join metric to look up workspace info. Used to disambiguate workflow/task executions, apps, and workspaces. 
- expr: | - max by (label_domain, label_project, label_workspace_name, label_entity_id)( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # filter for workspaces - "label_entity_id", "$1", "label_node_id", "(.*)" # join key - ), "label_workspace_name", "$1", "label_node_id", "(.*)" # set label_workspace_name to the workspace name from the kube_pod_labels - ) - ) - - record: fast_task_execution_duration - expr: | - max by (label_domain, label_project, label_execution_id, label_entity_id, namespace, pod)( - label_replace( - label_replace( - label_replace( - label_replace( - label_replace( - label_replace( - flyte:propeller:all:node:fast_task:fast_task_execution_duration{domain!="", project!="", execution_id!="", exported_namespace!="", exported_pod!=""}, - "label_entity_id", "$1", "execution_id", "(.*)" # join key - ), - "label_execution_id", "$1", "execution_id", "(.*)" - ), - "label_project", "$1", "project", "(.*)" # project - ), - "label_domain", "$1", "domain", "(.*)" # domain - ), - "namespace", "$1", "exported_namespace", "(.*)" - ), - "pod", "$1", "exported_pod", "(.*)" - ) - ) - - record: fast_task_execution_duration_rate - expr: | - irate(fast_task_execution_duration[1m]) > 0 or absent(irate(fast_task_execution_duration[1m])) * fast_task_execution_duration - - record: entity_id:mem_usage_bytes_total_per_node:sum # Allocated memory (max(requested, consumed)) aggregated per node and entity, where entity is either a task/workflow execution or an app. 
- expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( # aggregate up to entity - # First, calculate the allocated memory for each pod - max by (namespace, pod) ( # this is the case where consumed (the memory working set) exceeds requested memory - ( - sum by (namespace, pod) ( - container_memory_working_set_bytes{namespace!="",pod!="",image!=""} - ) - > sum by (namespace, pod) ( - kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="memory"} - ) - ) - or sum by (namespace, pod) ( # this is the case where memory requests are <= consumed memory - kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="memory"} # needed to add node!="" to dedupe - ) - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). Note that this is repetitive but we do not want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - 
label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), - "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="",label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - # Now join in node identifiers which are used for subsequent overhead calculations - * on (namespace, pod) group_left(node) ( - max by (namespace, pod, node) (kube_pod_info{node!=""}) # needed to add node!="" to dedupe - ) - ) - - record: entity_id:cpu_usage_per_node:sum # Allocated cpu (max(requested, consumed)) aggregated per node and entity, where entity is either a task/workflow execution or an app. 
- expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( - # First, calculate the allocated cpu for each pod - max by (namespace, pod) ( # this is the case where consumed (the cpu usage seconds total) exceeds requested cpu - ( - sum by (namespace, pod) ( - irate(container_cpu_usage_seconds_total{namespace!="",pod!="",image!=""}[5m]) - ) - > sum by (namespace, pod) ( - kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="cpu"} - ) - ) - or sum by (namespace, pod) ( # this is the case where cpu requests are <= consumed cpu - kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="cpu"} - ) - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - 
label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), - "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - # Now join in node identifiers which are used for subsequent overhead calculations - * on (namespace, pod) group_left(node) ( - max by (namespace, pod, node) (kube_pod_info{node!=""}) # needed to add node!="" to dedupe - ) - ) - - record: entity_id:gpu_usage_per_node:sum # Allocated gpu aggregated per node and entity, where entity is either a task/workflow execution or an app. 
- expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( - # First, grab the allocated gpu for each pod (which is always either 1 or zero, since k8s can't split gpus the way it can with cpu/memory) - max by (namespace, pod) ( - pod_gpu_allocation - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, 
label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), - "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - # Now join in node identifiers which are used for subsequent overhead calculations - * on (namespace, pod) group_left(node) ( - max by (namespace, pod, node) (kube_pod_info{node!=""}) # needed to add node!="" to dedupe - ) - ) - - record: entity_id:used_mem_bytes:sum # the sum of used memory across all containers in an entity (numerator for aggregate utilization calculations) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # aggregate up to entity - # First, calculate the used memory for each pod - sum by (namespace, pod) ( - container_memory_working_set_bytes{namespace!="",pod!="",image!=""} - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but we do not want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - ) - - record: entity_id:allocated_mem_bytes:sum # the sum of allocated memory across all containers in an entity (denominator for aggregate utilization calculations) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # aggregate up to entity (remove node) - entity_id:mem_usage_bytes_total_per_node:sum - ) - - record: entity_id:used_cpu:sum # the sum of used cpu across all containers in an entity (numerator for aggregate utilization calculations) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( - sum by (namespace, pod) ( - irate(container_cpu_usage_seconds_total{namespace!="",pod!="",image!=""}[5m]) - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - ) - - record: entity_id:allocated_cpu:sum # the sum of allocated cpu across all containers in an entity (denominator for aggregate utilization calculations) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # aggregate up to entity (remove node) - entity_id:cpu_usage_per_node:sum - ) - - record: entity_id:sm_occupancy:avg # the simple average of SM occupancy (a good generic measure of GPU utilization) per entity - expr: | - avg by (label_entity_type, label_domain, label_project, label_entity_id) ( - # First, grab the SM occupancy for each pod - max by (namespace, pod) ( - DCGM_FI_PROF_SM_OCCUPANCY # SM occupancy is a good proxy for actual GPU usage - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - ) - - record: entity_id:gpu_count:sum # the count of running gpu pods per entity (need this to weight the gpu utilization when aggregating upwards - i.e. project-level) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( - # First, grab the allocated gpu for each pod (which is always either 1 or zero, since k8s can't split gpus the way it can with cpu/memory) - max by (namespace, pod) ( - pod_gpu_allocation - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - ) - - record: entity_id:weighted_sm_occupancy:sum # product of SM occupancy and allocated GPU count (something like "used memory", numerator of weighted calcs) - expr: | - entity_id:sm_occupancy:avg - * on (label_domain, label_project, label_entity_type, label_entity_id) entity_id:gpu_count:sum - - record: entity_id:allocated_mem_cost:sum # Allocated cost of memory for each workflow/task execution and app. - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, type) ( - entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB - * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type - ) - - record: entity_id:allocated_cpu_cost:sum # Allocated cost of cpu for each workflow/task execution and app. 
- expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, type)( - entity_id:cpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - - record: entity_id:allocated_gpu_cost:sum # Allocated cost of gpu for each workflow/task execution and app. - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, type)( - entity_id:gpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - - record: entity_id:allocated_cost:sum # Allocated cost of memory, cpu, and gpu for each workflow/task execution and app. - expr: | - label_replace( - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # for the sum to work, the labels need to be different on each "or" element (type label) - entity_id:allocated_mem_cost:sum - or - entity_id:allocated_cpu_cost:sum - or - entity_id:allocated_gpu_cost:sum - ), - "type", "allocated", "", "" # add type info - ) - - record: entity_id:overhead_cost:sum # The amount of overhead costs (node costs that we can't allocate with container resources) to allocate to each entity (workflow/task execution or app) - expr: | - label_replace( - sum by (label_entity_type, label_entity_id, label_domain, label_project)( # Aggregate the per-node metrics up to workflow/task execution or app (label_entity_id) - # Start with each execution's and app's allocated cost per node - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( # for the sum to work, the labels need to be different on each "or" element (type label) - entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB - * on (node) group_left(type) label_replace(avg by (node) 
(node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:cpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:gpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - # Then divide out the total allocated cost per node to get the proportion of allocated cost associated with each entity - / on (node) group_left()( - sum by (node) ( # for the sum to work, the labels need to be different on each "or" element (type label) - entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB - * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:cpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:gpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - > 0 # need to avoid dividing by zero, or gaps in the data can cause NaNs to proliferate, borking all charts - ) - # Then multiply by the overhead cost per node - * on (node) group_left() ( - # To calculate overhead, start with the true cost of running each node - avg by (node)(kube_node_labels{label_flyte_org_node_role="worker"}) # only look at worker nodes - * on (node) max by (node) ( - node_total_hourly_cost{instance_type!=""} # sometimes, the instance_type can be null, causing an 
unlabeled label to show up in the Compute Costs dashboard charts - ) * (15 / 3600) # convert hourly cost to 15-secondly cost - # Then subtract out the total allocated cost on each node - - on (node) group_left()( - sum by (node) ( # for the sum to work, the labels need to be different on each "or" element (type label) - entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB - * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:cpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:gpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - ) - ) - ), - "type", "overhead", "", "" # add type info - ) - - record: entity_id:total_cost:sum # Total cost of each entity (workflow/task execution or app), including allocated (from container resources) and overhead (proportion of unallocated node costs) - expr: | - label_replace( - sum by (label_domain, label_project, label_entity_id, label_entity_type) ( - entity_id:allocated_cost:sum - or - entity_id:overhead_cost:sum - ), - "type", "total", "", "" # add type info - ) - - record: node:total_cost:sum # Total cost of all nodes - expr: | - sum ( - avg by (node)(kube_node_labels{label_flyte_org_node_role="worker", label_node_kubernetes_io_instance_type!=""}) # only look at worker nodes - * on (node) group_left() node_total_hourly_cost{instance_type!=""} * (15 / 3600) # convert hourly cost to 15-secondly cost - ) - - record: node_type:total_cost:sum # Total cost of nodes grouped by node type - expr: | - sum by (node_type)( - avg by 
(node)(kube_node_labels{label_flyte_org_node_role="worker", label_node_kubernetes_io_instance_type!=""}) # only look at worker nodes - * on (node) group_left(node_type) label_replace(node_total_hourly_cost{instance_type!=""}, "node_type", "$1", "instance_type", "(.*)") * (15 / 3600) # convert hourly cost to 15-secondly cost and rename label - ) - - record: node_type:uptime_hours:sum # Total uptime of nodes grouped by node type - expr: | - sum by (node_type)( - avg by (node, node_type)( # dedupe - label_replace(kube_node_labels{label_flyte_org_node_role="worker", label_node_kubernetes_io_instance_type!=""}, "node_type", "$1", "label_node_kubernetes_io_instance_type", "(.*)") # relabel - ) - ) * (15 / 3600) # convert to number of hours per 15-second observation # Aggregate the above into visible metrics - - name: cost_rollup_15m - interval: 15m - rules: - - record: execution_info15m - expr: | - max_over_time(execution_info[15m:15s]) - - record: app_info15m - expr: | - max_over_time(app_info[15m:15s]) - - record: workspace_info15m - expr: | - max_over_time(workspace_info[15m:15s]) - - record: entity_id:allocated_mem_bytes:sum15m - expr: | - sum_over_time(entity_id:allocated_mem_bytes:sum[15m:15s]) - - record: entity_id:used_mem_bytes:sum15m - expr: | - sum_over_time(entity_id:used_mem_bytes:sum[15m:15s]) - - record: entity_id:allocated_cpu:sum15m - expr: | - sum_over_time(entity_id:allocated_cpu:sum[15m:15s]) - - record: entity_id:used_cpu:sum15m - expr: | - sum_over_time(entity_id:used_cpu:sum[15m:15s]) - - record: entity_id:weighted_sm_occupancy:sum15m - expr: | - sum_over_time(entity_id:weighted_sm_occupancy:sum[15m:15s]) - - record: entity_id:gpu_count:sum15m - expr: | - sum_over_time(entity_id:gpu_count:sum[15m:15s]) - - record: entity_id:allocated_mem_cost:sum15m - expr: | - sum_over_time(entity_id:allocated_mem_cost:sum[15m:15s]) - - record: entity_id:allocated_cpu_cost:sum15m - expr: | - sum_over_time(entity_id:allocated_cpu_cost:sum[15m:15s]) - - record: 
entity_id:allocated_gpu_cost:sum15m - expr: | - sum_over_time(entity_id:allocated_gpu_cost:sum[15m:15s]) - - record: entity_id:allocated_cost:sum15m - expr: | - sum_over_time(entity_id:allocated_cost:sum[15m:15s]) - - record: entity_id:overhead_cost:sum15m - expr: | - sum_over_time(entity_id:overhead_cost:sum[15m:15s]) - - record: entity_id:total_cost:sum15m - expr: | - sum_over_time(entity_id:total_cost:sum[15m:15s]) - - record: node:total_cost:sum15m - expr: | - sum_over_time(node:total_cost:sum[15m:15s]) - - record: node_type:total_cost:sum15m - expr: | - sum_over_time(node_type:total_cost:sum[15m:15s]) - - record: node_type:uptime_hours:sum15m - expr: | - sum_over_time(node_type:uptime_hours:sum[15m:15s]) ---- -# Source: dataplane/templates/propeller/configmap.yaml -apiVersion: v1 -kind: ConfigMap -metadata: - name: flyte-propeller-config - namespace: union -data: - admin.yaml: | - admin: - clientId: 'test-client-id' - clientSecretLocation: /etc/union/secret/client_secret - endpoint: dns:///test.dataplane.union.ai - insecure: false - event: - capacity: 1000 - rate: 500 - type: admin - catalog.yaml: | - catalog-cache: - cache-endpoint: dns:///test.dataplane.union.ai - endpoint: dns:///test.dataplane.union.ai - insecure: false - type: fallback - use-admin-auth: true - copilot.yaml: | - plugins: - k8s: - co-pilot: - image: 'cr.flyte.org/flyteorg/flytecopilot:v1.14.1' - name: flyte-copilot- - start-timeout: 30s - core.yaml: | - propeller: - downstream-eval-duration: 30s - enable-admin-launcher: true - leader-election: - enabled: true - lease-duration: 15s - lock-config-map: - name: propeller-leader - namespace: 'union' - renew-deadline: 10s - retry-period: 2s - limit-namespace: all - literal-offloading-config: - enabled: true - max-workflow-retries: 30 - metadata-prefix: metadata/propeller - metrics-prefix: flyte - prof-port: 10254 - queue: - batch-size: -1 - batching-interval: 2s - queue: - base-delay: 5s - capacity: 1000 - max-delay: 120s - rate: 100 - type: 
maxof - sub-queue: - capacity: 100 - rate: 10 - type: bucket - type: batch - rawoutput-prefix: 's3://' - workers: 4 - workflow-reeval-duration: 30s webhook: certDir: /etc/webhook/certs + disableCreateMutatingWebhookConfig: true embeddedSecretManagerConfig: azureConfig: vaultURI: 'test-azure-key-vault-uri' @@ -3855,120 +3417,13 @@ data: namespace: 'union' type: Azure listenPort: '9443' + localCert: true secretManagerTypes: - Azure - Embedded - serviceName: flyte-pod-webhook + secretName: union-pod-webhook + serviceName: union-pod-webhook servicePort: '443' - enabled_plugins.yaml: | - plugins: - connector-service: - defaultConnector: - defaultServiceConfig: '{"loadBalancingConfig": [{"round_robin":{}}]}' - endpoint: k8s:///flyteconnector.union:8000 - tasks: - task-plugins: - default-for-task-types: - actor: fast-task - container: container - container_array: k8s-array - sidecar: sidecar - enabled-plugins: - - container - - sidecar - - k8s-array - - echo - - fast-task - - connector-service - k8s.yaml: | - plugins: - k8s: - default-cpus: 100m - default-env-vars: - - AZURE_STORAGE_ACCOUNT_NAME: test-storage-account - default-labels: - - azure.workload.identity/use: "true" - default-memory: 100Mi - interruptible-node-selector-requirement: - key: kubernetes.azure.com/scalesetpriority - operator: In - values: - - spot - interruptible-tolerations: - - effect: NoSchedule - key: kubernetes.azure.com/scalesetpriority - operator: Equal - value: spot - non-interruptible-node-selector-requirement: - key: kubernetes.azure.com/scalesetpriority - operator: DoesNotExist - logger.yaml: | - logger: - formatter: - type: json - level: 4 - show-source: true - namespace_config.yaml: | - namespace_mapping: - template: '{{ domain }}' - resource_manager.yaml: | - propeller: - resourcemanager: - type: noop - task_logs.yaml: | - plugins: - fasttask: - logs: - azure-log-templates: - - displayName: Azure Logs - templateUris: - - 
https://portal.azure.com#@test-tenant-id/blade/Microsoft_OperationsManagementSuite_Workspace/Logs.ReactView/resourceId/%2Fsubscriptions%2Ftest-subscription-id%2FresourceGroups%2Ftest-resource-group/source/LogsBlade.AnalyticsShareLinkToQuery/q/ - cloudwatch-enabled: false - kubernetes-enabled: false - stackdriver-enabled: false - k8s-array: - logs: - config: - azure-log-templates: - - displayName: Azure Logs - templateUris: - - https://portal.azure.com#@test-tenant-id/blade/Microsoft_OperationsManagementSuite_Workspace/Logs.ReactView/resourceId/%%2Fsubscriptions%%2Ftest-subscription-id%%2FresourceGroups%%2Ftest-resource-group/source/LogsBlade.AnalyticsShareLinkToQuery/q/ - cloudwatch-enabled: false - kubernetes-enabled: false - stackdriver-enabled: false - logs: - azure-log-templates: - - displayName: Azure Logs - templateUris: - - https://portal.azure.com#@test-tenant-id/blade/Microsoft_OperationsManagementSuite_Workspace/Logs.ReactView/resourceId/%2Fsubscriptions%2Ftest-subscription-id%2FresourceGroups%2Ftest-resource-group/source/LogsBlade.AnalyticsShareLinkToQuery/q/ - cloudwatch-enabled: false - dynamic-log-links: - - vscode: - displayName: VS Code Debugger - templateUris: - - /dataplane/pod/v1/generated_name/task/{{.executionProject}}/{{.executionDomain}}/{{.executionName}}/{{.nodeID}}/{{.taskRetryAttempt}}/{{.taskProject}}/{{.taskDomain}}/{{.taskID}}/{{.taskVersion}}/ - kubernetes-enabled: false - stackdriver-enabled: false - templates: - - displayName: Task Logs - scheme: TaskExecution - templateUris: - - /console/projects/{{.executionProject}}/domains/{{.executionDomain}}/executions/{{.executionName}}/nodeId/{{.nodeID}}/taskId/{{.taskID}}/attempt/{{.taskRetryAttempt}}/view/logs?duration=all&fromExecutionNav=true - storage.yaml: | - storage: - container: "" - container: 'test-metadata-container' - stow: - config: - account: 'test-storage-account' - kind: azure - type: stow - enable-multicontainer: true - limits: - maxDownloadMBs: 1024 - cache: - 
max_size_mbs: 0 - target_gc_percent: 70 --- # Source: dataplane/charts/fluentbit/templates/clusterrole.yaml apiVersion: rbac.authorization.k8s.io/v1 @@ -3992,275 +3447,1143 @@ rules: - list - watch --- -# Source: dataplane/charts/kube-state-metrics/templates/role.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: - labels: - helm.sh/chart: kube-state-metrics-5.30.1 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: kube-state-metrics - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" - name: release-name-kube-state-metrics -rules: - -- apiGroups: ["certificates.k8s.io"] - resources: - - certificatesigningrequests - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - configmaps - verbs: ["list", "watch"] - -- apiGroups: ["batch"] - resources: - - cronjobs - verbs: ["list", "watch"] - -- apiGroups: ["extensions", "apps"] - resources: - - daemonsets - verbs: ["list", "watch"] - -- apiGroups: ["extensions", "apps"] - resources: - - deployments - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - endpoints - verbs: ["list", "watch"] - -- apiGroups: ["autoscaling"] - resources: - - horizontalpodautoscalers - verbs: ["list", "watch"] - -- 
apiGroups: ["extensions", "networking.k8s.io"] - resources: - - ingresses - verbs: ["list", "watch"] - -- apiGroups: ["batch"] - resources: - - jobs - verbs: ["list", "watch"] - -- apiGroups: ["coordination.k8s.io"] - resources: - - leases - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - limitranges - verbs: ["list", "watch"] - -- apiGroups: ["admissionregistration.k8s.io"] - resources: - - mutatingwebhookconfigurations - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - namespaces - verbs: ["list", "watch"] - -- apiGroups: ["networking.k8s.io"] - resources: - - networkpolicies - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - nodes - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - persistentvolumeclaims - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - persistentvolumes - verbs: ["list", "watch"] - -- apiGroups: ["policy"] - resources: - - poddisruptionbudgets - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - pods - verbs: ["list", "watch"] - -- apiGroups: ["extensions", "apps"] - resources: - - replicasets - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - replicationcontrollers - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - resourcequotas - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - secrets - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - services - verbs: ["list", "watch"] - -- apiGroups: ["apps"] - resources: - - statefulsets - verbs: ["list", "watch"] + name: knative-serving-operator-aggregated + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +aggregationRule: + clusterRoleSelectors: + # This (along with escalate below) allows the Operator to pick up any + # roles that are provided to the admin of the cluster by knative serving + # automatically. 
+ - matchExpressions: + - {key: serving.knative.dev/release, operator: Exists} +rules: [] # Rules are automatically filled in by the controller manager. +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: knative-serving-operator-aggregated-stable + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +aggregationRule: + clusterRoleSelectors: + # This (along with escalate below) allows the Operator to pick up any + # roles that are provided to the admin of the cluster by knative serving + # automatically. + - matchExpressions: + - {key: app.kubernetes.io/name, operator: In, values: ["knative-serving"]} +rules: [] # Rules are automatically filled in by the controller manager. +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: knative-eventing-operator-aggregated + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +aggregationRule: + clusterRoleSelectors: + # This (along with escalate below) allows the Operator to pick up any + # roles that are provided to the admin of the cluster by knative eventing + # automatically. + - matchExpressions: + - {key: eventing.knative.dev/release, operator: Exists} +rules: [] # Rules are automatically filled in by the controller manager. +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: knative-eventing-operator-aggregated-stable + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +aggregationRule: + clusterRoleSelectors: + # This (along with escalate below) allows the Operator to pick up any + # roles that are provided to the admin of the cluster by knative eventing + # automatically. 
+ - matchExpressions: + - {key: app.kubernetes.io/name, operator: In, values: ["knative-eventing"]} +rules: [] # Rules are automatically filled in by the controller manager. +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. -- apiGroups: ["storage.k8s.io"] - resources: - - storageclasses - verbs: ["list", "watch"] +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: knative-serving-operator + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +rules: + - apiGroups: + - operator.knative.dev + resources: + - '*' + verbs: + - '*' + # Bootstrapping permissions. + # Roles that are explicitly bound buch which are specified by this Operator + # MUST be specified here with 'get' and 'bind'. 
+ - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterroles + resourceNames: + - system:auth-delegator + verbs: + - bind + - get + - apiGroups: + - rbac.authorization.k8s.io + resources: + - roles + resourceNames: + - extension-apiserver-authentication-reader + verbs: + - bind + - get + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterroles + - roles + verbs: + - create + - delete + # Escalate is necessary in order to create a role using cluster role aggregation, + # and to allow the Operator to bootstrap itself into the necessary set of + # permissions, even as those continue to evolve upstream. + - escalate + - get + - list + - update + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterrolebindings + - rolebindings + verbs: + - create + - delete + - list + - get + - update + # Permissions required for Knative controller + # infra. + - apiGroups: + - apiregistration.k8s.io + resources: + - apiservices + verbs: + - update + - apiGroups: + - "" + resources: + - services + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - caching.internal.knative.dev + resources: + - images + verbs: + - '*' + - apiGroups: + - "" + resources: + - namespaces + verbs: + - get + - update + - watch + - apiGroups: + - '' + resources: + - events + verbs: + - create + - update + - patch + - apiGroups: + - '' + resources: + - configmaps + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - security.istio.io + - apps + - policy + resources: + - poddisruptionbudgets + - peerauthentications + - deployments + - daemonsets + - replicasets + - statefulsets + verbs: + - create + - delete + - get + - list + - watch + - update + - apiGroups: + - apiregistration.k8s.io + resources: + - apiservices + verbs: + - create + - delete + - get + - list + - apiGroups: + - autoscaling + resources: + - horizontalpodautoscalers + verbs: + - create + - delete + - get + - list + - apiGroups: + - coordination.k8s.io + 
resources: + - leases + verbs: + - '*' + - apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - '*' + # Old resources that need cleaning up that are not in the knative-serving + # namespace. + - apiGroups: + - "" + resources: + - services + - deployments + - horizontalpodautoscalers + resourceNames: + - knative-ingressgateway + verbs: + - delete + - apiGroups: + - "" + resources: + - configmaps + resourceNames: + - config-controller + verbs: + - delete + - apiGroups: + - "" + resources: + - serviceaccounts + resourceNames: + - knative-serving-operator + verbs: + - delete + # for contour TLS + - apiGroups: + - projectcontour.io + resources: + - httpproxies + - tlscertificatedelegations + verbs: + - get + - list + - watch + - update + - create + - delete + - deletecollection + - patch + # for security-guard + - apiGroups: + - guard.security.knative.dev + resources: + - guardians + verbs: + - get + - list + - watch + - create + - update + - patch + - delete + - apiGroups: + - "" + resources: + - configmaps + verbs: + - get + - list + - watch + - create + - update + - patch + - delete + - apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + verbs: + - create + - apiGroups: + - "" + resources: + - pods + verbs: + - get +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: knative-eventing-operator + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +rules: + - apiGroups: + - operator.knative.dev + resources: + - '*' + verbs: + - '*' + # Bootstrapping permissions. + # Roles that are explicitly bound buch which are specified by this Operator + # MUST be specified here with 'get' and 'bind'. 
+ - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterroles + - roles + verbs: + - create + - delete + # Escalate is necessary in order to create a role using cluster role aggregation, + # and to allow the Operator to bootstrap itself into the necessary set of + # permissions, even as those continue to evolve upstream. + - escalate + - get + - list + - update + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterrolebindings + - rolebindings + verbs: + - create + - delete + - list + - get + - update + # Permissions required for Knative controller + # infra. + - apiGroups: + - apiregistration.k8s.io + resources: + - apiservices + verbs: + - update + - apiGroups: + - "" + resources: + - services + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - caching.internal.knative.dev + resources: + - images + verbs: + - '*' + - apiGroups: + - "" + resources: + - namespaces + verbs: + - get + - update + - watch + - apiGroups: + - '' + resources: + - events + verbs: + - create + - update + - patch + - apiGroups: + - '' + resources: + - configmaps + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - apps + resources: + - deployments + - daemonsets + - replicasets + - statefulsets + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - apiregistration.k8s.io + resources: + - apiservices + verbs: + - create + - delete + - get + - list + - apiGroups: + - autoscaling + resources: + - horizontalpodautoscalers + verbs: + - create + - delete + - update + - get + - list + - apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - '*' + - apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - '*' + - apiGroups: + - batch + resources: + - jobs + verbs: + - create + - delete + - update + - get + - list + - watch + # Old resources that need cleaning up that are not in the knative-eventing + # namespace. 
+ - apiGroups: + - "" + resources: + - serviceaccounts + resourceNames: + - knative-eventing-operator + verbs: + - delete + # for RabbitMQ messaging topology objects + - apiGroups: + - rabbitmq.com + resources: + - rabbitmqclusters + verbs: + - "get" + - "list" + - "watch" + - apiGroups: + - rabbitmq.com + resources: + - bindings + - queues + - exchanges + verbs: + - create + - delete + - get + - list + - patch + - update + - watch + - apiGroups: + - rabbitmq.com + resources: + - bindings/status + - queues/status + - exchanges/status + verbs: + - get + # for Kafka eventing source + - apiGroups: + - keda.sh + resources: + - scaledobjects + - scaledobjects/finalizers + - scaledobjects/status + - triggerauthentications + - triggerauthentications/status + verbs: + - get + - list + - watch + - update + - create + - delete + # Internal APIs + - apiGroups: + - "internal.kafka.eventing.knative.dev" + resources: + - "consumers" + - "consumers/status" + - "consumergroups" + - "consumergroups/status" + verbs: + - create + - get + - list + - watch + - patch + - update + - delete + - apiGroups: + - "internal.kafka.eventing.knative.dev" + resources: + - "consumers/finalizers" + - "consumergroups/finalizers" + verbs: + - update + - delete + - apiGroups: + - apps + resources: + - statefulsets/scale + verbs: + - get + - list + - watch + - update + - patch + - create + - delete + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterrolebindings + verbs: + - watch + - apiGroups: + - "*" + resources: + - configmaps + verbs: + - delete + - apiGroups: + - "*" + resources: + - configmaps + - services + verbs: + - get + - list + - watch + - update + - create + - delete + - apiGroups: + - "*" + resources: + - pods + verbs: + - list + - update + - get + - watch + - apiGroups: + - "*" + resources: + - pods/finalizers + verbs: + - get + - list + - create + - update + - delete + - apiGroups: + - "*" + resources: + - events + verbs: + - patch + - create + - apiGroups: + - "*" + 
resources: + - secrets + verbs: + - get + - list + - watch + - update + - create + - delete + - apiGroups: + - "*" + resources: + - nodes + verbs: + - get + - list + - watch + - apiGroups: + - "*" + resources: + - serviceaccounts + verbs: + - get + - list + - watch + - update + - create + - delete + - apiGroups: + - "*" + resources: + - configmaps + resourceNames: + - kafka-channel-config + verbs: + - patch + - apiGroups: + - "*" + resources: + - horizontalpodautoscalers + resourceNames: + - kafka-webhook + verbs: + - delete + - apiGroups: + - "*" + resources: + - leases + verbs: + - delete + - apiGroups: + - "*" + resources: + - poddisruptionbudgets + resourceNames: + - kafka-webhook + verbs: + - delete + - apiGroups: + - "*" + resources: + - services + verbs: + - patch + - apiGroups: + - "apps" + resources: + - deployments + verbs: + - deletecollection + # Eventing TLS + - apiGroups: + - "cert-manager.io" + resources: + - certificates + - issuers + - clusterissuers + verbs: + - create + - delete + - update + - list + - get + - watch + - apiGroups: + - "trust.cert-manager.io" + resources: + - bundles + verbs: + - create + - delete + - update + - list + - get + - watch +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: knative-operator-webhook + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +rules: + # For watching logging configuration and getting certs. + - apiGroups: + - "" + resources: + - "configmaps" + verbs: + - "get" + - "list" + - "watch" + # For manipulating certs into secrets. 
+ - apiGroups: + - "" + resources: + - "namespaces" + verbs: + - "get" + - "create" + - "update" + - "list" + - "watch" + - "patch" + # finalizers are needed for the owner reference of the webhook + - apiGroups: + - "" + resources: + - "namespaces/finalizers" + verbs: + - "update" + # For getting our Deployment so we can decorate with ownerref. + - apiGroups: + - "apps" + resources: + - "deployments" + verbs: + - "get" + - apiGroups: + - "apps" + resources: + - "deployments/finalizers" + verbs: + - update + # For actually registering our webhook. + - apiGroups: + - "admissionregistration.k8s.io" + resources: + - "mutatingwebhookconfigurations" + - "validatingwebhookconfigurations" + verbs: &everything + - "get" + - "list" + - "create" + - "update" + - "delete" + - "patch" + - "watch" + # For leader election + - apiGroups: + - "coordination.k8s.io" + resources: + - "leases" + verbs: *everything + # Necessary for conversion webhook. These are copied from the serving + # TODO: Do we really need all these permissions? 
+ - apiGroups: ["apiextensions.k8s.io"] + resources: ["customresourcedefinitions"] + verbs: ["get", "list", "create", "update", "delete", "patch", "watch"] +--- +# Source: dataplane/charts/fluentbit/templates/clusterrolebinding.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: release-name-fluentbit + labels: + helm.sh/chart: fluentbit-0.48.9 + app.kubernetes.io/name: fluentbit + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "3.2.8" + app.kubernetes.io/managed-by: Helm +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: release-name-fluentbit +subjects: + - kind: ServiceAccount + name: fluentbit-system + namespace: union +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# TODO: Consider restriction of non-aggregated role to knativeservings namespaces. 
+apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: knative-serving-operator + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-serving-operator +subjects: + - kind: ServiceAccount + name: knative-operator + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# TODO: Consider restriction of non-aggregated role to knativeeventing namespaces. +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: knative-eventing-operator + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-eventing-operator +subjects: + - kind: ServiceAccount + name: knative-operator + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: operator-webhook + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-operator-webhook +subjects: + - kind: ServiceAccount + name: operator-webhook + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. -- apiGroups: ["admissionregistration.k8s.io"] - resources: - - validatingwebhookconfigurations - verbs: ["list", "watch"] +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: knative-serving-operator-aggregated + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-serving-operator-aggregated +subjects: + - kind: ServiceAccount + name: knative-operator + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: knative-serving-operator-aggregated-stable + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-serving-operator-aggregated-stable +subjects: + - kind: ServiceAccount + name: knative-operator + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: knative-eventing-operator-aggregated + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-eventing-operator-aggregated +subjects: + - kind: ServiceAccount + name: knative-operator + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: knative-eventing-operator-aggregated-stable + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: 
ClusterRole + name: knative-eventing-operator-aggregated-stable +subjects: + - kind: ServiceAccount + name: knative-operator + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. -- apiGroups: ["storage.k8s.io"] - resources: - - volumeattachments - verbs: ["list", "watch"] +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + namespace: "union" + name: knative-operator-webhook + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +rules: + # For manipulating certs into secrets. 
+ - apiGroups: + - "" + resources: + - "secrets" + verbs: + - "get" + - "create" + - "update" + - "list" + - "watch" + - "patch" --- -# Source: dataplane/charts/opencost/templates/clusterrole.yaml -# Cluster role giving opencost to get, list, watch required resources -# No write permissions are required +# Source: dataplane/templates/monitoring/prometheus-rbac.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole +kind: Role metadata: - name: release-name-opencost + name: union-operator-prometheus-rbac + namespace: union labels: - helm.sh/chart: opencost-1.42.0 - app.kubernetes.io/name: opencost - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "1.111.0" - app.kubernetes.io/part-of: opencost - app.kubernetes.io/managed-by: Helm + release: release-name rules: + # Prometheus server scrape permissions - apiGroups: [""] resources: - - configmaps - - deployments - nodes - - pods + - nodes/proxy + - nodes/metrics - services - - resourcequotas - - replicationcontrollers - - limitranges - - persistentvolumeclaims - - persistentvolumes - - namespaces - endpoints + - pods + - ingresses + - configmaps verbs: - get - list - watch - - apiGroups: - - extensions - resources: - - daemonsets - - deployments - - replicasets - verbs: - - get - - list - - watch - - apiGroups: - - apps - resources: - - statefulsets - - deployments - - daemonsets - - replicasets - verbs: - - list - - watch - - apiGroups: - - batch - resources: - - cronjobs - - jobs - verbs: - - get - - list - - watch - - apiGroups: - - autoscaling - resources: - - horizontalpodautoscalers - verbs: - - get - - list - - watch - - apiGroups: - - policy + - apiGroups: ["extensions", "networking.k8s.io"] resources: - - poddisruptionbudgets + - ingresses/status + - ingresses verbs: - get - list - watch - - apiGroups: - - storage.k8s.io + - apiGroups: ["discovery.k8s.io"] resources: - - storageclasses + - endpointslices verbs: - get - list - watch ---- -# Source: 
dataplane/templates/clusterresourcesync/serviceaccount.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: union-clustersync-resource -rules: - - apiGroups: - - "" - - rbac.authorization.k8s.io - resources: - - configmaps - - namespaces - - pods - - resourcequotas - - roles - - rolebindings - - secrets - - services - - serviceaccounts - - clusterrolebindings - - podtemplates - verbs: - - '*' + # kube-state-metrics permissions (rbac.create=false on the subchart; managed here instead) + - apiGroups: ["certificates.k8s.io"] + resources: ["certificatesigningrequests"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["configmaps"] + verbs: ["list", "watch"] + - apiGroups: ["batch"] + resources: ["cronjobs"] + verbs: ["list", "watch"] + - apiGroups: ["extensions", "apps"] + resources: ["daemonsets"] + verbs: ["list", "watch"] + - apiGroups: ["extensions", "apps"] + resources: ["deployments"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["endpoints"] + verbs: ["list", "watch"] + - apiGroups: ["discovery.k8s.io"] + resources: ["endpointslices"] + verbs: ["list", "watch"] + - apiGroups: ["autoscaling"] + resources: ["horizontalpodautoscalers"] + verbs: ["list", "watch"] + - apiGroups: ["extensions", "networking.k8s.io"] + resources: ["ingresses"] + verbs: ["list", "watch"] + - apiGroups: ["batch"] + resources: ["jobs"] + verbs: ["list", "watch"] + - apiGroups: ["coordination.k8s.io"] + resources: ["leases"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["limitranges"] + verbs: ["list", "watch"] + - apiGroups: ["admissionregistration.k8s.io"] + resources: ["mutatingwebhookconfigurations"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["namespaces"] + verbs: ["list", "watch"] + - apiGroups: ["networking.k8s.io"] + resources: ["networkpolicies"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["persistentvolumeclaims"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: 
["persistentvolumes"] + verbs: ["list", "watch"] + - apiGroups: ["policy"] + resources: ["poddisruptionbudgets"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["pods"] + verbs: ["list", "watch"] + - apiGroups: ["extensions", "apps"] + resources: ["replicasets"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["replicationcontrollers"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["resourcequotas"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["secrets"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["services"] + verbs: ["list", "watch"] + - apiGroups: ["apps"] + resources: ["statefulsets"] + verbs: ["list", "watch"] + - apiGroups: ["storage.k8s.io"] + resources: ["storageclasses"] + verbs: ["list", "watch"] + - apiGroups: ["admissionregistration.k8s.io"] + resources: ["validatingwebhookconfigurations"] + verbs: ["list", "watch"] + - apiGroups: ["storage.k8s.io"] + resources: ["volumeattachments"] + verbs: ["list", "watch"] --- # Source: dataplane/templates/nodeexecutor/serviceaccount.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole +kind: Role metadata: name: union-executor labels: @@ -4311,11 +4634,12 @@ rules: - delete - update --- -# Source: dataplane/templates/operator/serviceaccount-proxy.yaml +# Source: dataplane/templates/operator/serviceaccount-proxy-secret.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole +kind: Role metadata: - name: proxy-system + name: union-system-secret + namespace: union labels: app.kubernetes.io/name: operator-proxy app.kubernetes.io/instance: release-name @@ -4325,146 +4649,86 @@ rules: - apiGroups: - '*' resources: - - events - - flyteworkflows - - pods/log - - pods - - rayjobs - - resourcequotas - verbs: - - get - - list - - watch ---- -# Source: dataplane/templates/operator/serviceaccount.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: operator-system - labels: - app.kubernetes.io/name: 
union-operator - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm -rules: - # Allow Access to all resources under flyte.lyft.com - - apiGroups: - - flyte.lyft.com - resources: - - flyteworkflows - - flyteworkflows/finalizers - verbs: - - get - - list - - watch - - create - - update - - delete - - patch - - post - - deletecollection - - apiGroups: - - '*' - resources: - - resourcequotas - - pods - - configmaps - - podtemplates - secrets - - namespaces - - nodes verbs: - get - list - - watch - create - update - delete - - nonResourceURLs: - - /metrics - verbs: - - get --- -# Source: dataplane/templates/prometheus/rbac.yaml +# Source: dataplane/templates/operator/serviceaccount-proxy.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole +kind: Role metadata: - name: union-operator-prometheus + name: proxy-system labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/name: operator-proxy app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" + platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus rules: - - apiGroups: [""] + - apiGroups: + - '*' resources: - - nodes - - nodes/proxy + - events + - flyteworkflows + - pods/log - pods - - endpoints - - services + - rayjobs + - resourcequotas verbs: - get - list - watch - - nonResourceURLs: - - /metrics - - /metrics/cadvisor - verbs: - - get ---- -# Source: dataplane/templates/propeller/serviceaccount-webhook.yaml -kind: ClusterRole -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: flytepropeller-webhook-role - namespace: union -rules: - - apiGroups: - - "*" - resources: - - mutatingwebhookconfigurations - - secrets - - pods - - replicasets/finalizers - verbs: - - get - - create - - update - - patch --- -# Source: dataplane/templates/propeller/serviceaccount.yaml +# Source: 
dataplane/templates/operator/serviceaccount.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole +kind: Role metadata: - name: flytepropeller-role + name: operator-system + labels: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm rules: - # Allow RO access to PODS - apiGroups: - - "" + - '*' resources: - - pods + - secrets + - deployments verbs: - get - list - watch - # Allow Event recording access + - create + - update - apiGroups: - - "" + - flyte.lyft.com resources: - - events + - flyteworkflows + - flyteworkflows/finalizers verbs: + - get + - list + - watch - create - update - delete - patch - # Allow Access All plugin objects + - post + - deletecollection - apiGroups: - '*' resources: - - '*' + - resourcequotas + - pods + - configmaps + - podtemplates + - secrets + - namespaces + - nodes verbs: - get - list @@ -4472,148 +4736,129 @@ rules: - create - update - delete - - patch - # Allow Access to CRD - apiGroups: - - apiextensions.k8s.io + - serving.knative.dev resources: - - customresourcedefinitions + - revisions + - configurations + - services verbs: - get - list - watch - create - - delete - update - # Allow Access to all resources under flyte.lyft.com + - delete +--- +# Source: dataplane/templates/webhook/serviceaccount.yaml +kind: Role +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: union-webhook-role + namespace: union +rules: - apiGroups: - - flyte.lyft.com + - "*" resources: - - flyteworkflows - - flyteworkflows/finalizers + - mutatingwebhookconfigurations + - secrets + - pods + - replicasets/finalizers verbs: - get - - list - - watch - create - update - - delete - patch - - post - - deletecollection --- -# Source: dataplane/charts/fluentbit/templates/clusterrolebinding.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under 
the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding +kind: RoleBinding metadata: - name: release-name-fluentbit + namespace: "union" + name: operator-webhook labels: - helm.sh/chart: fluentbit-0.48.9 - app.kubernetes.io/name: fluentbit - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "3.2.8" - app.kubernetes.io/managed-by: Helm -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: release-name-fluentbit + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator subjects: - kind: ServiceAccount - name: fluentbit-system - namespace: union ---- -# Source: dataplane/charts/kube-state-metrics/templates/clusterrolebinding.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - labels: - helm.sh/chart: kube-state-metrics-5.30.1 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: kube-state-metrics - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" - name: release-name-kube-state-metrics + name: operator-webhook + namespace: "union" roleRef: + kind: Role + name: knative-operator-webhook apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: release-name-kube-state-metrics -subjects: -- kind: ServiceAccount - name: release-name-kube-state-metrics - namespace: union --- -# Source: 
dataplane/charts/opencost/templates/clusterrolebinding.yaml +# Source: dataplane/templates/monitoring/prometheus-rbac.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding +kind: RoleBinding metadata: - name: release-name-opencost + name: union-operator-prometheus-rbac + namespace: union labels: - helm.sh/chart: opencost-1.42.0 - app.kubernetes.io/name: opencost - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "1.111.0" - app.kubernetes.io/part-of: opencost - app.kubernetes.io/managed-by: Helm -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: release-name-opencost + release: release-name subjects: - kind: ServiceAccount - name: release-name-opencost + name: union-operator-prometheus namespace: union ---- -# Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: union-clustersync-resource roleRef: apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: union-clustersync-resource -subjects: - - kind: ServiceAccount - name: union-clustersync-system - namespace: union + kind: Role + name: union-operator-prometheus-rbac --- -# Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml +# Source: dataplane/templates/monitoring/prometheus-rbac.yaml +# kube-state-metrics service account RoleBinding apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding +kind: RoleBinding metadata: - name: union-clustersync-auth-delegator -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: system:auth-delegator + name: release-name-prometheus-kube-state-metrics + namespace: union + labels: + release: release-name subjects: - kind: ServiceAccount - name: union-clustersync-system + name: release-name-prometheus-kube-state-metrics namespace: union +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: union-operator-prometheus-rbac --- # Source: 
dataplane/templates/nodeexecutor/serviceaccount.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding +kind: RoleBinding metadata: name: union-executor labels: app: executor roleRef: apiGroup: rbac.authorization.k8s.io - kind: ClusterRole + kind: Role name: union-executor subjects: - kind: ServiceAccount - name: executor + name: union-system namespace: union --- -# Source: dataplane/templates/operator/serviceaccount-proxy.yaml +# Source: dataplane/templates/operator/serviceaccount-proxy-secret.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding +kind: RoleBinding metadata: - name: proxy-system + name: union-system-secret + namespace: union labels: app.kubernetes.io/name: operator-proxy app.kubernetes.io/instance: release-name @@ -4621,109 +4866,35 @@ metadata: app.kubernetes.io/managed-by: Helm roleRef: apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: proxy-system + kind: Role + name: union-system-secret subjects: - kind: ServiceAccount - name: proxy-system + name: union-system namespace: union --- -# Source: dataplane/templates/operator/serviceaccount.yaml +# Source: dataplane/templates/operator/serviceaccount-proxy.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding +kind: RoleBinding metadata: - name: operator-system + name: proxy-system labels: - app.kubernetes.io/name: union-operator + app.kubernetes.io/name: operator-proxy app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm roleRef: apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: operator-system -subjects: - - kind: ServiceAccount - name: operator-system - namespace: union ---- -# Source: dataplane/templates/prometheus/rbac.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: union-operator-prometheus - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane - 
app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: union-operator-prometheus -subjects: - - kind: ServiceAccount - name: union-operator-prometheus - namespace: union ---- -# Source: dataplane/templates/propeller/serviceaccount-webhook.yaml -# Create a binding from Role -> ServiceAccount -kind: ClusterRoleBinding -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: flytepropeller-webhook-binding - namespace: union -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: flytepropeller-webhook-role -subjects: - - kind: ServiceAccount - name: flytepropeller-webhook-system - namespace: union ---- -# Source: dataplane/templates/propeller/serviceaccount.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: flytepropeller-binding -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: flytepropeller-role + kind: Role + name: proxy-system subjects: - kind: ServiceAccount - name: flytepropeller-system + name: union-system namespace: union --- -# Source: dataplane/templates/operator/serviceaccount-proxy-secret.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - name: proxy-system-secret - namespace: union - labels: - app.kubernetes.io/name: operator-proxy - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm -rules: - - apiGroups: - - '*' - resources: - - secrets - verbs: - - get - - list - - create - - update - - delete ---- # Source: dataplane/templates/operator/serviceaccount.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: Role +kind: RoleBinding metadata: name: operator-system labels: @@ -4731,56 +4902,29 @@ metadata: app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name 
app.kubernetes.io/managed-by: Helm -rules: - - apiGroups: - - '*' - resources: - - secrets - - deployments - verbs: - - get - - list - - watch - - create - - update ---- -# Source: dataplane/templates/operator/serviceaccount-proxy-secret.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: proxy-system-secret - namespace: union - labels: - app.kubernetes.io/name: operator-proxy - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm roleRef: apiGroup: rbac.authorization.k8s.io kind: Role - name: proxy-system-secret + name: operator-system subjects: - kind: ServiceAccount - name: proxy-system + name: union-system namespace: union --- -# Source: dataplane/templates/operator/serviceaccount.yaml -apiVersion: rbac.authorization.k8s.io/v1 +# Source: dataplane/templates/webhook/serviceaccount.yaml +# Create a binding from Role -> ServiceAccount kind: RoleBinding +apiVersion: rbac.authorization.k8s.io/v1 metadata: - name: operator-system - labels: - app.kubernetes.io/name: union-operator - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm + name: union-webhook-binding + namespace: union roleRef: apiGroup: rbac.authorization.k8s.io kind: Role - name: operator-system + name: union-webhook-role subjects: - kind: ServiceAccount - name: operator-system + name: union-system namespace: union --- # Source: dataplane/charts/fluentbit/templates/service.yaml @@ -4806,20 +4950,60 @@ spec: app.kubernetes.io/name: fluentbit app.kubernetes.io/instance: release-name --- -# Source: dataplane/charts/kube-state-metrics/templates/service.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v1 +kind: Service +metadata: + labels: + role: operator-webhook + app.kubernetes.io/component: operator-webhook + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator + name: operator-webhook + namespace: "union" +spec: + ports: + # Define metrics and profiling for them to be accessible within service meshes. + - name: http-metrics + port: 9090 + targetPort: 9090 + - name: http-profiling + port: 8008 + targetPort: 8008 + - name: https-webhook + port: 443 + targetPort: 8443 + selector: + role: operator-webhook +--- +# Source: dataplane/charts/prometheus/charts/kube-state-metrics/templates/service.yaml apiVersion: v1 kind: Service metadata: name: release-name-kube-state-metrics namespace: union labels: - helm.sh/chart: kube-state-metrics-5.30.1 + helm.sh/chart: kube-state-metrics-5.25.1 app.kubernetes.io/managed-by: Helm app.kubernetes.io/component: metrics app.kubernetes.io/part-of: kube-state-metrics app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" + app.kubernetes.io/version: "2.13.0" annotations: prometheus.io/scrape: 'true' spec: @@ -4834,28 +5018,32 @@ spec: app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/instance: release-name --- -# Source: dataplane/charts/opencost/templates/service.yaml +# Source: dataplane/charts/prometheus/templates/service.yaml apiVersion: v1 kind: Service metadata: - name: release-name-opencost - namespace: union labels: - helm.sh/chart: opencost-1.42.0 - app.kubernetes.io/name: opencost + 
app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "1.111.0" - app.kubernetes.io/part-of: opencost + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: prometheus + name: union-operator-prometheus + namespace: union spec: + ports: + - name: http + port: 80 + protocol: TCP + targetPort: 9090 selector: - app.kubernetes.io/name: opencost + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name + sessionAffinity: None type: "ClusterIP" - ports: - - name: http - port: 9003 - targetPort: 9003 --- # Source: dataplane/templates/clusterresourcesync/service.yaml apiVersion: v1 @@ -4879,6 +5067,31 @@ spec: app.kubernetes.io/name: clusterresourcesync app.kubernetes.io/instance: release-name --- +# Source: dataplane/templates/flyteconnector/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: flyteconnector + namespace: union + labels: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + app.kubernetes.io/managed-by: Helm +spec: + clusterIP: None + ports: + - name: grpc + port: 8000 + protocol: TCP + targetPort: grpc + - name: metric + port: 9090 + protocol: TCP + targetPort: metric + selector: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name +--- # Source: dataplane/templates/imagebuilder/service.yaml apiVersion: v1 kind: Service @@ -4904,7 +5117,7 @@ spec: apiVersion: v1 kind: Service metadata: - name: release-name-dataplane-executor + name: union-operator-executor labels: platform.union.ai/prometheus-group: "union-services" app: executor @@ -4970,39 +5183,14 @@ spec: app.kubernetes.io/name: union-operator app.kubernetes.io/instance: release-name --- -# Source: dataplane/templates/prometheus/service.yaml -apiVersion: v1 -kind: Service -metadata: - name: 
union-operator-prometheus - namespace: union - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus -spec: - type: ClusterIP - ports: - - port: 80 - targetPort: 9090 - protocol: TCP - name: http - selector: - app.kubernetes.io/component: prometheus - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name ---- -# Source: dataplane/templates/propeller/service-webhook.yaml +# Source: dataplane/templates/webhook/service.yaml apiVersion: v1 kind: Service metadata: - name: flyte-pod-webhook + name: union-pod-webhook namespace: union labels: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm @@ -5010,7 +5198,7 @@ metadata: projectcontour.io/upstream-protocol.h2c: grpc spec: selector: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name ports: - name: https @@ -5022,23 +5210,23 @@ spec: port: 10254 targetPort: 10254 --- -# Source: dataplane/templates/propeller/service-webhook.yaml +# Source: dataplane/templates/webhook/service.yaml # Headless Service for cache invalidation — resolves to all pod IPs so that # we can fan out invalidation requests to every webhook replica. 
apiVersion: v1 kind: Service metadata: - name: flyte-pod-webhook-headless + name: union-pod-webhook-headless namespace: union labels: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm spec: clusterIP: None selector: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name ports: - name: cache-internal @@ -5046,32 +5234,6 @@ spec: port: 9443 targetPort: 9443 --- -# Source: dataplane/templates/propeller/service.yaml -apiVersion: v1 -kind: Service -metadata: - namespace: union - name: flytepropeller - labels: - platform.union.ai/prometheus-group: "union-services" - app.kubernetes.io/name: flytepropeller - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm -spec: - type: ClusterIP - ports: - - name: debug - protocol: TCP - port: 10254 - - name: fasttask - port: 15605 - protocol: TCP - targetPort: 15605 - selector: - app.kubernetes.io/name: flytepropeller - app.kubernetes.io/instance: release-name ---- # Source: dataplane/charts/fluentbit/templates/daemonset.yaml apiVersion: apps/v1 kind: DaemonSet @@ -5149,20 +5311,217 @@ spec: tolerations: - operator: Exists --- -# Source: dataplane/charts/kube-state-metrics/templates/deployment.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: operator-webhook + namespace: "union" + labels: + app.kubernetes.io/component: operator-webhook + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +spec: + selector: + matchLabels: + app: operator-webhook + role: operator-webhook + template: + metadata: + labels: + app: operator-webhook + role: operator-webhook + app.kubernetes.io/component: operator-webhook + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator + sidecar.istio.io/inject: "false" + spec: + # To avoid node becoming SPOF, spread our replicas to different nodes. + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - podAffinityTerm: + labelSelector: + matchLabels: + app: operator-webhook + topologyKey: kubernetes.io/hostname + weight: 100 + serviceAccountName: operator-webhook + containers: + - name: operator-webhook + # This is the Go import path for the binary that is containerized + # and substituted here. 
+ image: gcr.io/knative-releases/knative.dev/operator/cmd/webhook@sha256:d3a7a3304629ccfcaacec620b50736e0b4c902a6328b83369b6cbaf7e94677c9 + resources: + requests: + cpu: 100m + memory: 100Mi + limits: + cpu: 500m + memory: 500Mi + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: SYSTEM_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: CONFIG_LOGGING_NAME + value: config-logging + - name: CONFIG_OBSERVABILITY_NAME + value: config-observability + - name: WEBHOOK_NAME + value: operator-webhook + - name: WEBHOOK_PORT + value: "8443" + - name: WEBHOOK_SECRET_NAME + value: operator-webhook-certs + - name: METRICS_DOMAIN + value: knative.dev/operator + - name: KUBERNETES_MIN_VERSION + value: "" + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + runAsNonRoot: true + capabilities: + drop: + - ALL + ports: + - name: metrics + containerPort: 9090 + - name: profiling + containerPort: 8008 + - name: https-webhook + containerPort: 8443 + readinessProbe: + periodSeconds: 1 + httpGet: + scheme: HTTPS + port: 8443 + httpHeaders: + - name: k-kubelet-probe + value: "webhook" + livenessProbe: + periodSeconds: 1 + httpGet: + scheme: HTTPS + port: 8443 + httpHeaders: + - name: k-kubelet-probe + value: "webhook" + failureThreshold: 6 + initialDelaySeconds: 120 + # Our webhook should gracefully terminate by lame ducking first, set this to a sufficiently + # high value that we respect whatever value it has configured for the lame duck grace period. + terminationGracePeriodSeconds: 300 +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: knative-operator + namespace: "union" + labels: + app.kubernetes.io/name: knative-operator + app.kubernetes.io/version: "1.16.0" +spec: + replicas: 1 + selector: + matchLabels: + name: knative-operator + template: + metadata: + labels: + name: knative-operator + app.kubernetes.io/name: knative-operator + app.kubernetes.io/version: "1.16.0" + sidecar.istio.io/inject: "false" + spec: + serviceAccountName: knative-operator + containers: + - name: knative-operator + image: gcr.io/knative-releases/knative.dev/operator/cmd/operator@sha256:0b5a3532417f9c8e7b6044e23f6f67ad932a74a40a4092ad965b6f173b2fd887 + resources: + requests: + cpu: 100m + memory: 100Mi + limits: + cpu: 1000m + memory: 1000Mi + imagePullPolicy: IfNotPresent + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: SYSTEM_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: METRICS_DOMAIN + value: knative.dev/operator + - name: CONFIG_LOGGING_NAME + value: config-logging + - name: CONFIG_OBSERVABILITY_NAME + value: config-observability + - name: KUBERNETES_MIN_VERSION + value: "" + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + runAsNonRoot: true + capabilities: + drop: + - ALL + ports: + - name: metrics + containerPort: 9090 +--- +# Source: dataplane/charts/prometheus/charts/kube-state-metrics/templates/deployment.yaml apiVersion: apps/v1 kind: Deployment metadata: name: release-name-kube-state-metrics namespace: union labels: - helm.sh/chart: 
kube-state-metrics-5.30.1 + helm.sh/chart: kube-state-metrics-5.25.1 app.kubernetes.io/managed-by: Helm app.kubernetes.io/component: metrics app.kubernetes.io/part-of: kube-state-metrics app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" + app.kubernetes.io/version: "2.13.0" spec: selector: matchLabels: @@ -5175,13 +5534,13 @@ spec: template: metadata: labels: - helm.sh/chart: kube-state-metrics-5.30.1 + helm.sh/chart: kube-state-metrics-5.25.1 app.kubernetes.io/managed-by: Helm app.kubernetes.io/component: metrics app.kubernetes.io/part-of: kube-state-metrics app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" + app.kubernetes.io/version: "2.13.0" spec: automountServiceAccountToken: true hostNetwork: false @@ -5198,8 +5557,9 @@ spec: args: - --port=8080 - --resources=certificatesigningrequests,configmaps,cronjobs,daemonsets,deployments,endpoints,horizontalpodautoscalers,ingresses,jobs,leases,limitranges,mutatingwebhookconfigurations,namespaces,networkpolicies,nodes,persistentvolumeclaims,persistentvolumes,poddisruptionbudgets,pods,replicasets,replicationcontrollers,resourcequotas,secrets,services,statefulsets,storageclasses,validatingwebhookconfigurations,volumeattachments + - --namespaces=union imagePullPolicy: IfNotPresent - image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.15.0 + image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.13.0 ports: - containerPort: 8080 name: "http" @@ -5219,7 +5579,7 @@ spec: httpGet: httpHeaders: path: /readyz - port: 8081 + port: 8080 scheme: HTTP initialDelaySeconds: 5 periodSeconds: 10 @@ -5234,202 +5594,204 @@ spec: - ALL readOnlyRootFilesystem: true --- -# Source: dataplane/charts/opencost/templates/deployment.yaml +# Source: dataplane/charts/prometheus/templates/deploy.yaml apiVersion: apps/v1 kind: Deployment metadata: - name: release-name-opencost - namespace: 
union labels: - helm.sh/chart: opencost-1.42.0 - app.kubernetes.io/name: opencost + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "1.111.0" - app.kubernetes.io/part-of: opencost + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: prometheus + name: union-operator-prometheus + namespace: union spec: - replicas: 1 selector: matchLabels: - app.kubernetes.io/name: opencost + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name - strategy: - rollingUpdate: - maxSurge: 1 - maxUnavailable: 1 - type: RollingUpdate + replicas: 1 + revisionHistoryLimit: 10 + strategy: + type: Recreate + rollingUpdate: null template: metadata: labels: - app.kubernetes.io/name: opencost + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: prometheus spec: - serviceAccountName: release-name-opencost + enableServiceLinks: true + serviceAccountName: union-operator-prometheus containers: - - name: release-name-opencost - image: ghcr.io/opencost/opencost:1.111.0@sha256:6aa68e52a24b14ba41f23db08d1b9db1429a1c0300f4c0381ecc2c61fc311a97 - imagePullPolicy: IfNotPresent + - name: prometheus-server-configmap-reload + image: "quay.io/prometheus-operator/prometheus-config-reloader:v0.76.0" + imagePullPolicy: "IfNotPresent" args: + - --watched-dir=/etc/config + - --listen-address=0.0.0.0:8080 + - --reload-url=http://127.0.0.1:9090/prometheus/-/reload ports: - - containerPort: 9003 - name: http - resources: - limits: - cpu: 1000m - memory: 4Gi - requests: - cpu: 500m - memory: 1Gi - startupProbe: - httpGet: - path: /healthz - port: 9003 - initialDelaySeconds: 10 - 
periodSeconds: 5 - failureThreshold: 30 + - containerPort: 8080 + name: metrics livenessProbe: httpGet: path: /healthz - port: 9003 - initialDelaySeconds: 10 - periodSeconds: 20 - failureThreshold: 3 + port: metrics + scheme: HTTP + initialDelaySeconds: 2 + periodSeconds: 10 readinessProbe: httpGet: path: /healthz - port: 9003 - initialDelaySeconds: 10 + port: metrics + scheme: HTTP periodSeconds: 10 + volumeMounts: + - name: config-volume + mountPath: /etc/config + readOnly: true + + - name: prometheus-server + image: "quay.io/prometheus/prometheus:v2.54.1" + imagePullPolicy: "IfNotPresent" + args: + - --storage.tsdb.retention.time=3d + - --config.file=/etc/config/prometheus.yml + - --storage.tsdb.path=/data + - --web.console.libraries=/etc/prometheus/console_libraries + - --web.console.templates=/etc/prometheus/consoles + - --web.enable-lifecycle + - --web.route-prefix=/prometheus + - --web.external-url=/prometheus/ + ports: + - containerPort: 9090 + readinessProbe: + httpGet: + path: /prometheus/-/ready + port: 9090 + scheme: HTTP + initialDelaySeconds: 30 + periodSeconds: 5 + timeoutSeconds: 4 failureThreshold: 3 - env: - - name: LOG_LEVEL - value: info - - name: CUSTOM_COST_ENABLED - value: "false" - - name: KUBECOST_NAMESPACE - value: union - - name: API_PORT - value: "9003" - - name: PROMETHEUS_SERVER_ENDPOINT - value: "http://union-operator-prometheus.union.svc:80/prometheus" - - name: CLUSTER_ID - value: "default-cluster" - - name: DATA_RETENTION_DAILY_RESOLUTION_DAYS - value: "15" - - name: CLOUD_COST_ENABLED - value: "false" - - name: CLOUD_COST_MONTH_TO_DATE_INTERVAL - value: "6" - - name: CLOUD_COST_REFRESH_RATE_HOURS - value: "6" - - name: CLOUD_COST_QUERY_WINDOW_DAYS - value: "7" - - name: CLOUD_COST_RUN_WINDOW_DAYS - value: "3" - # Add any additional provided variables + successThreshold: 1 + livenessProbe: + httpGet: + path: /prometheus/-/healthy + port: 9090 + scheme: HTTP + initialDelaySeconds: 30 + periodSeconds: 15 + timeoutSeconds: 10 + 
failureThreshold: 3 + successThreshold: 1 + resources: + limits: + cpu: "3" + memory: 3500Mi + requests: + cpu: "1" + memory: 1Gi + volumeMounts: + - name: config-volume + mountPath: /etc/config + - name: storage-volume + mountPath: /data + subPath: "" + - name: server-recording-rules + mountPath: /etc/config/recording + subPath: + readOnly: + dnsPolicy: ClusterFirst + securityContext: + fsGroup: 65534 + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + terminationGracePeriodSeconds: 300 + volumes: + - name: config-volume + configMap: + name: union-operator-prometheus + - name: server-recording-rules + configMap: + name: union-recording-rules + - name: storage-volume + emptyDir: + sizeLimit: 10Gi --- -# Source: dataplane/templates/clusterresourcesync/deployment.yaml +# Source: dataplane/templates/flyteconnector/deployment.yaml apiVersion: apps/v1 kind: Deployment metadata: - name: union-syncresources + name: flyteconnector namespace: union - labels: - app.kubernetes.io/name: clusterresourcesync + labels: + app.kubernetes.io/name: flyteconnector app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/name: clusterresourcesync - app.kubernetes.io/instance: release-name - template: - metadata: - annotations: - configChecksum: "c13a554b9a39b142cd7fe140c69cc4a74b036b91594b6a122650fa05f1e1bce" - - labels: - platform.union.ai/zone: "dataplane" - - azure.workload.identity/use: "true" - app.kubernetes.io/name: clusterresourcesync - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm - spec: - containers: - - command: - - clusterresource - - --config - - /etc/flyte/config/*.yaml - - clusterresource - - run - env: - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_NAMESPACE - valueFrom: - fieldRef: - fieldPath: 
metadata.namespace - - name: GOMEMLIMIT - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.memory - - name: GOMAXPROCS - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.cpu - - name: CLUSTER_NAME - valueFrom: - secretKeyRef: - name: operator-cluster-name - key: cluster_name - - name: DEPLOYMENT_NAME - value: operator - - name: PROXY_SERVICE_URL - value: http://union-operator-proxy:8080 - - name: PROMETHEUS_SERVICE_URL - value: http://union-operator-prometheus:80 - - name: KNATIVE_PROXY_SERVICE_URL - value: http://kourier-internal - image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" + replicas: 2 + selector: + matchLabels: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + template: + metadata: + annotations: + labels: + platform.union.ai/zone: "dataplane" + + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + app.kubernetes.io/managed-by: Helm + spec: + containers: + - command: + - c0 + image: "ghcr.io/flyteorg/flyte-connectors:py3.13-2.0.0b50.dev3-g695bb1db3.d20260122" imagePullPolicy: "IfNotPresent" - name: sync-cluster-resources + name: flyteconnector + readinessProbe: + failureThreshold: 3 + httpGet: + path: / + port: 9090 + scheme: HTTP + initialDelaySeconds: 5 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 5 + env: + - name: AIRFLOW_CONN_GOOGLE_CLOUD_DEFAULT + value: "google-cloud-platform://" + - name: LOG_LEVEL + value: "20" + ports: + - containerPort: 8000 + name: grpc + - containerPort: 9090 + name: metric resources: limits: - cpu: "1" - memory: 500Mi + cpu: "1.5" + ephemeral-storage: 100Mi + memory: 1500Mi requests: - cpu: 500m - memory: 100Mi - volumeMounts: - - name: auth - mountPath: /etc/union/secret - - name: resource-templates - mountPath: /etc/flyte/clusterresource/templates - - name: config-volume - mountPath: /etc/flyte/config - ports: - - name: debug - containerPort: 10254 - protocol: TCP - serviceAccountName: 
union-clustersync-system - volumes: - - configMap: - name: union-clusterresource-template - name: resource-templates - - configMap: - name: union-clusterresourcesync-config - name: config-volume - - name: auth - secret: - secretName: union-secret-auth + cpu: "1" + ephemeral-storage: 100Mi + memory: 1000Mi + serviceAccountName: flyteconnector --- # Source: dataplane/templates/imagebuilder/deployment.yaml apiVersion: apps/v1 @@ -5451,17 +5813,15 @@ spec: app.kubernetes.io/instance: release-name template: metadata: - annotations: - container.apparmor.security.beta.kubernetes.io/buildkit: unconfined labels: platform.union.ai/zone: "dataplane" app.kubernetes.io/name: imagebuilder-buildkit app.kubernetes.io/instance: release-name spec: - serviceAccountName: "union-imagebuilder" + serviceAccountName: "union-system" containers: - name: "buildkit" - image: "docker.io/moby/buildkit:buildx-stable-1-rootless" + image: "docker.io/moby/buildkit:buildx-stable-1" imagePullPolicy: IfNotPresent env: - name: POD_NAME @@ -5496,18 +5856,15 @@ spec: - name: KNATIVE_PROXY_SERVICE_URL value: http://kourier-internal volumeMounts: - - mountPath: /home/user/.local/share/buildkit - name: buildkitd - mountPath: /etc/buildkit name: buildkit-config args: - --config - /etc/buildkit/buildkitd.toml - --addr - - unix:///run/user/1000/buildkit/buildkitd.sock + - unix:///run//buildkit/buildkitd.sock - --addr - tcp://0.0.0.0:1234 - - --oci-worker-no-process-sandbox ports: - name: tcp containerPort: 1234 @@ -5529,18 +5886,13 @@ spec: initialDelaySeconds: 5 periodSeconds: 30 securityContext: - seccompProfile: # Needs Kubernetes >= 1.19 - type: Unconfined - runAsUser: 1000 - runAsGroup: 1000 + privileged: true resources: requests: - cpu: 1 - ephemeral-storage: 20Gi - memory: 1Gi + cpu: 4 + ephemeral-storage: 50Gi + memory: 4Gi volumes: - - name: buildkitd - emptyDir: {} - configMap: name: union-operator-buildkit name: buildkit-config @@ -5569,17 +5921,19 @@ spec: template: metadata: annotations: - 
configChecksum: "50f47b0370129a5e8a53e23a80f621993e6d636aead636080486321b581b557" + configChecksum: "43c19f26de413b1c595d602a570f523cd7f73e26397a9baeb19033c4b1651df" labels: platform.union.ai/zone: "dataplane" azure.workload.identity/use: "true" app: executor + app.kubernetes.io/instance: 'release-name' + app.kubernetes.io/name: executor spec: securityContext: fsGroup: 1337 - serviceAccountName: executor + serviceAccountName: union-system volumes: - name: config-volume configMap: @@ -5595,7 +5949,7 @@ spec: image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" imagePullPolicy: IfNotPresent command: - - executor + - executorv2 - serve - --config - /etc/config/*.yaml @@ -5673,7 +6027,7 @@ spec: template: metadata: annotations: - configChecksum: "2cef92ccb9dbbf62c406577ead9d8ceaca9f66f109e3f4d02bac5c41eda9373" + configChecksum: "24634feef8600f3ccf6cb02dcb3860eb7cd4b9f1ffd973eb9eff97f5e2756b3" labels: @@ -5689,12 +6043,10 @@ spec: sources: - configMap: name: union-operator - - configMap: - name: union-clusterresourcesync-config - name: secret-volume secret: secretName: union-secret-auth - serviceAccountName: proxy-system + serviceAccountName: union-system securityContext: {} containers: @@ -5811,7 +6163,7 @@ spec: template: metadata: annotations: - configChecksum: "2cef92ccb9dbbf62c406577ead9d8ceaca9f66f109e3f4d02bac5c41eda9373" + configChecksum: "24634feef8600f3ccf6cb02dcb3860eb7cd4b9f1ffd973eb9eff97f5e2756b3" labels: @@ -5821,7 +6173,7 @@ spec: platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm spec: - serviceAccountName: operator-system + serviceAccountName: union-system securityContext: {} volumes: @@ -5902,81 +6254,16 @@ spec: containerPort: 10254 protocol: TCP --- -# Source: dataplane/templates/prometheus/deployment.yaml -apiVersion: apps/v1 -kind: Deployment -metadata: - name: union-operator-prometheus - namespace: union - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane - 
app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus -spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/component: prometheus - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - template: - metadata: - annotations: - configChecksum: "72a39ca838f02fc499675b5708fa77e2a4938e316a99315b59e33ef7f2d31ae" - labels: - app.kubernetes.io/component: prometheus - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - spec: - priorityClassName: system-cluster-critical - serviceAccountName: union-operator-prometheus - securityContext: - fsGroup: 65534 - runAsNonRoot: true - runAsUser: 65534 - fsGroupChangePolicy: OnRootMismatch - containers: - - name: prometheus - image: "prom/prometheus:v3.3.1" - args: - - --config.file=/etc/prometheus/prometheus.yml - - --web.external-url=/prometheus/ - - --web.route-prefix=/prometheus/ - - --storage.tsdb.retention.time=3d - ports: - - name: http - containerPort: 9090 - protocol: TCP - resources: - limits: - cpu: "3" - memory: 3500Mi - requests: - cpu: "1" - memory: 1Gi - securityContext: - allowPrivilegeEscalation: false - terminationMessagePolicy: FallbackToLogsOnError - volumeMounts: - - mountPath: /etc/prometheus - name: prometheus-config - volumes: - - name: prometheus-config - configMap: - name: union-operator-prometheus ---- -# Source: dataplane/templates/propeller/deployment-webhook.yaml +# Source: dataplane/templates/webhook/deployment.yaml +# Webhook deployment # Create the actual deployment apiVersion: apps/v1 kind: Deployment metadata: - name: flytepropeller-webhook + name: union-pod-webhook namespace: union labels: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm @@ -5984,7 
+6271,7 @@ spec: replicas: 1 selector: matchLabels: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name template: metadata: @@ -5992,12 +6279,12 @@ spec: platform.union.ai/zone: "dataplane" azure.workload.identity/use: "true" - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm annotations: - configChecksum: "a8328980e66ca964e6d674436aa9bc7036a50c23d3322a6ea05b815388b72c0" + configChecksum: "47b0ca630b85f3918c4871e21cdd2c7cd448b05b7ef3eca9abc7e1490808e1e" spec: securityContext: @@ -6005,64 +6292,8 @@ spec: fsGroupChangePolicy: Always runAsNonRoot: true runAsUser: 1001 - seLinuxOptions: - type: spc_t - serviceAccountName: flytepropeller-webhook-system - initContainers: - - name: generate-secrets - image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" - imagePullPolicy: "IfNotPresent" - command: - - flytepropeller - args: - - webhook - - init-certs - - --config - - /etc/flyte/config/*.yaml - env: - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: GOMEMLIMIT - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.memory - - name: GOMAXPROCS - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.cpu - - name: CLUSTER_NAME - valueFrom: - secretKeyRef: - name: operator-cluster-name - key: cluster_name - - name: DEPLOYMENT_NAME - value: operator - - name: PROXY_SERVICE_URL - value: http://union-operator-proxy:8080 - - name: PROMETHEUS_SERVICE_URL - value: http://union-operator-prometheus:80 - - name: KNATIVE_PROXY_SERVICE_URL - value: http://kourier-internal - volumeMounts: - - name: config-volume - mountPath: /etc/flyte/config - resources: - limits: - cpu: 1 - ephemeral-storage: 500Mi - memory: 500Mi - requests: 
- cpu: 200m - ephemeral-storage: 500Mi - memory: 500Mi + seLinuxOptions: null + serviceAccountName: union-system containers: - name: webhook image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" @@ -6127,108 +6358,301 @@ spec: volumes: - name: config-volume configMap: - name: flyte-propeller-config + name: union-pod-webhook-config - name: webhook-certs secret: - secretName: flyte-pod-webhook + secretName: union-pod-webhook --- -# Source: dataplane/templates/propeller/deployment.yaml -apiVersion: apps/v1 -kind: Deployment +# Source: dataplane/templates/flyteconnector/hpa.yaml +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler metadata: - namespace: union - name: flytepropeller + name: flyteconnector + labels: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + app.kubernetes.io/managed-by: Helm +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: flyteconnector + minReplicas: 2 + maxReplicas: 5 + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 80 + - type: Resource + resource: + name: memory + target: + type: Utilization + averageUtilization: 80 +--- +# Source: dataplane/templates/webhook/mutatingwebhookconfiguration.yaml +apiVersion: admissionregistration.k8s.io/v1 +kind: MutatingWebhookConfiguration +metadata: + name: union-pod-webhook-test-org labels: - app.kubernetes.io/name: flytepropeller + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm +webhooks: + - name: union-pod-webhook.flyte.org + admissionReviewVersions: + - v1 + - v1beta1 + clientConfig: + caBundle: 
LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUROekNDQWgrZ0F3SUJBZ0lVSm9kQ09lem4vd1BHQTZjYkw1U3duSlpYT2hJd0RRWUpLb1pJaHZjTkFRRUwKQlFBd0t6RXBNQ2NHQTFVRUF3d2dabXg1ZEdWd2NtOXdaV3hzWlhJdGQyVmlhRzl2YXk1MWJtbHZiaTV6ZG1NdwpIaGNOTWpZd01USXpNREl6TmpVMVdoY05Nell3TVRJeE1ESXpOalUxV2pBck1Ta3dKd1lEVlFRRERDQm1iSGwwClpYQnliM0JsYkd4bGNpMTNaV0pvYjI5ckxuVnVhVzl1TG5OMll6Q0NBU0l3RFFZSktvWklodmNOQVFFQkJRQUQKZ2dFUEFEQ0NBUW9DZ2dFQkFJcjBSWDRGRjFNVTNYdFBmUHErMDUrMGI1Qm0xK3hsTVR5ZUp4bG5mWFJuRlgyegpPcHV6NmsreUpBQk4xRkxmNDVYRnlJclFlaW5DWWJtckZXalFRc3ZDWEloNkpPNGMwdmRMVVU5RERYcldWRFlsCkRNZytTUDlJVnZBMm5USkVFZ2RFREVsKzNWeThUbUJoNlNmcGthcXdHajY5SmhIdy9OeE9yZzM2bUY3TU5VZ1MKWXJVUkxqUFFzTUN6NElTVzhhQnEzS2IwRUFWUTA4V1lZQ0ZEbjY1aHBzVFdxR05rL3BKRnVTYjlsZll6RVdyYwpJeW5wNDhqTnIvNUdKS3NnWG9LeTUwZVo4UkppeTVyOUhENGtnSW15TVNQR2RDMXFZSVppRHRlZW8yOElxNk5nCmd1dWcyTGhlNndYYW9YZXFCQ2d3SHpwWFVXM1hJVzI5eFoxUkFRRUNBd0VBQWFOVE1GRXdIUVlEVlIwT0JCWUUKRktITDQ1RWhQaVg5bnlRN241QUk0YUZVNFlCL01COEdBMVVkSXdRWU1CYUFGS0hMNDVFaFBpWDlueVE3bjVBSQo0YUZVNFlCL01BOEdBMVVkRXdFQi93UUZNQU1CQWY4d0RRWUpLb1pJaHZjTkFRRUxCUUFEZ2dFQkFEeTRac21XCkNjZ3B5TmI4cU1ydzNYclBNejZLMTRlTVhqQVk1bERFYXM1NklLZExoZFM1bi9sWk95bENuUUtOZ3RHS0VrY3UKT05tMEE2Z0xNT3UzcE43dG9aNXh0WktOcnUvS0x6dm44QUNNNndCbUR0bG5oRERDa3ZUTHBVRzh6UTZSNmlaegpzTWlhVmpLSUJiWkJsZXNYSHZZQnQ2YlRTN2o1QWM2aTdYc1RPK3ZqQ2RUQjVYdCtEZ0x2QnBJRWh2ZGJzNXhmClFtc3ZzQmwxenJnR3JXemo4d0txcXNUMzZaOVU1TzdSK1RJMlQrUDVOR25SeHNQOGJ0R0RtMGZ5SW55WlJrSkcKVTdpVFJyMlZRWlRyaTNZQ2dlVjN1UjFtMnJjc2hyU1Avb082T0QyTXR5Q1FMRWtFK1pxVUVJVnpSYmVaNXl2RQptM29YMFV2OUtBUy9pWTA9Ci0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K + service: + name: union-pod-webhook + namespace: union + path: /mutate--v1-pod/secrets + port: 443 + failurePolicy: Fail + matchPolicy: Equivalent + namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: union + objectSelector: + matchLabels: + inject-flyte-secrets: "true" + organization: 'test-org' + reinvocationPolicy: Never + rules: + - apiGroups: + - '*' + apiVersions: + - v1 + operations: + - CREATE + 
resources: + - pods + scope: '*' + sideEffects: NoneOnDryRun + timeoutSeconds: 30 +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Imported via https://github.com/knative/operator/releases/download/knative-v1.16.0/operator.yaml +# Copyright 2024 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +# Source: dataplane/templates/propeller/serviceaccount.yaml +--- +--- +# Source: dataplane/templates/serving/knative-serving.yaml +apiVersion: operator.knative.dev/v1beta1 +kind: KnativeServing +metadata: + name: union-operator-serving + namespace: union + labels: + helm.sh/chart: dataplane-2026.4.4 + app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: serving spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/name: flytepropeller + config: + deployment: + progress-deadline: "30m" + queue-sidecar-cpu-request: "25m" + queue-sidecar-cpu-limit: "1000m" + queue-sidecar-memory-request: "400Mi" + queue-sidecar-memory-limit: "800Mi" + queue-sidecar-ephemeral-storage-request: "512Mi" + queue-sidecar-ephemeral-storage-limit: "1024Mi" + registries-skipping-tag-resolving: managed.cr.union.ai + features: + kubernetes.podspec-affinity: "enabled" + kubernetes.podspec-nodeselector: "enabled" + kubernetes.podspec-tolerations: "enabled" + kubernetes.podspec-fieldref: "enabled" 
+ kubernetes.podspec-dnspolicy: "enabled" + kubernetes.podspec-schedulername: "enabled" + kubernetes.podspec-securitycontext: "enabled" + network: + ingress-class: "kourier.ingress.networking.knative.dev" + high-availability: + replicas: 2 + ingress: + kourier: + enabled: true + bootstrap-configmap: "union-operator-serving-envoy-bootstrap" + service-type: ClusterIP + podDisruptionBudgets: + - name: 3scale-kourier-gateway-pdb + minAvailable: 50% + - name: activator-pdb + minAvailable: 50% + - name: webhook-pdb + minAvailable: 50% + registry: + override: + # TODO(jeev): Wire up Union fork of Envoy + 3scale-kourier-gateway/kourier-gateway: ghcr.io/unionai/envoy:456fed84d4ad9a9dfb186d117d9362e9dc0f7c1f + # TODO(jeev): Wire up Union fork of Kourier + net-kourier-controller/controller: ghcr.io/unionai/kourier@sha256:5804c348d15b3959604e3e3ceed216c3a1c7b32cbe254c7d3eb02a35e62ba9c4 + workloads: + - name: 3scale-kourier-gateway + labels: + helm.sh/chart: dataplane-2026.4.4 + app.kubernetes.io/name: release-name-dataplane app.kubernetes.io/instance: release-name - template: - metadata: - annotations: - configChecksum: "a8328980e66ca964e6d674436aa9bc7036a50c23d3322a6ea05b815388b72c0" - - labels: - platform.union.ai/zone: "dataplane" - - - azure.workload.identity/use: "true" - app.kubernetes.io/name: flytepropeller - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm - spec: - priorityClassName: system-cluster-critical - containers: - - command: - - flytepropeller - - --config - - /etc/flyte/config/*.yaml - - --propeller.cluster-id - - 'test-azure-cluster' - env: - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: GOMEMLIMIT - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.memory - - name: GOMAXPROCS - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.cpu - - name: 
CLUSTER_NAME - valueFrom: - secretKeyRef: - name: operator-cluster-name - key: cluster_name - - name: DEPLOYMENT_NAME - value: operator - - name: PROXY_SERVICE_URL - value: http://union-operator-proxy:8080 - - name: PROMETHEUS_SERVICE_URL - value: http://union-operator-prometheus:80 - - name: KNATIVE_PROXY_SERVICE_URL - value: http://kourier-internal - image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" - imagePullPolicy: "IfNotPresent" - name: flytepropeller - ports: - - containerPort: 10254 - resources: - limits: - cpu: "3" - memory: 3Gi - requests: - cpu: "1" - memory: 1Gi - volumeMounts: - - name: config-volume - mountPath: /etc/flyte/config - - name: auth - mountPath: /etc/union/secret - serviceAccountName: flytepropeller-system - volumes: - - configMap: - name: flyte-propeller-config - name: config-volume - - name: auth - secret: - secretName: union-secret-auth + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: 3scale-kourier-gateway + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchLabels: + app.kubernetes.io/name: 3scale-kourier-gateway + app.kubernetes.io/instance: release-name + topologyKey: topology.kubernetes.io/zone + annotations: + checksum/bootstrap-config: 6469f6f592eebc9e0c676d8ccc359a05bc799c0988e474b2da942c4cc328f656 + env: + - container: kourier-gateway + envVars: + - name: UNION_AUTHZ_TENANTAUTHURL + value: "https://test.dataplane.union.ai/me" + - name: UNION_AUTHZ_TENANTAUTHSIGNINURL + value: "https://test.dataplane.union.ai/login" + - name: UNION_AUTHZ_TENANTCONTROLPLANEURL + value: "https://test.dataplane.union.ai" + resources: + - container: kourier-gateway + limits: + cpu: "2" + memory: 2Gi + requests: + cpu: "1" + memory: 1Gi + - name: activator + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + 
matchExpressions: + - key: app + operator: In + values: + - activator + topologyKey: topology.kubernetes.io/zone + - name: autoscaler + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - autoscaler + topologyKey: topology.kubernetes.io/zone + - name: autoscaler-hpa + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - autoscaler-hpa + topologyKey: topology.kubernetes.io/zone + - name: controller + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - controller + topologyKey: topology.kubernetes.io/zone + - name: net-kourier-controller + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - net-kourier-controller + topologyKey: topology.kubernetes.io/zone + env: + - container: controller + envVars: + - name: KOURIER_UNION_AUTHZ_ENABLED + value: "true" + resources: + - container: controller + limits: + cpu: "1" + memory: 1Gi + requests: + cpu: 500m + memory: 500Mi + - name: webhook + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - webhook + topologyKey: topology.kubernetes.io/zone +--- +# Source: dataplane/templates/common/task-podtemplate.yaml +apiVersion: v1 +kind: PodTemplate +metadata: + name: task-template + namespace: union +template: + spec: + serviceAccountName: union + containers: + - name: default + image: docker.io/rwgrim/docker-noop --- # Source: 
dataplane/charts/fluentbit/templates/tests/test-connection.yaml apiVersion: v1 diff --git a/tests/generated/dataplane.cost.yaml b/tests/generated/dataplane.cost.yaml index 14378091..21b6c950 100644 --- a/tests/generated/dataplane.cost.yaml +++ b/tests/generated/dataplane.cost.yaml @@ -1,140 +1,139 @@ --- -# Source: dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: flytesnacks-development ---- -# Source: dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: flytesnacks-staging ---- -# Source: dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: flytesnacks-production ---- -# Source: dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: union-health-monitoring-development ---- -# Source: dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: union-health-monitoring-staging ---- -# Source: dataplane/templates/common/namespaces.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ apiVersion: v1 -kind: Namespace +kind: ServiceAccount metadata: - name: union-health-monitoring-production + name: knative-operator + namespace: "union" + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator --- -# Source: dataplane/charts/fluentbit/templates/serviceaccount.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + apiVersion: v1 kind: ServiceAccount metadata: - name: fluentbit-system - namespace: union + name: operator-webhook + namespace: "union" labels: - helm.sh/chart: fluentbit-0.48.9 - app.kubernetes.io/name: fluentbit - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "3.2.8" - app.kubernetes.io/managed-by: Helm + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator --- -# Source: dataplane/charts/kube-state-metrics/templates/serviceaccount.yaml +# Source: dataplane/charts/prometheus/charts/kube-state-metrics/templates/serviceaccount.yaml apiVersion: v1 kind: ServiceAccount automountServiceAccountToken: true metadata: labels: - helm.sh/chart: kube-state-metrics-5.30.1 + helm.sh/chart: kube-state-metrics-5.25.1 app.kubernetes.io/managed-by: Helm app.kubernetes.io/component: metrics app.kubernetes.io/part-of: kube-state-metrics app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" + 
app.kubernetes.io/version: "2.13.0" name: release-name-kube-state-metrics namespace: union --- -# Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: union-clustersync-system - namespace: union ---- -# Source: dataplane/templates/imagebuilder/serviceaccount.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: union-imagebuilder ---- -# Source: dataplane/templates/nodeexecutor/serviceaccount.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: executor - namespace: union - labels: - app: executor ---- -# Source: dataplane/templates/operator/serviceaccount-proxy.yaml +# Source: dataplane/charts/prometheus/templates/serviceaccount.yaml apiVersion: v1 kind: ServiceAccount metadata: - name: proxy-system labels: - app.kubernetes.io/name: operator-proxy + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: prometheus + name: union-operator-prometheus + namespace: union + annotations: + {} --- -# Source: dataplane/templates/operator/serviceaccount.yaml +# Source: dataplane/templates/common/system-serviceaccount.yaml apiVersion: v1 kind: ServiceAccount metadata: - name: operator-system - labels: - app.kubernetes.io/name: union-operator - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm + name: union-system + namespace: union + annotations: --- -# Source: dataplane/templates/prometheus/serviceaccount.yaml +# Source: dataplane/templates/common/union-serviceaccount.yaml apiVersion: v1 kind: ServiceAccount metadata: - name: union-operator-prometheus + name: union namespace: union - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: 
release-name-dataplane - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus + annotations: + eks.amazonaws.com/role-arn: arn:aws:iam::ACCOUNT_ID:role/flyte_project_role +automountServiceAccountToken: true --- -# Source: dataplane/templates/propeller/serviceaccount-webhook.yaml +# Source: dataplane/templates/flyteconnector/serviceaccount.yaml apiVersion: v1 kind: ServiceAccount metadata: - name: flytepropeller-webhook-system + name: flyteconnector namespace: union + labels: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + app.kubernetes.io/managed-by: Helm --- -# Source: dataplane/templates/propeller/serviceaccount.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + apiVersion: v1 -kind: ServiceAccount +kind: Secret metadata: - name: flytepropeller-system - namespace: union + name: operator-webhook-certs + namespace: "union" + labels: + app.kubernetes.io/component: operator-webhook + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +# The data is populated at install time. 
--- # Source: dataplane/templates/common/cluster-secret.yaml apiVersion: v1 @@ -145,151 +144,533 @@ type: Opaque data: cluster_name: --- -# Source: dataplane/templates/propeller/deployment-webhook.yaml -# Create an empty secret that the first propeller pod will populate +# Source: dataplane/templates/webhook/mutatingwebhookconfiguration.yaml apiVersion: v1 kind: Secret metadata: - name: flyte-pod-webhook + name: union-pod-webhook namespace: union + labels: + app.kubernetes.io/name: union-pod-webhook + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm type: Opaque +data: + ca.crt: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUROekNDQWgrZ0F3SUJBZ0lVSm9kQ09lem4vd1BHQTZjYkw1U3duSlpYT2hJd0RRWUpLb1pJaHZjTkFRRUwKQlFBd0t6RXBNQ2NHQTFVRUF3d2dabXg1ZEdWd2NtOXdaV3hzWlhJdGQyVmlhRzl2YXk1MWJtbHZiaTV6ZG1NdwpIaGNOTWpZd01USXpNREl6TmpVMVdoY05Nell3TVRJeE1ESXpOalUxV2pBck1Ta3dKd1lEVlFRRERDQm1iSGwwClpYQnliM0JsYkd4bGNpMTNaV0pvYjI5ckxuVnVhVzl1TG5OMll6Q0NBU0l3RFFZSktvWklodmNOQVFFQkJRQUQKZ2dFUEFEQ0NBUW9DZ2dFQkFJcjBSWDRGRjFNVTNYdFBmUHErMDUrMGI1Qm0xK3hsTVR5ZUp4bG5mWFJuRlgyegpPcHV6NmsreUpBQk4xRkxmNDVYRnlJclFlaW5DWWJtckZXalFRc3ZDWEloNkpPNGMwdmRMVVU5RERYcldWRFlsCkRNZytTUDlJVnZBMm5USkVFZ2RFREVsKzNWeThUbUJoNlNmcGthcXdHajY5SmhIdy9OeE9yZzM2bUY3TU5VZ1MKWXJVUkxqUFFzTUN6NElTVzhhQnEzS2IwRUFWUTA4V1lZQ0ZEbjY1aHBzVFdxR05rL3BKRnVTYjlsZll6RVdyYwpJeW5wNDhqTnIvNUdKS3NnWG9LeTUwZVo4UkppeTVyOUhENGtnSW15TVNQR2RDMXFZSVppRHRlZW8yOElxNk5nCmd1dWcyTGhlNndYYW9YZXFCQ2d3SHpwWFVXM1hJVzI5eFoxUkFRRUNBd0VBQWFOVE1GRXdIUVlEVlIwT0JCWUUKRktITDQ1RWhQaVg5bnlRN241QUk0YUZVNFlCL01COEdBMVVkSXdRWU1CYUFGS0hMNDVFaFBpWDlueVE3bjVBSQo0YUZVNFlCL01BOEdBMVVkRXdFQi93UUZNQU1CQWY4d0RRWUpLb1pJaHZjTkFRRUxCUUFEZ2dFQkFEeTRac21XCkNjZ3B5TmI4cU1ydzNYclBNejZLMTRlTVhqQVk1bERFYXM1NklLZExoZFM1bi9sWk95bENuUUtOZ3RHS0VrY3UKT05tMEE2Z0xNT3UzcE43dG9aNXh0WktOcnUvS0x6dm44QUNNNndCbUR0bG5oRERDa3ZUTHBVRzh6UTZSNmlaegpzTWlhVmpLSUJiWkJsZXNYSHZZQnQ2YlRTN2o1QWM2aTdYc1RPK3ZqQ2RUQjVYdCtEZ0x2QnBJRWh2ZGJzNXhmC
lFtc3ZzQmwxenJnR3JXemo4d0txcXNUMzZaOVU1TzdSK1RJMlQrUDVOR25SeHNQOGJ0R0RtMGZ5SW55WlJrSkcKVTdpVFJyMlZRWlRyaTNZQ2dlVjN1UjFtMnJjc2hyU1Avb082T0QyTXR5Q1FMRWtFK1pxVUVJVnpSYmVaNXl2RQptM29YMFV2OUtBUy9pWTA9Ci0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K + tls.crt: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUQxakNDQXI2Z0F3SUJBZ0lVREtRN3A4MFpIdkQyUE5Ya3p3VGRLem9iUjg4d0RRWUpLb1pJaHZjTkFRRUwKQlFBd0t6RXBNQ2NHQTFVRUF3d2dabXg1ZEdWd2NtOXdaV3hzWlhJdGQyVmlhRzl2YXk1MWJtbHZiaTV6ZG1NdwpIaGNOTWpZd01USXpNREl6TmpVMVdoY05NamN3TVRJek1ESXpOalUxV2pBck1Ta3dKd1lEVlFRRERDQm1iSGwwClpYQnliM0JsYkd4bGNpMTNaV0pvYjI5ckxuVnVhVzl1TG5OMll6Q0NBU0l3RFFZSktvWklodmNOQVFFQkJRQUQKZ2dFUEFEQ0NBUW9DZ2dFQkFMNnMxUTRaZ3ZxVVdNdFp5VC9EKytEQWl6WmR0Q1lkVzArSGk4S3phQ1Vjbi9zZwo3a21NTEhDM2ZlZWtZK2ZUNENHUTd6K1l2WjRWWSt6WGljKzdrNndLbUZqdm1acEM1Rm96OWo1Q21NRk5HZmlWCnRobzVvSU9MUkIzNXRYMU1ROHUwNFVzT29QcjdMK016eEdxWXVmTFdtN1NVdjh0eEp3ZFMxZVlzMllNV1pMeU4KekI4akdWY01Qc2ZNVkdxdFhTTVljQ3hNYURKaTM1N0VEZSs3clM2U2ZtbnFtYjJxekdQM1BRRStCUm91Yk5nLwo4NCt4c253VXBLZUFpem95dFM2c2ZlOTdMUEtRTW9oVzd2TjZORFJrLy9xSmpHem9iZmZwUHU3endhWHVXMWtMCmllc2g2cVRrdlZLSTF3WFJ2WHArVU5GbjZzdDIwdEVlRks0ZzMwOENBd0VBQWFPQjhUQ0I3akFKQmdOVkhSTUUKQWpBQU1Bc0dBMVVkRHdRRUF3SUY0RENCa3dZRFZSMFJCSUdMTUlHSWdoWm1iSGwwWlhCeWIzQmxiR3hsY2kxMwpaV0pvYjI5cmdoeG1iSGwwWlhCeWIzQmxiR3hsY2kxM1pXSm9iMjlyTG5WdWFXOXVnaUJtYkhsMFpYQnliM0JsCmJHeGxjaTEzWldKb2IyOXJMblZ1YVc5dUxuTjJZNEl1Wm14NWRHVndjbTl3Wld4c1pYSXRkMlZpYUc5dmF5NTEKYm1sdmJpNXpkbU11WTJ4MWMzUmxjaTVzYjJOaGJEQWRCZ05WSFE0RUZnUVV1d25wcDZ1MTNKL0NsNnI3eDZ1Sgo5YmZ1RDhzd0h3WURWUjBqQkJnd0ZvQVVvY3Zqa1NFK0pmMmZKRHVma0FqaG9WVGhnSDh3RFFZSktvWklodmNOCkFRRUxCUUFEZ2dFQkFFMmlOa1MzczhGT3Z0cDhrVmJNZUN2amxIQytvYTVlZTFGNmowNlZ3bFJPOWdxOXJla1UKMGptWUhNWGNjWjcrZXJkNFRQRXgwK3ZqMzNhRTV3cDFmdVJJL2xIZHhuOUpJRHg2bTFQTHdVa3p1WG9EekYxLwozVXVIWU1xOGkrT2xsY0tqbmdVMUFUNmRWRCtIbFVWeHpMdjlNazdPb05SQUY5dTJHa1hiUHdCN0I1ZGRxdWxlCmtNcXBxbDhES2s4SVNVelVsTjNBb2Q5NmJUelIyV3RCVkxpTi9zRHZtTDZwZThFYm55RnBUb1BZTDlGeWZOcmIKT1ZPZEJwUkZFTG1KOHZaZWY5d1lxRmVXMTE3SGV5VDArZkxGZVNSemZUNVVWUlVraVFBcUVJd
UhHUVRrOVFabgo1WnYvNUF0eEg3WE1rcXR0VE1zMFlSUG0xYjQzY3FBeFdhND0KLS0tLS1FTkQgQ0VSVElGSUNBVEUtLS0tLQo= + tls.key: LS0tLS1CRUdJTiBQUklWQVRFIEtFWS0tLS0tCk1JSUV2UUlCQURBTkJna3Foa2lHOXcwQkFRRUZBQVNDQktjd2dnU2pBZ0VBQW9JQkFRQytyTlVPR1lMNmxGakwKV2NrL3cvdmd3SXMyWGJRbUhWdFBoNHZDczJnbEhKLzdJTzVKakN4d3QzM25wR1BuMCtBaGtPOC9tTDJlRldQcwoxNG5QdTVPc0NwaFk3NW1hUXVSYU0vWStRcGpCVFJuNGxiWWFPYUNEaTBRZCtiVjlURVBMdE9GTERxRDYreS9qCk04UnFtTG55MXB1MGxML0xjU2NIVXRYbUxObURGbVM4amN3Zkl4bFhERDdIekZScXJWMGpHSEFzVEdneVl0K2UKeEEzdnU2MHVrbjVwNnBtOXFzeGo5ejBCUGdVYUxtellQL09Qc2JKOEZLU25nSXM2TXJVdXJIM3ZleXp5a0RLSQpWdTd6ZWpRMFpQLzZpWXhzNkczMzZUN3U4OEdsN2x0WkM0bnJJZXFrNUwxU2lOY0YwYjE2ZmxEUlorckxkdExSCkhoU3VJTjlQQWdNQkFBRUNnZ0VBRjV2SnMzbTRMSE9DdlFic2NwOC9DQmgvQkNlOC92MGlpYW5VUmJLMkFlWlYKN3Arb3NXV1FPUktYSGIvT2VPMVVjb09DQkFOUzh3aGQrM3pDZlB5U0w3cU9HM0RyT1Z6djdqVkIxM3FpZEVpcQpId3ZXWk0vZXpuckhYOWpEdm5SYmJwVUNVaXRKQmxwa2x3S1pYc056UHB5UTRkNkxFdEw3VEo1V2lxM2g3cWQzCjhncmJoeWlWNDhZRFAyYkhpZGt0S2QvMHBNL3piSzZPTS9iMFpLUmRBWVpHQXlOcklMaFJ3eDNsQXE2SDVqL20KUHBuNlZEdVQ3Y1h6UFhmNTgxR0IyNjQxWFJVTXRNWGVEREtaU3JudnIrbG5tV2lsYXZmNkg3ZHp4NlRBMmk0RApybCtIRVdUN1RSQUZGMnZ5SVUrYSt5RzNPVFBKRCtmVTdCWWh4Zld4aFFLQmdRRGVvcXVCbzQxK1dKU255blZnCmNWMHZoUEkwZ0dyRGQzNFdIbCtGNTZuSmxuMVVlbUQ2SUQ0TUhnNVpHWmNmd3NkRlB0aHl1VnB0OVl1SW11SGQKRHE5ZDhpWnpHUzJVbStkSWlsWjhnZXJ5Q3dON3hIbFM3SzJrc0RmRGJwYmZZSXZ1LzRnSExZRHh3YzQ0OG54ZwpWNGpCUFY2QTdYTHRqQzZzdFE2OW9uTlIwd0tCZ1FEYlFBUTRGdWpnUjJUOXh2Y01MdXhMbm1YaHE4U2NoZlM5CkRoWndDUzZIeHB1YlBxblBjQ04vQUZjbDZ1cDRXVEVZdVY4bEpwcGwwRmpYQ2lMdkhuUU15cm5vRjlUOThpanQKZGVHYjF6ZWlwRUpndFpNVHkvOGdERWorZU1wSFBuYTZwQVpMOVpRVUdLR2dDNGQ0SzljdVNHMmRlb1dSU0ViSQpMVGgzeGpXVEZRS0JnQStJRWFidG5nVmVjS0IwQTFSREZGa29VUzFRZUNKQ3g4MExPV2JDRHBvOW9XaXZVT3lpCkt6SDFOdE1JY2Y2SlBCV2NtTVVJSVVMaWltVnhTS2gvU2NTb0MvNmpsd1p6Q2VPSm94YjBpVXR4Y1VERktDR2MKMlZCUDZ0UDdkeE1HVFR6VEhzNUJZbWw3TjhQSlJ0d0J2MHliMTJmdktNRmhzaS9pUWJFQkVFSjVBb0dBUm9NSApHRmJkM0V0NXdsZzcyYkk1a25SRnhkY3RLejIxb1J1bndhNWlSWTV0T3Zkak8zQ1FLZWNkSC9lMklyQmtwdFB2Ck1vNkF0MS9UUW8xakFNNGxlbnUwWUYxU
nhiNGN4WW5VM2Y3UVNNRlZDNjg1dHZNemdNWVNyNngvT1h4d1NNUTUKdGpJcnhtN0poQ1JSRkNmZFUyZzl3SmpIM2hxRmtSbGlBTHRCUGFVQ2dZRUF2QkRHejQvYS9kRzRJT0VpREk4SgpBbkxaRWZaTFJrdnI0ejRyT0p0TGZtWlFjTW5DSUZmKzBaelhTME9XOWhvMjk5RUNtS1l4U2VUWk1WSDgzOFZCCnRvSkNGM3lzb01rSC9meEVXTUpicDlMblJwSFUzUElicHlhRG1nWW1YdmsvK3laL3Y1c0dDTWNtVGkzWVFLRGsKcStlK1o0SHZyaGUxVytUaHpwOCtoNGs9Ci0tLS0tRU5EIFBSSVZBVEUgS0VZLS0tLS0K --- -# Source: dataplane/templates/clusterresourcesync/configmap.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2019 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ apiVersion: v1 kind: ConfigMap metadata: - name: union-clusterresourcesync-config - namespace: union + name: config-logging + namespace: "union" labels: - app.kubernetes.io/name: clusterresourcesync - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm -data: - cluster_resources.yaml: | - cluster_resources: - clusterName: '' - customData: - - production: - - projectQuotaCpu: - value: "4096" - - projectQuotaMemory: - value: 2Ti - - projectQuotaNvidiaGpu: - value: "256" - - defaultUserRoleKey: - value: 'eks.amazonaws.com/role-arn' - - defaultUserRoleValue: - value: 'arn:aws:iam::ACCOUNT_ID:role/flyte_project_role' - - staging: - - projectQuotaCpu: - value: "4096" - - projectQuotaMemory: - value: 2Ti - - projectQuotaNvidiaGpu: - value: "256" - - defaultUserRoleKey: - value: 'eks.amazonaws.com/role-arn' - - defaultUserRoleValue: - value: 'arn:aws:iam::ACCOUNT_ID:role/flyte_project_role' - - development: - - projectQuotaCpu: - value: "4096" - - projectQuotaMemory: - value: 2Ti - - projectQuotaNvidiaGpu: - value: "256" - - defaultUserRoleKey: - value: 'eks.amazonaws.com/role-arn' - - defaultUserRoleValue: - value: 'arn:aws:iam::ACCOUNT_ID:role/flyte_project_role' - refreshInterval: 5m - standaloneDeployment: true - templatePath: /etc/flyte/clusterresource/templates - clusterResourcesPrivate: - app: - isServerless: false - union: - auth: - authorizationMetadataKey: flyte-authorization - clientId: 'dataplane-operator' - clientSecretLocation: /etc/union/secret/client_secret - tokenRefreshWindow: 5m - type: ClientSecret - connection: - host: dns:/// - admin.yaml: | - admin: - clientId: 'dataplane-operator' - clientSecretLocation: /etc/union/secret/client_secret - endpoint: dns:/// - insecure: false - event: - capacity: 1000 - rate: 500 - type: admin - domain.yaml: | - domains: - - id: development - name: development - - id: staging - name: staging - - id: production - name: production - 
clusters.yaml: | - clusters: - clusterConfigs: [] - labelClusterMap: {} - logger.yaml: | - logger: - formatter: - type: json - level: 4 - show-source: true + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +data: {} --- -# Source: dataplane/templates/clusterresourcesync/configmap.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2019 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + apiVersion: v1 kind: ConfigMap metadata: - name: union-clusterresource-template - namespace: union + name: config-observability + namespace: "union" labels: - app.kubernetes.io/name: clusterresourcesync + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +data: {} +--- +# Source: dataplane/charts/prometheus/templates/cm.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + labels: + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: prometheus + name: union-operator-prometheus + namespace: union data: - a_namespace.yaml: | - apiVersion: v1 - kind: Namespace - metadata: - name: {{ namespace }} - labels: - union.ai/namespace-type: flyte - spec: - finalizers: - - kubernetes - - b_default_service_account.yaml: | - apiVersion: v1 - 
kind: ServiceAccount - metadata: - name: default - namespace: {{ namespace }} - annotations: - {{ defaultUserRoleKey }}: {{ defaultUserRoleValue }} + allow-snippet-annotations: "false" + alerting_rules.yml: | + {} + alerts: | + {} + prometheus.yml: | + global: + evaluation_interval: 1m + scrape_interval: 1m + scrape_timeout: 10s + rule_files: + - /etc/config/recording_rules.yml + - /etc/config/alerting_rules.yml + - /etc/config/rules + - /etc/config/alerts + scrape_configs: + - job_name: prometheus + static_configs: + - targets: + - localhost:9090 + - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + job_name: kubernetes-apiservers + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - action: keep + regex: default;kubernetes;https + source_labels: + - __meta_kubernetes_namespace + - __meta_kubernetes_service_name + - __meta_kubernetes_endpoint_port_name + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + job_name: kubernetes-nodes + kubernetes_sd_configs: + - role: node + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - replacement: kubernetes.default.svc:443 + target_label: __address__ + - regex: (.+) + replacement: /api/v1/nodes/$1/proxy/metrics + source_labels: + - __meta_kubernetes_node_name + target_label: __metrics_path__ + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + job_name: kubernetes-nodes-cadvisor + kubernetes_sd_configs: + - role: node + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - replacement: kubernetes.default.svc:443 + target_label: __address__ + - regex: (.+) + replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor + source_labels: + - 
__meta_kubernetes_node_name + target_label: __metrics_path__ + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + - honor_labels: true + job_name: kubernetes-service-endpoints + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scrape + - action: drop + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (.+?)(?::\d+)?;(\d+) + replacement: $1:$2 + source_labels: + - __address__ + - __meta_kubernetes_service_annotation_prometheus_io_port + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_service_name + target_label: service + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: node + - honor_labels: true + job_name: kubernetes-service-endpoints-slow + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) 
+ source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (.+?)(?::\d+)?;(\d+) + replacement: $1:$2 + source_labels: + - __address__ + - __meta_kubernetes_service_annotation_prometheus_io_port + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_service_name + target_label: service + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: node + scrape_interval: 5m + scrape_timeout: 30s + - honor_labels: true + job_name: prometheus-pushgateway + kubernetes_sd_configs: + - role: service + relabel_configs: + - action: keep + regex: pushgateway + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_probe + - honor_labels: true + job_name: kubernetes-services + kubernetes_sd_configs: + - role: service + metrics_path: /probe + params: + module: + - http_2xx + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_probe + - source_labels: + - __address__ + target_label: __param_target + - replacement: blackbox + target_label: __address__ + - source_labels: + - __param_target + target_label: instance + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - source_labels: + - __meta_kubernetes_service_name + target_label: service + - honor_labels: true + job_name: kubernetes-pods + kubernetes_sd_configs: + - role: pod + relabel_configs: + - action: 
keep + regex: true + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scrape + - action: drop + regex: true + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) + replacement: '[$2]:$1' + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: replace + regex: (\d+);((([0-9]+?)(\.|$)){4}) + replacement: $2:$1 + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_pod_name + target_label: pod + - action: drop + regex: Pending|Succeeded|Failed|Completed + source_labels: + - __meta_kubernetes_pod_phase + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: node + - honor_labels: true + job_name: kubernetes-pods-slow + kubernetes_sd_configs: + - role: pod + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) 
+ source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) + replacement: '[$2]:$1' + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: replace + regex: (\d+);((([0-9]+?)(\.|$)){4}) + replacement: $2:$1 + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_pod_name + target_label: pod + - action: drop + regex: Pending|Succeeded|Failed|Completed + source_labels: + - __meta_kubernetes_pod_phase + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: node + scrape_interval: 5m + scrape_timeout: 30s + - job_name: 'kube-state-metrics' + metrics_path: /metrics + scrape_interval: 1m + honor_labels: true + kubernetes_sd_configs: + - role: endpoints + namespaces: + own_namespace: true + relabel_configs: + - source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_name] + regex: kube-state-metrics + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + regex: http + action: keep + metric_relabel_configs: + - source_labels: [__name__] + regex: 
"kube_pod_container_resource_(limits|requests)|kube_pod_status_phase|kube_node_(labels|status_allocatable|status_condition|status_capacity)|kube_namespace_labels|kube_pod_container_status_(waiting|terminated|last_terminated).*_reason|kube_daemonset_status_number_unavailable|kube_deployment_status_replicas_unavailable|kube_resourcequota|kube_pod_info|kube_node_info|kube_pod_container_status_restarts_total" + action: keep + - source_labels: [__name__, phase] + separator: ";" + regex: "kube_pod_status_phase;(Succeeded|Failed)" + action: drop + - source_labels: [node] + target_label: nodename + regex: "(.*)" + action: replace + - source_labels: [label_node_group_name] + action: replace + regex: "(.+)" + target_label: label_node_pool_name + - job_name: 'opencost' + metrics_path: /metrics + scrape_interval: 1m + honor_labels: true + kubernetes_sd_configs: + - role: endpoints + namespaces: + own_namespace: true + relabel_configs: + - source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_name] + regex: opencost + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + regex: http + action: keep + metric_relabel_configs: + - source_labels: [__name__] + regex: "kube_node_labels|kube_pod_labels|node_total_hourly_cost|node_ram_hourly_cost|node_cpu_hourly_cost|node_gpu_hourly_cost" + action: keep + - job_name: 'union-services' + metrics_path: /metrics + scrape_interval: 1m + honor_labels: true + kubernetes_sd_configs: + - role: endpoints + namespaces: + own_namespace: true + relabel_configs: + - source_labels: [__meta_kubernetes_service_label_platform_union_ai_service_group] + regex: .+ + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + regex: debug + action: keep + # Flyte propeller metrics for execution info and fast task duration. + # No-op when propeller is not deployed (no pods match the label selector). 
+ - job_name: 'flytepropeller' + kubernetes_sd_configs: + - role: pod + namespaces: + own_namespace: true + selectors: + - role: pod + label: app.kubernetes.io/name=flytepropeller + metric_relabel_configs: + - source_labels: [__name__] + regex: "flyte:propeller:all:round:execution_info|flyte:propeller:all:node:fast_task:fast_task_execution_duration" + action: keep + relabel_configs: + - source_labels: [__meta_kubernetes_pod_container_name] + regex: flytepropeller + action: keep + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: pod + # Kourier gateway (envoy) metrics for app serving. + # No-op when serving is not enabled (no pods match the label selector). + - job_name: 'serving-envoy' + kubernetes_sd_configs: + - role: pod + namespaces: + own_namespace: true + selectors: + - role: pod + label: app=3scale-kourier-gateway + metrics_path: /stats/prometheus + metric_relabel_configs: + - source_labels: [__name__] + regex: "envoy_cluster_upstream_rq_xx|envoy_cluster_upstream_rq_time_bucket" + action: keep + relabel_configs: + - source_labels: [__meta_kubernetes_pod_container_port_name] + regex: metrics + action: keep + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: pod + # DCGM GPU metrics. + # No-op when dcgm-exporter is not deployed (no pods match the label selector). 
+ - job_name: 'gpu-metrics' + kubernetes_sd_configs: + - role: pod + namespaces: + names: + - kube-system + selectors: + - role: pod + label: app.kubernetes.io/name=dcgm-exporter - c_project_resource_quota.yaml: | - apiVersion: v1 - kind: ResourceQuota - metadata: - name: project-quota - namespace: {{ namespace }} - spec: - hard: - limits.cpu: {{ projectQuotaCpu }} - limits.memory: {{ projectQuotaMemory }} - requests.nvidia.com/gpu: {{ projectQuotaNvidiaGpu }} + recording_rules.yml: | + {} + rules: | + {} --- # Source: dataplane/templates/fluent-bit/configmap.yaml apiVersion: v1 @@ -337,6 +718,40 @@ data: region us-east-1 bucket --- +# Source: dataplane/templates/imagebuilder/build-image-configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: build-image-config + labels: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +data: + storage.yaml: | + storage: + container: "" + type: stow + stow: + kind: s3 + config: + auth_type: accesskey + access_key_id: + secret_key: + disable_ssl: false + endpoint: + region: us-east-1 + enable-multicontainer: false + limits: + maxDownloadMBs: 1024 + cache: + max_size_mbs: 0 + target_gc_percent: 70 + buildkit-uri: "tcp://union-operator-buildkit.union.svc.cluster.local:1234" + default-repository: "union-dataplane" + authentication-type: "noop" +--- # Source: dataplane/templates/imagebuilder/configmap.yaml apiVersion: v1 kind: ConfigMap @@ -2493,6 +2908,18 @@ data: "version": 1 } --- +# Source: dataplane/templates/monitoring/prometheusrule.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: union-recording-rules + namespace: union + labels: + release: release-name +data: + recording_rules.yml: | + groups: [] +--- # Source: dataplane/templates/nodeexecutor/configmap.yaml apiVersion: v1 kind: ConfigMap @@ -2547,8 +2974,13 @@ data: templateUris: - https://scale.neptune.ai/{{ .taskConfig.project 
}}/-/run/?customId={{ .taskConfig.id }} - kubernetes-enabled: true + kubernetes-enabled: false enabled_plugins.yaml: | + plugins: + connector-service: + defaultConnector: + defaultServiceConfig: '{"loadBalancingConfig": [{"round_robin":{}}]}' + endpoint: k8s:///flyteconnector.union:8000 tasks: task-plugins: default-for-task-types: @@ -2581,6 +3013,8 @@ data: cpu: 4096 gpu: 256 memory: 2Ti + namespace_mapping: + template: union union: connection: host: dns:/// @@ -2601,7 +3035,7 @@ data: cache-endpoint: dns:/// endpoint: dns:/// insecure: false - type: fallback + type: cacheservicev2 use-admin-auth: true logger: formatter: @@ -2621,6 +3055,7 @@ data: secure: false useAuth: false propeller: + limit-namespace: union node-config: disable-input-file-writes: true plugins: @@ -2628,7 +3063,7 @@ data: additional-worker-args: - --last-ack-grace-period-seconds - "120" - callback-uri: http://unionai-dataplane-executor.union.svc.cluster.local:15605 + callback-uri: http://union-operator-executor.union.svc.cluster.local:15605 grace-period-status-not-found: 2m ioutils: remoteFileOutputPaths: @@ -2638,6 +3073,7 @@ data: default-cpus: 100m default-env-vars: [] default-memory: 100Mi + default-pod-template-name: task-template co-pilot: image: 'cr.flyte.org/flyteorg/flytecopilot:v1.14.1' name: flyte-copilot- @@ -2678,6 +3114,7 @@ data: default-cpus: 100m default-env-vars: [] default-memory: 100Mi + default-pod-template-name: task-template config.yaml: | union: connection: @@ -2697,11 +3134,14 @@ data: operator: enabled: true enableTunnelService: true + # enableDepot: false tunnel: - enableDirectToAppIngress: false + enableDirectToAppIngress: true deploymentToRestart: union-operator-proxy + limitNamespace: union + disableClusterPermissions: true apps: - enabled: 'false' + enabled: 'true' syncClusterConfig: enabled: false clusterId: @@ -2716,19 +3156,19 @@ data: userRole: 'arn:aws:iam::ACCOUNT_ID:role/flyte_project_role' userRoleKey: 'eks.amazonaws.com/role-arn' collectUsages: - 
enabled: true + enabled: false billing: model: Legacy dependenciesHeartbeat: - prometheus: - endpoint: 'http://union-operator-prometheus:80/-/healthy' - propeller: - endpoint: 'http://flytepropeller:10254' + executor: + endpoint: 'http://union-operator-executor:10254' proxy: endpoint: 'http://union-operator-proxy:10254' secretsWatcher: dryRun: true enabled: false + org: + namespaceTemplate: union imageBuilder: enabled: true executionNamespaceLabels: @@ -2736,9 +3176,6 @@ data: referenceConfigmapName: union-operator targetConfigMapName: "build-image-config" proxy: - imageBuilderConfig: - authenticationType: 'noop' - defaultRepository: '' persistedLogs: objectStore: pathTemplate: namespace-{{.KubernetesNamespace}}.pod-{{.KubernetesPodName}}.cont-{{.KubernetesContainerName}} @@ -2792,981 +3229,149 @@ data: endpoint: region: us-east-1 image-builder.buildkit-uri: "tcp://union-operator-buildkit.union.svc.cluster.local:1234" - image-builder.default-repository: "" + image-builder.default-repository: "union-dataplane" image-builder.authentication-type: "noop" --- -# Source: dataplane/templates/prometheus/configmap.yaml +# Source: dataplane/templates/serving/bootstrap-configmap.yaml +# We need to copy the default configmap here since knative-operator does not automatically update the +# address of the `net-kourier-controller` endpoint to use the release namespace. It assumes that the +# resources are being installed into the `knative-serving` namespace instead. 
apiVersion: v1 kind: ConfigMap metadata: - name: union-operator-prometheus + name: union-operator-serving-envoy-bootstrap namespace: union - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus data: - prometheus.yml: | - global: - scrape_interval: 15s - evaluation_interval: 15s - alerting: - alertmanagers: - - static_configs: - - targets: - rule_files: - - rules.yml - scrape_configs: - # Self-monitoring - - job_name: prometheus - metrics_path: /prometheus/metrics - static_configs: - - targets: ['localhost:9090'] - metric_relabel_configs: - - source_labels: [__name__] - regex: prometheus_tsdb_head_chunks_storage_size_bytes|prometheus_tsdb_head_series|prometheus_tsdb_head_series_created_total|prometheus_tsdb_head_series_removed_total|prometheus_tsdb_head_samples_appended_total|prometheus_tsdb_storage_blocks_bytes|prometheus_tsdb_retention_limit_bytes|prometheus_tsdb_wal_(.*)|.+_(limit|failed|missed|failures)_total - action: keep - - # Kube state metrics for pod/node resource tracking and cost calculations - - job_name: kube-state-metrics - static_configs: - - targets: ['release-name-kube-state-metrics:8080'] - metric_relabel_configs: - - separator: ; - source_labels: [__name__] - regex: kube_pod_container_resource_(limits|requests)|kube_pod_status_phase|kube_node_(labels|status_allocatable|status_condition|status_capacity)|kube_namespace_labels|kube_pod_container_status_(waiting|terminated|last_terminated).*_reason|kube_daemonset_status_number_unavailable|kube_deployment_status_replicas_unavailable|kube_resourcequota|kube_pod_info|kube_node_info|kube_pod_container_status_restarts_total - action: keep - - separator: ; - source_labels: [__name__, phase] - regex: kube_pod_status_phase;(Succeeded|Failed) - action: drop - - source_labels: [node] - target_label: nodename - 
regex: '(.*)' - action: replace - - source_labels: [label_node_group_name] - action: replace - regex: (.+) - target_label: label_node_pool_name + envoy-bootstrap.yaml: | + dynamic_resources: + ads_config: + transport_api_version: V3 + api_type: GRPC + rate_limit_settings: {} + grpc_services: + - envoy_grpc: {cluster_name: xds_cluster} + cds_config: + resource_api_version: V3 + ads: {} + lds_config: + resource_api_version: V3 + ads: {} + node: + cluster: kourier-knative + id: 3scale-kourier-gateway + static_resources: + listeners: + - name: stats_listener + address: + socket_address: + address: 0.0.0.0 + port_value: 9000 + filter_chains: + - filters: + - name: envoy.filters.network.http_connection_manager + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager + stat_prefix: stats_server + http_filters: + - name: envoy.filters.http.router + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router + route_config: + virtual_hosts: + - name: admin_interface + domains: + - "*" + routes: + - match: + safe_regex: + regex: '/(certs|stats(/prometheus)?|server_info|clusters|listeners|ready)?' + headers: + - name: ':method' + string_match: + exact: GET + route: + cluster: service_stats + - match: + safe_regex: + regex: '/drain_listeners' + headers: + - name: ':method' + string_match: + exact: POST + route: + cluster: service_stats + clusters: + - name: service_stats + connect_timeout: 0.250s + type: static + load_assignment: + cluster_name: service_stats + endpoints: + lb_endpoints: + endpoint: + address: + socket_address: + address: 127.0.0.1 + port_value: 9901 + - name: xds_cluster + # This keepalive is recommended by envoy docs. 
+ # https://www.envoyproxy.io/docs/envoy/latest/api-docs/xds_protocol + typed_extension_protocol_options: + envoy.extensions.upstreams.http.v3.HttpProtocolOptions: + "@type": type.googleapis.com/envoy.extensions.upstreams.http.v3.HttpProtocolOptions + explicit_http_config: + http2_protocol_options: + connection_keepalive: + interval: 30s + timeout: 5s + connect_timeout: 1s + load_assignment: + cluster_name: xds_cluster + endpoints: + lb_endpoints: + endpoint: + address: + socket_address: + address: "net-kourier-controller" + port_value: 18000 + type: STRICT_DNS + admin: + access_log: + - name: envoy.access_loggers.stdout + typed_config: + "@type": type.googleapis.com/envoy.extensions.access_loggers.stream.v3.StdoutAccessLog + address: + socket_address: + address: 127.0.0.1 + port_value: 9901 + stats_config: + stats_tags: + - tag_name: name + regex: '^.*?\.u/.*?n=(.*?)/.*?u\..*$' + - tag_name: domain + regex: '^.*?\.u/.*?d=(.*?)/.*?u\..*$' + - tag_name: org + regex: '^.*?\.u/.*?o=(.*?)/.*?u\..*$' + - tag_name: project + regex: '^.*?\.u/.*?p=(.*?)/.*?u\..*$' + - tag_name: target + regex: '^.*?\.u/.*?t=(.*?)/.*?u\..*$' + - tag_name: target_namespace + regex: '^.*?\.u/.*?tns=(.*?)/.*?u\..*$' +--- +# Source: dataplane/templates/webhook/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: union-pod-webhook-config + namespace: union +data: + core.yaml: | - # cAdvisor container metrics for CPU and memory tracking - - job_name: kubernetes-cadvisor - metrics_path: /metrics - scheme: https - kubernetes_sd_configs: - - role: node - namespaces: - names: [] - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - insecure_skip_verify: false - metric_relabel_configs: - - separator: ; - source_labels: [__name__] - regex: container_cpu_usage_seconds_total|container_memory_working_set_bytes - action: keep - relabel_configs: - - separator: ; - regex: 
__meta_kubernetes_node_label_(.+) - replacement: $1 - action: labelmap - - separator: ; - regex: (.*) - target_label: __address__ - replacement: kubernetes.default.svc:443 - action: replace - - source_labels: [__meta_kubernetes_node_name] - separator: ; - regex: (.+) - target_label: __metrics_path__ - replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor - action: replace - - # Flyte propeller metrics for execution info and fast task duration - - job_name: flytepropeller - kubernetes_sd_configs: - - role: pod - namespaces: - names: - - union - selectors: - - role: pod - label: app.kubernetes.io/name=flytepropeller - metric_relabel_configs: - - source_labels: [__name__] - regex: "flyte:propeller:all:round:execution_info|flyte:propeller:all:node:fast_task:fast_task_execution_duration" - action: keep - relabel_configs: - - source_labels: [__meta_kubernetes_pod_container_name] - regex: flytepropeller - action: keep - - source_labels: [__meta_kubernetes_namespace] - action: replace - target_label: namespace - - source_labels: [__meta_kubernetes_pod_name] - action: replace - target_label: pod - rules.yml: | - groups: - - name: cost_calculations_15s - interval: 15s - rules: - - record: pod_gpu_allocation - expr: | - sum by (namespace, pod) (DCGM_FI_DEV_GPU_UTIL >= bool 0) * on (namespace, pod) group_left() (max by (namespace, pod) (kube_pod_status_phase{phase=~"Running|Pending"} == 1)) - - record: execution_info # A join metric to look up execution-level info. Used to disambiguate workflow/task executions, apps, and workspaces. 
- expr: | - max by (label_domain, label_project, label_entity_name, label_execution_id, label_entity_id)( - label_replace( - label_replace( - label_replace( - label_replace( - label_replace( - flyte:propeller:all:round:execution_info{domain!="", project!="", workflow_name!="", execution_id!=""}, # filter for workflow/task executions - "label_entity_id", "$1", "execution_id", "(.*)" # join key - ), "label_entity_name", "$1", "workflow_name", "(.*)" # set label_entity_name to the workflow/task name from the workflow_execution_id - ), - "label_execution_id", "$1", "execution_id", "(.*)" - ), - "label_project", "$1", "project", "(.*)" # project - ), - "label_domain", "$1", "domain", "(.*)" # domain - ) - ) - - record: app_info # A join metric to look up app-level info. Used to disambiguate workflow/task executions, apps, and workspaces. - expr: | - max by (label_domain, label_project, label_app_name, label_app_version, label_entity_id)( - label_replace( - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps - "label_app_name", "$1", "label_serving_unionai_dev_app_name", "(.*)" # rename to cleanup - ), - "label_app_version", "$1", "label_serving_knative_dev_revision", "(.*)" # the app_version is equivalent to an execution_id for workflows (lowest level of granularity) - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # join key - ) - ) - - record: workspace_info # A join metric to look up workspace info. Used to disambiguate workflow/task executions, apps, and workspaces. 
- expr: | - max by (label_domain, label_project, label_workspace_name, label_entity_id)( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # filter for workspaces - "label_entity_id", "$1", "label_node_id", "(.*)" # join key - ), "label_workspace_name", "$1", "label_node_id", "(.*)" # set label_workspace_name to the workspace name from the kube_pod_labels - ) - ) - - record: fast_task_execution_duration - expr: | - max by (label_domain, label_project, label_execution_id, label_entity_id, namespace, pod)( - label_replace( - label_replace( - label_replace( - label_replace( - label_replace( - label_replace( - flyte:propeller:all:node:fast_task:fast_task_execution_duration{domain!="", project!="", execution_id!="", exported_namespace!="", exported_pod!=""}, - "label_entity_id", "$1", "execution_id", "(.*)" # join key - ), - "label_execution_id", "$1", "execution_id", "(.*)" - ), - "label_project", "$1", "project", "(.*)" # project - ), - "label_domain", "$1", "domain", "(.*)" # domain - ), - "namespace", "$1", "exported_namespace", "(.*)" - ), - "pod", "$1", "exported_pod", "(.*)" - ) - ) - - record: fast_task_execution_duration_rate - expr: | - irate(fast_task_execution_duration[1m]) > 0 or absent(irate(fast_task_execution_duration[1m])) * fast_task_execution_duration - - record: entity_id:mem_usage_bytes_total_per_node:sum # Allocated memory (max(requested, consumed)) aggregated per node and entity, where entity is either a task/workflow execution or an app. 
- expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( # aggregate up to entity - # First, calculate the allocated memory for each pod - max by (namespace, pod) ( # this is the case where consumed (the memory working set) exceeds requested memory - ( - sum by (namespace, pod) ( - container_memory_working_set_bytes{namespace!="",pod!="",image!=""} - ) - > sum by (namespace, pod) ( - kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="memory"} - ) - ) - or sum by (namespace, pod) ( # this is the case where memory requests are <= consumed memory - kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="memory"} # needed to add node!="" to dedupe - ) - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). Note that this is repetitive but we do not want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - 
label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), - "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="",label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - # Now join in node identifiers which are used for subsequent overhead calculations - * on (namespace, pod) group_left(node) ( - max by (namespace, pod, node) (kube_pod_info{node!=""}) # needed to add node!="" to dedupe - ) - ) - - record: entity_id:cpu_usage_per_node:sum # Allocated cpu (max(requested, consumed)) aggregated per node and entity, where entity is either a task/workflow execution or an app. 
- expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( - # First, calculate the allocated cpu for each pod - max by (namespace, pod) ( # this is the case where consumed (the cpu usage seconds total) exceeds requested cpu - ( - sum by (namespace, pod) ( - irate(container_cpu_usage_seconds_total{namespace!="",pod!="",image!=""}[5m]) - ) - > sum by (namespace, pod) ( - kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="cpu"} - ) - ) - or sum by (namespace, pod) ( # this is the case where cpu requests are <= consumed cpu - kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="cpu"} - ) - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - 
label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), - "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - # Now join in node identifiers which are used for subsequent overhead calculations - * on (namespace, pod) group_left(node) ( - max by (namespace, pod, node) (kube_pod_info{node!=""}) # needed to add node!="" to dedupe - ) - ) - - record: entity_id:gpu_usage_per_node:sum # Allocated gpu aggregated per node and entity, where entity is either a task/workflow execution or an app. 
- expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( - # First, grab the allocated gpu for each pod (which is always either 1 or zero, since k8s can't split gpus the way it can with cpu/memory) - max by (namespace, pod) ( - pod_gpu_allocation - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, 
label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), - "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - # Now join in node identifiers which are used for subsequent overhead calculations - * on (namespace, pod) group_left(node) ( - max by (namespace, pod, node) (kube_pod_info{node!=""}) # needed to add node!="" to dedupe - ) - ) - - record: entity_id:used_mem_bytes:sum # the sum of used memory across all containers in an entity (numerator for aggregate utilization calculations) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # aggregate up to entity - # First, calculate the used memory for each pod - sum by (namespace, pod) ( - container_memory_working_set_bytes{namespace!="",pod!="",image!=""} - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but we do not want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - ) - - record: entity_id:allocated_mem_bytes:sum # the sum of allocated memory across all containers in an entity (denominator for aggregate utilization calculations) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # aggregate up to entity (remove node) - entity_id:mem_usage_bytes_total_per_node:sum - ) - - record: entity_id:used_cpu:sum # the sum of used cpu across all containers in an entity (numerator for aggregate utilization calculations) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( - sum by (namespace, pod) ( - irate(container_cpu_usage_seconds_total{namespace!="",pod!="",image!=""}[5m]) - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - ) - - record: entity_id:allocated_cpu:sum # the sum of allocated cpu across all containers in an entity (denominator for aggregate utilization calculations) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # aggregate up to entity (remove node) - entity_id:cpu_usage_per_node:sum - ) - - record: entity_id:sm_occupancy:avg # the simple average of SM occupancy (a good generic measure of GPU utilization) per entity - expr: | - avg by (label_entity_type, label_domain, label_project, label_entity_id) ( - # First, grab the SM occupancy for each pod - max by (namespace, pod) ( - DCGM_FI_PROF_SM_OCCUPANCY # SM occupancy is a good proxy for actual GPU usage - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - ) - - record: entity_id:gpu_count:sum # the count of running gpu pods per entity (need this to weight the gpu utilization when aggregating upwards - i.e. project-level) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( - # First, grab the allocated gpu for each pod (which is always either 1 or zero, since k8s can't split gpus the way it can with cpu/memory) - max by (namespace, pod) ( - pod_gpu_allocation - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - ) - - record: entity_id:weighted_sm_occupancy:sum # product of SM occupancy and allocated GPU count (something like "used memory", numerator of weighted calcs) - expr: | - entity_id:sm_occupancy:avg - * on (label_domain, label_project, label_entity_type, label_entity_id) entity_id:gpu_count:sum - - record: entity_id:allocated_mem_cost:sum # Allocated cost of memory for each workflow/task execution and app. - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, type) ( - entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB - * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type - ) - - record: entity_id:allocated_cpu_cost:sum # Allocated cost of cpu for each workflow/task execution and app. 
- expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, type)( - entity_id:cpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - - record: entity_id:allocated_gpu_cost:sum # Allocated cost of gpu for each workflow/task execution and app. - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, type)( - entity_id:gpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - - record: entity_id:allocated_cost:sum # Allocated cost of memory, cpu, and gpu for each workflow/task execution and app. - expr: | - label_replace( - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # for the sum to work, the labels need to be different on each "or" element (type label) - entity_id:allocated_mem_cost:sum - or - entity_id:allocated_cpu_cost:sum - or - entity_id:allocated_gpu_cost:sum - ), - "type", "allocated", "", "" # add type info - ) - - record: entity_id:overhead_cost:sum # The amount of overhead costs (node costs that we can't allocate with container resources) to allocate to each entity (workflow/task execution or app) - expr: | - label_replace( - sum by (label_entity_type, label_entity_id, label_domain, label_project)( # Aggregate the per-node metrics up to workflow/task execution or app (label_entity_id) - # Start with each execution's and app's allocated cost per node - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( # for the sum to work, the labels need to be different on each "or" element (type label) - entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB - * on (node) group_left(type) label_replace(avg by (node) 
(node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:cpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:gpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - # Then divide out the total allocated cost per node to get the proportion of allocated cost associated with each entity - / on (node) group_left()( - sum by (node) ( # for the sum to work, the labels need to be different on each "or" element (type label) - entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB - * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:cpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:gpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - > 0 # need to avoid dividing by zero, or gaps in the data can cause NaNs to proliferate, borking all charts - ) - # Then multiply by the overhead cost per node - * on (node) group_left() ( - # To calculate overhead, start with the true cost of running each node - avg by (node)(kube_node_labels{label_flyte_org_node_role="worker"}) # only look at worker nodes - * on (node) max by (node) ( - node_total_hourly_cost{instance_type!=""} # sometimes, the instance_type can be null, causing an 
unlabeled label to show up in the Compute Costs dashboard charts - ) * (15 / 3600) # convert hourly cost to 15-secondly cost - # Then subtract out the total allocated cost on each node - - on (node) group_left()( - sum by (node) ( # for the sum to work, the labels need to be different on each "or" element (type label) - entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB - * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:cpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:gpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - ) - ) - ), - "type", "overhead", "", "" # add type info - ) - - record: entity_id:total_cost:sum # Total cost of each entity (workflow/task execution or app), including allocated (from container resources) and overhead (proportion of unallocated node costs) - expr: | - label_replace( - sum by (label_domain, label_project, label_entity_id, label_entity_type) ( - entity_id:allocated_cost:sum - or - entity_id:overhead_cost:sum - ), - "type", "total", "", "" # add type info - ) - - record: node:total_cost:sum # Total cost of all nodes - expr: | - sum ( - avg by (node)(kube_node_labels{label_flyte_org_node_role="worker", label_node_kubernetes_io_instance_type!=""}) # only look at worker nodes - * on (node) group_left() node_total_hourly_cost{instance_type!=""} * (15 / 3600) # convert hourly cost to 15-secondly cost - ) - - record: node_type:total_cost:sum # Total cost of nodes grouped by node type - expr: | - sum by (node_type)( - avg by 
(node)(kube_node_labels{label_flyte_org_node_role="worker", label_node_kubernetes_io_instance_type!=""}) # only look at worker nodes - * on (node) group_left(node_type) label_replace(node_total_hourly_cost{instance_type!=""}, "node_type", "$1", "instance_type", "(.*)") * (15 / 3600) # convert hourly cost to 15-secondly cost and rename label - ) - - record: node_type:uptime_hours:sum # Total uptime of nodes grouped by node type - expr: | - sum by (node_type)( - avg by (node, node_type)( # dedupe - label_replace(kube_node_labels{label_flyte_org_node_role="worker", label_node_kubernetes_io_instance_type!=""}, "node_type", "$1", "label_node_kubernetes_io_instance_type", "(.*)") # relabel - ) - ) * (15 / 3600) # convert to number of hours per 15-second observation # Aggregate the above into visible metrics - - name: cost_rollup_15m - interval: 15m - rules: - - record: execution_info15m - expr: | - max_over_time(execution_info[15m:15s]) - - record: app_info15m - expr: | - max_over_time(app_info[15m:15s]) - - record: workspace_info15m - expr: | - max_over_time(workspace_info[15m:15s]) - - record: entity_id:allocated_mem_bytes:sum15m - expr: | - sum_over_time(entity_id:allocated_mem_bytes:sum[15m:15s]) - - record: entity_id:used_mem_bytes:sum15m - expr: | - sum_over_time(entity_id:used_mem_bytes:sum[15m:15s]) - - record: entity_id:allocated_cpu:sum15m - expr: | - sum_over_time(entity_id:allocated_cpu:sum[15m:15s]) - - record: entity_id:used_cpu:sum15m - expr: | - sum_over_time(entity_id:used_cpu:sum[15m:15s]) - - record: entity_id:weighted_sm_occupancy:sum15m - expr: | - sum_over_time(entity_id:weighted_sm_occupancy:sum[15m:15s]) - - record: entity_id:gpu_count:sum15m - expr: | - sum_over_time(entity_id:gpu_count:sum[15m:15s]) - - record: entity_id:allocated_mem_cost:sum15m - expr: | - sum_over_time(entity_id:allocated_mem_cost:sum[15m:15s]) - - record: entity_id:allocated_cpu_cost:sum15m - expr: | - sum_over_time(entity_id:allocated_cpu_cost:sum[15m:15s]) - - record: 
entity_id:allocated_gpu_cost:sum15m - expr: | - sum_over_time(entity_id:allocated_gpu_cost:sum[15m:15s]) - - record: entity_id:allocated_cost:sum15m - expr: | - sum_over_time(entity_id:allocated_cost:sum[15m:15s]) - - record: entity_id:overhead_cost:sum15m - expr: | - sum_over_time(entity_id:overhead_cost:sum[15m:15s]) - - record: entity_id:total_cost:sum15m - expr: | - sum_over_time(entity_id:total_cost:sum[15m:15s]) - - record: node:total_cost:sum15m - expr: | - sum_over_time(node:total_cost:sum[15m:15s]) - - record: node_type:total_cost:sum15m - expr: | - sum_over_time(node_type:total_cost:sum[15m:15s]) - - record: node_type:uptime_hours:sum15m - expr: | - sum_over_time(node_type:uptime_hours:sum[15m:15s]) ---- -# Source: dataplane/templates/propeller/configmap.yaml -apiVersion: v1 -kind: ConfigMap -metadata: - name: flyte-propeller-config - namespace: union -data: - admin.yaml: | - admin: - clientId: 'dataplane-operator' - clientSecretLocation: /etc/union/secret/client_secret - endpoint: dns:/// - insecure: false - event: - capacity: 1000 - rate: 500 - type: admin - catalog.yaml: | - catalog-cache: - cache-endpoint: dns:/// - endpoint: dns:/// - insecure: false - type: fallback - use-admin-auth: true - copilot.yaml: | - plugins: - k8s: - co-pilot: - image: 'cr.flyte.org/flyteorg/flytecopilot:v1.14.1' - name: flyte-copilot- - start-timeout: 30s - core.yaml: | - propeller: - downstream-eval-duration: 30s - enable-admin-launcher: true - leader-election: - enabled: true - lease-duration: 15s - lock-config-map: - name: propeller-leader - namespace: 'union' - renew-deadline: 10s - retry-period: 2s - limit-namespace: all - literal-offloading-config: - enabled: true - max-workflow-retries: 30 - metadata-prefix: metadata/propeller - metrics-prefix: flyte - prof-port: 10254 - queue: - batch-size: -1 - batching-interval: 2s - queue: - base-delay: 5s - capacity: 1000 - max-delay: 120s - rate: 100 - type: maxof - sub-queue: - capacity: 100 - rate: 10 - type: bucket - type: 
batch - rawoutput-prefix: 's3://' - workers: 4 - workflow-reeval-duration: 30s webhook: certDir: /etc/webhook/certs + disableCreateMutatingWebhookConfig: true embeddedSecretManagerConfig: imagePullSecrets: enabled: true @@ -3774,286 +3379,1173 @@ data: namespace: 'union' type: 'K8s' listenPort: '9443' + localCert: true secretManagerTypes: - Embedded - K8s - serviceName: flyte-pod-webhook + secretName: union-pod-webhook + serviceName: union-pod-webhook servicePort: '443' - enabled_plugins.yaml: | - plugins: - connector-service: - defaultConnector: - defaultServiceConfig: '{"loadBalancingConfig": [{"round_robin":{}}]}' - endpoint: k8s:///flyteconnector.union:8000 - tasks: - task-plugins: - default-for-task-types: - actor: fast-task - container: container - container_array: k8s-array - sidecar: sidecar - enabled-plugins: - - container - - sidecar - - k8s-array - - echo - - fast-task - - connector-service - k8s.yaml: | - plugins: - k8s: - default-cpus: 100m - default-env-vars: [] - default-memory: 100Mi - logger.yaml: | - logger: - formatter: - type: json - level: 4 - show-source: true - resource_manager.yaml: | - propeller: - resourcemanager: - type: noop - task_logs.yaml: | - plugins: - logs: - cloudwatch-enabled: false - dynamic-log-links: - - vscode: - displayName: VS Code Debugger - templateUris: - - /dataplane/pod/v1/generated_name/task/{{.executionProject}}/{{.executionDomain}}/{{.executionName}}/{{.nodeID}}/{{.taskRetryAttempt}}/{{.taskProject}}/{{.taskDomain}}/{{.taskID}}/{{.taskVersion}}/ - kubernetes-enabled: false - templates: - - displayName: Task Logs - scheme: TaskExecution - templateUris: - - /console/projects/{{.executionProject}}/domains/{{.executionDomain}}/executions/{{.executionName}}/nodeId/{{.nodeID}}/taskId/{{.taskID}}/attempt/{{.taskRetryAttempt}}/view/logs?duration=all&fromExecutionNav=true - storage.yaml: | - storage: - container: "" - type: stow - stow: - kind: s3 - config: - auth_type: accesskey - access_key_id: - secret_key: - disable_ssl: 
false - endpoint: - region: us-east-1 - enable-multicontainer: false - limits: - maxDownloadMBs: 1024 - cache: - max_size_mbs: 0 - target_gc_percent: 70 --- -# Source: dataplane/charts/fluentbit/templates/clusterrole.yaml +# Source: dataplane/charts/fluentbit/templates/clusterrole.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: release-name-fluentbit + labels: + helm.sh/chart: fluentbit-0.48.9 + app.kubernetes.io/name: fluentbit + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "3.2.8" + app.kubernetes.io/managed-by: Helm +rules: + - apiGroups: + - "" + resources: + - namespaces + - pods + verbs: + - get + - list + - watch +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: knative-serving-operator-aggregated + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +aggregationRule: + clusterRoleSelectors: + # This (along with escalate below) allows the Operator to pick up any + # roles that are provided to the admin of the cluster by knative serving + # automatically. + - matchExpressions: + - {key: serving.knative.dev/release, operator: Exists} +rules: [] # Rules are automatically filled in by the controller manager. 
+--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: knative-serving-operator-aggregated-stable + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +aggregationRule: + clusterRoleSelectors: + # This (along with escalate below) allows the Operator to pick up any + # roles that are provided to the admin of the cluster by knative serving + # automatically. + - matchExpressions: + - {key: app.kubernetes.io/name, operator: In, values: ["knative-serving"]} +rules: [] # Rules are automatically filled in by the controller manager. +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: knative-eventing-operator-aggregated + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +aggregationRule: + clusterRoleSelectors: + # This (along with escalate below) allows the Operator to pick up any + # roles that are provided to the admin of the cluster by knative eventing + # automatically. + - matchExpressions: + - {key: eventing.knative.dev/release, operator: Exists} +rules: [] # Rules are automatically filled in by the controller manager. +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: knative-eventing-operator-aggregated-stable + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +aggregationRule: + clusterRoleSelectors: + # This (along with escalate below) allows the Operator to pick up any + # roles that are provided to the admin of the cluster by knative eventing + # automatically. 
+ - matchExpressions: + - {key: app.kubernetes.io/name, operator: In, values: ["knative-eventing"]} +rules: [] # Rules are automatically filled in by the controller manager. +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: knative-serving-operator + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +rules: + - apiGroups: + - operator.knative.dev + resources: + - '*' + verbs: + - '*' + # Bootstrapping permissions. + # Roles that are explicitly bound buch which are specified by this Operator + # MUST be specified here with 'get' and 'bind'. + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterroles + resourceNames: + - system:auth-delegator + verbs: + - bind + - get + - apiGroups: + - rbac.authorization.k8s.io + resources: + - roles + resourceNames: + - extension-apiserver-authentication-reader + verbs: + - bind + - get + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterroles + - roles + verbs: + - create + - delete + # Escalate is necessary in order to create a role using cluster role aggregation, + # and to allow the Operator to bootstrap itself into the necessary set of + # permissions, even as those continue to evolve upstream. 
+ - escalate + - get + - list + - update + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterrolebindings + - rolebindings + verbs: + - create + - delete + - list + - get + - update + # Permissions required for Knative controller + # infra. + - apiGroups: + - apiregistration.k8s.io + resources: + - apiservices + verbs: + - update + - apiGroups: + - "" + resources: + - services + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - caching.internal.knative.dev + resources: + - images + verbs: + - '*' + - apiGroups: + - "" + resources: + - namespaces + verbs: + - get + - update + - watch + - apiGroups: + - '' + resources: + - events + verbs: + - create + - update + - patch + - apiGroups: + - '' + resources: + - configmaps + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - security.istio.io + - apps + - policy + resources: + - poddisruptionbudgets + - peerauthentications + - deployments + - daemonsets + - replicasets + - statefulsets + verbs: + - create + - delete + - get + - list + - watch + - update + - apiGroups: + - apiregistration.k8s.io + resources: + - apiservices + verbs: + - create + - delete + - get + - list + - apiGroups: + - autoscaling + resources: + - horizontalpodautoscalers + verbs: + - create + - delete + - get + - list + - apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - '*' + - apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - '*' + # Old resources that need cleaning up that are not in the knative-serving + # namespace. 
+ - apiGroups: + - "" + resources: + - services + - deployments + - horizontalpodautoscalers + resourceNames: + - knative-ingressgateway + verbs: + - delete + - apiGroups: + - "" + resources: + - configmaps + resourceNames: + - config-controller + verbs: + - delete + - apiGroups: + - "" + resources: + - serviceaccounts + resourceNames: + - knative-serving-operator + verbs: + - delete + # for contour TLS + - apiGroups: + - projectcontour.io + resources: + - httpproxies + - tlscertificatedelegations + verbs: + - get + - list + - watch + - update + - create + - delete + - deletecollection + - patch + # for security-guard + - apiGroups: + - guard.security.knative.dev + resources: + - guardians + verbs: + - get + - list + - watch + - create + - update + - patch + - delete + - apiGroups: + - "" + resources: + - configmaps + verbs: + - get + - list + - watch + - create + - update + - patch + - delete + - apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + verbs: + - create + - apiGroups: + - "" + resources: + - pods + verbs: + - get +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: knative-eventing-operator + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +rules: + - apiGroups: + - operator.knative.dev + resources: + - '*' + verbs: + - '*' + # Bootstrapping permissions. + # Roles that are explicitly bound buch which are specified by this Operator + # MUST be specified here with 'get' and 'bind'. + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterroles + - roles + verbs: + - create + - delete + # Escalate is necessary in order to create a role using cluster role aggregation, + # and to allow the Operator to bootstrap itself into the necessary set of + # permissions, even as those continue to evolve upstream. 
+ - escalate + - get + - list + - update + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterrolebindings + - rolebindings + verbs: + - create + - delete + - list + - get + - update + # Permissions required for Knative controller + # infra. + - apiGroups: + - apiregistration.k8s.io + resources: + - apiservices + verbs: + - update + - apiGroups: + - "" + resources: + - services + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - caching.internal.knative.dev + resources: + - images + verbs: + - '*' + - apiGroups: + - "" + resources: + - namespaces + verbs: + - get + - update + - watch + - apiGroups: + - '' + resources: + - events + verbs: + - create + - update + - patch + - apiGroups: + - '' + resources: + - configmaps + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - apps + resources: + - deployments + - daemonsets + - replicasets + - statefulsets + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - apiregistration.k8s.io + resources: + - apiservices + verbs: + - create + - delete + - get + - list + - apiGroups: + - autoscaling + resources: + - horizontalpodautoscalers + verbs: + - create + - delete + - update + - get + - list + - apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - '*' + - apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - '*' + - apiGroups: + - batch + resources: + - jobs + verbs: + - create + - delete + - update + - get + - list + - watch + # Old resources that need cleaning up that are not in the knative-eventing + # namespace. 
+ - apiGroups: + - "" + resources: + - serviceaccounts + resourceNames: + - knative-eventing-operator + verbs: + - delete + # for RabbitMQ messaging topology objects + - apiGroups: + - rabbitmq.com + resources: + - rabbitmqclusters + verbs: + - "get" + - "list" + - "watch" + - apiGroups: + - rabbitmq.com + resources: + - bindings + - queues + - exchanges + verbs: + - create + - delete + - get + - list + - patch + - update + - watch + - apiGroups: + - rabbitmq.com + resources: + - bindings/status + - queues/status + - exchanges/status + verbs: + - get + # for Kafka eventing source + - apiGroups: + - keda.sh + resources: + - scaledobjects + - scaledobjects/finalizers + - scaledobjects/status + - triggerauthentications + - triggerauthentications/status + verbs: + - get + - list + - watch + - update + - create + - delete + # Internal APIs + - apiGroups: + - "internal.kafka.eventing.knative.dev" + resources: + - "consumers" + - "consumers/status" + - "consumergroups" + - "consumergroups/status" + verbs: + - create + - get + - list + - watch + - patch + - update + - delete + - apiGroups: + - "internal.kafka.eventing.knative.dev" + resources: + - "consumers/finalizers" + - "consumergroups/finalizers" + verbs: + - update + - delete + - apiGroups: + - apps + resources: + - statefulsets/scale + verbs: + - get + - list + - watch + - update + - patch + - create + - delete + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterrolebindings + verbs: + - watch + - apiGroups: + - "*" + resources: + - configmaps + verbs: + - delete + - apiGroups: + - "*" + resources: + - configmaps + - services + verbs: + - get + - list + - watch + - update + - create + - delete + - apiGroups: + - "*" + resources: + - pods + verbs: + - list + - update + - get + - watch + - apiGroups: + - "*" + resources: + - pods/finalizers + verbs: + - get + - list + - create + - update + - delete + - apiGroups: + - "*" + resources: + - events + verbs: + - patch + - create + - apiGroups: + - "*" + 
resources: + - secrets + verbs: + - get + - list + - watch + - update + - create + - delete + - apiGroups: + - "*" + resources: + - nodes + verbs: + - get + - list + - watch + - apiGroups: + - "*" + resources: + - serviceaccounts + verbs: + - get + - list + - watch + - update + - create + - delete + - apiGroups: + - "*" + resources: + - configmaps + resourceNames: + - kafka-channel-config + verbs: + - patch + - apiGroups: + - "*" + resources: + - horizontalpodautoscalers + resourceNames: + - kafka-webhook + verbs: + - delete + - apiGroups: + - "*" + resources: + - leases + verbs: + - delete + - apiGroups: + - "*" + resources: + - poddisruptionbudgets + resourceNames: + - kafka-webhook + verbs: + - delete + - apiGroups: + - "*" + resources: + - services + verbs: + - patch + - apiGroups: + - "apps" + resources: + - deployments + verbs: + - deletecollection + # Eventing TLS + - apiGroups: + - "cert-manager.io" + resources: + - certificates + - issuers + - clusterissuers + verbs: + - create + - delete + - update + - list + - get + - watch + - apiGroups: + - "trust.cert-manager.io" + resources: + - bundles + verbs: + - create + - delete + - update + - list + - get + - watch +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: knative-operator-webhook + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +rules: + # For watching logging configuration and getting certs. + - apiGroups: + - "" + resources: + - "configmaps" + verbs: + - "get" + - "list" + - "watch" + # For manipulating certs into secrets. 
+ - apiGroups: + - "" + resources: + - "namespaces" + verbs: + - "get" + - "create" + - "update" + - "list" + - "watch" + - "patch" + # finalizers are needed for the owner reference of the webhook + - apiGroups: + - "" + resources: + - "namespaces/finalizers" + verbs: + - "update" + # For getting our Deployment so we can decorate with ownerref. + - apiGroups: + - "apps" + resources: + - "deployments" + verbs: + - "get" + - apiGroups: + - "apps" + resources: + - "deployments/finalizers" + verbs: + - update + # For actually registering our webhook. + - apiGroups: + - "admissionregistration.k8s.io" + resources: + - "mutatingwebhookconfigurations" + - "validatingwebhookconfigurations" + verbs: &everything + - "get" + - "list" + - "create" + - "update" + - "delete" + - "patch" + - "watch" + # For leader election + - apiGroups: + - "coordination.k8s.io" + resources: + - "leases" + verbs: *everything + # Necessary for conversion webhook. These are copied from the serving + # TODO: Do we really need all these permissions? 
+ - apiGroups: ["apiextensions.k8s.io"] + resources: ["customresourcedefinitions"] + verbs: ["get", "list", "create", "update", "delete", "patch", "watch"] +--- +# Source: dataplane/charts/fluentbit/templates/clusterrolebinding.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: release-name-fluentbit + labels: + helm.sh/chart: fluentbit-0.48.9 + app.kubernetes.io/name: fluentbit + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "3.2.8" + app.kubernetes.io/managed-by: Helm +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: release-name-fluentbit +subjects: + - kind: ServiceAccount + name: union-system + namespace: union +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# TODO: Consider restriction of non-aggregated role to knativeservings namespaces. 
+apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: knative-serving-operator + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-serving-operator +subjects: + - kind: ServiceAccount + name: knative-operator + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# TODO: Consider restriction of non-aggregated role to knativeeventing namespaces. +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: knative-eventing-operator + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-eventing-operator +subjects: + - kind: ServiceAccount + name: knative-operator + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole +kind: ClusterRoleBinding metadata: - name: release-name-fluentbit + name: operator-webhook labels: - helm.sh/chart: fluentbit-0.48.9 - app.kubernetes.io/name: fluentbit - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "3.2.8" - app.kubernetes.io/managed-by: Helm -rules: - - apiGroups: - - "" - resources: - - namespaces - - pods - verbs: - - get - - list - - watch + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-operator-webhook +subjects: + - kind: ServiceAccount + name: operator-webhook + namespace: "union" --- -# Source: dataplane/charts/kube-state-metrics/templates/role.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you 
may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole +kind: ClusterRoleBinding metadata: - labels: - helm.sh/chart: kube-state-metrics-5.30.1 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: kube-state-metrics - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" - name: release-name-kube-state-metrics -rules: - -- apiGroups: ["certificates.k8s.io"] - resources: - - certificatesigningrequests - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - configmaps - verbs: ["list", "watch"] - -- apiGroups: ["batch"] - resources: - - cronjobs - verbs: ["list", "watch"] - -- apiGroups: ["extensions", "apps"] - resources: - - daemonsets - verbs: ["list", "watch"] - -- apiGroups: ["extensions", "apps"] - resources: - - deployments - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - endpoints - verbs: ["list", "watch"] - -- apiGroups: ["autoscaling"] - resources: - - horizontalpodautoscalers - verbs: ["list", "watch"] - -- apiGroups: ["extensions", "networking.k8s.io"] - resources: - - ingresses - verbs: ["list", "watch"] - -- apiGroups: ["batch"] - resources: - - jobs - verbs: ["list", "watch"] - -- apiGroups: ["coordination.k8s.io"] - resources: - - leases - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - limitranges - verbs: ["list", "watch"] - -- apiGroups: ["admissionregistration.k8s.io"] - resources: - - 
mutatingwebhookconfigurations - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - namespaces - verbs: ["list", "watch"] - -- apiGroups: ["networking.k8s.io"] - resources: - - networkpolicies - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - nodes - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - persistentvolumeclaims - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - persistentvolumes - verbs: ["list", "watch"] - -- apiGroups: ["policy"] - resources: - - poddisruptionbudgets - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - pods - verbs: ["list", "watch"] - -- apiGroups: ["extensions", "apps"] - resources: - - replicasets - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - replicationcontrollers - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - resourcequotas - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - secrets - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - services - verbs: ["list", "watch"] - -- apiGroups: ["apps"] - resources: - - statefulsets - verbs: ["list", "watch"] - -- apiGroups: ["storage.k8s.io"] - resources: - - storageclasses - verbs: ["list", "watch"] - -- apiGroups: ["admissionregistration.k8s.io"] - resources: - - validatingwebhookconfigurations - verbs: ["list", "watch"] - -- apiGroups: ["storage.k8s.io"] - resources: - - volumeattachments - verbs: ["list", "watch"] + name: knative-serving-operator-aggregated + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-serving-operator-aggregated +subjects: + - kind: ServiceAccount + name: knative-operator + namespace: "union" --- -# Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole +kind: 
ClusterRoleBinding +metadata: + name: knative-serving-operator-aggregated-stable + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-serving-operator-aggregated-stable +subjects: + - kind: ServiceAccount + name: knative-operator + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: knative-eventing-operator-aggregated + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-eventing-operator-aggregated +subjects: + - kind: ServiceAccount + name: knative-operator + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: knative-eventing-operator-aggregated-stable + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-eventing-operator-aggregated-stable +subjects: + - kind: ServiceAccount + name: knative-operator + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role metadata: - name: union-clustersync-resource + namespace: "union" + name: knative-operator-webhook + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator rules: + # For manipulating certs into secrets. - apiGroups: - "" - - rbac.authorization.k8s.io resources: - - configmaps - - namespaces - - pods - - resourcequotas - - roles - - rolebindings - - secrets + - "secrets" + verbs: + - "get" + - "create" + - "update" + - "list" + - "watch" + - "patch" +--- +# Source: dataplane/templates/monitoring/prometheus-rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: union-operator-prometheus-rbac + namespace: union + labels: + release: release-name +rules: + # Prometheus server scrape permissions + - apiGroups: [""] + resources: + - nodes + - nodes/proxy + - nodes/metrics - services - - serviceaccounts - - clusterrolebindings - - podtemplates + - endpoints + - pods + - ingresses + - configmaps verbs: - - '*' + - get + - list + - watch + - apiGroups: ["extensions", "networking.k8s.io"] + resources: + - ingresses/status + - ingresses + verbs: + - get + - list + - watch + - apiGroups: ["discovery.k8s.io"] + resources: + - endpointslices + verbs: + - get + - list + - watch + # kube-state-metrics permissions (rbac.create=false on the subchart; managed here instead) + - apiGroups: ["certificates.k8s.io"] + resources: ["certificatesigningrequests"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["configmaps"] + verbs: ["list", "watch"] + - apiGroups: ["batch"] + resources: ["cronjobs"] + verbs: ["list", "watch"] + - apiGroups: ["extensions", "apps"] + resources: ["daemonsets"] + verbs: ["list", "watch"] + - apiGroups: ["extensions", "apps"] + resources: ["deployments"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["endpoints"] + 
verbs: ["list", "watch"] + - apiGroups: ["discovery.k8s.io"] + resources: ["endpointslices"] + verbs: ["list", "watch"] + - apiGroups: ["autoscaling"] + resources: ["horizontalpodautoscalers"] + verbs: ["list", "watch"] + - apiGroups: ["extensions", "networking.k8s.io"] + resources: ["ingresses"] + verbs: ["list", "watch"] + - apiGroups: ["batch"] + resources: ["jobs"] + verbs: ["list", "watch"] + - apiGroups: ["coordination.k8s.io"] + resources: ["leases"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["limitranges"] + verbs: ["list", "watch"] + - apiGroups: ["admissionregistration.k8s.io"] + resources: ["mutatingwebhookconfigurations"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["namespaces"] + verbs: ["list", "watch"] + - apiGroups: ["networking.k8s.io"] + resources: ["networkpolicies"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["persistentvolumeclaims"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["persistentvolumes"] + verbs: ["list", "watch"] + - apiGroups: ["policy"] + resources: ["poddisruptionbudgets"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["pods"] + verbs: ["list", "watch"] + - apiGroups: ["extensions", "apps"] + resources: ["replicasets"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["replicationcontrollers"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["resourcequotas"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["secrets"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["services"] + verbs: ["list", "watch"] + - apiGroups: ["apps"] + resources: ["statefulsets"] + verbs: ["list", "watch"] + - apiGroups: ["storage.k8s.io"] + resources: ["storageclasses"] + verbs: ["list", "watch"] + - apiGroups: ["admissionregistration.k8s.io"] + resources: ["validatingwebhookconfigurations"] + verbs: ["list", "watch"] + - apiGroups: ["storage.k8s.io"] + resources: ["volumeattachments"] + verbs: ["list", "watch"] --- # 
Source: dataplane/templates/nodeexecutor/serviceaccount.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole +kind: Role metadata: name: union-executor labels: @@ -4104,11 +4596,12 @@ rules: - delete - update --- -# Source: dataplane/templates/operator/serviceaccount-proxy.yaml +# Source: dataplane/templates/operator/serviceaccount-proxy-secret.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole +kind: Role metadata: - name: proxy-system + name: union-system-secret + namespace: union labels: app.kubernetes.io/name: operator-proxy app.kubernetes.io/instance: release-name @@ -4118,146 +4611,86 @@ rules: - apiGroups: - '*' resources: - - events - - flyteworkflows - - pods/log - - pods - - rayjobs - - resourcequotas - verbs: - - get - - list - - watch ---- -# Source: dataplane/templates/operator/serviceaccount.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: operator-system - labels: - app.kubernetes.io/name: union-operator - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm -rules: - # Allow Access to all resources under flyte.lyft.com - - apiGroups: - - flyte.lyft.com - resources: - - flyteworkflows - - flyteworkflows/finalizers - verbs: - - get - - list - - watch - - create - - update - - delete - - patch - - post - - deletecollection - - apiGroups: - - '*' - resources: - - resourcequotas - - pods - - configmaps - - podtemplates - secrets - - namespaces - - nodes verbs: - get - list - - watch - create - update - delete - - nonResourceURLs: - - /metrics - verbs: - - get --- -# Source: dataplane/templates/prometheus/rbac.yaml +# Source: dataplane/templates/operator/serviceaccount-proxy.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole +kind: Role metadata: - name: union-operator-prometheus + name: proxy-system labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane + 
app.kubernetes.io/name: operator-proxy app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" + platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus rules: - - apiGroups: [""] + - apiGroups: + - '*' resources: - - nodes - - nodes/proxy + - events + - flyteworkflows + - pods/log - pods - - endpoints - - services + - rayjobs + - resourcequotas verbs: - get - list - watch - - nonResourceURLs: - - /metrics - - /metrics/cadvisor - verbs: - - get --- -# Source: dataplane/templates/propeller/serviceaccount-webhook.yaml -kind: ClusterRole +# Source: dataplane/templates/operator/serviceaccount.yaml apiVersion: rbac.authorization.k8s.io/v1 +kind: Role metadata: - name: flytepropeller-webhook-role - namespace: union + name: operator-system + labels: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm rules: - apiGroups: - - "*" + - '*' resources: - - mutatingwebhookconfigurations - secrets - - pods - - replicasets/finalizers + - deployments verbs: - get + - list + - watch - create - update - - patch ---- -# Source: dataplane/templates/propeller/serviceaccount.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: flytepropeller-role -rules: - # Allow RO access to PODS - apiGroups: - - "" + - flyte.lyft.com resources: - - pods + - flyteworkflows + - flyteworkflows/finalizers verbs: - get - list - watch - # Allow Event recording access - - apiGroups: - - "" - resources: - - events - verbs: - create - update - delete - patch - # Allow Access All plugin objects + - post + - deletecollection - apiGroups: - '*' resources: - - '*' + - resourcequotas + - pods + - configmaps + - podtemplates + - secrets + - namespaces + - nodes verbs: - get - list @@ -4265,127 +4698,129 @@ rules: - create - update - delete - - patch - # Allow Access to CRD - 
apiGroups: - - apiextensions.k8s.io + - serving.knative.dev resources: - - customresourcedefinitions + - revisions + - configurations + - services verbs: - get - list - watch - create - - delete - update - # Allow Access to all resources under flyte.lyft.com + - delete +--- +# Source: dataplane/templates/webhook/serviceaccount.yaml +kind: Role +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: union-webhook-role + namespace: union +rules: - apiGroups: - - flyte.lyft.com + - "*" resources: - - flyteworkflows - - flyteworkflows/finalizers + - mutatingwebhookconfigurations + - secrets + - pods + - replicasets/finalizers verbs: - get - - list - - watch - create - update - - delete - patch - - post - - deletecollection --- -# Source: dataplane/charts/fluentbit/templates/clusterrolebinding.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding +kind: RoleBinding metadata: - name: release-name-fluentbit + namespace: "union" + name: operator-webhook labels: - helm.sh/chart: fluentbit-0.48.9 - app.kubernetes.io/name: fluentbit - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "3.2.8" - app.kubernetes.io/managed-by: Helm -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: release-name-fluentbit + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator subjects: - kind: ServiceAccount - name: fluentbit-system - namespace: union ---- -# Source: dataplane/charts/kube-state-metrics/templates/clusterrolebinding.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - labels: - helm.sh/chart: kube-state-metrics-5.30.1 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: kube-state-metrics - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" - name: release-name-kube-state-metrics + name: operator-webhook + namespace: "union" roleRef: + kind: Role + name: knative-operator-webhook apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: release-name-kube-state-metrics -subjects: -- kind: ServiceAccount - name: release-name-kube-state-metrics - namespace: union --- -# Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml +# Source: dataplane/templates/monitoring/prometheus-rbac.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding +kind: RoleBinding metadata: - name: union-clustersync-resource -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: union-clustersync-resource + name: union-operator-prometheus-rbac + namespace: union + labels: + release: release-name subjects: - kind: ServiceAccount - name: union-clustersync-system + name: union-operator-prometheus 
namespace: union +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: union-operator-prometheus-rbac --- -# Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml +# Source: dataplane/templates/monitoring/prometheus-rbac.yaml +# kube-state-metrics service account RoleBinding apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding +kind: RoleBinding metadata: - name: union-clustersync-auth-delegator -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: system:auth-delegator + name: release-name-prometheus-kube-state-metrics + namespace: union + labels: + release: release-name subjects: - kind: ServiceAccount - name: union-clustersync-system + name: release-name-prometheus-kube-state-metrics namespace: union +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: union-operator-prometheus-rbac --- # Source: dataplane/templates/nodeexecutor/serviceaccount.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding +kind: RoleBinding metadata: name: union-executor labels: app: executor roleRef: apiGroup: rbac.authorization.k8s.io - kind: ClusterRole + kind: Role name: union-executor subjects: - kind: ServiceAccount - name: executor + name: union-system namespace: union --- -# Source: dataplane/templates/operator/serviceaccount-proxy.yaml +# Source: dataplane/templates/operator/serviceaccount-proxy-secret.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding +kind: RoleBinding metadata: - name: proxy-system + name: union-system-secret + namespace: union labels: app.kubernetes.io/name: operator-proxy app.kubernetes.io/instance: release-name @@ -4393,109 +4828,35 @@ metadata: app.kubernetes.io/managed-by: Helm roleRef: apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: proxy-system + kind: Role + name: union-system-secret subjects: - kind: ServiceAccount - name: proxy-system + name: union-system namespace: union --- -# Source: 
dataplane/templates/operator/serviceaccount.yaml +# Source: dataplane/templates/operator/serviceaccount-proxy.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding +kind: RoleBinding metadata: - name: operator-system + name: proxy-system labels: - app.kubernetes.io/name: union-operator + app.kubernetes.io/name: operator-proxy app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm roleRef: apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: operator-system -subjects: - - kind: ServiceAccount - name: operator-system - namespace: union ---- -# Source: dataplane/templates/prometheus/rbac.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: union-operator-prometheus - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: union-operator-prometheus -subjects: - - kind: ServiceAccount - name: union-operator-prometheus - namespace: union ---- -# Source: dataplane/templates/propeller/serviceaccount-webhook.yaml -# Create a binding from Role -> ServiceAccount -kind: ClusterRoleBinding -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: flytepropeller-webhook-binding - namespace: union -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: flytepropeller-webhook-role -subjects: - - kind: ServiceAccount - name: flytepropeller-webhook-system - namespace: union ---- -# Source: dataplane/templates/propeller/serviceaccount.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: flytepropeller-binding -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: flytepropeller-role + kind: Role + 
name: proxy-system subjects: - kind: ServiceAccount - name: flytepropeller-system + name: union-system namespace: union --- -# Source: dataplane/templates/operator/serviceaccount-proxy-secret.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - name: proxy-system-secret - namespace: union - labels: - app.kubernetes.io/name: operator-proxy - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm -rules: - - apiGroups: - - '*' - resources: - - secrets - verbs: - - get - - list - - create - - update - - delete ---- # Source: dataplane/templates/operator/serviceaccount.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: Role +kind: RoleBinding metadata: name: operator-system labels: @@ -4503,56 +4864,29 @@ metadata: app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm -rules: - - apiGroups: - - '*' - resources: - - secrets - - deployments - verbs: - - get - - list - - watch - - create - - update ---- -# Source: dataplane/templates/operator/serviceaccount-proxy-secret.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: proxy-system-secret - namespace: union - labels: - app.kubernetes.io/name: operator-proxy - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm roleRef: apiGroup: rbac.authorization.k8s.io kind: Role - name: proxy-system-secret + name: operator-system subjects: - kind: ServiceAccount - name: proxy-system + name: union-system namespace: union --- -# Source: dataplane/templates/operator/serviceaccount.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: operator-system - labels: - app.kubernetes.io/name: union-operator - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm +# Source: 
dataplane/templates/webhook/serviceaccount.yaml +# Create a binding from Role -> ServiceAccount +kind: RoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: union-webhook-binding + namespace: union roleRef: apiGroup: rbac.authorization.k8s.io kind: Role - name: operator-system + name: union-webhook-role subjects: - kind: ServiceAccount - name: operator-system + name: union-system namespace: union --- # Source: dataplane/charts/fluentbit/templates/service.yaml @@ -4578,20 +4912,60 @@ spec: app.kubernetes.io/name: fluentbit app.kubernetes.io/instance: release-name --- -# Source: dataplane/charts/kube-state-metrics/templates/service.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v1 +kind: Service +metadata: + labels: + role: operator-webhook + app.kubernetes.io/component: operator-webhook + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator + name: operator-webhook + namespace: "union" +spec: + ports: + # Define metrics and profiling for them to be accessible within service meshes. 
+ - name: http-metrics + port: 9090 + targetPort: 9090 + - name: http-profiling + port: 8008 + targetPort: 8008 + - name: https-webhook + port: 443 + targetPort: 8443 + selector: + role: operator-webhook +--- +# Source: dataplane/charts/prometheus/charts/kube-state-metrics/templates/service.yaml apiVersion: v1 kind: Service metadata: name: release-name-kube-state-metrics namespace: union labels: - helm.sh/chart: kube-state-metrics-5.30.1 + helm.sh/chart: kube-state-metrics-5.25.1 app.kubernetes.io/managed-by: Helm app.kubernetes.io/component: metrics app.kubernetes.io/part-of: kube-state-metrics app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" + app.kubernetes.io/version: "2.13.0" annotations: prometheus.io/scrape: 'true' spec: @@ -4606,6 +4980,33 @@ spec: app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/instance: release-name --- +# Source: dataplane/charts/prometheus/templates/service.yaml +apiVersion: v1 +kind: Service +metadata: + labels: + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: prometheus + name: union-operator-prometheus + namespace: union +spec: + ports: + - name: http + port: 80 + protocol: TCP + targetPort: 9090 + selector: + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus + app.kubernetes.io/instance: release-name + sessionAffinity: None + type: "ClusterIP" +--- # Source: dataplane/templates/clusterresourcesync/service.yaml apiVersion: v1 kind: Service @@ -4628,6 +5029,31 @@ spec: app.kubernetes.io/name: clusterresourcesync app.kubernetes.io/instance: release-name --- +# Source: dataplane/templates/flyteconnector/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: flyteconnector + namespace: union + labels: + 
app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + app.kubernetes.io/managed-by: Helm +spec: + clusterIP: None + ports: + - name: grpc + port: 8000 + protocol: TCP + targetPort: grpc + - name: metric + port: 9090 + protocol: TCP + targetPort: metric + selector: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name +--- # Source: dataplane/templates/imagebuilder/service.yaml apiVersion: v1 kind: Service @@ -4653,7 +5079,7 @@ spec: apiVersion: v1 kind: Service metadata: - name: release-name-dataplane-executor + name: union-operator-executor labels: platform.union.ai/prometheus-group: "union-services" app: executor @@ -4719,39 +5145,14 @@ spec: app.kubernetes.io/name: union-operator app.kubernetes.io/instance: release-name --- -# Source: dataplane/templates/prometheus/service.yaml -apiVersion: v1 -kind: Service -metadata: - name: union-operator-prometheus - namespace: union - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus -spec: - type: ClusterIP - ports: - - port: 80 - targetPort: 9090 - protocol: TCP - name: http - selector: - app.kubernetes.io/component: prometheus - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name ---- -# Source: dataplane/templates/propeller/service-webhook.yaml +# Source: dataplane/templates/webhook/service.yaml apiVersion: v1 kind: Service metadata: - name: flyte-pod-webhook + name: union-pod-webhook namespace: union labels: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm @@ -4759,7 +5160,7 @@ metadata: projectcontour.io/upstream-protocol.h2c: grpc spec: selector: - 
app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name ports: - name: https @@ -4771,23 +5172,23 @@ spec: port: 10254 targetPort: 10254 --- -# Source: dataplane/templates/propeller/service-webhook.yaml +# Source: dataplane/templates/webhook/service.yaml # Headless Service for cache invalidation — resolves to all pod IPs so that # we can fan out invalidation requests to every webhook replica. apiVersion: v1 kind: Service metadata: - name: flyte-pod-webhook-headless + name: union-pod-webhook-headless namespace: union labels: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm spec: clusterIP: None selector: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name ports: - name: cache-internal @@ -4795,32 +5196,6 @@ spec: port: 9443 targetPort: 9443 --- -# Source: dataplane/templates/propeller/service.yaml -apiVersion: v1 -kind: Service -metadata: - namespace: union - name: flytepropeller - labels: - platform.union.ai/prometheus-group: "union-services" - app.kubernetes.io/name: flytepropeller - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm -spec: - type: ClusterIP - ports: - - name: debug - protocol: TCP - port: 10254 - - name: fasttask - port: 15605 - protocol: TCP - targetPort: 15605 - selector: - app.kubernetes.io/name: flytepropeller - app.kubernetes.io/instance: release-name ---- # Source: dataplane/charts/fluentbit/templates/daemonset.yaml apiVersion: apps/v1 kind: DaemonSet @@ -4846,7 +5221,7 @@ spec: annotations: checksum/config: e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 spec: - serviceAccountName: fluentbit-system + serviceAccountName: union-system hostNetwork: 
false dnsPolicy: ClusterFirst containers: @@ -4898,20 +5273,217 @@ spec: tolerations: - operator: Exists --- -# Source: dataplane/charts/kube-state-metrics/templates/deployment.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: operator-webhook + namespace: "union" + labels: + app.kubernetes.io/component: operator-webhook + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +spec: + selector: + matchLabels: + app: operator-webhook + role: operator-webhook + template: + metadata: + labels: + app: operator-webhook + role: operator-webhook + app.kubernetes.io/component: operator-webhook + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator + sidecar.istio.io/inject: "false" + spec: + # To avoid node becoming SPOF, spread our replicas to different nodes. + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - podAffinityTerm: + labelSelector: + matchLabels: + app: operator-webhook + topologyKey: kubernetes.io/hostname + weight: 100 + serviceAccountName: operator-webhook + containers: + - name: operator-webhook + # This is the Go import path for the binary that is containerized + # and substituted here. 
+ image: gcr.io/knative-releases/knative.dev/operator/cmd/webhook@sha256:d3a7a3304629ccfcaacec620b50736e0b4c902a6328b83369b6cbaf7e94677c9 + resources: + requests: + cpu: 100m + memory: 100Mi + limits: + cpu: 500m + memory: 500Mi + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: SYSTEM_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: CONFIG_LOGGING_NAME + value: config-logging + - name: CONFIG_OBSERVABILITY_NAME + value: config-observability + - name: WEBHOOK_NAME + value: operator-webhook + - name: WEBHOOK_PORT + value: "8443" + - name: WEBHOOK_SECRET_NAME + value: operator-webhook-certs + - name: METRICS_DOMAIN + value: knative.dev/operator + - name: KUBERNETES_MIN_VERSION + value: "" + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + runAsNonRoot: true + capabilities: + drop: + - ALL + ports: + - name: metrics + containerPort: 9090 + - name: profiling + containerPort: 8008 + - name: https-webhook + containerPort: 8443 + readinessProbe: + periodSeconds: 1 + httpGet: + scheme: HTTPS + port: 8443 + httpHeaders: + - name: k-kubelet-probe + value: "webhook" + livenessProbe: + periodSeconds: 1 + httpGet: + scheme: HTTPS + port: 8443 + httpHeaders: + - name: k-kubelet-probe + value: "webhook" + failureThreshold: 6 + initialDelaySeconds: 120 + # Our webhook should gracefully terminate by lame ducking first, set this to a sufficiently + # high value that we respect whatever value it has configured for the lame duck grace period. + terminationGracePeriodSeconds: 300 +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: knative-operator + namespace: "union" + labels: + app.kubernetes.io/name: knative-operator + app.kubernetes.io/version: "1.16.0" +spec: + replicas: 1 + selector: + matchLabels: + name: knative-operator + template: + metadata: + labels: + name: knative-operator + app.kubernetes.io/name: knative-operator + app.kubernetes.io/version: "1.16.0" + sidecar.istio.io/inject: "false" + spec: + serviceAccountName: knative-operator + containers: + - name: knative-operator + image: gcr.io/knative-releases/knative.dev/operator/cmd/operator@sha256:0b5a3532417f9c8e7b6044e23f6f67ad932a74a40a4092ad965b6f173b2fd887 + resources: + requests: + cpu: 100m + memory: 100Mi + limits: + cpu: 1000m + memory: 1000Mi + imagePullPolicy: IfNotPresent + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: SYSTEM_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: METRICS_DOMAIN + value: knative.dev/operator + - name: CONFIG_LOGGING_NAME + value: config-logging + - name: CONFIG_OBSERVABILITY_NAME + value: config-observability + - name: KUBERNETES_MIN_VERSION + value: "" + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + runAsNonRoot: true + capabilities: + drop: + - ALL + ports: + - name: metrics + containerPort: 9090 +--- +# Source: dataplane/charts/prometheus/charts/kube-state-metrics/templates/deployment.yaml apiVersion: apps/v1 kind: Deployment metadata: name: release-name-kube-state-metrics namespace: union labels: - helm.sh/chart: 
kube-state-metrics-5.30.1 + helm.sh/chart: kube-state-metrics-5.25.1 app.kubernetes.io/managed-by: Helm app.kubernetes.io/component: metrics app.kubernetes.io/part-of: kube-state-metrics app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" + app.kubernetes.io/version: "2.13.0" spec: selector: matchLabels: @@ -4924,13 +5496,13 @@ spec: template: metadata: labels: - helm.sh/chart: kube-state-metrics-5.30.1 + helm.sh/chart: kube-state-metrics-5.25.1 app.kubernetes.io/managed-by: Helm app.kubernetes.io/component: metrics app.kubernetes.io/part-of: kube-state-metrics app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" + app.kubernetes.io/version: "2.13.0" spec: automountServiceAccountToken: true hostNetwork: false @@ -4947,8 +5519,9 @@ spec: args: - --port=8080 - --resources=certificatesigningrequests,configmaps,cronjobs,daemonsets,deployments,endpoints,horizontalpodautoscalers,ingresses,jobs,leases,limitranges,mutatingwebhookconfigurations,namespaces,networkpolicies,nodes,persistentvolumeclaims,persistentvolumes,poddisruptionbudgets,pods,replicasets,replicationcontrollers,resourcequotas,secrets,services,statefulsets,storageclasses,validatingwebhookconfigurations,volumeattachments + - --namespaces=union imagePullPolicy: IfNotPresent - image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.15.0 + image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.13.0 ports: - containerPort: 8080 name: "http" @@ -4968,7 +5541,7 @@ spec: httpGet: httpHeaders: path: /readyz - port: 8081 + port: 8080 scheme: HTTP initialDelaySeconds: 5 periodSeconds: 10 @@ -4983,107 +5556,204 @@ spec: - ALL readOnlyRootFilesystem: true --- -# Source: dataplane/templates/clusterresourcesync/deployment.yaml +# Source: dataplane/charts/prometheus/templates/deploy.yaml apiVersion: apps/v1 kind: Deployment metadata: - name: union-syncresources - 
namespace: union labels: - app.kubernetes.io/name: clusterresourcesync + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: prometheus + name: union-operator-prometheus + namespace: union spec: - replicas: 1 selector: matchLabels: - app.kubernetes.io/name: clusterresourcesync + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name + replicas: 1 + revisionHistoryLimit: 10 + strategy: + type: Recreate + rollingUpdate: null template: metadata: - annotations: - configChecksum: "7ff0169854ce83fe5e5cb0ec550944a512e1ea0ebc47177b32a5bf3f7fadf9f" - labels: - platform.union.ai/zone: "dataplane" - - app.kubernetes.io/name: clusterresourcesync + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: prometheus spec: + enableServiceLinks: true + serviceAccountName: union-operator-prometheus containers: - - command: - - clusterresource - - --config - - /etc/flyte/config/*.yaml - - clusterresource - - run - env: - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: GOMEMLIMIT - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.memory - - name: GOMAXPROCS - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.cpu - - name: CLUSTER_NAME - valueFrom: - secretKeyRef: - name: operator-cluster-name - key: cluster_name - - name: DEPLOYMENT_NAME - value: operator - - name: PROXY_SERVICE_URL - value: 
http://union-operator-proxy:8080 - - name: PROMETHEUS_SERVICE_URL - value: http://union-operator-prometheus:80 - - name: KNATIVE_PROXY_SERVICE_URL - value: http://kourier-internal - image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" + - name: prometheus-server-configmap-reload + image: "quay.io/prometheus-operator/prometheus-config-reloader:v0.76.0" + imagePullPolicy: "IfNotPresent" + args: + - --watched-dir=/etc/config + - --listen-address=0.0.0.0:8080 + - --reload-url=http://127.0.0.1:9090/prometheus/-/reload + ports: + - containerPort: 8080 + name: metrics + livenessProbe: + httpGet: + path: /healthz + port: metrics + scheme: HTTP + initialDelaySeconds: 2 + periodSeconds: 10 + readinessProbe: + httpGet: + path: /healthz + port: metrics + scheme: HTTP + periodSeconds: 10 + volumeMounts: + - name: config-volume + mountPath: /etc/config + readOnly: true + + - name: prometheus-server + image: "quay.io/prometheus/prometheus:v2.54.1" + imagePullPolicy: "IfNotPresent" + args: + - --storage.tsdb.retention.time=3d + - --config.file=/etc/config/prometheus.yml + - --storage.tsdb.path=/data + - --web.console.libraries=/etc/prometheus/console_libraries + - --web.console.templates=/etc/prometheus/consoles + - --web.enable-lifecycle + - --web.route-prefix=/prometheus + - --web.external-url=/prometheus/ + ports: + - containerPort: 9090 + readinessProbe: + httpGet: + path: /prometheus/-/ready + port: 9090 + scheme: HTTP + initialDelaySeconds: 30 + periodSeconds: 5 + timeoutSeconds: 4 + failureThreshold: 3 + successThreshold: 1 + livenessProbe: + httpGet: + path: /prometheus/-/healthy + port: 9090 + scheme: HTTP + initialDelaySeconds: 30 + periodSeconds: 15 + timeoutSeconds: 10 + failureThreshold: 3 + successThreshold: 1 + resources: + limits: + cpu: "3" + memory: 3500Mi + requests: + cpu: "1" + memory: 1Gi + volumeMounts: + - name: config-volume + mountPath: /etc/config + - name: storage-volume + mountPath: /data + subPath: "" + - name: server-recording-rules + mountPath: 
/etc/config/recording + subPath: + readOnly: + dnsPolicy: ClusterFirst + securityContext: + fsGroup: 65534 + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + terminationGracePeriodSeconds: 300 + volumes: + - name: config-volume + configMap: + name: union-operator-prometheus + - name: server-recording-rules + configMap: + name: union-recording-rules + - name: storage-volume + emptyDir: + sizeLimit: 10Gi +--- +# Source: dataplane/templates/flyteconnector/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: flyteconnector + namespace: union + labels: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + app.kubernetes.io/managed-by: Helm +spec: + replicas: 2 + selector: + matchLabels: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + template: + metadata: + annotations: + labels: + platform.union.ai/zone: "dataplane" + + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + app.kubernetes.io/managed-by: Helm + spec: + containers: + - command: + - c0 + image: "ghcr.io/flyteorg/flyte-connectors:py3.13-2.0.0b50.dev3-g695bb1db3.d20260122" imagePullPolicy: "IfNotPresent" - name: sync-cluster-resources + name: flyteconnector + readinessProbe: + failureThreshold: 3 + httpGet: + path: / + port: 9090 + scheme: HTTP + initialDelaySeconds: 5 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 5 + env: + - name: AIRFLOW_CONN_GOOGLE_CLOUD_DEFAULT + value: "google-cloud-platform://" + - name: LOG_LEVEL + value: "20" + ports: + - containerPort: 8000 + name: grpc + - containerPort: 9090 + name: metric resources: limits: - cpu: "1" - memory: 500Mi + cpu: "1.5" + ephemeral-storage: 100Mi + memory: 1500Mi requests: - cpu: 500m - memory: 100Mi - volumeMounts: - - name: auth - mountPath: /etc/union/secret - - name: resource-templates - mountPath: /etc/flyte/clusterresource/templates - - name: config-volume - mountPath: /etc/flyte/config - ports: - - 
name: debug - containerPort: 10254 - protocol: TCP - serviceAccountName: union-clustersync-system - volumes: - - configMap: - name: union-clusterresource-template - name: resource-templates - - configMap: - name: union-clusterresourcesync-config - name: config-volume - - name: auth - secret: - secretName: union-secret-auth + cpu: "1" + ephemeral-storage: 100Mi + memory: 1000Mi + serviceAccountName: flyteconnector --- # Source: dataplane/templates/imagebuilder/deployment.yaml apiVersion: apps/v1 @@ -5105,17 +5775,15 @@ spec: app.kubernetes.io/instance: release-name template: metadata: - annotations: - container.apparmor.security.beta.kubernetes.io/buildkit: unconfined labels: platform.union.ai/zone: "dataplane" app.kubernetes.io/name: imagebuilder-buildkit app.kubernetes.io/instance: release-name spec: - serviceAccountName: "union-imagebuilder" + serviceAccountName: "union-system" containers: - name: "buildkit" - image: "docker.io/moby/buildkit:buildx-stable-1-rootless" + image: "docker.io/moby/buildkit:buildx-stable-1" imagePullPolicy: IfNotPresent env: - name: POD_NAME @@ -5150,18 +5818,15 @@ spec: - name: KNATIVE_PROXY_SERVICE_URL value: http://kourier-internal volumeMounts: - - mountPath: /home/user/.local/share/buildkit - name: buildkitd - mountPath: /etc/buildkit name: buildkit-config args: - --config - /etc/buildkit/buildkitd.toml - --addr - - unix:///run/user/1000/buildkit/buildkitd.sock + - unix:///run//buildkit/buildkitd.sock - --addr - tcp://0.0.0.0:1234 - - --oci-worker-no-process-sandbox ports: - name: tcp containerPort: 1234 @@ -5183,18 +5848,13 @@ spec: initialDelaySeconds: 5 periodSeconds: 30 securityContext: - seccompProfile: # Needs Kubernetes >= 1.19 - type: Unconfined - runAsUser: 1000 - runAsGroup: 1000 + privileged: true resources: requests: - cpu: 1 - ephemeral-storage: 20Gi - memory: 1Gi + cpu: 4 + ephemeral-storage: 50Gi + memory: 4Gi volumes: - - name: buildkitd - emptyDir: {} - configMap: name: union-operator-buildkit name: buildkit-config 
@@ -5223,16 +5883,18 @@ spec: template: metadata: annotations: - configChecksum: "807825e2bc1d1dc69164fbea82af93461fcf79e35ad2c0929e03df2f1e14935" + configChecksum: "841817ea8873e0592d093d9f564f22eb6a316a7cac8d611afc2582a4856dc7a" labels: platform.union.ai/zone: "dataplane" app: executor + app.kubernetes.io/instance: 'release-name' + app.kubernetes.io/name: executor spec: securityContext: fsGroup: 1337 - serviceAccountName: executor + serviceAccountName: union-system volumes: - name: config-volume configMap: @@ -5248,7 +5910,7 @@ spec: image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" imagePullPolicy: IfNotPresent command: - - executor + - executorv2 - serve - --config - /etc/config/*.yaml @@ -5326,7 +5988,7 @@ spec: template: metadata: annotations: - configChecksum: "0d3d3093adf1cf4b2bb08dfec3df8021cea0b4660395075f002f399062a3485" + configChecksum: "1cbf9bb44767576a0ce2cfe30facbef94a1d54963db0d92bea7b5e1c1df350d" labels: @@ -5341,12 +6003,10 @@ spec: sources: - configMap: name: union-operator - - configMap: - name: union-clusterresourcesync-config - name: secret-volume secret: secretName: union-secret-auth - serviceAccountName: proxy-system + serviceAccountName: union-system securityContext: {} containers: @@ -5463,7 +6123,7 @@ spec: template: metadata: annotations: - configChecksum: "0d3d3093adf1cf4b2bb08dfec3df8021cea0b4660395075f002f399062a3485" + configChecksum: "1cbf9bb44767576a0ce2cfe30facbef94a1d54963db0d92bea7b5e1c1df350d" labels: @@ -5472,7 +6132,7 @@ spec: platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm spec: - serviceAccountName: operator-system + serviceAccountName: union-system securityContext: {} volumes: @@ -5553,81 +6213,16 @@ spec: containerPort: 10254 protocol: TCP --- -# Source: dataplane/templates/prometheus/deployment.yaml -apiVersion: apps/v1 -kind: Deployment -metadata: - name: union-operator-prometheus - namespace: union - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: 
release-name-dataplane - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus -spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/component: prometheus - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - template: - metadata: - annotations: - configChecksum: "7b17013d79df7c092b0a5fd1e84f01d463afd0b4be27bae45a060c3cd055644" - labels: - app.kubernetes.io/component: prometheus - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - spec: - priorityClassName: system-cluster-critical - serviceAccountName: union-operator-prometheus - securityContext: - fsGroup: 65534 - runAsNonRoot: true - runAsUser: 65534 - fsGroupChangePolicy: OnRootMismatch - containers: - - name: prometheus - image: "prom/prometheus:v3.3.1" - args: - - --config.file=/etc/prometheus/prometheus.yml - - --web.external-url=/prometheus/ - - --web.route-prefix=/prometheus/ - - --storage.tsdb.retention.time=3d - ports: - - name: http - containerPort: 9090 - protocol: TCP - resources: - limits: - cpu: "3" - memory: 3500Mi - requests: - cpu: "1" - memory: 1Gi - securityContext: - allowPrivilegeEscalation: false - terminationMessagePolicy: FallbackToLogsOnError - volumeMounts: - - mountPath: /etc/prometheus - name: prometheus-config - volumes: - - name: prometheus-config - configMap: - name: union-operator-prometheus ---- -# Source: dataplane/templates/propeller/deployment-webhook.yaml +# Source: dataplane/templates/webhook/deployment.yaml +# Webhook deployment # Create the actual deployment apiVersion: apps/v1 kind: Deployment metadata: - name: flytepropeller-webhook + name: union-pod-webhook namespace: union labels: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name 
app.kubernetes.io/managed-by: Helm @@ -5635,19 +6230,19 @@ spec: replicas: 1 selector: matchLabels: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name template: metadata: labels: platform.union.ai/zone: "dataplane" - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm annotations: - configChecksum: "f99369e37d4a20dded730f6900a334aaa4b1b1c3e62f534e17d7967785c62a8" + configChecksum: "77958aee0464eee4cb43c70036cf5ffa89f47f58c908845b9eeaecc7e32bba4" spec: securityContext: @@ -5655,64 +6250,8 @@ spec: fsGroupChangePolicy: Always runAsNonRoot: true runAsUser: 1001 - seLinuxOptions: - type: spc_t - serviceAccountName: flytepropeller-webhook-system - initContainers: - - name: generate-secrets - image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" - imagePullPolicy: "IfNotPresent" - command: - - flytepropeller - args: - - webhook - - init-certs - - --config - - /etc/flyte/config/*.yaml - env: - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: GOMEMLIMIT - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.memory - - name: GOMAXPROCS - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.cpu - - name: CLUSTER_NAME - valueFrom: - secretKeyRef: - name: operator-cluster-name - key: cluster_name - - name: DEPLOYMENT_NAME - value: operator - - name: PROXY_SERVICE_URL - value: http://union-operator-proxy:8080 - - name: PROMETHEUS_SERVICE_URL - value: http://union-operator-prometheus:80 - - name: KNATIVE_PROXY_SERVICE_URL - value: http://kourier-internal - volumeMounts: - - name: config-volume - mountPath: /etc/flyte/config - resources: - limits: - cpu: 1 - ephemeral-storage: 500Mi - memory: 500Mi - requests: - cpu: 
200m - ephemeral-storage: 500Mi - memory: 500Mi + seLinuxOptions: null + serviceAccountName: union-system containers: - name: webhook image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" @@ -5777,107 +6316,301 @@ spec: volumes: - name: config-volume configMap: - name: flyte-propeller-config + name: union-pod-webhook-config - name: webhook-certs secret: - secretName: flyte-pod-webhook + secretName: union-pod-webhook --- -# Source: dataplane/templates/propeller/deployment.yaml -apiVersion: apps/v1 -kind: Deployment +# Source: dataplane/templates/flyteconnector/hpa.yaml +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler metadata: - namespace: union - name: flytepropeller + name: flyteconnector + labels: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + app.kubernetes.io/managed-by: Helm +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: flyteconnector + minReplicas: 2 + maxReplicas: 5 + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 80 + - type: Resource + resource: + name: memory + target: + type: Utilization + averageUtilization: 80 +--- +# Source: dataplane/templates/webhook/mutatingwebhookconfiguration.yaml +apiVersion: admissionregistration.k8s.io/v1 +kind: MutatingWebhookConfiguration +metadata: + name: union-pod-webhook- labels: - app.kubernetes.io/name: flytepropeller + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm +webhooks: + - name: union-pod-webhook.flyte.org + admissionReviewVersions: + - v1 + - v1beta1 + clientConfig: + caBundle: 
LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUROekNDQWgrZ0F3SUJBZ0lVSm9kQ09lem4vd1BHQTZjYkw1U3duSlpYT2hJd0RRWUpLb1pJaHZjTkFRRUwKQlFBd0t6RXBNQ2NHQTFVRUF3d2dabXg1ZEdWd2NtOXdaV3hzWlhJdGQyVmlhRzl2YXk1MWJtbHZiaTV6ZG1NdwpIaGNOTWpZd01USXpNREl6TmpVMVdoY05Nell3TVRJeE1ESXpOalUxV2pBck1Ta3dKd1lEVlFRRERDQm1iSGwwClpYQnliM0JsYkd4bGNpMTNaV0pvYjI5ckxuVnVhVzl1TG5OMll6Q0NBU0l3RFFZSktvWklodmNOQVFFQkJRQUQKZ2dFUEFEQ0NBUW9DZ2dFQkFJcjBSWDRGRjFNVTNYdFBmUHErMDUrMGI1Qm0xK3hsTVR5ZUp4bG5mWFJuRlgyegpPcHV6NmsreUpBQk4xRkxmNDVYRnlJclFlaW5DWWJtckZXalFRc3ZDWEloNkpPNGMwdmRMVVU5RERYcldWRFlsCkRNZytTUDlJVnZBMm5USkVFZ2RFREVsKzNWeThUbUJoNlNmcGthcXdHajY5SmhIdy9OeE9yZzM2bUY3TU5VZ1MKWXJVUkxqUFFzTUN6NElTVzhhQnEzS2IwRUFWUTA4V1lZQ0ZEbjY1aHBzVFdxR05rL3BKRnVTYjlsZll6RVdyYwpJeW5wNDhqTnIvNUdKS3NnWG9LeTUwZVo4UkppeTVyOUhENGtnSW15TVNQR2RDMXFZSVppRHRlZW8yOElxNk5nCmd1dWcyTGhlNndYYW9YZXFCQ2d3SHpwWFVXM1hJVzI5eFoxUkFRRUNBd0VBQWFOVE1GRXdIUVlEVlIwT0JCWUUKRktITDQ1RWhQaVg5bnlRN241QUk0YUZVNFlCL01COEdBMVVkSXdRWU1CYUFGS0hMNDVFaFBpWDlueVE3bjVBSQo0YUZVNFlCL01BOEdBMVVkRXdFQi93UUZNQU1CQWY4d0RRWUpLb1pJaHZjTkFRRUxCUUFEZ2dFQkFEeTRac21XCkNjZ3B5TmI4cU1ydzNYclBNejZLMTRlTVhqQVk1bERFYXM1NklLZExoZFM1bi9sWk95bENuUUtOZ3RHS0VrY3UKT05tMEE2Z0xNT3UzcE43dG9aNXh0WktOcnUvS0x6dm44QUNNNndCbUR0bG5oRERDa3ZUTHBVRzh6UTZSNmlaegpzTWlhVmpLSUJiWkJsZXNYSHZZQnQ2YlRTN2o1QWM2aTdYc1RPK3ZqQ2RUQjVYdCtEZ0x2QnBJRWh2ZGJzNXhmClFtc3ZzQmwxenJnR3JXemo4d0txcXNUMzZaOVU1TzdSK1RJMlQrUDVOR25SeHNQOGJ0R0RtMGZ5SW55WlJrSkcKVTdpVFJyMlZRWlRyaTNZQ2dlVjN1UjFtMnJjc2hyU1Avb082T0QyTXR5Q1FMRWtFK1pxVUVJVnpSYmVaNXl2RQptM29YMFV2OUtBUy9pWTA9Ci0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K + service: + name: union-pod-webhook + namespace: union + path: /mutate--v1-pod/secrets + port: 443 + failurePolicy: Fail + matchPolicy: Equivalent + namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: union + objectSelector: + matchLabels: + inject-flyte-secrets: "true" + organization: '' + reinvocationPolicy: Never + rules: + - apiGroups: + - '*' + apiVersions: + - v1 + operations: + - CREATE + resources: + - 
pods + scope: '*' + sideEffects: NoneOnDryRun + timeoutSeconds: 30 +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Imported via https://github.com/knative/operator/releases/download/knative-v1.16.0/operator.yaml +# Copyright 2024 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +# Source: dataplane/templates/propeller/serviceaccount.yaml +--- +--- +# Source: dataplane/templates/serving/knative-serving.yaml +apiVersion: operator.knative.dev/v1beta1 +kind: KnativeServing +metadata: + name: union-operator-serving + namespace: union + labels: + helm.sh/chart: dataplane-2026.4.4 + app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: serving spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/name: flytepropeller + config: + deployment: + progress-deadline: "30m" + queue-sidecar-cpu-request: "25m" + queue-sidecar-cpu-limit: "1000m" + queue-sidecar-memory-request: "400Mi" + queue-sidecar-memory-limit: "800Mi" + queue-sidecar-ephemeral-storage-request: "512Mi" + queue-sidecar-ephemeral-storage-limit: "1024Mi" + registries-skipping-tag-resolving: managed.cr.union.ai + features: + kubernetes.podspec-affinity: "enabled" + kubernetes.podspec-nodeselector: "enabled" + kubernetes.podspec-tolerations: "enabled" + kubernetes.podspec-fieldref: "enabled" + 
kubernetes.podspec-dnspolicy: "enabled" + kubernetes.podspec-schedulername: "enabled" + kubernetes.podspec-securitycontext: "enabled" + network: + ingress-class: "kourier.ingress.networking.knative.dev" + high-availability: + replicas: 2 + ingress: + kourier: + enabled: true + bootstrap-configmap: "union-operator-serving-envoy-bootstrap" + service-type: ClusterIP + podDisruptionBudgets: + - name: 3scale-kourier-gateway-pdb + minAvailable: 50% + - name: activator-pdb + minAvailable: 50% + - name: webhook-pdb + minAvailable: 50% + registry: + override: + # TODO(jeev): Wire up Union fork of Envoy + 3scale-kourier-gateway/kourier-gateway: ghcr.io/unionai/envoy:456fed84d4ad9a9dfb186d117d9362e9dc0f7c1f + # TODO(jeev): Wire up Union fork of Kourier + net-kourier-controller/controller: ghcr.io/unionai/kourier@sha256:5804c348d15b3959604e3e3ceed216c3a1c7b32cbe254c7d3eb02a35e62ba9c4 + workloads: + - name: 3scale-kourier-gateway + labels: + helm.sh/chart: dataplane-2026.4.4 + app.kubernetes.io/name: release-name-dataplane app.kubernetes.io/instance: release-name - template: - metadata: - annotations: - configChecksum: "f99369e37d4a20dded730f6900a334aaa4b1b1c3e62f534e17d7967785c62a8" - - labels: - platform.union.ai/zone: "dataplane" - - - app.kubernetes.io/name: flytepropeller - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm - spec: - priorityClassName: system-cluster-critical - containers: - - command: - - flytepropeller - - --config - - /etc/flyte/config/*.yaml - - --propeller.cluster-id - - '' - env: - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: GOMEMLIMIT - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.memory - - name: GOMAXPROCS - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.cpu - - name: CLUSTER_NAME - valueFrom: - secretKeyRef: - name: 
operator-cluster-name - key: cluster_name - - name: DEPLOYMENT_NAME - value: operator - - name: PROXY_SERVICE_URL - value: http://union-operator-proxy:8080 - - name: PROMETHEUS_SERVICE_URL - value: http://union-operator-prometheus:80 - - name: KNATIVE_PROXY_SERVICE_URL - value: http://kourier-internal - image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" - imagePullPolicy: "IfNotPresent" - name: flytepropeller - ports: - - containerPort: 10254 - resources: - limits: - cpu: "3" - memory: 3Gi - requests: - cpu: "1" - memory: 1Gi - volumeMounts: - - name: config-volume - mountPath: /etc/flyte/config - - name: auth - mountPath: /etc/union/secret - serviceAccountName: flytepropeller-system - volumes: - - configMap: - name: flyte-propeller-config - name: config-volume - - name: auth - secret: - secretName: union-secret-auth + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: 3scale-kourier-gateway + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchLabels: + app.kubernetes.io/name: 3scale-kourier-gateway + app.kubernetes.io/instance: release-name + topologyKey: topology.kubernetes.io/zone + annotations: + checksum/bootstrap-config: 6469f6f592eebc9e0c676d8ccc359a05bc799c0988e474b2da942c4cc328f656 + env: + - container: kourier-gateway + envVars: + - name: UNION_AUTHZ_TENANTAUTHURL + value: "https:///me" + - name: UNION_AUTHZ_TENANTAUTHSIGNINURL + value: "https:///login" + - name: UNION_AUTHZ_TENANTCONTROLPLANEURL + value: "https://" + resources: + - container: kourier-gateway + limits: + cpu: "2" + memory: 2Gi + requests: + cpu: "1" + memory: 1Gi + - name: activator + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - activator + topologyKey: topology.kubernetes.io/zone + - name: 
autoscaler + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - autoscaler + topologyKey: topology.kubernetes.io/zone + - name: autoscaler-hpa + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - autoscaler-hpa + topologyKey: topology.kubernetes.io/zone + - name: controller + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - controller + topologyKey: topology.kubernetes.io/zone + - name: net-kourier-controller + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - net-kourier-controller + topologyKey: topology.kubernetes.io/zone + env: + - container: controller + envVars: + - name: KOURIER_UNION_AUTHZ_ENABLED + value: "true" + resources: + - container: controller + limits: + cpu: "1" + memory: 1Gi + requests: + cpu: 500m + memory: 500Mi + - name: webhook + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - webhook + topologyKey: topology.kubernetes.io/zone +--- +# Source: dataplane/templates/common/task-podtemplate.yaml +apiVersion: v1 +kind: PodTemplate +metadata: + name: task-template + namespace: union +template: + spec: + serviceAccountName: union + containers: + - name: default + image: docker.io/rwgrim/docker-noop --- # Source: dataplane/charts/fluentbit/templates/tests/test-connection.yaml apiVersion: v1 diff --git 
a/tests/generated/dataplane.dcgm-exporter.yaml b/tests/generated/dataplane.dcgm-exporter.yaml index 6530b7f7..6f10b2c7 100644 --- a/tests/generated/dataplane.dcgm-exporter.yaml +++ b/tests/generated/dataplane.dcgm-exporter.yaml @@ -1,40 +1,4 @@ --- -# Source: dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: flytesnacks-development ---- -# Source: dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: flytesnacks-staging ---- -# Source: dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: flytesnacks-production ---- -# Source: dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: union-health-monitoring-development ---- -# Source: dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: union-health-monitoring-staging ---- -# Source: dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: union-health-monitoring-production ---- # Source: dataplane/charts/dcgm-exporter/templates/serviceaccount.yaml # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. # @@ -64,121 +28,141 @@ metadata: app.kubernetes.io/managed-by: Helm automountServiceAccountToken: false --- -# Source: dataplane/charts/fluentbit/templates/serviceaccount.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + apiVersion: v1 kind: ServiceAccount metadata: - name: fluentbit-system - namespace: union + name: knative-operator + namespace: "union" labels: - helm.sh/chart: fluentbit-0.48.9 - app.kubernetes.io/name: fluentbit - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "3.2.8" - app.kubernetes.io/managed-by: Helm + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: operator-webhook + namespace: "union" + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator --- -# Source: dataplane/charts/kube-state-metrics/templates/serviceaccount.yaml +# Source: dataplane/charts/prometheus/charts/kube-state-metrics/templates/serviceaccount.yaml apiVersion: v1 kind: ServiceAccount automountServiceAccountToken: true metadata: labels: - helm.sh/chart: kube-state-metrics-5.30.1 + helm.sh/chart: kube-state-metrics-5.25.1 app.kubernetes.io/managed-by: Helm app.kubernetes.io/component: metrics app.kubernetes.io/part-of: kube-state-metrics app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" + app.kubernetes.io/version: "2.13.0" name: release-name-kube-state-metrics namespace: union --- -# Source: dataplane/charts/opencost/templates/serviceaccount.yaml +# Source: dataplane/charts/prometheus/templates/serviceaccount.yaml apiVersion: v1 kind: ServiceAccount metadata: - name: release-name-opencost - namespace: union labels: - helm.sh/chart: opencost-1.42.0 - app.kubernetes.io/name: opencost + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "1.111.0" - app.kubernetes.io/part-of: opencost + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 app.kubernetes.io/managed-by: Helm -automountServiceAccountToken: true ---- -# Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: union-clustersync-system + app.kubernetes.io/part-of: prometheus + name: union-operator-prometheus namespace: union + annotations: + {} --- -# Source: dataplane/templates/imagebuilder/serviceaccount.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: union-imagebuilder ---- -# Source: 
dataplane/templates/nodeexecutor/serviceaccount.yaml +# Source: dataplane/templates/common/system-serviceaccount.yaml apiVersion: v1 kind: ServiceAccount metadata: - name: executor + name: union-system namespace: union - labels: - app: executor ---- -# Source: dataplane/templates/operator/serviceaccount-proxy.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: proxy-system - labels: - app.kubernetes.io/name: operator-proxy - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm + annotations: --- -# Source: dataplane/templates/operator/serviceaccount.yaml +# Source: dataplane/templates/common/union-serviceaccount.yaml apiVersion: v1 kind: ServiceAccount metadata: - name: operator-system - labels: - app.kubernetes.io/name: union-operator - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm + name: union + namespace: union + annotations: + eks.amazonaws.com/role-arn: arn:aws:iam::ACCOUNT_ID:role/flyte_project_role +automountServiceAccountToken: true --- -# Source: dataplane/templates/prometheus/serviceaccount.yaml +# Source: dataplane/templates/flyteconnector/serviceaccount.yaml apiVersion: v1 kind: ServiceAccount metadata: - name: union-operator-prometheus + name: flyteconnector namespace: union - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane + labels: + app.kubernetes.io/name: flyteconnector app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus --- -# Source: dataplane/templates/propeller/serviceaccount-webhook.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: flytepropeller-webhook-system - namespace: union ---- -# Source: dataplane/templates/propeller/serviceaccount.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# 
Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + apiVersion: v1 -kind: ServiceAccount +kind: Secret metadata: - name: flytepropeller-system - namespace: union + name: operator-webhook-certs + namespace: "union" + labels: + app.kubernetes.io/component: operator-webhook + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +# The data is populated at install time. --- # Source: dataplane/templates/common/cluster-secret.yaml apiVersion: v1 @@ -189,14 +173,22 @@ type: Opaque data: cluster_name: --- -# Source: dataplane/templates/propeller/deployment-webhook.yaml -# Create an empty secret that the first propeller pod will populate +# Source: dataplane/templates/webhook/mutatingwebhookconfiguration.yaml apiVersion: v1 kind: Secret metadata: - name: flyte-pod-webhook + name: union-pod-webhook namespace: union + labels: + app.kubernetes.io/name: union-pod-webhook + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm type: Opaque +data: + ca.crt: 
LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUROekNDQWgrZ0F3SUJBZ0lVSm9kQ09lem4vd1BHQTZjYkw1U3duSlpYT2hJd0RRWUpLb1pJaHZjTkFRRUwKQlFBd0t6RXBNQ2NHQTFVRUF3d2dabXg1ZEdWd2NtOXdaV3hzWlhJdGQyVmlhRzl2YXk1MWJtbHZiaTV6ZG1NdwpIaGNOTWpZd01USXpNREl6TmpVMVdoY05Nell3TVRJeE1ESXpOalUxV2pBck1Ta3dKd1lEVlFRRERDQm1iSGwwClpYQnliM0JsYkd4bGNpMTNaV0pvYjI5ckxuVnVhVzl1TG5OMll6Q0NBU0l3RFFZSktvWklodmNOQVFFQkJRQUQKZ2dFUEFEQ0NBUW9DZ2dFQkFJcjBSWDRGRjFNVTNYdFBmUHErMDUrMGI1Qm0xK3hsTVR5ZUp4bG5mWFJuRlgyegpPcHV6NmsreUpBQk4xRkxmNDVYRnlJclFlaW5DWWJtckZXalFRc3ZDWEloNkpPNGMwdmRMVVU5RERYcldWRFlsCkRNZytTUDlJVnZBMm5USkVFZ2RFREVsKzNWeThUbUJoNlNmcGthcXdHajY5SmhIdy9OeE9yZzM2bUY3TU5VZ1MKWXJVUkxqUFFzTUN6NElTVzhhQnEzS2IwRUFWUTA4V1lZQ0ZEbjY1aHBzVFdxR05rL3BKRnVTYjlsZll6RVdyYwpJeW5wNDhqTnIvNUdKS3NnWG9LeTUwZVo4UkppeTVyOUhENGtnSW15TVNQR2RDMXFZSVppRHRlZW8yOElxNk5nCmd1dWcyTGhlNndYYW9YZXFCQ2d3SHpwWFVXM1hJVzI5eFoxUkFRRUNBd0VBQWFOVE1GRXdIUVlEVlIwT0JCWUUKRktITDQ1RWhQaVg5bnlRN241QUk0YUZVNFlCL01COEdBMVVkSXdRWU1CYUFGS0hMNDVFaFBpWDlueVE3bjVBSQo0YUZVNFlCL01BOEdBMVVkRXdFQi93UUZNQU1CQWY4d0RRWUpLb1pJaHZjTkFRRUxCUUFEZ2dFQkFEeTRac21XCkNjZ3B5TmI4cU1ydzNYclBNejZLMTRlTVhqQVk1bERFYXM1NklLZExoZFM1bi9sWk95bENuUUtOZ3RHS0VrY3UKT05tMEE2Z0xNT3UzcE43dG9aNXh0WktOcnUvS0x6dm44QUNNNndCbUR0bG5oRERDa3ZUTHBVRzh6UTZSNmlaegpzTWlhVmpLSUJiWkJsZXNYSHZZQnQ2YlRTN2o1QWM2aTdYc1RPK3ZqQ2RUQjVYdCtEZ0x2QnBJRWh2ZGJzNXhmClFtc3ZzQmwxenJnR3JXemo4d0txcXNUMzZaOVU1TzdSK1RJMlQrUDVOR25SeHNQOGJ0R0RtMGZ5SW55WlJrSkcKVTdpVFJyMlZRWlRyaTNZQ2dlVjN1UjFtMnJjc2hyU1Avb082T0QyTXR5Q1FMRWtFK1pxVUVJVnpSYmVaNXl2RQptM29YMFV2OUtBUy9pWTA9Ci0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K + tls.crt: 
LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUQxakNDQXI2Z0F3SUJBZ0lVREtRN3A4MFpIdkQyUE5Ya3p3VGRLem9iUjg4d0RRWUpLb1pJaHZjTkFRRUwKQlFBd0t6RXBNQ2NHQTFVRUF3d2dabXg1ZEdWd2NtOXdaV3hzWlhJdGQyVmlhRzl2YXk1MWJtbHZiaTV6ZG1NdwpIaGNOTWpZd01USXpNREl6TmpVMVdoY05NamN3TVRJek1ESXpOalUxV2pBck1Ta3dKd1lEVlFRRERDQm1iSGwwClpYQnliM0JsYkd4bGNpMTNaV0pvYjI5ckxuVnVhVzl1TG5OMll6Q0NBU0l3RFFZSktvWklodmNOQVFFQkJRQUQKZ2dFUEFEQ0NBUW9DZ2dFQkFMNnMxUTRaZ3ZxVVdNdFp5VC9EKytEQWl6WmR0Q1lkVzArSGk4S3phQ1Vjbi9zZwo3a21NTEhDM2ZlZWtZK2ZUNENHUTd6K1l2WjRWWSt6WGljKzdrNndLbUZqdm1acEM1Rm96OWo1Q21NRk5HZmlWCnRobzVvSU9MUkIzNXRYMU1ROHUwNFVzT29QcjdMK016eEdxWXVmTFdtN1NVdjh0eEp3ZFMxZVlzMllNV1pMeU4KekI4akdWY01Qc2ZNVkdxdFhTTVljQ3hNYURKaTM1N0VEZSs3clM2U2ZtbnFtYjJxekdQM1BRRStCUm91Yk5nLwo4NCt4c253VXBLZUFpem95dFM2c2ZlOTdMUEtRTW9oVzd2TjZORFJrLy9xSmpHem9iZmZwUHU3endhWHVXMWtMCmllc2g2cVRrdlZLSTF3WFJ2WHArVU5GbjZzdDIwdEVlRks0ZzMwOENBd0VBQWFPQjhUQ0I3akFKQmdOVkhSTUUKQWpBQU1Bc0dBMVVkRHdRRUF3SUY0RENCa3dZRFZSMFJCSUdMTUlHSWdoWm1iSGwwWlhCeWIzQmxiR3hsY2kxMwpaV0pvYjI5cmdoeG1iSGwwWlhCeWIzQmxiR3hsY2kxM1pXSm9iMjlyTG5WdWFXOXVnaUJtYkhsMFpYQnliM0JsCmJHeGxjaTEzWldKb2IyOXJMblZ1YVc5dUxuTjJZNEl1Wm14NWRHVndjbTl3Wld4c1pYSXRkMlZpYUc5dmF5NTEKYm1sdmJpNXpkbU11WTJ4MWMzUmxjaTVzYjJOaGJEQWRCZ05WSFE0RUZnUVV1d25wcDZ1MTNKL0NsNnI3eDZ1Sgo5YmZ1RDhzd0h3WURWUjBqQkJnd0ZvQVVvY3Zqa1NFK0pmMmZKRHVma0FqaG9WVGhnSDh3RFFZSktvWklodmNOCkFRRUxCUUFEZ2dFQkFFMmlOa1MzczhGT3Z0cDhrVmJNZUN2amxIQytvYTVlZTFGNmowNlZ3bFJPOWdxOXJla1UKMGptWUhNWGNjWjcrZXJkNFRQRXgwK3ZqMzNhRTV3cDFmdVJJL2xIZHhuOUpJRHg2bTFQTHdVa3p1WG9EekYxLwozVXVIWU1xOGkrT2xsY0tqbmdVMUFUNmRWRCtIbFVWeHpMdjlNazdPb05SQUY5dTJHa1hiUHdCN0I1ZGRxdWxlCmtNcXBxbDhES2s4SVNVelVsTjNBb2Q5NmJUelIyV3RCVkxpTi9zRHZtTDZwZThFYm55RnBUb1BZTDlGeWZOcmIKT1ZPZEJwUkZFTG1KOHZaZWY5d1lxRmVXMTE3SGV5VDArZkxGZVNSemZUNVVWUlVraVFBcUVJdUhHUVRrOVFabgo1WnYvNUF0eEg3WE1rcXR0VE1zMFlSUG0xYjQzY3FBeFdhND0KLS0tLS1FTkQgQ0VSVElGSUNBVEUtLS0tLQo= + tls.key: 
LS0tLS1CRUdJTiBQUklWQVRFIEtFWS0tLS0tCk1JSUV2UUlCQURBTkJna3Foa2lHOXcwQkFRRUZBQVNDQktjd2dnU2pBZ0VBQW9JQkFRQytyTlVPR1lMNmxGakwKV2NrL3cvdmd3SXMyWGJRbUhWdFBoNHZDczJnbEhKLzdJTzVKakN4d3QzM25wR1BuMCtBaGtPOC9tTDJlRldQcwoxNG5QdTVPc0NwaFk3NW1hUXVSYU0vWStRcGpCVFJuNGxiWWFPYUNEaTBRZCtiVjlURVBMdE9GTERxRDYreS9qCk04UnFtTG55MXB1MGxML0xjU2NIVXRYbUxObURGbVM4amN3Zkl4bFhERDdIekZScXJWMGpHSEFzVEdneVl0K2UKeEEzdnU2MHVrbjVwNnBtOXFzeGo5ejBCUGdVYUxtellQL09Qc2JKOEZLU25nSXM2TXJVdXJIM3ZleXp5a0RLSQpWdTd6ZWpRMFpQLzZpWXhzNkczMzZUN3U4OEdsN2x0WkM0bnJJZXFrNUwxU2lOY0YwYjE2ZmxEUlorckxkdExSCkhoU3VJTjlQQWdNQkFBRUNnZ0VBRjV2SnMzbTRMSE9DdlFic2NwOC9DQmgvQkNlOC92MGlpYW5VUmJLMkFlWlYKN3Arb3NXV1FPUktYSGIvT2VPMVVjb09DQkFOUzh3aGQrM3pDZlB5U0w3cU9HM0RyT1Z6djdqVkIxM3FpZEVpcQpId3ZXWk0vZXpuckhYOWpEdm5SYmJwVUNVaXRKQmxwa2x3S1pYc056UHB5UTRkNkxFdEw3VEo1V2lxM2g3cWQzCjhncmJoeWlWNDhZRFAyYkhpZGt0S2QvMHBNL3piSzZPTS9iMFpLUmRBWVpHQXlOcklMaFJ3eDNsQXE2SDVqL20KUHBuNlZEdVQ3Y1h6UFhmNTgxR0IyNjQxWFJVTXRNWGVEREtaU3JudnIrbG5tV2lsYXZmNkg3ZHp4NlRBMmk0RApybCtIRVdUN1RSQUZGMnZ5SVUrYSt5RzNPVFBKRCtmVTdCWWh4Zld4aFFLQmdRRGVvcXVCbzQxK1dKU255blZnCmNWMHZoUEkwZ0dyRGQzNFdIbCtGNTZuSmxuMVVlbUQ2SUQ0TUhnNVpHWmNmd3NkRlB0aHl1VnB0OVl1SW11SGQKRHE5ZDhpWnpHUzJVbStkSWlsWjhnZXJ5Q3dON3hIbFM3SzJrc0RmRGJwYmZZSXZ1LzRnSExZRHh3YzQ0OG54ZwpWNGpCUFY2QTdYTHRqQzZzdFE2OW9uTlIwd0tCZ1FEYlFBUTRGdWpnUjJUOXh2Y01MdXhMbm1YaHE4U2NoZlM5CkRoWndDUzZIeHB1YlBxblBjQ04vQUZjbDZ1cDRXVEVZdVY4bEpwcGwwRmpYQ2lMdkhuUU15cm5vRjlUOThpanQKZGVHYjF6ZWlwRUpndFpNVHkvOGdERWorZU1wSFBuYTZwQVpMOVpRVUdLR2dDNGQ0SzljdVNHMmRlb1dSU0ViSQpMVGgzeGpXVEZRS0JnQStJRWFidG5nVmVjS0IwQTFSREZGa29VUzFRZUNKQ3g4MExPV2JDRHBvOW9XaXZVT3lpCkt6SDFOdE1JY2Y2SlBCV2NtTVVJSVVMaWltVnhTS2gvU2NTb0MvNmpsd1p6Q2VPSm94YjBpVXR4Y1VERktDR2MKMlZCUDZ0UDdkeE1HVFR6VEhzNUJZbWw3TjhQSlJ0d0J2MHliMTJmdktNRmhzaS9pUWJFQkVFSjVBb0dBUm9NSApHRmJkM0V0NXdsZzcyYkk1a25SRnhkY3RLejIxb1J1bndhNWlSWTV0T3Zkak8zQ1FLZWNkSC9lMklyQmtwdFB2Ck1vNkF0MS9UUW8xakFNNGxlbnUwWUYxUnhiNGN4WW5VM2Y3UVNNRlZDNjg1dHZNemdNWVNyNngvT1h4d1NNUTUKdGpJcnhtN0poQ1JSRkNmZFUyZzl3SmpIM2hxRmtSbGlBTHRCUGFVQ2dZ
RUF2QkRHejQvYS9kRzRJT0VpREk4SgpBbkxaRWZaTFJrdnI0ejRyT0p0TGZtWlFjTW5DSUZmKzBaelhTME9XOWhvMjk5RUNtS1l4U2VUWk1WSDgzOFZCCnRvSkNGM3lzb01rSC9meEVXTUpicDlMblJwSFUzUElicHlhRG1nWW1YdmsvK3laL3Y1c0dDTWNtVGkzWVFLRGsKcStlK1o0SHZyaGUxVytUaHpwOCtoNGs9Ci0tLS0tRU5EIFBSSVZBVEUgS0VZLS0tLS0K --- # Source: dataplane/charts/dcgm-exporter/templates/metrics-configmap.yaml apiVersion: v1 @@ -292,142 +284,516 @@ data: DCGM_FI_PROF_PCIE_TX_BYTES, counter, The number of bytes of active pcie tx data including both header and payload. DCGM_FI_PROF_PCIE_RX_BYTES, counter, The number of bytes of active pcie rx data including both header and payload. --- -# Source: dataplane/templates/clusterresourcesync/configmap.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2019 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v1 +kind: ConfigMap +metadata: + name: config-logging + namespace: "union" + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +data: {} +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2019 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + apiVersion: v1 kind: ConfigMap metadata: - name: union-clusterresourcesync-config - namespace: union + name: config-observability + namespace: "union" labels: - app.kubernetes.io/name: clusterresourcesync - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm -data: - cluster_resources.yaml: | - cluster_resources: - clusterName: '' - customData: - - production: - - projectQuotaCpu: - value: "4096" - - projectQuotaMemory: - value: 2Ti - - projectQuotaNvidiaGpu: - value: "256" - - defaultUserRoleKey: - value: 'eks.amazonaws.com/role-arn' - - defaultUserRoleValue: - value: 'arn:aws:iam::ACCOUNT_ID:role/flyte_project_role' - - staging: - - projectQuotaCpu: - value: "4096" - - projectQuotaMemory: - value: 2Ti - - projectQuotaNvidiaGpu: - value: "256" - - defaultUserRoleKey: - value: 'eks.amazonaws.com/role-arn' - - defaultUserRoleValue: - value: 'arn:aws:iam::ACCOUNT_ID:role/flyte_project_role' - - development: - - projectQuotaCpu: - value: "4096" - - projectQuotaMemory: - value: 2Ti - - projectQuotaNvidiaGpu: - value: "256" - - defaultUserRoleKey: - value: 'eks.amazonaws.com/role-arn' - - defaultUserRoleValue: - value: 'arn:aws:iam::ACCOUNT_ID:role/flyte_project_role' - refreshInterval: 5m - standaloneDeployment: true - templatePath: /etc/flyte/clusterresource/templates - clusterResourcesPrivate: - app: - isServerless: false - union: - auth: - authorizationMetadataKey: flyte-authorization - clientId: 'dataplane-operator' - clientSecretLocation: /etc/union/secret/client_secret - 
tokenRefreshWindow: 5m - type: ClientSecret - connection: - host: dns:/// - admin.yaml: | - admin: - clientId: 'dataplane-operator' - clientSecretLocation: /etc/union/secret/client_secret - endpoint: dns:/// - insecure: false - event: - capacity: 1000 - rate: 500 - type: admin - domain.yaml: | - domains: - - id: development - name: development - - id: staging - name: staging - - id: production - name: production - clusters.yaml: | - clusters: - clusterConfigs: [] - labelClusterMap: {} - logger.yaml: | - logger: - formatter: - type: json - level: 4 - show-source: true + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +data: {} --- -# Source: dataplane/templates/clusterresourcesync/configmap.yaml +# Source: dataplane/charts/prometheus/templates/cm.yaml apiVersion: v1 kind: ConfigMap metadata: - name: union-clusterresource-template - namespace: union labels: - app.kubernetes.io/name: clusterresourcesync + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: prometheus + name: union-operator-prometheus + namespace: union data: - a_namespace.yaml: | - apiVersion: v1 - kind: Namespace - metadata: - name: {{ namespace }} - labels: - union.ai/namespace-type: flyte - spec: - finalizers: - - kubernetes - - b_default_service_account.yaml: | - apiVersion: v1 - kind: ServiceAccount - metadata: - name: default - namespace: {{ namespace }} - annotations: - {{ defaultUserRoleKey }}: {{ defaultUserRoleValue }} + allow-snippet-annotations: "false" + alerting_rules.yml: | + {} + alerts: | + {} + prometheus.yml: | + global: + evaluation_interval: 1m + scrape_interval: 1m + scrape_timeout: 10s + rule_files: + - /etc/config/recording_rules.yml + - /etc/config/alerting_rules.yml + - /etc/config/rules + - 
/etc/config/alerts + scrape_configs: + - job_name: prometheus + static_configs: + - targets: + - localhost:9090 + - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + job_name: kubernetes-apiservers + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - action: keep + regex: default;kubernetes;https + source_labels: + - __meta_kubernetes_namespace + - __meta_kubernetes_service_name + - __meta_kubernetes_endpoint_port_name + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + job_name: kubernetes-nodes + kubernetes_sd_configs: + - role: node + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - replacement: kubernetes.default.svc:443 + target_label: __address__ + - regex: (.+) + replacement: /api/v1/nodes/$1/proxy/metrics + source_labels: + - __meta_kubernetes_node_name + target_label: __metrics_path__ + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + job_name: kubernetes-nodes-cadvisor + kubernetes_sd_configs: + - role: node + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - replacement: kubernetes.default.svc:443 + target_label: __address__ + - regex: (.+) + replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor + source_labels: + - __meta_kubernetes_node_name + target_label: __metrics_path__ + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + - honor_labels: true + job_name: kubernetes-service-endpoints + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scrape + - action: drop + regex: 
true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (.+?)(?::\d+)?;(\d+) + replacement: $1:$2 + source_labels: + - __address__ + - __meta_kubernetes_service_annotation_prometheus_io_port + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_service_name + target_label: service + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: node + - honor_labels: true + job_name: kubernetes-service-endpoints-slow + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) 
+ source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (.+?)(?::\d+)?;(\d+) + replacement: $1:$2 + source_labels: + - __address__ + - __meta_kubernetes_service_annotation_prometheus_io_port + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_service_name + target_label: service + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: node + scrape_interval: 5m + scrape_timeout: 30s + - honor_labels: true + job_name: prometheus-pushgateway + kubernetes_sd_configs: + - role: service + relabel_configs: + - action: keep + regex: pushgateway + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_probe + - honor_labels: true + job_name: kubernetes-services + kubernetes_sd_configs: + - role: service + metrics_path: /probe + params: + module: + - http_2xx + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_probe + - source_labels: + - __address__ + target_label: __param_target + - replacement: blackbox + target_label: __address__ + - source_labels: + - __param_target + target_label: instance + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - source_labels: + - __meta_kubernetes_service_name + target_label: service + - honor_labels: true + job_name: kubernetes-pods + kubernetes_sd_configs: + - role: pod + relabel_configs: + - action: 
keep + regex: true + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scrape + - action: drop + regex: true + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) + replacement: '[$2]:$1' + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: replace + regex: (\d+);((([0-9]+?)(\.|$)){4}) + replacement: $2:$1 + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_pod_name + target_label: pod + - action: drop + regex: Pending|Succeeded|Failed|Completed + source_labels: + - __meta_kubernetes_pod_phase + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: node + - honor_labels: true + job_name: kubernetes-pods-slow + kubernetes_sd_configs: + - role: pod + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) 
+ source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) + replacement: '[$2]:$1' + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: replace + regex: (\d+);((([0-9]+?)(\.|$)){4}) + replacement: $2:$1 + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_pod_name + target_label: pod + - action: drop + regex: Pending|Succeeded|Failed|Completed + source_labels: + - __meta_kubernetes_pod_phase + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: node + scrape_interval: 5m + scrape_timeout: 30s + - job_name: 'kube-state-metrics' + metrics_path: /metrics + scrape_interval: 1m + honor_labels: true + kubernetes_sd_configs: + - role: endpoints + namespaces: + own_namespace: true + relabel_configs: + - source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_name] + regex: kube-state-metrics + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + regex: http + action: keep + metric_relabel_configs: + - source_labels: [__name__] + regex: 
"kube_pod_container_resource_(limits|requests)|kube_pod_status_phase|kube_node_(labels|status_allocatable|status_condition|status_capacity)|kube_namespace_labels|kube_pod_container_status_(waiting|terminated|last_terminated).*_reason|kube_daemonset_status_number_unavailable|kube_deployment_status_replicas_unavailable|kube_resourcequota|kube_pod_info|kube_node_info|kube_pod_container_status_restarts_total" + action: keep + - source_labels: [__name__, phase] + separator: ";" + regex: "kube_pod_status_phase;(Succeeded|Failed)" + action: drop + - source_labels: [node] + target_label: nodename + regex: "(.*)" + action: replace + - source_labels: [label_node_group_name] + action: replace + regex: "(.+)" + target_label: label_node_pool_name + - job_name: 'opencost' + metrics_path: /metrics + scrape_interval: 1m + honor_labels: true + kubernetes_sd_configs: + - role: endpoints + namespaces: + own_namespace: true + relabel_configs: + - source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_name] + regex: opencost + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + regex: http + action: keep + metric_relabel_configs: + - source_labels: [__name__] + regex: "kube_node_labels|kube_pod_labels|node_total_hourly_cost|node_ram_hourly_cost|node_cpu_hourly_cost|node_gpu_hourly_cost" + action: keep + - job_name: 'union-services' + metrics_path: /metrics + scrape_interval: 1m + honor_labels: true + kubernetes_sd_configs: + - role: endpoints + namespaces: + own_namespace: true + relabel_configs: + - source_labels: [__meta_kubernetes_service_label_platform_union_ai_service_group] + regex: .+ + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + regex: debug + action: keep + # Flyte propeller metrics for execution info and fast task duration. + # No-op when propeller is not deployed (no pods match the label selector). 
+ - job_name: 'flytepropeller' + kubernetes_sd_configs: + - role: pod + namespaces: + own_namespace: true + selectors: + - role: pod + label: app.kubernetes.io/name=flytepropeller + metric_relabel_configs: + - source_labels: [__name__] + regex: "flyte:propeller:all:round:execution_info|flyte:propeller:all:node:fast_task:fast_task_execution_duration" + action: keep + relabel_configs: + - source_labels: [__meta_kubernetes_pod_container_name] + regex: flytepropeller + action: keep + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: pod + # Kourier gateway (envoy) metrics for app serving. + # No-op when serving is not enabled (no pods match the label selector). + - job_name: 'serving-envoy' + kubernetes_sd_configs: + - role: pod + namespaces: + own_namespace: true + selectors: + - role: pod + label: app=3scale-kourier-gateway + metrics_path: /stats/prometheus + metric_relabel_configs: + - source_labels: [__name__] + regex: "envoy_cluster_upstream_rq_xx|envoy_cluster_upstream_rq_time_bucket" + action: keep + relabel_configs: + - source_labels: [__meta_kubernetes_pod_container_port_name] + regex: metrics + action: keep + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: pod + # DCGM GPU metrics. + # No-op when dcgm-exporter is not deployed (no pods match the label selector). 
+ - job_name: 'gpu-metrics' + kubernetes_sd_configs: + - role: pod + namespaces: + names: + - kube-system + selectors: + - role: pod + label: app.kubernetes.io/name=dcgm-exporter - c_project_resource_quota.yaml: | - apiVersion: v1 - kind: ResourceQuota - metadata: - name: project-quota - namespace: {{ namespace }} - spec: - hard: - limits.cpu: {{ projectQuotaCpu }} - limits.memory: {{ projectQuotaMemory }} - requests.nvidia.com/gpu: {{ projectQuotaNvidiaGpu }} + recording_rules.yml: | + {} + rules: | + {} --- # Source: dataplane/templates/fluent-bit/configmap.yaml apiVersion: v1 @@ -475,14 +841,48 @@ data: region us-east-1 bucket --- -# Source: dataplane/templates/imagebuilder/configmap.yaml +# Source: dataplane/templates/imagebuilder/build-image-configmap.yaml apiVersion: v1 kind: ConfigMap metadata: - name : union-operator-buildkit + name: build-image-config + labels: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm data: - buildkitd.toml: | - debug = false + storage.yaml: | + storage: + container: "" + type: stow + stow: + kind: s3 + config: + auth_type: accesskey + access_key_id: + secret_key: + disable_ssl: false + endpoint: + region: us-east-1 + enable-multicontainer: false + limits: + maxDownloadMBs: 1024 + cache: + max_size_mbs: 0 + target_gc_percent: 70 + buildkit-uri: "tcp://union-operator-buildkit.union.svc.cluster.local:1234" + default-repository: "union-dataplane" + authentication-type: "noop" +--- +# Source: dataplane/templates/imagebuilder/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name : union-operator-buildkit +data: + buildkitd.toml: | + debug = false [log] format = "text" @@ -2631,6 +3031,18 @@ data: "version": 1 } --- +# Source: dataplane/templates/monitoring/prometheusrule.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: union-recording-rules + namespace: union + labels: + release: release-name +data: 
+ recording_rules.yml: | + groups: [] +--- # Source: dataplane/templates/nodeexecutor/configmap.yaml apiVersion: v1 kind: ConfigMap @@ -2685,8 +3097,13 @@ data: templateUris: - https://scale.neptune.ai/{{ .taskConfig.project }}/-/run/?customId={{ .taskConfig.id }} - kubernetes-enabled: true + kubernetes-enabled: false enabled_plugins.yaml: | + plugins: + connector-service: + defaultConnector: + defaultServiceConfig: '{"loadBalancingConfig": [{"round_robin":{}}]}' + endpoint: k8s:///flyteconnector.union:8000 tasks: task-plugins: default-for-task-types: @@ -2719,6 +3136,8 @@ data: cpu: 4096 gpu: 256 memory: 2Ti + namespace_mapping: + template: union union: connection: host: dns:/// @@ -2739,7 +3158,7 @@ data: cache-endpoint: dns:/// endpoint: dns:/// insecure: false - type: fallback + type: cacheservicev2 use-admin-auth: true logger: formatter: @@ -2759,6 +3178,7 @@ data: secure: false useAuth: false propeller: + limit-namespace: union node-config: disable-input-file-writes: true plugins: @@ -2766,7 +3186,7 @@ data: additional-worker-args: - --last-ack-grace-period-seconds - "120" - callback-uri: http://unionai-dataplane-executor.union.svc.cluster.local:15605 + callback-uri: http://union-operator-executor.union.svc.cluster.local:15605 grace-period-status-not-found: 2m ioutils: remoteFileOutputPaths: @@ -2776,6 +3196,7 @@ data: default-cpus: 100m default-env-vars: [] default-memory: 100Mi + default-pod-template-name: task-template co-pilot: image: 'cr.flyte.org/flyteorg/flytecopilot:v1.14.1' name: flyte-copilot- @@ -2816,6 +3237,7 @@ data: default-cpus: 100m default-env-vars: [] default-memory: 100Mi + default-pod-template-name: task-template config.yaml: | union: connection: @@ -2835,11 +3257,14 @@ data: operator: enabled: true enableTunnelService: true + # enableDepot: false tunnel: - enableDirectToAppIngress: false + enableDirectToAppIngress: true deploymentToRestart: union-operator-proxy + limitNamespace: union + disableClusterPermissions: true apps: - enabled: 
'false' + enabled: 'true' syncClusterConfig: enabled: false clusterId: @@ -2854,19 +3279,19 @@ data: userRole: 'arn:aws:iam::ACCOUNT_ID:role/flyte_project_role' userRoleKey: 'eks.amazonaws.com/role-arn' collectUsages: - enabled: true + enabled: false billing: model: Legacy dependenciesHeartbeat: - prometheus: - endpoint: 'http://union-operator-prometheus:80/-/healthy' - propeller: - endpoint: 'http://flytepropeller:10254' + executor: + endpoint: 'http://union-operator-executor:10254' proxy: endpoint: 'http://union-operator-proxy:10254' secretsWatcher: dryRun: true enabled: false + org: + namespaceTemplate: union imageBuilder: enabled: true executionNamespaceLabels: @@ -2874,9 +3299,6 @@ data: referenceConfigmapName: union-operator targetConfigMapName: "build-image-config" proxy: - imageBuilderConfig: - authenticationType: 'noop' - defaultRepository: '' persistedLogs: objectStore: pathTemplate: namespace-{{.KubernetesNamespace}}.pod-{{.KubernetesPodName}}.cont-{{.KubernetesContainerName}} @@ -2930,999 +3352,149 @@ data: endpoint: region: us-east-1 image-builder.buildkit-uri: "tcp://union-operator-buildkit.union.svc.cluster.local:1234" - image-builder.default-repository: "" + image-builder.default-repository: "union-dataplane" image-builder.authentication-type: "noop" --- -# Source: dataplane/templates/prometheus/configmap.yaml +# Source: dataplane/templates/serving/bootstrap-configmap.yaml +# We need to copy the default configmap here since knative-operator does not automatically update the +# address of the `net-kourier-controller` endpoint to use the release namespace. It assumes that the +# resources are being installed into the `knative-serving` namespace instead. 
apiVersion: v1 kind: ConfigMap metadata: - name: union-operator-prometheus + name: union-operator-serving-envoy-bootstrap namespace: union - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus data: - prometheus.yml: | - global: - scrape_interval: 15s - evaluation_interval: 15s - alerting: - alertmanagers: - - static_configs: - - targets: - rule_files: - - rules.yml - scrape_configs: - # Self-monitoring - - job_name: prometheus - metrics_path: /prometheus/metrics - static_configs: - - targets: ['localhost:9090'] - metric_relabel_configs: - - source_labels: [__name__] - regex: prometheus_tsdb_head_chunks_storage_size_bytes|prometheus_tsdb_head_series|prometheus_tsdb_head_series_created_total|prometheus_tsdb_head_series_removed_total|prometheus_tsdb_head_samples_appended_total|prometheus_tsdb_storage_blocks_bytes|prometheus_tsdb_retention_limit_bytes|prometheus_tsdb_wal_(.*)|.+_(limit|failed|missed|failures)_total - action: keep - - # Kube state metrics for pod/node resource tracking and cost calculations - - job_name: kube-state-metrics - static_configs: - - targets: ['release-name-kube-state-metrics:8080'] - metric_relabel_configs: - - separator: ; - source_labels: [__name__] - regex: kube_pod_container_resource_(limits|requests)|kube_pod_status_phase|kube_node_(labels|status_allocatable|status_condition|status_capacity)|kube_namespace_labels|kube_pod_container_status_(waiting|terminated|last_terminated).*_reason|kube_daemonset_status_number_unavailable|kube_deployment_status_replicas_unavailable|kube_resourcequota|kube_pod_info|kube_node_info|kube_pod_container_status_restarts_total - action: keep - - separator: ; - source_labels: [__name__, phase] - regex: kube_pod_status_phase;(Succeeded|Failed) - action: drop - - source_labels: [node] - target_label: nodename - 
regex: '(.*)' - action: replace - - source_labels: [label_node_group_name] - action: replace - regex: (.+) - target_label: label_node_pool_name - - # cAdvisor container metrics for CPU and memory tracking - - job_name: kubernetes-cadvisor - metrics_path: /metrics - scheme: https - kubernetes_sd_configs: - - role: node - namespaces: - names: [] - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - insecure_skip_verify: false - metric_relabel_configs: - - separator: ; - source_labels: [__name__] - regex: container_cpu_usage_seconds_total|container_memory_working_set_bytes - action: keep - relabel_configs: - - separator: ; - regex: __meta_kubernetes_node_label_(.+) - replacement: $1 - action: labelmap - - separator: ; - regex: (.*) - target_label: __address__ - replacement: kubernetes.default.svc:443 - action: replace - - source_labels: [__meta_kubernetes_node_name] - separator: ; - regex: (.+) - target_label: __metrics_path__ - replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor - action: replace - - # Flyte propeller metrics for execution info and fast task duration - - job_name: flytepropeller - kubernetes_sd_configs: - - role: pod - namespaces: - names: - - union - selectors: - - role: pod - label: app.kubernetes.io/name=flytepropeller - metric_relabel_configs: - - source_labels: [__name__] - regex: "flyte:propeller:all:round:execution_info|flyte:propeller:all:node:fast_task:fast_task_execution_duration" - action: keep - relabel_configs: - - source_labels: [__meta_kubernetes_pod_container_name] - regex: flytepropeller - action: keep - - source_labels: [__meta_kubernetes_namespace] - action: replace - target_label: namespace - - source_labels: [__meta_kubernetes_pod_name] - action: replace - target_label: pod - # OpenCost metrics for cost tracking - - job_name: opencost - static_configs: - - targets: ['release-name-opencost:9003'] - metric_relabel_configs: - - 
source_labels: [__name__] - regex: "kube_node_labels|kube_pod_labels|node_total_hourly_cost|node_ram_hourly_cost|node_cpu_hourly_cost|node_gpu_hourly_cost" - action: keep - # DCGM GPU metrics - - job_name: gpu-metrics - kubernetes_sd_configs: - - role: pod - namespaces: - names: - - kube-system - selectors: - - role: pod - label: app.kubernetes.io/name=dcgm-exporter - rules.yml: | - - groups: - - name: cost_calculations_15s - interval: 15s - rules: - - record: pod_gpu_allocation - expr: | - sum by (namespace, pod) (DCGM_FI_DEV_GPU_UTIL >= bool 0) * on (namespace, pod) group_left() (max by (namespace, pod) (kube_pod_status_phase{phase=~"Running|Pending"} == 1)) - - record: execution_info # A join metric to look up execution-level info. Used to disambiguate workflow/task executions, apps, and workspaces. - expr: | - max by (label_domain, label_project, label_entity_name, label_execution_id, label_entity_id)( - label_replace( - label_replace( - label_replace( - label_replace( - label_replace( - flyte:propeller:all:round:execution_info{domain!="", project!="", workflow_name!="", execution_id!=""}, # filter for workflow/task executions - "label_entity_id", "$1", "execution_id", "(.*)" # join key - ), "label_entity_name", "$1", "workflow_name", "(.*)" # set label_entity_name to the workflow/task name from the workflow_execution_id - ), - "label_execution_id", "$1", "execution_id", "(.*)" - ), - "label_project", "$1", "project", "(.*)" # project - ), - "label_domain", "$1", "domain", "(.*)" # domain - ) - ) - - record: app_info # A join metric to look up app-level info. Used to disambiguate workflow/task executions, apps, and workspaces. 
- expr: | - max by (label_domain, label_project, label_app_name, label_app_version, label_entity_id)( - label_replace( - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps - "label_app_name", "$1", "label_serving_unionai_dev_app_name", "(.*)" # rename to cleanup - ), - "label_app_version", "$1", "label_serving_knative_dev_revision", "(.*)" # the app_version is equivalent to an execution_id for workflows (lowest level of granularity) - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # join key - ) - ) - - record: workspace_info # A join metric to look up workspace info. Used to disambiguate workflow/task executions, apps, and workspaces. - expr: | - max by (label_domain, label_project, label_workspace_name, label_entity_id)( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # filter for workspaces - "label_entity_id", "$1", "label_node_id", "(.*)" # join key - ), "label_workspace_name", "$1", "label_node_id", "(.*)" # set label_workspace_name to the workspace name from the kube_pod_labels - ) - ) - - record: fast_task_execution_duration - expr: | - max by (label_domain, label_project, label_execution_id, label_entity_id, namespace, pod)( - label_replace( - label_replace( - label_replace( - label_replace( - label_replace( - label_replace( - flyte:propeller:all:node:fast_task:fast_task_execution_duration{domain!="", project!="", execution_id!="", exported_namespace!="", exported_pod!=""}, - "label_entity_id", "$1", "execution_id", "(.*)" # join key - ), - "label_execution_id", "$1", "execution_id", "(.*)" - ), - "label_project", "$1", "project", "(.*)" # project - ), - "label_domain", "$1", "domain", "(.*)" # domain - ), - "namespace", "$1", "exported_namespace", "(.*)" - ), - "pod", "$1", "exported_pod", 
"(.*)" - ) - ) - - record: fast_task_execution_duration_rate - expr: | - irate(fast_task_execution_duration[1m]) > 0 or absent(irate(fast_task_execution_duration[1m])) * fast_task_execution_duration - - record: entity_id:mem_usage_bytes_total_per_node:sum # Allocated memory (max(requested, consumed)) aggregated per node and entity, where entity is either a task/workflow execution or an app. - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( # aggregate up to entity - # First, calculate the allocated memory for each pod - max by (namespace, pod) ( # this is the case where consumed (the memory working set) exceeds requested memory - ( - sum by (namespace, pod) ( - container_memory_working_set_bytes{namespace!="",pod!="",image!=""} - ) - > sum by (namespace, pod) ( - kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="memory"} - ) - ) - or sum by (namespace, pod) ( # this is the case where memory requests are <= consumed memory - kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="memory"} # needed to add node!="" to dedupe - ) - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but we do not want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="",label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - # Now join in node identifiers which are used for subsequent overhead calculations - * on (namespace, pod) group_left(node) ( - max by (namespace, pod, node) (kube_pod_info{node!=""}) # needed to add node!="" to dedupe - ) - ) - - record: entity_id:cpu_usage_per_node:sum # Allocated cpu (max(requested, consumed)) aggregated per node and entity, where entity is either a task/workflow execution or an app. - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( - # First, calculate the allocated cpu for each pod - max by (namespace, pod) ( # this is the case where consumed (the cpu usage seconds total) exceeds requested cpu - ( - sum by (namespace, pod) ( - irate(container_cpu_usage_seconds_total{namespace!="",pod!="",image!=""}[5m]) - ) - > sum by (namespace, pod) ( - kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="cpu"} - ) - ) - or sum by (namespace, pod) ( # this is the case where cpu requests are <= consumed cpu - kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="cpu"} - ) - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - # Now join in node identifiers which are used for subsequent overhead calculations - * on (namespace, pod) group_left(node) ( - max by (namespace, pod, node) (kube_pod_info{node!=""}) # needed to add node!="" to dedupe - ) - ) - - record: entity_id:gpu_usage_per_node:sum # Allocated gpu aggregated per node and entity, where entity is either a task/workflow execution or an app. - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( - # First, grab the allocated gpu for each pod (which is always either 1 or zero, since k8s can't split gpus the way it can with cpu/memory) - max by (namespace, pod) ( - pod_gpu_allocation - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - # Now join in node identifiers which are used for subsequent overhead calculations - * on (namespace, pod) group_left(node) ( - max by (namespace, pod, node) (kube_pod_info{node!=""}) # needed to add node!="" to dedupe - ) - ) - - record: entity_id:used_mem_bytes:sum # the sum of used memory across all containers in an entity (numerator for aggregate utilization calculations) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # aggregate up to entity - # First, calculate the used memory for each pod - sum by (namespace, pod) ( - container_memory_working_set_bytes{namespace!="",pod!="",image!=""} - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but we do not want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - ) - - record: entity_id:allocated_mem_bytes:sum # the sum of allocated memory across all containers in an entity (denominator for aggregate utilization calculations) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # aggregate up to entity (remove node) - entity_id:mem_usage_bytes_total_per_node:sum - ) - - record: entity_id:used_cpu:sum # the sum of used cpu across all containers in an entity (numerator for aggregate utilization calculations) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( - sum by (namespace, pod) ( - irate(container_cpu_usage_seconds_total{namespace!="",pod!="",image!=""}[5m]) - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - ) - - record: entity_id:allocated_cpu:sum # the sum of allocated cpu across all containers in an entity (denominator for aggregate utilization calculations) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # aggregate up to entity (remove node) - entity_id:cpu_usage_per_node:sum - ) - - record: entity_id:sm_occupancy:avg # the simple average of SM occupancy (a good generic measure of GPU utilization) per entity - expr: | - avg by (label_entity_type, label_domain, label_project, label_entity_id) ( - # First, grab the SM occupancy for each pod - max by (namespace, pod) ( - DCGM_FI_PROF_SM_OCCUPANCY # SM occupancy is a good proxy for actual GPU usage - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - ) - - record: entity_id:gpu_count:sum # the count of running gpu pods per entity (need this to weight the gpu utilization when aggregating upwards - i.e. project-level) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( - # First, grab the allocated gpu for each pod (which is always either 1 or zero, since k8s can't split gpus the way it can with cpu/memory) - max by (namespace, pod) ( - pod_gpu_allocation - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - ) - - record: entity_id:weighted_sm_occupancy:sum # product of SM occupancy and allocated GPU count (something like "used memory", numerator of weighted calcs) - expr: | - entity_id:sm_occupancy:avg - * on (label_domain, label_project, label_entity_type, label_entity_id) entity_id:gpu_count:sum - - record: entity_id:allocated_mem_cost:sum # Allocated cost of memory for each workflow/task execution and app. - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, type) ( - entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB - * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type - ) - - record: entity_id:allocated_cpu_cost:sum # Allocated cost of cpu for each workflow/task execution and app. 
- expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, type)( - entity_id:cpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - - record: entity_id:allocated_gpu_cost:sum # Allocated cost of gpu for each workflow/task execution and app. - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, type)( - entity_id:gpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - - record: entity_id:allocated_cost:sum # Allocated cost of memory, cpu, and gpu for each workflow/task execution and app. - expr: | - label_replace( - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # for the sum to work, the labels need to be different on each "or" element (type label) - entity_id:allocated_mem_cost:sum - or - entity_id:allocated_cpu_cost:sum - or - entity_id:allocated_gpu_cost:sum - ), - "type", "allocated", "", "" # add type info - ) - - record: entity_id:overhead_cost:sum # The amount of overhead costs (node costs that we can't allocate with container resources) to allocate to each entity (workflow/task execution or app) - expr: | - label_replace( - sum by (label_entity_type, label_entity_id, label_domain, label_project)( # Aggregate the per-node metrics up to workflow/task execution or app (label_entity_id) - # Start with each execution's and app's allocated cost per node - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( # for the sum to work, the labels need to be different on each "or" element (type label) - entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB - * on (node) group_left(type) label_replace(avg by (node) 
(node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:cpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:gpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - # Then divide out the total allocated cost per node to get the proportion of allocated cost associated with each entity - / on (node) group_left()( - sum by (node) ( # for the sum to work, the labels need to be different on each "or" element (type label) - entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB - * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:cpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:gpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - > 0 # need to avoid dividing by zero, or gaps in the data can cause NaNs to proliferate, borking all charts - ) - # Then multiply by the overhead cost per node - * on (node) group_left() ( - # To calculate overhead, start with the true cost of running each node - avg by (node)(kube_node_labels{label_flyte_org_node_role="worker"}) # only look at worker nodes - * on (node) max by (node) ( - node_total_hourly_cost{instance_type!=""} # sometimes, the instance_type can be null, causing an 
unlabeled label to show up in the Compute Costs dashboard charts - ) * (15 / 3600) # convert hourly cost to 15-secondly cost - # Then subtract out the total allocated cost on each node - - on (node) group_left()( - sum by (node) ( # for the sum to work, the labels need to be different on each "or" element (type label) - entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB - * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:cpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:gpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - ) - ) - ), - "type", "overhead", "", "" # add type info - ) - - record: entity_id:total_cost:sum # Total cost of each entity (workflow/task execution or app), including allocated (from container resources) and overhead (proportion of unallocated node costs) - expr: | - label_replace( - sum by (label_domain, label_project, label_entity_id, label_entity_type) ( - entity_id:allocated_cost:sum - or - entity_id:overhead_cost:sum - ), - "type", "total", "", "" # add type info - ) - - record: node:total_cost:sum # Total cost of all nodes - expr: | - sum ( - avg by (node)(kube_node_labels{label_flyte_org_node_role="worker", label_node_kubernetes_io_instance_type!=""}) # only look at worker nodes - * on (node) group_left() node_total_hourly_cost{instance_type!=""} * (15 / 3600) # convert hourly cost to 15-secondly cost - ) - - record: node_type:total_cost:sum # Total cost of nodes grouped by node type - expr: | - sum by (node_type)( - avg by 
(node)(kube_node_labels{label_flyte_org_node_role="worker", label_node_kubernetes_io_instance_type!=""}) # only look at worker nodes - * on (node) group_left(node_type) label_replace(node_total_hourly_cost{instance_type!=""}, "node_type", "$1", "instance_type", "(.*)") * (15 / 3600) # convert hourly cost to 15-secondly cost and rename label - ) - - record: node_type:uptime_hours:sum # Total uptime of nodes grouped by node type - expr: | - sum by (node_type)( - avg by (node, node_type)( # dedupe - label_replace(kube_node_labels{label_flyte_org_node_role="worker", label_node_kubernetes_io_instance_type!=""}, "node_type", "$1", "label_node_kubernetes_io_instance_type", "(.*)") # relabel - ) - ) * (15 / 3600) # convert to number of hours per 15-second observation # Aggregate the above into visible metrics - - name: cost_rollup_15m - interval: 15m - rules: - - record: execution_info15m - expr: | - max_over_time(execution_info[15m:15s]) - - record: app_info15m - expr: | - max_over_time(app_info[15m:15s]) - - record: workspace_info15m - expr: | - max_over_time(workspace_info[15m:15s]) - - record: entity_id:allocated_mem_bytes:sum15m - expr: | - sum_over_time(entity_id:allocated_mem_bytes:sum[15m:15s]) - - record: entity_id:used_mem_bytes:sum15m - expr: | - sum_over_time(entity_id:used_mem_bytes:sum[15m:15s]) - - record: entity_id:allocated_cpu:sum15m - expr: | - sum_over_time(entity_id:allocated_cpu:sum[15m:15s]) - - record: entity_id:used_cpu:sum15m - expr: | - sum_over_time(entity_id:used_cpu:sum[15m:15s]) - - record: entity_id:weighted_sm_occupancy:sum15m - expr: | - sum_over_time(entity_id:weighted_sm_occupancy:sum[15m:15s]) - - record: entity_id:gpu_count:sum15m - expr: | - sum_over_time(entity_id:gpu_count:sum[15m:15s]) - - record: entity_id:allocated_mem_cost:sum15m - expr: | - sum_over_time(entity_id:allocated_mem_cost:sum[15m:15s]) - - record: entity_id:allocated_cpu_cost:sum15m - expr: | - sum_over_time(entity_id:allocated_cpu_cost:sum[15m:15s]) - - record: 
entity_id:allocated_gpu_cost:sum15m - expr: | - sum_over_time(entity_id:allocated_gpu_cost:sum[15m:15s]) - - record: entity_id:allocated_cost:sum15m - expr: | - sum_over_time(entity_id:allocated_cost:sum[15m:15s]) - - record: entity_id:overhead_cost:sum15m - expr: | - sum_over_time(entity_id:overhead_cost:sum[15m:15s]) - - record: entity_id:total_cost:sum15m - expr: | - sum_over_time(entity_id:total_cost:sum[15m:15s]) - - record: node:total_cost:sum15m - expr: | - sum_over_time(node:total_cost:sum[15m:15s]) - - record: node_type:total_cost:sum15m - expr: | - sum_over_time(node_type:total_cost:sum[15m:15s]) - - record: node_type:uptime_hours:sum15m - expr: | - sum_over_time(node_type:uptime_hours:sum[15m:15s]) + envoy-bootstrap.yaml: | + dynamic_resources: + ads_config: + transport_api_version: V3 + api_type: GRPC + rate_limit_settings: {} + grpc_services: + - envoy_grpc: {cluster_name: xds_cluster} + cds_config: + resource_api_version: V3 + ads: {} + lds_config: + resource_api_version: V3 + ads: {} + node: + cluster: kourier-knative + id: 3scale-kourier-gateway + static_resources: + listeners: + - name: stats_listener + address: + socket_address: + address: 0.0.0.0 + port_value: 9000 + filter_chains: + - filters: + - name: envoy.filters.network.http_connection_manager + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager + stat_prefix: stats_server + http_filters: + - name: envoy.filters.http.router + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router + route_config: + virtual_hosts: + - name: admin_interface + domains: + - "*" + routes: + - match: + safe_regex: + regex: '/(certs|stats(/prometheus)?|server_info|clusters|listeners|ready)?' 
+ headers: + - name: ':method' + string_match: + exact: GET + route: + cluster: service_stats + - match: + safe_regex: + regex: '/drain_listeners' + headers: + - name: ':method' + string_match: + exact: POST + route: + cluster: service_stats + clusters: + - name: service_stats + connect_timeout: 0.250s + type: static + load_assignment: + cluster_name: service_stats + endpoints: + lb_endpoints: + endpoint: + address: + socket_address: + address: 127.0.0.1 + port_value: 9901 + - name: xds_cluster + # This keepalive is recommended by envoy docs. + # https://www.envoyproxy.io/docs/envoy/latest/api-docs/xds_protocol + typed_extension_protocol_options: + envoy.extensions.upstreams.http.v3.HttpProtocolOptions: + "@type": type.googleapis.com/envoy.extensions.upstreams.http.v3.HttpProtocolOptions + explicit_http_config: + http2_protocol_options: + connection_keepalive: + interval: 30s + timeout: 5s + connect_timeout: 1s + load_assignment: + cluster_name: xds_cluster + endpoints: + lb_endpoints: + endpoint: + address: + socket_address: + address: "net-kourier-controller" + port_value: 18000 + type: STRICT_DNS + admin: + access_log: + - name: envoy.access_loggers.stdout + typed_config: + "@type": type.googleapis.com/envoy.extensions.access_loggers.stream.v3.StdoutAccessLog + address: + socket_address: + address: 127.0.0.1 + port_value: 9901 + stats_config: + stats_tags: + - tag_name: name + regex: '^.*?\.u/.*?n=(.*?)/.*?u\..*$' + - tag_name: domain + regex: '^.*?\.u/.*?d=(.*?)/.*?u\..*$' + - tag_name: org + regex: '^.*?\.u/.*?o=(.*?)/.*?u\..*$' + - tag_name: project + regex: '^.*?\.u/.*?p=(.*?)/.*?u\..*$' + - tag_name: target + regex: '^.*?\.u/.*?t=(.*?)/.*?u\..*$' + - tag_name: target_namespace + regex: '^.*?\.u/.*?tns=(.*?)/.*?u\..*$' --- -# Source: dataplane/templates/propeller/configmap.yaml +# Source: dataplane/templates/webhook/configmap.yaml apiVersion: v1 kind: ConfigMap metadata: - name: flyte-propeller-config + name: union-pod-webhook-config namespace: union data: - 
admin.yaml: | - admin: - clientId: 'dataplane-operator' - clientSecretLocation: /etc/union/secret/client_secret - endpoint: dns:/// - insecure: false - event: - capacity: 1000 - rate: 500 - type: admin - catalog.yaml: | - catalog-cache: - cache-endpoint: dns:/// - endpoint: dns:/// - insecure: false - type: fallback - use-admin-auth: true - copilot.yaml: | - plugins: - k8s: - co-pilot: - image: 'cr.flyte.org/flyteorg/flytecopilot:v1.14.1' - name: flyte-copilot- - start-timeout: 30s - core.yaml: | - propeller: - downstream-eval-duration: 30s - enable-admin-launcher: true - leader-election: - enabled: true - lease-duration: 15s - lock-config-map: - name: propeller-leader - namespace: 'union' - renew-deadline: 10s - retry-period: 2s - limit-namespace: all - literal-offloading-config: - enabled: true - max-workflow-retries: 30 - metadata-prefix: metadata/propeller - metrics-prefix: flyte - prof-port: 10254 - queue: - batch-size: -1 - batching-interval: 2s - queue: - base-delay: 5s - capacity: 1000 - max-delay: 120s - rate: 100 - type: maxof - sub-queue: - capacity: 100 - rate: 10 - type: bucket - type: batch - rawoutput-prefix: 's3://' - workers: 4 - workflow-reeval-duration: 30s + core.yaml: | + + webhook: certDir: /etc/webhook/certs + disableCreateMutatingWebhookConfig: true embeddedSecretManagerConfig: imagePullSecrets: enabled: true @@ -3930,81 +3502,13 @@ data: namespace: 'union' type: 'K8s' listenPort: '9443' + localCert: true secretManagerTypes: - Embedded - K8s - serviceName: flyte-pod-webhook + secretName: union-pod-webhook + serviceName: union-pod-webhook servicePort: '443' - enabled_plugins.yaml: | - plugins: - connector-service: - defaultConnector: - defaultServiceConfig: '{"loadBalancingConfig": [{"round_robin":{}}]}' - endpoint: k8s:///flyteconnector.union:8000 - tasks: - task-plugins: - default-for-task-types: - actor: fast-task - container: container - container_array: k8s-array - sidecar: sidecar - enabled-plugins: - - container - - sidecar - - 
k8s-array - - echo - - fast-task - - connector-service - k8s.yaml: | - plugins: - k8s: - default-cpus: 100m - default-env-vars: [] - default-memory: 100Mi - logger.yaml: | - logger: - formatter: - type: json - level: 4 - show-source: true - resource_manager.yaml: | - propeller: - resourcemanager: - type: noop - task_logs.yaml: | - plugins: - logs: - cloudwatch-enabled: false - dynamic-log-links: - - vscode: - displayName: VS Code Debugger - templateUris: - - /dataplane/pod/v1/generated_name/task/{{.executionProject}}/{{.executionDomain}}/{{.executionName}}/{{.nodeID}}/{{.taskRetryAttempt}}/{{.taskProject}}/{{.taskDomain}}/{{.taskID}}/{{.taskVersion}}/ - kubernetes-enabled: false - templates: - - displayName: Task Logs - scheme: TaskExecution - templateUris: - - /console/projects/{{.executionProject}}/domains/{{.executionDomain}}/executions/{{.executionName}}/nodeId/{{.nodeID}}/taskId/{{.taskID}}/attempt/{{.taskRetryAttempt}}/view/logs?duration=all&fromExecutionNav=true - storage.yaml: | - storage: - container: "" - type: stow - stow: - kind: s3 - config: - auth_type: accesskey - access_key_id: - secret_key: - disable_ssl: false - endpoint: - region: us-east-1 - enable-multicontainer: false - limits: - maxDownloadMBs: 1024 - cache: - max_size_mbs: 0 - target_gc_percent: 70 --- # Source: dataplane/charts/fluentbit/templates/clusterrole.yaml apiVersion: rbac.authorization.k8s.io/v1 @@ -4028,515 +3532,808 @@ rules: - list - watch --- -# Source: dataplane/charts/kube-state-metrics/templates/role.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: - labels: - helm.sh/chart: kube-state-metrics-5.30.1 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: kube-state-metrics - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" - name: release-name-kube-state-metrics -rules: - -- apiGroups: ["certificates.k8s.io"] - resources: - - certificatesigningrequests - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - configmaps - verbs: ["list", "watch"] - -- apiGroups: ["batch"] - resources: - - cronjobs - verbs: ["list", "watch"] - -- apiGroups: ["extensions", "apps"] - resources: - - daemonsets - verbs: ["list", "watch"] - -- apiGroups: ["extensions", "apps"] - resources: - - deployments - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - endpoints - verbs: ["list", "watch"] - -- apiGroups: ["autoscaling"] - resources: - - horizontalpodautoscalers - verbs: ["list", "watch"] - -- apiGroups: ["extensions", "networking.k8s.io"] - resources: - - ingresses - verbs: ["list", "watch"] - -- apiGroups: ["batch"] - resources: - - jobs - verbs: ["list", "watch"] - -- apiGroups: ["coordination.k8s.io"] - resources: - - leases - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - limitranges - verbs: ["list", "watch"] - -- apiGroups: ["admissionregistration.k8s.io"] - resources: - - mutatingwebhookconfigurations - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - namespaces - verbs: 
["list", "watch"] + name: knative-serving-operator-aggregated + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +aggregationRule: + clusterRoleSelectors: + # This (along with escalate below) allows the Operator to pick up any + # roles that are provided to the admin of the cluster by knative serving + # automatically. + - matchExpressions: + - {key: serving.knative.dev/release, operator: Exists} +rules: [] # Rules are automatically filled in by the controller manager. +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: knative-serving-operator-aggregated-stable + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +aggregationRule: + clusterRoleSelectors: + # This (along with escalate below) allows the Operator to pick up any + # roles that are provided to the admin of the cluster by knative serving + # automatically. + - matchExpressions: + - {key: app.kubernetes.io/name, operator: In, values: ["knative-serving"]} +rules: [] # Rules are automatically filled in by the controller manager. +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: knative-eventing-operator-aggregated + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +aggregationRule: + clusterRoleSelectors: + # This (along with escalate below) allows the Operator to pick up any + # roles that are provided to the admin of the cluster by knative eventing + # automatically. + - matchExpressions: + - {key: eventing.knative.dev/release, operator: Exists} +rules: [] # Rules are automatically filled in by the controller manager. 
+--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: knative-eventing-operator-aggregated-stable + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +aggregationRule: + clusterRoleSelectors: + # This (along with escalate below) allows the Operator to pick up any + # roles that are provided to the admin of the cluster by knative eventing + # automatically. + - matchExpressions: + - {key: app.kubernetes.io/name, operator: In, values: ["knative-eventing"]} +rules: [] # Rules are automatically filled in by the controller manager. +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
-- apiGroups: ["networking.k8s.io"] - resources: - - networkpolicies - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - nodes - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - persistentvolumeclaims - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - persistentvolumes - verbs: ["list", "watch"] - -- apiGroups: ["policy"] - resources: - - poddisruptionbudgets - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - pods - verbs: ["list", "watch"] - -- apiGroups: ["extensions", "apps"] - resources: - - replicasets - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - replicationcontrollers - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - resourcequotas - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - secrets - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - services - verbs: ["list", "watch"] - -- apiGroups: ["apps"] - resources: - - statefulsets - verbs: ["list", "watch"] - -- apiGroups: ["storage.k8s.io"] - resources: - - storageclasses - verbs: ["list", "watch"] - -- apiGroups: ["admissionregistration.k8s.io"] - resources: - - validatingwebhookconfigurations - verbs: ["list", "watch"] - -- apiGroups: ["storage.k8s.io"] - resources: - - volumeattachments - verbs: ["list", "watch"] ---- -# Source: dataplane/charts/opencost/templates/clusterrole.yaml -# Cluster role giving opencost to get, list, watch required resources -# No write permissions are required -apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 metadata: - name: release-name-opencost + name: knative-serving-operator labels: - helm.sh/chart: opencost-1.42.0 - app.kubernetes.io/name: opencost - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "1.111.0" - app.kubernetes.io/part-of: opencost - app.kubernetes.io/managed-by: Helm + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator rules: - - apiGroups: 
[""] + - apiGroups: + - operator.knative.dev + resources: + - '*' + verbs: + - '*' + # Bootstrapping permissions. + # Roles that are explicitly bound buch which are specified by this Operator + # MUST be specified here with 'get' and 'bind'. + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterroles + resourceNames: + - system:auth-delegator + verbs: + - bind + - get + - apiGroups: + - rbac.authorization.k8s.io + resources: + - roles + resourceNames: + - extension-apiserver-authentication-reader + verbs: + - bind + - get + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterroles + - roles + verbs: + - create + - delete + # Escalate is necessary in order to create a role using cluster role aggregation, + # and to allow the Operator to bootstrap itself into the necessary set of + # permissions, even as those continue to evolve upstream. + - escalate + - get + - list + - update + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterrolebindings + - rolebindings + verbs: + - create + - delete + - list + - get + - update + # Permissions required for Knative controller + # infra. 
+ - apiGroups: + - apiregistration.k8s.io + resources: + - apiservices + verbs: + - update + - apiGroups: + - "" + resources: + - services + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - caching.internal.knative.dev + resources: + - images + verbs: + - '*' + - apiGroups: + - "" + resources: + - namespaces + verbs: + - get + - update + - watch + - apiGroups: + - '' + resources: + - events + verbs: + - create + - update + - patch + - apiGroups: + - '' resources: - configmaps + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - security.istio.io + - apps + - policy + resources: + - poddisruptionbudgets + - peerauthentications - deployments - - nodes + - daemonsets + - replicasets + - statefulsets + verbs: + - create + - delete + - get + - list + - watch + - update + - apiGroups: + - apiregistration.k8s.io + resources: + - apiservices + verbs: + - create + - delete + - get + - list + - apiGroups: + - autoscaling + resources: + - horizontalpodautoscalers + verbs: + - create + - delete + - get + - list + - apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - '*' + - apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - '*' + # Old resources that need cleaning up that are not in the knative-serving + # namespace. 
+ - apiGroups: + - "" + resources: + - services + - deployments + - horizontalpodautoscalers + resourceNames: + - knative-ingressgateway + verbs: + - delete + - apiGroups: + - "" + resources: + - configmaps + resourceNames: + - config-controller + verbs: + - delete + - apiGroups: + - "" + resources: + - serviceaccounts + resourceNames: + - knative-serving-operator + verbs: + - delete + # for contour TLS + - apiGroups: + - projectcontour.io + resources: + - httpproxies + - tlscertificatedelegations + verbs: + - get + - list + - watch + - update + - create + - delete + - deletecollection + - patch + # for security-guard + - apiGroups: + - guard.security.knative.dev + resources: + - guardians + verbs: + - get + - list + - watch + - create + - update + - patch + - delete + - apiGroups: + - "" + resources: + - configmaps + verbs: + - get + - list + - watch + - create + - update + - patch + - delete + - apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + verbs: + - create + - apiGroups: + - "" + resources: - pods + verbs: + - get +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: knative-eventing-operator + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +rules: + - apiGroups: + - operator.knative.dev + resources: + - '*' + verbs: + - '*' + # Bootstrapping permissions. + # Roles that are explicitly bound buch which are specified by this Operator + # MUST be specified here with 'get' and 'bind'. + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterroles + - roles + verbs: + - create + - delete + # Escalate is necessary in order to create a role using cluster role aggregation, + # and to allow the Operator to bootstrap itself into the necessary set of + # permissions, even as those continue to evolve upstream. 
+ - escalate + - get + - list + - update + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterrolebindings + - rolebindings + verbs: + - create + - delete + - list + - get + - update + # Permissions required for Knative controller + # infra. + - apiGroups: + - apiregistration.k8s.io + resources: + - apiservices + verbs: + - update + - apiGroups: + - "" + resources: - services - - resourcequotas - - replicationcontrollers - - limitranges - - persistentvolumeclaims - - persistentvolumes + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - caching.internal.knative.dev + resources: + - images + verbs: + - '*' + - apiGroups: + - "" + resources: - namespaces - - endpoints verbs: + - get + - update + - watch + - apiGroups: + - '' + resources: + - events + verbs: + - create + - update + - patch + - apiGroups: + - '' + resources: + - configmaps + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - apps + resources: + - deployments + - daemonsets + - replicasets + - statefulsets + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - apiregistration.k8s.io + resources: + - apiservices + verbs: + - create + - delete + - get + - list + - apiGroups: + - autoscaling + resources: + - horizontalpodautoscalers + verbs: + - create + - delete + - update + - get + - list + - apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - '*' + - apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - '*' + - apiGroups: + - batch + resources: + - jobs + verbs: + - create + - delete + - update + - get + - list + - watch + # Old resources that need cleaning up that are not in the knative-eventing + # namespace. 
+ - apiGroups: + - "" + resources: + - serviceaccounts + resourceNames: + - knative-eventing-operator + verbs: + - delete + # for RabbitMQ messaging topology objects + - apiGroups: + - rabbitmq.com + resources: + - rabbitmqclusters + verbs: + - "get" + - "list" + - "watch" + - apiGroups: + - rabbitmq.com + resources: + - bindings + - queues + - exchanges + verbs: + - create + - delete - get - list + - patch + - update - watch - apiGroups: - - extensions + - rabbitmq.com resources: - - daemonsets - - deployments - - replicasets + - bindings/status + - queues/status + - exchanges/status verbs: - get - - list - - watch + # for Kafka eventing source - apiGroups: - - apps + - keda.sh resources: - - statefulsets - - deployments - - daemonsets - - replicasets + - scaledobjects + - scaledobjects/finalizers + - scaledobjects/status + - triggerauthentications + - triggerauthentications/status verbs: + - get - list - watch + - update + - create + - delete + # Internal APIs - apiGroups: - - batch + - "internal.kafka.eventing.knative.dev" resources: - - cronjobs - - jobs + - "consumers" + - "consumers/status" + - "consumergroups" + - "consumergroups/status" verbs: + - create - get - list - watch + - patch + - update + - delete - apiGroups: - - autoscaling + - "internal.kafka.eventing.knative.dev" resources: - - horizontalpodautoscalers + - "consumers/finalizers" + - "consumergroups/finalizers" verbs: - - get - - list - - watch + - update + - delete - apiGroups: - - policy + - apps resources: - - poddisruptionbudgets + - statefulsets/scale verbs: - get - list - watch + - update + - patch + - create + - delete - apiGroups: - - storage.k8s.io + - rbac.authorization.k8s.io resources: - - storageclasses + - clusterrolebindings verbs: - - get - - list - watch ---- -# Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: union-clustersync-resource -rules: - apiGroups: - - "" - - 
rbac.authorization.k8s.io + - "*" + resources: + - configmaps + verbs: + - delete + - apiGroups: + - "*" resources: - configmaps - - namespaces - - pods - - resourcequotas - - roles - - rolebindings - - secrets - services - - serviceaccounts - - clusterrolebindings - - podtemplates verbs: - - '*' ---- -# Source: dataplane/templates/nodeexecutor/serviceaccount.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: union-executor - labels: - app: executor -rules: -# Allow RO access to PODS -- apiGroups: - - "" - resources: - - pods - verbs: - - get - - list - - watch -# Allow Event recording access -- apiGroups: - - "" - resources: - - events - verbs: - - create - - update - - delete - - patch -# Allow Access All plugin objects -- apiGroups: - - '*' - resources: - - '*' - verbs: - - get - - list - - watch - - create - - update - - delete - - patch -# Allow Access to CRD -- apiGroups: - - apiextensions.k8s.io - resources: - - customresourcedefinitions - verbs: - - get - - list - - watch - - create - - delete - - update ---- -# Source: dataplane/templates/operator/serviceaccount-proxy.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: proxy-system - labels: - app.kubernetes.io/name: operator-proxy - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm -rules: + - get + - list + - watch + - update + - create + - delete - apiGroups: - - '*' + - "*" resources: - - events - - flyteworkflows - - pods/log - pods - - rayjobs - - resourcequotas verbs: - - get - list + - update + - get - watch ---- -# Source: dataplane/templates/operator/serviceaccount.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: operator-system - labels: - app.kubernetes.io/name: union-operator - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm -rules: - # 
Allow Access to all resources under flyte.lyft.com - apiGroups: - - flyte.lyft.com + - "*" resources: - - flyteworkflows - - flyteworkflows/finalizers + - pods/finalizers verbs: - get - list - - watch - create - update - delete + - apiGroups: + - "*" + resources: + - events + verbs: - patch - - post - - deletecollection + - create - apiGroups: - - '*' + - "*" resources: - - resourcequotas - - pods - - configmaps - - podtemplates - secrets - - namespaces - - nodes verbs: - get - list - watch - - create - update + - create - delete - - nonResourceURLs: - - /metrics + - apiGroups: + - "*" + resources: + - nodes verbs: - get ---- -# Source: dataplane/templates/prometheus/rbac.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: union-operator-prometheus - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus -rules: - - apiGroups: [""] + - list + - watch + - apiGroups: + - "*" resources: - - nodes - - nodes/proxy - - pods - - endpoints - - services + - serviceaccounts verbs: - get - list - watch - - nonResourceURLs: - - /metrics - - /metrics/cadvisor + - update + - create + - delete + - apiGroups: + - "*" + resources: + - configmaps + resourceNames: + - kafka-channel-config + verbs: + - patch + - apiGroups: + - "*" + resources: + - horizontalpodautoscalers + resourceNames: + - kafka-webhook + verbs: + - delete + - apiGroups: + - "*" + resources: + - leases + verbs: + - delete + - apiGroups: + - "*" + resources: + - poddisruptionbudgets + resourceNames: + - kafka-webhook + verbs: + - delete + - apiGroups: + - "*" + resources: + - services + verbs: + - patch + - apiGroups: + - "apps" + resources: + - deployments + verbs: + - deletecollection + # Eventing TLS + - apiGroups: + - "cert-manager.io" + resources: + - certificates + - issuers + - 
clusterissuers verbs: + - create + - delete + - update + - list - get ---- -# Source: dataplane/templates/propeller/serviceaccount-webhook.yaml -kind: ClusterRole -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: flytepropeller-webhook-role - namespace: union -rules: + - watch - apiGroups: - - "*" + - "trust.cert-manager.io" resources: - - mutatingwebhookconfigurations - - secrets - - pods - - replicasets/finalizers + - bundles verbs: - - get - create + - delete - update - - patch + - list + - get + - watch --- -# Source: dataplane/templates/propeller/serviceaccount.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: - name: flytepropeller-role + name: knative-operator-webhook + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator rules: - # Allow RO access to PODS + # For watching logging configuration and getting certs. - apiGroups: - "" resources: - - pods + - "configmaps" verbs: - - get - - list - - watch - # Allow Event recording access + - "get" + - "list" + - "watch" + # For manipulating certs into secrets. - apiGroups: - "" resources: - - events + - "namespaces" verbs: - - create - - update - - delete - - patch - # Allow Access All plugin objects + - "get" + - "create" + - "update" + - "list" + - "watch" + - "patch" + # finalizers are needed for the owner reference of the webhook - apiGroups: - - '*' + - "" resources: - - '*' + - "namespaces/finalizers" verbs: - - get - - list - - watch - - create - - update - - delete - - patch - # Allow Access to CRD + - "update" + # For getting our Deployment so we can decorate with ownerref. 
- apiGroups: - - apiextensions.k8s.io + - "apps" resources: - - customresourcedefinitions + - "deployments" verbs: - - get - - list - - watch - - create - - delete - - update - # Allow Access to all resources under flyte.lyft.com + - "get" - apiGroups: - - flyte.lyft.com + - "apps" resources: - - flyteworkflows - - flyteworkflows/finalizers + - "deployments/finalizers" verbs: - - get - - list - - watch - - create - update - - delete - - patch - - post - - deletecollection + # For actually registering our webhook. + - apiGroups: + - "admissionregistration.k8s.io" + resources: + - "mutatingwebhookconfigurations" + - "validatingwebhookconfigurations" + verbs: &everything + - "get" + - "list" + - "create" + - "update" + - "delete" + - "patch" + - "watch" + # For leader election + - apiGroups: + - "coordination.k8s.io" + resources: + - "leases" + verbs: *everything + # Necessary for conversion webhook. These are copied from the serving + # TODO: Do we really need all these permissions? + - apiGroups: ["apiextensions.k8s.io"] + resources: ["customresourcedefinitions"] + verbs: ["get", "list", "create", "update", "delete", "patch", "watch"] --- # Source: dataplane/charts/fluentbit/templates/clusterrolebinding.yaml apiVersion: rbac.authorization.k8s.io/v1 @@ -4555,210 +4352,420 @@ roleRef: name: release-name-fluentbit subjects: - kind: ServiceAccount - name: fluentbit-system + name: union-system namespace: union --- -# Source: dataplane/charts/kube-state-metrics/templates/clusterrolebinding.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# TODO: Consider restriction of non-aggregated role to knativeservings namespaces. apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: - labels: - helm.sh/chart: kube-state-metrics-5.30.1 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: kube-state-metrics - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" - name: release-name-kube-state-metrics + name: knative-serving-operator + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole - name: release-name-kube-state-metrics + name: knative-serving-operator subjects: -- kind: ServiceAccount - name: release-name-kube-state-metrics - namespace: union + - kind: ServiceAccount + name: knative-operator + namespace: "union" --- -# Source: dataplane/charts/opencost/templates/clusterrolebinding.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# TODO: Consider restriction of non-aggregated role to knativeeventing namespaces. 
apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: - name: release-name-opencost + name: knative-eventing-operator labels: - helm.sh/chart: opencost-1.42.0 - app.kubernetes.io/name: opencost - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "1.111.0" - app.kubernetes.io/part-of: opencost - app.kubernetes.io/managed-by: Helm + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole - name: release-name-opencost + name: knative-eventing-operator subjects: - kind: ServiceAccount - name: release-name-opencost - namespace: union + name: knative-operator + namespace: "union" --- -# Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: - name: union-clustersync-resource + name: operator-webhook + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole - name: union-clustersync-resource + name: knative-operator-webhook subjects: - kind: ServiceAccount - name: union-clustersync-system - namespace: union + name: operator-webhook + namespace: "union" --- -# Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: - name: union-clustersync-auth-delegator + name: knative-serving-operator-aggregated + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole - name: system:auth-delegator + name: knative-serving-operator-aggregated subjects: - kind: ServiceAccount - name: union-clustersync-system - namespace: union + name: knative-operator + namespace: "union" --- -# Source: dataplane/templates/nodeexecutor/serviceaccount.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: - name: union-executor + name: knative-serving-operator-aggregated-stable labels: - app: executor + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole - name: union-executor + name: knative-serving-operator-aggregated-stable subjects: -- kind: ServiceAccount - name: executor - namespace: union + - kind: ServiceAccount + name: knative-operator + namespace: "union" --- -# Source: dataplane/templates/operator/serviceaccount-proxy.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: - name: proxy-system + name: knative-eventing-operator-aggregated labels: - app.kubernetes.io/name: operator-proxy - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole - name: proxy-system + name: knative-eventing-operator-aggregated subjects: - kind: 
ServiceAccount - name: proxy-system - namespace: union + name: knative-operator + namespace: "union" --- -# Source: dataplane/templates/operator/serviceaccount.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: - name: operator-system + name: knative-eventing-operator-aggregated-stable labels: - app.kubernetes.io/name: union-operator - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole - name: operator-system + name: knative-eventing-operator-aggregated-stable subjects: - kind: ServiceAccount - name: operator-system - namespace: union + name: knative-operator + namespace: "union" --- -# Source: dataplane/templates/prometheus/rbac.yaml +# Source: dataplane/charts/dcgm-exporter/templates/role.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding +kind: Role metadata: - name: union-operator-prometheus + name: dcgm-exporter-read-cm + namespace: union labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane + helm.sh/chart: dcgm-exporter-4.7.1 + app.kubernetes.io/name: dcgm-exporter app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/component: dcgm-exporter + app.kubernetes.io/version: "4.7.1" app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: union-operator-prometheus -subjects: - - kind: ServiceAccount - name: union-operator-prometheus - namespace: union +rules: +- apiGroups: [""] + resources: ["configmaps"] + resourceNames: ["exporter-metrics-config-map"] + verbs: ["get"] --- -# Source: dataplane/templates/propeller/serviceaccount-webhook.yaml 
-# Create a binding from Role -> ServiceAccount -kind: ClusterRoleBinding +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + namespace: "union" + name: knative-operator-webhook + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +rules: + # For manipulating certs into secrets. 
+ - apiGroups: + - "" + resources: + - "secrets" + verbs: + - "get" + - "create" + - "update" + - "list" + - "watch" + - "patch" +--- +# Source: dataplane/templates/monitoring/prometheus-rbac.yaml apiVersion: rbac.authorization.k8s.io/v1 +kind: Role metadata: - name: flytepropeller-webhook-binding + name: union-operator-prometheus-rbac namespace: union -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: flytepropeller-webhook-role -subjects: - - kind: ServiceAccount - name: flytepropeller-webhook-system - namespace: union + labels: + release: release-name +rules: + # Prometheus server scrape permissions + - apiGroups: [""] + resources: + - nodes + - nodes/proxy + - nodes/metrics + - services + - endpoints + - pods + - ingresses + - configmaps + verbs: + - get + - list + - watch + - apiGroups: ["extensions", "networking.k8s.io"] + resources: + - ingresses/status + - ingresses + verbs: + - get + - list + - watch + - apiGroups: ["discovery.k8s.io"] + resources: + - endpointslices + verbs: + - get + - list + - watch + # kube-state-metrics permissions (rbac.create=false on the subchart; managed here instead) + - apiGroups: ["certificates.k8s.io"] + resources: ["certificatesigningrequests"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["configmaps"] + verbs: ["list", "watch"] + - apiGroups: ["batch"] + resources: ["cronjobs"] + verbs: ["list", "watch"] + - apiGroups: ["extensions", "apps"] + resources: ["daemonsets"] + verbs: ["list", "watch"] + - apiGroups: ["extensions", "apps"] + resources: ["deployments"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["endpoints"] + verbs: ["list", "watch"] + - apiGroups: ["discovery.k8s.io"] + resources: ["endpointslices"] + verbs: ["list", "watch"] + - apiGroups: ["autoscaling"] + resources: ["horizontalpodautoscalers"] + verbs: ["list", "watch"] + - apiGroups: ["extensions", "networking.k8s.io"] + resources: ["ingresses"] + verbs: ["list", "watch"] + - apiGroups: ["batch"] + 
resources: ["jobs"] + verbs: ["list", "watch"] + - apiGroups: ["coordination.k8s.io"] + resources: ["leases"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["limitranges"] + verbs: ["list", "watch"] + - apiGroups: ["admissionregistration.k8s.io"] + resources: ["mutatingwebhookconfigurations"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["namespaces"] + verbs: ["list", "watch"] + - apiGroups: ["networking.k8s.io"] + resources: ["networkpolicies"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["persistentvolumeclaims"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["persistentvolumes"] + verbs: ["list", "watch"] + - apiGroups: ["policy"] + resources: ["poddisruptionbudgets"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["pods"] + verbs: ["list", "watch"] + - apiGroups: ["extensions", "apps"] + resources: ["replicasets"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["replicationcontrollers"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["resourcequotas"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["secrets"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["services"] + verbs: ["list", "watch"] + - apiGroups: ["apps"] + resources: ["statefulsets"] + verbs: ["list", "watch"] + - apiGroups: ["storage.k8s.io"] + resources: ["storageclasses"] + verbs: ["list", "watch"] + - apiGroups: ["admissionregistration.k8s.io"] + resources: ["validatingwebhookconfigurations"] + verbs: ["list", "watch"] + - apiGroups: ["storage.k8s.io"] + resources: ["volumeattachments"] + verbs: ["list", "watch"] --- -# Source: dataplane/templates/propeller/serviceaccount.yaml +# Source: dataplane/templates/nodeexecutor/serviceaccount.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding +kind: Role metadata: - name: flytepropeller-binding -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: flytepropeller-role -subjects: - - 
kind: ServiceAccount - name: flytepropeller-system - namespace: union + name: union-executor + labels: + app: executor +rules: +# Allow RO access to PODS +- apiGroups: + - "" + resources: + - pods + verbs: + - get + - list + - watch +# Allow Event recording access +- apiGroups: + - "" + resources: + - events + verbs: + - create + - update + - delete + - patch +# Allow Access All plugin objects +- apiGroups: + - '*' + resources: + - '*' + verbs: + - get + - list + - watch + - create + - update + - delete + - patch +# Allow Access to CRD +- apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - get + - list + - watch + - create + - delete + - update --- -# Source: dataplane/charts/dcgm-exporter/templates/role.yaml +# Source: dataplane/templates/operator/serviceaccount-proxy-secret.yaml apiVersion: rbac.authorization.k8s.io/v1 kind: Role metadata: - name: dcgm-exporter-read-cm + name: union-system-secret namespace: union labels: - helm.sh/chart: dcgm-exporter-4.7.1 - app.kubernetes.io/name: dcgm-exporter + app.kubernetes.io/name: operator-proxy app.kubernetes.io/instance: release-name - app.kubernetes.io/component: dcgm-exporter - app.kubernetes.io/version: "4.7.1" + platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm rules: -- apiGroups: [""] - resources: ["configmaps"] - resourceNames: ["exporter-metrics-config-map"] - verbs: ["get"] + - apiGroups: + - '*' + resources: + - secrets + verbs: + - get + - list + - create + - update + - delete --- -# Source: dataplane/templates/operator/serviceaccount-proxy-secret.yaml +# Source: dataplane/templates/operator/serviceaccount-proxy.yaml apiVersion: rbac.authorization.k8s.io/v1 kind: Role metadata: - name: proxy-system-secret - namespace: union + name: proxy-system labels: app.kubernetes.io/name: operator-proxy app.kubernetes.io/instance: release-name @@ -4768,13 +4775,16 @@ rules: - apiGroups: - '*' resources: - - secrets + - events + - flyteworkflows + - 
pods/log + - pods + - rayjobs + - resourcequotas verbs: - get - list - - create - - update - - delete + - watch --- # Source: dataplane/templates/operator/serviceaccount.yaml apiVersion: rbac.authorization.k8s.io/v1 @@ -4798,6 +4808,71 @@ rules: - watch - create - update + - apiGroups: + - flyte.lyft.com + resources: + - flyteworkflows + - flyteworkflows/finalizers + verbs: + - get + - list + - watch + - create + - update + - delete + - patch + - post + - deletecollection + - apiGroups: + - '*' + resources: + - resourcequotas + - pods + - configmaps + - podtemplates + - secrets + - namespaces + - nodes + verbs: + - get + - list + - watch + - create + - update + - delete + - apiGroups: + - serving.knative.dev + resources: + - revisions + - configurations + - services + verbs: + - get + - list + - watch + - create + - update + - delete +--- +# Source: dataplane/templates/webhook/serviceaccount.yaml +kind: Role +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: union-webhook-role + namespace: union +rules: + - apiGroups: + - "*" + resources: + - mutatingwebhookconfigurations + - secrets + - pods + - replicasets/finalizers + verbs: + - get + - create + - update + - patch --- # Source: dataplane/charts/dcgm-exporter/templates/rolebinding.yaml apiVersion: rbac.authorization.k8s.io/v1 @@ -4821,11 +4896,94 @@ roleRef: name: dcgm-exporter-read-cm apiGroup: rbac.authorization.k8s.io --- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + namespace: "union" + name: operator-webhook + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +subjects: + - kind: ServiceAccount + name: operator-webhook + namespace: "union" +roleRef: + kind: Role + name: knative-operator-webhook + apiGroup: rbac.authorization.k8s.io +--- +# Source: dataplane/templates/monitoring/prometheus-rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: union-operator-prometheus-rbac + namespace: union + labels: + release: release-name +subjects: + - kind: ServiceAccount + name: union-operator-prometheus + namespace: union +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: union-operator-prometheus-rbac +--- +# Source: dataplane/templates/monitoring/prometheus-rbac.yaml +# kube-state-metrics service account RoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: release-name-prometheus-kube-state-metrics + namespace: union + labels: + release: release-name +subjects: + - kind: ServiceAccount + name: release-name-prometheus-kube-state-metrics + namespace: union +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: union-operator-prometheus-rbac +--- +# Source: dataplane/templates/nodeexecutor/serviceaccount.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: union-executor + labels: + app: executor +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: union-executor +subjects: +- kind: ServiceAccount + name: union-system + namespace: union +--- # Source: dataplane/templates/operator/serviceaccount-proxy-secret.yaml apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding metadata: - name: proxy-system-secret + name: union-system-secret namespace: union labels: 
app.kubernetes.io/name: operator-proxy @@ -4835,10 +4993,29 @@ metadata: roleRef: apiGroup: rbac.authorization.k8s.io kind: Role - name: proxy-system-secret + name: union-system-secret +subjects: + - kind: ServiceAccount + name: union-system + namespace: union +--- +# Source: dataplane/templates/operator/serviceaccount-proxy.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: proxy-system + labels: + app.kubernetes.io/name: operator-proxy + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: proxy-system subjects: - kind: ServiceAccount - name: proxy-system + name: union-system namespace: union --- # Source: dataplane/templates/operator/serviceaccount.yaml @@ -4857,7 +5034,23 @@ roleRef: name: operator-system subjects: - kind: ServiceAccount - name: operator-system + name: union-system + namespace: union +--- +# Source: dataplane/templates/webhook/serviceaccount.yaml +# Create a binding from Role -> ServiceAccount +kind: RoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: union-webhook-binding + namespace: union +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: union-webhook-role +subjects: + - kind: ServiceAccount + name: union-system namespace: union --- # Source: dataplane/charts/dcgm-exporter/templates/service.yaml @@ -4923,20 +5116,60 @@ spec: app.kubernetes.io/name: fluentbit app.kubernetes.io/instance: release-name --- -# Source: dataplane/charts/kube-state-metrics/templates/service.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v1 +kind: Service +metadata: + labels: + role: operator-webhook + app.kubernetes.io/component: operator-webhook + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator + name: operator-webhook + namespace: "union" +spec: + ports: + # Define metrics and profiling for them to be accessible within service meshes. + - name: http-metrics + port: 9090 + targetPort: 9090 + - name: http-profiling + port: 8008 + targetPort: 8008 + - name: https-webhook + port: 443 + targetPort: 8443 + selector: + role: operator-webhook +--- +# Source: dataplane/charts/prometheus/charts/kube-state-metrics/templates/service.yaml apiVersion: v1 kind: Service metadata: name: release-name-kube-state-metrics namespace: union labels: - helm.sh/chart: kube-state-metrics-5.30.1 + helm.sh/chart: kube-state-metrics-5.25.1 app.kubernetes.io/managed-by: Helm app.kubernetes.io/component: metrics app.kubernetes.io/part-of: kube-state-metrics app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" + app.kubernetes.io/version: "2.13.0" annotations: prometheus.io/scrape: 'true' spec: @@ -4951,28 +5184,32 @@ spec: app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/instance: release-name --- -# Source: dataplane/charts/opencost/templates/service.yaml +# Source: dataplane/charts/prometheus/templates/service.yaml apiVersion: v1 kind: Service metadata: - name: release-name-opencost - namespace: union labels: - helm.sh/chart: opencost-1.42.0 - app.kubernetes.io/name: opencost + 
app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "1.111.0" - app.kubernetes.io/part-of: opencost + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: prometheus + name: union-operator-prometheus + namespace: union spec: + ports: + - name: http + port: 80 + protocol: TCP + targetPort: 9090 selector: - app.kubernetes.io/name: opencost + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name + sessionAffinity: None type: "ClusterIP" - ports: - - name: http - port: 9003 - targetPort: 9003 --- # Source: dataplane/templates/clusterresourcesync/service.yaml apiVersion: v1 @@ -4983,18 +5220,43 @@ metadata: platform.union.ai/prometheus-group: "union-services" app.kubernetes.io/name: clusterresourcesync app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - port: 10254 + targetPort: debug + protocol: TCP + name: debug + selector: + app.kubernetes.io/name: clusterresourcesync + app.kubernetes.io/instance: release-name +--- +# Source: dataplane/templates/flyteconnector/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: flyteconnector + namespace: union + labels: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name app.kubernetes.io/managed-by: Helm spec: - type: ClusterIP + clusterIP: None ports: - - port: 10254 - targetPort: debug + - name: grpc + port: 8000 protocol: TCP - name: debug - selector: - app.kubernetes.io/name: clusterresourcesync - app.kubernetes.io/instance: release-name + targetPort: grpc + - name: metric + port: 9090 + protocol: TCP + targetPort: metric + selector: + app.kubernetes.io/name: flyteconnector + 
app.kubernetes.io/instance: release-name --- # Source: dataplane/templates/imagebuilder/service.yaml apiVersion: v1 @@ -5021,7 +5283,7 @@ spec: apiVersion: v1 kind: Service metadata: - name: release-name-dataplane-executor + name: union-operator-executor labels: platform.union.ai/prometheus-group: "union-services" app: executor @@ -5087,39 +5349,14 @@ spec: app.kubernetes.io/name: union-operator app.kubernetes.io/instance: release-name --- -# Source: dataplane/templates/prometheus/service.yaml -apiVersion: v1 -kind: Service -metadata: - name: union-operator-prometheus - namespace: union - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus -spec: - type: ClusterIP - ports: - - port: 80 - targetPort: 9090 - protocol: TCP - name: http - selector: - app.kubernetes.io/component: prometheus - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name ---- -# Source: dataplane/templates/propeller/service-webhook.yaml +# Source: dataplane/templates/webhook/service.yaml apiVersion: v1 kind: Service metadata: - name: flyte-pod-webhook + name: union-pod-webhook namespace: union labels: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm @@ -5127,7 +5364,7 @@ metadata: projectcontour.io/upstream-protocol.h2c: grpc spec: selector: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name ports: - name: https @@ -5139,23 +5376,23 @@ spec: port: 10254 targetPort: 10254 --- -# Source: dataplane/templates/propeller/service-webhook.yaml +# Source: dataplane/templates/webhook/service.yaml # Headless Service for cache 
invalidation — resolves to all pod IPs so that # we can fan out invalidation requests to every webhook replica. apiVersion: v1 kind: Service metadata: - name: flyte-pod-webhook-headless + name: union-pod-webhook-headless namespace: union labels: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm spec: clusterIP: None selector: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name ports: - name: cache-internal @@ -5163,32 +5400,6 @@ spec: port: 9443 targetPort: 9443 --- -# Source: dataplane/templates/propeller/service.yaml -apiVersion: v1 -kind: Service -metadata: - namespace: union - name: flytepropeller - labels: - platform.union.ai/prometheus-group: "union-services" - app.kubernetes.io/name: flytepropeller - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm -spec: - type: ClusterIP - ports: - - name: debug - protocol: TCP - port: 10254 - - name: fasttask - port: 15605 - protocol: TCP - targetPort: 15605 - selector: - app.kubernetes.io/name: flytepropeller - app.kubernetes.io/instance: release-name ---- # Source: dataplane/charts/dcgm-exporter/templates/daemonset.yaml # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 
# @@ -5335,7 +5546,7 @@ spec: annotations: checksum/config: e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 spec: - serviceAccountName: fluentbit-system + serviceAccountName: union-system hostNetwork: false dnsPolicy: ClusterFirst containers: @@ -5387,20 +5598,217 @@ spec: tolerations: - operator: Exists --- -# Source: dataplane/charts/kube-state-metrics/templates/deployment.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: operator-webhook + namespace: "union" + labels: + app.kubernetes.io/component: operator-webhook + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +spec: + selector: + matchLabels: + app: operator-webhook + role: operator-webhook + template: + metadata: + labels: + app: operator-webhook + role: operator-webhook + app.kubernetes.io/component: operator-webhook + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator + sidecar.istio.io/inject: "false" + spec: + # To avoid node becoming SPOF, spread our replicas to different nodes. 
+ affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - podAffinityTerm: + labelSelector: + matchLabels: + app: operator-webhook + topologyKey: kubernetes.io/hostname + weight: 100 + serviceAccountName: operator-webhook + containers: + - name: operator-webhook + # This is the Go import path for the binary that is containerized + # and substituted here. + image: gcr.io/knative-releases/knative.dev/operator/cmd/webhook@sha256:d3a7a3304629ccfcaacec620b50736e0b4c902a6328b83369b6cbaf7e94677c9 + resources: + requests: + cpu: 100m + memory: 100Mi + limits: + cpu: 500m + memory: 500Mi + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: SYSTEM_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: CONFIG_LOGGING_NAME + value: config-logging + - name: CONFIG_OBSERVABILITY_NAME + value: config-observability + - name: WEBHOOK_NAME + value: operator-webhook + - name: WEBHOOK_PORT + value: "8443" + - name: WEBHOOK_SECRET_NAME + value: operator-webhook-certs + - name: METRICS_DOMAIN + value: knative.dev/operator + - name: KUBERNETES_MIN_VERSION + value: "" + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + runAsNonRoot: true + capabilities: + drop: + - ALL + ports: + - name: metrics + containerPort: 9090 + - name: profiling + containerPort: 8008 + - name: https-webhook + containerPort: 8443 + readinessProbe: + periodSeconds: 1 + httpGet: + scheme: HTTPS + port: 8443 + httpHeaders: + - name: k-kubelet-probe + value: "webhook" + livenessProbe: + periodSeconds: 1 + httpGet: + scheme: HTTPS + port: 8443 + httpHeaders: + - name: k-kubelet-probe + value: "webhook" + failureThreshold: 6 + initialDelaySeconds: 120 + # Our webhook should gracefully terminate by lame ducking first, set this to a sufficiently + # high value that we respect whatever value it has configured for the lame duck grace period. 
+ terminationGracePeriodSeconds: 300 +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: knative-operator + namespace: "union" + labels: + app.kubernetes.io/name: knative-operator + app.kubernetes.io/version: "1.16.0" +spec: + replicas: 1 + selector: + matchLabels: + name: knative-operator + template: + metadata: + labels: + name: knative-operator + app.kubernetes.io/name: knative-operator + app.kubernetes.io/version: "1.16.0" + sidecar.istio.io/inject: "false" + spec: + serviceAccountName: knative-operator + containers: + - name: knative-operator + image: gcr.io/knative-releases/knative.dev/operator/cmd/operator@sha256:0b5a3532417f9c8e7b6044e23f6f67ad932a74a40a4092ad965b6f173b2fd887 + resources: + requests: + cpu: 100m + memory: 100Mi + limits: + cpu: 1000m + memory: 1000Mi + imagePullPolicy: IfNotPresent + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: SYSTEM_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: METRICS_DOMAIN + value: knative.dev/operator + - name: CONFIG_LOGGING_NAME + value: config-logging + - name: CONFIG_OBSERVABILITY_NAME + value: config-observability + - name: KUBERNETES_MIN_VERSION + value: "" + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + runAsNonRoot: true + 
capabilities: + drop: + - ALL + ports: + - name: metrics + containerPort: 9090 +--- +# Source: dataplane/charts/prometheus/charts/kube-state-metrics/templates/deployment.yaml apiVersion: apps/v1 kind: Deployment metadata: name: release-name-kube-state-metrics namespace: union labels: - helm.sh/chart: kube-state-metrics-5.30.1 + helm.sh/chart: kube-state-metrics-5.25.1 app.kubernetes.io/managed-by: Helm app.kubernetes.io/component: metrics app.kubernetes.io/part-of: kube-state-metrics app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" + app.kubernetes.io/version: "2.13.0" spec: selector: matchLabels: @@ -5413,13 +5821,13 @@ spec: template: metadata: labels: - helm.sh/chart: kube-state-metrics-5.30.1 + helm.sh/chart: kube-state-metrics-5.25.1 app.kubernetes.io/managed-by: Helm app.kubernetes.io/component: metrics app.kubernetes.io/part-of: kube-state-metrics app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" + app.kubernetes.io/version: "2.13.0" spec: automountServiceAccountToken: true hostNetwork: false @@ -5436,8 +5844,9 @@ spec: args: - --port=8080 - --resources=certificatesigningrequests,configmaps,cronjobs,daemonsets,deployments,endpoints,horizontalpodautoscalers,ingresses,jobs,leases,limitranges,mutatingwebhookconfigurations,namespaces,networkpolicies,nodes,persistentvolumeclaims,persistentvolumes,poddisruptionbudgets,pods,replicasets,replicationcontrollers,resourcequotas,secrets,services,statefulsets,storageclasses,validatingwebhookconfigurations,volumeattachments + - --namespaces=union imagePullPolicy: IfNotPresent - image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.15.0 + image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.13.0 ports: - containerPort: 8080 name: "http" @@ -5457,7 +5866,7 @@ spec: httpGet: httpHeaders: path: /readyz - port: 8081 + port: 8080 scheme: HTTP initialDelaySeconds: 5 
periodSeconds: 10 @@ -5472,201 +5881,204 @@ spec: - ALL readOnlyRootFilesystem: true --- -# Source: dataplane/charts/opencost/templates/deployment.yaml +# Source: dataplane/charts/prometheus/templates/deploy.yaml apiVersion: apps/v1 kind: Deployment metadata: - name: release-name-opencost - namespace: union labels: - helm.sh/chart: opencost-1.42.0 - app.kubernetes.io/name: opencost + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "1.111.0" - app.kubernetes.io/part-of: opencost + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: prometheus + name: union-operator-prometheus + namespace: union spec: - replicas: 1 selector: matchLabels: - app.kubernetes.io/name: opencost + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name - strategy: - rollingUpdate: - maxSurge: 1 - maxUnavailable: 1 - type: RollingUpdate + replicas: 1 + revisionHistoryLimit: 10 + strategy: + type: Recreate + rollingUpdate: null template: metadata: labels: - app.kubernetes.io/name: opencost + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: prometheus spec: - serviceAccountName: release-name-opencost + enableServiceLinks: true + serviceAccountName: union-operator-prometheus containers: - - name: release-name-opencost - image: ghcr.io/opencost/opencost:1.111.0@sha256:6aa68e52a24b14ba41f23db08d1b9db1429a1c0300f4c0381ecc2c61fc311a97 - imagePullPolicy: IfNotPresent + - name: prometheus-server-configmap-reload + image: "quay.io/prometheus-operator/prometheus-config-reloader:v0.76.0" + imagePullPolicy: "IfNotPresent" args: + - --watched-dir=/etc/config + - 
--listen-address=0.0.0.0:8080 + - --reload-url=http://127.0.0.1:9090/prometheus/-/reload ports: - - containerPort: 9003 - name: http - resources: - limits: - cpu: 1000m - memory: 4Gi - requests: - cpu: 500m - memory: 1Gi - startupProbe: - httpGet: - path: /healthz - port: 9003 - initialDelaySeconds: 10 - periodSeconds: 5 - failureThreshold: 30 + - containerPort: 8080 + name: metrics livenessProbe: httpGet: path: /healthz - port: 9003 - initialDelaySeconds: 10 - periodSeconds: 20 - failureThreshold: 3 + port: metrics + scheme: HTTP + initialDelaySeconds: 2 + periodSeconds: 10 readinessProbe: httpGet: path: /healthz - port: 9003 - initialDelaySeconds: 10 + port: metrics + scheme: HTTP periodSeconds: 10 + volumeMounts: + - name: config-volume + mountPath: /etc/config + readOnly: true + + - name: prometheus-server + image: "quay.io/prometheus/prometheus:v2.54.1" + imagePullPolicy: "IfNotPresent" + args: + - --storage.tsdb.retention.time=3d + - --config.file=/etc/config/prometheus.yml + - --storage.tsdb.path=/data + - --web.console.libraries=/etc/prometheus/console_libraries + - --web.console.templates=/etc/prometheus/consoles + - --web.enable-lifecycle + - --web.route-prefix=/prometheus + - --web.external-url=/prometheus/ + ports: + - containerPort: 9090 + readinessProbe: + httpGet: + path: /prometheus/-/ready + port: 9090 + scheme: HTTP + initialDelaySeconds: 30 + periodSeconds: 5 + timeoutSeconds: 4 failureThreshold: 3 - env: - - name: LOG_LEVEL - value: info - - name: CUSTOM_COST_ENABLED - value: "false" - - name: KUBECOST_NAMESPACE - value: union - - name: API_PORT - value: "9003" - - name: PROMETHEUS_SERVER_ENDPOINT - value: "http://union-operator-prometheus.union.svc:80/prometheus" - - name: CLUSTER_ID - value: "default-cluster" - - name: DATA_RETENTION_DAILY_RESOLUTION_DAYS - value: "15" - - name: CLOUD_COST_ENABLED - value: "false" - - name: CLOUD_COST_MONTH_TO_DATE_INTERVAL - value: "6" - - name: CLOUD_COST_REFRESH_RATE_HOURS - value: "6" - - name: 
CLOUD_COST_QUERY_WINDOW_DAYS - value: "7" - - name: CLOUD_COST_RUN_WINDOW_DAYS - value: "3" - # Add any additional provided variables + successThreshold: 1 + livenessProbe: + httpGet: + path: /prometheus/-/healthy + port: 9090 + scheme: HTTP + initialDelaySeconds: 30 + periodSeconds: 15 + timeoutSeconds: 10 + failureThreshold: 3 + successThreshold: 1 + resources: + limits: + cpu: "3" + memory: 3500Mi + requests: + cpu: "1" + memory: 1Gi + volumeMounts: + - name: config-volume + mountPath: /etc/config + - name: storage-volume + mountPath: /data + subPath: "" + - name: server-recording-rules + mountPath: /etc/config/recording + subPath: + readOnly: + dnsPolicy: ClusterFirst + securityContext: + fsGroup: 65534 + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + terminationGracePeriodSeconds: 300 + volumes: + - name: config-volume + configMap: + name: union-operator-prometheus + - name: server-recording-rules + configMap: + name: union-recording-rules + - name: storage-volume + emptyDir: + sizeLimit: 10Gi --- -# Source: dataplane/templates/clusterresourcesync/deployment.yaml +# Source: dataplane/templates/flyteconnector/deployment.yaml apiVersion: apps/v1 kind: Deployment metadata: - name: union-syncresources + name: flyteconnector namespace: union - labels: - app.kubernetes.io/name: clusterresourcesync + labels: + app.kubernetes.io/name: flyteconnector app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm spec: - replicas: 1 + replicas: 2 selector: - matchLabels: - app.kubernetes.io/name: clusterresourcesync + matchLabels: + app.kubernetes.io/name: flyteconnector app.kubernetes.io/instance: release-name template: metadata: annotations: - configChecksum: "7ff0169854ce83fe5e5cb0ec550944a512e1ea0ebc47177b32a5bf3f7fadf9f" - labels: platform.union.ai/zone: "dataplane" - app.kubernetes.io/name: clusterresourcesync + app.kubernetes.io/name: flyteconnector app.kubernetes.io/instance: release-name 
- platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm spec: containers: - command: - - clusterresource - - --config - - /etc/flyte/config/*.yaml - - clusterresource - - run - env: - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: GOMEMLIMIT - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.memory - - name: GOMAXPROCS - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.cpu - - name: CLUSTER_NAME - valueFrom: - secretKeyRef: - name: operator-cluster-name - key: cluster_name - - name: DEPLOYMENT_NAME - value: operator - - name: PROXY_SERVICE_URL - value: http://union-operator-proxy:8080 - - name: PROMETHEUS_SERVICE_URL - value: http://union-operator-prometheus:80 - - name: KNATIVE_PROXY_SERVICE_URL - value: http://kourier-internal - image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" + - c0 + image: "ghcr.io/flyteorg/flyte-connectors:py3.13-2.0.0b50.dev3-g695bb1db3.d20260122" imagePullPolicy: "IfNotPresent" - name: sync-cluster-resources + name: flyteconnector + readinessProbe: + failureThreshold: 3 + httpGet: + path: / + port: 9090 + scheme: HTTP + initialDelaySeconds: 5 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 5 + env: + - name: AIRFLOW_CONN_GOOGLE_CLOUD_DEFAULT + value: "google-cloud-platform://" + - name: LOG_LEVEL + value: "20" + ports: + - containerPort: 8000 + name: grpc + - containerPort: 9090 + name: metric resources: limits: - cpu: "1" - memory: 500Mi + cpu: "1.5" + ephemeral-storage: 100Mi + memory: 1500Mi requests: - cpu: 500m - memory: 100Mi - volumeMounts: - - name: auth - mountPath: /etc/union/secret - - name: resource-templates - mountPath: /etc/flyte/clusterresource/templates - - name: config-volume - mountPath: /etc/flyte/config - ports: - - name: debug - containerPort: 10254 - protocol: TCP - serviceAccountName: union-clustersync-system - volumes: - - 
configMap: - name: union-clusterresource-template - name: resource-templates - - configMap: - name: union-clusterresourcesync-config - name: config-volume - - name: auth - secret: - secretName: union-secret-auth + cpu: "1" + ephemeral-storage: 100Mi + memory: 1000Mi + serviceAccountName: flyteconnector --- # Source: dataplane/templates/imagebuilder/deployment.yaml apiVersion: apps/v1 @@ -5688,17 +6100,15 @@ spec: app.kubernetes.io/instance: release-name template: metadata: - annotations: - container.apparmor.security.beta.kubernetes.io/buildkit: unconfined labels: platform.union.ai/zone: "dataplane" app.kubernetes.io/name: imagebuilder-buildkit app.kubernetes.io/instance: release-name spec: - serviceAccountName: "union-imagebuilder" + serviceAccountName: "union-system" containers: - name: "buildkit" - image: "docker.io/moby/buildkit:buildx-stable-1-rootless" + image: "docker.io/moby/buildkit:buildx-stable-1" imagePullPolicy: IfNotPresent env: - name: POD_NAME @@ -5733,18 +6143,15 @@ spec: - name: KNATIVE_PROXY_SERVICE_URL value: http://kourier-internal volumeMounts: - - mountPath: /home/user/.local/share/buildkit - name: buildkitd - mountPath: /etc/buildkit name: buildkit-config args: - --config - /etc/buildkit/buildkitd.toml - --addr - - unix:///run/user/1000/buildkit/buildkitd.sock + - unix:///run//buildkit/buildkitd.sock - --addr - tcp://0.0.0.0:1234 - - --oci-worker-no-process-sandbox ports: - name: tcp containerPort: 1234 @@ -5766,18 +6173,13 @@ spec: initialDelaySeconds: 5 periodSeconds: 30 securityContext: - seccompProfile: # Needs Kubernetes >= 1.19 - type: Unconfined - runAsUser: 1000 - runAsGroup: 1000 + privileged: true resources: requests: - cpu: 1 - ephemeral-storage: 20Gi - memory: 1Gi + cpu: 4 + ephemeral-storage: 50Gi + memory: 4Gi volumes: - - name: buildkitd - emptyDir: {} - configMap: name: union-operator-buildkit name: buildkit-config @@ -5806,16 +6208,18 @@ spec: template: metadata: annotations: - configChecksum: 
"807825e2bc1d1dc69164fbea82af93461fcf79e35ad2c0929e03df2f1e14935" + configChecksum: "841817ea8873e0592d093d9f564f22eb6a316a7cac8d611afc2582a4856dc7a" labels: platform.union.ai/zone: "dataplane" app: executor + app.kubernetes.io/instance: 'release-name' + app.kubernetes.io/name: executor spec: securityContext: fsGroup: 1337 - serviceAccountName: executor + serviceAccountName: union-system volumes: - name: config-volume configMap: @@ -5831,7 +6235,7 @@ spec: image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" imagePullPolicy: IfNotPresent command: - - executor + - executorv2 - serve - --config - /etc/config/*.yaml @@ -5909,7 +6313,7 @@ spec: template: metadata: annotations: - configChecksum: "0d3d3093adf1cf4b2bb08dfec3df8021cea0b4660395075f002f399062a3485" + configChecksum: "1cbf9bb44767576a0ce2cfe30facbef94a1d54963db0d92bea7b5e1c1df350d" labels: @@ -5924,12 +6328,10 @@ spec: sources: - configMap: name: union-operator - - configMap: - name: union-clusterresourcesync-config - name: secret-volume secret: secretName: union-secret-auth - serviceAccountName: proxy-system + serviceAccountName: union-system securityContext: {} containers: @@ -6046,7 +6448,7 @@ spec: template: metadata: annotations: - configChecksum: "0d3d3093adf1cf4b2bb08dfec3df8021cea0b4660395075f002f399062a3485" + configChecksum: "1cbf9bb44767576a0ce2cfe30facbef94a1d54963db0d92bea7b5e1c1df350d" labels: @@ -6055,7 +6457,7 @@ spec: platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm spec: - serviceAccountName: operator-system + serviceAccountName: union-system securityContext: {} volumes: @@ -6136,81 +6538,16 @@ spec: containerPort: 10254 protocol: TCP --- -# Source: dataplane/templates/prometheus/deployment.yaml -apiVersion: apps/v1 -kind: Deployment -metadata: - name: union-operator-prometheus - namespace: union - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - 
app.kubernetes.io/version: "2026.4.5" - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus -spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/component: prometheus - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - template: - metadata: - annotations: - configChecksum: "419ca35014526f736b956303f4160d73038d8a65f6d174286647884e21e6b4e" - labels: - app.kubernetes.io/component: prometheus - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - spec: - priorityClassName: system-cluster-critical - serviceAccountName: union-operator-prometheus - securityContext: - fsGroup: 65534 - runAsNonRoot: true - runAsUser: 65534 - fsGroupChangePolicy: OnRootMismatch - containers: - - name: prometheus - image: "prom/prometheus:v3.3.1" - args: - - --config.file=/etc/prometheus/prometheus.yml - - --web.external-url=/prometheus/ - - --web.route-prefix=/prometheus/ - - --storage.tsdb.retention.time=3d - ports: - - name: http - containerPort: 9090 - protocol: TCP - resources: - limits: - cpu: "3" - memory: 3500Mi - requests: - cpu: "1" - memory: 1Gi - securityContext: - allowPrivilegeEscalation: false - terminationMessagePolicy: FallbackToLogsOnError - volumeMounts: - - mountPath: /etc/prometheus - name: prometheus-config - volumes: - - name: prometheus-config - configMap: - name: union-operator-prometheus ---- -# Source: dataplane/templates/propeller/deployment-webhook.yaml +# Source: dataplane/templates/webhook/deployment.yaml +# Webhook deployment # Create the actual deployment apiVersion: apps/v1 kind: Deployment metadata: - name: flytepropeller-webhook + name: union-pod-webhook namespace: union labels: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm @@ -6218,19 +6555,19 @@ spec: replicas: 1 selector: 
matchLabels: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name template: metadata: labels: platform.union.ai/zone: "dataplane" - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm annotations: - configChecksum: "f99369e37d4a20dded730f6900a334aaa4b1b1c3e62f534e17d7967785c62a8" + configChecksum: "77958aee0464eee4cb43c70036cf5ffa89f47f58c908845b9eeaecc7e32bba4" spec: securityContext: @@ -6238,64 +6575,8 @@ spec: fsGroupChangePolicy: Always runAsNonRoot: true runAsUser: 1001 - seLinuxOptions: - type: spc_t - serviceAccountName: flytepropeller-webhook-system - initContainers: - - name: generate-secrets - image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" - imagePullPolicy: "IfNotPresent" - command: - - flytepropeller - args: - - webhook - - init-certs - - --config - - /etc/flyte/config/*.yaml - env: - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: GOMEMLIMIT - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.memory - - name: GOMAXPROCS - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.cpu - - name: CLUSTER_NAME - valueFrom: - secretKeyRef: - name: operator-cluster-name - key: cluster_name - - name: DEPLOYMENT_NAME - value: operator - - name: PROXY_SERVICE_URL - value: http://union-operator-proxy:8080 - - name: PROMETHEUS_SERVICE_URL - value: http://union-operator-prometheus:80 - - name: KNATIVE_PROXY_SERVICE_URL - value: http://kourier-internal - volumeMounts: - - name: config-volume - mountPath: /etc/flyte/config - resources: - limits: - cpu: 1 - ephemeral-storage: 500Mi - memory: 500Mi - requests: - cpu: 200m - ephemeral-storage: 500Mi - memory: 500Mi + seLinuxOptions: null + 
serviceAccountName: union-system containers: - name: webhook image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" @@ -6360,107 +6641,85 @@ spec: volumes: - name: config-volume configMap: - name: flyte-propeller-config + name: union-pod-webhook-config - name: webhook-certs secret: - secretName: flyte-pod-webhook + secretName: union-pod-webhook --- -# Source: dataplane/templates/propeller/deployment.yaml -apiVersion: apps/v1 -kind: Deployment +# Source: dataplane/templates/flyteconnector/hpa.yaml +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler metadata: - namespace: union - name: flytepropeller + name: flyteconnector labels: - app.kubernetes.io/name: flytepropeller + app.kubernetes.io/name: flyteconnector app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/name: flytepropeller - app.kubernetes.io/instance: release-name - template: - metadata: - annotations: - configChecksum: "f99369e37d4a20dded730f6900a334aaa4b1b1c3e62f534e17d7967785c62a8" - - labels: - platform.union.ai/zone: "dataplane" - - - app.kubernetes.io/name: flytepropeller - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm - spec: - priorityClassName: system-cluster-critical - containers: - - command: - - flytepropeller - - --config - - /etc/flyte/config/*.yaml - - --propeller.cluster-id - - '' - env: - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: GOMEMLIMIT - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.memory - - name: GOMAXPROCS - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.cpu - - name: CLUSTER_NAME - valueFrom: - secretKeyRef: - name: operator-cluster-name - key: cluster_name - - name: DEPLOYMENT_NAME - value: operator - - 
name: PROXY_SERVICE_URL - value: http://union-operator-proxy:8080 - - name: PROMETHEUS_SERVICE_URL - value: http://union-operator-prometheus:80 - - name: KNATIVE_PROXY_SERVICE_URL - value: http://kourier-internal - image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" - imagePullPolicy: "IfNotPresent" - name: flytepropeller - ports: - - containerPort: 10254 - resources: - limits: - cpu: "3" - memory: 3Gi - requests: - cpu: "1" - memory: 1Gi - volumeMounts: - - name: config-volume - mountPath: /etc/flyte/config - - name: auth - mountPath: /etc/union/secret - serviceAccountName: flytepropeller-system - volumes: - - configMap: - name: flyte-propeller-config - name: config-volume - - name: auth - secret: - secretName: union-secret-auth + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: flyteconnector + minReplicas: 2 + maxReplicas: 5 + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 80 + - type: Resource + resource: + name: memory + target: + type: Utilization + averageUtilization: 80 +--- +# Source: dataplane/templates/webhook/mutatingwebhookconfiguration.yaml +apiVersion: admissionregistration.k8s.io/v1 +kind: MutatingWebhookConfiguration +metadata: + name: union-pod-webhook- + labels: + app.kubernetes.io/name: union-pod-webhook + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +webhooks: + - name: union-pod-webhook.flyte.org + admissionReviewVersions: + - v1 + - v1beta1 + clientConfig: + caBundle: 
LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUROekNDQWgrZ0F3SUJBZ0lVSm9kQ09lem4vd1BHQTZjYkw1U3duSlpYT2hJd0RRWUpLb1pJaHZjTkFRRUwKQlFBd0t6RXBNQ2NHQTFVRUF3d2dabXg1ZEdWd2NtOXdaV3hzWlhJdGQyVmlhRzl2YXk1MWJtbHZiaTV6ZG1NdwpIaGNOTWpZd01USXpNREl6TmpVMVdoY05Nell3TVRJeE1ESXpOalUxV2pBck1Ta3dKd1lEVlFRRERDQm1iSGwwClpYQnliM0JsYkd4bGNpMTNaV0pvYjI5ckxuVnVhVzl1TG5OMll6Q0NBU0l3RFFZSktvWklodmNOQVFFQkJRQUQKZ2dFUEFEQ0NBUW9DZ2dFQkFJcjBSWDRGRjFNVTNYdFBmUHErMDUrMGI1Qm0xK3hsTVR5ZUp4bG5mWFJuRlgyegpPcHV6NmsreUpBQk4xRkxmNDVYRnlJclFlaW5DWWJtckZXalFRc3ZDWEloNkpPNGMwdmRMVVU5RERYcldWRFlsCkRNZytTUDlJVnZBMm5USkVFZ2RFREVsKzNWeThUbUJoNlNmcGthcXdHajY5SmhIdy9OeE9yZzM2bUY3TU5VZ1MKWXJVUkxqUFFzTUN6NElTVzhhQnEzS2IwRUFWUTA4V1lZQ0ZEbjY1aHBzVFdxR05rL3BKRnVTYjlsZll6RVdyYwpJeW5wNDhqTnIvNUdKS3NnWG9LeTUwZVo4UkppeTVyOUhENGtnSW15TVNQR2RDMXFZSVppRHRlZW8yOElxNk5nCmd1dWcyTGhlNndYYW9YZXFCQ2d3SHpwWFVXM1hJVzI5eFoxUkFRRUNBd0VBQWFOVE1GRXdIUVlEVlIwT0JCWUUKRktITDQ1RWhQaVg5bnlRN241QUk0YUZVNFlCL01COEdBMVVkSXdRWU1CYUFGS0hMNDVFaFBpWDlueVE3bjVBSQo0YUZVNFlCL01BOEdBMVVkRXdFQi93UUZNQU1CQWY4d0RRWUpLb1pJaHZjTkFRRUxCUUFEZ2dFQkFEeTRac21XCkNjZ3B5TmI4cU1ydzNYclBNejZLMTRlTVhqQVk1bERFYXM1NklLZExoZFM1bi9sWk95bENuUUtOZ3RHS0VrY3UKT05tMEE2Z0xNT3UzcE43dG9aNXh0WktOcnUvS0x6dm44QUNNNndCbUR0bG5oRERDa3ZUTHBVRzh6UTZSNmlaegpzTWlhVmpLSUJiWkJsZXNYSHZZQnQ2YlRTN2o1QWM2aTdYc1RPK3ZqQ2RUQjVYdCtEZ0x2QnBJRWh2ZGJzNXhmClFtc3ZzQmwxenJnR3JXemo4d0txcXNUMzZaOVU1TzdSK1RJMlQrUDVOR25SeHNQOGJ0R0RtMGZ5SW55WlJrSkcKVTdpVFJyMlZRWlRyaTNZQ2dlVjN1UjFtMnJjc2hyU1Avb082T0QyTXR5Q1FMRWtFK1pxVUVJVnpSYmVaNXl2RQptM29YMFV2OUtBUy9pWTA9Ci0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K + service: + name: union-pod-webhook + namespace: union + path: /mutate--v1-pod/secrets + port: 443 + failurePolicy: Fail + matchPolicy: Equivalent + namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: union + objectSelector: + matchLabels: + inject-flyte-secrets: "true" + organization: '' + reinvocationPolicy: Never + rules: + - apiGroups: + - '*' + apiVersions: + - v1 + operations: + - CREATE + resources: + - 
pods + scope: '*' + sideEffects: NoneOnDryRun + timeoutSeconds: 30 --- # Source: dataplane/charts/dcgm-exporter/templates/tls-secret.yaml # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. @@ -6492,6 +6751,222 @@ spec: # See the License for the specific language governing permissions and # limitations under the License. --- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Imported via https://github.com/knative/operator/releases/download/knative-v1.16.0/operator.yaml +# Copyright 2024 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +# Source: dataplane/templates/propeller/serviceaccount.yaml +--- +--- +# Source: dataplane/templates/serving/knative-serving.yaml +apiVersion: operator.knative.dev/v1beta1 +kind: KnativeServing +metadata: + name: union-operator-serving + namespace: union + labels: + helm.sh/chart: dataplane-2026.4.4 + app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: serving +spec: + config: + deployment: + progress-deadline: "30m" + queue-sidecar-cpu-request: "25m" + queue-sidecar-cpu-limit: "1000m" + queue-sidecar-memory-request: "400Mi" + queue-sidecar-memory-limit: "800Mi" + queue-sidecar-ephemeral-storage-request: "512Mi" + queue-sidecar-ephemeral-storage-limit: "1024Mi" + registries-skipping-tag-resolving: managed.cr.union.ai + features: + kubernetes.podspec-affinity: "enabled" + kubernetes.podspec-nodeselector: "enabled" + kubernetes.podspec-tolerations: "enabled" + kubernetes.podspec-fieldref: "enabled" + kubernetes.podspec-dnspolicy: "enabled" + kubernetes.podspec-schedulername: "enabled" + kubernetes.podspec-securitycontext: "enabled" + network: + ingress-class: "kourier.ingress.networking.knative.dev" + high-availability: + replicas: 2 + ingress: + kourier: + enabled: true + bootstrap-configmap: "union-operator-serving-envoy-bootstrap" + service-type: ClusterIP + podDisruptionBudgets: + - name: 3scale-kourier-gateway-pdb + minAvailable: 50% + - name: activator-pdb + minAvailable: 50% + - name: webhook-pdb + minAvailable: 50% + registry: + override: + # TODO(jeev): Wire up Union fork of Envoy + 3scale-kourier-gateway/kourier-gateway: ghcr.io/unionai/envoy:456fed84d4ad9a9dfb186d117d9362e9dc0f7c1f + # TODO(jeev): Wire up Union fork of Kourier + net-kourier-controller/controller: ghcr.io/unionai/kourier@sha256:5804c348d15b3959604e3e3ceed216c3a1c7b32cbe254c7d3eb02a35e62ba9c4 + workloads: + - name: 3scale-kourier-gateway + 
labels: + helm.sh/chart: dataplane-2026.4.4 + app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: 3scale-kourier-gateway + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchLabels: + app.kubernetes.io/name: 3scale-kourier-gateway + app.kubernetes.io/instance: release-name + topologyKey: topology.kubernetes.io/zone + annotations: + checksum/bootstrap-config: 6469f6f592eebc9e0c676d8ccc359a05bc799c0988e474b2da942c4cc328f656 + env: + - container: kourier-gateway + envVars: + - name: UNION_AUTHZ_TENANTAUTHURL + value: "https:///me" + - name: UNION_AUTHZ_TENANTAUTHSIGNINURL + value: "https:///login" + - name: UNION_AUTHZ_TENANTCONTROLPLANEURL + value: "https://" + resources: + - container: kourier-gateway + limits: + cpu: "2" + memory: 2Gi + requests: + cpu: "1" + memory: 1Gi + - name: activator + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - activator + topologyKey: topology.kubernetes.io/zone + - name: autoscaler + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - autoscaler + topologyKey: topology.kubernetes.io/zone + - name: autoscaler-hpa + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - autoscaler-hpa + topologyKey: topology.kubernetes.io/zone + - name: controller + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + 
matchExpressions: + - key: app + operator: In + values: + - controller + topologyKey: topology.kubernetes.io/zone + - name: net-kourier-controller + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - net-kourier-controller + topologyKey: topology.kubernetes.io/zone + env: + - container: controller + envVars: + - name: KOURIER_UNION_AUTHZ_ENABLED + value: "true" + resources: + - container: controller + limits: + cpu: "1" + memory: 1Gi + requests: + cpu: 500m + memory: 500Mi + - name: webhook + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - webhook + topologyKey: topology.kubernetes.io/zone +--- +# Source: dataplane/templates/common/task-podtemplate.yaml +apiVersion: v1 +kind: PodTemplate +metadata: + name: task-template + namespace: union +template: + spec: + serviceAccountName: union + containers: + - name: default + image: docker.io/rwgrim/docker-noop +--- # Source: dataplane/charts/fluentbit/templates/tests/test-connection.yaml apiVersion: v1 kind: Pod diff --git a/tests/generated/dataplane.fully-selfhosted.yaml b/tests/generated/dataplane.fully-selfhosted.yaml index 7a94d92a..49ed7985 100644 --- a/tests/generated/dataplane.fully-selfhosted.yaml +++ b/tests/generated/dataplane.fully-selfhosted.yaml @@ -1,155 +1,139 @@ --- -# Source: dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: flytesnacks-development ---- -# Source: dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: flytesnacks-staging ---- -# Source: dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: flytesnacks-production ---- -# Source: dataplane/templates/common/namespaces.yaml 
-apiVersion: v1 -kind: Namespace -metadata: - name: union-health-monitoring-development ---- -# Source: dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: union-health-monitoring-staging ---- -# Source: dataplane/templates/common/namespaces.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + apiVersion: v1 -kind: Namespace +kind: ServiceAccount metadata: - name: union-health-monitoring-production + name: knative-operator + namespace: "union" + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator --- -# Source: dataplane/charts/fluentbit/templates/serviceaccount.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ apiVersion: v1 kind: ServiceAccount metadata: - name: fluentbit-system - namespace: union + name: operator-webhook + namespace: "union" labels: - helm.sh/chart: fluentbit-0.48.9 - app.kubernetes.io/name: fluentbit - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "3.2.8" - app.kubernetes.io/managed-by: Helm + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator --- -# Source: dataplane/charts/kube-state-metrics/templates/serviceaccount.yaml +# Source: dataplane/charts/prometheus/charts/kube-state-metrics/templates/serviceaccount.yaml apiVersion: v1 kind: ServiceAccount automountServiceAccountToken: true metadata: labels: - helm.sh/chart: kube-state-metrics-5.30.1 + helm.sh/chart: kube-state-metrics-5.25.1 app.kubernetes.io/managed-by: Helm app.kubernetes.io/component: metrics app.kubernetes.io/part-of: kube-state-metrics app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" + app.kubernetes.io/version: "2.13.0" name: release-name-kube-state-metrics namespace: union --- -# Source: dataplane/charts/opencost/templates/serviceaccount.yaml +# Source: dataplane/charts/prometheus/templates/serviceaccount.yaml apiVersion: v1 kind: ServiceAccount metadata: - name: release-name-opencost - namespace: union labels: - helm.sh/chart: opencost-1.42.0 - app.kubernetes.io/name: opencost + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "1.111.0" - app.kubernetes.io/part-of: opencost + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 app.kubernetes.io/managed-by: Helm -automountServiceAccountToken: true ---- -# Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: union-clustersync-system + app.kubernetes.io/part-of: prometheus + name: union-operator-prometheus namespace: union + 
annotations: + {} --- -# Source: dataplane/templates/imagebuilder/serviceaccount.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: union-imagebuilder ---- -# Source: dataplane/templates/nodeexecutor/serviceaccount.yaml +# Source: dataplane/templates/common/system-serviceaccount.yaml apiVersion: v1 kind: ServiceAccount metadata: - name: executor + name: union-system namespace: union - labels: - app: executor ---- -# Source: dataplane/templates/operator/serviceaccount-proxy.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: proxy-system - labels: - app.kubernetes.io/name: operator-proxy - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm + annotations: --- -# Source: dataplane/templates/operator/serviceaccount.yaml +# Source: dataplane/templates/common/union-serviceaccount.yaml apiVersion: v1 kind: ServiceAccount metadata: - name: operator-system - labels: - app.kubernetes.io/name: union-operator - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm + name: union + namespace: union + annotations: + eks.amazonaws.com/role-arn: arn:aws:iam::ACCOUNT_ID:role/flyte_project_role +automountServiceAccountToken: true --- -# Source: dataplane/templates/prometheus/serviceaccount.yaml +# Source: dataplane/templates/flyteconnector/serviceaccount.yaml apiVersion: v1 kind: ServiceAccount metadata: - name: union-operator-prometheus + name: flyteconnector namespace: union - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane + labels: + app.kubernetes.io/name: flyteconnector app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus ---- -# Source: dataplane/templates/propeller/serviceaccount-webhook.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: 
flytepropeller-webhook-system - namespace: union --- -# Source: dataplane/templates/propeller/serviceaccount.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + apiVersion: v1 -kind: ServiceAccount +kind: Secret metadata: - name: flytepropeller-system - namespace: union + name: operator-webhook-certs + namespace: "union" + labels: + app.kubernetes.io/component: operator-webhook + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +# The data is populated at install time. 
--- # Source: dataplane/templates/common/cluster-secret.yaml apiVersion: v1 @@ -160,155 +144,533 @@ type: Opaque data: cluster_name: --- -# Source: dataplane/templates/propeller/deployment-webhook.yaml -# Create an empty secret that the first propeller pod will populate +# Source: dataplane/templates/webhook/mutatingwebhookconfiguration.yaml apiVersion: v1 kind: Secret metadata: - name: flyte-pod-webhook + name: union-pod-webhook namespace: union + labels: + app.kubernetes.io/name: union-pod-webhook + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm type: Opaque +data: + ca.crt: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUROekNDQWgrZ0F3SUJBZ0lVSm9kQ09lem4vd1BHQTZjYkw1U3duSlpYT2hJd0RRWUpLb1pJaHZjTkFRRUwKQlFBd0t6RXBNQ2NHQTFVRUF3d2dabXg1ZEdWd2NtOXdaV3hzWlhJdGQyVmlhRzl2YXk1MWJtbHZiaTV6ZG1NdwpIaGNOTWpZd01USXpNREl6TmpVMVdoY05Nell3TVRJeE1ESXpOalUxV2pBck1Ta3dKd1lEVlFRRERDQm1iSGwwClpYQnliM0JsYkd4bGNpMTNaV0pvYjI5ckxuVnVhVzl1TG5OMll6Q0NBU0l3RFFZSktvWklodmNOQVFFQkJRQUQKZ2dFUEFEQ0NBUW9DZ2dFQkFJcjBSWDRGRjFNVTNYdFBmUHErMDUrMGI1Qm0xK3hsTVR5ZUp4bG5mWFJuRlgyegpPcHV6NmsreUpBQk4xRkxmNDVYRnlJclFlaW5DWWJtckZXalFRc3ZDWEloNkpPNGMwdmRMVVU5RERYcldWRFlsCkRNZytTUDlJVnZBMm5USkVFZ2RFREVsKzNWeThUbUJoNlNmcGthcXdHajY5SmhIdy9OeE9yZzM2bUY3TU5VZ1MKWXJVUkxqUFFzTUN6NElTVzhhQnEzS2IwRUFWUTA4V1lZQ0ZEbjY1aHBzVFdxR05rL3BKRnVTYjlsZll6RVdyYwpJeW5wNDhqTnIvNUdKS3NnWG9LeTUwZVo4UkppeTVyOUhENGtnSW15TVNQR2RDMXFZSVppRHRlZW8yOElxNk5nCmd1dWcyTGhlNndYYW9YZXFCQ2d3SHpwWFVXM1hJVzI5eFoxUkFRRUNBd0VBQWFOVE1GRXdIUVlEVlIwT0JCWUUKRktITDQ1RWhQaVg5bnlRN241QUk0YUZVNFlCL01COEdBMVVkSXdRWU1CYUFGS0hMNDVFaFBpWDlueVE3bjVBSQo0YUZVNFlCL01BOEdBMVVkRXdFQi93UUZNQU1CQWY4d0RRWUpLb1pJaHZjTkFRRUxCUUFEZ2dFQkFEeTRac21XCkNjZ3B5TmI4cU1ydzNYclBNejZLMTRlTVhqQVk1bERFYXM1NklLZExoZFM1bi9sWk95bENuUUtOZ3RHS0VrY3UKT05tMEE2Z0xNT3UzcE43dG9aNXh0WktOcnUvS0x6dm44QUNNNndCbUR0bG5oRERDa3ZUTHBVRzh6UTZSNmlaegpzTWlhVmpLSUJiWkJsZXNYSHZZQnQ2YlRTN2o1QWM2aTdYc1RPK3ZqQ2RUQjVYdCtEZ0x2QnBJRWh2ZGJzNXhmC
lFtc3ZzQmwxenJnR3JXemo4d0txcXNUMzZaOVU1TzdSK1RJMlQrUDVOR25SeHNQOGJ0R0RtMGZ5SW55WlJrSkcKVTdpVFJyMlZRWlRyaTNZQ2dlVjN1UjFtMnJjc2hyU1Avb082T0QyTXR5Q1FMRWtFK1pxVUVJVnpSYmVaNXl2RQptM29YMFV2OUtBUy9pWTA9Ci0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K + tls.crt: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUQxakNDQXI2Z0F3SUJBZ0lVREtRN3A4MFpIdkQyUE5Ya3p3VGRLem9iUjg4d0RRWUpLb1pJaHZjTkFRRUwKQlFBd0t6RXBNQ2NHQTFVRUF3d2dabXg1ZEdWd2NtOXdaV3hzWlhJdGQyVmlhRzl2YXk1MWJtbHZiaTV6ZG1NdwpIaGNOTWpZd01USXpNREl6TmpVMVdoY05NamN3TVRJek1ESXpOalUxV2pBck1Ta3dKd1lEVlFRRERDQm1iSGwwClpYQnliM0JsYkd4bGNpMTNaV0pvYjI5ckxuVnVhVzl1TG5OMll6Q0NBU0l3RFFZSktvWklodmNOQVFFQkJRQUQKZ2dFUEFEQ0NBUW9DZ2dFQkFMNnMxUTRaZ3ZxVVdNdFp5VC9EKytEQWl6WmR0Q1lkVzArSGk4S3phQ1Vjbi9zZwo3a21NTEhDM2ZlZWtZK2ZUNENHUTd6K1l2WjRWWSt6WGljKzdrNndLbUZqdm1acEM1Rm96OWo1Q21NRk5HZmlWCnRobzVvSU9MUkIzNXRYMU1ROHUwNFVzT29QcjdMK016eEdxWXVmTFdtN1NVdjh0eEp3ZFMxZVlzMllNV1pMeU4KekI4akdWY01Qc2ZNVkdxdFhTTVljQ3hNYURKaTM1N0VEZSs3clM2U2ZtbnFtYjJxekdQM1BRRStCUm91Yk5nLwo4NCt4c253VXBLZUFpem95dFM2c2ZlOTdMUEtRTW9oVzd2TjZORFJrLy9xSmpHem9iZmZwUHU3endhWHVXMWtMCmllc2g2cVRrdlZLSTF3WFJ2WHArVU5GbjZzdDIwdEVlRks0ZzMwOENBd0VBQWFPQjhUQ0I3akFKQmdOVkhSTUUKQWpBQU1Bc0dBMVVkRHdRRUF3SUY0RENCa3dZRFZSMFJCSUdMTUlHSWdoWm1iSGwwWlhCeWIzQmxiR3hsY2kxMwpaV0pvYjI5cmdoeG1iSGwwWlhCeWIzQmxiR3hsY2kxM1pXSm9iMjlyTG5WdWFXOXVnaUJtYkhsMFpYQnliM0JsCmJHeGxjaTEzWldKb2IyOXJMblZ1YVc5dUxuTjJZNEl1Wm14NWRHVndjbTl3Wld4c1pYSXRkMlZpYUc5dmF5NTEKYm1sdmJpNXpkbU11WTJ4MWMzUmxjaTVzYjJOaGJEQWRCZ05WSFE0RUZnUVV1d25wcDZ1MTNKL0NsNnI3eDZ1Sgo5YmZ1RDhzd0h3WURWUjBqQkJnd0ZvQVVvY3Zqa1NFK0pmMmZKRHVma0FqaG9WVGhnSDh3RFFZSktvWklodmNOCkFRRUxCUUFEZ2dFQkFFMmlOa1MzczhGT3Z0cDhrVmJNZUN2amxIQytvYTVlZTFGNmowNlZ3bFJPOWdxOXJla1UKMGptWUhNWGNjWjcrZXJkNFRQRXgwK3ZqMzNhRTV3cDFmdVJJL2xIZHhuOUpJRHg2bTFQTHdVa3p1WG9EekYxLwozVXVIWU1xOGkrT2xsY0tqbmdVMUFUNmRWRCtIbFVWeHpMdjlNazdPb05SQUY5dTJHa1hiUHdCN0I1ZGRxdWxlCmtNcXBxbDhES2s4SVNVelVsTjNBb2Q5NmJUelIyV3RCVkxpTi9zRHZtTDZwZThFYm55RnBUb1BZTDlGeWZOcmIKT1ZPZEJwUkZFTG1KOHZaZWY5d1lxRmVXMTE3SGV5VDArZkxGZVNSemZUNVVWUlVraVFBcUVJd
UhHUVRrOVFabgo1WnYvNUF0eEg3WE1rcXR0VE1zMFlSUG0xYjQzY3FBeFdhND0KLS0tLS1FTkQgQ0VSVElGSUNBVEUtLS0tLQo= + tls.key: LS0tLS1CRUdJTiBQUklWQVRFIEtFWS0tLS0tCk1JSUV2UUlCQURBTkJna3Foa2lHOXcwQkFRRUZBQVNDQktjd2dnU2pBZ0VBQW9JQkFRQytyTlVPR1lMNmxGakwKV2NrL3cvdmd3SXMyWGJRbUhWdFBoNHZDczJnbEhKLzdJTzVKakN4d3QzM25wR1BuMCtBaGtPOC9tTDJlRldQcwoxNG5QdTVPc0NwaFk3NW1hUXVSYU0vWStRcGpCVFJuNGxiWWFPYUNEaTBRZCtiVjlURVBMdE9GTERxRDYreS9qCk04UnFtTG55MXB1MGxML0xjU2NIVXRYbUxObURGbVM4amN3Zkl4bFhERDdIekZScXJWMGpHSEFzVEdneVl0K2UKeEEzdnU2MHVrbjVwNnBtOXFzeGo5ejBCUGdVYUxtellQL09Qc2JKOEZLU25nSXM2TXJVdXJIM3ZleXp5a0RLSQpWdTd6ZWpRMFpQLzZpWXhzNkczMzZUN3U4OEdsN2x0WkM0bnJJZXFrNUwxU2lOY0YwYjE2ZmxEUlorckxkdExSCkhoU3VJTjlQQWdNQkFBRUNnZ0VBRjV2SnMzbTRMSE9DdlFic2NwOC9DQmgvQkNlOC92MGlpYW5VUmJLMkFlWlYKN3Arb3NXV1FPUktYSGIvT2VPMVVjb09DQkFOUzh3aGQrM3pDZlB5U0w3cU9HM0RyT1Z6djdqVkIxM3FpZEVpcQpId3ZXWk0vZXpuckhYOWpEdm5SYmJwVUNVaXRKQmxwa2x3S1pYc056UHB5UTRkNkxFdEw3VEo1V2lxM2g3cWQzCjhncmJoeWlWNDhZRFAyYkhpZGt0S2QvMHBNL3piSzZPTS9iMFpLUmRBWVpHQXlOcklMaFJ3eDNsQXE2SDVqL20KUHBuNlZEdVQ3Y1h6UFhmNTgxR0IyNjQxWFJVTXRNWGVEREtaU3JudnIrbG5tV2lsYXZmNkg3ZHp4NlRBMmk0RApybCtIRVdUN1RSQUZGMnZ5SVUrYSt5RzNPVFBKRCtmVTdCWWh4Zld4aFFLQmdRRGVvcXVCbzQxK1dKU255blZnCmNWMHZoUEkwZ0dyRGQzNFdIbCtGNTZuSmxuMVVlbUQ2SUQ0TUhnNVpHWmNmd3NkRlB0aHl1VnB0OVl1SW11SGQKRHE5ZDhpWnpHUzJVbStkSWlsWjhnZXJ5Q3dON3hIbFM3SzJrc0RmRGJwYmZZSXZ1LzRnSExZRHh3YzQ0OG54ZwpWNGpCUFY2QTdYTHRqQzZzdFE2OW9uTlIwd0tCZ1FEYlFBUTRGdWpnUjJUOXh2Y01MdXhMbm1YaHE4U2NoZlM5CkRoWndDUzZIeHB1YlBxblBjQ04vQUZjbDZ1cDRXVEVZdVY4bEpwcGwwRmpYQ2lMdkhuUU15cm5vRjlUOThpanQKZGVHYjF6ZWlwRUpndFpNVHkvOGdERWorZU1wSFBuYTZwQVpMOVpRVUdLR2dDNGQ0SzljdVNHMmRlb1dSU0ViSQpMVGgzeGpXVEZRS0JnQStJRWFidG5nVmVjS0IwQTFSREZGa29VUzFRZUNKQ3g4MExPV2JDRHBvOW9XaXZVT3lpCkt6SDFOdE1JY2Y2SlBCV2NtTVVJSVVMaWltVnhTS2gvU2NTb0MvNmpsd1p6Q2VPSm94YjBpVXR4Y1VERktDR2MKMlZCUDZ0UDdkeE1HVFR6VEhzNUJZbWw3TjhQSlJ0d0J2MHliMTJmdktNRmhzaS9pUWJFQkVFSjVBb0dBUm9NSApHRmJkM0V0NXdsZzcyYkk1a25SRnhkY3RLejIxb1J1bndhNWlSWTV0T3Zkak8zQ1FLZWNkSC9lMklyQmtwdFB2Ck1vNkF0MS9UUW8xakFNNGxlbnUwWUYxU
nhiNGN4WW5VM2Y3UVNNRlZDNjg1dHZNemdNWVNyNngvT1h4d1NNUTUKdGpJcnhtN0poQ1JSRkNmZFUyZzl3SmpIM2hxRmtSbGlBTHRCUGFVQ2dZRUF2QkRHejQvYS9kRzRJT0VpREk4SgpBbkxaRWZaTFJrdnI0ejRyT0p0TGZtWlFjTW5DSUZmKzBaelhTME9XOWhvMjk5RUNtS1l4U2VUWk1WSDgzOFZCCnRvSkNGM3lzb01rSC9meEVXTUpicDlMblJwSFUzUElicHlhRG1nWW1YdmsvK3laL3Y1c0dDTWNtVGkzWVFLRGsKcStlK1o0SHZyaGUxVytUaHpwOCtoNGs9Ci0tLS0tRU5EIFBSSVZBVEUgS0VZLS0tLS0K --- -# Source: dataplane/templates/clusterresourcesync/configmap.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2019 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ apiVersion: v1 kind: ConfigMap metadata: - name: union-clusterresourcesync-config - namespace: union + name: config-logging + namespace: "union" labels: - app.kubernetes.io/name: clusterresourcesync - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm -data: - cluster_resources.yaml: | - cluster_resources: - clusterName: '' - customData: - - production: - - projectQuotaCpu: - value: "4096" - - projectQuotaMemory: - value: 2Ti - - projectQuotaNvidiaGpu: - value: "256" - - defaultUserRoleKey: - value: 'eks.amazonaws.com/role-arn' - - defaultUserRoleValue: - value: 'arn:aws:iam::ACCOUNT_ID:role/flyte_project_role' - - staging: - - projectQuotaCpu: - value: "4096" - - projectQuotaMemory: - value: 2Ti - - projectQuotaNvidiaGpu: - value: "256" - - defaultUserRoleKey: - value: 'eks.amazonaws.com/role-arn' - - defaultUserRoleValue: - value: 'arn:aws:iam::ACCOUNT_ID:role/flyte_project_role' - - development: - - projectQuotaCpu: - value: "4096" - - projectQuotaMemory: - value: 2Ti - - projectQuotaNvidiaGpu: - value: "256" - - defaultUserRoleKey: - value: 'eks.amazonaws.com/role-arn' - - defaultUserRoleValue: - value: 'arn:aws:iam::ACCOUNT_ID:role/flyte_project_role' - refreshInterval: 5m - standaloneDeployment: true - templatePath: /etc/flyte/clusterresource/templates - clusterResourcesPrivate: - app: - isServerless: false - union: - auth: - authorizationMetadataKey: flyte-authorization - clientId: 'dataplane-operator' - clientSecretLocation: /etc/union/secret/client_secret - enable: false - tokenRefreshWindow: 5m - type: ClientSecret - connection: - host: dns:/// - admin.yaml: | - admin: - clientId: 'dataplane-operator' - clientSecretLocation: /etc/union/secret/client_secret - endpoint: dns:/// - insecure: false - event: - capacity: 1000 - rate: 500 - type: admin - domain.yaml: | - domains: - - id: development - name: development - - id: staging - name: staging - - id: production - name: production 
- clusters.yaml: | - clusters: - clusterConfigs: [] - labelClusterMap: {} - logger.yaml: | - logger: - formatter: - type: json - level: 4 - show-source: true - namespace_mapping.yaml: | - namespace_mapping: - template: '{{ project }}-{{ domain }}' + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +data: {} --- -# Source: dataplane/templates/clusterresourcesync/configmap.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2019 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ apiVersion: v1 kind: ConfigMap metadata: - name: union-clusterresource-template - namespace: union + name: config-observability + namespace: "union" labels: - app.kubernetes.io/name: clusterresourcesync + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +data: {} +--- +# Source: dataplane/charts/prometheus/templates/cm.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + labels: + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: prometheus + name: union-operator-prometheus + namespace: union data: - a_namespace.yaml: | - apiVersion: v1 - kind: Namespace - metadata: - name: {{ namespace }} - labels: - union.ai/namespace-type: flyte - spec: - finalizers: - - kubernetes - - b_default_service_account.yaml: | - apiVersion: v1 - kind: ServiceAccount - metadata: - name: default - namespace: {{ namespace }} - annotations: - {{ defaultUserRoleKey }}: {{ defaultUserRoleValue }} + allow-snippet-annotations: "false" + alerting_rules.yml: | + {} + alerts: | + {} + prometheus.yml: | + global: + evaluation_interval: 1m + scrape_interval: 1m + scrape_timeout: 10s + rule_files: + - /etc/config/recording_rules.yml + - /etc/config/alerting_rules.yml + - /etc/config/rules + - /etc/config/alerts + scrape_configs: + - job_name: prometheus + static_configs: + - targets: + - localhost:9090 + - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + job_name: kubernetes-apiservers + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - action: keep + regex: default;kubernetes;https + source_labels: + - __meta_kubernetes_namespace + - __meta_kubernetes_service_name + - __meta_kubernetes_endpoint_port_name + scheme: https + tls_config: + ca_file: 
/var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + job_name: kubernetes-nodes + kubernetes_sd_configs: + - role: node + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - replacement: kubernetes.default.svc:443 + target_label: __address__ + - regex: (.+) + replacement: /api/v1/nodes/$1/proxy/metrics + source_labels: + - __meta_kubernetes_node_name + target_label: __metrics_path__ + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + job_name: kubernetes-nodes-cadvisor + kubernetes_sd_configs: + - role: node + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - replacement: kubernetes.default.svc:443 + target_label: __address__ + - regex: (.+) + replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor + source_labels: + - __meta_kubernetes_node_name + target_label: __metrics_path__ + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + - honor_labels: true + job_name: kubernetes-service-endpoints + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scrape + - action: drop + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) 
+ source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (.+?)(?::\d+)?;(\d+) + replacement: $1:$2 + source_labels: + - __address__ + - __meta_kubernetes_service_annotation_prometheus_io_port + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_service_name + target_label: service + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: node + - honor_labels: true + job_name: kubernetes-service-endpoints-slow + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) 
+ source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (.+?)(?::\d+)?;(\d+) + replacement: $1:$2 + source_labels: + - __address__ + - __meta_kubernetes_service_annotation_prometheus_io_port + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_service_name + target_label: service + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: node + scrape_interval: 5m + scrape_timeout: 30s + - honor_labels: true + job_name: prometheus-pushgateway + kubernetes_sd_configs: + - role: service + relabel_configs: + - action: keep + regex: pushgateway + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_probe + - honor_labels: true + job_name: kubernetes-services + kubernetes_sd_configs: + - role: service + metrics_path: /probe + params: + module: + - http_2xx + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_probe + - source_labels: + - __address__ + target_label: __param_target + - replacement: blackbox + target_label: __address__ + - source_labels: + - __param_target + target_label: instance + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - source_labels: + - __meta_kubernetes_service_name + target_label: service + - honor_labels: true + job_name: kubernetes-pods + kubernetes_sd_configs: + - role: pod + relabel_configs: + - action: 
keep + regex: true + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scrape + - action: drop + regex: true + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) + replacement: '[$2]:$1' + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: replace + regex: (\d+);((([0-9]+?)(\.|$)){4}) + replacement: $2:$1 + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_pod_name + target_label: pod + - action: drop + regex: Pending|Succeeded|Failed|Completed + source_labels: + - __meta_kubernetes_pod_phase + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: node + - honor_labels: true + job_name: kubernetes-pods-slow + kubernetes_sd_configs: + - role: pod + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) 
+ source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) + replacement: '[$2]:$1' + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: replace + regex: (\d+);((([0-9]+?)(\.|$)){4}) + replacement: $2:$1 + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_pod_name + target_label: pod + - action: drop + regex: Pending|Succeeded|Failed|Completed + source_labels: + - __meta_kubernetes_pod_phase + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: node + scrape_interval: 5m + scrape_timeout: 30s + - job_name: 'kube-state-metrics' + metrics_path: /metrics + scrape_interval: 1m + honor_labels: true + kubernetes_sd_configs: + - role: endpoints + namespaces: + own_namespace: true + relabel_configs: + - source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_name] + regex: kube-state-metrics + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + regex: http + action: keep + metric_relabel_configs: + - source_labels: [__name__] + regex: 
"kube_pod_container_resource_(limits|requests)|kube_pod_status_phase|kube_node_(labels|status_allocatable|status_condition|status_capacity)|kube_namespace_labels|kube_pod_container_status_(waiting|terminated|last_terminated).*_reason|kube_daemonset_status_number_unavailable|kube_deployment_status_replicas_unavailable|kube_resourcequota|kube_pod_info|kube_node_info|kube_pod_container_status_restarts_total" + action: keep + - source_labels: [__name__, phase] + separator: ";" + regex: "kube_pod_status_phase;(Succeeded|Failed)" + action: drop + - source_labels: [node] + target_label: nodename + regex: "(.*)" + action: replace + - source_labels: [label_node_group_name] + action: replace + regex: "(.+)" + target_label: label_node_pool_name + - job_name: 'opencost' + metrics_path: /metrics + scrape_interval: 1m + honor_labels: true + kubernetes_sd_configs: + - role: endpoints + namespaces: + own_namespace: true + relabel_configs: + - source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_name] + regex: opencost + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + regex: http + action: keep + metric_relabel_configs: + - source_labels: [__name__] + regex: "kube_node_labels|kube_pod_labels|node_total_hourly_cost|node_ram_hourly_cost|node_cpu_hourly_cost|node_gpu_hourly_cost" + action: keep + - job_name: 'union-services' + metrics_path: /metrics + scrape_interval: 1m + honor_labels: true + kubernetes_sd_configs: + - role: endpoints + namespaces: + own_namespace: true + relabel_configs: + - source_labels: [__meta_kubernetes_service_label_platform_union_ai_service_group] + regex: .+ + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + regex: debug + action: keep + # Flyte propeller metrics for execution info and fast task duration. + # No-op when propeller is not deployed (no pods match the label selector). 
+ - job_name: 'flytepropeller' + kubernetes_sd_configs: + - role: pod + namespaces: + own_namespace: true + selectors: + - role: pod + label: app.kubernetes.io/name=flytepropeller + metric_relabel_configs: + - source_labels: [__name__] + regex: "flyte:propeller:all:round:execution_info|flyte:propeller:all:node:fast_task:fast_task_execution_duration" + action: keep + relabel_configs: + - source_labels: [__meta_kubernetes_pod_container_name] + regex: flytepropeller + action: keep + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: pod + # Kourier gateway (envoy) metrics for app serving. + # No-op when serving is not enabled (no pods match the label selector). + - job_name: 'serving-envoy' + kubernetes_sd_configs: + - role: pod + namespaces: + own_namespace: true + selectors: + - role: pod + label: app=3scale-kourier-gateway + metrics_path: /stats/prometheus + metric_relabel_configs: + - source_labels: [__name__] + regex: "envoy_cluster_upstream_rq_xx|envoy_cluster_upstream_rq_time_bucket" + action: keep + relabel_configs: + - source_labels: [__meta_kubernetes_pod_container_port_name] + regex: metrics + action: keep + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: pod + # DCGM GPU metrics. + # No-op when dcgm-exporter is not deployed (no pods match the label selector). 
+ - job_name: 'gpu-metrics' + kubernetes_sd_configs: + - role: pod + namespaces: + names: + - kube-system + selectors: + - role: pod + label: app.kubernetes.io/name=dcgm-exporter - c_project_resource_quota.yaml: | - apiVersion: v1 - kind: ResourceQuota - metadata: - name: project-quota - namespace: {{ namespace }} - spec: - hard: - limits.cpu: {{ projectQuotaCpu }} - limits.memory: {{ projectQuotaMemory }} - requests.nvidia.com/gpu: {{ projectQuotaNvidiaGpu }} + recording_rules.yml: | + {} + rules: | + {} --- # Source: dataplane/templates/fluent-bit/configmap.yaml apiVersion: v1 @@ -356,6 +718,40 @@ data: region us-east-1 bucket --- +# Source: dataplane/templates/imagebuilder/build-image-configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: build-image-config + labels: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +data: + storage.yaml: | + storage: + container: "" + type: stow + stow: + kind: s3 + config: + auth_type: accesskey + access_key_id: + secret_key: + disable_ssl: false + endpoint: + region: us-east-1 + enable-multicontainer: false + limits: + maxDownloadMBs: 1024 + cache: + max_size_mbs: 0 + target_gc_percent: 70 + buildkit-uri: "tcp://union-operator-buildkit.union.svc.cluster.local:1234" + default-repository: "union-dataplane" + authentication-type: "noop" +--- # Source: dataplane/templates/imagebuilder/configmap.yaml apiVersion: v1 kind: ConfigMap @@ -2512,6 +2908,18 @@ data: "version": 1 } --- +# Source: dataplane/templates/monitoring/prometheusrule.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: union-recording-rules + namespace: union + labels: + release: release-name +data: + recording_rules.yml: | + groups: [] +--- # Source: dataplane/templates/nodeexecutor/configmap.yaml apiVersion: v1 kind: ConfigMap @@ -2566,8 +2974,13 @@ data: templateUris: - https://scale.neptune.ai/{{ .taskConfig.project 
}}/-/run/?customId={{ .taskConfig.id }} - kubernetes-enabled: true + kubernetes-enabled: false enabled_plugins.yaml: | + plugins: + connector-service: + defaultConnector: + defaultServiceConfig: '{"loadBalancingConfig": [{"round_robin":{}}]}' + endpoint: k8s:///flyteconnector.union:8000 tasks: task-plugins: default-for-task-types: @@ -2600,6 +3013,8 @@ data: cpu: 4096 gpu: 256 memory: 2Ti + namespace_mapping: + template: union namespace_mapping: template: '{{ project }}-{{ domain }}' union: @@ -2623,7 +3038,7 @@ data: cache-endpoint: dns:/// endpoint: dns:/// insecure: false - type: fallback + type: cacheservicev2 use-admin-auth: true logger: formatter: @@ -2643,6 +3058,7 @@ data: secure: false useAuth: false propeller: + limit-namespace: union node-config: disable-input-file-writes: true plugins: @@ -2650,7 +3066,7 @@ data: additional-worker-args: - --last-ack-grace-period-seconds - "120" - callback-uri: http://unionai-dataplane-executor.union.svc.cluster.local:15605 + callback-uri: http://union-operator-executor.union.svc.cluster.local:15605 grace-period-status-not-found: 2m ioutils: remoteFileOutputPaths: @@ -2660,6 +3076,7 @@ data: default-cpus: 100m default-env-vars: [] default-memory: 100Mi + default-pod-template-name: task-template co-pilot: image: 'cr.flyte.org/flyteorg/flytecopilot:v1.14.1' name: flyte-copilot- @@ -2700,6 +3117,7 @@ data: default-cpus: 100m default-env-vars: [] default-memory: 100Mi + default-pod-template-name: task-template config.yaml: | union: connection: @@ -2720,11 +3138,14 @@ data: operator: enabled: true enableTunnelService: false + # enableDepot: false tunnel: - enableDirectToAppIngress: false + enableDirectToAppIngress: true deploymentToRestart: union-operator-proxy + limitNamespace: union + disableClusterPermissions: true apps: - enabled: 'false' + enabled: 'true' syncClusterConfig: enabled: false clusterId: @@ -2739,14 +3160,12 @@ data: userRole: 'arn:aws:iam::ACCOUNT_ID:role/flyte_project_role' userRoleKey: 
'eks.amazonaws.com/role-arn' collectUsages: - enabled: true + enabled: false billing: model: Legacy dependenciesHeartbeat: - prometheus: - endpoint: 'http://union-operator-prometheus:80/-/healthy' - propeller: - endpoint: 'http://flytepropeller:10254' + executor: + endpoint: 'http://union-operator-executor:10254' proxy: endpoint: 'http://union-operator-proxy:10254' secretsWatcher: @@ -2761,9 +3180,6 @@ data: referenceConfigmapName: union-operator targetConfigMapName: "build-image-config" proxy: - imageBuilderConfig: - authenticationType: 'noop' - defaultRepository: '' persistedLogs: objectStore: pathTemplate: namespace-{{.KubernetesNamespace}}.pod-{{.KubernetesPodName}}.cont-{{.KubernetesContainerName}} @@ -2817,989 +3233,149 @@ data: endpoint: region: us-east-1 image-builder.buildkit-uri: "tcp://union-operator-buildkit.union.svc.cluster.local:1234" - image-builder.default-repository: "" + image-builder.default-repository: "union-dataplane" image-builder.authentication-type: "noop" --- -# Source: dataplane/templates/prometheus/configmap.yaml +# Source: dataplane/templates/serving/bootstrap-configmap.yaml +# We need to copy the default configmap here since knative-operator does not automatically update the +# address of the `net-kourier-controller` endpoint to use the release namespace. It assumes that the +# resources are being installed into the `knative-serving` namespace instead. 
apiVersion: v1 kind: ConfigMap metadata: - name: union-operator-prometheus + name: union-operator-serving-envoy-bootstrap namespace: union - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus data: - prometheus.yml: | - global: - scrape_interval: 15s - evaluation_interval: 15s - alerting: - alertmanagers: - - static_configs: - - targets: - rule_files: - - rules.yml - scrape_configs: - # Self-monitoring - - job_name: prometheus - metrics_path: /prometheus/metrics - static_configs: - - targets: ['localhost:9090'] - metric_relabel_configs: - - source_labels: [__name__] - regex: prometheus_tsdb_head_chunks_storage_size_bytes|prometheus_tsdb_head_series|prometheus_tsdb_head_series_created_total|prometheus_tsdb_head_series_removed_total|prometheus_tsdb_head_samples_appended_total|prometheus_tsdb_storage_blocks_bytes|prometheus_tsdb_retention_limit_bytes|prometheus_tsdb_wal_(.*)|.+_(limit|failed|missed|failures)_total - action: keep + envoy-bootstrap.yaml: | + dynamic_resources: + ads_config: + transport_api_version: V3 + api_type: GRPC + rate_limit_settings: {} + grpc_services: + - envoy_grpc: {cluster_name: xds_cluster} + cds_config: + resource_api_version: V3 + ads: {} + lds_config: + resource_api_version: V3 + ads: {} + node: + cluster: kourier-knative + id: 3scale-kourier-gateway + static_resources: + listeners: + - name: stats_listener + address: + socket_address: + address: 0.0.0.0 + port_value: 9000 + filter_chains: + - filters: + - name: envoy.filters.network.http_connection_manager + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager + stat_prefix: stats_server + http_filters: + - name: envoy.filters.http.router + typed_config: + "@type": 
type.googleapis.com/envoy.extensions.filters.http.router.v3.Router + route_config: + virtual_hosts: + - name: admin_interface + domains: + - "*" + routes: + - match: + safe_regex: + regex: '/(certs|stats(/prometheus)?|server_info|clusters|listeners|ready)?' + headers: + - name: ':method' + string_match: + exact: GET + route: + cluster: service_stats + - match: + safe_regex: + regex: '/drain_listeners' + headers: + - name: ':method' + string_match: + exact: POST + route: + cluster: service_stats + clusters: + - name: service_stats + connect_timeout: 0.250s + type: static + load_assignment: + cluster_name: service_stats + endpoints: + lb_endpoints: + endpoint: + address: + socket_address: + address: 127.0.0.1 + port_value: 9901 + - name: xds_cluster + # This keepalive is recommended by envoy docs. + # https://www.envoyproxy.io/docs/envoy/latest/api-docs/xds_protocol + typed_extension_protocol_options: + envoy.extensions.upstreams.http.v3.HttpProtocolOptions: + "@type": type.googleapis.com/envoy.extensions.upstreams.http.v3.HttpProtocolOptions + explicit_http_config: + http2_protocol_options: + connection_keepalive: + interval: 30s + timeout: 5s + connect_timeout: 1s + load_assignment: + cluster_name: xds_cluster + endpoints: + lb_endpoints: + endpoint: + address: + socket_address: + address: "net-kourier-controller" + port_value: 18000 + type: STRICT_DNS + admin: + access_log: + - name: envoy.access_loggers.stdout + typed_config: + "@type": type.googleapis.com/envoy.extensions.access_loggers.stream.v3.StdoutAccessLog + address: + socket_address: + address: 127.0.0.1 + port_value: 9901 + stats_config: + stats_tags: + - tag_name: name + regex: '^.*?\.u/.*?n=(.*?)/.*?u\..*$' + - tag_name: domain + regex: '^.*?\.u/.*?d=(.*?)/.*?u\..*$' + - tag_name: org + regex: '^.*?\.u/.*?o=(.*?)/.*?u\..*$' + - tag_name: project + regex: '^.*?\.u/.*?p=(.*?)/.*?u\..*$' + - tag_name: target + regex: '^.*?\.u/.*?t=(.*?)/.*?u\..*$' + - tag_name: target_namespace + regex: 
'^.*?\.u/.*?tns=(.*?)/.*?u\..*$' +--- +# Source: dataplane/templates/webhook/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: union-pod-webhook-config + namespace: union +data: + core.yaml: | - # Kube state metrics for pod/node resource tracking and cost calculations - - job_name: kube-state-metrics - static_configs: - - targets: ['release-name-kube-state-metrics:8080'] - metric_relabel_configs: - - separator: ; - source_labels: [__name__] - regex: kube_pod_container_resource_(limits|requests)|kube_pod_status_phase|kube_node_(labels|status_allocatable|status_condition|status_capacity)|kube_namespace_labels|kube_pod_container_status_(waiting|terminated|last_terminated).*_reason|kube_daemonset_status_number_unavailable|kube_deployment_status_replicas_unavailable|kube_resourcequota|kube_pod_info|kube_node_info|kube_pod_container_status_restarts_total - action: keep - - separator: ; - source_labels: [__name__, phase] - regex: kube_pod_status_phase;(Succeeded|Failed) - action: drop - - source_labels: [node] - target_label: nodename - regex: '(.*)' - action: replace - - source_labels: [label_node_group_name] - action: replace - regex: (.+) - target_label: label_node_pool_name - - # cAdvisor container metrics for CPU and memory tracking - - job_name: kubernetes-cadvisor - metrics_path: /metrics - scheme: https - kubernetes_sd_configs: - - role: node - namespaces: - names: [] - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - insecure_skip_verify: false - metric_relabel_configs: - - separator: ; - source_labels: [__name__] - regex: container_cpu_usage_seconds_total|container_memory_working_set_bytes - action: keep - relabel_configs: - - separator: ; - regex: __meta_kubernetes_node_label_(.+) - replacement: $1 - action: labelmap - - separator: ; - regex: (.*) - target_label: __address__ - replacement: kubernetes.default.svc:443 - action: replace - - 
source_labels: [__meta_kubernetes_node_name] - separator: ; - regex: (.+) - target_label: __metrics_path__ - replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor - action: replace - - # Flyte propeller metrics for execution info and fast task duration - - job_name: flytepropeller - kubernetes_sd_configs: - - role: pod - namespaces: - names: - - union - selectors: - - role: pod - label: app.kubernetes.io/name=flytepropeller - metric_relabel_configs: - - source_labels: [__name__] - regex: "flyte:propeller:all:round:execution_info|flyte:propeller:all:node:fast_task:fast_task_execution_duration" - action: keep - relabel_configs: - - source_labels: [__meta_kubernetes_pod_container_name] - regex: flytepropeller - action: keep - - source_labels: [__meta_kubernetes_namespace] - action: replace - target_label: namespace - - source_labels: [__meta_kubernetes_pod_name] - action: replace - target_label: pod - # OpenCost metrics for cost tracking - - job_name: opencost - static_configs: - - targets: ['release-name-opencost:9003'] - metric_relabel_configs: - - source_labels: [__name__] - regex: "kube_node_labels|kube_pod_labels|node_total_hourly_cost|node_ram_hourly_cost|node_cpu_hourly_cost|node_gpu_hourly_cost" - action: keep - rules.yml: | - groups: - - name: cost_calculations_15s - interval: 15s - rules: - - record: pod_gpu_allocation - expr: | - sum by (namespace, pod) (DCGM_FI_DEV_GPU_UTIL >= bool 0) * on (namespace, pod) group_left() (max by (namespace, pod) (kube_pod_status_phase{phase=~"Running|Pending"} == 1)) - - record: execution_info # A join metric to look up execution-level info. Used to disambiguate workflow/task executions, apps, and workspaces. 
- expr: | - max by (label_domain, label_project, label_entity_name, label_execution_id, label_entity_id)( - label_replace( - label_replace( - label_replace( - label_replace( - label_replace( - flyte:propeller:all:round:execution_info{domain!="", project!="", workflow_name!="", execution_id!=""}, # filter for workflow/task executions - "label_entity_id", "$1", "execution_id", "(.*)" # join key - ), "label_entity_name", "$1", "workflow_name", "(.*)" # set label_entity_name to the workflow/task name from the workflow_execution_id - ), - "label_execution_id", "$1", "execution_id", "(.*)" - ), - "label_project", "$1", "project", "(.*)" # project - ), - "label_domain", "$1", "domain", "(.*)" # domain - ) - ) - - record: app_info # A join metric to look up app-level info. Used to disambiguate workflow/task executions, apps, and workspaces. - expr: | - max by (label_domain, label_project, label_app_name, label_app_version, label_entity_id)( - label_replace( - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps - "label_app_name", "$1", "label_serving_unionai_dev_app_name", "(.*)" # rename to cleanup - ), - "label_app_version", "$1", "label_serving_knative_dev_revision", "(.*)" # the app_version is equivalent to an execution_id for workflows (lowest level of granularity) - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # join key - ) - ) - - record: workspace_info # A join metric to look up workspace info. Used to disambiguate workflow/task executions, apps, and workspaces. 
- expr: | - max by (label_domain, label_project, label_workspace_name, label_entity_id)( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # filter for workspaces - "label_entity_id", "$1", "label_node_id", "(.*)" # join key - ), "label_workspace_name", "$1", "label_node_id", "(.*)" # set label_workspace_name to the workspace name from the kube_pod_labels - ) - ) - - record: fast_task_execution_duration - expr: | - max by (label_domain, label_project, label_execution_id, label_entity_id, namespace, pod)( - label_replace( - label_replace( - label_replace( - label_replace( - label_replace( - label_replace( - flyte:propeller:all:node:fast_task:fast_task_execution_duration{domain!="", project!="", execution_id!="", exported_namespace!="", exported_pod!=""}, - "label_entity_id", "$1", "execution_id", "(.*)" # join key - ), - "label_execution_id", "$1", "execution_id", "(.*)" - ), - "label_project", "$1", "project", "(.*)" # project - ), - "label_domain", "$1", "domain", "(.*)" # domain - ), - "namespace", "$1", "exported_namespace", "(.*)" - ), - "pod", "$1", "exported_pod", "(.*)" - ) - ) - - record: fast_task_execution_duration_rate - expr: | - irate(fast_task_execution_duration[1m]) > 0 or absent(irate(fast_task_execution_duration[1m])) * fast_task_execution_duration - - record: entity_id:mem_usage_bytes_total_per_node:sum # Allocated memory (max(requested, consumed)) aggregated per node and entity, where entity is either a task/workflow execution or an app. 
- expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( # aggregate up to entity - # First, calculate the allocated memory for each pod - max by (namespace, pod) ( # this is the case where consumed (the memory working set) exceeds requested memory - ( - sum by (namespace, pod) ( - container_memory_working_set_bytes{namespace!="",pod!="",image!=""} - ) - > sum by (namespace, pod) ( - kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="memory"} - ) - ) - or sum by (namespace, pod) ( # this is the case where memory requests are <= consumed memory - kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="memory"} # needed to add node!="" to dedupe - ) - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). Note that this is repetitive but we do not want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - 
label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), - "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="",label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - # Now join in node identifiers which are used for subsequent overhead calculations - * on (namespace, pod) group_left(node) ( - max by (namespace, pod, node) (kube_pod_info{node!=""}) # needed to add node!="" to dedupe - ) - ) - - record: entity_id:cpu_usage_per_node:sum # Allocated cpu (max(requested, consumed)) aggregated per node and entity, where entity is either a task/workflow execution or an app. 
- expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( - # First, calculate the allocated cpu for each pod - max by (namespace, pod) ( # this is the case where consumed (the cpu usage seconds total) exceeds requested cpu - ( - sum by (namespace, pod) ( - irate(container_cpu_usage_seconds_total{namespace!="",pod!="",image!=""}[5m]) - ) - > sum by (namespace, pod) ( - kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="cpu"} - ) - ) - or sum by (namespace, pod) ( # this is the case where cpu requests are <= consumed cpu - kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="cpu"} - ) - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - 
label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), - "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - # Now join in node identifiers which are used for subsequent overhead calculations - * on (namespace, pod) group_left(node) ( - max by (namespace, pod, node) (kube_pod_info{node!=""}) # needed to add node!="" to dedupe - ) - ) - - record: entity_id:gpu_usage_per_node:sum # Allocated gpu aggregated per node and entity, where entity is either a task/workflow execution or an app. 
- expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( - # First, grab the allocated gpu for each pod (which is always either 1 or zero, since k8s can't split gpus the way it can with cpu/memory) - max by (namespace, pod) ( - pod_gpu_allocation - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, 
label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), - "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - # Now join in node identifiers which are used for subsequent overhead calculations - * on (namespace, pod) group_left(node) ( - max by (namespace, pod, node) (kube_pod_info{node!=""}) # needed to add node!="" to dedupe - ) - ) - - record: entity_id:used_mem_bytes:sum # the sum of used memory across all containers in an entity (numerator for aggregate utilization calculations) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # aggregate up to entity - # First, calculate the used memory for each pod - sum by (namespace, pod) ( - container_memory_working_set_bytes{namespace!="",pod!="",image!=""} - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but we do not want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - ) - - record: entity_id:allocated_mem_bytes:sum # the sum of allocated memory across all containers in an entity (denominator for aggregate utilization calculations) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # aggregate up to entity (remove node) - entity_id:mem_usage_bytes_total_per_node:sum - ) - - record: entity_id:used_cpu:sum # the sum of used cpu across all containers in an entity (numerator for aggregate utilization calculations) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( - sum by (namespace, pod) ( - irate(container_cpu_usage_seconds_total{namespace!="",pod!="",image!=""}[5m]) - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - ) - - record: entity_id:allocated_cpu:sum # the sum of allocated cpu across all containers in an entity (denominator for aggregate utilization calculations) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # aggregate up to entity (remove node) - entity_id:cpu_usage_per_node:sum - ) - - record: entity_id:sm_occupancy:avg # the simple average of SM occupancy (a good generic measure of GPU utilization) per entity - expr: | - avg by (label_entity_type, label_domain, label_project, label_entity_id) ( - # First, grab the SM occupancy for each pod - max by (namespace, pod) ( - DCGM_FI_PROF_SM_OCCUPANCY # SM occupancy is a good proxy for actual GPU usage - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - ) - - record: entity_id:gpu_count:sum # the count of running gpu pods per entity (need this to weight the gpu utilization when aggregating upwards - i.e. project-level) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( - # First, grab the allocated gpu for each pod (which is always either 1 or zero, since k8s can't split gpus the way it can with cpu/memory) - max by (namespace, pod) ( - pod_gpu_allocation - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - ) - - record: entity_id:weighted_sm_occupancy:sum # product of SM occupancy and allocated GPU count (something like "used memory", numerator of weighted calcs) - expr: | - entity_id:sm_occupancy:avg - * on (label_domain, label_project, label_entity_type, label_entity_id) entity_id:gpu_count:sum - - record: entity_id:allocated_mem_cost:sum # Allocated cost of memory for each workflow/task execution and app. - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, type) ( - entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB - * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type - ) - - record: entity_id:allocated_cpu_cost:sum # Allocated cost of cpu for each workflow/task execution and app. 
- expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, type)( - entity_id:cpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - - record: entity_id:allocated_gpu_cost:sum # Allocated cost of gpu for each workflow/task execution and app. - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, type)( - entity_id:gpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - - record: entity_id:allocated_cost:sum # Allocated cost of memory, cpu, and gpu for each workflow/task execution and app. - expr: | - label_replace( - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # for the sum to work, the labels need to be different on each "or" element (type label) - entity_id:allocated_mem_cost:sum - or - entity_id:allocated_cpu_cost:sum - or - entity_id:allocated_gpu_cost:sum - ), - "type", "allocated", "", "" # add type info - ) - - record: entity_id:overhead_cost:sum # The amount of overhead costs (node costs that we can't allocate with container resources) to allocate to each entity (workflow/task execution or app) - expr: | - label_replace( - sum by (label_entity_type, label_entity_id, label_domain, label_project)( # Aggregate the per-node metrics up to workflow/task execution or app (label_entity_id) - # Start with each execution's and app's allocated cost per node - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( # for the sum to work, the labels need to be different on each "or" element (type label) - entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB - * on (node) group_left(type) label_replace(avg by (node) 
(node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:cpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:gpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - # Then divide out the total allocated cost per node to get the proportion of allocated cost associated with each entity - / on (node) group_left()( - sum by (node) ( # for the sum to work, the labels need to be different on each "or" element (type label) - entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB - * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:cpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:gpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - > 0 # need to avoid dividing by zero, or gaps in the data can cause NaNs to proliferate, borking all charts - ) - # Then multiply by the overhead cost per node - * on (node) group_left() ( - # To calculate overhead, start with the true cost of running each node - avg by (node)(kube_node_labels{label_flyte_org_node_role="worker"}) # only look at worker nodes - * on (node) max by (node) ( - node_total_hourly_cost{instance_type!=""} # sometimes, the instance_type can be null, causing an 
unlabeled label to show up in the Compute Costs dashboard charts - ) * (15 / 3600) # convert hourly cost to 15-secondly cost - # Then subtract out the total allocated cost on each node - - on (node) group_left()( - sum by (node) ( # for the sum to work, the labels need to be different on each "or" element (type label) - entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB - * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:cpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:gpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - ) - ) - ), - "type", "overhead", "", "" # add type info - ) - - record: entity_id:total_cost:sum # Total cost of each entity (workflow/task execution or app), including allocated (from container resources) and overhead (proportion of unallocated node costs) - expr: | - label_replace( - sum by (label_domain, label_project, label_entity_id, label_entity_type) ( - entity_id:allocated_cost:sum - or - entity_id:overhead_cost:sum - ), - "type", "total", "", "" # add type info - ) - - record: node:total_cost:sum # Total cost of all nodes - expr: | - sum ( - avg by (node)(kube_node_labels{label_flyte_org_node_role="worker", label_node_kubernetes_io_instance_type!=""}) # only look at worker nodes - * on (node) group_left() node_total_hourly_cost{instance_type!=""} * (15 / 3600) # convert hourly cost to 15-secondly cost - ) - - record: node_type:total_cost:sum # Total cost of nodes grouped by node type - expr: | - sum by (node_type)( - avg by 
(node)(kube_node_labels{label_flyte_org_node_role="worker", label_node_kubernetes_io_instance_type!=""}) # only look at worker nodes - * on (node) group_left(node_type) label_replace(node_total_hourly_cost{instance_type!=""}, "node_type", "$1", "instance_type", "(.*)") * (15 / 3600) # convert hourly cost to 15-secondly cost and rename label - ) - - record: node_type:uptime_hours:sum # Total uptime of nodes grouped by node type - expr: | - sum by (node_type)( - avg by (node, node_type)( # dedupe - label_replace(kube_node_labels{label_flyte_org_node_role="worker", label_node_kubernetes_io_instance_type!=""}, "node_type", "$1", "label_node_kubernetes_io_instance_type", "(.*)") # relabel - ) - ) * (15 / 3600) # convert to number of hours per 15-second observation # Aggregate the above into visible metrics - - name: cost_rollup_15m - interval: 15m - rules: - - record: execution_info15m - expr: | - max_over_time(execution_info[15m:15s]) - - record: app_info15m - expr: | - max_over_time(app_info[15m:15s]) - - record: workspace_info15m - expr: | - max_over_time(workspace_info[15m:15s]) - - record: entity_id:allocated_mem_bytes:sum15m - expr: | - sum_over_time(entity_id:allocated_mem_bytes:sum[15m:15s]) - - record: entity_id:used_mem_bytes:sum15m - expr: | - sum_over_time(entity_id:used_mem_bytes:sum[15m:15s]) - - record: entity_id:allocated_cpu:sum15m - expr: | - sum_over_time(entity_id:allocated_cpu:sum[15m:15s]) - - record: entity_id:used_cpu:sum15m - expr: | - sum_over_time(entity_id:used_cpu:sum[15m:15s]) - - record: entity_id:weighted_sm_occupancy:sum15m - expr: | - sum_over_time(entity_id:weighted_sm_occupancy:sum[15m:15s]) - - record: entity_id:gpu_count:sum15m - expr: | - sum_over_time(entity_id:gpu_count:sum[15m:15s]) - - record: entity_id:allocated_mem_cost:sum15m - expr: | - sum_over_time(entity_id:allocated_mem_cost:sum[15m:15s]) - - record: entity_id:allocated_cpu_cost:sum15m - expr: | - sum_over_time(entity_id:allocated_cpu_cost:sum[15m:15s]) - - record: 
entity_id:allocated_gpu_cost:sum15m - expr: | - sum_over_time(entity_id:allocated_gpu_cost:sum[15m:15s]) - - record: entity_id:allocated_cost:sum15m - expr: | - sum_over_time(entity_id:allocated_cost:sum[15m:15s]) - - record: entity_id:overhead_cost:sum15m - expr: | - sum_over_time(entity_id:overhead_cost:sum[15m:15s]) - - record: entity_id:total_cost:sum15m - expr: | - sum_over_time(entity_id:total_cost:sum[15m:15s]) - - record: node:total_cost:sum15m - expr: | - sum_over_time(node:total_cost:sum[15m:15s]) - - record: node_type:total_cost:sum15m - expr: | - sum_over_time(node_type:total_cost:sum[15m:15s]) - - record: node_type:uptime_hours:sum15m - expr: | - sum_over_time(node_type:uptime_hours:sum[15m:15s]) ---- -# Source: dataplane/templates/propeller/configmap.yaml -apiVersion: v1 -kind: ConfigMap -metadata: - name: flyte-propeller-config - namespace: union -data: - admin.yaml: | - admin: - clientId: 'dataplane-operator' - clientSecretLocation: /etc/union/secret/client_secret - endpoint: dns:/// - insecure: false - event: - capacity: 1000 - rate: 500 - type: admin - catalog.yaml: | - catalog-cache: - cache-endpoint: dns:/// - endpoint: dns:/// - insecure: false - type: fallback - use-admin-auth: true - copilot.yaml: | - plugins: - k8s: - co-pilot: - image: 'cr.flyte.org/flyteorg/flytecopilot:v1.14.1' - name: flyte-copilot- - start-timeout: 30s - core.yaml: | - propeller: - downstream-eval-duration: 30s - enable-admin-launcher: true - leader-election: - enabled: true - lease-duration: 15s - lock-config-map: - name: propeller-leader - namespace: 'union' - renew-deadline: 10s - retry-period: 2s - limit-namespace: all - literal-offloading-config: - enabled: true - max-workflow-retries: 30 - metadata-prefix: metadata/propeller - metrics-prefix: flyte - prof-port: 10254 - queue: - batch-size: -1 - batching-interval: 2s - queue: - base-delay: 5s - capacity: 1000 - max-delay: 120s - rate: 100 - type: maxof - sub-queue: - capacity: 100 - rate: 10 - type: bucket - type: 
batch - rawoutput-prefix: 's3://' - workers: 4 - workflow-reeval-duration: 30s webhook: certDir: /etc/webhook/certs + disableCreateMutatingWebhookConfig: true embeddedSecretManagerConfig: imagePullSecrets: enabled: true @@ -3807,377 +3383,1173 @@ data: namespace: 'union' type: 'K8s' listenPort: '9443' + localCert: true secretManagerTypes: - Embedded - K8s - serviceName: flyte-pod-webhook + secretName: union-pod-webhook + serviceName: union-pod-webhook servicePort: '443' - enabled_plugins.yaml: | - plugins: - connector-service: - defaultConnector: - defaultServiceConfig: '{"loadBalancingConfig": [{"round_robin":{}}]}' - endpoint: k8s:///flyteconnector.union:8000 - tasks: - task-plugins: - default-for-task-types: - actor: fast-task - container: container - container_array: k8s-array - sidecar: sidecar - enabled-plugins: - - container - - sidecar - - k8s-array - - echo - - fast-task - - connector-service - k8s.yaml: | - plugins: - k8s: - default-cpus: 100m - default-env-vars: [] - default-memory: 100Mi - logger.yaml: | - logger: - formatter: - type: json - level: 4 - show-source: true - namespace_config.yaml: | - namespace_config: - namespace_mapping: - template: '{{ project }}-{{ domain }}' - resource_manager.yaml: | - propeller: - resourcemanager: - type: noop - task_logs.yaml: | - plugins: - logs: - cloudwatch-enabled: false - dynamic-log-links: - - vscode: - displayName: VS Code Debugger - templateUris: - - /dataplane/pod/v1/generated_name/task/{{.executionProject}}/{{.executionDomain}}/{{.executionName}}/{{.nodeID}}/{{.taskRetryAttempt}}/{{.taskProject}}/{{.taskDomain}}/{{.taskID}}/{{.taskVersion}}/ - kubernetes-enabled: false - templates: - - displayName: Task Logs - scheme: TaskExecution - templateUris: - - /console/projects/{{.executionProject}}/domains/{{.executionDomain}}/executions/{{.executionName}}/nodeId/{{.nodeID}}/taskId/{{.taskID}}/attempt/{{.taskRetryAttempt}}/view/logs?duration=all&fromExecutionNav=true - storage.yaml: | - storage: - container: "" - 
type: stow - stow: - kind: s3 - config: - auth_type: accesskey - access_key_id: - secret_key: - disable_ssl: false - endpoint: - region: us-east-1 - enable-multicontainer: false - limits: - maxDownloadMBs: 1024 - cache: - max_size_mbs: 0 - target_gc_percent: 70 --- -# Source: dataplane/charts/fluentbit/templates/clusterrole.yaml +# Source: dataplane/charts/fluentbit/templates/clusterrole.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: release-name-fluentbit + labels: + helm.sh/chart: fluentbit-0.48.9 + app.kubernetes.io/name: fluentbit + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "3.2.8" + app.kubernetes.io/managed-by: Helm +rules: + - apiGroups: + - "" + resources: + - namespaces + - pods + verbs: + - get + - list + - watch +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: knative-serving-operator-aggregated + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +aggregationRule: + clusterRoleSelectors: + # This (along with escalate below) allows the Operator to pick up any + # roles that are provided to the admin of the cluster by knative serving + # automatically. 
+ - matchExpressions: + - {key: serving.knative.dev/release, operator: Exists} +rules: [] # Rules are automatically filled in by the controller manager. +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: knative-serving-operator-aggregated-stable + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +aggregationRule: + clusterRoleSelectors: + # This (along with escalate below) allows the Operator to pick up any + # roles that are provided to the admin of the cluster by knative serving + # automatically. + - matchExpressions: + - {key: app.kubernetes.io/name, operator: In, values: ["knative-serving"]} +rules: [] # Rules are automatically filled in by the controller manager. +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: knative-eventing-operator-aggregated + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +aggregationRule: + clusterRoleSelectors: + # This (along with escalate below) allows the Operator to pick up any + # roles that are provided to the admin of the cluster by knative eventing + # automatically. + - matchExpressions: + - {key: eventing.knative.dev/release, operator: Exists} +rules: [] # Rules are automatically filled in by the controller manager. +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: knative-eventing-operator-aggregated-stable + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +aggregationRule: + clusterRoleSelectors: + # This (along with escalate below) allows the Operator to pick up any + # roles that are provided to the admin of the cluster by knative eventing + # automatically. 
+ - matchExpressions: + - {key: app.kubernetes.io/name, operator: In, values: ["knative-eventing"]} +rules: [] # Rules are automatically filled in by the controller manager. +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: knative-serving-operator + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +rules: + - apiGroups: + - operator.knative.dev + resources: + - '*' + verbs: + - '*' + # Bootstrapping permissions. + # Roles that are explicitly bound buch which are specified by this Operator + # MUST be specified here with 'get' and 'bind'. + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterroles + resourceNames: + - system:auth-delegator + verbs: + - bind + - get + - apiGroups: + - rbac.authorization.k8s.io + resources: + - roles + resourceNames: + - extension-apiserver-authentication-reader + verbs: + - bind + - get + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterroles + - roles + verbs: + - create + - delete + # Escalate is necessary in order to create a role using cluster role aggregation, + # and to allow the Operator to bootstrap itself into the necessary set of + # permissions, even as those continue to evolve upstream. 
+ - escalate + - get + - list + - update + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterrolebindings + - rolebindings + verbs: + - create + - delete + - list + - get + - update + # Permissions required for Knative controller + # infra. + - apiGroups: + - apiregistration.k8s.io + resources: + - apiservices + verbs: + - update + - apiGroups: + - "" + resources: + - services + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - caching.internal.knative.dev + resources: + - images + verbs: + - '*' + - apiGroups: + - "" + resources: + - namespaces + verbs: + - get + - update + - watch + - apiGroups: + - '' + resources: + - events + verbs: + - create + - update + - patch + - apiGroups: + - '' + resources: + - configmaps + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - security.istio.io + - apps + - policy + resources: + - poddisruptionbudgets + - peerauthentications + - deployments + - daemonsets + - replicasets + - statefulsets + verbs: + - create + - delete + - get + - list + - watch + - update + - apiGroups: + - apiregistration.k8s.io + resources: + - apiservices + verbs: + - create + - delete + - get + - list + - apiGroups: + - autoscaling + resources: + - horizontalpodautoscalers + verbs: + - create + - delete + - get + - list + - apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - '*' + - apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - '*' + # Old resources that need cleaning up that are not in the knative-serving + # namespace. 
+ - apiGroups: + - "" + resources: + - services + - deployments + - horizontalpodautoscalers + resourceNames: + - knative-ingressgateway + verbs: + - delete + - apiGroups: + - "" + resources: + - configmaps + resourceNames: + - config-controller + verbs: + - delete + - apiGroups: + - "" + resources: + - serviceaccounts + resourceNames: + - knative-serving-operator + verbs: + - delete + # for contour TLS + - apiGroups: + - projectcontour.io + resources: + - httpproxies + - tlscertificatedelegations + verbs: + - get + - list + - watch + - update + - create + - delete + - deletecollection + - patch + # for security-guard + - apiGroups: + - guard.security.knative.dev + resources: + - guardians + verbs: + - get + - list + - watch + - create + - update + - patch + - delete + - apiGroups: + - "" + resources: + - configmaps + verbs: + - get + - list + - watch + - create + - update + - patch + - delete + - apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + verbs: + - create + - apiGroups: + - "" + resources: + - pods + verbs: + - get +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: knative-eventing-operator + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +rules: + - apiGroups: + - operator.knative.dev + resources: + - '*' + verbs: + - '*' + # Bootstrapping permissions. + # Roles that are explicitly bound buch which are specified by this Operator + # MUST be specified here with 'get' and 'bind'. + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterroles + - roles + verbs: + - create + - delete + # Escalate is necessary in order to create a role using cluster role aggregation, + # and to allow the Operator to bootstrap itself into the necessary set of + # permissions, even as those continue to evolve upstream. 
+ - escalate + - get + - list + - update + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterrolebindings + - rolebindings + verbs: + - create + - delete + - list + - get + - update + # Permissions required for Knative controller + # infra. + - apiGroups: + - apiregistration.k8s.io + resources: + - apiservices + verbs: + - update + - apiGroups: + - "" + resources: + - services + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - caching.internal.knative.dev + resources: + - images + verbs: + - '*' + - apiGroups: + - "" + resources: + - namespaces + verbs: + - get + - update + - watch + - apiGroups: + - '' + resources: + - events + verbs: + - create + - update + - patch + - apiGroups: + - '' + resources: + - configmaps + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - apps + resources: + - deployments + - daemonsets + - replicasets + - statefulsets + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - apiregistration.k8s.io + resources: + - apiservices + verbs: + - create + - delete + - get + - list + - apiGroups: + - autoscaling + resources: + - horizontalpodautoscalers + verbs: + - create + - delete + - update + - get + - list + - apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - '*' + - apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - '*' + - apiGroups: + - batch + resources: + - jobs + verbs: + - create + - delete + - update + - get + - list + - watch + # Old resources that need cleaning up that are not in the knative-eventing + # namespace. 
+ - apiGroups: + - "" + resources: + - serviceaccounts + resourceNames: + - knative-eventing-operator + verbs: + - delete + # for RabbitMQ messaging topology objects + - apiGroups: + - rabbitmq.com + resources: + - rabbitmqclusters + verbs: + - "get" + - "list" + - "watch" + - apiGroups: + - rabbitmq.com + resources: + - bindings + - queues + - exchanges + verbs: + - create + - delete + - get + - list + - patch + - update + - watch + - apiGroups: + - rabbitmq.com + resources: + - bindings/status + - queues/status + - exchanges/status + verbs: + - get + # for Kafka eventing source + - apiGroups: + - keda.sh + resources: + - scaledobjects + - scaledobjects/finalizers + - scaledobjects/status + - triggerauthentications + - triggerauthentications/status + verbs: + - get + - list + - watch + - update + - create + - delete + # Internal APIs + - apiGroups: + - "internal.kafka.eventing.knative.dev" + resources: + - "consumers" + - "consumers/status" + - "consumergroups" + - "consumergroups/status" + verbs: + - create + - get + - list + - watch + - patch + - update + - delete + - apiGroups: + - "internal.kafka.eventing.knative.dev" + resources: + - "consumers/finalizers" + - "consumergroups/finalizers" + verbs: + - update + - delete + - apiGroups: + - apps + resources: + - statefulsets/scale + verbs: + - get + - list + - watch + - update + - patch + - create + - delete + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterrolebindings + verbs: + - watch + - apiGroups: + - "*" + resources: + - configmaps + verbs: + - delete + - apiGroups: + - "*" + resources: + - configmaps + - services + verbs: + - get + - list + - watch + - update + - create + - delete + - apiGroups: + - "*" + resources: + - pods + verbs: + - list + - update + - get + - watch + - apiGroups: + - "*" + resources: + - pods/finalizers + verbs: + - get + - list + - create + - update + - delete + - apiGroups: + - "*" + resources: + - events + verbs: + - patch + - create + - apiGroups: + - "*" + 
resources: + - secrets + verbs: + - get + - list + - watch + - update + - create + - delete + - apiGroups: + - "*" + resources: + - nodes + verbs: + - get + - list + - watch + - apiGroups: + - "*" + resources: + - serviceaccounts + verbs: + - get + - list + - watch + - update + - create + - delete + - apiGroups: + - "*" + resources: + - configmaps + resourceNames: + - kafka-channel-config + verbs: + - patch + - apiGroups: + - "*" + resources: + - horizontalpodautoscalers + resourceNames: + - kafka-webhook + verbs: + - delete + - apiGroups: + - "*" + resources: + - leases + verbs: + - delete + - apiGroups: + - "*" + resources: + - poddisruptionbudgets + resourceNames: + - kafka-webhook + verbs: + - delete + - apiGroups: + - "*" + resources: + - services + verbs: + - patch + - apiGroups: + - "apps" + resources: + - deployments + verbs: + - deletecollection + # Eventing TLS + - apiGroups: + - "cert-manager.io" + resources: + - certificates + - issuers + - clusterissuers + verbs: + - create + - delete + - update + - list + - get + - watch + - apiGroups: + - "trust.cert-manager.io" + resources: + - bundles + verbs: + - create + - delete + - update + - list + - get + - watch +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: knative-operator-webhook + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +rules: + # For watching logging configuration and getting certs. + - apiGroups: + - "" + resources: + - "configmaps" + verbs: + - "get" + - "list" + - "watch" + # For manipulating certs into secrets. 
+ - apiGroups: + - "" + resources: + - "namespaces" + verbs: + - "get" + - "create" + - "update" + - "list" + - "watch" + - "patch" + # finalizers are needed for the owner reference of the webhook + - apiGroups: + - "" + resources: + - "namespaces/finalizers" + verbs: + - "update" + # For getting our Deployment so we can decorate with ownerref. + - apiGroups: + - "apps" + resources: + - "deployments" + verbs: + - "get" + - apiGroups: + - "apps" + resources: + - "deployments/finalizers" + verbs: + - update + # For actually registering our webhook. + - apiGroups: + - "admissionregistration.k8s.io" + resources: + - "mutatingwebhookconfigurations" + - "validatingwebhookconfigurations" + verbs: &everything + - "get" + - "list" + - "create" + - "update" + - "delete" + - "patch" + - "watch" + # For leader election + - apiGroups: + - "coordination.k8s.io" + resources: + - "leases" + verbs: *everything + # Necessary for conversion webhook. These are copied from the serving + # TODO: Do we really need all these permissions? 
+ - apiGroups: ["apiextensions.k8s.io"] + resources: ["customresourcedefinitions"] + verbs: ["get", "list", "create", "update", "delete", "patch", "watch"] +--- +# Source: dataplane/charts/fluentbit/templates/clusterrolebinding.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: release-name-fluentbit + labels: + helm.sh/chart: fluentbit-0.48.9 + app.kubernetes.io/name: fluentbit + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "3.2.8" + app.kubernetes.io/managed-by: Helm +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: release-name-fluentbit +subjects: + - kind: ServiceAccount + name: union-system + namespace: union +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# TODO: Consider restriction of non-aggregated role to knativeservings namespaces. 
+apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: knative-serving-operator + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-serving-operator +subjects: + - kind: ServiceAccount + name: knative-operator + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# TODO: Consider restriction of non-aggregated role to knativeeventing namespaces. +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: knative-eventing-operator + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-eventing-operator +subjects: + - kind: ServiceAccount + name: knative-operator + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: operator-webhook + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-operator-webhook +subjects: + - kind: ServiceAccount + name: operator-webhook + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: knative-serving-operator-aggregated + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-serving-operator-aggregated +subjects: + - kind: ServiceAccount + name: knative-operator + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: knative-serving-operator-aggregated-stable + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-serving-operator-aggregated-stable +subjects: + - kind: ServiceAccount + name: knative-operator + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: knative-eventing-operator-aggregated + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-eventing-operator-aggregated +subjects: + - kind: ServiceAccount + name: knative-operator + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: knative-eventing-operator-aggregated-stable + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-eventing-operator-aggregated-stable +subjects: + - kind: ServiceAccount + name: knative-operator + 
namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole +kind: Role metadata: - name: release-name-fluentbit + namespace: "union" + name: knative-operator-webhook labels: - helm.sh/chart: fluentbit-0.48.9 - app.kubernetes.io/name: fluentbit - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "3.2.8" - app.kubernetes.io/managed-by: Helm + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator rules: + # For manipulating certs into secrets. 
- apiGroups: - "" resources: - - namespaces - - pods + - "secrets" verbs: - - get - - list - - watch ---- -# Source: dataplane/charts/kube-state-metrics/templates/role.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - labels: - helm.sh/chart: kube-state-metrics-5.30.1 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: kube-state-metrics - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" - name: release-name-kube-state-metrics -rules: - -- apiGroups: ["certificates.k8s.io"] - resources: - - certificatesigningrequests - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - configmaps - verbs: ["list", "watch"] - -- apiGroups: ["batch"] - resources: - - cronjobs - verbs: ["list", "watch"] - -- apiGroups: ["extensions", "apps"] - resources: - - daemonsets - verbs: ["list", "watch"] - -- apiGroups: ["extensions", "apps"] - resources: - - deployments - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - endpoints - verbs: ["list", "watch"] - -- apiGroups: ["autoscaling"] - resources: - - horizontalpodautoscalers - verbs: ["list", "watch"] - -- apiGroups: ["extensions", "networking.k8s.io"] - resources: - - ingresses - verbs: ["list", "watch"] - -- apiGroups: ["batch"] - resources: - - jobs - verbs: ["list", "watch"] - -- apiGroups: ["coordination.k8s.io"] - resources: - - leases - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - limitranges - verbs: ["list", "watch"] - -- apiGroups: ["admissionregistration.k8s.io"] - resources: - - mutatingwebhookconfigurations - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - namespaces - verbs: ["list", "watch"] - -- apiGroups: ["networking.k8s.io"] - resources: - - networkpolicies - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - nodes - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - 
persistentvolumeclaims - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - persistentvolumes - verbs: ["list", "watch"] - -- apiGroups: ["policy"] - resources: - - poddisruptionbudgets - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - pods - verbs: ["list", "watch"] - -- apiGroups: ["extensions", "apps"] - resources: - - replicasets - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - replicationcontrollers - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - resourcequotas - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - secrets - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - services - verbs: ["list", "watch"] - -- apiGroups: ["apps"] - resources: - - statefulsets - verbs: ["list", "watch"] - -- apiGroups: ["storage.k8s.io"] - resources: - - storageclasses - verbs: ["list", "watch"] - -- apiGroups: ["admissionregistration.k8s.io"] - resources: - - validatingwebhookconfigurations - verbs: ["list", "watch"] - -- apiGroups: ["storage.k8s.io"] - resources: - - volumeattachments - verbs: ["list", "watch"] + - "get" + - "create" + - "update" + - "list" + - "watch" + - "patch" --- -# Source: dataplane/charts/opencost/templates/clusterrole.yaml -# Cluster role giving opencost to get, list, watch required resources -# No write permissions are required +# Source: dataplane/templates/monitoring/prometheus-rbac.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole +kind: Role metadata: - name: release-name-opencost + name: union-operator-prometheus-rbac + namespace: union labels: - helm.sh/chart: opencost-1.42.0 - app.kubernetes.io/name: opencost - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "1.111.0" - app.kubernetes.io/part-of: opencost - app.kubernetes.io/managed-by: Helm + release: release-name rules: + # Prometheus server scrape permissions - apiGroups: [""] resources: - - configmaps - - deployments - nodes - - pods + - nodes/proxy + - nodes/metrics 
- services - - resourcequotas - - replicationcontrollers - - limitranges - - persistentvolumeclaims - - persistentvolumes - - namespaces - endpoints + - pods + - ingresses + - configmaps verbs: - get - list - watch - - apiGroups: - - extensions - resources: - - daemonsets - - deployments - - replicasets - verbs: - - get - - list - - watch - - apiGroups: - - apps - resources: - - statefulsets - - deployments - - daemonsets - - replicasets - verbs: - - list - - watch - - apiGroups: - - batch - resources: - - cronjobs - - jobs - verbs: - - get - - list - - watch - - apiGroups: - - autoscaling - resources: - - horizontalpodautoscalers - verbs: - - get - - list - - watch - - apiGroups: - - policy + - apiGroups: ["extensions", "networking.k8s.io"] resources: - - poddisruptionbudgets + - ingresses/status + - ingresses verbs: - get - list - watch - - apiGroups: - - storage.k8s.io + - apiGroups: ["discovery.k8s.io"] resources: - - storageclasses + - endpointslices verbs: - get - list - watch ---- -# Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: union-clustersync-resource -rules: - - apiGroups: - - "" - - rbac.authorization.k8s.io - resources: - - configmaps - - namespaces - - pods - - resourcequotas - - roles - - rolebindings - - secrets - - services - - serviceaccounts - - clusterrolebindings - - podtemplates - verbs: - - '*' + # kube-state-metrics permissions (rbac.create=false on the subchart; managed here instead) + - apiGroups: ["certificates.k8s.io"] + resources: ["certificatesigningrequests"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["configmaps"] + verbs: ["list", "watch"] + - apiGroups: ["batch"] + resources: ["cronjobs"] + verbs: ["list", "watch"] + - apiGroups: ["extensions", "apps"] + resources: ["daemonsets"] + verbs: ["list", "watch"] + - apiGroups: ["extensions", "apps"] + resources: ["deployments"] + verbs: ["list", "watch"] + - apiGroups: 
[""] + resources: ["endpoints"] + verbs: ["list", "watch"] + - apiGroups: ["discovery.k8s.io"] + resources: ["endpointslices"] + verbs: ["list", "watch"] + - apiGroups: ["autoscaling"] + resources: ["horizontalpodautoscalers"] + verbs: ["list", "watch"] + - apiGroups: ["extensions", "networking.k8s.io"] + resources: ["ingresses"] + verbs: ["list", "watch"] + - apiGroups: ["batch"] + resources: ["jobs"] + verbs: ["list", "watch"] + - apiGroups: ["coordination.k8s.io"] + resources: ["leases"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["limitranges"] + verbs: ["list", "watch"] + - apiGroups: ["admissionregistration.k8s.io"] + resources: ["mutatingwebhookconfigurations"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["namespaces"] + verbs: ["list", "watch"] + - apiGroups: ["networking.k8s.io"] + resources: ["networkpolicies"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["persistentvolumeclaims"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["persistentvolumes"] + verbs: ["list", "watch"] + - apiGroups: ["policy"] + resources: ["poddisruptionbudgets"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["pods"] + verbs: ["list", "watch"] + - apiGroups: ["extensions", "apps"] + resources: ["replicasets"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["replicationcontrollers"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["resourcequotas"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["secrets"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["services"] + verbs: ["list", "watch"] + - apiGroups: ["apps"] + resources: ["statefulsets"] + verbs: ["list", "watch"] + - apiGroups: ["storage.k8s.io"] + resources: ["storageclasses"] + verbs: ["list", "watch"] + - apiGroups: ["admissionregistration.k8s.io"] + resources: ["validatingwebhookconfigurations"] + verbs: ["list", "watch"] + - apiGroups: ["storage.k8s.io"] + resources: ["volumeattachments"] + 
verbs: ["list", "watch"] --- # Source: dataplane/templates/nodeexecutor/serviceaccount.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole +kind: Role metadata: name: union-executor labels: @@ -4228,11 +4600,12 @@ rules: - delete - update --- -# Source: dataplane/templates/operator/serviceaccount-proxy.yaml +# Source: dataplane/templates/operator/serviceaccount-proxy-secret.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole +kind: Role metadata: - name: proxy-system + name: union-system-secret + namespace: union labels: app.kubernetes.io/name: operator-proxy app.kubernetes.io/instance: release-name @@ -4242,146 +4615,86 @@ rules: - apiGroups: - '*' resources: - - events - - flyteworkflows - - pods/log - - pods - - rayjobs - - resourcequotas + - secrets verbs: - get - list - - watch + - create + - update + - delete --- -# Source: dataplane/templates/operator/serviceaccount.yaml +# Source: dataplane/templates/operator/serviceaccount-proxy.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole +kind: Role metadata: - name: operator-system + name: proxy-system labels: - app.kubernetes.io/name: union-operator + app.kubernetes.io/name: operator-proxy app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm rules: - # Allow Access to all resources under flyte.lyft.com - - apiGroups: - - flyte.lyft.com - resources: - - flyteworkflows - - flyteworkflows/finalizers - verbs: - - get - - list - - watch - - create - - update - - delete - - patch - - post - - deletecollection - apiGroups: - '*' resources: - - resourcequotas + - events + - flyteworkflows + - pods/log - pods - - configmaps - - podtemplates - - secrets - - namespaces - - nodes + - rayjobs + - resourcequotas verbs: - get - list - watch - - create - - update - - delete - - nonResourceURLs: - - /metrics - verbs: - - get --- -# Source: dataplane/templates/prometheus/rbac.yaml +# Source: 
dataplane/templates/operator/serviceaccount.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole +kind: Role metadata: - name: union-operator-prometheus + name: operator-system labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/name: union-operator app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" + platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus -rules: - - apiGroups: [""] - resources: - - nodes - - nodes/proxy - - pods - - endpoints - - services - verbs: - - get - - list - - watch - - nonResourceURLs: - - /metrics - - /metrics/cadvisor - verbs: - - get ---- -# Source: dataplane/templates/propeller/serviceaccount-webhook.yaml -kind: ClusterRole -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: flytepropeller-webhook-role - namespace: union rules: - apiGroups: - - "*" + - '*' resources: - - mutatingwebhookconfigurations - secrets - - pods - - replicasets/finalizers + - deployments verbs: - get + - list + - watch - create - update - - patch ---- -# Source: dataplane/templates/propeller/serviceaccount.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: flytepropeller-role -rules: - # Allow RO access to PODS - apiGroups: - - "" + - flyte.lyft.com resources: - - pods + - flyteworkflows + - flyteworkflows/finalizers verbs: - get - list - watch - # Allow Event recording access - - apiGroups: - - "" - resources: - - events - verbs: - create - update - delete - patch - # Allow Access All plugin objects + - post + - deletecollection - apiGroups: - '*' resources: - - '*' + - resourcequotas + - pods + - configmaps + - podtemplates + - secrets + - namespaces + - nodes verbs: - get - list @@ -4389,148 +4702,129 @@ rules: - create - update - delete - - patch - # Allow Access to CRD - apiGroups: - - apiextensions.k8s.io + - serving.knative.dev resources: - - 
customresourcedefinitions + - revisions + - configurations + - services verbs: - get - list - watch - create - - delete - update - # Allow Access to all resources under flyte.lyft.com + - delete +--- +# Source: dataplane/templates/webhook/serviceaccount.yaml +kind: Role +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: union-webhook-role + namespace: union +rules: - apiGroups: - - flyte.lyft.com + - "*" resources: - - flyteworkflows - - flyteworkflows/finalizers + - mutatingwebhookconfigurations + - secrets + - pods + - replicasets/finalizers verbs: - get - - list - - watch - create - update - - delete - patch - - post - - deletecollection --- -# Source: dataplane/charts/fluentbit/templates/clusterrolebinding.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding +kind: RoleBinding metadata: - name: release-name-fluentbit + namespace: "union" + name: operator-webhook labels: - helm.sh/chart: fluentbit-0.48.9 - app.kubernetes.io/name: fluentbit - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "3.2.8" - app.kubernetes.io/managed-by: Helm -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: release-name-fluentbit + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator subjects: - kind: ServiceAccount - name: fluentbit-system - namespace: union ---- -# Source: dataplane/charts/kube-state-metrics/templates/clusterrolebinding.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - labels: - helm.sh/chart: kube-state-metrics-5.30.1 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: kube-state-metrics - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" - name: release-name-kube-state-metrics + name: operator-webhook + namespace: "union" roleRef: + kind: Role + name: knative-operator-webhook apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: release-name-kube-state-metrics -subjects: -- kind: ServiceAccount - name: release-name-kube-state-metrics - namespace: union --- -# Source: dataplane/charts/opencost/templates/clusterrolebinding.yaml +# Source: dataplane/templates/monitoring/prometheus-rbac.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding +kind: RoleBinding metadata: - name: release-name-opencost + name: union-operator-prometheus-rbac + namespace: union labels: - helm.sh/chart: opencost-1.42.0 - app.kubernetes.io/name: opencost - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "1.111.0" - app.kubernetes.io/part-of: opencost - app.kubernetes.io/managed-by: Helm -roleRef: - 
apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: release-name-opencost + release: release-name subjects: - kind: ServiceAccount - name: release-name-opencost + name: union-operator-prometheus namespace: union ---- -# Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: union-clustersync-resource roleRef: apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: union-clustersync-resource -subjects: - - kind: ServiceAccount - name: union-clustersync-system - namespace: union + kind: Role + name: union-operator-prometheus-rbac --- -# Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml +# Source: dataplane/templates/monitoring/prometheus-rbac.yaml +# kube-state-metrics service account RoleBinding apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding +kind: RoleBinding metadata: - name: union-clustersync-auth-delegator -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: system:auth-delegator + name: release-name-prometheus-kube-state-metrics + namespace: union + labels: + release: release-name subjects: - kind: ServiceAccount - name: union-clustersync-system + name: release-name-prometheus-kube-state-metrics namespace: union +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: union-operator-prometheus-rbac --- # Source: dataplane/templates/nodeexecutor/serviceaccount.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding +kind: RoleBinding metadata: name: union-executor labels: app: executor roleRef: apiGroup: rbac.authorization.k8s.io - kind: ClusterRole + kind: Role name: union-executor subjects: - kind: ServiceAccount - name: executor + name: union-system namespace: union --- -# Source: dataplane/templates/operator/serviceaccount-proxy.yaml +# Source: dataplane/templates/operator/serviceaccount-proxy-secret.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: 
ClusterRoleBinding +kind: RoleBinding metadata: - name: proxy-system + name: union-system-secret + namespace: union labels: app.kubernetes.io/name: operator-proxy app.kubernetes.io/instance: release-name @@ -4538,109 +4832,35 @@ metadata: app.kubernetes.io/managed-by: Helm roleRef: apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: proxy-system + kind: Role + name: union-system-secret subjects: - kind: ServiceAccount - name: proxy-system + name: union-system namespace: union --- -# Source: dataplane/templates/operator/serviceaccount.yaml +# Source: dataplane/templates/operator/serviceaccount-proxy.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding +kind: RoleBinding metadata: - name: operator-system + name: proxy-system labels: - app.kubernetes.io/name: union-operator + app.kubernetes.io/name: operator-proxy app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm roleRef: apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: operator-system -subjects: - - kind: ServiceAccount - name: operator-system - namespace: union ---- -# Source: dataplane/templates/prometheus/rbac.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: union-operator-prometheus - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: union-operator-prometheus -subjects: - - kind: ServiceAccount - name: union-operator-prometheus - namespace: union ---- -# Source: dataplane/templates/propeller/serviceaccount-webhook.yaml -# Create a binding from Role -> ServiceAccount -kind: ClusterRoleBinding -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: 
flytepropeller-webhook-binding - namespace: union -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: flytepropeller-webhook-role -subjects: - - kind: ServiceAccount - name: flytepropeller-webhook-system - namespace: union ---- -# Source: dataplane/templates/propeller/serviceaccount.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: flytepropeller-binding -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: flytepropeller-role + kind: Role + name: proxy-system subjects: - kind: ServiceAccount - name: flytepropeller-system + name: union-system namespace: union --- -# Source: dataplane/templates/operator/serviceaccount-proxy-secret.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - name: proxy-system-secret - namespace: union - labels: - app.kubernetes.io/name: operator-proxy - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm -rules: - - apiGroups: - - '*' - resources: - - secrets - verbs: - - get - - list - - create - - update - - delete ---- # Source: dataplane/templates/operator/serviceaccount.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: Role +kind: RoleBinding metadata: name: operator-system labels: @@ -4648,56 +4868,29 @@ metadata: app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm -rules: - - apiGroups: - - '*' - resources: - - secrets - - deployments - verbs: - - get - - list - - watch - - create - - update ---- -# Source: dataplane/templates/operator/serviceaccount-proxy-secret.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: proxy-system-secret - namespace: union - labels: - app.kubernetes.io/name: operator-proxy - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm roleRef: apiGroup: 
rbac.authorization.k8s.io kind: Role - name: proxy-system-secret + name: operator-system subjects: - kind: ServiceAccount - name: proxy-system + name: union-system namespace: union --- -# Source: dataplane/templates/operator/serviceaccount.yaml -apiVersion: rbac.authorization.k8s.io/v1 +# Source: dataplane/templates/webhook/serviceaccount.yaml +# Create a binding from Role -> ServiceAccount kind: RoleBinding +apiVersion: rbac.authorization.k8s.io/v1 metadata: - name: operator-system - labels: - app.kubernetes.io/name: union-operator - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm + name: union-webhook-binding + namespace: union roleRef: apiGroup: rbac.authorization.k8s.io kind: Role - name: operator-system + name: union-webhook-role subjects: - kind: ServiceAccount - name: operator-system + name: union-system namespace: union --- # Source: dataplane/charts/fluentbit/templates/service.yaml @@ -4720,23 +4913,63 @@ spec: protocol: TCP name: http selector: - app.kubernetes.io/name: fluentbit - app.kubernetes.io/instance: release-name + app.kubernetes.io/name: fluentbit + app.kubernetes.io/instance: release-name +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: v1 +kind: Service +metadata: + labels: + role: operator-webhook + app.kubernetes.io/component: operator-webhook + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator + name: operator-webhook + namespace: "union" +spec: + ports: + # Define metrics and profiling for them to be accessible within service meshes. + - name: http-metrics + port: 9090 + targetPort: 9090 + - name: http-profiling + port: 8008 + targetPort: 8008 + - name: https-webhook + port: 443 + targetPort: 8443 + selector: + role: operator-webhook --- -# Source: dataplane/charts/kube-state-metrics/templates/service.yaml +# Source: dataplane/charts/prometheus/charts/kube-state-metrics/templates/service.yaml apiVersion: v1 kind: Service metadata: name: release-name-kube-state-metrics namespace: union labels: - helm.sh/chart: kube-state-metrics-5.30.1 + helm.sh/chart: kube-state-metrics-5.25.1 app.kubernetes.io/managed-by: Helm app.kubernetes.io/component: metrics app.kubernetes.io/part-of: kube-state-metrics app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" + app.kubernetes.io/version: "2.13.0" annotations: prometheus.io/scrape: 'true' spec: @@ -4751,28 +4984,32 @@ spec: app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/instance: release-name --- -# Source: dataplane/charts/opencost/templates/service.yaml +# Source: dataplane/charts/prometheus/templates/service.yaml apiVersion: v1 kind: Service metadata: - name: release-name-opencost - namespace: union labels: - helm.sh/chart: opencost-1.42.0 - app.kubernetes.io/name: opencost + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "1.111.0" - app.kubernetes.io/part-of: opencost + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: prometheus + name: 
union-operator-prometheus + namespace: union spec: + ports: + - name: http + port: 80 + protocol: TCP + targetPort: 9090 selector: - app.kubernetes.io/name: opencost + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name + sessionAffinity: None type: "ClusterIP" - ports: - - name: http - port: 9003 - targetPort: 9003 --- # Source: dataplane/templates/clusterresourcesync/service.yaml apiVersion: v1 @@ -4796,6 +5033,31 @@ spec: app.kubernetes.io/name: clusterresourcesync app.kubernetes.io/instance: release-name --- +# Source: dataplane/templates/flyteconnector/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: flyteconnector + namespace: union + labels: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + app.kubernetes.io/managed-by: Helm +spec: + clusterIP: None + ports: + - name: grpc + port: 8000 + protocol: TCP + targetPort: grpc + - name: metric + port: 9090 + protocol: TCP + targetPort: metric + selector: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name +--- # Source: dataplane/templates/imagebuilder/service.yaml apiVersion: v1 kind: Service @@ -4821,7 +5083,7 @@ spec: apiVersion: v1 kind: Service metadata: - name: release-name-dataplane-executor + name: union-operator-executor labels: platform.union.ai/prometheus-group: "union-services" app: executor @@ -4887,39 +5149,14 @@ spec: app.kubernetes.io/name: union-operator app.kubernetes.io/instance: release-name --- -# Source: dataplane/templates/prometheus/service.yaml -apiVersion: v1 -kind: Service -metadata: - name: union-operator-prometheus - namespace: union - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus -spec: - type: ClusterIP - ports: - - port: 80 - targetPort: 
9090 - protocol: TCP - name: http - selector: - app.kubernetes.io/component: prometheus - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name ---- -# Source: dataplane/templates/propeller/service-webhook.yaml +# Source: dataplane/templates/webhook/service.yaml apiVersion: v1 kind: Service metadata: - name: flyte-pod-webhook + name: union-pod-webhook namespace: union labels: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm @@ -4927,7 +5164,7 @@ metadata: projectcontour.io/upstream-protocol.h2c: grpc spec: selector: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name ports: - name: https @@ -4939,23 +5176,23 @@ spec: port: 10254 targetPort: 10254 --- -# Source: dataplane/templates/propeller/service-webhook.yaml +# Source: dataplane/templates/webhook/service.yaml # Headless Service for cache invalidation — resolves to all pod IPs so that # we can fan out invalidation requests to every webhook replica. 
apiVersion: v1 kind: Service metadata: - name: flyte-pod-webhook-headless + name: union-pod-webhook-headless namespace: union labels: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm spec: clusterIP: None selector: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name ports: - name: cache-internal @@ -4963,32 +5200,6 @@ spec: port: 9443 targetPort: 9443 --- -# Source: dataplane/templates/propeller/service.yaml -apiVersion: v1 -kind: Service -metadata: - namespace: union - name: flytepropeller - labels: - platform.union.ai/prometheus-group: "union-services" - app.kubernetes.io/name: flytepropeller - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm -spec: - type: ClusterIP - ports: - - name: debug - protocol: TCP - port: 10254 - - name: fasttask - port: 15605 - protocol: TCP - targetPort: 15605 - selector: - app.kubernetes.io/name: flytepropeller - app.kubernetes.io/instance: release-name ---- # Source: dataplane/charts/fluentbit/templates/daemonset.yaml apiVersion: apps/v1 kind: DaemonSet @@ -5014,7 +5225,7 @@ spec: annotations: checksum/config: e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 spec: - serviceAccountName: fluentbit-system + serviceAccountName: union-system hostNetwork: false dnsPolicy: ClusterFirst containers: @@ -5066,20 +5277,217 @@ spec: tolerations: - operator: Exists --- -# Source: dataplane/charts/kube-state-metrics/templates/deployment.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: operator-webhook + namespace: "union" + labels: + app.kubernetes.io/component: operator-webhook + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +spec: + selector: + matchLabels: + app: operator-webhook + role: operator-webhook + template: + metadata: + labels: + app: operator-webhook + role: operator-webhook + app.kubernetes.io/component: operator-webhook + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator + sidecar.istio.io/inject: "false" + spec: + # To avoid node becoming SPOF, spread our replicas to different nodes. + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - podAffinityTerm: + labelSelector: + matchLabels: + app: operator-webhook + topologyKey: kubernetes.io/hostname + weight: 100 + serviceAccountName: operator-webhook + containers: + - name: operator-webhook + # This is the Go import path for the binary that is containerized + # and substituted here. 
+ image: gcr.io/knative-releases/knative.dev/operator/cmd/webhook@sha256:d3a7a3304629ccfcaacec620b50736e0b4c902a6328b83369b6cbaf7e94677c9 + resources: + requests: + cpu: 100m + memory: 100Mi + limits: + cpu: 500m + memory: 500Mi + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: SYSTEM_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: CONFIG_LOGGING_NAME + value: config-logging + - name: CONFIG_OBSERVABILITY_NAME + value: config-observability + - name: WEBHOOK_NAME + value: operator-webhook + - name: WEBHOOK_PORT + value: "8443" + - name: WEBHOOK_SECRET_NAME + value: operator-webhook-certs + - name: METRICS_DOMAIN + value: knative.dev/operator + - name: KUBERNETES_MIN_VERSION + value: "" + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + runAsNonRoot: true + capabilities: + drop: + - ALL + ports: + - name: metrics + containerPort: 9090 + - name: profiling + containerPort: 8008 + - name: https-webhook + containerPort: 8443 + readinessProbe: + periodSeconds: 1 + httpGet: + scheme: HTTPS + port: 8443 + httpHeaders: + - name: k-kubelet-probe + value: "webhook" + livenessProbe: + periodSeconds: 1 + httpGet: + scheme: HTTPS + port: 8443 + httpHeaders: + - name: k-kubelet-probe + value: "webhook" + failureThreshold: 6 + initialDelaySeconds: 120 + # Our webhook should gracefully terminate by lame ducking first, set this to a sufficiently + # high value that we respect whatever value it has configured for the lame duck grace period. + terminationGracePeriodSeconds: 300 +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: knative-operator + namespace: "union" + labels: + app.kubernetes.io/name: knative-operator + app.kubernetes.io/version: "1.16.0" +spec: + replicas: 1 + selector: + matchLabels: + name: knative-operator + template: + metadata: + labels: + name: knative-operator + app.kubernetes.io/name: knative-operator + app.kubernetes.io/version: "1.16.0" + sidecar.istio.io/inject: "false" + spec: + serviceAccountName: knative-operator + containers: + - name: knative-operator + image: gcr.io/knative-releases/knative.dev/operator/cmd/operator@sha256:0b5a3532417f9c8e7b6044e23f6f67ad932a74a40a4092ad965b6f173b2fd887 + resources: + requests: + cpu: 100m + memory: 100Mi + limits: + cpu: 1000m + memory: 1000Mi + imagePullPolicy: IfNotPresent + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: SYSTEM_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: METRICS_DOMAIN + value: knative.dev/operator + - name: CONFIG_LOGGING_NAME + value: config-logging + - name: CONFIG_OBSERVABILITY_NAME + value: config-observability + - name: KUBERNETES_MIN_VERSION + value: "" + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + runAsNonRoot: true + capabilities: + drop: + - ALL + ports: + - name: metrics + containerPort: 9090 +--- +# Source: dataplane/charts/prometheus/charts/kube-state-metrics/templates/deployment.yaml apiVersion: apps/v1 kind: Deployment metadata: name: release-name-kube-state-metrics namespace: union labels: - helm.sh/chart: 
kube-state-metrics-5.30.1 + helm.sh/chart: kube-state-metrics-5.25.1 app.kubernetes.io/managed-by: Helm app.kubernetes.io/component: metrics app.kubernetes.io/part-of: kube-state-metrics app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" + app.kubernetes.io/version: "2.13.0" spec: selector: matchLabels: @@ -5092,13 +5500,13 @@ spec: template: metadata: labels: - helm.sh/chart: kube-state-metrics-5.30.1 + helm.sh/chart: kube-state-metrics-5.25.1 app.kubernetes.io/managed-by: Helm app.kubernetes.io/component: metrics app.kubernetes.io/part-of: kube-state-metrics app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" + app.kubernetes.io/version: "2.13.0" spec: automountServiceAccountToken: true hostNetwork: false @@ -5115,8 +5523,9 @@ spec: args: - --port=8080 - --resources=certificatesigningrequests,configmaps,cronjobs,daemonsets,deployments,endpoints,horizontalpodautoscalers,ingresses,jobs,leases,limitranges,mutatingwebhookconfigurations,namespaces,networkpolicies,nodes,persistentvolumeclaims,persistentvolumes,poddisruptionbudgets,pods,replicasets,replicationcontrollers,resourcequotas,secrets,services,statefulsets,storageclasses,validatingwebhookconfigurations,volumeattachments + - --namespaces=union imagePullPolicy: IfNotPresent - image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.15.0 + image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.13.0 ports: - containerPort: 8080 name: "http" @@ -5136,7 +5545,7 @@ spec: httpGet: httpHeaders: path: /readyz - port: 8081 + port: 8080 scheme: HTTP initialDelaySeconds: 5 periodSeconds: 10 @@ -5151,196 +5560,204 @@ spec: - ALL readOnlyRootFilesystem: true --- -# Source: dataplane/charts/opencost/templates/deployment.yaml +# Source: dataplane/charts/prometheus/templates/deploy.yaml apiVersion: apps/v1 kind: Deployment metadata: - name: release-name-opencost - namespace: 
union labels: - helm.sh/chart: opencost-1.42.0 - app.kubernetes.io/name: opencost + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "1.111.0" - app.kubernetes.io/part-of: opencost + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: prometheus + name: union-operator-prometheus + namespace: union spec: - replicas: 1 selector: matchLabels: - app.kubernetes.io/name: opencost + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name - strategy: - rollingUpdate: - maxSurge: 1 - maxUnavailable: 1 - type: RollingUpdate + replicas: 1 + revisionHistoryLimit: 10 + strategy: + type: Recreate + rollingUpdate: null template: metadata: labels: - app.kubernetes.io/name: opencost + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: prometheus spec: - serviceAccountName: release-name-opencost + enableServiceLinks: true + serviceAccountName: union-operator-prometheus containers: - - name: release-name-opencost - image: ghcr.io/opencost/opencost:1.111.0@sha256:6aa68e52a24b14ba41f23db08d1b9db1429a1c0300f4c0381ecc2c61fc311a97 - imagePullPolicy: IfNotPresent + - name: prometheus-server-configmap-reload + image: "quay.io/prometheus-operator/prometheus-config-reloader:v0.76.0" + imagePullPolicy: "IfNotPresent" args: + - --watched-dir=/etc/config + - --listen-address=0.0.0.0:8080 + - --reload-url=http://127.0.0.1:9090/prometheus/-/reload ports: - - containerPort: 9003 - name: http - resources: - limits: - cpu: 1000m - memory: 4Gi - requests: - cpu: 500m - memory: 1Gi - startupProbe: - httpGet: - path: /healthz - port: 9003 - initialDelaySeconds: 10 - 
periodSeconds: 5 - failureThreshold: 30 + - containerPort: 8080 + name: metrics livenessProbe: httpGet: path: /healthz - port: 9003 - initialDelaySeconds: 10 - periodSeconds: 20 - failureThreshold: 3 + port: metrics + scheme: HTTP + initialDelaySeconds: 2 + periodSeconds: 10 readinessProbe: httpGet: path: /healthz - port: 9003 - initialDelaySeconds: 10 + port: metrics + scheme: HTTP periodSeconds: 10 + volumeMounts: + - name: config-volume + mountPath: /etc/config + readOnly: true + + - name: prometheus-server + image: "quay.io/prometheus/prometheus:v2.54.1" + imagePullPolicy: "IfNotPresent" + args: + - --storage.tsdb.retention.time=3d + - --config.file=/etc/config/prometheus.yml + - --storage.tsdb.path=/data + - --web.console.libraries=/etc/prometheus/console_libraries + - --web.console.templates=/etc/prometheus/consoles + - --web.enable-lifecycle + - --web.route-prefix=/prometheus + - --web.external-url=/prometheus/ + ports: + - containerPort: 9090 + readinessProbe: + httpGet: + path: /prometheus/-/ready + port: 9090 + scheme: HTTP + initialDelaySeconds: 30 + periodSeconds: 5 + timeoutSeconds: 4 failureThreshold: 3 - env: - - name: LOG_LEVEL - value: info - - name: CUSTOM_COST_ENABLED - value: "false" - - name: KUBECOST_NAMESPACE - value: union - - name: API_PORT - value: "9003" - - name: PROMETHEUS_SERVER_ENDPOINT - value: "http://union-operator-prometheus.union.svc:80/prometheus" - - name: CLUSTER_ID - value: "default-cluster" - - name: DATA_RETENTION_DAILY_RESOLUTION_DAYS - value: "15" - - name: CLOUD_COST_ENABLED - value: "false" - - name: CLOUD_COST_MONTH_TO_DATE_INTERVAL - value: "6" - - name: CLOUD_COST_REFRESH_RATE_HOURS - value: "6" - - name: CLOUD_COST_QUERY_WINDOW_DAYS - value: "7" - - name: CLOUD_COST_RUN_WINDOW_DAYS - value: "3" - # Add any additional provided variables + successThreshold: 1 + livenessProbe: + httpGet: + path: /prometheus/-/healthy + port: 9090 + scheme: HTTP + initialDelaySeconds: 30 + periodSeconds: 15 + timeoutSeconds: 10 + 
failureThreshold: 3
+            successThreshold: 1
+          resources:
+            limits:
+              cpu: "3"
+              memory: 3500Mi
+            requests:
+              cpu: "1"
+              memory: 1Gi
+          volumeMounts:
+            - name: config-volume
+              mountPath: /etc/config
+            - name: storage-volume
+              mountPath: /data
+              subPath: ""
+            - name: server-recording-rules
+              mountPath: /etc/config/recording
+              subPath: ""
+              readOnly: true
+      dnsPolicy: ClusterFirst
+      securityContext:
+        fsGroup: 65534
+        runAsGroup: 65534
+        runAsNonRoot: true
+        runAsUser: 65534
+      terminationGracePeriodSeconds: 300
+      volumes:
+        - name: config-volume
+          configMap:
+            name: union-operator-prometheus
+        - name: server-recording-rules
+          configMap:
+            name: union-recording-rules
+        - name: storage-volume
+          emptyDir:
+            sizeLimit: 10Gi
 ---
-# Source: dataplane/templates/clusterresourcesync/deployment.yaml
+# Source: dataplane/templates/flyteconnector/deployment.yaml
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: union-syncresources
+  name: flyteconnector
   namespace: union
-  labels:
-    app.kubernetes.io/name: clusterresourcesync
+  labels:
+    app.kubernetes.io/name: flyteconnector
     app.kubernetes.io/instance: release-name
-    platform.union.ai/service-group: release-name
     app.kubernetes.io/managed-by: Helm
 spec:
-  replicas: 1
+  replicas: 2
   selector:
-    matchLabels:
-      app.kubernetes.io/name: clusterresourcesync
+    matchLabels:
+      app.kubernetes.io/name: flyteconnector
       app.kubernetes.io/instance: release-name
   template:
     metadata:
       annotations:
-        configChecksum: "43486113af17e32bea848a9fe1704b1bc13db034b74a380e02a1dfa06246bf6"
-
-      labels:
-        platform.union.ai/zone: "dataplane"
-
-        app.kubernetes.io/name: clusterresourcesync
-        app.kubernetes.io/instance: release-name
-        platform.union.ai/service-group: release-name
-        app.kubernetes.io/managed-by: Helm
-    spec:
-      containers:
-        - command:
-            - clusterresource
-            - --config
-            - /etc/flyte/config/*.yaml
-            - clusterresource
-            - run
-          env:
-            - name: POD_NAME
-              valueFrom:
-                fieldRef:
-                  fieldPath: metadata.name
-            - name: POD_NAMESPACE
-              valueFrom:
-                fieldRef:
-
fieldPath: metadata.namespace - - name: GOMEMLIMIT - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.memory - - name: GOMAXPROCS - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.cpu - - name: CLUSTER_NAME - valueFrom: - secretKeyRef: - name: operator-cluster-name - key: cluster_name - - name: DEPLOYMENT_NAME - value: operator - - name: PROXY_SERVICE_URL - value: http://union-operator-proxy:8080 - - name: PROMETHEUS_SERVICE_URL - value: http://union-operator-prometheus:80 - - name: KNATIVE_PROXY_SERVICE_URL - value: http://kourier-internal - image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" + labels: + platform.union.ai/zone: "dataplane" + + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + app.kubernetes.io/managed-by: Helm + spec: + containers: + - command: + - c0 + image: "ghcr.io/flyteorg/flyte-connectors:py3.13-2.0.0b50.dev3-g695bb1db3.d20260122" imagePullPolicy: "IfNotPresent" - name: sync-cluster-resources + name: flyteconnector + readinessProbe: + failureThreshold: 3 + httpGet: + path: / + port: 9090 + scheme: HTTP + initialDelaySeconds: 5 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 5 + env: + - name: AIRFLOW_CONN_GOOGLE_CLOUD_DEFAULT + value: "google-cloud-platform://" + - name: LOG_LEVEL + value: "20" + ports: + - containerPort: 8000 + name: grpc + - containerPort: 9090 + name: metric resources: limits: - cpu: "1" - memory: 500Mi + cpu: "1.5" + ephemeral-storage: 100Mi + memory: 1500Mi requests: - cpu: 500m - memory: 100Mi - volumeMounts: - - name: resource-templates - mountPath: /etc/flyte/clusterresource/templates - - name: config-volume - mountPath: /etc/flyte/config - ports: - - name: debug - containerPort: 10254 - protocol: TCP - serviceAccountName: union-clustersync-system - volumes: - - configMap: - name: union-clusterresource-template - name: resource-templates - - configMap: - name: union-clusterresourcesync-config - name: config-volume + cpu: "1" + 
ephemeral-storage: 100Mi
+              memory: 1000Mi
+      serviceAccountName: flyteconnector
 ---
 # Source: dataplane/templates/imagebuilder/deployment.yaml
 apiVersion: apps/v1
 kind: Deployment
@@ -5362,17 +5779,15 @@ spec:
       app.kubernetes.io/instance: release-name
   template:
     metadata:
-      annotations:
-        container.apparmor.security.beta.kubernetes.io/buildkit: unconfined
       labels:
         platform.union.ai/zone: "dataplane"
         app.kubernetes.io/name: imagebuilder-buildkit
         app.kubernetes.io/instance: release-name
     spec:
-      serviceAccountName: "union-imagebuilder"
+      serviceAccountName: "union-system"
       containers:
         - name: "buildkit"
-          image: "docker.io/moby/buildkit:buildx-stable-1-rootless"
+          image: "docker.io/moby/buildkit:buildx-stable-1"
           imagePullPolicy: IfNotPresent
           env:
             - name: POD_NAME
@@ -5407,18 +5822,15 @@ spec:
             - name: KNATIVE_PROXY_SERVICE_URL
               value: http://kourier-internal
           volumeMounts:
-            - mountPath: /home/user/.local/share/buildkit
-              name: buildkitd
             - mountPath: /etc/buildkit
               name: buildkit-config
           args:
             - --config
             - /etc/buildkit/buildkitd.toml
             - --addr
-            - unix:///run/user/1000/buildkit/buildkitd.sock
+            - unix:///run/buildkit/buildkitd.sock
            - --addr
             - tcp://0.0.0.0:1234
-            - --oci-worker-no-process-sandbox
           ports:
             - name: tcp
               containerPort: 1234
@@ -5440,18 +5852,13 @@ spec:
             initialDelaySeconds: 5
             periodSeconds: 30
           securityContext:
-            seccompProfile: # Needs Kubernetes >= 1.19
-              type: Unconfined
-            runAsUser: 1000
-            runAsGroup: 1000
+            privileged: true
           resources:
             requests:
-              cpu: 1
-              ephemeral-storage: 20Gi
-              memory: 1Gi
+              cpu: 4
+              ephemeral-storage: 50Gi
+              memory: 4Gi
       volumes:
-        - name: buildkitd
-          emptyDir: {}
         - configMap:
             name: union-operator-buildkit
           name: buildkit-config
@@ -5480,16 +5887,18 @@ spec:
   template:
     metadata:
       annotations:
-        configChecksum: "b70edc56ad7dbaa4adb4206c5ffc13ddfdf676c4ff12a7132a6795c9958ba38"
+        configChecksum: "22ca0f09911140e21d1d5702aed040eaf20f690b624ea6311bda41c4cd3266e"
       labels:
         platform.union.ai/zone: "dataplane"
         app: executor
+        app.kubernetes.io/instance: 'release-name'
+
app.kubernetes.io/name: executor spec: securityContext: fsGroup: 1337 - serviceAccountName: executor + serviceAccountName: union-system volumes: - name: config-volume configMap: @@ -5499,7 +5908,7 @@ spec: image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" imagePullPolicy: IfNotPresent command: - - executor + - executorv2 - serve - --config - /etc/config/*.yaml @@ -5573,7 +5982,7 @@ spec: template: metadata: annotations: - configChecksum: "f4c720571931d4374bbaaa22101950fceed6bbb44dec2a3f0d0d8f1e0ab5fde" + configChecksum: "bb0a9121a9efbaa78a6516acc004f1c9cbc14775f49cb50de140ec933213962" labels: @@ -5588,9 +5997,7 @@ spec: sources: - configMap: name: union-operator - - configMap: - name: union-clusterresourcesync-config - serviceAccountName: proxy-system + serviceAccountName: union-system securityContext: {} containers: @@ -5679,7 +6086,7 @@ spec: template: metadata: annotations: - configChecksum: "f4c720571931d4374bbaaa22101950fceed6bbb44dec2a3f0d0d8f1e0ab5fde" + configChecksum: "bb0a9121a9efbaa78a6516acc004f1c9cbc14775f49cb50de140ec933213962" labels: @@ -5688,7 +6095,7 @@ spec: platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm spec: - serviceAccountName: operator-system + serviceAccountName: union-system securityContext: {} volumes: @@ -5764,81 +6171,16 @@ spec: containerPort: 10254 protocol: TCP --- -# Source: dataplane/templates/prometheus/deployment.yaml -apiVersion: apps/v1 -kind: Deployment -metadata: - name: union-operator-prometheus - namespace: union - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus -spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/component: prometheus - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - template: - metadata: - annotations: - 
configChecksum: "72a39ca838f02fc499675b5708fa77e2a4938e316a99315b59e33ef7f2d31ae" - labels: - app.kubernetes.io/component: prometheus - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - spec: - priorityClassName: system-cluster-critical - serviceAccountName: union-operator-prometheus - securityContext: - fsGroup: 65534 - runAsNonRoot: true - runAsUser: 65534 - fsGroupChangePolicy: OnRootMismatch - containers: - - name: prometheus - image: "prom/prometheus:v3.3.1" - args: - - --config.file=/etc/prometheus/prometheus.yml - - --web.external-url=/prometheus/ - - --web.route-prefix=/prometheus/ - - --storage.tsdb.retention.time=3d - ports: - - name: http - containerPort: 9090 - protocol: TCP - resources: - limits: - cpu: "3" - memory: 3500Mi - requests: - cpu: "1" - memory: 1Gi - securityContext: - allowPrivilegeEscalation: false - terminationMessagePolicy: FallbackToLogsOnError - volumeMounts: - - mountPath: /etc/prometheus - name: prometheus-config - volumes: - - name: prometheus-config - configMap: - name: union-operator-prometheus ---- -# Source: dataplane/templates/propeller/deployment-webhook.yaml +# Source: dataplane/templates/webhook/deployment.yaml +# Webhook deployment # Create the actual deployment apiVersion: apps/v1 kind: Deployment metadata: - name: flytepropeller-webhook + name: union-pod-webhook namespace: union labels: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm @@ -5846,19 +6188,19 @@ spec: replicas: 1 selector: matchLabels: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name template: metadata: labels: platform.union.ai/zone: "dataplane" - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name 
platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm annotations: - configChecksum: "f02b9746ff65e5fbb4b730cc9ee015824d61057531d60ea6078ee2a20155bce" + configChecksum: "77958aee0464eee4cb43c70036cf5ffa89f47f58c908845b9eeaecc7e32bba4" spec: securityContext: @@ -5866,64 +6208,8 @@ spec: fsGroupChangePolicy: Always runAsNonRoot: true runAsUser: 1001 - seLinuxOptions: - type: spc_t - serviceAccountName: flytepropeller-webhook-system - initContainers: - - name: generate-secrets - image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" - imagePullPolicy: "IfNotPresent" - command: - - flytepropeller - args: - - webhook - - init-certs - - --config - - /etc/flyte/config/*.yaml - env: - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: GOMEMLIMIT - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.memory - - name: GOMAXPROCS - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.cpu - - name: CLUSTER_NAME - valueFrom: - secretKeyRef: - name: operator-cluster-name - key: cluster_name - - name: DEPLOYMENT_NAME - value: operator - - name: PROXY_SERVICE_URL - value: http://union-operator-proxy:8080 - - name: PROMETHEUS_SERVICE_URL - value: http://union-operator-prometheus:80 - - name: KNATIVE_PROXY_SERVICE_URL - value: http://kourier-internal - volumeMounts: - - name: config-volume - mountPath: /etc/flyte/config - resources: - limits: - cpu: 1 - ephemeral-storage: 500Mi - memory: 500Mi - requests: - cpu: 200m - ephemeral-storage: 500Mi - memory: 500Mi + seLinuxOptions: null + serviceAccountName: union-system containers: - name: webhook image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" @@ -5988,102 +6274,63 @@ spec: volumes: - name: config-volume configMap: - name: flyte-propeller-config + name: union-pod-webhook-config - name: webhook-certs secret: - secretName: flyte-pod-webhook + secretName: union-pod-webhook --- 
-# Source: dataplane/templates/propeller/deployment.yaml -apiVersion: apps/v1 -kind: Deployment +# Source: dataplane/templates/flyteconnector/hpa.yaml +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler metadata: - namespace: union - name: flytepropeller + name: flyteconnector labels: - app.kubernetes.io/name: flytepropeller + app.kubernetes.io/name: flyteconnector app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/name: flytepropeller - app.kubernetes.io/instance: release-name - template: - metadata: - annotations: - configChecksum: "f02b9746ff65e5fbb4b730cc9ee015824d61057531d60ea6078ee2a20155bce" - - labels: - platform.union.ai/zone: "dataplane" - - - app.kubernetes.io/name: flytepropeller - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm - spec: - priorityClassName: system-cluster-critical - containers: - - command: - - flytepropeller - - --config - - /etc/flyte/config/*.yaml - - --propeller.cluster-id - - '' - env: - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: GOMEMLIMIT - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.memory - - name: GOMAXPROCS - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.cpu - - name: CLUSTER_NAME - valueFrom: - secretKeyRef: - name: operator-cluster-name - key: cluster_name - - name: DEPLOYMENT_NAME - value: operator - - name: PROXY_SERVICE_URL - value: http://union-operator-proxy:8080 - - name: PROMETHEUS_SERVICE_URL - value: http://union-operator-prometheus:80 - - name: KNATIVE_PROXY_SERVICE_URL - value: http://kourier-internal - image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" - imagePullPolicy: "IfNotPresent" - name: flytepropeller - ports: - - containerPort: 
10254 - resources: - limits: - cpu: "3" - memory: 3Gi - requests: - cpu: "1" - memory: 1Gi - volumeMounts: - - name: config-volume - mountPath: /etc/flyte/config - serviceAccountName: flytepropeller-system - volumes: - - configMap: - name: flyte-propeller-config - name: config-volume + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: flyteconnector + minReplicas: 2 + maxReplicas: 5 + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 80 + - type: Resource + resource: + name: memory + target: + type: Utilization + averageUtilization: 80 +--- +# Source: dataplane/templates/common/ingress.yaml +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: release-name-dataplane-serving + namespace: union + labels: + app.kubernetes.io/name: dataplane + app.kubernetes.io/instance: release-name +spec: + ingressClassName: "nginx" + rules: + - http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: kourier-internal + port: + number: 80 + host: "*.apps." 
--- # Source: dataplane/templates/common/ingress.yaml apiVersion: networking.k8s.io/v1 @@ -6185,6 +6432,267 @@ spec: number: 8080 host: "ingress-nginx-internal.ingress-nginx.svc.cluster.local" --- +# Source: dataplane/templates/webhook/mutatingwebhookconfiguration.yaml +apiVersion: admissionregistration.k8s.io/v1 +kind: MutatingWebhookConfiguration +metadata: + name: union-pod-webhook- + labels: + app.kubernetes.io/name: union-pod-webhook + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +webhooks: + - name: union-pod-webhook.flyte.org + admissionReviewVersions: + - v1 + - v1beta1 + clientConfig: + caBundle: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUROekNDQWgrZ0F3SUJBZ0lVSm9kQ09lem4vd1BHQTZjYkw1U3duSlpYT2hJd0RRWUpLb1pJaHZjTkFRRUwKQlFBd0t6RXBNQ2NHQTFVRUF3d2dabXg1ZEdWd2NtOXdaV3hzWlhJdGQyVmlhRzl2YXk1MWJtbHZiaTV6ZG1NdwpIaGNOTWpZd01USXpNREl6TmpVMVdoY05Nell3TVRJeE1ESXpOalUxV2pBck1Ta3dKd1lEVlFRRERDQm1iSGwwClpYQnliM0JsYkd4bGNpMTNaV0pvYjI5ckxuVnVhVzl1TG5OMll6Q0NBU0l3RFFZSktvWklodmNOQVFFQkJRQUQKZ2dFUEFEQ0NBUW9DZ2dFQkFJcjBSWDRGRjFNVTNYdFBmUHErMDUrMGI1Qm0xK3hsTVR5ZUp4bG5mWFJuRlgyegpPcHV6NmsreUpBQk4xRkxmNDVYRnlJclFlaW5DWWJtckZXalFRc3ZDWEloNkpPNGMwdmRMVVU5RERYcldWRFlsCkRNZytTUDlJVnZBMm5USkVFZ2RFREVsKzNWeThUbUJoNlNmcGthcXdHajY5SmhIdy9OeE9yZzM2bUY3TU5VZ1MKWXJVUkxqUFFzTUN6NElTVzhhQnEzS2IwRUFWUTA4V1lZQ0ZEbjY1aHBzVFdxR05rL3BKRnVTYjlsZll6RVdyYwpJeW5wNDhqTnIvNUdKS3NnWG9LeTUwZVo4UkppeTVyOUhENGtnSW15TVNQR2RDMXFZSVppRHRlZW8yOElxNk5nCmd1dWcyTGhlNndYYW9YZXFCQ2d3SHpwWFVXM1hJVzI5eFoxUkFRRUNBd0VBQWFOVE1GRXdIUVlEVlIwT0JCWUUKRktITDQ1RWhQaVg5bnlRN241QUk0YUZVNFlCL01COEdBMVVkSXdRWU1CYUFGS0hMNDVFaFBpWDlueVE3bjVBSQo0YUZVNFlCL01BOEdBMVVkRXdFQi93UUZNQU1CQWY4d0RRWUpLb1pJaHZjTkFRRUxCUUFEZ2dFQkFEeTRac21XCkNjZ3B5TmI4cU1ydzNYclBNejZLMTRlTVhqQVk1bERFYXM1NklLZExoZFM1bi9sWk95bENuUUtOZ3RHS0VrY3UKT05tMEE2Z0xNT3UzcE43dG9aNXh0WktOcnUvS0x6dm44QUNNNndCbUR0bG5oRERDa3ZUTHBVRzh6UTZSNmlaegpzTWlhVmpLSUJiWkJsZXNYSHZZQnQ2YlRTN2o1QWM2aTdYc1RPK3ZqQ2
RUQjVYdCtEZ0x2QnBJRWh2ZGJzNXhmClFtc3ZzQmwxenJnR3JXemo4d0txcXNUMzZaOVU1TzdSK1RJMlQrUDVOR25SeHNQOGJ0R0RtMGZ5SW55WlJrSkcKVTdpVFJyMlZRWlRyaTNZQ2dlVjN1UjFtMnJjc2hyU1Avb082T0QyTXR5Q1FMRWtFK1pxVUVJVnpSYmVaNXl2RQptM29YMFV2OUtBUy9pWTA9Ci0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K + service: + name: union-pod-webhook + namespace: union + path: /mutate--v1-pod/secrets + port: 443 + failurePolicy: Fail + matchPolicy: Equivalent + namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: union + objectSelector: + matchLabels: + inject-flyte-secrets: "true" + organization: '' + reinvocationPolicy: Never + rules: + - apiGroups: + - '*' + apiVersions: + - v1 + operations: + - CREATE + resources: + - pods + scope: '*' + sideEffects: NoneOnDryRun + timeoutSeconds: 30 +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Imported via https://github.com/knative/operator/releases/download/knative-v1.16.0/operator.yaml +# Copyright 2024 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +# Source: dataplane/templates/propeller/serviceaccount.yaml +--- +--- +# Source: dataplane/templates/serving/knative-serving.yaml +apiVersion: operator.knative.dev/v1beta1 +kind: KnativeServing +metadata: + name: union-operator-serving + namespace: union + labels: + helm.sh/chart: dataplane-2026.4.4 + app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: serving +spec: + config: + deployment: + progress-deadline: "30m" + queue-sidecar-cpu-request: "25m" + queue-sidecar-cpu-limit: "1000m" + queue-sidecar-memory-request: "400Mi" + queue-sidecar-memory-limit: "800Mi" + queue-sidecar-ephemeral-storage-request: "512Mi" + queue-sidecar-ephemeral-storage-limit: "1024Mi" + registries-skipping-tag-resolving: managed.cr.union.ai + features: + kubernetes.podspec-affinity: "enabled" + kubernetes.podspec-nodeselector: "enabled" + kubernetes.podspec-tolerations: "enabled" + kubernetes.podspec-fieldref: "enabled" + kubernetes.podspec-dnspolicy: "enabled" + kubernetes.podspec-schedulername: "enabled" + kubernetes.podspec-securitycontext: "enabled" + network: + ingress-class: "kourier.ingress.networking.knative.dev" + high-availability: + replicas: 2 + ingress: + kourier: + enabled: true + bootstrap-configmap: "union-operator-serving-envoy-bootstrap" + service-type: ClusterIP + podDisruptionBudgets: + - name: 3scale-kourier-gateway-pdb + minAvailable: 50% + - name: activator-pdb + minAvailable: 50% + - name: webhook-pdb + minAvailable: 50% + registry: + override: + # TODO(jeev): Wire up Union fork of Envoy + 3scale-kourier-gateway/kourier-gateway: ghcr.io/unionai/envoy:456fed84d4ad9a9dfb186d117d9362e9dc0f7c1f + # TODO(jeev): Wire up Union fork of Kourier + net-kourier-controller/controller: ghcr.io/unionai/kourier@sha256:5804c348d15b3959604e3e3ceed216c3a1c7b32cbe254c7d3eb02a35e62ba9c4 + workloads: + - name: 3scale-kourier-gateway + 
labels: + helm.sh/chart: dataplane-2026.4.4 + app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: 3scale-kourier-gateway + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchLabels: + app.kubernetes.io/name: 3scale-kourier-gateway + app.kubernetes.io/instance: release-name + topologyKey: topology.kubernetes.io/zone + annotations: + checksum/bootstrap-config: 6469f6f592eebc9e0c676d8ccc359a05bc799c0988e474b2da942c4cc328f656 + env: + - container: kourier-gateway + envVars: + - name: UNION_AUTHZ_TENANTAUTHURL + value: "https:///me" + - name: UNION_AUTHZ_TENANTAUTHSIGNINURL + value: "https:///login" + - name: UNION_AUTHZ_TENANTCONTROLPLANEURL + value: "https://" + resources: + - container: kourier-gateway + limits: + cpu: "2" + memory: 2Gi + requests: + cpu: "1" + memory: 1Gi + - name: activator + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - activator + topologyKey: topology.kubernetes.io/zone + - name: autoscaler + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - autoscaler + topologyKey: topology.kubernetes.io/zone + - name: autoscaler-hpa + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - autoscaler-hpa + topologyKey: topology.kubernetes.io/zone + - name: controller + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + 
matchExpressions: + - key: app + operator: In + values: + - controller + topologyKey: topology.kubernetes.io/zone + - name: net-kourier-controller + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - net-kourier-controller + topologyKey: topology.kubernetes.io/zone + env: + - container: controller + envVars: + - name: KOURIER_UNION_AUTHZ_ENABLED + value: "false" + resources: + - container: controller + limits: + cpu: "1" + memory: 1Gi + requests: + cpu: 500m + memory: 500Mi + - name: webhook + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - webhook + topologyKey: topology.kubernetes.io/zone +--- +# Source: dataplane/templates/common/task-podtemplate.yaml +apiVersion: v1 +kind: PodTemplate +metadata: + name: task-template + namespace: union +template: + spec: + serviceAccountName: union + containers: + - name: default + image: docker.io/rwgrim/docker-noop +--- # Source: dataplane/charts/fluentbit/templates/tests/test-connection.yaml apiVersion: v1 kind: Pod diff --git a/tests/generated/dataplane.gcp.yaml b/tests/generated/dataplane.gcp.yaml new file mode 100644 index 00000000..f69eca8e --- /dev/null +++ b/tests/generated/dataplane.gcp.yaml @@ -0,0 +1,6636 @@ +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v1 +kind: ServiceAccount +metadata: + name: knative-operator + namespace: "union" + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: operator-webhook + namespace: "union" + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +--- +# Source: dataplane/charts/prometheus/charts/kube-state-metrics/templates/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +automountServiceAccountToken: true +metadata: + labels: + helm.sh/chart: kube-state-metrics-5.25.1 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: metrics + app.kubernetes.io/part-of: kube-state-metrics + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2.13.0" + name: release-name-kube-state-metrics + namespace: union +--- +# Source: dataplane/charts/prometheus/templates/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + labels: + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: prometheus + name: union-operator-prometheus + namespace: union + annotations: + {} +--- +# Source: dataplane/templates/common/system-serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: union-system + namespace: union + annotations: + iam.gke.io/gcp-service-account: union-backend@test-gcp-project-123.iam.gserviceaccount.com +--- +# Source: dataplane/templates/common/union-serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: union + namespace: union + annotations: + iam.gke.io/gcp-service-account: union-worker@test-gcp-project-123.iam.gserviceaccount.com +automountServiceAccountToken: true +--- +# Source: dataplane/templates/flyteconnector/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: flyteconnector + namespace: union + labels: + app.kubernetes.io/name: flyteconnector + 
app.kubernetes.io/instance: release-name + app.kubernetes.io/managed-by: Helm +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v1 +kind: Secret +metadata: + name: operator-webhook-certs + namespace: "union" + labels: + app.kubernetes.io/component: operator-webhook + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +# The data is populated at install time. +--- +# Source: dataplane/templates/common/auth-secret.yaml +apiVersion: v1 +kind: Secret +metadata: + name: union-secret-auth + namespace: union +type: Opaque +data: + # TODO(rob): update or configure operator to use client_secret like all the other components. 
+ app_secret: dGVzdC1ub3QtcmVhbC1zZWNyZXQ= + client_secret: dGVzdC1ub3QtcmVhbC1zZWNyZXQ= +--- +# Source: dataplane/templates/common/cluster-secret.yaml +apiVersion: v1 +kind: Secret +metadata: + name: operator-cluster-name +type: Opaque +data: + cluster_name: dGVzdC1lMmUtZ2Nw +--- +# Source: dataplane/templates/webhook/mutatingwebhookconfiguration.yaml +apiVersion: v1 +kind: Secret +metadata: + name: union-pod-webhook + namespace: union + labels: + app.kubernetes.io/name: union-pod-webhook + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +type: Opaque +data: + ca.crt: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUROekNDQWgrZ0F3SUJBZ0lVSm9kQ09lem4vd1BHQTZjYkw1U3duSlpYT2hJd0RRWUpLb1pJaHZjTkFRRUwKQlFBd0t6RXBNQ2NHQTFVRUF3d2dabXg1ZEdWd2NtOXdaV3hzWlhJdGQyVmlhRzl2YXk1MWJtbHZiaTV6ZG1NdwpIaGNOTWpZd01USXpNREl6TmpVMVdoY05Nell3TVRJeE1ESXpOalUxV2pBck1Ta3dKd1lEVlFRRERDQm1iSGwwClpYQnliM0JsYkd4bGNpMTNaV0pvYjI5ckxuVnVhVzl1TG5OMll6Q0NBU0l3RFFZSktvWklodmNOQVFFQkJRQUQKZ2dFUEFEQ0NBUW9DZ2dFQkFJcjBSWDRGRjFNVTNYdFBmUHErMDUrMGI1Qm0xK3hsTVR5ZUp4bG5mWFJuRlgyegpPcHV6NmsreUpBQk4xRkxmNDVYRnlJclFlaW5DWWJtckZXalFRc3ZDWEloNkpPNGMwdmRMVVU5RERYcldWRFlsCkRNZytTUDlJVnZBMm5USkVFZ2RFREVsKzNWeThUbUJoNlNmcGthcXdHajY5SmhIdy9OeE9yZzM2bUY3TU5VZ1MKWXJVUkxqUFFzTUN6NElTVzhhQnEzS2IwRUFWUTA4V1lZQ0ZEbjY1aHBzVFdxR05rL3BKRnVTYjlsZll6RVdyYwpJeW5wNDhqTnIvNUdKS3NnWG9LeTUwZVo4UkppeTVyOUhENGtnSW15TVNQR2RDMXFZSVppRHRlZW8yOElxNk5nCmd1dWcyTGhlNndYYW9YZXFCQ2d3SHpwWFVXM1hJVzI5eFoxUkFRRUNBd0VBQWFOVE1GRXdIUVlEVlIwT0JCWUUKRktITDQ1RWhQaVg5bnlRN241QUk0YUZVNFlCL01COEdBMVVkSXdRWU1CYUFGS0hMNDVFaFBpWDlueVE3bjVBSQo0YUZVNFlCL01BOEdBMVVkRXdFQi93UUZNQU1CQWY4d0RRWUpLb1pJaHZjTkFRRUxCUUFEZ2dFQkFEeTRac21XCkNjZ3B5TmI4cU1ydzNYclBNejZLMTRlTVhqQVk1bERFYXM1NklLZExoZFM1bi9sWk95bENuUUtOZ3RHS0VrY3UKT05tMEE2Z0xNT3UzcE43dG9aNXh0WktOcnUvS0x6dm44QUNNNndCbUR0bG5oRERDa3ZUTHBVRzh6UTZSNmlaegpzTWlhVmpLSUJiWkJsZXNYSHZZQnQ2YlRTN2o1QWM2aTdYc1RPK3ZqQ2RUQjVYdCtEZ0x2QnBJRWh2ZGJzNXhmClFtc3ZzQm
wxenJnR3JXemo4d0txcXNUMzZaOVU1TzdSK1RJMlQrUDVOR25SeHNQOGJ0R0RtMGZ5SW55WlJrSkcKVTdpVFJyMlZRWlRyaTNZQ2dlVjN1UjFtMnJjc2hyU1Avb082T0QyTXR5Q1FMRWtFK1pxVUVJVnpSYmVaNXl2RQptM29YMFV2OUtBUy9pWTA9Ci0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K + tls.crt: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUQxakNDQXI2Z0F3SUJBZ0lVREtRN3A4MFpIdkQyUE5Ya3p3VGRLem9iUjg4d0RRWUpLb1pJaHZjTkFRRUwKQlFBd0t6RXBNQ2NHQTFVRUF3d2dabXg1ZEdWd2NtOXdaV3hzWlhJdGQyVmlhRzl2YXk1MWJtbHZiaTV6ZG1NdwpIaGNOTWpZd01USXpNREl6TmpVMVdoY05NamN3TVRJek1ESXpOalUxV2pBck1Ta3dKd1lEVlFRRERDQm1iSGwwClpYQnliM0JsYkd4bGNpMTNaV0pvYjI5ckxuVnVhVzl1TG5OMll6Q0NBU0l3RFFZSktvWklodmNOQVFFQkJRQUQKZ2dFUEFEQ0NBUW9DZ2dFQkFMNnMxUTRaZ3ZxVVdNdFp5VC9EKytEQWl6WmR0Q1lkVzArSGk4S3phQ1Vjbi9zZwo3a21NTEhDM2ZlZWtZK2ZUNENHUTd6K1l2WjRWWSt6WGljKzdrNndLbUZqdm1acEM1Rm96OWo1Q21NRk5HZmlWCnRobzVvSU9MUkIzNXRYMU1ROHUwNFVzT29QcjdMK016eEdxWXVmTFdtN1NVdjh0eEp3ZFMxZVlzMllNV1pMeU4KekI4akdWY01Qc2ZNVkdxdFhTTVljQ3hNYURKaTM1N0VEZSs3clM2U2ZtbnFtYjJxekdQM1BRRStCUm91Yk5nLwo4NCt4c253VXBLZUFpem95dFM2c2ZlOTdMUEtRTW9oVzd2TjZORFJrLy9xSmpHem9iZmZwUHU3endhWHVXMWtMCmllc2g2cVRrdlZLSTF3WFJ2WHArVU5GbjZzdDIwdEVlRks0ZzMwOENBd0VBQWFPQjhUQ0I3akFKQmdOVkhSTUUKQWpBQU1Bc0dBMVVkRHdRRUF3SUY0RENCa3dZRFZSMFJCSUdMTUlHSWdoWm1iSGwwWlhCeWIzQmxiR3hsY2kxMwpaV0pvYjI5cmdoeG1iSGwwWlhCeWIzQmxiR3hsY2kxM1pXSm9iMjlyTG5WdWFXOXVnaUJtYkhsMFpYQnliM0JsCmJHeGxjaTEzWldKb2IyOXJMblZ1YVc5dUxuTjJZNEl1Wm14NWRHVndjbTl3Wld4c1pYSXRkMlZpYUc5dmF5NTEKYm1sdmJpNXpkbU11WTJ4MWMzUmxjaTVzYjJOaGJEQWRCZ05WSFE0RUZnUVV1d25wcDZ1MTNKL0NsNnI3eDZ1Sgo5YmZ1RDhzd0h3WURWUjBqQkJnd0ZvQVVvY3Zqa1NFK0pmMmZKRHVma0FqaG9WVGhnSDh3RFFZSktvWklodmNOCkFRRUxCUUFEZ2dFQkFFMmlOa1MzczhGT3Z0cDhrVmJNZUN2amxIQytvYTVlZTFGNmowNlZ3bFJPOWdxOXJla1UKMGptWUhNWGNjWjcrZXJkNFRQRXgwK3ZqMzNhRTV3cDFmdVJJL2xIZHhuOUpJRHg2bTFQTHdVa3p1WG9EekYxLwozVXVIWU1xOGkrT2xsY0tqbmdVMUFUNmRWRCtIbFVWeHpMdjlNazdPb05SQUY5dTJHa1hiUHdCN0I1ZGRxdWxlCmtNcXBxbDhES2s4SVNVelVsTjNBb2Q5NmJUelIyV3RCVkxpTi9zRHZtTDZwZThFYm55RnBUb1BZTDlGeWZOcmIKT1ZPZEJwUkZFTG1KOHZaZWY5d1lxRmVXMTE3SGV5VDArZkxGZVNSemZUNVVWUlVraVFBcUVJdUhHUVRrOV
Fabgo1WnYvNUF0eEg3WE1rcXR0VE1zMFlSUG0xYjQzY3FBeFdhND0KLS0tLS1FTkQgQ0VSVElGSUNBVEUtLS0tLQo= + tls.key: LS0tLS1CRUdJTiBQUklWQVRFIEtFWS0tLS0tCk1JSUV2UUlCQURBTkJna3Foa2lHOXcwQkFRRUZBQVNDQktjd2dnU2pBZ0VBQW9JQkFRQytyTlVPR1lMNmxGakwKV2NrL3cvdmd3SXMyWGJRbUhWdFBoNHZDczJnbEhKLzdJTzVKakN4d3QzM25wR1BuMCtBaGtPOC9tTDJlRldQcwoxNG5QdTVPc0NwaFk3NW1hUXVSYU0vWStRcGpCVFJuNGxiWWFPYUNEaTBRZCtiVjlURVBMdE9GTERxRDYreS9qCk04UnFtTG55MXB1MGxML0xjU2NIVXRYbUxObURGbVM4amN3Zkl4bFhERDdIekZScXJWMGpHSEFzVEdneVl0K2UKeEEzdnU2MHVrbjVwNnBtOXFzeGo5ejBCUGdVYUxtellQL09Qc2JKOEZLU25nSXM2TXJVdXJIM3ZleXp5a0RLSQpWdTd6ZWpRMFpQLzZpWXhzNkczMzZUN3U4OEdsN2x0WkM0bnJJZXFrNUwxU2lOY0YwYjE2ZmxEUlorckxkdExSCkhoU3VJTjlQQWdNQkFBRUNnZ0VBRjV2SnMzbTRMSE9DdlFic2NwOC9DQmgvQkNlOC92MGlpYW5VUmJLMkFlWlYKN3Arb3NXV1FPUktYSGIvT2VPMVVjb09DQkFOUzh3aGQrM3pDZlB5U0w3cU9HM0RyT1Z6djdqVkIxM3FpZEVpcQpId3ZXWk0vZXpuckhYOWpEdm5SYmJwVUNVaXRKQmxwa2x3S1pYc056UHB5UTRkNkxFdEw3VEo1V2lxM2g3cWQzCjhncmJoeWlWNDhZRFAyYkhpZGt0S2QvMHBNL3piSzZPTS9iMFpLUmRBWVpHQXlOcklMaFJ3eDNsQXE2SDVqL20KUHBuNlZEdVQ3Y1h6UFhmNTgxR0IyNjQxWFJVTXRNWGVEREtaU3JudnIrbG5tV2lsYXZmNkg3ZHp4NlRBMmk0RApybCtIRVdUN1RSQUZGMnZ5SVUrYSt5RzNPVFBKRCtmVTdCWWh4Zld4aFFLQmdRRGVvcXVCbzQxK1dKU255blZnCmNWMHZoUEkwZ0dyRGQzNFdIbCtGNTZuSmxuMVVlbUQ2SUQ0TUhnNVpHWmNmd3NkRlB0aHl1VnB0OVl1SW11SGQKRHE5ZDhpWnpHUzJVbStkSWlsWjhnZXJ5Q3dON3hIbFM3SzJrc0RmRGJwYmZZSXZ1LzRnSExZRHh3YzQ0OG54ZwpWNGpCUFY2QTdYTHRqQzZzdFE2OW9uTlIwd0tCZ1FEYlFBUTRGdWpnUjJUOXh2Y01MdXhMbm1YaHE4U2NoZlM5CkRoWndDUzZIeHB1YlBxblBjQ04vQUZjbDZ1cDRXVEVZdVY4bEpwcGwwRmpYQ2lMdkhuUU15cm5vRjlUOThpanQKZGVHYjF6ZWlwRUpndFpNVHkvOGdERWorZU1wSFBuYTZwQVpMOVpRVUdLR2dDNGQ0SzljdVNHMmRlb1dSU0ViSQpMVGgzeGpXVEZRS0JnQStJRWFidG5nVmVjS0IwQTFSREZGa29VUzFRZUNKQ3g4MExPV2JDRHBvOW9XaXZVT3lpCkt6SDFOdE1JY2Y2SlBCV2NtTVVJSVVMaWltVnhTS2gvU2NTb0MvNmpsd1p6Q2VPSm94YjBpVXR4Y1VERktDR2MKMlZCUDZ0UDdkeE1HVFR6VEhzNUJZbWw3TjhQSlJ0d0J2MHliMTJmdktNRmhzaS9pUWJFQkVFSjVBb0dBUm9NSApHRmJkM0V0NXdsZzcyYkk1a25SRnhkY3RLejIxb1J1bndhNWlSWTV0T3Zkak8zQ1FLZWNkSC9lMklyQmtwdFB2Ck1vNkF0MS9UUW8xakFNNGxlbnUwWUYxUnhiNGN4WW
5VM2Y3UVNNRlZDNjg1dHZNemdNWVNyNngvT1h4d1NNUTUKdGpJcnhtN0poQ1JSRkNmZFUyZzl3SmpIM2hxRmtSbGlBTHRCUGFVQ2dZRUF2QkRHejQvYS9kRzRJT0VpREk4SgpBbkxaRWZaTFJrdnI0ejRyT0p0TGZtWlFjTW5DSUZmKzBaelhTME9XOWhvMjk5RUNtS1l4U2VUWk1WSDgzOFZCCnRvSkNGM3lzb01rSC9meEVXTUpicDlMblJwSFUzUElicHlhRG1nWW1YdmsvK3laL3Y1c0dDTWNtVGkzWVFLRGsKcStlK1o0SHZyaGUxVytUaHpwOCtoNGs9Ci0tLS0tRU5EIFBSSVZBVEUgS0VZLS0tLS0K +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2019 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v1 +kind: ConfigMap +metadata: + name: config-logging + namespace: "union" + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +data: {} +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2019 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: v1 +kind: ConfigMap +metadata: + name: config-observability + namespace: "union" + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +data: {} +--- +# Source: dataplane/charts/prometheus/templates/cm.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + labels: + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: prometheus + name: union-operator-prometheus + namespace: union +data: + allow-snippet-annotations: "false" + alerting_rules.yml: | + {} + alerts: | + {} + prometheus.yml: | + global: + evaluation_interval: 1m + scrape_interval: 1m + scrape_timeout: 10s + rule_files: + - /etc/config/recording_rules.yml + - /etc/config/alerting_rules.yml + - /etc/config/rules + - /etc/config/alerts + scrape_configs: + - job_name: prometheus + static_configs: + - targets: + - localhost:9090 + - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + job_name: kubernetes-apiservers + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - action: keep + regex: default;kubernetes;https + source_labels: + - __meta_kubernetes_namespace + - __meta_kubernetes_service_name + - __meta_kubernetes_endpoint_port_name + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + job_name: kubernetes-nodes + kubernetes_sd_configs: + - role: node + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - replacement: kubernetes.default.svc:443 + target_label: __address__ + - regex: (.+) + replacement: /api/v1/nodes/$1/proxy/metrics + source_labels: + - __meta_kubernetes_node_name + target_label: __metrics_path__ + scheme: https + tls_config: + 
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + job_name: kubernetes-nodes-cadvisor + kubernetes_sd_configs: + - role: node + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - replacement: kubernetes.default.svc:443 + target_label: __address__ + - regex: (.+) + replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor + source_labels: + - __meta_kubernetes_node_name + target_label: __metrics_path__ + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + - honor_labels: true + job_name: kubernetes-service-endpoints + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scrape + - action: drop + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) 
+ source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (.+?)(?::\d+)?;(\d+) + replacement: $1:$2 + source_labels: + - __address__ + - __meta_kubernetes_service_annotation_prometheus_io_port + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_service_name + target_label: service + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: node + - honor_labels: true + job_name: kubernetes-service-endpoints-slow + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) 
+ source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (.+?)(?::\d+)?;(\d+) + replacement: $1:$2 + source_labels: + - __address__ + - __meta_kubernetes_service_annotation_prometheus_io_port + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_service_name + target_label: service + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: node + scrape_interval: 5m + scrape_timeout: 30s + - honor_labels: true + job_name: prometheus-pushgateway + kubernetes_sd_configs: + - role: service + relabel_configs: + - action: keep + regex: pushgateway + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_probe + - honor_labels: true + job_name: kubernetes-services + kubernetes_sd_configs: + - role: service + metrics_path: /probe + params: + module: + - http_2xx + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_probe + - source_labels: + - __address__ + target_label: __param_target + - replacement: blackbox + target_label: __address__ + - source_labels: + - __param_target + target_label: instance + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - source_labels: + - __meta_kubernetes_service_name + target_label: service + - honor_labels: true + job_name: kubernetes-pods + kubernetes_sd_configs: + - role: pod + relabel_configs: + - action: 
keep + regex: true + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scrape + - action: drop + regex: true + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) + replacement: '[$2]:$1' + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: replace + regex: (\d+);((([0-9]+?)(\.|$)){4}) + replacement: $2:$1 + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_pod_name + target_label: pod + - action: drop + regex: Pending|Succeeded|Failed|Completed + source_labels: + - __meta_kubernetes_pod_phase + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: node + - honor_labels: true + job_name: kubernetes-pods-slow + kubernetes_sd_configs: + - role: pod + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) 
+ source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) + replacement: '[$2]:$1' + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: replace + regex: (\d+);((([0-9]+?)(\.|$)){4}) + replacement: $2:$1 + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_pod_name + target_label: pod + - action: drop + regex: Pending|Succeeded|Failed|Completed + source_labels: + - __meta_kubernetes_pod_phase + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: node + scrape_interval: 5m + scrape_timeout: 30s + - job_name: 'kube-state-metrics' + metrics_path: /metrics + scrape_interval: 1m + honor_labels: true + kubernetes_sd_configs: + - role: endpoints + namespaces: + own_namespace: true + relabel_configs: + - source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_name] + regex: kube-state-metrics + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + regex: http + action: keep + metric_relabel_configs: + - source_labels: [__name__] + regex: 
"kube_pod_container_resource_(limits|requests)|kube_pod_status_phase|kube_node_(labels|status_allocatable|status_condition|status_capacity)|kube_namespace_labels|kube_pod_container_status_(waiting|terminated|last_terminated).*_reason|kube_daemonset_status_number_unavailable|kube_deployment_status_replicas_unavailable|kube_resourcequota|kube_pod_info|kube_node_info|kube_pod_container_status_restarts_total" + action: keep + - source_labels: [__name__, phase] + separator: ";" + regex: "kube_pod_status_phase;(Succeeded|Failed)" + action: drop + - source_labels: [node] + target_label: nodename + regex: "(.*)" + action: replace + - source_labels: [label_node_group_name] + action: replace + regex: "(.+)" + target_label: label_node_pool_name + - job_name: 'opencost' + metrics_path: /metrics + scrape_interval: 1m + honor_labels: true + kubernetes_sd_configs: + - role: endpoints + namespaces: + own_namespace: true + relabel_configs: + - source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_name] + regex: opencost + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + regex: http + action: keep + metric_relabel_configs: + - source_labels: [__name__] + regex: "kube_node_labels|kube_pod_labels|node_total_hourly_cost|node_ram_hourly_cost|node_cpu_hourly_cost|node_gpu_hourly_cost" + action: keep + - job_name: 'union-services' + metrics_path: /metrics + scrape_interval: 1m + honor_labels: true + kubernetes_sd_configs: + - role: endpoints + namespaces: + own_namespace: true + relabel_configs: + - source_labels: [__meta_kubernetes_service_label_platform_union_ai_service_group] + regex: .+ + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + regex: debug + action: keep + # Flyte propeller metrics for execution info and fast task duration. + # No-op when propeller is not deployed (no pods match the label selector). 
+ - job_name: 'flytepropeller' + kubernetes_sd_configs: + - role: pod + namespaces: + own_namespace: true + selectors: + - role: pod + label: app.kubernetes.io/name=flytepropeller + metric_relabel_configs: + - source_labels: [__name__] + regex: "flyte:propeller:all:round:execution_info|flyte:propeller:all:node:fast_task:fast_task_execution_duration" + action: keep + relabel_configs: + - source_labels: [__meta_kubernetes_pod_container_name] + regex: flytepropeller + action: keep + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: pod + # Kourier gateway (envoy) metrics for app serving. + # No-op when serving is not enabled (no pods match the label selector). + - job_name: 'serving-envoy' + kubernetes_sd_configs: + - role: pod + namespaces: + own_namespace: true + selectors: + - role: pod + label: app=3scale-kourier-gateway + metrics_path: /stats/prometheus + metric_relabel_configs: + - source_labels: [__name__] + regex: "envoy_cluster_upstream_rq_xx|envoy_cluster_upstream_rq_time_bucket" + action: keep + relabel_configs: + - source_labels: [__meta_kubernetes_pod_container_port_name] + regex: metrics + action: keep + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: pod + # DCGM GPU metrics. + # No-op when dcgm-exporter is not deployed (no pods match the label selector). 
+ - job_name: 'gpu-metrics' + kubernetes_sd_configs: + - role: pod + namespaces: + names: + - kube-system + selectors: + - role: pod + label: app.kubernetes.io/name=dcgm-exporter + + recording_rules.yml: | + {} + rules: | + {} +--- +# Source: dataplane/templates/fluent-bit/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: fluentbit-system + namespace: union + labels: + app.kubernetes.io/instance: release-name + app.kubernetes.io/managed-by: Helm +data: + custom_parsers.conf: | + [PARSER] + Name docker_no_time + Format json + Time_Keep Off + Time_Key time + Time_Format %Y-%m-%dT%H:%M:%S.%L + fluent-bit.conf: | + [SERVICE] + Parsers_File /fluent-bit/etc/parsers.conf + Parsers_File /fluent-bit/etc/conf/custom_parsers.conf + HTTP_Server On + HTTP_Listen 0.0.0.0 + Health_Check On + [INPUT] + Name tail + Tag namespace-<namespace_name>.pod-<pod_name>.cont-<container_name> + Tag_Regex (?<pod_name>[a-z0-9](?:[-a-z0-9]*[a-z0-9])?(?:\\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*)_(?<namespace_name>[^_]+)_(?<container_name>.+)- + Path /var/log/containers/*.log + DB /var/log/flb_kube.db + multiline.parser docker, cri + Mem_Buf_Limit 5MB + Skip_Long_Lines On + Refresh_Interval 10 + + + [OUTPUT] + Name s3 + Match * + upload_timeout 1m + s3_key_format /persisted-logs/$TAG + static_file_path true + json_date_key false + region us-central1 + bucket test-gcp-bucket +--- +# Source: dataplane/templates/imagebuilder/build-image-configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: build-image-config + labels: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +data: + storage.yaml: | + storage: + container: "test-gcp-bucket" + type: stow + stow: + kind: google + config: + json: "" + project_id: test-gcp-project-123 + scopes: https://www.googleapis.com/auth/cloud-platform + enable-multicontainer: true + limits: + maxDownloadMBs: 1024 + cache: + max_size_mbs: 0 + target_gc_percent: 70 + buildkit-uri: 
"tcp://union-operator-buildkit.union.svc.cluster.local:1234" + default-repository: "us-central1-docker.pkg.dev/test-gcp-project-123/union-dataplane" + authentication-type: "google" +--- +# Source: dataplane/templates/imagebuilder/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: union-operator-buildkit +data: + buildkitd.toml: | + debug = false + + [log] + format = "text" + + [worker.oci] + enabled = true + snapshotter = "auto" + gc = true + max-parallelism = 0 + + # Should not be used if Policies are defined + gckeepstorage = "10%" + [[worker.oci.gcpolicy]] + # Remove COPY/ADD and git checkout files + keepBytes = "10%" + keepDuration = "24h" + filters = [ "type==source.local", "type==source.git.checkout" ] + [[worker.oci.gcpolicy]] + # Remove locally cached image layers after it's unused for 24 hours + keepBytes = "10%" + keepDuration = "24h" + filters = [ "regular" ] + [[worker.oci.gcpolicy]] + # Remove shared cache mounts. E.G. Pip cache + keepBytes = "10%" + keepDuration = "72h" + filters = [ "type==exec.cachemount" ] + [[worker.oci.gcpolicy]] + # Remove everything else to keep the cache size under total file system limit + all = true + keepBytes = "80%" +--- +# Source: dataplane/templates/monitoring/dashboard-configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: release-name-dashboard-union-dataplane-overview + namespace: union + labels: + grafana_dashboard: "1" + app.kubernetes.io/managed-by: Helm +data: + union-dataplane-overview.json: |- + { + "annotations": { + "list": [] + }, + "description": "Union Dataplane health and service metrics", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "links": [], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "title": "Health", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + 
"thresholds": { + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 0.5 + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "percentunit", + "min": 0, + "max": 1 + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 1 + }, + "id": 2, + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "auto" + }, + "title": "Service Availability", + "type": "stat", + "targets": [ + { + "expr": "avg(kube_deployment_status_replicas_available{namespace=\"$namespace\"} / kube_deployment_spec_replicas{namespace=\"$namespace\"})", + "legendFormat": "Availability", + "refId": "A" + } + ], + "description": "Percentage of deployments with all requested replicas available. 1.0 = all healthy." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 3 + }, + { + "color": "red", + "value": 10 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 1 + }, + "id": 3, + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "auto" + }, + "title": "Pod Restarts (1h)", + "type": "stat", + "targets": [ + { + "expr": "sum(increase(kube_pod_container_status_restarts_total{namespace=\"$namespace\"}[1h]))", + "legendFormat": "Restarts", + "refId": "A" + } + ], + "description": "Total container restarts in the last hour. Non-zero indicates crashlooping or OOM kills." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 10 + }, + { + "color": "red", + "value": 50 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 1 + }, + "id": 8, + "options": { + "colorMode": "background", + "graphMode": "area", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "auto" + }, + "title": "Active Workflows", + "type": "stat", + "targets": [ + { + "expr": "sum(flyte:propeller:all:execstats:active_workflow_executions{namespace=\"$namespace\"})", + "legendFormat": "Workflows", + "refId": "A" + } + ], + "description": "Current active FlyteWorkflow CRD count managed by Propeller." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 5 + }, + "id": 10, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "title": "Active Executions", + "type": "timeseries", + "targets": [ + { + "expr": "sum(flyte:propeller:all:execstats:active_workflow_executions{namespace=\"$namespace\"})", + "legendFormat": "Workflows", + "refId": "A" + }, + { + "expr": "sum(flyte:propeller:all:execstats:active_node_executions{namespace=\"$namespace\"})", + "legendFormat": "Nodes", + "refId": "B" + }, + { + "expr": "sum(flyte:propeller:all:execstats:active_task_executions{namespace=\"$namespace\"})", + "legendFormat": "Tasks", + "refId": "C" + } + ], + "description": "Current count of active 
workflow, node, and task executions tracked by FlyteAdmin." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 5 + }, + "id": 7, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "title": "Queue Depth", + "type": "timeseries", + "targets": [ + { + "expr": "sum(flyte:propeller:all:main_depth{namespace=\"$namespace\"})", + "legendFormat": "Main", + "refId": "A" + }, + { + "expr": "sum(flyte:propeller:all:sub_depth{namespace=\"$namespace\"})", + "legendFormat": "Sub", + "refId": "B" + } + ], + "description": "Main and sub workqueue depth over time." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 1 + }, + "id": 11, + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "value_and_name" + }, + "title": "Handler Panics", + "type": "stat", + "targets": [ + { + "expr": "sum(executor:handler_panic{namespace=\"$namespace\"})", + "legendFormat": "Total", + "refId": "A" + } + ], + "description": "Total handler panics in DP services. Any non-zero value indicates a service caught a panic during request handling." 
+ }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 9 + }, + "id": 1200, + "title": "SLOs", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 0.99 + }, + { + "color": "green", + "value": 0.999 + } + ] + }, + "unit": "percentunit", + "decimals": 3 + } + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 10 + }, + "id": 1201, + "title": "Service Availability", + "type": "stat", + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "value" + }, + "targets": [ + { + "expr": "avg(kube_deployment_status_replicas_available{namespace=\"$namespace\"} / kube_deployment_spec_replicas{namespace=\"$namespace\"})", + "refId": "A" + } + ], + "description": "Current service availability across all DP deployments." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": -999 + }, + { + "color": "orange", + "value": 0 + }, + { + "color": "green", + "value": 0.5 + } + ] + }, + "unit": "percentunit", + "decimals": 1, + "noValue": "N/A" + } + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 6, + "y": 10 + }, + "id": 1202, + "title": "Error Budget Remaining", + "type": "stat", + "options": { + "colorMode": "background", + "graphMode": "area", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "value" + }, + "targets": [ + { + "expr": "union:dp:slo:error_budget_remaining", + "refId": "A" + } + ], + "description": "Fraction of error budget remaining. Requires monitoring.slos.enabled." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 0 + }, + { + "color": "orange", + "value": 0.95 + }, + { + "color": "green", + "value": 0.999 + } + ] + }, + "unit": "percentunit", + "decimals": 2, + "noValue": "N/A" + } + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 12, + "y": 10 + }, + "id": 1203, + "title": "Execution Success Rate", + "type": "stat", + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "value" + }, + "targets": [ + { + "expr": "union:dp:slo:execution_success_rate or (union:dp:slo:propeller_success_rate + union:dp:slo:executor_success_rate) / 2 or union:dp:slo:propeller_success_rate or vector(1)", + "refId": "A" + } + ], + "description": "Combined V1 (propeller) and V2 (executor) task success rate. Falls back to propeller-only or 100% when idle." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 2 + }, + { + "color": "red", + "value": 5 + } + ] + }, + "unit": "s", + "decimals": 2 + } + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 18, + "y": 10 + }, + "id": 1204, + "title": "Propeller Latency p99", + "type": "stat", + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "value" + }, + "targets": [ + { + "expr": "(union:dp:slo:propeller_round_latency_p99 or flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.99\"} / 1000)", + "refId": "A" + } + ], + "description": "Propeller round p99 latency in seconds. Falls back to raw metric if SLO rules not enabled." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "percentunit" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 1205, + "title": "Availability Over Time", + "type": "timeseries", + "targets": [ + { + "expr": "avg(kube_deployment_status_replicas_available{namespace=\"$namespace\"} / kube_deployment_spec_replicas{namespace=\"$namespace\"})", + "legendFormat": "Availability", + "refId": "A" + }, + { + "expr": "vector(0.999)", + "legendFormat": "Target (99.9%)", + "refId": "B" + } + ], + "description": "DP service availability over time with SLO target line." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "percentunit", + "max": 1, + "min": -0.5 + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 1206, + "title": "Error Budget Burn Rate", + "type": "timeseries", + "targets": [ + { + "expr": "union:dp:slo:error_budget_remaining", + "legendFormat": "Budget remaining", + "refId": "A" + }, + { + "expr": "vector(0)", + "legendFormat": "Exhausted", + "refId": "B" + } + ], + "description": "DP error budget remaining over time. Requires monitoring.slos.enabled." + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 31 + }, + "id": 200, + "title": "Union Operator", + "type": "row", + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 10 + }, + "id": 201, + "title": "Work Queue Operations", + "type": "timeseries", + "targets": [ + { + "expr": "rate(union_operator:work_queue:operations_processed{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Processed", + "refId": "A" + }, + { + "expr": "rate(union_operator:work_queue:operations_failed{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Failed", + "refId": "B" + } + ], + "description": "Operator execution operation processing rate and failure rate." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 10 + }, + "id": 202, + "title": "Background Process Runs / Errors", + "type": "timeseries", + "targets": [ + { + "expr": "rate(union_operator:heartbeat_updater:runs{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Heartbeat runs", + "refId": "A" + }, + { + "expr": "rate(union_operator:heartbeat_updater:run_errors{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Heartbeat errors", + "refId": "B" + }, + { + "expr": "rate(union_operator:status_updater:runs{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Status runs", + "refId": "C" + }, + { + "expr": "rate(union_operator:status_updater:run_errors{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Status errors", + "refId": "D" + }, + { + "expr": "rate(union_operator:prometheus_health_checker:run_errors{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Prom health errors", + "refId": "E" + } + ], + "description": "Union operator background process rates: heartbeat updater runs/errors, status updater runs/errors, and Prometheus health checker errors." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ms" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 10 + }, + "id": 203, + "title": "Heartbeat Latency", + "type": "timeseries", + "targets": [ + { + "expr": "union_operator:heartbeat:compute_capabilities_ms{namespace=\"$namespace\", quantile=\"0.9\"}", + "legendFormat": "Capabilities p90", + "refId": "A" + }, + { + "expr": "union_operator:heartbeat:compute_usages_ms{namespace=\"$namespace\", quantile=\"0.9\"}", + "legendFormat": "Usages p90", + "refId": "B" + }, + { + "expr": "union_operator:heartbeat:list_workflows_ms{namespace=\"$namespace\", quantile=\"0.9\"}", + "legendFormat": "List WFs p90", + "refId": "C" + } + ], + "description": "Breakdown of operator heartbeat computation: capabilities, usages, list workflows." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 18 + }, + "id": 204, + "title": "Config Syncer", + "type": "timeseries", + "targets": [ + { + "expr": "rate(union_operator:config_syncer:runs{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Sync runs", + "refId": "A" + }, + { + "expr": "rate(union_operator:config_syncer:run_errors{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Sync errors", + "refId": "B" + }, + { + "expr": "rate(union_operator:config_syncer:propeller_configmap_updated{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Propeller CM updated", + "refId": "C" + } + ], + "description": "Config sync cycle rate, errors, and propeller ConfigMap update count." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 18 + }, + "id": 205, + "title": "Billable Usage Collector", + "type": "timeseries", + "targets": [ + { + "expr": "rate(union_operator:billable_usage_collector:runs{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Runs", + "refId": "A" + }, + { + "expr": "rate(union_operator:billable_usage_collector:run_errors{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Errors", + "refId": "B" + } + ], + "description": "Billing collection cycle rate and errors. Failures mean billing data may be delayed." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "bool_yes_no" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 18 + }, + "id": 206, + "title": "Work Queue Paused", + "type": "stat", + "options": { + "colorMode": "background", + "graphMode": "area", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "auto" + }, + "targets": [ + { + "expr": "union_operator:work_queue:paused{namespace=\"$namespace\"}", + "legendFormat": "Paused", + "refId": "A" + } + ], + "description": "1 when operator paused due to resource limits (FlyteWorkflow count or storage exceeded)." + } + ] + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 32 + }, + "id": 300, + "title": "Executor (V2)", + "type": "row", + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 11 + }, + "id": 301, + "title": "Active Actions & Capacity", + "type": "timeseries", + "targets": [ + { + "expr": "executor:active_actions_count{namespace=\"$namespace\"}", + "legendFormat": "Active actions", + "refId": "A" + }, + { + "expr": "executor:available_capacity{namespace=\"$namespace\"}", + "legendFormat": "Available capacity", + "refId": "B" + } + ], + "description": "V2 executor active actions vs available capacity. Capacity=0 means executor is saturated." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 11 + }, + "id": 302, + "title": "Cache Discovery", + "type": "timeseries", + "targets": [ + { + "expr": "rate(executor:discovery_miss_count{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Miss", + "refId": "A" + }, + { + "expr": "rate(executor:discovery_put_success_count{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Put success", + "refId": "B" + } + ], + "description": "V2 executor cache discovery miss/put rates for task output caching." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never", + "stacking": { + "mode": "normal" + } + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 19 + }, + "id": 303, + "title": "Actions Terminated by Phase", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (phase) (rate(executor:actions_terminated{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "{{ phase }}", + "refId": "A" + } + ], + "description": "Task completion rate by outcome: succeeded, failed, aborted. Key V2 SLI for task health. 
[Metrics pending: requires cloud service instrumentation to be deployed]" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ms" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 19 + }, + "id": 304, + "title": "Evaluator Duration (pod creation)", + "type": "timeseries", + "targets": [ + { + "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.5\"}", + "legendFormat": "Evaluate p50", + "refId": "A" + }, + { + "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.9\"}", + "legendFormat": "Evaluate p90", + "refId": "B" + }, + { + "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.99\"}", + "legendFormat": "Evaluate p99", + "refId": "C" + } + ], + "description": "Time spent in RecursiveNodeHandler (pod creation). Dominant component of V2 task latency. 
[Metrics pending: requires cloud service instrumentation to be deployed]" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 19 + }, + "id": 305, + "title": "System Failures & Invalid Leases", + "type": "timeseries", + "targets": [ + { + "expr": "rate(executor:system_failures{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "System failures", + "refId": "A" + }, + { + "expr": "rate(executor:system_failures_exhausted{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Exhausted retries", + "refId": "B" + }, + { + "expr": "rate(executor:invalid_leases{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Invalid leases", + "refId": "C" + }, + { + "expr": "rate(executor:evaluator:evaluate_errors{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Evaluate errors", + "refId": "D" + } + ], + "description": "Executor error rates. System failures retry; exhausted means task permanently failed. Invalid leases = malformed from queue service. 
[Metrics pending: requires cloud service instrumentation to be deployed]" + } + ] + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 33 + }, + "id": 100, + "title": "Flyte Propeller (V1)", + "type": "row", + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ms" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 12 + }, + "id": 101, + "title": "Round Time (p50 / p90 / p99)", + "type": "timeseries", + "targets": [ + { + "expr": "flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.5\"}", + "legendFormat": "p50", + "refId": "A" + }, + { + "expr": "flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.9\"}", + "legendFormat": "p90", + "refId": "B" + }, + { + "expr": "flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.99\"}", + "legendFormat": "p99", + "refId": "C" + } + ], + "description": "Propeller reconciliation round time. One round = one FlyteWorkflow CRD processed." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 12 + }, + "id": 102, + "title": "Round Success / Error Rate", + "type": "timeseries", + "targets": [ + { + "expr": "rate(flyte:propeller:all:round:success_count{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Success", + "refId": "A" + }, + { + "expr": "rate(flyte:propeller:all:round:error_count{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Errors", + "refId": "B" + }, + { + "expr": "rate(flyte:propeller:all:round:panic_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Panics", + "refId": "C" + } + ], + "description": "Propeller round outcomes: success, errors, and panics per second." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 12 + }, + "id": 103, + "title": "Free Workers", + "type": "timeseries", + "targets": [ + { + "expr": "sum(flyte:propeller:all:free_workers_count{namespace=\"$namespace\"})", + "legendFormat": "Free workers", + "refId": "A" + } + ], + "description": "Idle propeller worker goroutines. Zero = all workers busy processing workflows." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 20 + }, + "id": 104, + "title": "Queue Add Rate", + "type": "timeseries", + "targets": [ + { + "expr": "rate(flyte:propeller:all:main_adds{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Main adds", + "refId": "A" + }, + { + "expr": "rate(flyte:propeller:all:sub_adds{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Sub adds", + "refId": "B" + }, + { + "expr": "rate(flyte:propeller:all:main_retries{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Main retries", + "refId": "C" + } + ], + "description": "Items enqueued to propeller's main/sub workqueues, plus retry rate." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 20 + }, + "id": 105, + "title": "Workflow Updates", + "type": "timeseries", + "targets": [ + { + "expr": "rate(flyte:propeller:all:wf_updated{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Updated", + "refId": "A" + }, + { + "expr": "rate(flyte:propeller:all:wf_update_failed{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Failed", + "refId": "B" + }, + { + "expr": "rate(flyte:propeller:all:wf_too_large{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Too large", + "refId": "C" + }, + { + "expr": "rate(flyte:propeller:all:wf_update_conflict{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Conflict", + 
"refId": "D" + } + ], + "description": "etcd write outcomes: successful updates, failures, too-large (>1.5MB), optimistic concurrency conflicts." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ms" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 20 + }, + "id": 106, + "title": "Workflow Update Latency", + "type": "timeseries", + "targets": [ + { + "expr": "flyte:propeller:all:wf_update_latency_ms{namespace=\"$namespace\", quantile=\"0.5\"}", + "legendFormat": "p50", + "refId": "A" + }, + { + "expr": "flyte:propeller:all:wf_update_latency_ms{namespace=\"$namespace\", quantile=\"0.9\"}", + "legendFormat": "p90", + "refId": "B" + }, + { + "expr": "flyte:propeller:all:wf_update_latency_ms{namespace=\"$namespace\", quantile=\"0.99\"}", + "legendFormat": "p99", + "refId": "C" + } + ], + "description": "etcd write latency for FlyteWorkflow status updates." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ms" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 28 + }, + "id": 107, + "title": "Node Queueing & Execution Latency", + "type": "timeseries", + "targets": [ + { + "expr": "flyte:propeller:all:node:queueing_latency_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.5\"}", + "legendFormat": "Queue p50", + "refId": "A" + }, + { + "expr": "flyte:propeller:all:node:queueing_latency_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.9\"}", + "legendFormat": "Queue p90", + "refId": "B" + }, + { + "expr": "flyte:propeller:all:node:node_exec_latency_unlabeled_us{namespace=\"$namespace\", quantile=\"0.9\"} / 1000", + "legendFormat": "Exec p90 (ms)", + "refId": "C" + } + ], + "description": "Node lifecycle: queueing latency (queued\u2192running) and execution latency (time in handler)." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "percentunit" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 28 + }, + "id": 108, + "title": "Metastore Cache Hit Rate", + "type": "timeseries", + "targets": [ + { + "expr": "rate(flyte:propeller:all:metastore:cache_hit{namespace=\"$namespace\"}[$__rate_interval]) / (rate(flyte:propeller:all:metastore:cache_hit{namespace=\"$namespace\"}[$__rate_interval]) + rate(flyte:propeller:all:metastore:cache_miss{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "Hit rate", + "refId": "A" + } + ], + "description": "In-memory cache effectiveness for object store (S3/GCS) reads. 
Low hit rate = excessive storage calls." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 36 + }, + "id": 109, + "title": "Event Recording (DP \u2192 CP)", + "type": "timeseries", + "targets": [ + { + "expr": "sum(rate(flyte:propeller:all:task:event_recording:success_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "Task success", + "refId": "A" + }, + { + "expr": "sum(rate(flyte:propeller:all:node:event_recording:success_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "Node success", + "refId": "B" + }, + { + "expr": "sum(rate(flyte:propeller:all:task:event_recording:failure_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "Task failure", + "refId": "C" + }, + { + "expr": "sum(rate(flyte:propeller:all:node:event_recording:failure_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "Node failure", + "refId": "D" + } + ], + "description": "Task/node event recording rate from propeller to FlyteAdmin. Failures indicate CP connectivity issues." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 36 + }, + "id": 110, + "title": "Cache Discovery (hit/miss/skip)", + "type": "timeseries", + "targets": [ + { + "expr": "rate(flyte:propeller:all:discovery_hit_count{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Hits", + "refId": "A" + }, + { + "expr": "rate(flyte:propeller:all:discovery_miss_count{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Misses", + "refId": "B" + }, + { + "expr": "rate(flyte:propeller:all:discovery_skip_count{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Skips", + "refId": "C" + }, + { + "expr": "rate(flyte:propeller:all:discovery_get_failure_count{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Get failures", + "refId": "D" + } + ], + "description": "V2 executor cache discovery miss/put rates for task output caching." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 44 + }, + "id": 111, + "title": "K8s API Client Rate", + "type": "timeseries", + "targets": [ + { + "expr": "sum(rate(k8s_client_request_total_unlabeled{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "K8s requests/s", + "refId": "A" + } + ], + "description": "Propeller's K8s API request rate. High rates may indicate excessive pod watches or creates." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "s" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 44 + }, + "id": 112, + "title": "K8s API Client Latency (p90)", + "type": "timeseries", + "targets": [ + { + "expr": "histogram_quantile(0.90, sum by (le) (rate(k8s_client_request_latency_unlabeled_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "Request p90", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.90, sum by (le) (rate(k8s_client_rate_limiter_latency_unlabeled_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "Rate limiter p90", + "refId": "B" + } + ], + "description": "K8s API request latency and client-side rate limiter wait time at p90." + } + ] + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 34 + }, + "id": 400, + "title": "gRPC Client (DP \u2192 CP)", + "type": "row", + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 13 + }, + "id": 401, + "title": "gRPC Client Request Rate", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (grpc_service, grpc_method) (rate(grpc_client_handled_total{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "{{ grpc_service }}/{{ grpc_method }}", + "refId": "A" + } + ], + "description": "DP\u2192CP gRPC client request rate by service and method (e.g. CreateWorkflowEvent)." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 13 + }, + "id": 402, + "title": "gRPC Client Error Rate", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (grpc_service, grpc_method, grpc_code) (rate(grpc_client_handled_total{namespace=\"$namespace\", grpc_code!=\"OK\"}[$__rate_interval]))", + "legendFormat": "{{ grpc_method }} {{ grpc_code }}", + "refId": "A" + } + ], + "description": "DP\u2192CP gRPC errors by method and code. Non-OK responses from the control plane." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "s" + } + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 21 + }, + "id": 403, + "title": "gRPC Client Latency (p95)", + "type": "timeseries", + "targets": [ + { + "expr": "histogram_quantile(0.95, sum by (le, grpc_method) (rate(grpc_client_handling_seconds_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "{{ grpc_method }} p95", + "refId": "A" + } + ], + "description": "DP\u2192CP gRPC call latency at p95. High latency = slow control plane or network issues." 
+ } + ] + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 35 + }, + "id": 1100, + "title": "Infrastructure", + "type": "row", + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never", + "stacking": { + "mode": "normal" + } + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 14 + }, + "id": 1101, + "title": "CPU Usage by Service", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (container) (rate(container_cpu_usage_seconds_total{namespace=\"$namespace\", container!=\"\", container!=\"POD\"}[$__rate_interval]))", + "legendFormat": "{{ container }}", + "refId": "A" + } + ], + "description": "CPU usage in cores per container, stacked. Identifies resource-heavy services." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never", + "stacking": { + "mode": "normal" + } + }, + "unit": "bytes" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 14 + }, + "id": 1102, + "title": "Memory Usage by Service", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (container) (container_memory_working_set_bytes{namespace=\"$namespace\", container!=\"\", container!=\"POD\"})", + "legendFormat": "{{ container }}", + "refId": "A" + } + ], + "description": "Working set memory per container, stacked. Watch for approaching limits." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "bars", + "fillOpacity": 80, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 22 + }, + "id": 1103, + "title": "Pod Restart Count by Container", + "type": "timeseries", + "targets": [ + { + "expr": "increase(kube_pod_container_status_restarts_total{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "{{ pod }}/{{ container }}", + "refId": "A" + } + ], + "description": "Per-container restart events. Spikes indicate crashes or OOM kills." + } + ] + } + ], + "schemaVersion": 39, + "tags": [ + "union", + "dataplane" + ], + "templating": { + "list": [ + { + "current": {}, + "hide": 0, + "includeAll": false, + "label": "Data Source", + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "type": "datasource" + }, + { + "current": { + "selected": true, + "text": "union", + "value": "union" + }, + "hide": 2, + "label": "Namespace", + "name": "namespace", + "options": [ + { + "selected": true, + "text": "union", + "value": "union" + } + ], + "query": "union", + "type": "constant" + } + ] + }, + "time": { + "from": "now-3h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Union Dataplane Overview", + "uid": "union-dp-overview", + "version": 1 + } +--- +# Source: dataplane/templates/monitoring/prometheusrule.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: union-recording-rules + namespace: union + labels: + release: release-name +data: + recording_rules.yml: | + groups: [] +--- +# Source: dataplane/templates/nodeexecutor/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: executor + namespace: union + labels: + app: executor +data: + task_logs.yaml: | + plugins: + logs: + cloudwatch-enabled: false + 
dynamic-log-links: + - vscode: + displayName: VS Code Debugger + linkType: ide + templateUris: + - /dataplane/pod/v1/generated_name/6060/task/{{.executionProject}}/{{.executionDomain}}/{{.executionName}}/{{.nodeID}}/{{.taskRetryAttempt}}/test-e2e-gcp/{{.namespace}}/{{.taskProject}}/{{.taskDomain}}/{{.taskID}}/{{.taskVersion}}/{{.generatedName}}/ + - wandb-execution-id: + displayName: Weights & Biases + linkType: dashboard + templateUris: + - '{{ .taskConfig.host }}/{{ .taskConfig.entity }}/{{ .taskConfig.project + }}/runs/{{ .podName }}' + - wandb-custom-id: + displayName: Weights & Biases + linkType: dashboard + templateUris: + - '{{ .taskConfig.host }}/{{ .taskConfig.entity }}/{{ .taskConfig.project + }}/runs/{{ .taskConfig.id }}' + - comet-ml-execution-id: + displayName: Comet + linkType: dashboard + templateUris: '{{ .taskConfig.host }}/{{ .taskConfig.workspace }}/{{ + .taskConfig.project_name }}/{{ .executionName }}{{ .nodeId }}{{ + .taskRetryAttempt }}{{ .taskConfig.link_suffix }}' + - comet-ml-custom-id: + displayName: Comet + linkType: dashboard + templateUris: '{{ .taskConfig.host }}/{{ .taskConfig.workspace }}/{{ + .taskConfig.project_name }}/{{ .taskConfig.experiment_key }}' + - neptune-scale-run: + displayName: Neptune Run + linkType: dashboard + templateUris: + - https://scale.neptune.ai/{{ .taskConfig.project }}/-/run/?customId={{ + .podName }} + - neptune-scale-custom-id: + displayName: Neptune Run + linkType: dashboard + templateUris: + - https://scale.neptune.ai/{{ .taskConfig.project }}/-/run/?customId={{ + .taskConfig.id }} + kubernetes-enabled: false + enabled_plugins.yaml: | + plugins: + connector-service: + defaultConnector: + defaultServiceConfig: '{"loadBalancingConfig": [{"round_robin":{}}]}' + endpoint: k8s:///flyteconnector.union:8000 + tasks: + task-plugins: + default-for-task-types: + actor: fast-task + container: container + container_array: k8s-array + sidecar: sidecar + enabled-plugins: + - container + - sidecar + - k8s-array + - 
echo + - fast-task + - connector-service + config.yaml: | + executor: + cluster: 'test-e2e-gcp' + evaluatorCount: 64 + maxActions: 2000 + organization: 'byok' + unionAuth: + injectSecret: true + secretName: EAGER_API_KEY + workerName: worker1 + task_resources: + defaults: + cpu: 100m + memory: 500Mi + limits: + cpu: 4096 + gpu: 256 + memory: 2Ti + namespace_mapping: + template: union + union: + connection: + host: dns:///byok.us-west-2.union.ai + auth: + authorizationMetadataKey: flyte-authorization + clientId: 'byok-test-e2e-gcp-operator' + clientSecretLocation: /etc/union/secret/client_secret + tokenRefreshWindow: 5m + type: ClientSecret + admin: + clientId: 'byok-test-e2e-gcp-operator' + clientSecretLocation: /etc/union/secret/client_secret + endpoint: dns:///byok.us-west-2.union.ai + insecure: false + authorizer: + type: noop + catalog-cache: + cache-endpoint: dns:///byok.us-west-2.union.ai + endpoint: dns:///byok.us-west-2.union.ai + insecure: false + type: cacheservicev2 + use-admin-auth: true + logger: + formatter: + type: json + level: 4 + show-source: true + sharedService: + metrics: + scope: 'executor:' + security: + allowCors: true + allowLocalhostAccess: true + allowedHeaders: + - Content-Type + allowedOrigins: + - '*' + secure: false + useAuth: false + propeller: + limit-namespace: union + node-config: + disable-input-file-writes: true + plugins: + fasttask: + additional-worker-args: + - --last-ack-grace-period-seconds + - "120" + callback-uri: http://union-operator-executor.union.svc.cluster.local:15605 + grace-period-status-not-found: 2m + ioutils: + remoteFileOutputPaths: + deckFilename: report.html + k8s: + disable-inject-owner-references: true + default-cpus: 100m + default-env-vars: [] + default-memory: 100Mi + default-pod-template-name: task-template + co-pilot: + image: 'cr.flyte.org/flyteorg/flytecopilot:v1.14.1' + name: flyte-copilot- + start-timeout: 30s + storage: + container: "test-gcp-bucket" + type: stow + stow: + kind: google + config: 
+ json: "" + project_id: test-gcp-project-123 + scopes: https://www.googleapis.com/auth/cloud-platform + enable-multicontainer: true + limits: + maxDownloadMBs: 1024 + cache: + max_size_mbs: 0 + target_gc_percent: 70 +--- +# Source: dataplane/templates/operator/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: union-operator + labels: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +data: + k8s.yaml: | + plugins: + k8s: + default-cpus: 100m + default-env-vars: [] + default-memory: 100Mi + default-pod-template-name: task-template + config.yaml: | + union: + connection: + host: dns:///byok.us-west-2.union.ai + auth: + authorizationMetadataKey: flyte-authorization + clientId: 'byok-test-e2e-gcp-operator' + clientSecretLocation: /etc/union/secret/client_secret + tokenRefreshWindow: 5m + type: ClientSecret + sharedService: + features: + gatewayV2: true + port: 8081 + authorizer: + type: noop + operator: + enabled: true + enableTunnelService: true + # enableDepot: false + tunnel: + enableDirectToAppIngress: true + deploymentToRestart: union-operator-proxy + limitNamespace: union + disableClusterPermissions: true + apps: + enabled: 'true' + syncClusterConfig: + enabled: false + clusterId: + organization: 'byok' + clusterData: + appId: 'byok-test-e2e-gcp-operator' + bucketName: 'test-gcp-bucket' + bucketRegion: 'us-central1' + cloudHostName: 'byok.us-west-2.union.ai' + gcpProjectId: 'test-gcp-project-123' + metadataBucketPrefix: 'gs://test-gcp-bucket' + userRole: 'union-worker@test-gcp-project-123.iam.gserviceaccount.com' + userRoleKey: 'iam.gke.io/gcp-service-account' + collectUsages: + enabled: false + billing: + model: Legacy + dependenciesHeartbeat: + executor: + endpoint: 'http://union-operator-executor:10254' + proxy: + endpoint: 'http://union-operator-proxy:10254' + secretsWatcher: + dryRun: true + enabled: false + org: + 
namespaceTemplate: union + imageBuilder: + enabled: true + executionNamespaceLabels: + union.ai/namespace-type: flyte + referenceConfigmapName: union-operator + targetConfigMapName: "build-image-config" + proxy: + persistedLogs: + objectStore: + pathTemplate: namespace-{{.KubernetesNamespace}}.pod-{{.KubernetesPodName}}.cont-{{.KubernetesContainerName}} + prefix: persisted-logs + sourceType: ObjectStore + smConfig: + enabled: 'true' + k8sConfig: + namespace: 'union' + type: 'K8s' + logger.yaml: | + logger: + formatter: + type: json + level: 4 + show-source: true + config-overrides.yaml: | + cache: + identity: + enabled: false + storage.yaml: | + storage: + container: "test-gcp-bucket" + type: stow + stow: + kind: google + config: + json: "" + project_id: test-gcp-project-123 + scopes: https://www.googleapis.com/auth/cloud-platform + enable-multicontainer: true + limits: + maxDownloadMBs: 1024 + cache: + max_size_mbs: 0 + target_gc_percent: 70 + fast_registration_storage.yaml: | + fastRegistrationStorage: + container: "test-gcp-bucket" + type: stow + stow: + kind: google + config: + json: "" + project_id: test-gcp-project-123 + scopes: https://www.googleapis.com/auth/cloud-platform + image-builder.buildkit-uri: "tcp://union-operator-buildkit.union.svc.cluster.local:1234" + image-builder.default-repository: "us-central1-docker.pkg.dev/test-gcp-project-123/union-dataplane" + image-builder.authentication-type: "google" +--- +# Source: dataplane/templates/serving/bootstrap-configmap.yaml +# We need to copy the default configmap here since knative-operator does not automatically update the +# address of the `net-kourier-controller` endpoint to use the release namespace. It assumes that the +# resources are being installed into the `knative-serving` namespace instead. 
+apiVersion: v1 +kind: ConfigMap +metadata: + name: union-operator-serving-envoy-bootstrap + namespace: union +data: + envoy-bootstrap.yaml: | + dynamic_resources: + ads_config: + transport_api_version: V3 + api_type: GRPC + rate_limit_settings: {} + grpc_services: + - envoy_grpc: {cluster_name: xds_cluster} + cds_config: + resource_api_version: V3 + ads: {} + lds_config: + resource_api_version: V3 + ads: {} + node: + cluster: kourier-knative + id: 3scale-kourier-gateway + static_resources: + listeners: + - name: stats_listener + address: + socket_address: + address: 0.0.0.0 + port_value: 9000 + filter_chains: + - filters: + - name: envoy.filters.network.http_connection_manager + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager + stat_prefix: stats_server + http_filters: + - name: envoy.filters.http.router + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router + route_config: + virtual_hosts: + - name: admin_interface + domains: + - "*" + routes: + - match: + safe_regex: + regex: '/(certs|stats(/prometheus)?|server_info|clusters|listeners|ready)?' + headers: + - name: ':method' + string_match: + exact: GET + route: + cluster: service_stats + - match: + safe_regex: + regex: '/drain_listeners' + headers: + - name: ':method' + string_match: + exact: POST + route: + cluster: service_stats + clusters: + - name: service_stats + connect_timeout: 0.250s + type: static + load_assignment: + cluster_name: service_stats + endpoints: + lb_endpoints: + endpoint: + address: + socket_address: + address: 127.0.0.1 + port_value: 9901 + - name: xds_cluster + # This keepalive is recommended by envoy docs. 
+ # https://www.envoyproxy.io/docs/envoy/latest/api-docs/xds_protocol + typed_extension_protocol_options: + envoy.extensions.upstreams.http.v3.HttpProtocolOptions: + "@type": type.googleapis.com/envoy.extensions.upstreams.http.v3.HttpProtocolOptions + explicit_http_config: + http2_protocol_options: + connection_keepalive: + interval: 30s + timeout: 5s + connect_timeout: 1s + load_assignment: + cluster_name: xds_cluster + endpoints: + lb_endpoints: + endpoint: + address: + socket_address: + address: "net-kourier-controller" + port_value: 18000 + type: STRICT_DNS + admin: + access_log: + - name: envoy.access_loggers.stdout + typed_config: + "@type": type.googleapis.com/envoy.extensions.access_loggers.stream.v3.StdoutAccessLog + address: + socket_address: + address: 127.0.0.1 + port_value: 9901 + stats_config: + stats_tags: + - tag_name: name + regex: '^.*?\.u/.*?n=(.*?)/.*?u\..*$' + - tag_name: domain + regex: '^.*?\.u/.*?d=(.*?)/.*?u\..*$' + - tag_name: org + regex: '^.*?\.u/.*?o=(.*?)/.*?u\..*$' + - tag_name: project + regex: '^.*?\.u/.*?p=(.*?)/.*?u\..*$' + - tag_name: target + regex: '^.*?\.u/.*?t=(.*?)/.*?u\..*$' + - tag_name: target_namespace + regex: '^.*?\.u/.*?tns=(.*?)/.*?u\..*$' +--- +# Source: dataplane/templates/webhook/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: union-pod-webhook-config + namespace: union +data: + core.yaml: | + + + webhook: + certDir: /etc/webhook/certs + disableCreateMutatingWebhookConfig: true + embeddedSecretManagerConfig: + imagePullSecrets: + enabled: true + k8sConfig: + namespace: 'union' + type: 'K8s' + listenPort: '9443' + localCert: true + secretManagerTypes: + - Embedded + - K8s + secretName: union-pod-webhook + serviceName: union-pod-webhook + servicePort: '443' +--- +# Source: dataplane/charts/fluentbit/templates/clusterrole.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: release-name-fluentbit + labels: + helm.sh/chart: fluentbit-0.48.9 + app.kubernetes.io/name: 
fluentbit + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "3.2.8" + app.kubernetes.io/managed-by: Helm +rules: + - apiGroups: + - "" + resources: + - namespaces + - pods + verbs: + - get + - list + - watch +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: knative-serving-operator-aggregated + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +aggregationRule: + clusterRoleSelectors: + # This (along with escalate below) allows the Operator to pick up any + # roles that are provided to the admin of the cluster by knative serving + # automatically. + - matchExpressions: + - {key: serving.knative.dev/release, operator: Exists} +rules: [] # Rules are automatically filled in by the controller manager. +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: knative-serving-operator-aggregated-stable + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +aggregationRule: + clusterRoleSelectors: + # This (along with escalate below) allows the Operator to pick up any + # roles that are provided to the admin of the cluster by knative serving + # automatically. 
+ - matchExpressions: + - {key: app.kubernetes.io/name, operator: In, values: ["knative-serving"]} +rules: [] # Rules are automatically filled in by the controller manager. +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: knative-eventing-operator-aggregated + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +aggregationRule: + clusterRoleSelectors: + # This (along with escalate below) allows the Operator to pick up any + # roles that are provided to the admin of the cluster by knative eventing + # automatically. + - matchExpressions: + - {key: eventing.knative.dev/release, operator: Exists} +rules: [] # Rules are automatically filled in by the controller manager. +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: knative-eventing-operator-aggregated-stable + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +aggregationRule: + clusterRoleSelectors: + # This (along with escalate below) allows the Operator to pick up any + # roles that are provided to the admin of the cluster by knative eventing + # automatically. + - matchExpressions: + - {key: app.kubernetes.io/name, operator: In, values: ["knative-eventing"]} +rules: [] # Rules are automatically filled in by the controller manager. +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: knative-serving-operator + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +rules: + - apiGroups: + - operator.knative.dev + resources: + - '*' + verbs: + - '*' + # Bootstrapping permissions. + # Roles that are explicitly bound buch which are specified by this Operator + # MUST be specified here with 'get' and 'bind'. + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterroles + resourceNames: + - system:auth-delegator + verbs: + - bind + - get + - apiGroups: + - rbac.authorization.k8s.io + resources: + - roles + resourceNames: + - extension-apiserver-authentication-reader + verbs: + - bind + - get + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterroles + - roles + verbs: + - create + - delete + # Escalate is necessary in order to create a role using cluster role aggregation, + # and to allow the Operator to bootstrap itself into the necessary set of + # permissions, even as those continue to evolve upstream. + - escalate + - get + - list + - update + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterrolebindings + - rolebindings + verbs: + - create + - delete + - list + - get + - update + # Permissions required for Knative controller + # infra. 
+ - apiGroups: + - apiregistration.k8s.io + resources: + - apiservices + verbs: + - update + - apiGroups: + - "" + resources: + - services + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - caching.internal.knative.dev + resources: + - images + verbs: + - '*' + - apiGroups: + - "" + resources: + - namespaces + verbs: + - get + - update + - watch + - apiGroups: + - '' + resources: + - events + verbs: + - create + - update + - patch + - apiGroups: + - '' + resources: + - configmaps + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - security.istio.io + - apps + - policy + resources: + - poddisruptionbudgets + - peerauthentications + - deployments + - daemonsets + - replicasets + - statefulsets + verbs: + - create + - delete + - get + - list + - watch + - update + - apiGroups: + - apiregistration.k8s.io + resources: + - apiservices + verbs: + - create + - delete + - get + - list + - apiGroups: + - autoscaling + resources: + - horizontalpodautoscalers + verbs: + - create + - delete + - get + - list + - apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - '*' + - apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - '*' + # Old resources that need cleaning up that are not in the knative-serving + # namespace. 
+ - apiGroups: + - "" + resources: + - services + - deployments + - horizontalpodautoscalers + resourceNames: + - knative-ingressgateway + verbs: + - delete + - apiGroups: + - "" + resources: + - configmaps + resourceNames: + - config-controller + verbs: + - delete + - apiGroups: + - "" + resources: + - serviceaccounts + resourceNames: + - knative-serving-operator + verbs: + - delete + # for contour TLS + - apiGroups: + - projectcontour.io + resources: + - httpproxies + - tlscertificatedelegations + verbs: + - get + - list + - watch + - update + - create + - delete + - deletecollection + - patch + # for security-guard + - apiGroups: + - guard.security.knative.dev + resources: + - guardians + verbs: + - get + - list + - watch + - create + - update + - patch + - delete + - apiGroups: + - "" + resources: + - configmaps + verbs: + - get + - list + - watch + - create + - update + - patch + - delete + - apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + verbs: + - create + - apiGroups: + - "" + resources: + - pods + verbs: + - get +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: knative-eventing-operator + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +rules: + - apiGroups: + - operator.knative.dev + resources: + - '*' + verbs: + - '*' + # Bootstrapping permissions. + # Roles that are explicitly bound buch which are specified by this Operator + # MUST be specified here with 'get' and 'bind'. + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterroles + - roles + verbs: + - create + - delete + # Escalate is necessary in order to create a role using cluster role aggregation, + # and to allow the Operator to bootstrap itself into the necessary set of + # permissions, even as those continue to evolve upstream. 
+ - escalate + - get + - list + - update + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterrolebindings + - rolebindings + verbs: + - create + - delete + - list + - get + - update + # Permissions required for Knative controller + # infra. + - apiGroups: + - apiregistration.k8s.io + resources: + - apiservices + verbs: + - update + - apiGroups: + - "" + resources: + - services + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - caching.internal.knative.dev + resources: + - images + verbs: + - '*' + - apiGroups: + - "" + resources: + - namespaces + verbs: + - get + - update + - watch + - apiGroups: + - '' + resources: + - events + verbs: + - create + - update + - patch + - apiGroups: + - '' + resources: + - configmaps + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - apps + resources: + - deployments + - daemonsets + - replicasets + - statefulsets + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - apiregistration.k8s.io + resources: + - apiservices + verbs: + - create + - delete + - get + - list + - apiGroups: + - autoscaling + resources: + - horizontalpodautoscalers + verbs: + - create + - delete + - update + - get + - list + - apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - '*' + - apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - '*' + - apiGroups: + - batch + resources: + - jobs + verbs: + - create + - delete + - update + - get + - list + - watch + # Old resources that need cleaning up that are not in the knative-eventing + # namespace. 
+ - apiGroups: + - "" + resources: + - serviceaccounts + resourceNames: + - knative-eventing-operator + verbs: + - delete + # for RabbitMQ messaging topology objects + - apiGroups: + - rabbitmq.com + resources: + - rabbitmqclusters + verbs: + - "get" + - "list" + - "watch" + - apiGroups: + - rabbitmq.com + resources: + - bindings + - queues + - exchanges + verbs: + - create + - delete + - get + - list + - patch + - update + - watch + - apiGroups: + - rabbitmq.com + resources: + - bindings/status + - queues/status + - exchanges/status + verbs: + - get + # for Kafka eventing source + - apiGroups: + - keda.sh + resources: + - scaledobjects + - scaledobjects/finalizers + - scaledobjects/status + - triggerauthentications + - triggerauthentications/status + verbs: + - get + - list + - watch + - update + - create + - delete + # Internal APIs + - apiGroups: + - "internal.kafka.eventing.knative.dev" + resources: + - "consumers" + - "consumers/status" + - "consumergroups" + - "consumergroups/status" + verbs: + - create + - get + - list + - watch + - patch + - update + - delete + - apiGroups: + - "internal.kafka.eventing.knative.dev" + resources: + - "consumers/finalizers" + - "consumergroups/finalizers" + verbs: + - update + - delete + - apiGroups: + - apps + resources: + - statefulsets/scale + verbs: + - get + - list + - watch + - update + - patch + - create + - delete + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterrolebindings + verbs: + - watch + - apiGroups: + - "*" + resources: + - configmaps + verbs: + - delete + - apiGroups: + - "*" + resources: + - configmaps + - services + verbs: + - get + - list + - watch + - update + - create + - delete + - apiGroups: + - "*" + resources: + - pods + verbs: + - list + - update + - get + - watch + - apiGroups: + - "*" + resources: + - pods/finalizers + verbs: + - get + - list + - create + - update + - delete + - apiGroups: + - "*" + resources: + - events + verbs: + - patch + - create + - apiGroups: + - "*" + 
resources: + - secrets + verbs: + - get + - list + - watch + - update + - create + - delete + - apiGroups: + - "*" + resources: + - nodes + verbs: + - get + - list + - watch + - apiGroups: + - "*" + resources: + - serviceaccounts + verbs: + - get + - list + - watch + - update + - create + - delete + - apiGroups: + - "*" + resources: + - configmaps + resourceNames: + - kafka-channel-config + verbs: + - patch + - apiGroups: + - "*" + resources: + - horizontalpodautoscalers + resourceNames: + - kafka-webhook + verbs: + - delete + - apiGroups: + - "*" + resources: + - leases + verbs: + - delete + - apiGroups: + - "*" + resources: + - poddisruptionbudgets + resourceNames: + - kafka-webhook + verbs: + - delete + - apiGroups: + - "*" + resources: + - services + verbs: + - patch + - apiGroups: + - "apps" + resources: + - deployments + verbs: + - deletecollection + # Eventing TLS + - apiGroups: + - "cert-manager.io" + resources: + - certificates + - issuers + - clusterissuers + verbs: + - create + - delete + - update + - list + - get + - watch + - apiGroups: + - "trust.cert-manager.io" + resources: + - bundles + verbs: + - create + - delete + - update + - list + - get + - watch +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: knative-operator-webhook + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +rules: + # For watching logging configuration and getting certs. + - apiGroups: + - "" + resources: + - "configmaps" + verbs: + - "get" + - "list" + - "watch" + # For manipulating certs into secrets. 
+ - apiGroups: + - "" + resources: + - "namespaces" + verbs: + - "get" + - "create" + - "update" + - "list" + - "watch" + - "patch" + # finalizers are needed for the owner reference of the webhook + - apiGroups: + - "" + resources: + - "namespaces/finalizers" + verbs: + - "update" + # For getting our Deployment so we can decorate with ownerref. + - apiGroups: + - "apps" + resources: + - "deployments" + verbs: + - "get" + - apiGroups: + - "apps" + resources: + - "deployments/finalizers" + verbs: + - update + # For actually registering our webhook. + - apiGroups: + - "admissionregistration.k8s.io" + resources: + - "mutatingwebhookconfigurations" + - "validatingwebhookconfigurations" + verbs: &everything + - "get" + - "list" + - "create" + - "update" + - "delete" + - "patch" + - "watch" + # For leader election + - apiGroups: + - "coordination.k8s.io" + resources: + - "leases" + verbs: *everything + # Necessary for conversion webhook. These are copied from the serving + # TODO: Do we really need all these permissions? 
+ - apiGroups: ["apiextensions.k8s.io"] + resources: ["customresourcedefinitions"] + verbs: ["get", "list", "create", "update", "delete", "patch", "watch"] +--- +# Source: dataplane/charts/fluentbit/templates/clusterrolebinding.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: release-name-fluentbit + labels: + helm.sh/chart: fluentbit-0.48.9 + app.kubernetes.io/name: fluentbit + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "3.2.8" + app.kubernetes.io/managed-by: Helm +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: release-name-fluentbit +subjects: + - kind: ServiceAccount + name: union-system + namespace: union +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# TODO: Consider restriction of non-aggregated role to knativeservings namespaces. 
+apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: knative-serving-operator + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-serving-operator +subjects: + - kind: ServiceAccount + name: knative-operator + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# TODO: Consider restriction of non-aggregated role to knativeeventing namespaces. +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: knative-eventing-operator + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-eventing-operator +subjects: + - kind: ServiceAccount + name: knative-operator + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: operator-webhook + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-operator-webhook +subjects: + - kind: ServiceAccount + name: operator-webhook + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: knative-serving-operator-aggregated + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-serving-operator-aggregated +subjects: + - kind: ServiceAccount + name: knative-operator + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: knative-serving-operator-aggregated-stable + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-serving-operator-aggregated-stable +subjects: + - kind: ServiceAccount + name: knative-operator + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: knative-eventing-operator-aggregated + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-eventing-operator-aggregated +subjects: + - kind: ServiceAccount + name: knative-operator + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: knative-eventing-operator-aggregated-stable + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-eventing-operator-aggregated-stable +subjects: + - kind: ServiceAccount + name: knative-operator + 
namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + namespace: "union" + name: knative-operator-webhook + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +rules: + # For manipulating certs into secrets. + - apiGroups: + - "" + resources: + - "secrets" + verbs: + - "get" + - "create" + - "update" + - "list" + - "watch" + - "patch" +--- +# Source: dataplane/templates/monitoring/prometheus-rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: union-operator-prometheus-rbac + namespace: union + labels: + release: release-name +rules: + # Prometheus server scrape permissions + - apiGroups: [""] + resources: + - nodes + - nodes/proxy + - nodes/metrics + - services + - endpoints + - pods + - ingresses + - configmaps + verbs: + - get + - list + - watch + - apiGroups: ["extensions", "networking.k8s.io"] + resources: + - ingresses/status + - ingresses + verbs: + - get + - list + - watch + - apiGroups: ["discovery.k8s.io"] + resources: + - endpointslices + verbs: + - get + - list + - watch + # kube-state-metrics permissions (rbac.create=false on the subchart; managed here instead) + - apiGroups: ["certificates.k8s.io"] + resources: ["certificatesigningrequests"] + verbs: ["list", "watch"] + - apiGroups: [""] + 
resources: ["configmaps"] + verbs: ["list", "watch"] + - apiGroups: ["batch"] + resources: ["cronjobs"] + verbs: ["list", "watch"] + - apiGroups: ["extensions", "apps"] + resources: ["daemonsets"] + verbs: ["list", "watch"] + - apiGroups: ["extensions", "apps"] + resources: ["deployments"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["endpoints"] + verbs: ["list", "watch"] + - apiGroups: ["discovery.k8s.io"] + resources: ["endpointslices"] + verbs: ["list", "watch"] + - apiGroups: ["autoscaling"] + resources: ["horizontalpodautoscalers"] + verbs: ["list", "watch"] + - apiGroups: ["extensions", "networking.k8s.io"] + resources: ["ingresses"] + verbs: ["list", "watch"] + - apiGroups: ["batch"] + resources: ["jobs"] + verbs: ["list", "watch"] + - apiGroups: ["coordination.k8s.io"] + resources: ["leases"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["limitranges"] + verbs: ["list", "watch"] + - apiGroups: ["admissionregistration.k8s.io"] + resources: ["mutatingwebhookconfigurations"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["namespaces"] + verbs: ["list", "watch"] + - apiGroups: ["networking.k8s.io"] + resources: ["networkpolicies"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["persistentvolumeclaims"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["persistentvolumes"] + verbs: ["list", "watch"] + - apiGroups: ["policy"] + resources: ["poddisruptionbudgets"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["pods"] + verbs: ["list", "watch"] + - apiGroups: ["extensions", "apps"] + resources: ["replicasets"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["replicationcontrollers"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["resourcequotas"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["secrets"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["services"] + verbs: ["list", "watch"] + - apiGroups: ["apps"] + resources: 
["statefulsets"] + verbs: ["list", "watch"] + - apiGroups: ["storage.k8s.io"] + resources: ["storageclasses"] + verbs: ["list", "watch"] + - apiGroups: ["admissionregistration.k8s.io"] + resources: ["validatingwebhookconfigurations"] + verbs: ["list", "watch"] + - apiGroups: ["storage.k8s.io"] + resources: ["volumeattachments"] + verbs: ["list", "watch"] +--- +# Source: dataplane/templates/nodeexecutor/serviceaccount.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: union-executor + labels: + app: executor +rules: +# Allow RO access to PODS +- apiGroups: + - "" + resources: + - pods + verbs: + - get + - list + - watch +# Allow Event recording access +- apiGroups: + - "" + resources: + - events + verbs: + - create + - update + - delete + - patch +# Allow Access All plugin objects +- apiGroups: + - '*' + resources: + - '*' + verbs: + - get + - list + - watch + - create + - update + - delete + - patch +# Allow Access to CRD +- apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - get + - list + - watch + - create + - delete + - update +--- +# Source: dataplane/templates/operator/serviceaccount-proxy-secret.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: union-system-secret + namespace: union + labels: + app.kubernetes.io/name: operator-proxy + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +rules: + - apiGroups: + - '*' + resources: + - secrets + verbs: + - get + - list + - create + - update + - delete +--- +# Source: dataplane/templates/operator/serviceaccount-proxy.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: proxy-system + labels: + app.kubernetes.io/name: operator-proxy + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +rules: + - apiGroups: + - '*' + resources: + - events + - 
flyteworkflows + - pods/log + - pods + - rayjobs + - resourcequotas + verbs: + - get + - list + - watch +--- +# Source: dataplane/templates/operator/serviceaccount.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: operator-system + labels: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +rules: + - apiGroups: + - '*' + resources: + - secrets + - deployments + verbs: + - get + - list + - watch + - create + - update + - apiGroups: + - flyte.lyft.com + resources: + - flyteworkflows + - flyteworkflows/finalizers + verbs: + - get + - list + - watch + - create + - update + - delete + - patch + - post + - deletecollection + - apiGroups: + - '*' + resources: + - resourcequotas + - pods + - configmaps + - podtemplates + - secrets + - namespaces + - nodes + verbs: + - get + - list + - watch + - create + - update + - delete + - apiGroups: + - serving.knative.dev + resources: + - revisions + - configurations + - services + verbs: + - get + - list + - watch + - create + - update + - delete +--- +# Source: dataplane/templates/webhook/serviceaccount.yaml +kind: Role +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: union-webhook-role + namespace: union +rules: + - apiGroups: + - "*" + resources: + - mutatingwebhookconfigurations + - secrets + - pods + - replicasets/finalizers + verbs: + - get + - create + - update + - patch +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + namespace: "union" + name: operator-webhook + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +subjects: + - kind: ServiceAccount + name: operator-webhook + namespace: "union" +roleRef: + kind: Role + name: knative-operator-webhook + apiGroup: rbac.authorization.k8s.io +--- +# Source: dataplane/templates/monitoring/prometheus-rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: union-operator-prometheus-rbac + namespace: union + labels: + release: release-name +subjects: + - kind: ServiceAccount + name: union-operator-prometheus + namespace: union +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: union-operator-prometheus-rbac +--- +# Source: dataplane/templates/monitoring/prometheus-rbac.yaml +# kube-state-metrics service account RoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: release-name-prometheus-kube-state-metrics + namespace: union + labels: + release: release-name +subjects: + - kind: ServiceAccount + name: release-name-prometheus-kube-state-metrics + namespace: union +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: union-operator-prometheus-rbac +--- +# Source: dataplane/templates/nodeexecutor/serviceaccount.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: union-executor + labels: + app: executor +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: union-executor 
+subjects: +- kind: ServiceAccount + name: union-system + namespace: union +--- +# Source: dataplane/templates/operator/serviceaccount-proxy-secret.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: union-system-secret + namespace: union + labels: + app.kubernetes.io/name: operator-proxy + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: union-system-secret +subjects: + - kind: ServiceAccount + name: union-system + namespace: union +--- +# Source: dataplane/templates/operator/serviceaccount-proxy.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: proxy-system + labels: + app.kubernetes.io/name: operator-proxy + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: proxy-system +subjects: + - kind: ServiceAccount + name: union-system + namespace: union +--- +# Source: dataplane/templates/operator/serviceaccount.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: operator-system + labels: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: operator-system +subjects: + - kind: ServiceAccount + name: union-system + namespace: union +--- +# Source: dataplane/templates/webhook/serviceaccount.yaml +# Create a binding from Role -> ServiceAccount +kind: RoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: union-webhook-binding + namespace: union +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: union-webhook-role +subjects: + - kind: ServiceAccount + name: union-system + 
namespace: union +--- +# Source: dataplane/charts/fluentbit/templates/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: release-name-fluentbit + namespace: union + labels: + helm.sh/chart: fluentbit-0.48.9 + app.kubernetes.io/name: fluentbit + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "3.2.8" + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - port: 2020 + targetPort: http + protocol: TCP + name: http + selector: + app.kubernetes.io/name: fluentbit + app.kubernetes.io/instance: release-name +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v1 +kind: Service +metadata: + labels: + role: operator-webhook + app.kubernetes.io/component: operator-webhook + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator + name: operator-webhook + namespace: "union" +spec: + ports: + # Define metrics and profiling for them to be accessible within service meshes. 
+ - name: http-metrics + port: 9090 + targetPort: 9090 + - name: http-profiling + port: 8008 + targetPort: 8008 + - name: https-webhook + port: 443 + targetPort: 8443 + selector: + role: operator-webhook +--- +# Source: dataplane/charts/prometheus/charts/kube-state-metrics/templates/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: release-name-kube-state-metrics + namespace: union + labels: + helm.sh/chart: kube-state-metrics-5.25.1 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: metrics + app.kubernetes.io/part-of: kube-state-metrics + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2.13.0" + annotations: + prometheus.io/scrape: 'true' +spec: + type: "ClusterIP" + ports: + - name: "http" + protocol: TCP + port: 8080 + targetPort: 8080 + + selector: + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/instance: release-name +--- +# Source: dataplane/charts/prometheus/templates/service.yaml +apiVersion: v1 +kind: Service +metadata: + labels: + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: prometheus + name: union-operator-prometheus + namespace: union +spec: + ports: + - name: http + port: 80 + protocol: TCP + targetPort: 9090 + selector: + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus + app.kubernetes.io/instance: release-name + sessionAffinity: None + type: "ClusterIP" +--- +# Source: dataplane/templates/clusterresourcesync/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: syncresources + labels: + platform.union.ai/prometheus-group: "union-services" + app.kubernetes.io/name: clusterresourcesync + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + 
app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - port: 10254 + targetPort: debug + protocol: TCP + name: debug + selector: + app.kubernetes.io/name: clusterresourcesync + app.kubernetes.io/instance: release-name +--- +# Source: dataplane/templates/flyteconnector/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: flyteconnector + namespace: union + labels: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + app.kubernetes.io/managed-by: Helm +spec: + clusterIP: None + ports: + - name: grpc + port: 8000 + protocol: TCP + targetPort: grpc + - name: metric + port: 9090 + protocol: TCP + targetPort: metric + selector: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name +--- +# Source: dataplane/templates/imagebuilder/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: union-operator-buildkit + labels: + app.kubernetes.io/name: imagebuilder-buildkit + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - port: 1234 + targetPort: tcp + protocol: TCP + name: tcp + selector: + app.kubernetes.io/name: imagebuilder-buildkit + app.kubernetes.io/instance: release-name +--- +# Source: dataplane/templates/nodeexecutor/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: union-operator-executor + labels: + platform.union.ai/prometheus-group: "union-services" + app: executor +spec: + type: ClusterIP + ports: + - port: 15605 + targetPort: 15605 + protocol: TCP + name: fasttask + - port: 10254 + targetPort: debug + protocol: TCP + name: debug + selector: + app: executor +--- +# Source: dataplane/templates/operator/service-proxy.yaml +apiVersion: v1 +kind: Service +metadata: + name: union-operator-proxy + labels: + platform.union.ai/prometheus-group: "union-services" + app.kubernetes.io/name: operator-proxy + app.kubernetes.io/instance: release-name + 
platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - port: 8080 + targetPort: http + protocol: TCP + name: http + - port: 10254 + targetPort: debug + protocol: TCP + name: debug + selector: + app.kubernetes.io/name: operator-proxy + app.kubernetes.io/instance: release-name +--- +# Source: dataplane/templates/operator/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: union-operator + labels: + platform.union.ai/prometheus-group: "union-services" + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - port: 10254 + targetPort: debug + protocol: TCP + name: debug + selector: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name +--- +# Source: dataplane/templates/webhook/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: union-pod-webhook + namespace: union + labels: + app.kubernetes.io/name: union-pod-webhook + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm + annotations: + projectcontour.io/upstream-protocol.h2c: grpc +spec: + selector: + app.kubernetes.io/name: union-pod-webhook + app.kubernetes.io/instance: release-name + ports: + - name: https + protocol: TCP + port: 443 + targetPort: 9443 + - name: debug + protocol: TCP + port: 10254 + targetPort: 10254 +--- +# Source: dataplane/templates/webhook/service.yaml +# Headless Service for cache invalidation — resolves to all pod IPs so that +# we can fan out invalidation requests to every webhook replica. 
+apiVersion: v1 +kind: Service +metadata: + name: union-pod-webhook-headless + namespace: union + labels: + app.kubernetes.io/name: union-pod-webhook + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +spec: + clusterIP: None + selector: + app.kubernetes.io/name: union-pod-webhook + app.kubernetes.io/instance: release-name + ports: + - name: cache-internal + protocol: TCP + port: 9443 + targetPort: 9443 +--- +# Source: dataplane/charts/fluentbit/templates/daemonset.yaml +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: release-name-fluentbit + namespace: union + labels: + helm.sh/chart: fluentbit-0.48.9 + app.kubernetes.io/name: fluentbit + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "3.2.8" + app.kubernetes.io/managed-by: Helm +spec: + selector: + matchLabels: + app.kubernetes.io/name: fluentbit + app.kubernetes.io/instance: release-name + template: + metadata: + labels: + app.kubernetes.io/name: fluentbit + app.kubernetes.io/instance: release-name + annotations: + checksum/config: e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 + spec: + serviceAccountName: union-system + hostNetwork: false + dnsPolicy: ClusterFirst + containers: + - name: fluentbit + image: "cr.fluentbit.io/fluent/fluent-bit:3.2.8" + imagePullPolicy: IfNotPresent + command: + - /fluent-bit/bin/fluent-bit + args: + - --workdir=/fluent-bit/etc + - --config=/fluent-bit/etc/conf/fluent-bit.conf + ports: + - name: http + containerPort: 2020 + protocol: TCP + livenessProbe: + httpGet: + path: / + port: http + readinessProbe: + httpGet: + path: /api/v1/health + port: http + volumeMounts: + - name: config + mountPath: /fluent-bit/etc/conf + - mountPath: /var/log + name: varlog + - mountPath: /var/lib/docker/containers + name: varlibdockercontainers + readOnly: true + - mountPath: /etc/machine-id + name: etcmachineid + readOnly: true + volumes: + - name: config + configMap: + 
name: fluentbit-system + - hostPath: + path: /var/log + name: varlog + - hostPath: + path: /var/lib/docker/containers + name: varlibdockercontainers + - hostPath: + path: /etc/machine-id + type: File + name: etcmachineid + tolerations: + - operator: Exists +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: operator-webhook + namespace: "union" + labels: + app.kubernetes.io/component: operator-webhook + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +spec: + selector: + matchLabels: + app: operator-webhook + role: operator-webhook + template: + metadata: + labels: + app: operator-webhook + role: operator-webhook + app.kubernetes.io/component: operator-webhook + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator + sidecar.istio.io/inject: "false" + spec: + # To avoid node becoming SPOF, spread our replicas to different nodes. + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - podAffinityTerm: + labelSelector: + matchLabels: + app: operator-webhook + topologyKey: kubernetes.io/hostname + weight: 100 + serviceAccountName: operator-webhook + containers: + - name: operator-webhook + # This is the Go import path for the binary that is containerized + # and substituted here. 
+ image: gcr.io/knative-releases/knative.dev/operator/cmd/webhook@sha256:d3a7a3304629ccfcaacec620b50736e0b4c902a6328b83369b6cbaf7e94677c9 + resources: + requests: + cpu: 100m + memory: 100Mi + limits: + cpu: 500m + memory: 500Mi + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: SYSTEM_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: CONFIG_LOGGING_NAME + value: config-logging + - name: CONFIG_OBSERVABILITY_NAME + value: config-observability + - name: WEBHOOK_NAME + value: operator-webhook + - name: WEBHOOK_PORT + value: "8443" + - name: WEBHOOK_SECRET_NAME + value: operator-webhook-certs + - name: METRICS_DOMAIN + value: knative.dev/operator + - name: KUBERNETES_MIN_VERSION + value: "" + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + runAsNonRoot: true + capabilities: + drop: + - ALL + ports: + - name: metrics + containerPort: 9090 + - name: profiling + containerPort: 8008 + - name: https-webhook + containerPort: 8443 + readinessProbe: + periodSeconds: 1 + httpGet: + scheme: HTTPS + port: 8443 + httpHeaders: + - name: k-kubelet-probe + value: "webhook" + livenessProbe: + periodSeconds: 1 + httpGet: + scheme: HTTPS + port: 8443 + httpHeaders: + - name: k-kubelet-probe + value: "webhook" + failureThreshold: 6 + initialDelaySeconds: 120 + # Our webhook should gracefully terminate by lame ducking first, set this to a sufficiently + # high value that we respect whatever value it has configured for the lame duck grace period. + terminationGracePeriodSeconds: 300 +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: knative-operator + namespace: "union" + labels: + app.kubernetes.io/name: knative-operator + app.kubernetes.io/version: "1.16.0" +spec: + replicas: 1 + selector: + matchLabels: + name: knative-operator + template: + metadata: + labels: + name: knative-operator + app.kubernetes.io/name: knative-operator + app.kubernetes.io/version: "1.16.0" + sidecar.istio.io/inject: "false" + spec: + serviceAccountName: knative-operator + containers: + - name: knative-operator + image: gcr.io/knative-releases/knative.dev/operator/cmd/operator@sha256:0b5a3532417f9c8e7b6044e23f6f67ad932a74a40a4092ad965b6f173b2fd887 + resources: + requests: + cpu: 100m + memory: 100Mi + limits: + cpu: 1000m + memory: 1000Mi + imagePullPolicy: IfNotPresent + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: SYSTEM_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: METRICS_DOMAIN + value: knative.dev/operator + - name: CONFIG_LOGGING_NAME + value: config-logging + - name: CONFIG_OBSERVABILITY_NAME + value: config-observability + - name: KUBERNETES_MIN_VERSION + value: "" + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + runAsNonRoot: true + capabilities: + drop: + - ALL + ports: + - name: metrics + containerPort: 9090 +--- +# Source: dataplane/charts/prometheus/charts/kube-state-metrics/templates/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: release-name-kube-state-metrics + namespace: union + labels: + 
helm.sh/chart: kube-state-metrics-5.25.1 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: metrics + app.kubernetes.io/part-of: kube-state-metrics + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2.13.0" +spec: + selector: + matchLabels: + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/instance: release-name + replicas: 1 + strategy: + type: RollingUpdate + revisionHistoryLimit: 10 + template: + metadata: + labels: + helm.sh/chart: kube-state-metrics-5.25.1 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: metrics + app.kubernetes.io/part-of: kube-state-metrics + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2.13.0" + spec: + automountServiceAccountToken: true + hostNetwork: false + serviceAccountName: release-name-kube-state-metrics + securityContext: + fsGroup: 65534 + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + seccompProfile: + type: RuntimeDefault + containers: + - name: kube-state-metrics + args: + - --port=8080 + - --resources=certificatesigningrequests,configmaps,cronjobs,daemonsets,deployments,endpoints,horizontalpodautoscalers,ingresses,jobs,leases,limitranges,mutatingwebhookconfigurations,namespaces,networkpolicies,nodes,persistentvolumeclaims,persistentvolumes,poddisruptionbudgets,pods,replicasets,replicationcontrollers,resourcequotas,secrets,services,statefulsets,storageclasses,validatingwebhookconfigurations,volumeattachments + - --namespaces=union + imagePullPolicy: IfNotPresent + image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.13.0 + ports: + - containerPort: 8080 + name: "http" + livenessProbe: + failureThreshold: 3 + httpGet: + httpHeaders: + path: /livez + port: 8080 + scheme: HTTP + initialDelaySeconds: 5 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 5 + readinessProbe: + failureThreshold: 3 + httpGet: + 
httpHeaders: + path: /readyz + port: 8080 + scheme: HTTP + initialDelaySeconds: 5 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 5 + resources: + {} + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true +--- +# Source: dataplane/charts/prometheus/templates/deploy.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: prometheus + name: union-operator-prometheus + namespace: union +spec: + selector: + matchLabels: + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus + app.kubernetes.io/instance: release-name + replicas: 1 + revisionHistoryLimit: 10 + strategy: + type: Recreate + rollingUpdate: null + template: + metadata: + labels: + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: prometheus + spec: + enableServiceLinks: true + serviceAccountName: union-operator-prometheus + containers: + - name: prometheus-server-configmap-reload + image: "quay.io/prometheus-operator/prometheus-config-reloader:v0.76.0" + imagePullPolicy: "IfNotPresent" + args: + - --watched-dir=/etc/config + - --listen-address=0.0.0.0:8080 + - --reload-url=http://127.0.0.1:9090/prometheus/-/reload + ports: + - containerPort: 8080 + name: metrics + livenessProbe: + httpGet: + path: /healthz + port: metrics + scheme: HTTP + initialDelaySeconds: 2 + periodSeconds: 10 + readinessProbe: + httpGet: + path: /healthz + port: metrics + scheme: HTTP + periodSeconds: 10 + volumeMounts: + - name: config-volume + mountPath: /etc/config + readOnly: 
true + + - name: prometheus-server + image: "quay.io/prometheus/prometheus:v2.54.1" + imagePullPolicy: "IfNotPresent" + args: + - --storage.tsdb.retention.time=3d + - --config.file=/etc/config/prometheus.yml + - --storage.tsdb.path=/data + - --web.console.libraries=/etc/prometheus/console_libraries + - --web.console.templates=/etc/prometheus/consoles + - --web.enable-lifecycle + - --web.route-prefix=/prometheus + - --web.external-url=/prometheus/ + ports: + - containerPort: 9090 + readinessProbe: + httpGet: + path: /prometheus/-/ready + port: 9090 + scheme: HTTP + initialDelaySeconds: 30 + periodSeconds: 5 + timeoutSeconds: 4 + failureThreshold: 3 + successThreshold: 1 + livenessProbe: + httpGet: + path: /prometheus/-/healthy + port: 9090 + scheme: HTTP + initialDelaySeconds: 30 + periodSeconds: 15 + timeoutSeconds: 10 + failureThreshold: 3 + successThreshold: 1 + resources: + limits: + cpu: "3" + memory: 3500Mi + requests: + cpu: "1" + memory: 1Gi + volumeMounts: + - name: config-volume + mountPath: /etc/config + - name: storage-volume + mountPath: /data + subPath: "" + - name: server-recording-rules + mountPath: /etc/config/recording + subPath: + readOnly: + dnsPolicy: ClusterFirst + securityContext: + fsGroup: 65534 + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + terminationGracePeriodSeconds: 300 + volumes: + - name: config-volume + configMap: + name: union-operator-prometheus + - name: server-recording-rules + configMap: + name: union-recording-rules + - name: storage-volume + emptyDir: + sizeLimit: 10Gi +--- +# Source: dataplane/templates/flyteconnector/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: flyteconnector + namespace: union + labels: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + app.kubernetes.io/managed-by: Helm +spec: + replicas: 2 + selector: + matchLabels: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + template: + metadata: + 
annotations: + labels: + platform.union.ai/zone: "dataplane" + + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + app.kubernetes.io/managed-by: Helm + spec: + containers: + - command: + - c0 + image: "ghcr.io/flyteorg/flyte-connectors:py3.13-2.0.0b50.dev3-g695bb1db3.d20260122" + imagePullPolicy: "IfNotPresent" + name: flyteconnector + readinessProbe: + failureThreshold: 3 + httpGet: + path: / + port: 9090 + scheme: HTTP + initialDelaySeconds: 5 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 5 + env: + - name: AIRFLOW_CONN_GOOGLE_CLOUD_DEFAULT + value: "google-cloud-platform://" + - name: LOG_LEVEL + value: "20" + ports: + - containerPort: 8000 + name: grpc + - containerPort: 9090 + name: metric + resources: + limits: + cpu: "1.5" + ephemeral-storage: 100Mi + memory: 1500Mi + requests: + cpu: "1" + ephemeral-storage: 100Mi + memory: 1000Mi + serviceAccountName: flyteconnector +--- +# Source: dataplane/templates/imagebuilder/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: union-operator-buildkit + labels: + app.kubernetes.io/name: imagebuilder-buildkit + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +spec: + strategy: + type: Recreate + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: imagebuilder-buildkit + app.kubernetes.io/instance: release-name + template: + metadata: + labels: + platform.union.ai/zone: "dataplane" + app.kubernetes.io/name: imagebuilder-buildkit + app.kubernetes.io/instance: release-name + spec: + serviceAccountName: "union-system" + containers: + - name: "buildkit" + image: "docker.io/moby/buildkit:buildx-stable-1" + imagePullPolicy: IfNotPresent + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: GOMEMLIMIT + valueFrom: + resourceFieldRef: + divisor: 1 + 
resource: limits.memory + - name: GOMAXPROCS + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.cpu + - name: CLUSTER_NAME + valueFrom: + secretKeyRef: + name: operator-cluster-name + key: cluster_name + - name: DEPLOYMENT_NAME + value: operator + - name: PROXY_SERVICE_URL + value: http://union-operator-proxy:8080 + - name: PROMETHEUS_SERVICE_URL + value: http://union-operator-prometheus:80 + - name: KNATIVE_PROXY_SERVICE_URL + value: http://kourier-internal + volumeMounts: + - mountPath: /etc/buildkit + name: buildkit-config + args: + - --config + - /etc/buildkit/buildkitd.toml + - --addr + - unix:///run//buildkit/buildkitd.sock + - --addr + - tcp://0.0.0.0:1234 + ports: + - name: tcp + containerPort: 1234 + protocol: TCP + readinessProbe: + exec: + command: + - buildctl + - debug + - workers + initialDelaySeconds: 5 + periodSeconds: 30 + livenessProbe: + exec: + command: + - buildctl + - debug + - workers + initialDelaySeconds: 5 + periodSeconds: 30 + securityContext: + privileged: true + resources: + requests: + cpu: 4 + ephemeral-storage: 50Gi + memory: 4Gi + volumes: + - configMap: + name: union-operator-buildkit + name: buildkit-config + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + app.kubernetes.io/name: imagebuilder-buildkit + app.kubernetes.io/instance: release-name + topologyKey: "kubernetes.io/hostname" +--- +# Source: dataplane/templates/nodeexecutor/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: executor + namespace: union + labels: + app: executor +spec: + replicas: 1 + selector: + matchLabels: + app: executor + template: + metadata: + annotations: + configChecksum: "cdc54eefe9995eb3d202a14f0bd36b5ec1cd7efa44d0be966cfcf30d938b64a" + + labels: + platform.union.ai/zone: "dataplane" + + app: executor + app.kubernetes.io/instance: 'release-name' + app.kubernetes.io/name: executor + spec: + securityContext: + fsGroup: 1337 + serviceAccountName: 
union-system + volumes: + - name: config-volume + configMap: + name: executor + - name: secret-volume + secret: + secretName: union-secret-auth + - name: auth + secret: + secretName: union-secret-auth + containers: + - name: executor + image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" + imagePullPolicy: IfNotPresent + command: + - executorv2 + - serve + - --config + - /etc/config/*.yaml + ports: + - name: http + containerPort: 8089 + protocol: TCP + - name: debug + containerPort: 10254 + protocol: TCP + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: GOMEMLIMIT + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.memory + - name: GOMAXPROCS + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.cpu + - name: CLUSTER_NAME + valueFrom: + secretKeyRef: + name: operator-cluster-name + key: cluster_name + - name: DEPLOYMENT_NAME + value: operator + - name: PROXY_SERVICE_URL + value: http://union-operator-proxy:8080 + - name: PROMETHEUS_SERVICE_URL + value: http://union-operator-prometheus:80 + - name: KNATIVE_PROXY_SERVICE_URL + value: http://kourier-internal + resources: + limits: + cpu: "4" + memory: "8Gi" + requests: + cpu: "1" + memory: "1Gi" + volumeMounts: + - name: config-volume + mountPath: /etc/config + - name: secret-volume + mountPath: /etc/union/secret + - name: auth + mountPath: /etc/secrets/ +--- +# Source: dataplane/templates/operator/deployment-proxy.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: union-operator-proxy + namespace: union + labels: + app.kubernetes.io/name: operator-proxy + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: operator-proxy + app.kubernetes.io/instance: release-name + template: + metadata: + annotations: + 
configChecksum: "c6d4293594a69380b3811417ec589d33d98a6e913ebfe236e370a497af7de76" + + labels: + + app.kubernetes.io/name: operator-proxy + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm + spec: + volumes: + - name: config-volume + projected: + sources: + - configMap: + name: union-operator + - name: secret-volume + secret: + secretName: union-secret-auth + serviceAccountName: union-system + securityContext: + {} + containers: + - name: operator-proxy + securityContext: + {} + image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" + imagePullPolicy: IfNotPresent + terminationMessagePolicy: FallbackToLogsOnError + resources: + limits: + cpu: "3" + memory: 3Gi + requests: + cpu: 500m + memory: 500Mi + volumeMounts: + - mountPath: /etc/union/config + name: config-volume + - mountPath: /etc/union/secret + name: secret-volume + args: + - operator + - proxy + - --config + - /etc/union/config/*.yaml + ports: + - name: http + containerPort: 8089 + protocol: TCP + - name: connect + containerPort: 8080 + protocol: TCP + - name: grpc + containerPort: 8081 + protocol: TCP + - name: debug + containerPort: 10254 + protocol: TCP + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: GOMEMLIMIT + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.memory + - name: GOMAXPROCS + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.cpu + - name: CLUSTER_NAME + valueFrom: + secretKeyRef: + name: operator-cluster-name + key: cluster_name + - name: DEPLOYMENT_NAME + value: operator + - name: PROXY_SERVICE_URL + value: http://union-operator-proxy:8080 + - name: PROMETHEUS_SERVICE_URL + value: http://union-operator-prometheus:80 + - name: KNATIVE_PROXY_SERVICE_URL + value: http://kourier-internal + - name: "tunnel" + securityContext: + {} + image: 
"public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" + imagePullPolicy: IfNotPresent + args: + - cloudflared + - tunnel + - --no-autoupdate + - run + - --token + - $(TUNNEL_TOKEN) + env: + - name: TUNNEL_TOKEN + valueFrom: + secretKeyRef: + name: union-secret-auth + key: tunnel_token + optional: true + resources: + limits: + cpu: "3" + memory: 3Gi + requests: + cpu: 500m + memory: 500Mi +--- +# Source: dataplane/templates/operator/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: union-operator + labels: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +spec: + selector: + matchLabels: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + template: + metadata: + annotations: + configChecksum: "c6d4293594a69380b3811417ec589d33d98a6e913ebfe236e370a497af7de76" + + labels: + + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm + spec: + serviceAccountName: union-system + securityContext: + {} + volumes: + - name: config-volume + configMap: + name: union-operator + - name: secret-volume + secret: + secretName: union-secret-auth + containers: + - name: operator + securityContext: + {} + image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" + imagePullPolicy: IfNotPresent + terminationMessagePolicy: FallbackToLogsOnError + resources: + limits: + cpu: "2" + memory: 3Gi + requests: + cpu: "1" + memory: 1Gi + volumeMounts: + - mountPath: /etc/union/config + name: config-volume + - mountPath: /etc/union/secret + name: secret-volume + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: GOMEMLIMIT + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.memory + - name: 
GOMAXPROCS + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.cpu + - name: CLUSTER_NAME + valueFrom: + secretKeyRef: + name: operator-cluster-name + key: cluster_name + - name: DEPLOYMENT_NAME + value: operator + - name: PROXY_SERVICE_URL + value: http://union-operator-proxy:8080 + - name: PROMETHEUS_SERVICE_URL + value: http://union-operator-prometheus:80 + - name: KNATIVE_PROXY_SERVICE_URL + value: http://kourier-internal + args: + - operator + - serve + - --config + - /etc/union/config/*.yaml + - --operator.clusterId.name + - "$(CLUSTER_NAME)" + - --operator.tunnel.k8sSecretName + - union-secret-auth + ports: + - name: grpc + containerPort: 8080 + protocol: TCP + - name: http + containerPort: 8089 + protocol: TCP + - name: debug + containerPort: 10254 + protocol: TCP +--- +# Source: dataplane/templates/webhook/deployment.yaml +# Webhook deployment +# Create the actual deployment +apiVersion: apps/v1 +kind: Deployment +metadata: + name: union-pod-webhook + namespace: union + labels: + app.kubernetes.io/name: union-pod-webhook + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: union-pod-webhook + app.kubernetes.io/instance: release-name + template: + metadata: + labels: + platform.union.ai/zone: "dataplane" + + app.kubernetes.io/name: union-pod-webhook + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm + annotations: + configChecksum: "77958aee0464eee4cb43c70036cf5ffa89f47f58c908845b9eeaecc7e32bba4" + + spec: + securityContext: + fsGroup: 65534 + fsGroupChangePolicy: Always + runAsNonRoot: true + runAsUser: 1001 + seLinuxOptions: null + serviceAccountName: union-system + containers: + - name: webhook + image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" + imagePullPolicy: "IfNotPresent" + command: + - flytepropeller + 
args: + - webhook + - --config + - /etc/flyte/config/*.yaml + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: GOMEMLIMIT + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.memory + - name: GOMAXPROCS + valueFrom: + resourceFieldRef: + divisor: 1 + resource: limits.cpu + - name: CLUSTER_NAME + valueFrom: + secretKeyRef: + name: operator-cluster-name + key: cluster_name + - name: DEPLOYMENT_NAME + value: operator + - name: PROXY_SERVICE_URL + value: http://union-operator-proxy:8080 + - name: PROMETHEUS_SERVICE_URL + value: http://union-operator-prometheus:80 + - name: KNATIVE_PROXY_SERVICE_URL + value: http://kourier-internal + ports: + - containerPort: 9443 + - containerPort: 10254 + resources: + limits: + cpu: 1 + ephemeral-storage: 500Mi + memory: 500Mi + requests: + cpu: 200m + ephemeral-storage: 500Mi + memory: 500Mi + volumeMounts: + - name: config-volume + mountPath: /etc/flyte/config + readOnly: true + - name: webhook-certs + mountPath: /etc/webhook/certs + readOnly: true + volumes: + - name: config-volume + configMap: + name: union-pod-webhook-config + - name: webhook-certs + secret: + secretName: union-pod-webhook +--- +# Source: dataplane/templates/flyteconnector/hpa.yaml +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: flyteconnector + labels: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + app.kubernetes.io/managed-by: Helm +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: flyteconnector + minReplicas: 2 + maxReplicas: 5 + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 80 + - type: Resource + resource: + name: memory + target: + type: Utilization + averageUtilization: 80 +--- +# Source: dataplane/templates/webhook/mutatingwebhookconfiguration.yaml +apiVersion: 
admissionregistration.k8s.io/v1 +kind: MutatingWebhookConfiguration +metadata: + name: union-pod-webhook-byok + labels: + app.kubernetes.io/name: union-pod-webhook + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +webhooks: + - name: union-pod-webhook.flyte.org + admissionReviewVersions: + - v1 + - v1beta1 + clientConfig: + caBundle: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUROekNDQWgrZ0F3SUJBZ0lVSm9kQ09lem4vd1BHQTZjYkw1U3duSlpYT2hJd0RRWUpLb1pJaHZjTkFRRUwKQlFBd0t6RXBNQ2NHQTFVRUF3d2dabXg1ZEdWd2NtOXdaV3hzWlhJdGQyVmlhRzl2YXk1MWJtbHZiaTV6ZG1NdwpIaGNOTWpZd01USXpNREl6TmpVMVdoY05Nell3TVRJeE1ESXpOalUxV2pBck1Ta3dKd1lEVlFRRERDQm1iSGwwClpYQnliM0JsYkd4bGNpMTNaV0pvYjI5ckxuVnVhVzl1TG5OMll6Q0NBU0l3RFFZSktvWklodmNOQVFFQkJRQUQKZ2dFUEFEQ0NBUW9DZ2dFQkFJcjBSWDRGRjFNVTNYdFBmUHErMDUrMGI1Qm0xK3hsTVR5ZUp4bG5mWFJuRlgyegpPcHV6NmsreUpBQk4xRkxmNDVYRnlJclFlaW5DWWJtckZXalFRc3ZDWEloNkpPNGMwdmRMVVU5RERYcldWRFlsCkRNZytTUDlJVnZBMm5USkVFZ2RFREVsKzNWeThUbUJoNlNmcGthcXdHajY5SmhIdy9OeE9yZzM2bUY3TU5VZ1MKWXJVUkxqUFFzTUN6NElTVzhhQnEzS2IwRUFWUTA4V1lZQ0ZEbjY1aHBzVFdxR05rL3BKRnVTYjlsZll6RVdyYwpJeW5wNDhqTnIvNUdKS3NnWG9LeTUwZVo4UkppeTVyOUhENGtnSW15TVNQR2RDMXFZSVppRHRlZW8yOElxNk5nCmd1dWcyTGhlNndYYW9YZXFCQ2d3SHpwWFVXM1hJVzI5eFoxUkFRRUNBd0VBQWFOVE1GRXdIUVlEVlIwT0JCWUUKRktITDQ1RWhQaVg5bnlRN241QUk0YUZVNFlCL01COEdBMVVkSXdRWU1CYUFGS0hMNDVFaFBpWDlueVE3bjVBSQo0YUZVNFlCL01BOEdBMVVkRXdFQi93UUZNQU1CQWY4d0RRWUpLb1pJaHZjTkFRRUxCUUFEZ2dFQkFEeTRac21XCkNjZ3B5TmI4cU1ydzNYclBNejZLMTRlTVhqQVk1bERFYXM1NklLZExoZFM1bi9sWk95bENuUUtOZ3RHS0VrY3UKT05tMEE2Z0xNT3UzcE43dG9aNXh0WktOcnUvS0x6dm44QUNNNndCbUR0bG5oRERDa3ZUTHBVRzh6UTZSNmlaegpzTWlhVmpLSUJiWkJsZXNYSHZZQnQ2YlRTN2o1QWM2aTdYc1RPK3ZqQ2RUQjVYdCtEZ0x2QnBJRWh2ZGJzNXhmClFtc3ZzQmwxenJnR3JXemo4d0txcXNUMzZaOVU1TzdSK1RJMlQrUDVOR25SeHNQOGJ0R0RtMGZ5SW55WlJrSkcKVTdpVFJyMlZRWlRyaTNZQ2dlVjN1UjFtMnJjc2hyU1Avb082T0QyTXR5Q1FMRWtFK1pxVUVJVnpSYmVaNXl2RQptM29YMFV2OUtBUy9pWTA9Ci0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K + service: + 
name: union-pod-webhook + namespace: union + path: /mutate--v1-pod/secrets + port: 443 + failurePolicy: Fail + matchPolicy: Equivalent + namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: union + objectSelector: + matchLabels: + inject-flyte-secrets: "true" + organization: 'byok' + reinvocationPolicy: Never + rules: + - apiGroups: + - '*' + apiVersions: + - v1 + operations: + - CREATE + resources: + - pods + scope: '*' + sideEffects: NoneOnDryRun + timeoutSeconds: 30 +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Imported via https://github.com/knative/operator/releases/download/knative-v1.16.0/operator.yaml +# Copyright 2024 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +# Source: dataplane/templates/propeller/serviceaccount.yaml +--- +--- +# Source: dataplane/templates/serving/knative-serving.yaml +apiVersion: operator.knative.dev/v1beta1 +kind: KnativeServing +metadata: + name: union-operator-serving + namespace: union + labels: + helm.sh/chart: dataplane-2026.4.4 + app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: serving +spec: + config: + deployment: + progress-deadline: "30m" + queue-sidecar-cpu-request: "25m" + queue-sidecar-cpu-limit: "1000m" + queue-sidecar-memory-request: "400Mi" + queue-sidecar-memory-limit: "800Mi" + queue-sidecar-ephemeral-storage-request: "512Mi" + queue-sidecar-ephemeral-storage-limit: "1024Mi" + registries-skipping-tag-resolving: managed.cr.union.ai + features: + kubernetes.podspec-affinity: "enabled" + kubernetes.podspec-nodeselector: "enabled" + kubernetes.podspec-tolerations: "enabled" + kubernetes.podspec-fieldref: "enabled" + kubernetes.podspec-dnspolicy: "enabled" + kubernetes.podspec-schedulername: "enabled" + kubernetes.podspec-securitycontext: "enabled" + network: + ingress-class: "kourier.ingress.networking.knative.dev" + high-availability: + replicas: 2 + ingress: + kourier: + enabled: true + bootstrap-configmap: "union-operator-serving-envoy-bootstrap" + service-type: ClusterIP + podDisruptionBudgets: + - name: 3scale-kourier-gateway-pdb + minAvailable: 50% + - name: activator-pdb + minAvailable: 50% + - name: webhook-pdb + minAvailable: 50% + registry: + override: + # TODO(jeev): Wire up Union fork of Envoy + 3scale-kourier-gateway/kourier-gateway: ghcr.io/unionai/envoy:456fed84d4ad9a9dfb186d117d9362e9dc0f7c1f + # TODO(jeev): Wire up Union fork of Kourier + net-kourier-controller/controller: ghcr.io/unionai/kourier@sha256:5804c348d15b3959604e3e3ceed216c3a1c7b32cbe254c7d3eb02a35e62ba9c4 + workloads: + - name: 3scale-kourier-gateway + 
labels: + helm.sh/chart: dataplane-2026.4.4 + app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: 3scale-kourier-gateway + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchLabels: + app.kubernetes.io/name: 3scale-kourier-gateway + app.kubernetes.io/instance: release-name + topologyKey: topology.kubernetes.io/zone + annotations: + checksum/bootstrap-config: 6469f6f592eebc9e0c676d8ccc359a05bc799c0988e474b2da942c4cc328f656 + env: + - container: kourier-gateway + envVars: + - name: UNION_AUTHZ_TENANTAUTHURL + value: "https://byok.us-west-2.union.ai/me" + - name: UNION_AUTHZ_TENANTAUTHSIGNINURL + value: "https://byok.us-west-2.union.ai/login" + - name: UNION_AUTHZ_TENANTCONTROLPLANEURL + value: "https://byok.us-west-2.union.ai" + resources: + - container: kourier-gateway + limits: + cpu: "2" + memory: 2Gi + requests: + cpu: "1" + memory: 1Gi + - name: activator + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - activator + topologyKey: topology.kubernetes.io/zone + - name: autoscaler + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - autoscaler + topologyKey: topology.kubernetes.io/zone + - name: autoscaler-hpa + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - autoscaler-hpa + topologyKey: topology.kubernetes.io/zone + - name: controller + affinity: + podAntiAffinity: + 
preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - controller + topologyKey: topology.kubernetes.io/zone + - name: net-kourier-controller + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - net-kourier-controller + topologyKey: topology.kubernetes.io/zone + env: + - container: controller + envVars: + - name: KOURIER_UNION_AUTHZ_ENABLED + value: "true" + resources: + - container: controller + limits: + cpu: "1" + memory: 1Gi + requests: + cpu: 500m + memory: 500Mi + - name: webhook + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - webhook + topologyKey: topology.kubernetes.io/zone +--- +# Source: dataplane/templates/common/task-podtemplate.yaml +apiVersion: v1 +kind: PodTemplate +metadata: + name: task-template + namespace: union +template: + spec: + serviceAccountName: union + containers: + - name: default + image: docker.io/rwgrim/docker-noop +--- +# Source: dataplane/charts/fluentbit/templates/tests/test-connection.yaml +apiVersion: v1 +kind: Pod +metadata: + name: "release-name-fluentbit-test-connection" + namespace: union + labels: + helm.sh/chart: fluentbit-0.48.9 + app.kubernetes.io/version: "3.2.8" + app.kubernetes.io/managed-by: Helm + annotations: + helm.sh/hook: test + helm.sh/hook-delete-policy: hook-succeeded +spec: + containers: + - name: wget + image: "busybox:latest" + imagePullPolicy: Always + command: ["sh"] + args: ["-c", "sleep 5s && wget -O- release-name-fluentbit:2020"] + restartPolicy: Never diff --git a/tests/generated/dataplane.low-priv.yaml b/tests/generated/dataplane.low-priv.yaml index 463cb9a0..b57c9d1d 100644 --- 
a/tests/generated/dataplane.low-priv.yaml +++ b/tests/generated/dataplane.low-priv.yaml @@ -1,94 +1,139 @@ --- -# Source: dataplane/charts/kube-state-metrics/templates/serviceaccount.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v1 +kind: ServiceAccount +metadata: + name: knative-operator + namespace: "union" + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: operator-webhook + namespace: "union" + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +--- +# Source: dataplane/charts/prometheus/charts/kube-state-metrics/templates/serviceaccount.yaml apiVersion: v1 kind: ServiceAccount automountServiceAccountToken: true metadata: labels: - helm.sh/chart: kube-state-metrics-5.30.1 + helm.sh/chart: kube-state-metrics-5.25.1 app.kubernetes.io/managed-by: Helm app.kubernetes.io/component: metrics app.kubernetes.io/part-of: kube-state-metrics app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" + app.kubernetes.io/version: "2.13.0" name: release-name-kube-state-metrics namespace: union --- -# Source: dataplane/templates/common/default-serviceaccount.yaml +# Source: dataplane/charts/prometheus/templates/serviceaccount.yaml apiVersion: v1 kind: ServiceAccount metadata: - name: default + labels: + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: prometheus + name: union-operator-prometheus namespace: union annotations: - eks.amazonaws.com/role-arn: arn:aws:iam::ACCOUNT_ID:role/flyte_project_role -automountServiceAccountToken: true ---- -# Source: dataplane/templates/imagebuilder/serviceaccount.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: union-imagebuilder + {} --- -# Source: dataplane/templates/nodeexecutor/serviceaccount.yaml +# Source: dataplane/templates/common/system-serviceaccount.yaml apiVersion: v1 kind: ServiceAccount metadata: - name: executor + name: union-system namespace: union - labels: - app: executor ---- -# Source: dataplane/templates/operator/serviceaccount-proxy.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: 
proxy-system - labels: - app.kubernetes.io/name: operator-proxy - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm + annotations: --- -# Source: dataplane/templates/operator/serviceaccount.yaml +# Source: dataplane/templates/common/union-serviceaccount.yaml apiVersion: v1 kind: ServiceAccount metadata: - name: operator-system - labels: - app.kubernetes.io/name: union-operator - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm + name: union + namespace: union + annotations: + eks.amazonaws.com/role-arn: arn:aws:iam::ACCOUNT_ID:role/flyte_project_role +automountServiceAccountToken: true --- -# Source: dataplane/templates/prometheus/serviceaccount.yaml +# Source: dataplane/templates/flyteconnector/serviceaccount.yaml apiVersion: v1 kind: ServiceAccount metadata: - name: union-operator-prometheus + name: flyteconnector namespace: union - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane + labels: + app.kubernetes.io/name: flyteconnector app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus ---- -# Source: dataplane/templates/propeller/serviceaccount-webhook.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: flytepropeller-webhook-system - namespace: union --- -# Source: dataplane/templates/propeller/serviceaccount.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + apiVersion: v1 -kind: ServiceAccount +kind: Secret metadata: - name: flytepropeller-system - namespace: union + name: operator-webhook-certs + namespace: "union" + labels: + app.kubernetes.io/component: operator-webhook + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +# The data is populated at install time. --- # Source: dataplane/templates/common/auth-secret.yaml apiVersion: v1 @@ -111,14 +156,614 @@ type: Opaque data: cluster_name: bXktY2x1c3Rlcg== --- -# Source: dataplane/templates/propeller/deployment-webhook.yaml -# Create an empty secret that the first propeller pod will populate +# Source: dataplane/templates/webhook/mutatingwebhookconfiguration.yaml apiVersion: v1 kind: Secret metadata: - name: flyte-pod-webhook + name: union-pod-webhook namespace: union + labels: + app.kubernetes.io/name: union-pod-webhook + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm type: Opaque +data: + ca.crt: 
LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUROekNDQWgrZ0F3SUJBZ0lVSm9kQ09lem4vd1BHQTZjYkw1U3duSlpYT2hJd0RRWUpLb1pJaHZjTkFRRUwKQlFBd0t6RXBNQ2NHQTFVRUF3d2dabXg1ZEdWd2NtOXdaV3hzWlhJdGQyVmlhRzl2YXk1MWJtbHZiaTV6ZG1NdwpIaGNOTWpZd01USXpNREl6TmpVMVdoY05Nell3TVRJeE1ESXpOalUxV2pBck1Ta3dKd1lEVlFRRERDQm1iSGwwClpYQnliM0JsYkd4bGNpMTNaV0pvYjI5ckxuVnVhVzl1TG5OMll6Q0NBU0l3RFFZSktvWklodmNOQVFFQkJRQUQKZ2dFUEFEQ0NBUW9DZ2dFQkFJcjBSWDRGRjFNVTNYdFBmUHErMDUrMGI1Qm0xK3hsTVR5ZUp4bG5mWFJuRlgyegpPcHV6NmsreUpBQk4xRkxmNDVYRnlJclFlaW5DWWJtckZXalFRc3ZDWEloNkpPNGMwdmRMVVU5RERYcldWRFlsCkRNZytTUDlJVnZBMm5USkVFZ2RFREVsKzNWeThUbUJoNlNmcGthcXdHajY5SmhIdy9OeE9yZzM2bUY3TU5VZ1MKWXJVUkxqUFFzTUN6NElTVzhhQnEzS2IwRUFWUTA4V1lZQ0ZEbjY1aHBzVFdxR05rL3BKRnVTYjlsZll6RVdyYwpJeW5wNDhqTnIvNUdKS3NnWG9LeTUwZVo4UkppeTVyOUhENGtnSW15TVNQR2RDMXFZSVppRHRlZW8yOElxNk5nCmd1dWcyTGhlNndYYW9YZXFCQ2d3SHpwWFVXM1hJVzI5eFoxUkFRRUNBd0VBQWFOVE1GRXdIUVlEVlIwT0JCWUUKRktITDQ1RWhQaVg5bnlRN241QUk0YUZVNFlCL01COEdBMVVkSXdRWU1CYUFGS0hMNDVFaFBpWDlueVE3bjVBSQo0YUZVNFlCL01BOEdBMVVkRXdFQi93UUZNQU1CQWY4d0RRWUpLb1pJaHZjTkFRRUxCUUFEZ2dFQkFEeTRac21XCkNjZ3B5TmI4cU1ydzNYclBNejZLMTRlTVhqQVk1bERFYXM1NklLZExoZFM1bi9sWk95bENuUUtOZ3RHS0VrY3UKT05tMEE2Z0xNT3UzcE43dG9aNXh0WktOcnUvS0x6dm44QUNNNndCbUR0bG5oRERDa3ZUTHBVRzh6UTZSNmlaegpzTWlhVmpLSUJiWkJsZXNYSHZZQnQ2YlRTN2o1QWM2aTdYc1RPK3ZqQ2RUQjVYdCtEZ0x2QnBJRWh2ZGJzNXhmClFtc3ZzQmwxenJnR3JXemo4d0txcXNUMzZaOVU1TzdSK1RJMlQrUDVOR25SeHNQOGJ0R0RtMGZ5SW55WlJrSkcKVTdpVFJyMlZRWlRyaTNZQ2dlVjN1UjFtMnJjc2hyU1Avb082T0QyTXR5Q1FMRWtFK1pxVUVJVnpSYmVaNXl2RQptM29YMFV2OUtBUy9pWTA9Ci0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K + tls.crt: 
LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUQxakNDQXI2Z0F3SUJBZ0lVREtRN3A4MFpIdkQyUE5Ya3p3VGRLem9iUjg4d0RRWUpLb1pJaHZjTkFRRUwKQlFBd0t6RXBNQ2NHQTFVRUF3d2dabXg1ZEdWd2NtOXdaV3hzWlhJdGQyVmlhRzl2YXk1MWJtbHZiaTV6ZG1NdwpIaGNOTWpZd01USXpNREl6TmpVMVdoY05NamN3TVRJek1ESXpOalUxV2pBck1Ta3dKd1lEVlFRRERDQm1iSGwwClpYQnliM0JsYkd4bGNpMTNaV0pvYjI5ckxuVnVhVzl1TG5OMll6Q0NBU0l3RFFZSktvWklodmNOQVFFQkJRQUQKZ2dFUEFEQ0NBUW9DZ2dFQkFMNnMxUTRaZ3ZxVVdNdFp5VC9EKytEQWl6WmR0Q1lkVzArSGk4S3phQ1Vjbi9zZwo3a21NTEhDM2ZlZWtZK2ZUNENHUTd6K1l2WjRWWSt6WGljKzdrNndLbUZqdm1acEM1Rm96OWo1Q21NRk5HZmlWCnRobzVvSU9MUkIzNXRYMU1ROHUwNFVzT29QcjdMK016eEdxWXVmTFdtN1NVdjh0eEp3ZFMxZVlzMllNV1pMeU4KekI4akdWY01Qc2ZNVkdxdFhTTVljQ3hNYURKaTM1N0VEZSs3clM2U2ZtbnFtYjJxekdQM1BRRStCUm91Yk5nLwo4NCt4c253VXBLZUFpem95dFM2c2ZlOTdMUEtRTW9oVzd2TjZORFJrLy9xSmpHem9iZmZwUHU3endhWHVXMWtMCmllc2g2cVRrdlZLSTF3WFJ2WHArVU5GbjZzdDIwdEVlRks0ZzMwOENBd0VBQWFPQjhUQ0I3akFKQmdOVkhSTUUKQWpBQU1Bc0dBMVVkRHdRRUF3SUY0RENCa3dZRFZSMFJCSUdMTUlHSWdoWm1iSGwwWlhCeWIzQmxiR3hsY2kxMwpaV0pvYjI5cmdoeG1iSGwwWlhCeWIzQmxiR3hsY2kxM1pXSm9iMjlyTG5WdWFXOXVnaUJtYkhsMFpYQnliM0JsCmJHeGxjaTEzWldKb2IyOXJMblZ1YVc5dUxuTjJZNEl1Wm14NWRHVndjbTl3Wld4c1pYSXRkMlZpYUc5dmF5NTEKYm1sdmJpNXpkbU11WTJ4MWMzUmxjaTVzYjJOaGJEQWRCZ05WSFE0RUZnUVV1d25wcDZ1MTNKL0NsNnI3eDZ1Sgo5YmZ1RDhzd0h3WURWUjBqQkJnd0ZvQVVvY3Zqa1NFK0pmMmZKRHVma0FqaG9WVGhnSDh3RFFZSktvWklodmNOCkFRRUxCUUFEZ2dFQkFFMmlOa1MzczhGT3Z0cDhrVmJNZUN2amxIQytvYTVlZTFGNmowNlZ3bFJPOWdxOXJla1UKMGptWUhNWGNjWjcrZXJkNFRQRXgwK3ZqMzNhRTV3cDFmdVJJL2xIZHhuOUpJRHg2bTFQTHdVa3p1WG9EekYxLwozVXVIWU1xOGkrT2xsY0tqbmdVMUFUNmRWRCtIbFVWeHpMdjlNazdPb05SQUY5dTJHa1hiUHdCN0I1ZGRxdWxlCmtNcXBxbDhES2s4SVNVelVsTjNBb2Q5NmJUelIyV3RCVkxpTi9zRHZtTDZwZThFYm55RnBUb1BZTDlGeWZOcmIKT1ZPZEJwUkZFTG1KOHZaZWY5d1lxRmVXMTE3SGV5VDArZkxGZVNSemZUNVVWUlVraVFBcUVJdUhHUVRrOVFabgo1WnYvNUF0eEg3WE1rcXR0VE1zMFlSUG0xYjQzY3FBeFdhND0KLS0tLS1FTkQgQ0VSVElGSUNBVEUtLS0tLQo= + tls.key: 
LS0tLS1CRUdJTiBQUklWQVRFIEtFWS0tLS0tCk1JSUV2UUlCQURBTkJna3Foa2lHOXcwQkFRRUZBQVNDQktjd2dnU2pBZ0VBQW9JQkFRQytyTlVPR1lMNmxGakwKV2NrL3cvdmd3SXMyWGJRbUhWdFBoNHZDczJnbEhKLzdJTzVKakN4d3QzM25wR1BuMCtBaGtPOC9tTDJlRldQcwoxNG5QdTVPc0NwaFk3NW1hUXVSYU0vWStRcGpCVFJuNGxiWWFPYUNEaTBRZCtiVjlURVBMdE9GTERxRDYreS9qCk04UnFtTG55MXB1MGxML0xjU2NIVXRYbUxObURGbVM4amN3Zkl4bFhERDdIekZScXJWMGpHSEFzVEdneVl0K2UKeEEzdnU2MHVrbjVwNnBtOXFzeGo5ejBCUGdVYUxtellQL09Qc2JKOEZLU25nSXM2TXJVdXJIM3ZleXp5a0RLSQpWdTd6ZWpRMFpQLzZpWXhzNkczMzZUN3U4OEdsN2x0WkM0bnJJZXFrNUwxU2lOY0YwYjE2ZmxEUlorckxkdExSCkhoU3VJTjlQQWdNQkFBRUNnZ0VBRjV2SnMzbTRMSE9DdlFic2NwOC9DQmgvQkNlOC92MGlpYW5VUmJLMkFlWlYKN3Arb3NXV1FPUktYSGIvT2VPMVVjb09DQkFOUzh3aGQrM3pDZlB5U0w3cU9HM0RyT1Z6djdqVkIxM3FpZEVpcQpId3ZXWk0vZXpuckhYOWpEdm5SYmJwVUNVaXRKQmxwa2x3S1pYc056UHB5UTRkNkxFdEw3VEo1V2lxM2g3cWQzCjhncmJoeWlWNDhZRFAyYkhpZGt0S2QvMHBNL3piSzZPTS9iMFpLUmRBWVpHQXlOcklMaFJ3eDNsQXE2SDVqL20KUHBuNlZEdVQ3Y1h6UFhmNTgxR0IyNjQxWFJVTXRNWGVEREtaU3JudnIrbG5tV2lsYXZmNkg3ZHp4NlRBMmk0RApybCtIRVdUN1RSQUZGMnZ5SVUrYSt5RzNPVFBKRCtmVTdCWWh4Zld4aFFLQmdRRGVvcXVCbzQxK1dKU255blZnCmNWMHZoUEkwZ0dyRGQzNFdIbCtGNTZuSmxuMVVlbUQ2SUQ0TUhnNVpHWmNmd3NkRlB0aHl1VnB0OVl1SW11SGQKRHE5ZDhpWnpHUzJVbStkSWlsWjhnZXJ5Q3dON3hIbFM3SzJrc0RmRGJwYmZZSXZ1LzRnSExZRHh3YzQ0OG54ZwpWNGpCUFY2QTdYTHRqQzZzdFE2OW9uTlIwd0tCZ1FEYlFBUTRGdWpnUjJUOXh2Y01MdXhMbm1YaHE4U2NoZlM5CkRoWndDUzZIeHB1YlBxblBjQ04vQUZjbDZ1cDRXVEVZdVY4bEpwcGwwRmpYQ2lMdkhuUU15cm5vRjlUOThpanQKZGVHYjF6ZWlwRUpndFpNVHkvOGdERWorZU1wSFBuYTZwQVpMOVpRVUdLR2dDNGQ0SzljdVNHMmRlb1dSU0ViSQpMVGgzeGpXVEZRS0JnQStJRWFidG5nVmVjS0IwQTFSREZGa29VUzFRZUNKQ3g4MExPV2JDRHBvOW9XaXZVT3lpCkt6SDFOdE1JY2Y2SlBCV2NtTVVJSVVMaWltVnhTS2gvU2NTb0MvNmpsd1p6Q2VPSm94YjBpVXR4Y1VERktDR2MKMlZCUDZ0UDdkeE1HVFR6VEhzNUJZbWw3TjhQSlJ0d0J2MHliMTJmdktNRmhzaS9pUWJFQkVFSjVBb0dBUm9NSApHRmJkM0V0NXdsZzcyYkk1a25SRnhkY3RLejIxb1J1bndhNWlSWTV0T3Zkak8zQ1FLZWNkSC9lMklyQmtwdFB2Ck1vNkF0MS9UUW8xakFNNGxlbnUwWUYxUnhiNGN4WW5VM2Y3UVNNRlZDNjg1dHZNemdNWVNyNngvT1h4d1NNUTUKdGpJcnhtN0poQ1JSRkNmZFUyZzl3SmpIM2hxRmtSbGlBTHRCUGFVQ2dZ
RUF2QkRHejQvYS9kRzRJT0VpREk4SgpBbkxaRWZaTFJrdnI0ejRyT0p0TGZtWlFjTW5DSUZmKzBaelhTME9XOWhvMjk5RUNtS1l4U2VUWk1WSDgzOFZCCnRvSkNGM3lzb01rSC9meEVXTUpicDlMblJwSFUzUElicHlhRG1nWW1YdmsvK3laL3Y1c0dDTWNtVGkzWVFLRGsKcStlK1o0SHZyaGUxVytUaHpwOCtoNGs9Ci0tLS0tRU5EIFBSSVZBVEUgS0VZLS0tLS0K +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2019 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v1 +kind: ConfigMap +metadata: + name: config-logging + namespace: "union" + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +data: {} +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2019 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: v1 +kind: ConfigMap +metadata: + name: config-observability + namespace: "union" + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +data: {} +--- +# Source: dataplane/charts/prometheus/templates/cm.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + labels: + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: prometheus + name: union-operator-prometheus + namespace: union +data: + allow-snippet-annotations: "false" + alerting_rules.yml: | + {} + alerts: | + {} + prometheus.yml: | + global: + evaluation_interval: 1m + scrape_interval: 1m + scrape_timeout: 10s + rule_files: + - /etc/config/recording_rules.yml + - /etc/config/alerting_rules.yml + - /etc/config/rules + - /etc/config/alerts + scrape_configs: + - job_name: prometheus + static_configs: + - targets: + - localhost:9090 + - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + job_name: kubernetes-apiservers + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - action: keep + regex: default;kubernetes;https + source_labels: + - __meta_kubernetes_namespace + - __meta_kubernetes_service_name + - __meta_kubernetes_endpoint_port_name + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + job_name: kubernetes-nodes + kubernetes_sd_configs: + - role: node + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - replacement: kubernetes.default.svc:443 + target_label: __address__ + - regex: (.+) + replacement: /api/v1/nodes/$1/proxy/metrics + source_labels: + - __meta_kubernetes_node_name + target_label: __metrics_path__ + scheme: https + tls_config: + 
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + job_name: kubernetes-nodes-cadvisor + kubernetes_sd_configs: + - role: node + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - replacement: kubernetes.default.svc:443 + target_label: __address__ + - regex: (.+) + replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor + source_labels: + - __meta_kubernetes_node_name + target_label: __metrics_path__ + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + - honor_labels: true + job_name: kubernetes-service-endpoints + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scrape + - action: drop + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) 
+ source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (.+?)(?::\d+)?;(\d+) + replacement: $1:$2 + source_labels: + - __address__ + - __meta_kubernetes_service_annotation_prometheus_io_port + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_service_name + target_label: service + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: node + - honor_labels: true + job_name: kubernetes-service-endpoints-slow + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) 
+ source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (.+?)(?::\d+)?;(\d+) + replacement: $1:$2 + source_labels: + - __address__ + - __meta_kubernetes_service_annotation_prometheus_io_port + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_service_name + target_label: service + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: node + scrape_interval: 5m + scrape_timeout: 30s + - honor_labels: true + job_name: prometheus-pushgateway + kubernetes_sd_configs: + - role: service + relabel_configs: + - action: keep + regex: pushgateway + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_probe + - honor_labels: true + job_name: kubernetes-services + kubernetes_sd_configs: + - role: service + metrics_path: /probe + params: + module: + - http_2xx + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_probe + - source_labels: + - __address__ + target_label: __param_target + - replacement: blackbox + target_label: __address__ + - source_labels: + - __param_target + target_label: instance + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - source_labels: + - __meta_kubernetes_service_name + target_label: service + - honor_labels: true + job_name: kubernetes-pods + kubernetes_sd_configs: + - role: pod + relabel_configs: + - action: 
keep + regex: true + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scrape + - action: drop + regex: true + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) + replacement: '[$2]:$1' + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: replace + regex: (\d+);((([0-9]+?)(\.|$)){4}) + replacement: $2:$1 + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_pod_name + target_label: pod + - action: drop + regex: Pending|Succeeded|Failed|Completed + source_labels: + - __meta_kubernetes_pod_phase + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: node + - honor_labels: true + job_name: kubernetes-pods-slow + kubernetes_sd_configs: + - role: pod + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) 
+ source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) + replacement: '[$2]:$1' + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: replace + regex: (\d+);((([0-9]+?)(\.|$)){4}) + replacement: $2:$1 + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_pod_name + target_label: pod + - action: drop + regex: Pending|Succeeded|Failed|Completed + source_labels: + - __meta_kubernetes_pod_phase + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: node + scrape_interval: 5m + scrape_timeout: 30s + - job_name: 'kube-state-metrics' + metrics_path: /metrics + scrape_interval: 1m + honor_labels: true + kubernetes_sd_configs: + - role: endpoints + namespaces: + own_namespace: true + relabel_configs: + - source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_name] + regex: kube-state-metrics + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + regex: http + action: keep + metric_relabel_configs: + - source_labels: [__name__] + regex: 
"kube_pod_container_resource_(limits|requests)|kube_pod_status_phase|kube_node_(labels|status_allocatable|status_condition|status_capacity)|kube_namespace_labels|kube_pod_container_status_(waiting|terminated|last_terminated).*_reason|kube_daemonset_status_number_unavailable|kube_deployment_status_replicas_unavailable|kube_resourcequota|kube_pod_info|kube_node_info|kube_pod_container_status_restarts_total" + action: keep + - source_labels: [__name__, phase] + separator: ";" + regex: "kube_pod_status_phase;(Succeeded|Failed)" + action: drop + - source_labels: [node] + target_label: nodename + regex: "(.*)" + action: replace + - source_labels: [label_node_group_name] + action: replace + regex: "(.+)" + target_label: label_node_pool_name + - job_name: 'opencost' + metrics_path: /metrics + scrape_interval: 1m + honor_labels: true + kubernetes_sd_configs: + - role: endpoints + namespaces: + own_namespace: true + relabel_configs: + - source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_name] + regex: opencost + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + regex: http + action: keep + metric_relabel_configs: + - source_labels: [__name__] + regex: "kube_node_labels|kube_pod_labels|node_total_hourly_cost|node_ram_hourly_cost|node_cpu_hourly_cost|node_gpu_hourly_cost" + action: keep + - job_name: 'union-services' + metrics_path: /metrics + scrape_interval: 1m + honor_labels: true + kubernetes_sd_configs: + - role: endpoints + namespaces: + own_namespace: true + relabel_configs: + - source_labels: [__meta_kubernetes_service_label_platform_union_ai_service_group] + regex: .+ + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + regex: debug + action: keep + # Flyte propeller metrics for execution info and fast task duration. + # No-op when propeller is not deployed (no pods match the label selector). 
+ - job_name: 'flytepropeller' + kubernetes_sd_configs: + - role: pod + namespaces: + own_namespace: true + selectors: + - role: pod + label: app.kubernetes.io/name=flytepropeller + metric_relabel_configs: + - source_labels: [__name__] + regex: "flyte:propeller:all:round:execution_info|flyte:propeller:all:node:fast_task:fast_task_execution_duration" + action: keep + relabel_configs: + - source_labels: [__meta_kubernetes_pod_container_name] + regex: flytepropeller + action: keep + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: pod + # Kourier gateway (envoy) metrics for app serving. + # No-op when serving is not enabled (no pods match the label selector). + - job_name: 'serving-envoy' + kubernetes_sd_configs: + - role: pod + namespaces: + own_namespace: true + selectors: + - role: pod + label: app=3scale-kourier-gateway + metrics_path: /stats/prometheus + metric_relabel_configs: + - source_labels: [__name__] + regex: "envoy_cluster_upstream_rq_xx|envoy_cluster_upstream_rq_time_bucket" + action: keep + relabel_configs: + - source_labels: [__meta_kubernetes_pod_container_port_name] + regex: metrics + action: keep + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: pod + # DCGM GPU metrics. + # No-op when dcgm-exporter is not deployed (no pods match the label selector). 
+ - job_name: 'gpu-metrics' + kubernetes_sd_configs: + - role: pod + namespaces: + names: + - kube-system + selectors: + - role: pod + label: app.kubernetes.io/name=dcgm-exporter + + recording_rules.yml: | + {} + rules: | + {} +--- +# Source: dataplane/templates/fluent-bit/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: fluentbit-system + namespace: union + labels: + app.kubernetes.io/instance: release-name + app.kubernetes.io/managed-by: Helm +data: + custom_parsers.conf: | + [PARSER] + Name docker_no_time + Format json + Time_Keep Off + Time_Key time + Time_Format %Y-%m-%dT%H:%M:%S.%L + fluent-bit.conf: | + [SERVICE] + Parsers_File /fluent-bit/etc/parsers.conf + Parsers_File /fluent-bit/etc/conf/custom_parsers.conf + HTTP_Server On + HTTP_Listen 0.0.0.0 + Health_Check On + [INPUT] + Name tail + Tag namespace-.pod-.cont- + Tag_Regex (?[a-z0-9](?:[-a-z0-9]*[a-z0-9])?(?:\\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*)_(?[^_]+)_(?.+)- + Path /var/log/containers/*.log + DB /var/log/flb_kube.db + multiline.parser docker, cri + Mem_Buf_Limit 5MB + Skip_Long_Lines On + Refresh_Interval 10 + + + [OUTPUT] + Name s3 + Match * + upload_timeout 1m + s3_key_format /persisted-logs/$TAG + static_file_path true + json_date_key false + region us-west-2 + bucket bucket + endpoint http://localstack.aws.svc.cluster.local:4566 +--- +# Source: dataplane/templates/imagebuilder/build-image-configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: build-image-config + labels: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +data: + storage.yaml: | + storage: + container: "bucket" + type: stow + stow: + kind: s3 + config: + auth_type: accesskey + access_key_id: test123 + secret_key: test + disable_ssl: false + endpoint: http://localstack.aws.svc.cluster.local:4566 + region: us-west-2 + enable-multicontainer: false + limits: + maxDownloadMBs: 1024 + cache: 
+ max_size_mbs: 0 + target_gc_percent: 70 + buildkit-uri: "tcp://union-operator-buildkit.union.svc.cluster.local:1234" + default-repository: "localhost:5000/union-dataplane" + authentication-type: "noop" --- # Source: dataplane/templates/imagebuilder/configmap.yaml apiVersion: v1 @@ -2276,6 +2921,18 @@ data: "version": 1 } --- +# Source: dataplane/templates/monitoring/prometheusrule.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: union-recording-rules + namespace: union + labels: + release: release-name +data: + recording_rules.yml: | + groups: [] +--- # Source: dataplane/templates/nodeexecutor/configmap.yaml apiVersion: v1 kind: ConfigMap @@ -2330,8 +2987,13 @@ data: templateUris: - https://scale.neptune.ai/{{ .taskConfig.project }}/-/run/?customId={{ .taskConfig.id }} - kubernetes-enabled: true + kubernetes-enabled: false enabled_plugins.yaml: | + plugins: + connector-service: + defaultConnector: + defaultServiceConfig: '{"loadBalancingConfig": [{"round_robin":{}}]}' + endpoint: k8s:///flyteconnector.union:8000 tasks: task-plugins: default-for-task-types: @@ -2365,7 +3027,7 @@ data: gpu: 256 memory: 2Ti namespace_mapping: - template: 'union' + template: union union: connection: host: dns:///union.us-west-2.union.ai @@ -2386,15 +3048,13 @@ data: cache-endpoint: dns:///union.us-west-2.union.ai endpoint: dns:///union.us-west-2.union.ai insecure: false - type: fallback + type: cacheservicev2 use-admin-auth: true logger: formatter: type: json level: 4 show-source: true - namespace_mapping: - template: 'union' sharedService: metrics: scope: 'executor:' @@ -2408,7 +3068,7 @@ data: secure: false useAuth: false propeller: - limit-namespace: 'union' + limit-namespace: union node-config: disable-input-file-writes: true plugins: @@ -2416,7 +3076,7 @@ data: additional-worker-args: - --last-ack-grace-period-seconds - "120" - callback-uri: http://unionai-dataplane-executor.union.svc.cluster.local:15605 + callback-uri: 
http://union-operator-executor.union.svc.cluster.local:15605 grace-period-status-not-found: 2m ioutils: remoteFileOutputPaths: @@ -2429,6 +3089,7 @@ data: - FLYTE_AWS_ACCESS_KEY_ID: test123 - FLYTE_AWS_SECRET_ACCESS_KEY: test default-memory: 100Mi + default-pod-template-name: task-template co-pilot: image: 'cr.flyte.org/flyteorg/flytecopilot:v1.14.1' name: flyte-copilot- @@ -2472,6 +3133,7 @@ data: - FLYTE_AWS_ACCESS_KEY_ID: test123 - FLYTE_AWS_SECRET_ACCESS_KEY: test default-memory: 100Mi + default-pod-template-name: task-template config.yaml: | union: connection: @@ -2491,13 +3153,14 @@ data: operator: enabled: true enableTunnelService: true + # enableDepot: false tunnel: - enableDirectToAppIngress: false + enableDirectToAppIngress: true deploymentToRestart: union-operator-proxy - limitNamespace: 'union' + limitNamespace: union disableClusterPermissions: true apps: - enabled: 'false' + enabled: 'true' syncClusterConfig: enabled: false clusterId: @@ -2516,13 +3179,15 @@ data: billing: model: Legacy dependenciesHeartbeat: - propeller: - endpoint: 'http://flytepropeller:10254' + executor: + endpoint: 'http://union-operator-executor:10254' proxy: endpoint: 'http://union-operator-proxy:10254' secretsWatcher: dryRun: true enabled: false + org: + namespaceTemplate: union imageBuilder: enabled: true executionNamespaceLabels: @@ -2530,9 +3195,6 @@ data: referenceConfigmapName: union-operator targetConfigMapName: "build-image-config" proxy: - imageBuilderConfig: - authenticationType: 'noop' - defaultRepository: '' persistedLogs: objectStore: pathTemplate: namespace-{{.KubernetesNamespace}}.pod-{{.KubernetesPodName}}.cont-{{.KubernetesContainerName}} @@ -2586,190 +3248,146 @@ data: endpoint: http://localstack.aws.svc.cluster.local:4566 region: us-west-2 image-builder.buildkit-uri: "tcp://union-operator-buildkit.union.svc.cluster.local:1234" - image-builder.default-repository: "" + image-builder.default-repository: "localhost:5000/union-dataplane" 
image-builder.authentication-type: "noop" --- -# Source: dataplane/templates/prometheus/configmap.yaml +# Source: dataplane/templates/serving/bootstrap-configmap.yaml +# We need to copy the default configmap here since knative-operator does not automatically update the +# address of the `net-kourier-controller` endpoint to use the release namespace. It assumes that the +# resources are being installed into the `knative-serving` namespace instead. apiVersion: v1 kind: ConfigMap metadata: - name: union-operator-prometheus + name: union-operator-serving-envoy-bootstrap namespace: union - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus data: - prometheus.yml: | - global: - scrape_interval: 15s - evaluation_interval: 15s - alerting: - alertmanagers: - - static_configs: - - targets: - scrape_configs: - # Self-monitoring - - job_name: prometheus - metrics_path: /prometheus/metrics - static_configs: - - targets: ['localhost:9090'] - metric_relabel_configs: - - source_labels: [__name__] - regex: prometheus_tsdb_head_chunks_storage_size_bytes|prometheus_tsdb_head_series|prometheus_tsdb_head_series_created_total|prometheus_tsdb_head_series_removed_total|prometheus_tsdb_head_samples_appended_total|prometheus_tsdb_storage_blocks_bytes|prometheus_tsdb_retention_limit_bytes|prometheus_tsdb_wal_(.*)|.+_(limit|failed|missed|failures)_total - action: keep - - # Kube state metrics for pod/node resource tracking and cost calculations - - job_name: kube-state-metrics - static_configs: - - targets: ['release-name-kube-state-metrics:8080'] - metric_relabel_configs: - - separator: ; - source_labels: [__name__] - regex: 
kube_pod_container_resource_(limits|requests)|kube_pod_status_phase|kube_node_(labels|status_allocatable|status_condition|status_capacity)|kube_namespace_labels|kube_pod_container_status_(waiting|terminated|last_terminated).*_reason|kube_daemonset_status_number_unavailable|kube_deployment_status_replicas_unavailable|kube_resourcequota|kube_pod_info|kube_node_info|kube_pod_container_status_restarts_total - action: keep - - separator: ; - source_labels: [__name__, phase] - regex: kube_pod_status_phase;(Succeeded|Failed) - action: drop - - source_labels: [node] - target_label: nodename - regex: '(.*)' - action: replace - - source_labels: [label_node_group_name] - action: replace - regex: (.+) - target_label: label_node_pool_name - - # cAdvisor container metrics for CPU and memory tracking - - job_name: kubernetes-cadvisor - metrics_path: /metrics - scheme: https - kubernetes_sd_configs: - - role: node - namespaces: - names: [] - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - insecure_skip_verify: false - metric_relabel_configs: - - separator: ; - source_labels: [__name__] - regex: container_cpu_usage_seconds_total|container_memory_working_set_bytes - action: keep - relabel_configs: - - separator: ; - regex: __meta_kubernetes_node_label_(.+) - replacement: $1 - action: labelmap - - separator: ; - regex: (.*) - target_label: __address__ - replacement: kubernetes.default.svc:443 - action: replace - - source_labels: [__meta_kubernetes_node_name] - separator: ; - regex: (.+) - target_label: __metrics_path__ - replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor - action: replace - - # Flyte propeller metrics for execution info and fast task duration - - job_name: flytepropeller - kubernetes_sd_configs: - - role: pod - namespaces: - names: - - union - selectors: - - role: pod - label: app.kubernetes.io/name=flytepropeller - metric_relabel_configs: - - source_labels: 
[__name__] - regex: "flyte:propeller:all:round:execution_info|flyte:propeller:all:node:fast_task:fast_task_execution_duration" - action: keep - relabel_configs: - - source_labels: [__meta_kubernetes_pod_container_name] - regex: flytepropeller - action: keep - - source_labels: [__meta_kubernetes_namespace] - action: replace - target_label: namespace - - source_labels: [__meta_kubernetes_pod_name] - action: replace - target_label: pod + envoy-bootstrap.yaml: | + dynamic_resources: + ads_config: + transport_api_version: V3 + api_type: GRPC + rate_limit_settings: {} + grpc_services: + - envoy_grpc: {cluster_name: xds_cluster} + cds_config: + resource_api_version: V3 + ads: {} + lds_config: + resource_api_version: V3 + ads: {} + node: + cluster: kourier-knative + id: 3scale-kourier-gateway + static_resources: + listeners: + - name: stats_listener + address: + socket_address: + address: 0.0.0.0 + port_value: 9000 + filter_chains: + - filters: + - name: envoy.filters.network.http_connection_manager + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager + stat_prefix: stats_server + http_filters: + - name: envoy.filters.http.router + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router + route_config: + virtual_hosts: + - name: admin_interface + domains: + - "*" + routes: + - match: + safe_regex: + regex: '/(certs|stats(/prometheus)?|server_info|clusters|listeners|ready)?' 
+ headers: + - name: ':method' + string_match: + exact: GET + route: + cluster: service_stats + - match: + safe_regex: + regex: '/drain_listeners' + headers: + - name: ':method' + string_match: + exact: POST + route: + cluster: service_stats + clusters: + - name: service_stats + connect_timeout: 0.250s + type: static + load_assignment: + cluster_name: service_stats + endpoints: + lb_endpoints: + endpoint: + address: + socket_address: + address: 127.0.0.1 + port_value: 9901 + - name: xds_cluster + # This keepalive is recommended by envoy docs. + # https://www.envoyproxy.io/docs/envoy/latest/api-docs/xds_protocol + typed_extension_protocol_options: + envoy.extensions.upstreams.http.v3.HttpProtocolOptions: + "@type": type.googleapis.com/envoy.extensions.upstreams.http.v3.HttpProtocolOptions + explicit_http_config: + http2_protocol_options: + connection_keepalive: + interval: 30s + timeout: 5s + connect_timeout: 1s + load_assignment: + cluster_name: xds_cluster + endpoints: + lb_endpoints: + endpoint: + address: + socket_address: + address: "net-kourier-controller" + port_value: 18000 + type: STRICT_DNS + admin: + access_log: + - name: envoy.access_loggers.stdout + typed_config: + "@type": type.googleapis.com/envoy.extensions.access_loggers.stream.v3.StdoutAccessLog + address: + socket_address: + address: 127.0.0.1 + port_value: 9901 + stats_config: + stats_tags: + - tag_name: name + regex: '^.*?\.u/.*?n=(.*?)/.*?u\..*$' + - tag_name: domain + regex: '^.*?\.u/.*?d=(.*?)/.*?u\..*$' + - tag_name: org + regex: '^.*?\.u/.*?o=(.*?)/.*?u\..*$' + - tag_name: project + regex: '^.*?\.u/.*?p=(.*?)/.*?u\..*$' + - tag_name: target + regex: '^.*?\.u/.*?t=(.*?)/.*?u\..*$' + - tag_name: target_namespace + regex: '^.*?\.u/.*?tns=(.*?)/.*?u\..*$' --- -# Source: dataplane/templates/propeller/configmap.yaml +# Source: dataplane/templates/webhook/configmap.yaml apiVersion: v1 kind: ConfigMap metadata: - name: flyte-propeller-config + name: union-pod-webhook-config namespace: union data: - 
admin.yaml: | - admin: - clientId: 'my-client-id' - clientSecretLocation: /etc/union/secret/client_secret - endpoint: dns:///union.us-west-2.union.ai - insecure: false - event: - capacity: 1000 - rate: 500 - type: admin - catalog.yaml: | - catalog-cache: - cache-endpoint: dns:///union.us-west-2.union.ai - endpoint: dns:///union.us-west-2.union.ai - insecure: false - type: fallback - use-admin-auth: true - copilot.yaml: | - plugins: - k8s: - co-pilot: - image: 'cr.flyte.org/flyteorg/flytecopilot:v1.14.1' - name: flyte-copilot- - start-timeout: 30s - core.yaml: | - propeller: - downstream-eval-duration: 30s - enable-admin-launcher: true - leader-election: - enabled: true - lease-duration: 15s - lock-config-map: - name: propeller-leader - namespace: 'union' - renew-deadline: 10s - retry-period: 2s - limit-namespace: 'union' - literal-offloading-config: - enabled: true - max-workflow-retries: 30 - metadata-prefix: metadata/propeller - metrics-prefix: flyte - prof-port: 10254 - queue: - batch-size: -1 - batching-interval: 2s - queue: - base-delay: 5s - capacity: 1000 - max-delay: 120s - rate: 100 - type: maxof - sub-queue: - capacity: 100 - rate: 10 - type: bucket - type: batch - rawoutput-prefix: 's3://bucket' - workers: 4 - workflow-reeval-duration: 30s + core.yaml: | + + webhook: certDir: /etc/webhook/certs disableCreateMutatingWebhookConfig: true @@ -2780,246 +3398,1173 @@ data: namespace: 'union' type: 'K8s' listenPort: '9443' + localCert: true secretManagerTypes: - Embedded - K8s - serviceName: flyte-pod-webhook + secretName: union-pod-webhook + serviceName: union-pod-webhook servicePort: '443' - enabled_plugins.yaml: | - plugins: - connector-service: - defaultConnector: - defaultServiceConfig: '{"loadBalancingConfig": [{"round_robin":{}}]}' - endpoint: k8s:///flyteconnector.union:8000 - tasks: - task-plugins: - default-for-task-types: - actor: fast-task - container: container - container_array: k8s-array - sidecar: sidecar - enabled-plugins: - - container - - 
sidecar - - k8s-array - - echo - - fast-task - - connector-service - k8s.yaml: | - plugins: - k8s: - default-cpus: 100m - default-env-vars: - - FLYTE_AWS_ENDPOINT: http://localstack.aws.svc.cluster.local:4566 - - FLYTE_AWS_ACCESS_KEY_ID: test123 - - FLYTE_AWS_SECRET_ACCESS_KEY: test - default-memory: 100Mi - logger.yaml: | - logger: - formatter: - type: json - level: 4 - show-source: true - namespace_config.yaml: | - namespace_mapping: - template: 'union' - resource_manager.yaml: | - propeller: - resourcemanager: - type: noop - task_logs.yaml: | - plugins: - logs: - cloudwatch-enabled: false - dynamic-log-links: - - vscode: - displayName: VS Code Debugger - templateUris: - - /dataplane/pod/v1/generated_name/task/{{.executionProject}}/{{.executionDomain}}/{{.executionName}}/{{.nodeID}}/{{.taskRetryAttempt}}/{{.taskProject}}/{{.taskDomain}}/{{.taskID}}/{{.taskVersion}}/ - kubernetes-enabled: false - templates: - - displayName: Task Logs - scheme: TaskExecution - templateUris: - - /console/projects/{{.executionProject}}/domains/{{.executionDomain}}/executions/{{.executionName}}/nodeId/{{.nodeID}}/taskId/{{.taskID}}/attempt/{{.taskRetryAttempt}}/view/logs?duration=all&fromExecutionNav=true - storage.yaml: | - storage: - container: "bucket" - type: stow - stow: - kind: s3 - config: - auth_type: accesskey - access_key_id: test123 - secret_key: test - disable_ssl: false - endpoint: http://localstack.aws.svc.cluster.local:4566 - region: us-west-2 - enable-multicontainer: false - limits: - maxDownloadMBs: 1024 - cache: - max_size_mbs: 0 - target_gc_percent: 70 --- -# Source: dataplane/charts/kube-state-metrics/templates/role.yaml +# Source: dataplane/charts/fluentbit/templates/clusterrole.yaml apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: - labels: - helm.sh/chart: kube-state-metrics-5.30.1 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: kube-state-metrics - app.kubernetes.io/name: 
kube-state-metrics + name: release-name-fluentbit + labels: + helm.sh/chart: fluentbit-0.48.9 + app.kubernetes.io/name: fluentbit app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" - name: release-name-kube-state-metrics + app.kubernetes.io/version: "3.2.8" + app.kubernetes.io/managed-by: Helm rules: + - apiGroups: + - "" + resources: + - namespaces + - pods + verbs: + - get + - list + - watch +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
-- apiGroups: ["certificates.k8s.io"] - resources: - - certificatesigningrequests - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - configmaps - verbs: ["list", "watch"] - -- apiGroups: ["batch"] - resources: - - cronjobs - verbs: ["list", "watch"] - -- apiGroups: ["extensions", "apps"] - resources: - - daemonsets - verbs: ["list", "watch"] - -- apiGroups: ["extensions", "apps"] - resources: - - deployments - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - endpoints - verbs: ["list", "watch"] - -- apiGroups: ["autoscaling"] - resources: - - horizontalpodautoscalers - verbs: ["list", "watch"] - -- apiGroups: ["extensions", "networking.k8s.io"] - resources: - - ingresses - verbs: ["list", "watch"] - -- apiGroups: ["batch"] - resources: - - jobs - verbs: ["list", "watch"] - -- apiGroups: ["coordination.k8s.io"] - resources: - - leases - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - limitranges - verbs: ["list", "watch"] - -- apiGroups: ["admissionregistration.k8s.io"] - resources: - - mutatingwebhookconfigurations - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - namespaces - verbs: ["list", "watch"] - -- apiGroups: ["networking.k8s.io"] - resources: - - networkpolicies - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - nodes - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - persistentvolumeclaims - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - persistentvolumes - verbs: ["list", "watch"] - -- apiGroups: ["policy"] - resources: - - poddisruptionbudgets - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - pods - verbs: ["list", "watch"] - -- apiGroups: ["extensions", "apps"] - resources: - - replicasets - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - replicationcontrollers - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - resourcequotas - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - secrets - verbs: 
["list", "watch"] - -- apiGroups: [""] - resources: - - services - verbs: ["list", "watch"] - -- apiGroups: ["apps"] - resources: - - statefulsets - verbs: ["list", "watch"] - -- apiGroups: ["storage.k8s.io"] - resources: - - storageclasses - verbs: ["list", "watch"] +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: knative-serving-operator-aggregated + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +aggregationRule: + clusterRoleSelectors: + # This (along with escalate below) allows the Operator to pick up any + # roles that are provided to the admin of the cluster by knative serving + # automatically. + - matchExpressions: + - {key: serving.knative.dev/release, operator: Exists} +rules: [] # Rules are automatically filled in by the controller manager. +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: knative-serving-operator-aggregated-stable + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +aggregationRule: + clusterRoleSelectors: + # This (along with escalate below) allows the Operator to pick up any + # roles that are provided to the admin of the cluster by knative serving + # automatically. + - matchExpressions: + - {key: app.kubernetes.io/name, operator: In, values: ["knative-serving"]} +rules: [] # Rules are automatically filled in by the controller manager. 
+--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: knative-eventing-operator-aggregated + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +aggregationRule: + clusterRoleSelectors: + # This (along with escalate below) allows the Operator to pick up any + # roles that are provided to the admin of the cluster by knative eventing + # automatically. + - matchExpressions: + - {key: eventing.knative.dev/release, operator: Exists} +rules: [] # Rules are automatically filled in by the controller manager. +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: knative-eventing-operator-aggregated-stable + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +aggregationRule: + clusterRoleSelectors: + # This (along with escalate below) allows the Operator to pick up any + # roles that are provided to the admin of the cluster by knative eventing + # automatically. + - matchExpressions: + - {key: app.kubernetes.io/name, operator: In, values: ["knative-eventing"]} +rules: [] # Rules are automatically filled in by the controller manager. +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. -- apiGroups: ["admissionregistration.k8s.io"] - resources: - - validatingwebhookconfigurations - verbs: ["list", "watch"] +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: knative-serving-operator + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +rules: + - apiGroups: + - operator.knative.dev + resources: + - '*' + verbs: + - '*' + # Bootstrapping permissions. + # Roles that are explicitly bound buch which are specified by this Operator + # MUST be specified here with 'get' and 'bind'. + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterroles + resourceNames: + - system:auth-delegator + verbs: + - bind + - get + - apiGroups: + - rbac.authorization.k8s.io + resources: + - roles + resourceNames: + - extension-apiserver-authentication-reader + verbs: + - bind + - get + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterroles + - roles + verbs: + - create + - delete + # Escalate is necessary in order to create a role using cluster role aggregation, + # and to allow the Operator to bootstrap itself into the necessary set of + # permissions, even as those continue to evolve upstream. + - escalate + - get + - list + - update + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterrolebindings + - rolebindings + verbs: + - create + - delete + - list + - get + - update + # Permissions required for Knative controller + # infra. 
+ - apiGroups: + - apiregistration.k8s.io + resources: + - apiservices + verbs: + - update + - apiGroups: + - "" + resources: + - services + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - caching.internal.knative.dev + resources: + - images + verbs: + - '*' + - apiGroups: + - "" + resources: + - namespaces + verbs: + - get + - update + - watch + - apiGroups: + - '' + resources: + - events + verbs: + - create + - update + - patch + - apiGroups: + - '' + resources: + - configmaps + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - security.istio.io + - apps + - policy + resources: + - poddisruptionbudgets + - peerauthentications + - deployments + - daemonsets + - replicasets + - statefulsets + verbs: + - create + - delete + - get + - list + - watch + - update + - apiGroups: + - apiregistration.k8s.io + resources: + - apiservices + verbs: + - create + - delete + - get + - list + - apiGroups: + - autoscaling + resources: + - horizontalpodautoscalers + verbs: + - create + - delete + - get + - list + - apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - '*' + - apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - '*' + # Old resources that need cleaning up that are not in the knative-serving + # namespace. 
+ - apiGroups: + - "" + resources: + - services + - deployments + - horizontalpodautoscalers + resourceNames: + - knative-ingressgateway + verbs: + - delete + - apiGroups: + - "" + resources: + - configmaps + resourceNames: + - config-controller + verbs: + - delete + - apiGroups: + - "" + resources: + - serviceaccounts + resourceNames: + - knative-serving-operator + verbs: + - delete + # for contour TLS + - apiGroups: + - projectcontour.io + resources: + - httpproxies + - tlscertificatedelegations + verbs: + - get + - list + - watch + - update + - create + - delete + - deletecollection + - patch + # for security-guard + - apiGroups: + - guard.security.knative.dev + resources: + - guardians + verbs: + - get + - list + - watch + - create + - update + - patch + - delete + - apiGroups: + - "" + resources: + - configmaps + verbs: + - get + - list + - watch + - create + - update + - patch + - delete + - apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + verbs: + - create + - apiGroups: + - "" + resources: + - pods + verbs: + - get +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: knative-eventing-operator + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +rules: + - apiGroups: + - operator.knative.dev + resources: + - '*' + verbs: + - '*' + # Bootstrapping permissions. + # Roles that are explicitly bound buch which are specified by this Operator + # MUST be specified here with 'get' and 'bind'. + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterroles + - roles + verbs: + - create + - delete + # Escalate is necessary in order to create a role using cluster role aggregation, + # and to allow the Operator to bootstrap itself into the necessary set of + # permissions, even as those continue to evolve upstream. 
+ - escalate + - get + - list + - update + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterrolebindings + - rolebindings + verbs: + - create + - delete + - list + - get + - update + # Permissions required for Knative controller + # infra. + - apiGroups: + - apiregistration.k8s.io + resources: + - apiservices + verbs: + - update + - apiGroups: + - "" + resources: + - services + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - caching.internal.knative.dev + resources: + - images + verbs: + - '*' + - apiGroups: + - "" + resources: + - namespaces + verbs: + - get + - update + - watch + - apiGroups: + - '' + resources: + - events + verbs: + - create + - update + - patch + - apiGroups: + - '' + resources: + - configmaps + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - apps + resources: + - deployments + - daemonsets + - replicasets + - statefulsets + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - apiregistration.k8s.io + resources: + - apiservices + verbs: + - create + - delete + - get + - list + - apiGroups: + - autoscaling + resources: + - horizontalpodautoscalers + verbs: + - create + - delete + - update + - get + - list + - apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - '*' + - apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - '*' + - apiGroups: + - batch + resources: + - jobs + verbs: + - create + - delete + - update + - get + - list + - watch + # Old resources that need cleaning up that are not in the knative-eventing + # namespace. 
+ - apiGroups: + - "" + resources: + - serviceaccounts + resourceNames: + - knative-eventing-operator + verbs: + - delete + # for RabbitMQ messaging topology objects + - apiGroups: + - rabbitmq.com + resources: + - rabbitmqclusters + verbs: + - "get" + - "list" + - "watch" + - apiGroups: + - rabbitmq.com + resources: + - bindings + - queues + - exchanges + verbs: + - create + - delete + - get + - list + - patch + - update + - watch + - apiGroups: + - rabbitmq.com + resources: + - bindings/status + - queues/status + - exchanges/status + verbs: + - get + # for Kafka eventing source + - apiGroups: + - keda.sh + resources: + - scaledobjects + - scaledobjects/finalizers + - scaledobjects/status + - triggerauthentications + - triggerauthentications/status + verbs: + - get + - list + - watch + - update + - create + - delete + # Internal APIs + - apiGroups: + - "internal.kafka.eventing.knative.dev" + resources: + - "consumers" + - "consumers/status" + - "consumergroups" + - "consumergroups/status" + verbs: + - create + - get + - list + - watch + - patch + - update + - delete + - apiGroups: + - "internal.kafka.eventing.knative.dev" + resources: + - "consumers/finalizers" + - "consumergroups/finalizers" + verbs: + - update + - delete + - apiGroups: + - apps + resources: + - statefulsets/scale + verbs: + - get + - list + - watch + - update + - patch + - create + - delete + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterrolebindings + verbs: + - watch + - apiGroups: + - "*" + resources: + - configmaps + verbs: + - delete + - apiGroups: + - "*" + resources: + - configmaps + - services + verbs: + - get + - list + - watch + - update + - create + - delete + - apiGroups: + - "*" + resources: + - pods + verbs: + - list + - update + - get + - watch + - apiGroups: + - "*" + resources: + - pods/finalizers + verbs: + - get + - list + - create + - update + - delete + - apiGroups: + - "*" + resources: + - events + verbs: + - patch + - create + - apiGroups: + - "*" + 
resources: + - secrets + verbs: + - get + - list + - watch + - update + - create + - delete + - apiGroups: + - "*" + resources: + - nodes + verbs: + - get + - list + - watch + - apiGroups: + - "*" + resources: + - serviceaccounts + verbs: + - get + - list + - watch + - update + - create + - delete + - apiGroups: + - "*" + resources: + - configmaps + resourceNames: + - kafka-channel-config + verbs: + - patch + - apiGroups: + - "*" + resources: + - horizontalpodautoscalers + resourceNames: + - kafka-webhook + verbs: + - delete + - apiGroups: + - "*" + resources: + - leases + verbs: + - delete + - apiGroups: + - "*" + resources: + - poddisruptionbudgets + resourceNames: + - kafka-webhook + verbs: + - delete + - apiGroups: + - "*" + resources: + - services + verbs: + - patch + - apiGroups: + - "apps" + resources: + - deployments + verbs: + - deletecollection + # Eventing TLS + - apiGroups: + - "cert-manager.io" + resources: + - certificates + - issuers + - clusterissuers + verbs: + - create + - delete + - update + - list + - get + - watch + - apiGroups: + - "trust.cert-manager.io" + resources: + - bundles + verbs: + - create + - delete + - update + - list + - get + - watch +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: knative-operator-webhook + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +rules: + # For watching logging configuration and getting certs. + - apiGroups: + - "" + resources: + - "configmaps" + verbs: + - "get" + - "list" + - "watch" + # For manipulating certs into secrets. 
+ - apiGroups: + - "" + resources: + - "namespaces" + verbs: + - "get" + - "create" + - "update" + - "list" + - "watch" + - "patch" + # finalizers are needed for the owner reference of the webhook + - apiGroups: + - "" + resources: + - "namespaces/finalizers" + verbs: + - "update" + # For getting our Deployment so we can decorate with ownerref. + - apiGroups: + - "apps" + resources: + - "deployments" + verbs: + - "get" + - apiGroups: + - "apps" + resources: + - "deployments/finalizers" + verbs: + - update + # For actually registering our webhook. + - apiGroups: + - "admissionregistration.k8s.io" + resources: + - "mutatingwebhookconfigurations" + - "validatingwebhookconfigurations" + verbs: &everything + - "get" + - "list" + - "create" + - "update" + - "delete" + - "patch" + - "watch" + # For leader election + - apiGroups: + - "coordination.k8s.io" + resources: + - "leases" + verbs: *everything + # Necessary for conversion webhook. These are copied from the serving + # TODO: Do we really need all these permissions? 
+ - apiGroups: ["apiextensions.k8s.io"] + resources: ["customresourcedefinitions"] + verbs: ["get", "list", "create", "update", "delete", "patch", "watch"] +--- +# Source: dataplane/charts/fluentbit/templates/clusterrolebinding.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: release-name-fluentbit + labels: + helm.sh/chart: fluentbit-0.48.9 + app.kubernetes.io/name: fluentbit + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "3.2.8" + app.kubernetes.io/managed-by: Helm +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: release-name-fluentbit +subjects: + - kind: ServiceAccount + name: union-system + namespace: union +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# TODO: Consider restriction of non-aggregated role to knativeservings namespaces. 
+apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: knative-serving-operator + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-serving-operator +subjects: + - kind: ServiceAccount + name: knative-operator + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# TODO: Consider restriction of non-aggregated role to knativeeventing namespaces. +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: knative-eventing-operator + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-eventing-operator +subjects: + - kind: ServiceAccount + name: knative-operator + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: operator-webhook + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-operator-webhook +subjects: + - kind: ServiceAccount + name: operator-webhook + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: knative-serving-operator-aggregated + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-serving-operator-aggregated +subjects: + - kind: ServiceAccount + name: knative-operator + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: knative-serving-operator-aggregated-stable + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-serving-operator-aggregated-stable +subjects: + - kind: ServiceAccount + name: knative-operator + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: knative-eventing-operator-aggregated + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-eventing-operator-aggregated +subjects: + - kind: ServiceAccount + name: knative-operator + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: knative-eventing-operator-aggregated-stable + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-eventing-operator-aggregated-stable +subjects: + - kind: ServiceAccount + name: knative-operator + 
namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. -- apiGroups: ["storage.k8s.io"] - resources: - - volumeattachments - verbs: ["list", "watch"] +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + namespace: "union" + name: knative-operator-webhook + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +rules: + # For manipulating certs into secrets. 
+ - apiGroups: + - "" + resources: + - "secrets" + verbs: + - "get" + - "create" + - "update" + - "list" + - "watch" + - "patch" +--- +# Source: dataplane/templates/monitoring/prometheus-rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: union-operator-prometheus-rbac + namespace: union + labels: + release: release-name +rules: + # Prometheus server scrape permissions + - apiGroups: [""] + resources: + - nodes + - nodes/proxy + - nodes/metrics + - services + - endpoints + - pods + - ingresses + - configmaps + verbs: + - get + - list + - watch + - apiGroups: ["extensions", "networking.k8s.io"] + resources: + - ingresses/status + - ingresses + verbs: + - get + - list + - watch + - apiGroups: ["discovery.k8s.io"] + resources: + - endpointslices + verbs: + - get + - list + - watch + # kube-state-metrics permissions (rbac.create=false on the subchart; managed here instead) + - apiGroups: ["certificates.k8s.io"] + resources: ["certificatesigningrequests"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["configmaps"] + verbs: ["list", "watch"] + - apiGroups: ["batch"] + resources: ["cronjobs"] + verbs: ["list", "watch"] + - apiGroups: ["extensions", "apps"] + resources: ["daemonsets"] + verbs: ["list", "watch"] + - apiGroups: ["extensions", "apps"] + resources: ["deployments"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["endpoints"] + verbs: ["list", "watch"] + - apiGroups: ["discovery.k8s.io"] + resources: ["endpointslices"] + verbs: ["list", "watch"] + - apiGroups: ["autoscaling"] + resources: ["horizontalpodautoscalers"] + verbs: ["list", "watch"] + - apiGroups: ["extensions", "networking.k8s.io"] + resources: ["ingresses"] + verbs: ["list", "watch"] + - apiGroups: ["batch"] + resources: ["jobs"] + verbs: ["list", "watch"] + - apiGroups: ["coordination.k8s.io"] + resources: ["leases"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["limitranges"] + verbs: ["list", "watch"] + - apiGroups: 
["admissionregistration.k8s.io"] + resources: ["mutatingwebhookconfigurations"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["namespaces"] + verbs: ["list", "watch"] + - apiGroups: ["networking.k8s.io"] + resources: ["networkpolicies"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["persistentvolumeclaims"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["persistentvolumes"] + verbs: ["list", "watch"] + - apiGroups: ["policy"] + resources: ["poddisruptionbudgets"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["pods"] + verbs: ["list", "watch"] + - apiGroups: ["extensions", "apps"] + resources: ["replicasets"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["replicationcontrollers"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["resourcequotas"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["secrets"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["services"] + verbs: ["list", "watch"] + - apiGroups: ["apps"] + resources: ["statefulsets"] + verbs: ["list", "watch"] + - apiGroups: ["storage.k8s.io"] + resources: ["storageclasses"] + verbs: ["list", "watch"] + - apiGroups: ["admissionregistration.k8s.io"] + resources: ["validatingwebhookconfigurations"] + verbs: ["list", "watch"] + - apiGroups: ["storage.k8s.io"] + resources: ["volumeattachments"] + verbs: ["list", "watch"] --- # Source: dataplane/templates/nodeexecutor/serviceaccount.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole +kind: Role metadata: name: union-executor labels: @@ -3070,100 +4615,11 @@ rules: - delete - update --- -# Source: dataplane/templates/prometheus/rbac.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: union-operator-prometheus - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" - 
app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus -rules: - - apiGroups: [""] - resources: - - nodes - - nodes/proxy - - pods - - endpoints - - services - verbs: - - get - - list - - watch - - nonResourceURLs: - - /metrics - - /metrics/cadvisor - verbs: - - get ---- -# Source: dataplane/charts/kube-state-metrics/templates/clusterrolebinding.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - labels: - helm.sh/chart: kube-state-metrics-5.30.1 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: kube-state-metrics - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" - name: release-name-kube-state-metrics -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: release-name-kube-state-metrics -subjects: -- kind: ServiceAccount - name: release-name-kube-state-metrics - namespace: union ---- -# Source: dataplane/templates/nodeexecutor/serviceaccount.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: union-executor - labels: - app: executor -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: union-executor -subjects: -- kind: ServiceAccount - name: executor - namespace: union ---- -# Source: dataplane/templates/prometheus/rbac.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: union-operator-prometheus - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: union-operator-prometheus -subjects: - - kind: ServiceAccount - name: union-operator-prometheus - namespace: union ---- # Source: 
dataplane/templates/operator/serviceaccount-proxy-secret.yaml apiVersion: rbac.authorization.k8s.io/v1 kind: Role metadata: - name: proxy-system-secret + name: union-system-secret namespace: union labels: app.kubernetes.io/name: operator-proxy @@ -3261,12 +4717,25 @@ rules: - create - update - delete + - apiGroups: + - serving.knative.dev + resources: + - revisions + - configurations + - services + verbs: + - get + - list + - watch + - create + - update + - delete --- -# Source: dataplane/templates/propeller/serviceaccount-webhook.yaml +# Source: dataplane/templates/webhook/serviceaccount.yaml kind: Role apiVersion: rbac.authorization.k8s.io/v1 metadata: - name: flytepropeller-webhook-role + name: union-webhook-role namespace: union rules: - apiGroups: @@ -3282,78 +4751,94 @@ rules: - update - patch --- -# Source: dataplane/templates/propeller/serviceaccount.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ apiVersion: rbac.authorization.k8s.io/v1 -kind: Role +kind: RoleBinding metadata: - name: flytepropeller-role -rules: - # Allow RO access to PODS - - apiGroups: - - "" - resources: - - pods - verbs: - - get - - list - - watch - # Allow Event recording access - - apiGroups: - - "" - resources: - - events - verbs: - - create - - update - - delete - - patch - # Allow Access All plugin objects - - apiGroups: - - '*' - resources: - - '*' - verbs: - - get - - list - - watch - - create - - update - - delete - - patch - # Allow Access to CRD - - apiGroups: - - apiextensions.k8s.io - resources: - - customresourcedefinitions - verbs: - - get - - list - - watch - - create - - delete - - update - # Allow Access to all resources under flyte.lyft.com - - apiGroups: - - flyte.lyft.com - resources: - - flyteworkflows - - flyteworkflows/finalizers - verbs: - - get - - list - - watch - - create - - update - - delete - - patch - - post - - deletecollection + namespace: "union" + name: operator-webhook + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +subjects: + - kind: ServiceAccount + name: operator-webhook + namespace: "union" +roleRef: + kind: Role + name: knative-operator-webhook + apiGroup: rbac.authorization.k8s.io +--- +# Source: dataplane/templates/monitoring/prometheus-rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: union-operator-prometheus-rbac + namespace: union + labels: + release: release-name +subjects: + - kind: ServiceAccount + name: union-operator-prometheus + namespace: union +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: union-operator-prometheus-rbac +--- +# Source: dataplane/templates/monitoring/prometheus-rbac.yaml +# kube-state-metrics service account RoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: release-name-prometheus-kube-state-metrics + namespace: union + labels: + release: release-name +subjects: + - 
kind: ServiceAccount + name: release-name-prometheus-kube-state-metrics + namespace: union +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: union-operator-prometheus-rbac +--- +# Source: dataplane/templates/nodeexecutor/serviceaccount.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: union-executor + labels: + app: executor +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: union-executor +subjects: +- kind: ServiceAccount + name: union-system + namespace: union --- # Source: dataplane/templates/operator/serviceaccount-proxy-secret.yaml apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding metadata: - name: proxy-system-secret + name: union-system-secret namespace: union labels: app.kubernetes.io/name: operator-proxy @@ -3363,10 +4848,10 @@ metadata: roleRef: apiGroup: rbac.authorization.k8s.io kind: Role - name: proxy-system-secret + name: union-system-secret subjects: - kind: ServiceAccount - name: proxy-system + name: union-system namespace: union --- # Source: dataplane/templates/operator/serviceaccount-proxy.yaml @@ -3385,7 +4870,7 @@ roleRef: name: proxy-system subjects: - kind: ServiceAccount - name: proxy-system + name: union-system namespace: union --- # Source: dataplane/templates/operator/serviceaccount.yaml @@ -3404,53 +4889,102 @@ roleRef: name: operator-system subjects: - kind: ServiceAccount - name: operator-system + name: union-system namespace: union --- -# Source: dataplane/templates/propeller/serviceaccount-webhook.yaml +# Source: dataplane/templates/webhook/serviceaccount.yaml # Create a binding from Role -> ServiceAccount kind: RoleBinding apiVersion: rbac.authorization.k8s.io/v1 metadata: - name: flytepropeller-webhook-binding + name: union-webhook-binding namespace: union roleRef: apiGroup: rbac.authorization.k8s.io kind: Role - name: flytepropeller-webhook-role + name: union-webhook-role subjects: - kind: ServiceAccount - name: flytepropeller-webhook-system + name: 
union-system namespace: union --- -# Source: dataplane/templates/propeller/serviceaccount.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding +# Source: dataplane/charts/fluentbit/templates/service.yaml +apiVersion: v1 +kind: Service metadata: - name: flytepropeller-binding -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - name: flytepropeller-role -subjects: - - kind: ServiceAccount - name: flytepropeller-system - namespace: union + name: release-name-fluentbit + namespace: union + labels: + helm.sh/chart: fluentbit-0.48.9 + app.kubernetes.io/name: fluentbit + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "3.2.8" + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - port: 2020 + targetPort: http + protocol: TCP + name: http + selector: + app.kubernetes.io/name: fluentbit + app.kubernetes.io/instance: release-name +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v1 +kind: Service +metadata: + labels: + role: operator-webhook + app.kubernetes.io/component: operator-webhook + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator + name: operator-webhook + namespace: "union" +spec: + ports: + # Define metrics and profiling for them to be accessible within service meshes. 
+ - name: http-metrics + port: 9090 + targetPort: 9090 + - name: http-profiling + port: 8008 + targetPort: 8008 + - name: https-webhook + port: 443 + targetPort: 8443 + selector: + role: operator-webhook --- -# Source: dataplane/charts/kube-state-metrics/templates/service.yaml +# Source: dataplane/charts/prometheus/charts/kube-state-metrics/templates/service.yaml apiVersion: v1 kind: Service metadata: name: release-name-kube-state-metrics namespace: union labels: - helm.sh/chart: kube-state-metrics-5.30.1 + helm.sh/chart: kube-state-metrics-5.25.1 app.kubernetes.io/managed-by: Helm app.kubernetes.io/component: metrics app.kubernetes.io/part-of: kube-state-metrics app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" + app.kubernetes.io/version: "2.13.0" annotations: prometheus.io/scrape: 'true' spec: @@ -3465,6 +4999,33 @@ spec: app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/instance: release-name --- +# Source: dataplane/charts/prometheus/templates/service.yaml +apiVersion: v1 +kind: Service +metadata: + labels: + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: prometheus + name: union-operator-prometheus + namespace: union +spec: + ports: + - name: http + port: 80 + protocol: TCP + targetPort: 9090 + selector: + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus + app.kubernetes.io/instance: release-name + sessionAffinity: None + type: "ClusterIP" +--- # Source: dataplane/templates/clusterresourcesync/service.yaml apiVersion: v1 kind: Service @@ -3487,6 +5048,31 @@ spec: app.kubernetes.io/name: clusterresourcesync app.kubernetes.io/instance: release-name --- +# Source: dataplane/templates/flyteconnector/service.yaml +apiVersion: v1 +kind: Service +metadata: 
+ name: flyteconnector + namespace: union + labels: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + app.kubernetes.io/managed-by: Helm +spec: + clusterIP: None + ports: + - name: grpc + port: 8000 + protocol: TCP + targetPort: grpc + - name: metric + port: 9090 + protocol: TCP + targetPort: metric + selector: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name +--- # Source: dataplane/templates/imagebuilder/service.yaml apiVersion: v1 kind: Service @@ -3512,7 +5098,7 @@ spec: apiVersion: v1 kind: Service metadata: - name: release-name-dataplane-executor + name: union-operator-executor labels: platform.union.ai/prometheus-group: "union-services" app: executor @@ -3578,39 +5164,14 @@ spec: app.kubernetes.io/name: union-operator app.kubernetes.io/instance: release-name --- -# Source: dataplane/templates/prometheus/service.yaml -apiVersion: v1 -kind: Service -metadata: - name: union-operator-prometheus - namespace: union - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus -spec: - type: ClusterIP - ports: - - port: 80 - targetPort: 9090 - protocol: TCP - name: http - selector: - app.kubernetes.io/component: prometheus - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name ---- -# Source: dataplane/templates/propeller/service-webhook.yaml +# Source: dataplane/templates/webhook/service.yaml apiVersion: v1 kind: Service metadata: - name: flyte-pod-webhook + name: union-pod-webhook namespace: union labels: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm @@ -3618,7 +5179,7 @@ metadata: 
projectcontour.io/upstream-protocol.h2c: grpc spec: selector: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name ports: - name: https @@ -3630,23 +5191,23 @@ spec: port: 10254 targetPort: 10254 --- -# Source: dataplane/templates/propeller/service-webhook.yaml +# Source: dataplane/templates/webhook/service.yaml # Headless Service for cache invalidation — resolves to all pod IPs so that # we can fan out invalidation requests to every webhook replica. apiVersion: v1 kind: Service metadata: - name: flyte-pod-webhook-headless + name: union-pod-webhook-headless namespace: union labels: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm spec: clusterIP: None selector: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name ports: - name: cache-internal @@ -3654,46 +5215,294 @@ spec: port: 9443 targetPort: 9443 --- -# Source: dataplane/templates/propeller/service.yaml -apiVersion: v1 -kind: Service +# Source: dataplane/charts/fluentbit/templates/daemonset.yaml +apiVersion: apps/v1 +kind: DaemonSet metadata: + name: release-name-fluentbit namespace: union - name: flytepropeller labels: - platform.union.ai/prometheus-group: "union-services" - app.kubernetes.io/name: flytepropeller + helm.sh/chart: fluentbit-0.48.9 + app.kubernetes.io/name: fluentbit app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name + app.kubernetes.io/version: "3.2.8" app.kubernetes.io/managed-by: Helm spec: - type: ClusterIP - ports: - - name: debug - protocol: TCP - port: 10254 - - name: fasttask - port: 15605 - protocol: TCP - targetPort: 15605 selector: - app.kubernetes.io/name: flytepropeller - app.kubernetes.io/instance: release-name + matchLabels: + 
app.kubernetes.io/name: fluentbit + app.kubernetes.io/instance: release-name + template: + metadata: + labels: + app.kubernetes.io/name: fluentbit + app.kubernetes.io/instance: release-name + annotations: + checksum/config: e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 + spec: + serviceAccountName: union-system + hostNetwork: false + dnsPolicy: ClusterFirst + containers: + - name: fluentbit + image: "cr.fluentbit.io/fluent/fluent-bit:3.2.8" + imagePullPolicy: IfNotPresent + command: + - /fluent-bit/bin/fluent-bit + args: + - --workdir=/fluent-bit/etc + - --config=/fluent-bit/etc/conf/fluent-bit.conf + ports: + - name: http + containerPort: 2020 + protocol: TCP + livenessProbe: + httpGet: + path: / + port: http + readinessProbe: + httpGet: + path: /api/v1/health + port: http + volumeMounts: + - name: config + mountPath: /fluent-bit/etc/conf + - mountPath: /var/log + name: varlog + - mountPath: /var/lib/docker/containers + name: varlibdockercontainers + readOnly: true + - mountPath: /etc/machine-id + name: etcmachineid + readOnly: true + volumes: + - name: config + configMap: + name: fluentbit-system + - hostPath: + path: /var/log + name: varlog + - hostPath: + path: /var/lib/docker/containers + name: varlibdockercontainers + - hostPath: + path: /etc/machine-id + type: File + name: etcmachineid + tolerations: + - operator: Exists +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: operator-webhook + namespace: "union" + labels: + app.kubernetes.io/component: operator-webhook + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +spec: + selector: + matchLabels: + app: operator-webhook + role: operator-webhook + template: + metadata: + labels: + app: operator-webhook + role: operator-webhook + app.kubernetes.io/component: operator-webhook + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator + sidecar.istio.io/inject: "false" + spec: + # To avoid node becoming SPOF, spread our replicas to different nodes. + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - podAffinityTerm: + labelSelector: + matchLabels: + app: operator-webhook + topologyKey: kubernetes.io/hostname + weight: 100 + serviceAccountName: operator-webhook + containers: + - name: operator-webhook + # This is the Go import path for the binary that is containerized + # and substituted here. 
+ image: gcr.io/knative-releases/knative.dev/operator/cmd/webhook@sha256:d3a7a3304629ccfcaacec620b50736e0b4c902a6328b83369b6cbaf7e94677c9 + resources: + requests: + cpu: 100m + memory: 100Mi + limits: + cpu: 500m + memory: 500Mi + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: SYSTEM_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: CONFIG_LOGGING_NAME + value: config-logging + - name: CONFIG_OBSERVABILITY_NAME + value: config-observability + - name: WEBHOOK_NAME + value: operator-webhook + - name: WEBHOOK_PORT + value: "8443" + - name: WEBHOOK_SECRET_NAME + value: operator-webhook-certs + - name: METRICS_DOMAIN + value: knative.dev/operator + - name: KUBERNETES_MIN_VERSION + value: "" + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + runAsNonRoot: true + capabilities: + drop: + - ALL + ports: + - name: metrics + containerPort: 9090 + - name: profiling + containerPort: 8008 + - name: https-webhook + containerPort: 8443 + readinessProbe: + periodSeconds: 1 + httpGet: + scheme: HTTPS + port: 8443 + httpHeaders: + - name: k-kubelet-probe + value: "webhook" + livenessProbe: + periodSeconds: 1 + httpGet: + scheme: HTTPS + port: 8443 + httpHeaders: + - name: k-kubelet-probe + value: "webhook" + failureThreshold: 6 + initialDelaySeconds: 120 + # Our webhook should gracefully terminate by lame ducking first, set this to a sufficiently + # high value that we respect whatever value it has configured for the lame duck grace period. + terminationGracePeriodSeconds: 300 +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: knative-operator + namespace: "union" + labels: + app.kubernetes.io/name: knative-operator + app.kubernetes.io/version: "1.16.0" +spec: + replicas: 1 + selector: + matchLabels: + name: knative-operator + template: + metadata: + labels: + name: knative-operator + app.kubernetes.io/name: knative-operator + app.kubernetes.io/version: "1.16.0" + sidecar.istio.io/inject: "false" + spec: + serviceAccountName: knative-operator + containers: + - name: knative-operator + image: gcr.io/knative-releases/knative.dev/operator/cmd/operator@sha256:0b5a3532417f9c8e7b6044e23f6f67ad932a74a40a4092ad965b6f173b2fd887 + resources: + requests: + cpu: 100m + memory: 100Mi + limits: + cpu: 1000m + memory: 1000Mi + imagePullPolicy: IfNotPresent + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: SYSTEM_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: METRICS_DOMAIN + value: knative.dev/operator + - name: CONFIG_LOGGING_NAME + value: config-logging + - name: CONFIG_OBSERVABILITY_NAME + value: config-observability + - name: KUBERNETES_MIN_VERSION + value: "" + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + runAsNonRoot: true + capabilities: + drop: + - ALL + ports: + - name: metrics + containerPort: 9090 --- -# Source: dataplane/charts/kube-state-metrics/templates/deployment.yaml +# Source: dataplane/charts/prometheus/charts/kube-state-metrics/templates/deployment.yaml apiVersion: apps/v1 kind: Deployment metadata: name: 
release-name-kube-state-metrics namespace: union labels: - helm.sh/chart: kube-state-metrics-5.30.1 + helm.sh/chart: kube-state-metrics-5.25.1 app.kubernetes.io/managed-by: Helm app.kubernetes.io/component: metrics app.kubernetes.io/part-of: kube-state-metrics app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" + app.kubernetes.io/version: "2.13.0" spec: selector: matchLabels: @@ -3706,13 +5515,13 @@ spec: template: metadata: labels: - helm.sh/chart: kube-state-metrics-5.30.1 + helm.sh/chart: kube-state-metrics-5.25.1 app.kubernetes.io/managed-by: Helm app.kubernetes.io/component: metrics app.kubernetes.io/part-of: kube-state-metrics app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" + app.kubernetes.io/version: "2.13.0" spec: automountServiceAccountToken: true hostNetwork: false @@ -3729,8 +5538,9 @@ spec: args: - --port=8080 - --resources=certificatesigningrequests,configmaps,cronjobs,daemonsets,deployments,endpoints,horizontalpodautoscalers,ingresses,jobs,leases,limitranges,mutatingwebhookconfigurations,namespaces,networkpolicies,nodes,persistentvolumeclaims,persistentvolumes,poddisruptionbudgets,pods,replicasets,replicationcontrollers,resourcequotas,secrets,services,statefulsets,storageclasses,validatingwebhookconfigurations,volumeattachments + - --namespaces=union imagePullPolicy: IfNotPresent - image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.15.0 + image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.13.0 ports: - containerPort: 8080 name: "http" @@ -3750,7 +5560,7 @@ spec: httpGet: httpHeaders: path: /readyz - port: 8081 + port: 8080 scheme: HTTP initialDelaySeconds: 5 periodSeconds: 10 @@ -3765,6 +5575,205 @@ spec: - ALL readOnlyRootFilesystem: true --- +# Source: dataplane/charts/prometheus/templates/deploy.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + 
app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: prometheus + name: union-operator-prometheus + namespace: union +spec: + selector: + matchLabels: + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus + app.kubernetes.io/instance: release-name + replicas: 1 + revisionHistoryLimit: 10 + strategy: + type: Recreate + rollingUpdate: null + template: + metadata: + labels: + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: prometheus + spec: + enableServiceLinks: true + serviceAccountName: union-operator-prometheus + containers: + - name: prometheus-server-configmap-reload + image: "quay.io/prometheus-operator/prometheus-config-reloader:v0.76.0" + imagePullPolicy: "IfNotPresent" + args: + - --watched-dir=/etc/config + - --listen-address=0.0.0.0:8080 + - --reload-url=http://127.0.0.1:9090/prometheus/-/reload + ports: + - containerPort: 8080 + name: metrics + livenessProbe: + httpGet: + path: /healthz + port: metrics + scheme: HTTP + initialDelaySeconds: 2 + periodSeconds: 10 + readinessProbe: + httpGet: + path: /healthz + port: metrics + scheme: HTTP + periodSeconds: 10 + volumeMounts: + - name: config-volume + mountPath: /etc/config + readOnly: true + + - name: prometheus-server + image: "quay.io/prometheus/prometheus:v2.54.1" + imagePullPolicy: "IfNotPresent" + args: + - --storage.tsdb.retention.time=3d + - --config.file=/etc/config/prometheus.yml + - --storage.tsdb.path=/data + - --web.console.libraries=/etc/prometheus/console_libraries + - --web.console.templates=/etc/prometheus/consoles + - --web.enable-lifecycle + - 
--web.route-prefix=/prometheus + - --web.external-url=/prometheus/ + ports: + - containerPort: 9090 + readinessProbe: + httpGet: + path: /prometheus/-/ready + port: 9090 + scheme: HTTP + initialDelaySeconds: 30 + periodSeconds: 5 + timeoutSeconds: 4 + failureThreshold: 3 + successThreshold: 1 + livenessProbe: + httpGet: + path: /prometheus/-/healthy + port: 9090 + scheme: HTTP + initialDelaySeconds: 30 + periodSeconds: 15 + timeoutSeconds: 10 + failureThreshold: 3 + successThreshold: 1 + resources: + limits: + cpu: "3" + memory: 3500Mi + requests: + cpu: "1" + memory: 1Gi + volumeMounts: + - name: config-volume + mountPath: /etc/config + - name: storage-volume + mountPath: /data + subPath: "" + - name: server-recording-rules + mountPath: /etc/config/recording + subPath: + readOnly: + dnsPolicy: ClusterFirst + securityContext: + fsGroup: 65534 + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + terminationGracePeriodSeconds: 300 + volumes: + - name: config-volume + configMap: + name: union-operator-prometheus + - name: server-recording-rules + configMap: + name: union-recording-rules + - name: storage-volume + emptyDir: + sizeLimit: 10Gi +--- +# Source: dataplane/templates/flyteconnector/deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: flyteconnector + namespace: union + labels: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + app.kubernetes.io/managed-by: Helm +spec: + replicas: 2 + selector: + matchLabels: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + template: + metadata: + annotations: + labels: + platform.union.ai/zone: "dataplane" + + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + app.kubernetes.io/managed-by: Helm + spec: + containers: + - command: + - c0 + image: "ghcr.io/flyteorg/flyte-connectors:py3.13-2.0.0b50.dev3-g695bb1db3.d20260122" + imagePullPolicy: "IfNotPresent" + name: flyteconnector + 
readinessProbe: + failureThreshold: 3 + httpGet: + path: / + port: 9090 + scheme: HTTP + initialDelaySeconds: 5 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 5 + env: + - name: AIRFLOW_CONN_GOOGLE_CLOUD_DEFAULT + value: "google-cloud-platform://" + - name: LOG_LEVEL + value: "20" + ports: + - containerPort: 8000 + name: grpc + - containerPort: 9090 + name: metric + resources: + limits: + cpu: "1.5" + ephemeral-storage: 100Mi + memory: 1500Mi + requests: + cpu: "1" + ephemeral-storage: 100Mi + memory: 1000Mi + serviceAccountName: flyteconnector +--- # Source: dataplane/templates/imagebuilder/deployment.yaml apiVersion: apps/v1 kind: Deployment @@ -3785,17 +5794,15 @@ spec: app.kubernetes.io/instance: release-name template: metadata: - annotations: - container.apparmor.security.beta.kubernetes.io/buildkit: unconfined labels: platform.union.ai/zone: "dataplane" app.kubernetes.io/name: imagebuilder-buildkit app.kubernetes.io/instance: release-name spec: - serviceAccountName: "union-imagebuilder" + serviceAccountName: "union-system" containers: - name: "buildkit" - image: "docker.io/moby/buildkit:buildx-stable-1-rootless" + image: "docker.io/moby/buildkit:buildx-stable-1" imagePullPolicy: IfNotPresent env: - name: POD_NAME @@ -3830,18 +5837,15 @@ spec: - name: KNATIVE_PROXY_SERVICE_URL value: http://kourier-internal volumeMounts: - - mountPath: /home/user/.local/share/buildkit - name: buildkitd - mountPath: /etc/buildkit name: buildkit-config args: - --config - /etc/buildkit/buildkitd.toml - --addr - - unix:///run/user/1000/buildkit/buildkitd.sock + - unix:///run//buildkit/buildkitd.sock - --addr - tcp://0.0.0.0:1234 - - --oci-worker-no-process-sandbox ports: - name: tcp containerPort: 1234 @@ -3863,18 +5867,13 @@ spec: initialDelaySeconds: 5 periodSeconds: 30 securityContext: - seccompProfile: # Needs Kubernetes >= 1.19 - type: Unconfined - runAsUser: 1000 - runAsGroup: 1000 + privileged: true resources: requests: - cpu: 1 - ephemeral-storage: 20Gi - 
memory: 1Gi + cpu: 4 + ephemeral-storage: 50Gi + memory: 4Gi volumes: - - name: buildkitd - emptyDir: {} - configMap: name: union-operator-buildkit name: buildkit-config @@ -3903,16 +5902,18 @@ spec: template: metadata: annotations: - configChecksum: "b99c09d1a1a5eed74ac829e07ca2c6d82feafa4635cc4465d6cf7a5532d2b02" + configChecksum: "f9987502f74cfc6192b29b3552077e9918321ae31d7d03f536759d9b9ef60e4" labels: platform.union.ai/zone: "dataplane" app: executor + app.kubernetes.io/instance: 'release-name' + app.kubernetes.io/name: executor spec: securityContext: fsGroup: 1337 - serviceAccountName: executor + serviceAccountName: union-system volumes: - name: config-volume configMap: @@ -3928,7 +5929,7 @@ spec: image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" imagePullPolicy: IfNotPresent command: - - executor + - executorv2 - serve - --config - /etc/config/*.yaml @@ -4006,7 +6007,7 @@ spec: template: metadata: annotations: - configChecksum: "69aef76847958ab05336281aaac503f93ef70f6722254e90d89fd7043ef2616" + configChecksum: "5f2a1d1f489bd8eec047919bba6d5046eaa4349289ae15c3410882a224a4037" labels: @@ -4024,7 +6025,7 @@ spec: - name: secret-volume secret: secretName: union-secret-auth - serviceAccountName: proxy-system + serviceAccountName: union-system securityContext: {} containers: @@ -4141,7 +6142,7 @@ spec: template: metadata: annotations: - configChecksum: "69aef76847958ab05336281aaac503f93ef70f6722254e90d89fd7043ef2616" + configChecksum: "5f2a1d1f489bd8eec047919bba6d5046eaa4349289ae15c3410882a224a4037" labels: @@ -4150,7 +6151,7 @@ spec: platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm spec: - serviceAccountName: operator-system + serviceAccountName: union-system securityContext: {} volumes: @@ -4231,81 +6232,16 @@ spec: containerPort: 10254 protocol: TCP --- -# Source: dataplane/templates/prometheus/deployment.yaml -apiVersion: apps/v1 -kind: Deployment -metadata: - name: union-operator-prometheus - namespace: union - labels: - 
helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus -spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/component: prometheus - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - template: - metadata: - annotations: - configChecksum: "3d012b080e73fe7b58126e78a1fc36f7ec49024230c00dfa369b24df267b19c" - labels: - app.kubernetes.io/component: prometheus - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - spec: - priorityClassName: system-cluster-critical - serviceAccountName: union-operator-prometheus - securityContext: - fsGroup: 65534 - runAsNonRoot: true - runAsUser: 65534 - fsGroupChangePolicy: OnRootMismatch - containers: - - name: prometheus - image: "prom/prometheus:v3.3.1" - args: - - --config.file=/etc/prometheus/prometheus.yml - - --web.external-url=/prometheus/ - - --web.route-prefix=/prometheus/ - - --storage.tsdb.retention.time=3d - ports: - - name: http - containerPort: 9090 - protocol: TCP - resources: - limits: - cpu: "3" - memory: 3500Mi - requests: - cpu: "1" - memory: 1Gi - securityContext: - allowPrivilegeEscalation: false - terminationMessagePolicy: FallbackToLogsOnError - volumeMounts: - - mountPath: /etc/prometheus - name: prometheus-config - volumes: - - name: prometheus-config - configMap: - name: union-operator-prometheus ---- -# Source: dataplane/templates/propeller/deployment-webhook.yaml +# Source: dataplane/templates/webhook/deployment.yaml +# Webhook deployment # Create the actual deployment apiVersion: apps/v1 kind: Deployment metadata: - name: flytepropeller-webhook + name: union-pod-webhook namespace: union labels: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name 
platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm @@ -4313,19 +6249,19 @@ spec: replicas: 1 selector: matchLabels: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name template: metadata: labels: platform.union.ai/zone: "dataplane" - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm annotations: - configChecksum: "1802a3a59db84835bdd2fc604d1b6be50ee3e55489f40c43e896d4c300638ce" + configChecksum: "77958aee0464eee4cb43c70036cf5ffa89f47f58c908845b9eeaecc7e32bba4" spec: securityContext: @@ -4333,64 +6269,8 @@ spec: fsGroupChangePolicy: Always runAsNonRoot: true runAsUser: 1001 - seLinuxOptions: - type: spc_t - serviceAccountName: flytepropeller-webhook-system - initContainers: - - name: generate-secrets - image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" - imagePullPolicy: "IfNotPresent" - command: - - flytepropeller - args: - - webhook - - init-certs - - --config - - /etc/flyte/config/*.yaml - env: - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: GOMEMLIMIT - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.memory - - name: GOMAXPROCS - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.cpu - - name: CLUSTER_NAME - valueFrom: - secretKeyRef: - name: operator-cluster-name - key: cluster_name - - name: DEPLOYMENT_NAME - value: operator - - name: PROXY_SERVICE_URL - value: http://union-operator-proxy:8080 - - name: PROMETHEUS_SERVICE_URL - value: http://union-operator-prometheus:80 - - name: KNATIVE_PROXY_SERVICE_URL - value: http://kourier-internal - volumeMounts: - - name: config-volume - mountPath: /etc/flyte/config - resources: - limits: - cpu: 1 - 
ephemeral-storage: 500Mi - memory: 500Mi - requests: - cpu: 200m - ephemeral-storage: 500Mi - memory: 500Mi + seLinuxOptions: null + serviceAccountName: union-system containers: - name: webhook image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" @@ -4455,103 +6335,320 @@ spec: volumes: - name: config-volume configMap: - name: flyte-propeller-config + name: union-pod-webhook-config - name: webhook-certs secret: - secretName: flyte-pod-webhook + secretName: union-pod-webhook --- -# Source: dataplane/templates/propeller/deployment.yaml -apiVersion: apps/v1 -kind: Deployment +# Source: dataplane/templates/flyteconnector/hpa.yaml +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler metadata: - namespace: union - name: flytepropeller + name: flyteconnector + labels: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + app.kubernetes.io/managed-by: Helm +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: flyteconnector + minReplicas: 2 + maxReplicas: 5 + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 80 + - type: Resource + resource: + name: memory + target: + type: Utilization + averageUtilization: 80 +--- +# Source: dataplane/templates/webhook/mutatingwebhookconfiguration.yaml +apiVersion: admissionregistration.k8s.io/v1 +kind: MutatingWebhookConfiguration +metadata: + name: union-pod-webhook-union labels: - app.kubernetes.io/name: flytepropeller + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm +webhooks: + - name: union-pod-webhook.flyte.org + admissionReviewVersions: + - v1 + - v1beta1 + clientConfig: + caBundle: 
LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUROekNDQWgrZ0F3SUJBZ0lVSm9kQ09lem4vd1BHQTZjYkw1U3duSlpYT2hJd0RRWUpLb1pJaHZjTkFRRUwKQlFBd0t6RXBNQ2NHQTFVRUF3d2dabXg1ZEdWd2NtOXdaV3hzWlhJdGQyVmlhRzl2YXk1MWJtbHZiaTV6ZG1NdwpIaGNOTWpZd01USXpNREl6TmpVMVdoY05Nell3TVRJeE1ESXpOalUxV2pBck1Ta3dKd1lEVlFRRERDQm1iSGwwClpYQnliM0JsYkd4bGNpMTNaV0pvYjI5ckxuVnVhVzl1TG5OMll6Q0NBU0l3RFFZSktvWklodmNOQVFFQkJRQUQKZ2dFUEFEQ0NBUW9DZ2dFQkFJcjBSWDRGRjFNVTNYdFBmUHErMDUrMGI1Qm0xK3hsTVR5ZUp4bG5mWFJuRlgyegpPcHV6NmsreUpBQk4xRkxmNDVYRnlJclFlaW5DWWJtckZXalFRc3ZDWEloNkpPNGMwdmRMVVU5RERYcldWRFlsCkRNZytTUDlJVnZBMm5USkVFZ2RFREVsKzNWeThUbUJoNlNmcGthcXdHajY5SmhIdy9OeE9yZzM2bUY3TU5VZ1MKWXJVUkxqUFFzTUN6NElTVzhhQnEzS2IwRUFWUTA4V1lZQ0ZEbjY1aHBzVFdxR05rL3BKRnVTYjlsZll6RVdyYwpJeW5wNDhqTnIvNUdKS3NnWG9LeTUwZVo4UkppeTVyOUhENGtnSW15TVNQR2RDMXFZSVppRHRlZW8yOElxNk5nCmd1dWcyTGhlNndYYW9YZXFCQ2d3SHpwWFVXM1hJVzI5eFoxUkFRRUNBd0VBQWFOVE1GRXdIUVlEVlIwT0JCWUUKRktITDQ1RWhQaVg5bnlRN241QUk0YUZVNFlCL01COEdBMVVkSXdRWU1CYUFGS0hMNDVFaFBpWDlueVE3bjVBSQo0YUZVNFlCL01BOEdBMVVkRXdFQi93UUZNQU1CQWY4d0RRWUpLb1pJaHZjTkFRRUxCUUFEZ2dFQkFEeTRac21XCkNjZ3B5TmI4cU1ydzNYclBNejZLMTRlTVhqQVk1bERFYXM1NklLZExoZFM1bi9sWk95bENuUUtOZ3RHS0VrY3UKT05tMEE2Z0xNT3UzcE43dG9aNXh0WktOcnUvS0x6dm44QUNNNndCbUR0bG5oRERDa3ZUTHBVRzh6UTZSNmlaegpzTWlhVmpLSUJiWkJsZXNYSHZZQnQ2YlRTN2o1QWM2aTdYc1RPK3ZqQ2RUQjVYdCtEZ0x2QnBJRWh2ZGJzNXhmClFtc3ZzQmwxenJnR3JXemo4d0txcXNUMzZaOVU1TzdSK1RJMlQrUDVOR25SeHNQOGJ0R0RtMGZ5SW55WlJrSkcKVTdpVFJyMlZRWlRyaTNZQ2dlVjN1UjFtMnJjc2hyU1Avb082T0QyTXR5Q1FMRWtFK1pxVUVJVnpSYmVaNXl2RQptM29YMFV2OUtBUy9pWTA9Ci0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K + service: + name: union-pod-webhook + namespace: union + path: /mutate--v1-pod/secrets + port: 443 + failurePolicy: Fail + matchPolicy: Equivalent + namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: union + objectSelector: + matchLabels: + inject-flyte-secrets: "true" + organization: 'union' + reinvocationPolicy: Never + rules: + - apiGroups: + - '*' + apiVersions: + - v1 + operations: + - CREATE + resources: + 
- pods + scope: '*' + sideEffects: NoneOnDryRun + timeoutSeconds: 30 +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Imported via https://github.com/knative/operator/releases/download/knative-v1.16.0/operator.yaml +# Copyright 2024 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +# Source: dataplane/templates/propeller/serviceaccount.yaml +--- +--- +# Source: dataplane/templates/serving/knative-serving.yaml +apiVersion: operator.knative.dev/v1beta1 +kind: KnativeServing +metadata: + name: union-operator-serving + namespace: union + labels: + helm.sh/chart: dataplane-2026.4.4 + app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: serving spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/name: flytepropeller + config: + deployment: + progress-deadline: "30m" + queue-sidecar-cpu-request: "25m" + queue-sidecar-cpu-limit: "1000m" + queue-sidecar-memory-request: "400Mi" + queue-sidecar-memory-limit: "800Mi" + queue-sidecar-ephemeral-storage-request: "512Mi" + queue-sidecar-ephemeral-storage-limit: "1024Mi" + registries-skipping-tag-resolving: managed.cr.union.ai + features: + kubernetes.podspec-affinity: "enabled" + kubernetes.podspec-nodeselector: "enabled" + kubernetes.podspec-tolerations: "enabled" + kubernetes.podspec-fieldref: "enabled" + 
kubernetes.podspec-dnspolicy: "enabled" + kubernetes.podspec-schedulername: "enabled" + kubernetes.podspec-securitycontext: "enabled" + network: + ingress-class: "kourier.ingress.networking.knative.dev" + high-availability: + replicas: 2 + ingress: + kourier: + enabled: true + bootstrap-configmap: "union-operator-serving-envoy-bootstrap" + service-type: ClusterIP + podDisruptionBudgets: + - name: 3scale-kourier-gateway-pdb + minAvailable: 50% + - name: activator-pdb + minAvailable: 50% + - name: webhook-pdb + minAvailable: 50% + registry: + override: + # TODO(jeev): Wire up Union fork of Envoy + 3scale-kourier-gateway/kourier-gateway: ghcr.io/unionai/envoy:456fed84d4ad9a9dfb186d117d9362e9dc0f7c1f + # TODO(jeev): Wire up Union fork of Kourier + net-kourier-controller/controller: ghcr.io/unionai/kourier@sha256:5804c348d15b3959604e3e3ceed216c3a1c7b32cbe254c7d3eb02a35e62ba9c4 + workloads: + - name: 3scale-kourier-gateway + labels: + helm.sh/chart: dataplane-2026.4.4 + app.kubernetes.io/name: release-name-dataplane app.kubernetes.io/instance: release-name - template: - metadata: - annotations: - configChecksum: "1802a3a59db84835bdd2fc604d1b6be50ee3e55489f40c43e896d4c300638ce" - - labels: - platform.union.ai/zone: "dataplane" - - - app.kubernetes.io/name: flytepropeller - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm - spec: - containers: - - command: - - flytepropeller - - --config - - /etc/flyte/config/*.yaml - - --propeller.cluster-id - - my-cluster - env: - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: GOMEMLIMIT - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.memory - - name: GOMAXPROCS - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.cpu - - name: CLUSTER_NAME - valueFrom: - secretKeyRef: - name: operator-cluster-name - key: cluster_name - - 
name: DEPLOYMENT_NAME - value: operator - - name: PROXY_SERVICE_URL - value: http://union-operator-proxy:8080 - - name: PROMETHEUS_SERVICE_URL - value: http://union-operator-prometheus:80 - - name: KNATIVE_PROXY_SERVICE_URL - value: http://kourier-internal - image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" - imagePullPolicy: "IfNotPresent" - name: flytepropeller - ports: - - containerPort: 10254 - resources: - limits: - cpu: "3" - memory: 3Gi - requests: - cpu: "1" - memory: 1Gi - volumeMounts: - - name: config-volume - mountPath: /etc/flyte/config - - name: auth - mountPath: /etc/union/secret - serviceAccountName: flytepropeller-system - volumes: - - configMap: - name: flyte-propeller-config - name: config-volume - - name: auth - secret: - secretName: union-secret-auth + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: 3scale-kourier-gateway + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchLabels: + app.kubernetes.io/name: 3scale-kourier-gateway + app.kubernetes.io/instance: release-name + topologyKey: topology.kubernetes.io/zone + annotations: + checksum/bootstrap-config: 6469f6f592eebc9e0c676d8ccc359a05bc799c0988e474b2da942c4cc328f656 + env: + - container: kourier-gateway + envVars: + - name: UNION_AUTHZ_TENANTAUTHURL + value: "https://union.us-west-2.union.ai/me" + - name: UNION_AUTHZ_TENANTAUTHSIGNINURL + value: "https://union.us-west-2.union.ai/login" + - name: UNION_AUTHZ_TENANTCONTROLPLANEURL + value: "https://union.us-west-2.union.ai" + resources: + - container: kourier-gateway + limits: + cpu: "2" + memory: 2Gi + requests: + cpu: "1" + memory: 1Gi + - name: activator + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - activator + topologyKey: 
topology.kubernetes.io/zone + - name: autoscaler + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - autoscaler + topologyKey: topology.kubernetes.io/zone + - name: autoscaler-hpa + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - autoscaler-hpa + topologyKey: topology.kubernetes.io/zone + - name: controller + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - controller + topologyKey: topology.kubernetes.io/zone + - name: net-kourier-controller + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - net-kourier-controller + topologyKey: topology.kubernetes.io/zone + env: + - container: controller + envVars: + - name: KOURIER_UNION_AUTHZ_ENABLED + value: "true" + resources: + - container: controller + limits: + cpu: "1" + memory: 1Gi + requests: + cpu: 500m + memory: 500Mi + - name: webhook + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - webhook + topologyKey: topology.kubernetes.io/zone +--- +# Source: dataplane/templates/common/task-podtemplate.yaml +apiVersion: v1 +kind: PodTemplate +metadata: + name: task-template + namespace: union +template: + spec: + serviceAccountName: union + containers: + - name: default + image: docker.io/rwgrim/docker-noop +--- +# Source: dataplane/charts/fluentbit/templates/tests/test-connection.yaml +apiVersion: v1 +kind: Pod 
+metadata: + name: "release-name-fluentbit-test-connection" + namespace: union + labels: + helm.sh/chart: fluentbit-0.48.9 + app.kubernetes.io/version: "3.2.8" + app.kubernetes.io/managed-by: Helm + annotations: + helm.sh/hook: test + helm.sh/hook-delete-policy: hook-succeeded +spec: + containers: + - name: wget + image: "busybox:latest" + imagePullPolicy: Always + command: ["sh"] + args: ["-c", "sleep 5s && wget -O- release-name-fluentbit:2020"] + restartPolicy: Never diff --git a/tests/generated/dataplane.monitoring.yaml b/tests/generated/dataplane.monitoring.yaml index af4f28e9..a791a64d 100644 --- a/tests/generated/dataplane.monitoring.yaml +++ b/tests/generated/dataplane.monitoring.yaml @@ -1,68 +1,51 @@ --- -# Source: dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: flytesnacks-development ---- -# Source: dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: flytesnacks-staging ---- -# Source: dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: flytesnacks-production ---- -# Source: dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: union-health-monitoring-development ---- -# Source: dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: union-health-monitoring-staging ---- -# Source: dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: union-health-monitoring-production ---- -# Source: dataplane/charts/fluentbit/templates/serviceaccount.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + apiVersion: v1 kind: ServiceAccount metadata: - name: fluentbit-system - namespace: union - labels: - helm.sh/chart: fluentbit-0.48.9 - app.kubernetes.io/name: fluentbit - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "3.2.8" - app.kubernetes.io/managed-by: Helm ---- -# Source: dataplane/charts/kube-state-metrics/templates/serviceaccount.yaml + name: knative-operator + namespace: "union" + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ apiVersion: v1 kind: ServiceAccount -automountServiceAccountToken: true metadata: - labels: - helm.sh/chart: kube-state-metrics-5.30.1 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: kube-state-metrics - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" - name: release-name-kube-state-metrics - namespace: union + name: operator-webhook + namespace: "union" + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator --- # Source: dataplane/charts/monitoring/charts/grafana/templates/serviceaccount.yaml apiVersion: v1 @@ -151,92 +134,93 @@ metadata: heritage: "Helm" automountServiceAccountToken: true --- -# Source: dataplane/charts/opencost/templates/serviceaccount.yaml +# Source: dataplane/charts/prometheus/charts/kube-state-metrics/templates/serviceaccount.yaml apiVersion: v1 kind: ServiceAccount -metadata: - name: release-name-opencost - namespace: union - labels: - helm.sh/chart: opencost-1.42.0 - app.kubernetes.io/name: opencost - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "1.111.0" - app.kubernetes.io/part-of: opencost - app.kubernetes.io/managed-by: Helm automountServiceAccountToken: true ---- -# Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: union-clustersync-system - namespace: union ---- -# Source: dataplane/templates/imagebuilder/serviceaccount.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: union-imagebuilder ---- -# Source: dataplane/templates/nodeexecutor/serviceaccount.yaml -apiVersion: v1 -kind: ServiceAccount metadata: - name: executor + labels: + helm.sh/chart: kube-state-metrics-5.25.1 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: metrics + app.kubernetes.io/part-of: kube-state-metrics + app.kubernetes.io/name: kube-state-metrics + 
app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2.13.0" + name: release-name-kube-state-metrics namespace: union - labels: - app: executor --- -# Source: dataplane/templates/operator/serviceaccount-proxy.yaml +# Source: dataplane/charts/prometheus/templates/serviceaccount.yaml apiVersion: v1 kind: ServiceAccount metadata: - name: proxy-system labels: - app.kubernetes.io/name: operator-proxy + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: prometheus + name: union-operator-prometheus + namespace: union + annotations: + {} --- -# Source: dataplane/templates/operator/serviceaccount.yaml +# Source: dataplane/templates/common/system-serviceaccount.yaml apiVersion: v1 kind: ServiceAccount metadata: - name: operator-system - labels: - app.kubernetes.io/name: union-operator - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm + name: union-system + namespace: union + annotations: --- -# Source: dataplane/templates/prometheus/serviceaccount.yaml +# Source: dataplane/templates/common/union-serviceaccount.yaml apiVersion: v1 kind: ServiceAccount metadata: - name: union-operator-prometheus + name: union namespace: union - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus + annotations: + eks.amazonaws.com/role-arn: arn:aws:iam::ACCOUNT_ID:role/flyte_project_role +automountServiceAccountToken: true --- -# Source: dataplane/templates/propeller/serviceaccount-webhook.yaml +# Source: 
dataplane/templates/flyteconnector/serviceaccount.yaml apiVersion: v1 kind: ServiceAccount metadata: - name: flytepropeller-webhook-system + name: flyteconnector namespace: union + labels: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + app.kubernetes.io/managed-by: Helm --- -# Source: dataplane/templates/propeller/serviceaccount.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + apiVersion: v1 -kind: ServiceAccount +kind: Secret metadata: - name: flytepropeller-system - namespace: union + name: operator-webhook-certs + namespace: "union" + labels: + app.kubernetes.io/component: operator-webhook + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +# The data is populated at install time. 
--- # Source: dataplane/charts/monitoring/charts/grafana/templates/secret.yaml apiVersion: v1 @@ -266,14 +250,72 @@ type: Opaque data: cluster_name: --- -# Source: dataplane/templates/propeller/deployment-webhook.yaml -# Create an empty secret that the first propeller pod will populate +# Source: dataplane/templates/webhook/mutatingwebhookconfiguration.yaml apiVersion: v1 kind: Secret metadata: - name: flyte-pod-webhook + name: union-pod-webhook namespace: union + labels: + app.kubernetes.io/name: union-pod-webhook + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm type: Opaque +data: + ca.crt: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUROekNDQWgrZ0F3SUJBZ0lVSm9kQ09lem4vd1BHQTZjYkw1U3duSlpYT2hJd0RRWUpLb1pJaHZjTkFRRUwKQlFBd0t6RXBNQ2NHQTFVRUF3d2dabXg1ZEdWd2NtOXdaV3hzWlhJdGQyVmlhRzl2YXk1MWJtbHZiaTV6ZG1NdwpIaGNOTWpZd01USXpNREl6TmpVMVdoY05Nell3TVRJeE1ESXpOalUxV2pBck1Ta3dKd1lEVlFRRERDQm1iSGwwClpYQnliM0JsYkd4bGNpMTNaV0pvYjI5ckxuVnVhVzl1TG5OMll6Q0NBU0l3RFFZSktvWklodmNOQVFFQkJRQUQKZ2dFUEFEQ0NBUW9DZ2dFQkFJcjBSWDRGRjFNVTNYdFBmUHErMDUrMGI1Qm0xK3hsTVR5ZUp4bG5mWFJuRlgyegpPcHV6NmsreUpBQk4xRkxmNDVYRnlJclFlaW5DWWJtckZXalFRc3ZDWEloNkpPNGMwdmRMVVU5RERYcldWRFlsCkRNZytTUDlJVnZBMm5USkVFZ2RFREVsKzNWeThUbUJoNlNmcGthcXdHajY5SmhIdy9OeE9yZzM2bUY3TU5VZ1MKWXJVUkxqUFFzTUN6NElTVzhhQnEzS2IwRUFWUTA4V1lZQ0ZEbjY1aHBzVFdxR05rL3BKRnVTYjlsZll6RVdyYwpJeW5wNDhqTnIvNUdKS3NnWG9LeTUwZVo4UkppeTVyOUhENGtnSW15TVNQR2RDMXFZSVppRHRlZW8yOElxNk5nCmd1dWcyTGhlNndYYW9YZXFCQ2d3SHpwWFVXM1hJVzI5eFoxUkFRRUNBd0VBQWFOVE1GRXdIUVlEVlIwT0JCWUUKRktITDQ1RWhQaVg5bnlRN241QUk0YUZVNFlCL01COEdBMVVkSXdRWU1CYUFGS0hMNDVFaFBpWDlueVE3bjVBSQo0YUZVNFlCL01BOEdBMVVkRXdFQi93UUZNQU1CQWY4d0RRWUpLb1pJaHZjTkFRRUxCUUFEZ2dFQkFEeTRac21XCkNjZ3B5TmI4cU1ydzNYclBNejZLMTRlTVhqQVk1bERFYXM1NklLZExoZFM1bi9sWk95bENuUUtOZ3RHS0VrY3UKT05tMEE2Z0xNT3UzcE43dG9aNXh0WktOcnUvS0x6dm44QUNNNndCbUR0bG5oRERDa3ZUTHBVRzh6UTZSNmlaegpzTWlhVmpLSUJiWkJsZXNYSHZZQnQ2YlRTN2o1QWM2aTdYc1RPK3ZqQ2RUQjVYdCtEZ0x2Q
nBJRWh2ZGJzNXhmClFtc3ZzQmwxenJnR3JXemo4d0txcXNUMzZaOVU1TzdSK1RJMlQrUDVOR25SeHNQOGJ0R0RtMGZ5SW55WlJrSkcKVTdpVFJyMlZRWlRyaTNZQ2dlVjN1UjFtMnJjc2hyU1Avb082T0QyTXR5Q1FMRWtFK1pxVUVJVnpSYmVaNXl2RQptM29YMFV2OUtBUy9pWTA9Ci0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K + tls.crt: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUQxakNDQXI2Z0F3SUJBZ0lVREtRN3A4MFpIdkQyUE5Ya3p3VGRLem9iUjg4d0RRWUpLb1pJaHZjTkFRRUwKQlFBd0t6RXBNQ2NHQTFVRUF3d2dabXg1ZEdWd2NtOXdaV3hzWlhJdGQyVmlhRzl2YXk1MWJtbHZiaTV6ZG1NdwpIaGNOTWpZd01USXpNREl6TmpVMVdoY05NamN3TVRJek1ESXpOalUxV2pBck1Ta3dKd1lEVlFRRERDQm1iSGwwClpYQnliM0JsYkd4bGNpMTNaV0pvYjI5ckxuVnVhVzl1TG5OMll6Q0NBU0l3RFFZSktvWklodmNOQVFFQkJRQUQKZ2dFUEFEQ0NBUW9DZ2dFQkFMNnMxUTRaZ3ZxVVdNdFp5VC9EKytEQWl6WmR0Q1lkVzArSGk4S3phQ1Vjbi9zZwo3a21NTEhDM2ZlZWtZK2ZUNENHUTd6K1l2WjRWWSt6WGljKzdrNndLbUZqdm1acEM1Rm96OWo1Q21NRk5HZmlWCnRobzVvSU9MUkIzNXRYMU1ROHUwNFVzT29QcjdMK016eEdxWXVmTFdtN1NVdjh0eEp3ZFMxZVlzMllNV1pMeU4KekI4akdWY01Qc2ZNVkdxdFhTTVljQ3hNYURKaTM1N0VEZSs3clM2U2ZtbnFtYjJxekdQM1BRRStCUm91Yk5nLwo4NCt4c253VXBLZUFpem95dFM2c2ZlOTdMUEtRTW9oVzd2TjZORFJrLy9xSmpHem9iZmZwUHU3endhWHVXMWtMCmllc2g2cVRrdlZLSTF3WFJ2WHArVU5GbjZzdDIwdEVlRks0ZzMwOENBd0VBQWFPQjhUQ0I3akFKQmdOVkhSTUUKQWpBQU1Bc0dBMVVkRHdRRUF3SUY0RENCa3dZRFZSMFJCSUdMTUlHSWdoWm1iSGwwWlhCeWIzQmxiR3hsY2kxMwpaV0pvYjI5cmdoeG1iSGwwWlhCeWIzQmxiR3hsY2kxM1pXSm9iMjlyTG5WdWFXOXVnaUJtYkhsMFpYQnliM0JsCmJHeGxjaTEzWldKb2IyOXJMblZ1YVc5dUxuTjJZNEl1Wm14NWRHVndjbTl3Wld4c1pYSXRkMlZpYUc5dmF5NTEKYm1sdmJpNXpkbU11WTJ4MWMzUmxjaTVzYjJOaGJEQWRCZ05WSFE0RUZnUVV1d25wcDZ1MTNKL0NsNnI3eDZ1Sgo5YmZ1RDhzd0h3WURWUjBqQkJnd0ZvQVVvY3Zqa1NFK0pmMmZKRHVma0FqaG9WVGhnSDh3RFFZSktvWklodmNOCkFRRUxCUUFEZ2dFQkFFMmlOa1MzczhGT3Z0cDhrVmJNZUN2amxIQytvYTVlZTFGNmowNlZ3bFJPOWdxOXJla1UKMGptWUhNWGNjWjcrZXJkNFRQRXgwK3ZqMzNhRTV3cDFmdVJJL2xIZHhuOUpJRHg2bTFQTHdVa3p1WG9EekYxLwozVXVIWU1xOGkrT2xsY0tqbmdVMUFUNmRWRCtIbFVWeHpMdjlNazdPb05SQUY5dTJHa1hiUHdCN0I1ZGRxdWxlCmtNcXBxbDhES2s4SVNVelVsTjNBb2Q5NmJUelIyV3RCVkxpTi9zRHZtTDZwZThFYm55RnBUb1BZTDlGeWZOcmIKT1ZPZEJwUkZFTG1KOHZaZWY5d1lxRmVXMTE3SGV5VDArZkxGZVNSemZUN
VVWUlVraVFBcUVJdUhHUVRrOVFabgo1WnYvNUF0eEg3WE1rcXR0VE1zMFlSUG0xYjQzY3FBeFdhND0KLS0tLS1FTkQgQ0VSVElGSUNBVEUtLS0tLQo= + tls.key: LS0tLS1CRUdJTiBQUklWQVRFIEtFWS0tLS0tCk1JSUV2UUlCQURBTkJna3Foa2lHOXcwQkFRRUZBQVNDQktjd2dnU2pBZ0VBQW9JQkFRQytyTlVPR1lMNmxGakwKV2NrL3cvdmd3SXMyWGJRbUhWdFBoNHZDczJnbEhKLzdJTzVKakN4d3QzM25wR1BuMCtBaGtPOC9tTDJlRldQcwoxNG5QdTVPc0NwaFk3NW1hUXVSYU0vWStRcGpCVFJuNGxiWWFPYUNEaTBRZCtiVjlURVBMdE9GTERxRDYreS9qCk04UnFtTG55MXB1MGxML0xjU2NIVXRYbUxObURGbVM4amN3Zkl4bFhERDdIekZScXJWMGpHSEFzVEdneVl0K2UKeEEzdnU2MHVrbjVwNnBtOXFzeGo5ejBCUGdVYUxtellQL09Qc2JKOEZLU25nSXM2TXJVdXJIM3ZleXp5a0RLSQpWdTd6ZWpRMFpQLzZpWXhzNkczMzZUN3U4OEdsN2x0WkM0bnJJZXFrNUwxU2lOY0YwYjE2ZmxEUlorckxkdExSCkhoU3VJTjlQQWdNQkFBRUNnZ0VBRjV2SnMzbTRMSE9DdlFic2NwOC9DQmgvQkNlOC92MGlpYW5VUmJLMkFlWlYKN3Arb3NXV1FPUktYSGIvT2VPMVVjb09DQkFOUzh3aGQrM3pDZlB5U0w3cU9HM0RyT1Z6djdqVkIxM3FpZEVpcQpId3ZXWk0vZXpuckhYOWpEdm5SYmJwVUNVaXRKQmxwa2x3S1pYc056UHB5UTRkNkxFdEw3VEo1V2lxM2g3cWQzCjhncmJoeWlWNDhZRFAyYkhpZGt0S2QvMHBNL3piSzZPTS9iMFpLUmRBWVpHQXlOcklMaFJ3eDNsQXE2SDVqL20KUHBuNlZEdVQ3Y1h6UFhmNTgxR0IyNjQxWFJVTXRNWGVEREtaU3JudnIrbG5tV2lsYXZmNkg3ZHp4NlRBMmk0RApybCtIRVdUN1RSQUZGMnZ5SVUrYSt5RzNPVFBKRCtmVTdCWWh4Zld4aFFLQmdRRGVvcXVCbzQxK1dKU255blZnCmNWMHZoUEkwZ0dyRGQzNFdIbCtGNTZuSmxuMVVlbUQ2SUQ0TUhnNVpHWmNmd3NkRlB0aHl1VnB0OVl1SW11SGQKRHE5ZDhpWnpHUzJVbStkSWlsWjhnZXJ5Q3dON3hIbFM3SzJrc0RmRGJwYmZZSXZ1LzRnSExZRHh3YzQ0OG54ZwpWNGpCUFY2QTdYTHRqQzZzdFE2OW9uTlIwd0tCZ1FEYlFBUTRGdWpnUjJUOXh2Y01MdXhMbm1YaHE4U2NoZlM5CkRoWndDUzZIeHB1YlBxblBjQ04vQUZjbDZ1cDRXVEVZdVY4bEpwcGwwRmpYQ2lMdkhuUU15cm5vRjlUOThpanQKZGVHYjF6ZWlwRUpndFpNVHkvOGdERWorZU1wSFBuYTZwQVpMOVpRVUdLR2dDNGQ0SzljdVNHMmRlb1dSU0ViSQpMVGgzeGpXVEZRS0JnQStJRWFidG5nVmVjS0IwQTFSREZGa29VUzFRZUNKQ3g4MExPV2JDRHBvOW9XaXZVT3lpCkt6SDFOdE1JY2Y2SlBCV2NtTVVJSVVMaWltVnhTS2gvU2NTb0MvNmpsd1p6Q2VPSm94YjBpVXR4Y1VERktDR2MKMlZCUDZ0UDdkeE1HVFR6VEhzNUJZbWw3TjhQSlJ0d0J2MHliMTJmdktNRmhzaS9pUWJFQkVFSjVBb0dBUm9NSApHRmJkM0V0NXdsZzcyYkk1a25SRnhkY3RLejIxb1J1bndhNWlSWTV0T3Zkak8zQ1FLZWNkSC9lMklyQmtwdFB2Ck1vNkF0MS9UUW8xa
kFNNGxlbnUwWUYxUnhiNGN4WW5VM2Y3UVNNRlZDNjg1dHZNemdNWVNyNngvT1h4d1NNUTUKdGpJcnhtN0poQ1JSRkNmZFUyZzl3SmpIM2hxRmtSbGlBTHRCUGFVQ2dZRUF2QkRHejQvYS9kRzRJT0VpREk4SgpBbkxaRWZaTFJrdnI0ejRyT0p0TGZtWlFjTW5DSUZmKzBaelhTME9XOWhvMjk5RUNtS1l4U2VUWk1WSDgzOFZCCnRvSkNGM3lzb01rSC9meEVXTUpicDlMblJwSFUzUElicHlhRG1nWW1YdmsvK3laL3Y1c0dDTWNtVGkzWVFLRGsKcStlK1o0SHZyaGUxVytUaHpwOCtoNGs9Ci0tLS0tRU5EIFBSSVZBVEUgS0VZLS0tLS0K +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2019 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v1 +kind: ConfigMap +metadata: + name: config-logging + namespace: "union" + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +data: {} +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2019 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: v1 +kind: ConfigMap +metadata: + name: config-observability + namespace: "union" + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +data: {} --- # Source: dataplane/charts/monitoring/charts/grafana/templates/configmap-dashboard-provider.yaml apiVersion: v1 @@ -990,142 +1032,466 @@ data: workload-total.json: |- {"editable":true,"links":[{"asDropdown":true,"includeVars":true,"keepTime":true,"tags":["kubernetes-mixin"],"targetBlank":false,"title":"Kubernetes","type":"dashboards"}],"panels":[{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"color":{"fixedColor":"green","mode":"fixed"},"unit":"bps"}},"gridPos":{"h":9,"w":12,"x":0,"y":0},"id":1,"interval":"1m","options":{"displayMode":"basic","showUnfilled":false},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sort_desc(sum(rate(container_network_receive_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\",namespace=~\"$namespace\"}[$__rate_interval])\n* on (cluster, namespace, pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\"$workload\", workload_type=~\"$type\"}) by (pod))\n","legendFormat":"__auto"}],"title":"Current Rate of Bytes Received","type":"bargauge"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"color":{"fixedColor":"green","mode":"fixed"},"unit":"bps"}},"gridPos":{"h":9,"w":12,"x":12,"y":0},"id":2,"interval":"1m","options":{"displayMode":"basic","showUnfilled":false},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sort_desc(sum(rate(container_network_transmit_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\",namespace=~\"$namespace\"}[$__rate_interval])\n* on (cluster, namespace, 
pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\"$workload\", workload_type=~\"$type\"}) by (pod))\n","legendFormat":"__auto"}],"title":"Current Rate of Bytes Transmitted","type":"bargauge"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"color":{"fixedColor":"green","mode":"fixed"},"unit":"bps"}},"gridPos":{"h":9,"w":12,"x":0,"y":9},"id":3,"interval":"1m","options":{"displayMode":"basic","showUnfilled":false},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sort_desc(avg(rate(container_network_receive_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\",namespace=~\"$namespace\"}[$__rate_interval])\n* on (cluster, namespace, pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\"$workload\", workload_type=~\"$type\"}) by (pod))\n","legendFormat":"__auto"}],"title":"Average Rate of Bytes Received","type":"bargauge"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"color":{"fixedColor":"green","mode":"fixed"},"unit":"bps"}},"gridPos":{"h":9,"w":12,"x":12,"y":9},"id":4,"interval":"1m","options":{"displayMode":"basic","showUnfilled":false},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sort_desc(avg(rate(container_network_transmit_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\",namespace=~\"$namespace\"}[$__rate_interval])\n* on (cluster, namespace, pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\"$workload\", workload_type=~\"$type\"}) by (pod))\n","legendFormat":"__auto"}],"title":"Average Rate of Bytes 
Transmitted","type":"bargauge"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"bps"}},"gridPos":{"h":9,"w":12,"x":0,"y":18},"id":5,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sort_desc(sum(rate(container_network_receive_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\",namespace=~\"$namespace\"}[$__rate_interval])\n* on (cluster, namespace, pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\"$workload\", workload_type=~\"$type\"}) by (pod))\n","legendFormat":"__auto"}],"title":"Receive Bandwidth","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"bps"}},"gridPos":{"h":9,"w":12,"x":12,"y":18},"id":6,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sort_desc(sum(rate(container_network_transmit_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\",namespace=~\"$namespace\"}[$__rate_interval])\n* on (cluster, namespace, pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\"$workload\", workload_type=~\"$type\"}) by (pod))\n","legendFormat":"__auto"}],"title":"Transmit Bandwidth","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed 
--"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"pps"}},"gridPos":{"h":9,"w":12,"x":0,"y":27},"id":7,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sort_desc(sum(rate(container_network_receive_packets_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\",namespace=~\"$namespace\"}[$__rate_interval])\n* on (cluster, namespace, pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\"$workload\", workload_type=~\"$type\"}) by (pod))\n","legendFormat":"__auto"}],"title":"Rate of Received Packets","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"pps"}},"gridPos":{"h":9,"w":12,"x":12,"y":27},"id":8,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sort_desc(sum(rate(container_network_transmit_packets_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\",namespace=~\"$namespace\"}[$__rate_interval])\n* on (cluster, namespace, pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\"$workload\", workload_type=~\"$type\"}) by (pod))\n","legendFormat":"__auto"}],"title":"Rate of Transmitted Packets","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed 
--"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"pps"}},"gridPos":{"h":9,"w":12,"x":0,"y":36},"id":9,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sort_desc(sum(rate(container_network_receive_packets_dropped_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\",namespace=~\"$namespace\"}[$__rate_interval])\n* on (cluster, namespace, pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\"$workload\", workload_type=~\"$type\"}) by (pod))\n","legendFormat":"__auto"}],"title":"Rate of Received Packets Dropped","type":"timeseries"},{"datasource":{"type":"datasource","uid":"-- Mixed --"},"fieldConfig":{"defaults":{"custom":{"fillOpacity":10,"showPoints":"never","spanNulls":true},"unit":"pps"}},"gridPos":{"h":9,"w":12,"x":12,"y":36},"id":10,"interval":"1m","options":{"legend":{"asTable":true,"calcs":["lastNotNull"],"displayMode":"table","placement":"right","showLegend":true},"tooltip":{"mode":"single"}},"pluginVersion":"v11.4.0","targets":[{"datasource":{"type":"prometheus","uid":"${datasource}"},"expr":"sort_desc(sum(rate(container_network_transmit_packets_dropped_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=\"$cluster\",namespace=~\"$namespace\"}[$__rate_interval])\n* on (cluster, namespace, pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=~\"$namespace\", workload=~\"$workload\", workload_type=~\"$type\"}) by (pod))\n","legendFormat":"__auto"}],"title":"Rate of Transmitted Packets 
Dropped","type":"timeseries"}],"refresh":"10s","schemaVersion":39,"tags":["kubernetes-mixin"],"templating":{"list":[{"current":{"selected":true,"text":"default","value":"default"},"hide":0,"label":"Data source","name":"datasource","query":"prometheus","regex":"","type":"datasource"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"hide":2,"label":"cluster","name":"cluster","query":"label_values(kube_pod_info{job=\"kube-state-metrics\"}, cluster)","refresh":2,"sort":1,"type":"query","allValue":".*"},{"allValue":".+","current":{"selected":false,"text":"kube-system","value":"kube-system"},"datasource":{"type":"prometheus","uid":"${datasource}"},"hide":0,"includeAll":true,"label":"namespace","name":"namespace","query":"label_values(container_network_receive_packets_total{cluster=\"$cluster\"}, namespace)","refresh":2,"sort":1,"type":"query"},{"datasource":{"type":"prometheus","uid":"${datasource}"},"hide":0,"label":"workload","name":"workload","query":"label_values(namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\"}, workload)","refresh":2,"sort":1,"type":"query"},{"allValue":".+","datasource":{"type":"prometheus","uid":"${datasource}"},"hide":0,"includeAll":true,"label":"workload_type","name":"type","query":"label_values(namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\"}, workload_type)","refresh":2,"sort":1,"type":"query"}]},"time":{"from":"now-1h","to":"now"},"timezone": "utc","title":"Kubernetes / Networking / Workload","uid":"728bf77cc1166d2f3133bf25846876cc"} --- -# Source: dataplane/templates/clusterresourcesync/configmap.yaml +# Source: dataplane/charts/prometheus/templates/cm.yaml apiVersion: v1 kind: ConfigMap metadata: - name: union-clusterresourcesync-config - namespace: union labels: - app.kubernetes.io/name: clusterresourcesync + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus 
app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 app.kubernetes.io/managed-by: Helm -data: - cluster_resources.yaml: | - cluster_resources: - clusterName: '' - customData: - - production: - - projectQuotaCpu: - value: "4096" - - projectQuotaMemory: - value: 2Ti - - projectQuotaNvidiaGpu: - value: "256" - - defaultUserRoleKey: - value: 'eks.amazonaws.com/role-arn' - - defaultUserRoleValue: - value: 'arn:aws:iam::ACCOUNT_ID:role/flyte_project_role' - - staging: - - projectQuotaCpu: - value: "4096" - - projectQuotaMemory: - value: 2Ti - - projectQuotaNvidiaGpu: - value: "256" - - defaultUserRoleKey: - value: 'eks.amazonaws.com/role-arn' - - defaultUserRoleValue: - value: 'arn:aws:iam::ACCOUNT_ID:role/flyte_project_role' - - development: - - projectQuotaCpu: - value: "4096" - - projectQuotaMemory: - value: 2Ti - - projectQuotaNvidiaGpu: - value: "256" - - defaultUserRoleKey: - value: 'eks.amazonaws.com/role-arn' - - defaultUserRoleValue: - value: 'arn:aws:iam::ACCOUNT_ID:role/flyte_project_role' - refreshInterval: 5m - standaloneDeployment: true - templatePath: /etc/flyte/clusterresource/templates - clusterResourcesPrivate: - app: - isServerless: false - union: - auth: - authorizationMetadataKey: flyte-authorization - clientId: 'test' - clientSecretLocation: /etc/union/secret/client_secret - tokenRefreshWindow: 5m - type: ClientSecret - connection: - host: dns:/// - admin.yaml: | - admin: - clientId: 'test' - clientSecretLocation: /etc/union/secret/client_secret - endpoint: dns:/// - insecure: false - event: - capacity: 1000 - rate: 500 - type: admin - domain.yaml: | - domains: - - id: development - name: development - - id: staging - name: staging - - id: production - name: production - clusters.yaml: | - clusters: - clusterConfigs: [] - labelClusterMap: {} - logger.yaml: | - logger: - formatter: - type: json - level: 4 - show-source: true ---- -# Source: 
dataplane/templates/clusterresourcesync/configmap.yaml -apiVersion: v1 -kind: ConfigMap -metadata: - name: union-clusterresource-template + app.kubernetes.io/part-of: prometheus + name: union-operator-prometheus namespace: union - labels: - app.kubernetes.io/name: clusterresourcesync - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm data: - a_namespace.yaml: | - apiVersion: v1 - kind: Namespace - metadata: - name: {{ namespace }} - labels: - union.ai/namespace-type: flyte - spec: - finalizers: - - kubernetes - - b_default_service_account.yaml: | - apiVersion: v1 - kind: ServiceAccount - metadata: - name: default - namespace: {{ namespace }} - annotations: - {{ defaultUserRoleKey }}: {{ defaultUserRoleValue }} + allow-snippet-annotations: "false" + alerting_rules.yml: | + {} + alerts: | + {} + prometheus.yml: | + global: + evaluation_interval: 1m + scrape_interval: 1m + scrape_timeout: 10s + rule_files: + - /etc/config/recording_rules.yml + - /etc/config/alerting_rules.yml + - /etc/config/rules + - /etc/config/alerts + scrape_configs: + - job_name: prometheus + static_configs: + - targets: + - localhost:9090 + - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + job_name: kubernetes-apiservers + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - action: keep + regex: default;kubernetes;https + source_labels: + - __meta_kubernetes_namespace + - __meta_kubernetes_service_name + - __meta_kubernetes_endpoint_port_name + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + job_name: kubernetes-nodes + kubernetes_sd_configs: + - role: node + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - replacement: kubernetes.default.svc:443 + target_label: __address__ + - regex: (.+) + 
replacement: /api/v1/nodes/$1/proxy/metrics + source_labels: + - __meta_kubernetes_node_name + target_label: __metrics_path__ + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + job_name: kubernetes-nodes-cadvisor + kubernetes_sd_configs: + - role: node + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - replacement: kubernetes.default.svc:443 + target_label: __address__ + - regex: (.+) + replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor + source_labels: + - __meta_kubernetes_node_name + target_label: __metrics_path__ + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + - honor_labels: true + job_name: kubernetes-service-endpoints + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scrape + - action: drop + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) 
+ source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (.+?)(?::\d+)?;(\d+) + replacement: $1:$2 + source_labels: + - __address__ + - __meta_kubernetes_service_annotation_prometheus_io_port + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_service_name + target_label: service + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: node + - honor_labels: true + job_name: kubernetes-service-endpoints-slow + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) 
+ source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (.+?)(?::\d+)?;(\d+) + replacement: $1:$2 + source_labels: + - __address__ + - __meta_kubernetes_service_annotation_prometheus_io_port + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_service_name + target_label: service + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: node + scrape_interval: 5m + scrape_timeout: 30s + - honor_labels: true + job_name: prometheus-pushgateway + kubernetes_sd_configs: + - role: service + relabel_configs: + - action: keep + regex: pushgateway + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_probe + - honor_labels: true + job_name: kubernetes-services + kubernetes_sd_configs: + - role: service + metrics_path: /probe + params: + module: + - http_2xx + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_probe + - source_labels: + - __address__ + target_label: __param_target + - replacement: blackbox + target_label: __address__ + - source_labels: + - __param_target + target_label: instance + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - source_labels: + - __meta_kubernetes_service_name + target_label: service + - honor_labels: true + job_name: kubernetes-pods + kubernetes_sd_configs: + - role: pod + relabel_configs: + - action: 
keep + regex: true + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scrape + - action: drop + regex: true + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) + replacement: '[$2]:$1' + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: replace + regex: (\d+);((([0-9]+?)(\.|$)){4}) + replacement: $2:$1 + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_pod_name + target_label: pod + - action: drop + regex: Pending|Succeeded|Failed|Completed + source_labels: + - __meta_kubernetes_pod_phase + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: node + - honor_labels: true + job_name: kubernetes-pods-slow + kubernetes_sd_configs: + - role: pod + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) 
+ source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) + replacement: '[$2]:$1' + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: replace + regex: (\d+);((([0-9]+?)(\.|$)){4}) + replacement: $2:$1 + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_pod_name + target_label: pod + - action: drop + regex: Pending|Succeeded|Failed|Completed + source_labels: + - __meta_kubernetes_pod_phase + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: node + scrape_interval: 5m + scrape_timeout: 30s + - job_name: 'kube-state-metrics' + metrics_path: /metrics + scrape_interval: 1m + honor_labels: true + kubernetes_sd_configs: + - role: endpoints + namespaces: + own_namespace: true + relabel_configs: + - source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_name] + regex: kube-state-metrics + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + regex: http + action: keep + metric_relabel_configs: + - source_labels: [__name__] + regex: 
"kube_pod_container_resource_(limits|requests)|kube_pod_status_phase|kube_node_(labels|status_allocatable|status_condition|status_capacity)|kube_namespace_labels|kube_pod_container_status_(waiting|terminated|last_terminated).*_reason|kube_daemonset_status_number_unavailable|kube_deployment_status_replicas_unavailable|kube_resourcequota|kube_pod_info|kube_node_info|kube_pod_container_status_restarts_total" + action: keep + - source_labels: [__name__, phase] + separator: ";" + regex: "kube_pod_status_phase;(Succeeded|Failed)" + action: drop + - source_labels: [node] + target_label: nodename + regex: "(.*)" + action: replace + - source_labels: [label_node_group_name] + action: replace + regex: "(.+)" + target_label: label_node_pool_name + - job_name: 'opencost' + metrics_path: /metrics + scrape_interval: 1m + honor_labels: true + kubernetes_sd_configs: + - role: endpoints + namespaces: + own_namespace: true + relabel_configs: + - source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_name] + regex: opencost + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + regex: http + action: keep + metric_relabel_configs: + - source_labels: [__name__] + regex: "kube_node_labels|kube_pod_labels|node_total_hourly_cost|node_ram_hourly_cost|node_cpu_hourly_cost|node_gpu_hourly_cost" + action: keep + - job_name: 'union-services' + metrics_path: /metrics + scrape_interval: 1m + honor_labels: true + kubernetes_sd_configs: + - role: endpoints + namespaces: + own_namespace: true + relabel_configs: + - source_labels: [__meta_kubernetes_service_label_platform_union_ai_service_group] + regex: .+ + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + regex: debug + action: keep + # Flyte propeller metrics for execution info and fast task duration. + # No-op when propeller is not deployed (no pods match the label selector). 
+ - job_name: 'flytepropeller' + kubernetes_sd_configs: + - role: pod + namespaces: + own_namespace: true + selectors: + - role: pod + label: app.kubernetes.io/name=flytepropeller + metric_relabel_configs: + - source_labels: [__name__] + regex: "flyte:propeller:all:round:execution_info|flyte:propeller:all:node:fast_task:fast_task_execution_duration" + action: keep + relabel_configs: + - source_labels: [__meta_kubernetes_pod_container_name] + regex: flytepropeller + action: keep + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: pod + # Kourier gateway (envoy) metrics for app serving. + # No-op when serving is not enabled (no pods match the label selector). + - job_name: 'serving-envoy' + kubernetes_sd_configs: + - role: pod + namespaces: + own_namespace: true + selectors: + - role: pod + label: app=3scale-kourier-gateway + metrics_path: /stats/prometheus + metric_relabel_configs: + - source_labels: [__name__] + regex: "envoy_cluster_upstream_rq_xx|envoy_cluster_upstream_rq_time_bucket" + action: keep + relabel_configs: + - source_labels: [__meta_kubernetes_pod_container_port_name] + regex: metrics + action: keep + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: pod + # DCGM GPU metrics. + # No-op when dcgm-exporter is not deployed (no pods match the label selector). 
+ - job_name: 'gpu-metrics' + kubernetes_sd_configs: + - role: pod + namespaces: + names: + - kube-system + selectors: + - role: pod + label: app.kubernetes.io/name=dcgm-exporter - c_project_resource_quota.yaml: | - apiVersion: v1 - kind: ResourceQuota - metadata: - name: project-quota - namespace: {{ namespace }} - spec: - hard: - limits.cpu: {{ projectQuotaCpu }} - limits.memory: {{ projectQuotaMemory }} - requests.nvidia.com/gpu: {{ projectQuotaNvidiaGpu }} + recording_rules.yml: | + {} + rules: | + {} --- # Source: dataplane/templates/fluent-bit/configmap.yaml apiVersion: v1 @@ -1173,6 +1539,40 @@ data: region us-east-1 bucket --- +# Source: dataplane/templates/imagebuilder/build-image-configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: build-image-config + labels: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +data: + storage.yaml: | + storage: + container: "" + type: stow + stow: + kind: s3 + config: + auth_type: accesskey + access_key_id: + secret_key: + disable_ssl: false + endpoint: + region: us-east-1 + enable-multicontainer: false + limits: + maxDownloadMBs: 1024 + cache: + max_size_mbs: 0 + target_gc_percent: 70 + buildkit-uri: "tcp://union-operator-buildkit.union.svc.cluster.local:1234" + default-repository: "union-dataplane" + authentication-type: "noop" +--- # Source: dataplane/templates/imagebuilder/configmap.yaml apiVersion: v1 kind: ConfigMap @@ -3329,14 +3729,26 @@ data: "version": 1 } --- -# Source: dataplane/templates/nodeexecutor/configmap.yaml +# Source: dataplane/templates/monitoring/prometheusrule.yaml apiVersion: v1 kind: ConfigMap metadata: - name: executor + name: union-recording-rules namespace: union labels: - app: executor + release: release-name +data: + recording_rules.yml: | + groups: [] +--- +# Source: dataplane/templates/nodeexecutor/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + 
name: executor + namespace: union + labels: + app: executor data: task_logs.yaml: | plugins: @@ -3383,8 +3795,13 @@ data: templateUris: - https://scale.neptune.ai/{{ .taskConfig.project }}/-/run/?customId={{ .taskConfig.id }} - kubernetes-enabled: true + kubernetes-enabled: false enabled_plugins.yaml: | + plugins: + connector-service: + defaultConnector: + defaultServiceConfig: '{"loadBalancingConfig": [{"round_robin":{}}]}' + endpoint: k8s:///flyteconnector.union:8000 tasks: task-plugins: default-for-task-types: @@ -3417,6 +3834,8 @@ data: cpu: 4096 gpu: 256 memory: 2Ti + namespace_mapping: + template: union union: connection: host: dns:/// @@ -3437,7 +3856,7 @@ data: cache-endpoint: dns:/// endpoint: dns:/// insecure: false - type: fallback + type: cacheservicev2 use-admin-auth: true logger: formatter: @@ -3457,6 +3876,7 @@ data: secure: false useAuth: false propeller: + limit-namespace: union node-config: disable-input-file-writes: true plugins: @@ -3464,7 +3884,7 @@ data: additional-worker-args: - --last-ack-grace-period-seconds - "120" - callback-uri: http://unionai-dataplane-executor.union.svc.cluster.local:15605 + callback-uri: http://union-operator-executor.union.svc.cluster.local:15605 grace-period-status-not-found: 2m ioutils: remoteFileOutputPaths: @@ -3474,6 +3894,7 @@ data: default-cpus: 100m default-env-vars: [] default-memory: 100Mi + default-pod-template-name: task-template co-pilot: image: 'cr.flyte.org/flyteorg/flytecopilot:v1.14.1' name: flyte-copilot- @@ -3514,6 +3935,7 @@ data: default-cpus: 100m default-env-vars: [] default-memory: 100Mi + default-pod-template-name: task-template config.yaml: | union: connection: @@ -3533,11 +3955,14 @@ data: operator: enabled: true enableTunnelService: true + # enableDepot: false tunnel: - enableDirectToAppIngress: false + enableDirectToAppIngress: true deploymentToRestart: union-operator-proxy + limitNamespace: union + disableClusterPermissions: true apps: - enabled: 'false' + enabled: 'true' 
syncClusterConfig: enabled: false clusterId: @@ -3552,19 +3977,19 @@ data: userRole: 'arn:aws:iam::ACCOUNT_ID:role/flyte_project_role' userRoleKey: 'eks.amazonaws.com/role-arn' collectUsages: - enabled: true + enabled: false billing: model: Legacy dependenciesHeartbeat: - prometheus: - endpoint: 'http://union-operator-prometheus:80/-/healthy' - propeller: - endpoint: 'http://flytepropeller:10254' + executor: + endpoint: 'http://union-operator-executor:10254' proxy: endpoint: 'http://union-operator-proxy:10254' secretsWatcher: dryRun: true enabled: false + org: + namespaceTemplate: union imageBuilder: enabled: true executionNamespaceLabels: @@ -3572,9 +3997,6 @@ data: referenceConfigmapName: union-operator targetConfigMapName: "build-image-config" proxy: - imageBuilderConfig: - authenticationType: 'noop' - defaultRepository: '' persistedLogs: objectStore: pathTemplate: namespace-{{.KubernetesNamespace}}.pod-{{.KubernetesPodName}}.cont-{{.KubernetesContainerName}} @@ -3628,989 +4050,149 @@ data: endpoint: region: us-east-1 image-builder.buildkit-uri: "tcp://union-operator-buildkit.union.svc.cluster.local:1234" - image-builder.default-repository: "" + image-builder.default-repository: "union-dataplane" image-builder.authentication-type: "noop" --- -# Source: dataplane/templates/prometheus/configmap.yaml +# Source: dataplane/templates/serving/bootstrap-configmap.yaml +# We need to copy the default configmap here since knative-operator does not automatically update the +# address of the `net-kourier-controller` endpoint to use the release namespace. It assumes that the +# resources are being installed into the `knative-serving` namespace instead. 
apiVersion: v1 kind: ConfigMap metadata: - name: union-operator-prometheus + name: union-operator-serving-envoy-bootstrap namespace: union - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus data: - prometheus.yml: | - global: - scrape_interval: 15s - evaluation_interval: 15s - alerting: - alertmanagers: - - static_configs: - - targets: - rule_files: - - rules.yml - scrape_configs: - # Self-monitoring - - job_name: prometheus - metrics_path: /prometheus/metrics - static_configs: - - targets: ['localhost:9090'] - metric_relabel_configs: - - source_labels: [__name__] - regex: prometheus_tsdb_head_chunks_storage_size_bytes|prometheus_tsdb_head_series|prometheus_tsdb_head_series_created_total|prometheus_tsdb_head_series_removed_total|prometheus_tsdb_head_samples_appended_total|prometheus_tsdb_storage_blocks_bytes|prometheus_tsdb_retention_limit_bytes|prometheus_tsdb_wal_(.*)|.+_(limit|failed|missed|failures)_total - action: keep - - # Kube state metrics for pod/node resource tracking and cost calculations - - job_name: kube-state-metrics - static_configs: - - targets: ['release-name-kube-state-metrics:8080'] - metric_relabel_configs: - - separator: ; - source_labels: [__name__] - regex: kube_pod_container_resource_(limits|requests)|kube_pod_status_phase|kube_node_(labels|status_allocatable|status_condition|status_capacity)|kube_namespace_labels|kube_pod_container_status_(waiting|terminated|last_terminated).*_reason|kube_daemonset_status_number_unavailable|kube_deployment_status_replicas_unavailable|kube_resourcequota|kube_pod_info|kube_node_info|kube_pod_container_status_restarts_total - action: keep - - separator: ; - source_labels: [__name__, phase] - regex: kube_pod_status_phase;(Succeeded|Failed) - action: drop - - source_labels: [node] - target_label: nodename - 
regex: '(.*)' - action: replace - - source_labels: [label_node_group_name] - action: replace - regex: (.+) - target_label: label_node_pool_name - - # cAdvisor container metrics for CPU and memory tracking - - job_name: kubernetes-cadvisor - metrics_path: /metrics - scheme: https - kubernetes_sd_configs: - - role: node - namespaces: - names: [] - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - insecure_skip_verify: false - metric_relabel_configs: - - separator: ; - source_labels: [__name__] - regex: container_cpu_usage_seconds_total|container_memory_working_set_bytes - action: keep - relabel_configs: - - separator: ; - regex: __meta_kubernetes_node_label_(.+) - replacement: $1 - action: labelmap - - separator: ; - regex: (.*) - target_label: __address__ - replacement: kubernetes.default.svc:443 - action: replace - - source_labels: [__meta_kubernetes_node_name] - separator: ; - regex: (.+) - target_label: __metrics_path__ - replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor - action: replace - - # Flyte propeller metrics for execution info and fast task duration - - job_name: flytepropeller - kubernetes_sd_configs: - - role: pod - namespaces: - names: - - union - selectors: - - role: pod - label: app.kubernetes.io/name=flytepropeller - metric_relabel_configs: - - source_labels: [__name__] - regex: "flyte:propeller:all:round:execution_info|flyte:propeller:all:node:fast_task:fast_task_execution_duration" - action: keep - relabel_configs: - - source_labels: [__meta_kubernetes_pod_container_name] - regex: flytepropeller - action: keep - - source_labels: [__meta_kubernetes_namespace] - action: replace - target_label: namespace - - source_labels: [__meta_kubernetes_pod_name] - action: replace - target_label: pod - # OpenCost metrics for cost tracking - - job_name: opencost - static_configs: - - targets: ['release-name-opencost:9003'] - metric_relabel_configs: - - 
source_labels: [__name__] - regex: "kube_node_labels|kube_pod_labels|node_total_hourly_cost|node_ram_hourly_cost|node_cpu_hourly_cost|node_gpu_hourly_cost" - action: keep - rules.yml: | - - groups: - - name: cost_calculations_15s - interval: 15s - rules: - - record: pod_gpu_allocation - expr: | - sum by (namespace, pod) (DCGM_FI_DEV_GPU_UTIL >= bool 0) * on (namespace, pod) group_left() (max by (namespace, pod) (kube_pod_status_phase{phase=~"Running|Pending"} == 1)) - - record: execution_info # A join metric to look up execution-level info. Used to disambiguate workflow/task executions, apps, and workspaces. - expr: | - max by (label_domain, label_project, label_entity_name, label_execution_id, label_entity_id)( - label_replace( - label_replace( - label_replace( - label_replace( - label_replace( - flyte:propeller:all:round:execution_info{domain!="", project!="", workflow_name!="", execution_id!=""}, # filter for workflow/task executions - "label_entity_id", "$1", "execution_id", "(.*)" # join key - ), "label_entity_name", "$1", "workflow_name", "(.*)" # set label_entity_name to the workflow/task name from the workflow_execution_id - ), - "label_execution_id", "$1", "execution_id", "(.*)" - ), - "label_project", "$1", "project", "(.*)" # project - ), - "label_domain", "$1", "domain", "(.*)" # domain - ) - ) - - record: app_info # A join metric to look up app-level info. Used to disambiguate workflow/task executions, apps, and workspaces. 
- expr: | - max by (label_domain, label_project, label_app_name, label_app_version, label_entity_id)( - label_replace( - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps - "label_app_name", "$1", "label_serving_unionai_dev_app_name", "(.*)" # rename to cleanup - ), - "label_app_version", "$1", "label_serving_knative_dev_revision", "(.*)" # the app_version is equivalent to an execution_id for workflows (lowest level of granularity) - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # join key - ) - ) - - record: workspace_info # A join metric to look up workspace info. Used to disambiguate workflow/task executions, apps, and workspaces. - expr: | - max by (label_domain, label_project, label_workspace_name, label_entity_id)( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # filter for workspaces - "label_entity_id", "$1", "label_node_id", "(.*)" # join key - ), "label_workspace_name", "$1", "label_node_id", "(.*)" # set label_workspace_name to the workspace name from the kube_pod_labels - ) - ) - - record: fast_task_execution_duration - expr: | - max by (label_domain, label_project, label_execution_id, label_entity_id, namespace, pod)( - label_replace( - label_replace( - label_replace( - label_replace( - label_replace( - label_replace( - flyte:propeller:all:node:fast_task:fast_task_execution_duration{domain!="", project!="", execution_id!="", exported_namespace!="", exported_pod!=""}, - "label_entity_id", "$1", "execution_id", "(.*)" # join key - ), - "label_execution_id", "$1", "execution_id", "(.*)" - ), - "label_project", "$1", "project", "(.*)" # project - ), - "label_domain", "$1", "domain", "(.*)" # domain - ), - "namespace", "$1", "exported_namespace", "(.*)" - ), - "pod", "$1", "exported_pod", 
"(.*)" - ) - ) - - record: fast_task_execution_duration_rate - expr: | - irate(fast_task_execution_duration[1m]) > 0 or absent(irate(fast_task_execution_duration[1m])) * fast_task_execution_duration - - record: entity_id:mem_usage_bytes_total_per_node:sum # Allocated memory (max(requested, consumed)) aggregated per node and entity, where entity is either a task/workflow execution or an app. - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( # aggregate up to entity - # First, calculate the allocated memory for each pod - max by (namespace, pod) ( # this is the case where consumed (the memory working set) exceeds requested memory - ( - sum by (namespace, pod) ( - container_memory_working_set_bytes{namespace!="",pod!="",image!=""} - ) - > sum by (namespace, pod) ( - kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="memory"} - ) - ) - or sum by (namespace, pod) ( # this is the case where memory requests are <= consumed memory - kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="memory"} # needed to add node!="" to dedupe - ) - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but we do not want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="",label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - # Now join in node identifiers which are used for subsequent overhead calculations - * on (namespace, pod) group_left(node) ( - max by (namespace, pod, node) (kube_pod_info{node!=""}) # needed to add node!="" to dedupe - ) - ) - - record: entity_id:cpu_usage_per_node:sum # Allocated cpu (max(requested, consumed)) aggregated per node and entity, where entity is either a task/workflow execution or an app. - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( - # First, calculate the allocated cpu for each pod - max by (namespace, pod) ( # this is the case where consumed (the cpu usage seconds total) exceeds requested cpu - ( - sum by (namespace, pod) ( - irate(container_cpu_usage_seconds_total{namespace!="",pod!="",image!=""}[5m]) - ) - > sum by (namespace, pod) ( - kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="cpu"} - ) - ) - or sum by (namespace, pod) ( # this is the case where cpu requests are <= consumed cpu - kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="cpu"} - ) - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - # Now join in node identifiers which are used for subsequent overhead calculations - * on (namespace, pod) group_left(node) ( - max by (namespace, pod, node) (kube_pod_info{node!=""}) # needed to add node!="" to dedupe - ) - ) - - record: entity_id:gpu_usage_per_node:sum # Allocated gpu aggregated per node and entity, where entity is either a task/workflow execution or an app. - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( - # First, grab the allocated gpu for each pod (which is always either 1 or zero, since k8s can't split gpus the way it can with cpu/memory) - max by (namespace, pod) ( - pod_gpu_allocation - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - # Now join in node identifiers which are used for subsequent overhead calculations - * on (namespace, pod) group_left(node) ( - max by (namespace, pod, node) (kube_pod_info{node!=""}) # needed to add node!="" to dedupe - ) - ) - - record: entity_id:used_mem_bytes:sum # the sum of used memory across all containers in an entity (numerator for aggregate utilization calculations) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # aggregate up to entity - # First, calculate the used memory for each pod - sum by (namespace, pod) ( - container_memory_working_set_bytes{namespace!="",pod!="",image!=""} - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but we do not want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - ) - - record: entity_id:allocated_mem_bytes:sum # the sum of allocated memory across all containers in an entity (denominator for aggregate utilization calculations) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # aggregate up to entity (remove node) - entity_id:mem_usage_bytes_total_per_node:sum - ) - - record: entity_id:used_cpu:sum # the sum of used cpu across all containers in an entity (numerator for aggregate utilization calculations) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( - sum by (namespace, pod) ( - irate(container_cpu_usage_seconds_total{namespace!="",pod!="",image!=""}[5m]) - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - ) - - record: entity_id:allocated_cpu:sum # the sum of allocated cpu across all containers in an entity (denominator for aggregate utilization calculations) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # aggregate up to entity (remove node) - entity_id:cpu_usage_per_node:sum - ) - - record: entity_id:sm_occupancy:avg # the simple average of SM occupancy (a good generic measure of GPU utilization) per entity - expr: | - avg by (label_entity_type, label_domain, label_project, label_entity_id) ( - # First, grab the SM occupancy for each pod - max by (namespace, pod) ( - DCGM_FI_PROF_SM_OCCUPANCY # SM occupancy is a good proxy for actual GPU usage - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - ) - - record: entity_id:gpu_count:sum # the count of running gpu pods per entity (need this to weight the gpu utilization when aggregating upwards - i.e. project-level) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( - # First, grab the allocated gpu for each pod (which is always either 1 or zero, since k8s can't split gpus the way it can with cpu/memory) - max by (namespace, pod) ( - pod_gpu_allocation - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - ) - - record: entity_id:weighted_sm_occupancy:sum # product of SM occupancy and allocated GPU count (something like "used memory", numerator of weighted calcs) - expr: | - entity_id:sm_occupancy:avg - * on (label_domain, label_project, label_entity_type, label_entity_id) entity_id:gpu_count:sum - - record: entity_id:allocated_mem_cost:sum # Allocated cost of memory for each workflow/task execution and app. - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, type) ( - entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB - * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type - ) - - record: entity_id:allocated_cpu_cost:sum # Allocated cost of cpu for each workflow/task execution and app. 
- expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, type)( - entity_id:cpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - - record: entity_id:allocated_gpu_cost:sum # Allocated cost of gpu for each workflow/task execution and app. - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, type)( - entity_id:gpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - - record: entity_id:allocated_cost:sum # Allocated cost of memory, cpu, and gpu for each workflow/task execution and app. - expr: | - label_replace( - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # for the sum to work, the labels need to be different on each "or" element (type label) - entity_id:allocated_mem_cost:sum - or - entity_id:allocated_cpu_cost:sum - or - entity_id:allocated_gpu_cost:sum - ), - "type", "allocated", "", "" # add type info - ) - - record: entity_id:overhead_cost:sum # The amount of overhead costs (node costs that we can't allocate with container resources) to allocate to each entity (workflow/task execution or app) - expr: | - label_replace( - sum by (label_entity_type, label_entity_id, label_domain, label_project)( # Aggregate the per-node metrics up to workflow/task execution or app (label_entity_id) - # Start with each execution's and app's allocated cost per node - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( # for the sum to work, the labels need to be different on each "or" element (type label) - entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB - * on (node) group_left(type) label_replace(avg by (node) 
(node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:cpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:gpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - # Then divide out the total allocated cost per node to get the proportion of allocated cost associated with each entity - / on (node) group_left()( - sum by (node) ( # for the sum to work, the labels need to be different on each "or" element (type label) - entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB - * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:cpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:gpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - > 0 # need to avoid dividing by zero, or gaps in the data can cause NaNs to proliferate, borking all charts - ) - # Then multiply by the overhead cost per node - * on (node) group_left() ( - # To calculate overhead, start with the true cost of running each node - avg by (node)(kube_node_labels{label_flyte_org_node_role="worker"}) # only look at worker nodes - * on (node) max by (node) ( - node_total_hourly_cost{instance_type!=""} # sometimes, the instance_type can be null, causing an 
unlabeled label to show up in the Compute Costs dashboard charts - ) * (15 / 3600) # convert hourly cost to 15-secondly cost - # Then subtract out the total allocated cost on each node - - on (node) group_left()( - sum by (node) ( # for the sum to work, the labels need to be different on each "or" element (type label) - entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB - * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:cpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:gpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - ) - ) - ), - "type", "overhead", "", "" # add type info - ) - - record: entity_id:total_cost:sum # Total cost of each entity (workflow/task execution or app), including allocated (from container resources) and overhead (proportion of unallocated node costs) - expr: | - label_replace( - sum by (label_domain, label_project, label_entity_id, label_entity_type) ( - entity_id:allocated_cost:sum - or - entity_id:overhead_cost:sum - ), - "type", "total", "", "" # add type info - ) - - record: node:total_cost:sum # Total cost of all nodes - expr: | - sum ( - avg by (node)(kube_node_labels{label_flyte_org_node_role="worker", label_node_kubernetes_io_instance_type!=""}) # only look at worker nodes - * on (node) group_left() node_total_hourly_cost{instance_type!=""} * (15 / 3600) # convert hourly cost to 15-secondly cost - ) - - record: node_type:total_cost:sum # Total cost of nodes grouped by node type - expr: | - sum by (node_type)( - avg by 
(node)(kube_node_labels{label_flyte_org_node_role="worker", label_node_kubernetes_io_instance_type!=""}) # only look at worker nodes - * on (node) group_left(node_type) label_replace(node_total_hourly_cost{instance_type!=""}, "node_type", "$1", "instance_type", "(.*)") * (15 / 3600) # convert hourly cost to 15-secondly cost and rename label - ) - - record: node_type:uptime_hours:sum # Total uptime of nodes grouped by node type - expr: | - sum by (node_type)( - avg by (node, node_type)( # dedupe - label_replace(kube_node_labels{label_flyte_org_node_role="worker", label_node_kubernetes_io_instance_type!=""}, "node_type", "$1", "label_node_kubernetes_io_instance_type", "(.*)") # relabel - ) - ) * (15 / 3600) # convert to number of hours per 15-second observation # Aggregate the above into visible metrics - - name: cost_rollup_15m - interval: 15m - rules: - - record: execution_info15m - expr: | - max_over_time(execution_info[15m:15s]) - - record: app_info15m - expr: | - max_over_time(app_info[15m:15s]) - - record: workspace_info15m - expr: | - max_over_time(workspace_info[15m:15s]) - - record: entity_id:allocated_mem_bytes:sum15m - expr: | - sum_over_time(entity_id:allocated_mem_bytes:sum[15m:15s]) - - record: entity_id:used_mem_bytes:sum15m - expr: | - sum_over_time(entity_id:used_mem_bytes:sum[15m:15s]) - - record: entity_id:allocated_cpu:sum15m - expr: | - sum_over_time(entity_id:allocated_cpu:sum[15m:15s]) - - record: entity_id:used_cpu:sum15m - expr: | - sum_over_time(entity_id:used_cpu:sum[15m:15s]) - - record: entity_id:weighted_sm_occupancy:sum15m - expr: | - sum_over_time(entity_id:weighted_sm_occupancy:sum[15m:15s]) - - record: entity_id:gpu_count:sum15m - expr: | - sum_over_time(entity_id:gpu_count:sum[15m:15s]) - - record: entity_id:allocated_mem_cost:sum15m - expr: | - sum_over_time(entity_id:allocated_mem_cost:sum[15m:15s]) - - record: entity_id:allocated_cpu_cost:sum15m - expr: | - sum_over_time(entity_id:allocated_cpu_cost:sum[15m:15s]) - - record: 
entity_id:allocated_gpu_cost:sum15m - expr: | - sum_over_time(entity_id:allocated_gpu_cost:sum[15m:15s]) - - record: entity_id:allocated_cost:sum15m - expr: | - sum_over_time(entity_id:allocated_cost:sum[15m:15s]) - - record: entity_id:overhead_cost:sum15m - expr: | - sum_over_time(entity_id:overhead_cost:sum[15m:15s]) - - record: entity_id:total_cost:sum15m - expr: | - sum_over_time(entity_id:total_cost:sum[15m:15s]) - - record: node:total_cost:sum15m - expr: | - sum_over_time(node:total_cost:sum[15m:15s]) - - record: node_type:total_cost:sum15m - expr: | - sum_over_time(node_type:total_cost:sum[15m:15s]) - - record: node_type:uptime_hours:sum15m - expr: | - sum_over_time(node_type:uptime_hours:sum[15m:15s]) ---- -# Source: dataplane/templates/propeller/configmap.yaml + envoy-bootstrap.yaml: | + dynamic_resources: + ads_config: + transport_api_version: V3 + api_type: GRPC + rate_limit_settings: {} + grpc_services: + - envoy_grpc: {cluster_name: xds_cluster} + cds_config: + resource_api_version: V3 + ads: {} + lds_config: + resource_api_version: V3 + ads: {} + node: + cluster: kourier-knative + id: 3scale-kourier-gateway + static_resources: + listeners: + - name: stats_listener + address: + socket_address: + address: 0.0.0.0 + port_value: 9000 + filter_chains: + - filters: + - name: envoy.filters.network.http_connection_manager + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager + stat_prefix: stats_server + http_filters: + - name: envoy.filters.http.router + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router + route_config: + virtual_hosts: + - name: admin_interface + domains: + - "*" + routes: + - match: + safe_regex: + regex: '/(certs|stats(/prometheus)?|server_info|clusters|listeners|ready)?' 
+ headers: + - name: ':method' + string_match: + exact: GET + route: + cluster: service_stats + - match: + safe_regex: + regex: '/drain_listeners' + headers: + - name: ':method' + string_match: + exact: POST + route: + cluster: service_stats + clusters: + - name: service_stats + connect_timeout: 0.250s + type: static + load_assignment: + cluster_name: service_stats + endpoints: + lb_endpoints: + endpoint: + address: + socket_address: + address: 127.0.0.1 + port_value: 9901 + - name: xds_cluster + # This keepalive is recommended by envoy docs. + # https://www.envoyproxy.io/docs/envoy/latest/api-docs/xds_protocol + typed_extension_protocol_options: + envoy.extensions.upstreams.http.v3.HttpProtocolOptions: + "@type": type.googleapis.com/envoy.extensions.upstreams.http.v3.HttpProtocolOptions + explicit_http_config: + http2_protocol_options: + connection_keepalive: + interval: 30s + timeout: 5s + connect_timeout: 1s + load_assignment: + cluster_name: xds_cluster + endpoints: + lb_endpoints: + endpoint: + address: + socket_address: + address: "net-kourier-controller" + port_value: 18000 + type: STRICT_DNS + admin: + access_log: + - name: envoy.access_loggers.stdout + typed_config: + "@type": type.googleapis.com/envoy.extensions.access_loggers.stream.v3.StdoutAccessLog + address: + socket_address: + address: 127.0.0.1 + port_value: 9901 + stats_config: + stats_tags: + - tag_name: name + regex: '^.*?\.u/.*?n=(.*?)/.*?u\..*$' + - tag_name: domain + regex: '^.*?\.u/.*?d=(.*?)/.*?u\..*$' + - tag_name: org + regex: '^.*?\.u/.*?o=(.*?)/.*?u\..*$' + - tag_name: project + regex: '^.*?\.u/.*?p=(.*?)/.*?u\..*$' + - tag_name: target + regex: '^.*?\.u/.*?t=(.*?)/.*?u\..*$' + - tag_name: target_namespace + regex: '^.*?\.u/.*?tns=(.*?)/.*?u\..*$' +--- +# Source: dataplane/templates/webhook/configmap.yaml apiVersion: v1 kind: ConfigMap metadata: - name: flyte-propeller-config + name: union-pod-webhook-config namespace: union data: - admin.yaml: | - admin: - clientId: 'test' - 
clientSecretLocation: /etc/union/secret/client_secret - endpoint: dns:/// - insecure: false - event: - capacity: 1000 - rate: 500 - type: admin - catalog.yaml: | - catalog-cache: - cache-endpoint: dns:/// - endpoint: dns:/// - insecure: false - type: fallback - use-admin-auth: true - copilot.yaml: | - plugins: - k8s: - co-pilot: - image: 'cr.flyte.org/flyteorg/flytecopilot:v1.14.1' - name: flyte-copilot- - start-timeout: 30s - core.yaml: | - propeller: - downstream-eval-duration: 30s - enable-admin-launcher: true - leader-election: - enabled: true - lease-duration: 15s - lock-config-map: - name: propeller-leader - namespace: 'union' - renew-deadline: 10s - retry-period: 2s - limit-namespace: all - literal-offloading-config: - enabled: true - max-workflow-retries: 30 - metadata-prefix: metadata/propeller - metrics-prefix: flyte - prof-port: 10254 - queue: - batch-size: -1 - batching-interval: 2s - queue: - base-delay: 5s - capacity: 1000 - max-delay: 120s - rate: 100 - type: maxof - sub-queue: - capacity: 100 - rate: 10 - type: bucket - type: batch - rawoutput-prefix: 's3://' - workers: 4 - workflow-reeval-duration: 30s + core.yaml: | + + webhook: certDir: /etc/webhook/certs + disableCreateMutatingWebhookConfig: true embeddedSecretManagerConfig: imagePullSecrets: enabled: true @@ -4618,81 +4200,13 @@ data: namespace: 'union' type: 'K8s' listenPort: '9443' + localCert: true secretManagerTypes: - Embedded - K8s - serviceName: flyte-pod-webhook + secretName: union-pod-webhook + serviceName: union-pod-webhook servicePort: '443' - enabled_plugins.yaml: | - plugins: - connector-service: - defaultConnector: - defaultServiceConfig: '{"loadBalancingConfig": [{"round_robin":{}}]}' - endpoint: k8s:///flyteconnector.union:8000 - tasks: - task-plugins: - default-for-task-types: - actor: fast-task - container: container - container_array: k8s-array - sidecar: sidecar - enabled-plugins: - - container - - sidecar - - k8s-array - - echo - - fast-task - - connector-service - 
k8s.yaml: | - plugins: - k8s: - default-cpus: 100m - default-env-vars: [] - default-memory: 100Mi - logger.yaml: | - logger: - formatter: - type: json - level: 4 - show-source: true - resource_manager.yaml: | - propeller: - resourcemanager: - type: noop - task_logs.yaml: | - plugins: - logs: - cloudwatch-enabled: false - dynamic-log-links: - - vscode: - displayName: VS Code Debugger - templateUris: - - /dataplane/pod/v1/generated_name/task/{{.executionProject}}/{{.executionDomain}}/{{.executionName}}/{{.nodeID}}/{{.taskRetryAttempt}}/{{.taskProject}}/{{.taskDomain}}/{{.taskID}}/{{.taskVersion}}/ - kubernetes-enabled: false - templates: - - displayName: Task Logs - scheme: TaskExecution - templateUris: - - /console/projects/{{.executionProject}}/domains/{{.executionDomain}}/executions/{{.executionName}}/nodeId/{{.nodeID}}/taskId/{{.taskID}}/attempt/{{.taskRetryAttempt}}/view/logs?duration=all&fromExecutionNav=true - storage.yaml: | - storage: - container: "" - type: stow - stow: - kind: s3 - config: - auth_type: accesskey - access_key_id: - secret_key: - disable_ssl: false - endpoint: - region: us-east-1 - enable-multicontainer: false - limits: - maxDownloadMBs: 1024 - cache: - max_size_mbs: 0 - target_gc_percent: 70 --- # Source: dataplane/charts/fluentbit/templates/clusterrole.yaml apiVersion: rbac.authorization.k8s.io/v1 @@ -4716,851 +4230,1144 @@ rules: - list - watch --- -# Source: dataplane/charts/kube-state-metrics/templates/role.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: - labels: - helm.sh/chart: kube-state-metrics-5.30.1 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: kube-state-metrics - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" - name: release-name-kube-state-metrics -rules: - -- apiGroups: ["certificates.k8s.io"] - resources: - - certificatesigningrequests - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - configmaps - verbs: ["list", "watch"] - -- apiGroups: ["batch"] - resources: - - cronjobs - verbs: ["list", "watch"] - -- apiGroups: ["extensions", "apps"] - resources: - - daemonsets - verbs: ["list", "watch"] - -- apiGroups: ["extensions", "apps"] - resources: - - deployments - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - endpoints - verbs: ["list", "watch"] + name: knative-serving-operator-aggregated + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +aggregationRule: + clusterRoleSelectors: + # This (along with escalate below) allows the Operator to pick up any + # roles that are provided to the admin of the cluster by knative serving + # automatically. + - matchExpressions: + - {key: serving.knative.dev/release, operator: Exists} +rules: [] # Rules are automatically filled in by the controller manager. 
+--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: knative-serving-operator-aggregated-stable + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +aggregationRule: + clusterRoleSelectors: + # This (along with escalate below) allows the Operator to pick up any + # roles that are provided to the admin of the cluster by knative serving + # automatically. + - matchExpressions: + - {key: app.kubernetes.io/name, operator: In, values: ["knative-serving"]} +rules: [] # Rules are automatically filled in by the controller manager. +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: knative-eventing-operator-aggregated + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +aggregationRule: + clusterRoleSelectors: + # This (along with escalate below) allows the Operator to pick up any + # roles that are provided to the admin of the cluster by knative eventing + # automatically. + - matchExpressions: + - {key: eventing.knative.dev/release, operator: Exists} +rules: [] # Rules are automatically filled in by the controller manager. +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: knative-eventing-operator-aggregated-stable + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +aggregationRule: + clusterRoleSelectors: + # This (along with escalate below) allows the Operator to pick up any + # roles that are provided to the admin of the cluster by knative eventing + # automatically. 
+ - matchExpressions: + - {key: app.kubernetes.io/name, operator: In, values: ["knative-eventing"]} +rules: [] # Rules are automatically filled in by the controller manager. +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. -- apiGroups: ["autoscaling"] - resources: - - horizontalpodautoscalers - verbs: ["list", "watch"] - -- apiGroups: ["extensions", "networking.k8s.io"] - resources: - - ingresses - verbs: ["list", "watch"] - -- apiGroups: ["batch"] - resources: - - jobs - verbs: ["list", "watch"] - -- apiGroups: ["coordination.k8s.io"] - resources: - - leases - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - limitranges - verbs: ["list", "watch"] - -- apiGroups: ["admissionregistration.k8s.io"] - resources: - - mutatingwebhookconfigurations - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - namespaces - verbs: ["list", "watch"] - -- apiGroups: ["networking.k8s.io"] - resources: - - networkpolicies - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - nodes - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - persistentvolumeclaims - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - persistentvolumes - verbs: ["list", "watch"] - -- apiGroups: ["policy"] - resources: - - poddisruptionbudgets - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - pods - verbs: ["list", "watch"] - 
-- apiGroups: ["extensions", "apps"] - resources: - - replicasets - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - replicationcontrollers - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - resourcequotas - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - secrets - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - services - verbs: ["list", "watch"] - -- apiGroups: ["apps"] - resources: - - statefulsets - verbs: ["list", "watch"] - -- apiGroups: ["storage.k8s.io"] - resources: - - storageclasses - verbs: ["list", "watch"] - -- apiGroups: ["admissionregistration.k8s.io"] - resources: - - validatingwebhookconfigurations - verbs: ["list", "watch"] - -- apiGroups: ["storage.k8s.io"] - resources: - - volumeattachments - verbs: ["list", "watch"] ---- -# Source: dataplane/charts/monitoring/charts/grafana/templates/clusterrole.yaml -kind: ClusterRole -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - labels: - helm.sh/chart: grafana-10.4.0 - app.kubernetes.io/name: grafana - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "12.3.0" - name: release-name-grafana-clusterrole -rules: - - apiGroups: [""] # "" indicates the core API group - resources: ["configmaps", "secrets"] - verbs: ["get", "watch", "list"] ---- -# Source: dataplane/charts/monitoring/charts/kube-state-metrics/templates/role.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - labels: - helm.sh/chart: kube-state-metrics-7.0.0 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: monitoring-kube-state-metrics - app.kubernetes.io/name: monitoring-kube-state-metrics - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.17.0" - release: release-name - name: monitoring-kube-state-metrics -rules: - -- apiGroups: ["certificates.k8s.io"] - resources: - - certificatesigningrequests - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - 
- configmaps - verbs: ["list", "watch"] - -- apiGroups: ["batch"] - resources: - - cronjobs - verbs: ["list", "watch"] - -- apiGroups: ["apps"] - resources: - - daemonsets - verbs: ["list", "watch"] - -- apiGroups: ["apps"] - resources: - - deployments - verbs: ["list", "watch"] - -- apiGroups: ["discovery.k8s.io"] - resources: - - endpointslices - verbs: ["list", "watch"] - -- apiGroups: ["autoscaling"] - resources: - - horizontalpodautoscalers - verbs: ["list", "watch"] - -- apiGroups: ["networking.k8s.io"] - resources: - - ingresses - verbs: ["list", "watch"] - -- apiGroups: ["batch"] - resources: - - jobs - verbs: ["list", "watch"] - -- apiGroups: ["coordination.k8s.io"] - resources: - - leases - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - limitranges - verbs: ["list", "watch"] - -- apiGroups: ["admissionregistration.k8s.io"] - resources: - - mutatingwebhookconfigurations - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - namespaces - verbs: ["list", "watch"] - -- apiGroups: ["networking.k8s.io"] - resources: - - networkpolicies - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - nodes - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - persistentvolumeclaims - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - persistentvolumes - verbs: ["list", "watch"] - -- apiGroups: ["policy"] - resources: - - poddisruptionbudgets - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - pods - verbs: ["list", "watch"] - -- apiGroups: ["apps"] - resources: - - replicasets - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - replicationcontrollers - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - resourcequotas - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - secrets - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - services - verbs: ["list", "watch"] - -- apiGroups: ["apps"] - resources: - - statefulsets - verbs: ["list", "watch"] - -- 
apiGroups: ["storage.k8s.io"] - resources: - - storageclasses - verbs: ["list", "watch"] - -- apiGroups: ["admissionregistration.k8s.io"] - resources: - - validatingwebhookconfigurations - verbs: ["list", "watch"] - -- apiGroups: ["storage.k8s.io"] - resources: - - volumeattachments - verbs: ["list", "watch"] ---- -# Source: dataplane/charts/monitoring/templates/prometheus-operator/clusterrole.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: monitoring-operator - labels: - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "80.8.0" - app.kubernetes.io/part-of: monitoring - chart: monitoring-80.8.0 - release: "release-name" - heritage: "Helm" - app: monitoring-operator - app.kubernetes.io/name: monitoring-prometheus-operator - app.kubernetes.io/component: prometheus-operator -rules: -- apiGroups: - - monitoring.coreos.com - resources: - - alertmanagers - - alertmanagers/finalizers - - alertmanagers/status - - alertmanagerconfigs - - prometheuses - - prometheuses/finalizers - - prometheuses/status - - prometheusagents - - prometheusagents/finalizers - - prometheusagents/status - - thanosrulers - - thanosrulers/finalizers - - thanosrulers/status - - scrapeconfigs - - scrapeconfigs/status - - servicemonitors - - servicemonitors/status - - podmonitors - - podmonitors/status - - probes - - probes/status - - prometheusrules - - prometheusrules/status - verbs: - - '*' -- apiGroups: - - apps - resources: - - statefulsets - verbs: - - '*' -- apiGroups: - - "" - resources: - - configmaps - - secrets - verbs: - - '*' -- apiGroups: - - "" - resources: - - pods - verbs: - - list - - delete -- apiGroups: - - "" - resources: - - services - - services/finalizers - - endpoints - verbs: - - get - - create - - update - - delete -- apiGroups: - - "" - resources: - - nodes - verbs: - - list - - watch -- apiGroups: - - "" - resources: - - namespaces - verbs: - - get - - list - - watch -- 
apiGroups: - - "" - - events.k8s.io - resources: - - events - verbs: - - patch - - create -- apiGroups: - - networking.k8s.io - resources: - - ingresses - verbs: - - get - - list - - watch -- apiGroups: - - storage.k8s.io - resources: - - storageclasses - verbs: - - get -- apiGroups: - - discovery.k8s.io - resources: - - endpointslices - verbs: - - get - - create - - list - - watch - - update - - delete ---- -# Source: dataplane/charts/monitoring/templates/prometheus/clusterrole.yaml -apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 metadata: - name: monitoring-prometheus + name: knative-serving-operator labels: - app: monitoring-prometheus - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "80.8.0" - app.kubernetes.io/part-of: monitoring - chart: monitoring-80.8.0 - release: "release-name" - heritage: "Helm" + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator rules: -# These permissions (to examine all namespaces) are not in the kube-prometheus repo. -# They're grabbed from https://github.com/prometheus/prometheus/blob/master/documentation/examples/rbac-setup.yml -# kube-prometheus deliberately defaults to a more restrictive setup that is not appropriate for our general audience. -- apiGroups: [""] - resources: - - nodes - - nodes/metrics - - services - - endpoints - - pods - verbs: ["get", "list", "watch"] -- apiGroups: ["discovery.k8s.io"] - resources: - - endpointslices - verbs: ["get", "list", "watch"] -- apiGroups: - - "networking.k8s.io" - resources: - - ingresses - verbs: ["get", "list", "watch"] -- nonResourceURLs: ["/metrics", "/metrics/cadvisor"] - verbs: ["get"] + - apiGroups: + - operator.knative.dev + resources: + - '*' + verbs: + - '*' + # Bootstrapping permissions. + # Roles that are explicitly bound buch which are specified by this Operator + # MUST be specified here with 'get' and 'bind'. 
+ - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterroles + resourceNames: + - system:auth-delegator + verbs: + - bind + - get + - apiGroups: + - rbac.authorization.k8s.io + resources: + - roles + resourceNames: + - extension-apiserver-authentication-reader + verbs: + - bind + - get + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterroles + - roles + verbs: + - create + - delete + # Escalate is necessary in order to create a role using cluster role aggregation, + # and to allow the Operator to bootstrap itself into the necessary set of + # permissions, even as those continue to evolve upstream. + - escalate + - get + - list + - update + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterrolebindings + - rolebindings + verbs: + - create + - delete + - list + - get + - update + # Permissions required for Knative controller + # infra. + - apiGroups: + - apiregistration.k8s.io + resources: + - apiservices + verbs: + - update + - apiGroups: + - "" + resources: + - services + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - caching.internal.knative.dev + resources: + - images + verbs: + - '*' + - apiGroups: + - "" + resources: + - namespaces + verbs: + - get + - update + - watch + - apiGroups: + - '' + resources: + - events + verbs: + - create + - update + - patch + - apiGroups: + - '' + resources: + - configmaps + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - security.istio.io + - apps + - policy + resources: + - poddisruptionbudgets + - peerauthentications + - deployments + - daemonsets + - replicasets + - statefulsets + verbs: + - create + - delete + - get + - list + - watch + - update + - apiGroups: + - apiregistration.k8s.io + resources: + - apiservices + verbs: + - create + - delete + - get + - list + - apiGroups: + - autoscaling + resources: + - horizontalpodautoscalers + verbs: + - create + - delete + - get + - list + - apiGroups: + - coordination.k8s.io + 
resources: + - leases + verbs: + - '*' + - apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - '*' + # Old resources that need cleaning up that are not in the knative-serving + # namespace. + - apiGroups: + - "" + resources: + - services + - deployments + - horizontalpodautoscalers + resourceNames: + - knative-ingressgateway + verbs: + - delete + - apiGroups: + - "" + resources: + - configmaps + resourceNames: + - config-controller + verbs: + - delete + - apiGroups: + - "" + resources: + - serviceaccounts + resourceNames: + - knative-serving-operator + verbs: + - delete + # for contour TLS + - apiGroups: + - projectcontour.io + resources: + - httpproxies + - tlscertificatedelegations + verbs: + - get + - list + - watch + - update + - create + - delete + - deletecollection + - patch + # for security-guard + - apiGroups: + - guard.security.knative.dev + resources: + - guardians + verbs: + - get + - list + - watch + - create + - update + - patch + - delete + - apiGroups: + - "" + resources: + - configmaps + verbs: + - get + - list + - watch + - create + - update + - patch + - delete + - apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + verbs: + - create + - apiGroups: + - "" + resources: + - pods + verbs: + - get --- -# Source: dataplane/charts/opencost/templates/clusterrole.yaml -# Cluster role giving opencost to get, list, watch required resources -# No write permissions are required -apiVersion: rbac.authorization.k8s.io/v1 +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 metadata: - name: release-name-opencost + name: knative-eventing-operator labels: - helm.sh/chart: opencost-1.42.0 - app.kubernetes.io/name: opencost - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "1.111.0" - app.kubernetes.io/part-of: opencost - app.kubernetes.io/managed-by: Helm + app.kubernetes.io/version: "1.16.0" + 
app.kubernetes.io/name: knative-operator rules: - - apiGroups: [""] + - apiGroups: + - operator.knative.dev + resources: + - '*' + verbs: + - '*' + # Bootstrapping permissions. + # Roles that are explicitly bound buch which are specified by this Operator + # MUST be specified here with 'get' and 'bind'. + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterroles + - roles + verbs: + - create + - delete + # Escalate is necessary in order to create a role using cluster role aggregation, + # and to allow the Operator to bootstrap itself into the necessary set of + # permissions, even as those continue to evolve upstream. + - escalate + - get + - list + - update + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterrolebindings + - rolebindings + verbs: + - create + - delete + - list + - get + - update + # Permissions required for Knative controller + # infra. + - apiGroups: + - apiregistration.k8s.io + resources: + - apiservices + verbs: + - update + - apiGroups: + - "" + resources: + - services + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - caching.internal.knative.dev + resources: + - images + verbs: + - '*' + - apiGroups: + - "" + resources: + - namespaces + verbs: + - get + - update + - watch + - apiGroups: + - '' + resources: + - events + verbs: + - create + - update + - patch + - apiGroups: + - '' resources: - configmaps + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - apps + resources: - deployments - - nodes - - pods + - daemonsets + - replicasets + - statefulsets + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - apiregistration.k8s.io + resources: + - apiservices + verbs: + - create + - delete + - get + - list + - apiGroups: + - autoscaling + resources: + - horizontalpodautoscalers + verbs: + - create + - delete + - update + - get + - list + - apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - '*' + - apiGroups: + - 
apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - '*' + - apiGroups: + - batch + resources: + - jobs + verbs: + - create + - delete + - update + - get + - list + - watch + # Old resources that need cleaning up that are not in the knative-eventing + # namespace. + - apiGroups: + - "" + resources: + - serviceaccounts + resourceNames: + - knative-eventing-operator + verbs: + - delete + # for RabbitMQ messaging topology objects + - apiGroups: + - rabbitmq.com + resources: + - rabbitmqclusters + verbs: + - "get" + - "list" + - "watch" + - apiGroups: + - rabbitmq.com + resources: + - bindings + - queues + - exchanges + verbs: + - create + - delete + - get + - list + - patch + - update + - watch + - apiGroups: + - rabbitmq.com + resources: + - bindings/status + - queues/status + - exchanges/status + verbs: + - get + # for Kafka eventing source + - apiGroups: + - keda.sh + resources: + - scaledobjects + - scaledobjects/finalizers + - scaledobjects/status + - triggerauthentications + - triggerauthentications/status + verbs: + - get + - list + - watch + - update + - create + - delete + # Internal APIs + - apiGroups: + - "internal.kafka.eventing.knative.dev" + resources: + - "consumers" + - "consumers/status" + - "consumergroups" + - "consumergroups/status" + verbs: + - create + - get + - list + - watch + - patch + - update + - delete + - apiGroups: + - "internal.kafka.eventing.knative.dev" + resources: + - "consumers/finalizers" + - "consumergroups/finalizers" + verbs: + - update + - delete + - apiGroups: + - apps + resources: + - statefulsets/scale + verbs: + - get + - list + - watch + - update + - patch + - create + - delete + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterrolebindings + verbs: + - watch + - apiGroups: + - "*" + resources: + - configmaps + verbs: + - delete + - apiGroups: + - "*" + resources: + - configmaps - services - - resourcequotas - - replicationcontrollers - - limitranges - - persistentvolumeclaims - - 
persistentvolumes - - namespaces - - endpoints verbs: - get - list - watch + - update + - create + - delete + - apiGroups: + - "*" + resources: + - pods + verbs: + - list + - update + - get + - watch + - apiGroups: + - "*" + resources: + - pods/finalizers + verbs: + - get + - list + - create + - update + - delete + - apiGroups: + - "*" + resources: + - events + verbs: + - patch + - create + - apiGroups: + - "*" + resources: + - secrets + verbs: + - get + - list + - watch + - update + - create + - delete + - apiGroups: + - "*" + resources: + - nodes + verbs: + - get + - list + - watch + - apiGroups: + - "*" + resources: + - serviceaccounts + verbs: + - get + - list + - watch + - update + - create + - delete + - apiGroups: + - "*" + resources: + - configmaps + resourceNames: + - kafka-channel-config + verbs: + - patch + - apiGroups: + - "*" + resources: + - horizontalpodautoscalers + resourceNames: + - kafka-webhook + verbs: + - delete - apiGroups: - - extensions + - "*" resources: - - daemonsets - - deployments - - replicasets + - leases verbs: - - get - - list - - watch + - delete - apiGroups: - - apps + - "*" resources: - - statefulsets - - deployments - - daemonsets - - replicasets + - poddisruptionbudgets + resourceNames: + - kafka-webhook verbs: - - list - - watch + - delete - apiGroups: - - batch + - "*" resources: - - cronjobs - - jobs + - services verbs: - - get - - list - - watch + - patch - apiGroups: - - autoscaling + - "apps" resources: - - horizontalpodautoscalers + - deployments verbs: - - get - - list - - watch + - deletecollection + # Eventing TLS - apiGroups: - - policy + - "cert-manager.io" resources: - - poddisruptionbudgets + - certificates + - issuers + - clusterissuers verbs: - - get + - create + - delete + - update - list + - get - watch - apiGroups: - - storage.k8s.io + - "trust.cert-manager.io" resources: - - storageclasses + - bundles verbs: - - get + - create + - delete + - update - list + - get - watch --- -# Source: 
dataplane/templates/clusterresourcesync/serviceaccount.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: - name: union-clustersync-resource + name: knative-operator-webhook + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator rules: + # For watching logging configuration and getting certs. - apiGroups: - "" - - rbac.authorization.k8s.io resources: - - configmaps - - namespaces - - pods - - resourcequotas - - roles - - rolebindings - - secrets - - services - - serviceaccounts - - clusterrolebindings - - podtemplates + - "configmaps" verbs: - - '*' + - "get" + - "list" + - "watch" + # For manipulating certs into secrets. + - apiGroups: + - "" + resources: + - "namespaces" + verbs: + - "get" + - "create" + - "update" + - "list" + - "watch" + - "patch" + # finalizers are needed for the owner reference of the webhook + - apiGroups: + - "" + resources: + - "namespaces/finalizers" + verbs: + - "update" + # For getting our Deployment so we can decorate with ownerref. + - apiGroups: + - "apps" + resources: + - "deployments" + verbs: + - "get" + - apiGroups: + - "apps" + resources: + - "deployments/finalizers" + verbs: + - update + # For actually registering our webhook. + - apiGroups: + - "admissionregistration.k8s.io" + resources: + - "mutatingwebhookconfigurations" + - "validatingwebhookconfigurations" + verbs: &everything + - "get" + - "list" + - "create" + - "update" + - "delete" + - "patch" + - "watch" + # For leader election + - apiGroups: + - "coordination.k8s.io" + resources: + - "leases" + verbs: *everything + # Necessary for conversion webhook. These are copied from the serving + # TODO: Do we really need all these permissions? 
+ - apiGroups: ["apiextensions.k8s.io"] + resources: ["customresourcedefinitions"] + verbs: ["get", "list", "create", "update", "delete", "patch", "watch"] --- -# Source: dataplane/templates/nodeexecutor/serviceaccount.yaml +# Source: dataplane/charts/monitoring/charts/grafana/templates/clusterrole.yaml +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + labels: + helm.sh/chart: grafana-10.4.0 + app.kubernetes.io/name: grafana + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "12.3.0" + name: release-name-grafana-clusterrole +rules: + - apiGroups: [""] # "" indicates the core API group + resources: ["configmaps", "secrets"] + verbs: ["get", "watch", "list"] +--- +# Source: dataplane/charts/monitoring/charts/kube-state-metrics/templates/role.yaml apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: - name: union-executor + labels: + helm.sh/chart: kube-state-metrics-7.0.0 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: metrics + app.kubernetes.io/part-of: monitoring-kube-state-metrics + app.kubernetes.io/name: monitoring-kube-state-metrics + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2.17.0" + release: release-name + name: monitoring-kube-state-metrics +rules: + +- apiGroups: ["certificates.k8s.io"] + resources: + - certificatesigningrequests + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - configmaps + verbs: ["list", "watch"] + +- apiGroups: ["batch"] + resources: + - cronjobs + verbs: ["list", "watch"] + +- apiGroups: ["apps"] + resources: + - daemonsets + verbs: ["list", "watch"] + +- apiGroups: ["apps"] + resources: + - deployments + verbs: ["list", "watch"] + +- apiGroups: ["discovery.k8s.io"] + resources: + - endpointslices + verbs: ["list", "watch"] + +- apiGroups: ["autoscaling"] + resources: + - horizontalpodautoscalers + verbs: ["list", "watch"] + +- apiGroups: ["networking.k8s.io"] + resources: + - ingresses + verbs: ["list", 
"watch"] + +- apiGroups: ["batch"] + resources: + - jobs + verbs: ["list", "watch"] + +- apiGroups: ["coordination.k8s.io"] + resources: + - leases + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - limitranges + verbs: ["list", "watch"] + +- apiGroups: ["admissionregistration.k8s.io"] + resources: + - mutatingwebhookconfigurations + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - namespaces + verbs: ["list", "watch"] + +- apiGroups: ["networking.k8s.io"] + resources: + - networkpolicies + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - nodes + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - persistentvolumeclaims + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - persistentvolumes + verbs: ["list", "watch"] + +- apiGroups: ["policy"] + resources: + - poddisruptionbudgets + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - pods + verbs: ["list", "watch"] + +- apiGroups: ["apps"] + resources: + - replicasets + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - replicationcontrollers + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - resourcequotas + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - secrets + verbs: ["list", "watch"] + +- apiGroups: [""] + resources: + - services + verbs: ["list", "watch"] + +- apiGroups: ["apps"] + resources: + - statefulsets + verbs: ["list", "watch"] + +- apiGroups: ["storage.k8s.io"] + resources: + - storageclasses + verbs: ["list", "watch"] + +- apiGroups: ["admissionregistration.k8s.io"] + resources: + - validatingwebhookconfigurations + verbs: ["list", "watch"] + +- apiGroups: ["storage.k8s.io"] + resources: + - volumeattachments + verbs: ["list", "watch"] +--- +# Source: dataplane/charts/monitoring/templates/prometheus-operator/clusterrole.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: monitoring-operator labels: - app: executor + + app.kubernetes.io/managed-by: 
Helm + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "80.8.0" + app.kubernetes.io/part-of: monitoring + chart: monitoring-80.8.0 + release: "release-name" + heritage: "Helm" + app: monitoring-operator + app.kubernetes.io/name: monitoring-prometheus-operator + app.kubernetes.io/component: prometheus-operator rules: -# Allow RO access to PODS +- apiGroups: + - monitoring.coreos.com + resources: + - alertmanagers + - alertmanagers/finalizers + - alertmanagers/status + - alertmanagerconfigs + - prometheuses + - prometheuses/finalizers + - prometheuses/status + - prometheusagents + - prometheusagents/finalizers + - prometheusagents/status + - thanosrulers + - thanosrulers/finalizers + - thanosrulers/status + - scrapeconfigs + - scrapeconfigs/status + - servicemonitors + - servicemonitors/status + - podmonitors + - podmonitors/status + - probes + - probes/status + - prometheusrules + - prometheusrules/status + verbs: + - '*' +- apiGroups: + - apps + resources: + - statefulsets + verbs: + - '*' +- apiGroups: + - "" + resources: + - configmaps + - secrets + verbs: + - '*' - apiGroups: - "" resources: - pods verbs: - - get - list - - watch -# Allow Event recording access + - delete - apiGroups: - "" resources: - - events + - services + - services/finalizers + - endpoints verbs: + - get - create - update - delete - - patch -# Allow Access All plugin objects - apiGroups: - - '*' + - "" resources: - - '*' + - nodes verbs: - - get - list - watch - - create - - update - - delete - - patch -# Allow Access to CRD - apiGroups: - - apiextensions.k8s.io + - "" resources: - - customresourcedefinitions + - namespaces verbs: - get - - list - - watch - - create - - delete - - update ---- -# Source: dataplane/templates/operator/serviceaccount-proxy.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: proxy-system - labels: - app.kubernetes.io/name: operator-proxy - app.kubernetes.io/instance: release-name - 
platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm -rules: - - apiGroups: - - '*' - resources: - - events - - flyteworkflows - - pods/log - - pods - - rayjobs - - resourcequotas - verbs: - - get - - list - - watch ---- -# Source: dataplane/templates/operator/serviceaccount.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: operator-system - labels: - app.kubernetes.io/name: union-operator - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm -rules: - # Allow Access to all resources under flyte.lyft.com - - apiGroups: - - flyte.lyft.com - resources: - - flyteworkflows - - flyteworkflows/finalizers - verbs: - - get - - list - - watch - - create - - update - - delete - - patch - - post - - deletecollection - - apiGroups: - - '*' - resources: - - resourcequotas - - pods - - configmaps - - podtemplates - - secrets - - namespaces - - nodes - verbs: - - get - - list - - watch - - create - - update - - delete - - nonResourceURLs: - - /metrics - verbs: - - get ---- -# Source: dataplane/templates/prometheus/rbac.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: union-operator-prometheus - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus -rules: - - apiGroups: [""] - resources: - - nodes - - nodes/proxy - - pods - - endpoints - - services - verbs: - - get - - list - - watch - - nonResourceURLs: - - /metrics - - /metrics/cadvisor - verbs: - - get ---- -# Source: dataplane/templates/propeller/serviceaccount-webhook.yaml -kind: ClusterRole -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: flytepropeller-webhook-role - namespace: union -rules: - - apiGroups: - - "*" - resources: - - 
mutatingwebhookconfigurations - - secrets - - pods - - replicasets/finalizers - verbs: - - get - - create - - update - - patch ---- -# Source: dataplane/templates/propeller/serviceaccount.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: flytepropeller-role -rules: - # Allow RO access to PODS - - apiGroups: - - "" - resources: - - pods - verbs: - - get - - list - - watch - # Allow Event recording access - - apiGroups: - - "" - resources: - - events - verbs: - - create - - update - - delete - - patch - # Allow Access All plugin objects - - apiGroups: - - '*' - resources: - - '*' - verbs: - - get - - list - - watch - - create - - update - - delete - - patch - # Allow Access to CRD - - apiGroups: - - apiextensions.k8s.io - resources: - - customresourcedefinitions - verbs: - - get - - list - - watch - - create - - delete - - update - # Allow Access to all resources under flyte.lyft.com - - apiGroups: - - flyte.lyft.com - resources: - - flyteworkflows - - flyteworkflows/finalizers - verbs: - - get - - list - - watch - - create - - update - - delete - - patch - - post - - deletecollection + - list + - watch +- apiGroups: + - "" + - events.k8s.io + resources: + - events + verbs: + - patch + - create +- apiGroups: + - networking.k8s.io + resources: + - ingresses + verbs: + - get + - list + - watch +- apiGroups: + - storage.k8s.io + resources: + - storageclasses + verbs: + - get +- apiGroups: + - discovery.k8s.io + resources: + - endpointslices + verbs: + - get + - create + - list + - watch + - update + - delete +--- +# Source: dataplane/charts/monitoring/templates/prometheus/clusterrole.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: monitoring-prometheus + labels: + app: monitoring-prometheus + + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "80.8.0" + app.kubernetes.io/part-of: monitoring + chart: monitoring-80.8.0 + release: "release-name" 
+ heritage: "Helm" +rules: +# These permissions (to examine all namespaces) are not in the kube-prometheus repo. +# They're grabbed from https://github.com/prometheus/prometheus/blob/master/documentation/examples/rbac-setup.yml +# kube-prometheus deliberately defaults to a more restrictive setup that is not appropriate for our general audience. +- apiGroups: [""] + resources: + - nodes + - nodes/metrics + - services + - endpoints + - pods + verbs: ["get", "list", "watch"] +- apiGroups: ["discovery.k8s.io"] + resources: + - endpointslices + verbs: ["get", "list", "watch"] +- apiGroups: + - "networking.k8s.io" + resources: + - ingresses + verbs: ["get", "list", "watch"] +- nonResourceURLs: ["/metrics", "/metrics/cadvisor"] + verbs: ["get"] --- # Source: dataplane/charts/fluentbit/templates/clusterrolebinding.yaml apiVersion: rbac.authorization.k8s.io/v1 @@ -5579,277 +5386,287 @@ roleRef: name: release-name-fluentbit subjects: - kind: ServiceAccount - name: fluentbit-system + name: union-system namespace: union --- -# Source: dataplane/charts/kube-state-metrics/templates/clusterrolebinding.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# TODO: Consider restriction of non-aggregated role to knativeservings namespaces. 
apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: - labels: - helm.sh/chart: kube-state-metrics-5.30.1 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: kube-state-metrics - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" - name: release-name-kube-state-metrics -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: release-name-kube-state-metrics -subjects: -- kind: ServiceAccount - name: release-name-kube-state-metrics - namespace: union ---- -# Source: dataplane/charts/monitoring/charts/grafana/templates/clusterrolebinding.yaml -kind: ClusterRoleBinding -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: release-name-grafana-clusterrolebinding + name: knative-serving-operator labels: - helm.sh/chart: grafana-10.4.0 - app.kubernetes.io/name: grafana - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "12.3.0" -subjects: - - kind: ServiceAccount - name: release-name-grafana - namespace: union -roleRef: - kind: ClusterRole - name: release-name-grafana-clusterrole - apiGroup: rbac.authorization.k8s.io ---- -# Source: dataplane/charts/monitoring/charts/kube-state-metrics/templates/clusterrolebinding.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - labels: - helm.sh/chart: kube-state-metrics-7.0.0 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: monitoring-kube-state-metrics - app.kubernetes.io/name: monitoring-kube-state-metrics - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.17.0" - release: release-name - name: monitoring-kube-state-metrics + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole - name: monitoring-kube-state-metrics + name: 
knative-serving-operator subjects: -- kind: ServiceAccount - name: monitoring-kube-state-metrics - namespace: union + - kind: ServiceAccount + name: knative-operator + namespace: "union" --- -# Source: dataplane/charts/monitoring/templates/prometheus-operator/clusterrolebinding.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# TODO: Consider restriction of non-aggregated role to knativeeventing namespaces. apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: - name: monitoring-operator + name: knative-eventing-operator labels: - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "80.8.0" - app.kubernetes.io/part-of: monitoring - chart: monitoring-80.8.0 - release: "release-name" - heritage: "Helm" - app: monitoring-operator - app.kubernetes.io/name: monitoring-prometheus-operator - app.kubernetes.io/component: prometheus-operator + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole - name: monitoring-operator + name: knative-eventing-operator subjects: -- kind: ServiceAccount - name: monitoring-operator - namespace: union + - kind: ServiceAccount + name: knative-operator + namespace: "union" --- -# Source: dataplane/charts/monitoring/templates/prometheus/clusterrolebinding.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: - name: monitoring-prometheus + name: operator-webhook labels: - app: monitoring-prometheus - - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "80.8.0" - app.kubernetes.io/part-of: monitoring - chart: monitoring-80.8.0 - release: "release-name" - heritage: "Helm" + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator roleRef: apiGroup: 
rbac.authorization.k8s.io kind: ClusterRole - name: monitoring-prometheus + name: knative-operator-webhook subjects: - kind: ServiceAccount - name: monitoring-prometheus - namespace: union ---- -# Source: dataplane/charts/opencost/templates/clusterrolebinding.yaml + name: operator-webhook + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: - name: release-name-opencost + name: knative-serving-operator-aggregated labels: - helm.sh/chart: opencost-1.42.0 - app.kubernetes.io/name: opencost - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "1.111.0" - app.kubernetes.io/part-of: opencost - app.kubernetes.io/managed-by: Helm + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole - name: release-name-opencost + name: knative-serving-operator-aggregated subjects: - kind: ServiceAccount - name: release-name-opencost - namespace: union + name: knative-operator + namespace: "union" --- -# Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: - name: union-clustersync-resource + name: 
knative-serving-operator-aggregated-stable + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole - name: union-clustersync-resource + name: knative-serving-operator-aggregated-stable subjects: - kind: ServiceAccount - name: union-clustersync-system - namespace: union + name: knative-operator + namespace: "union" --- -# Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: - name: union-clustersync-auth-delegator + name: knative-eventing-operator-aggregated + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole - name: system:auth-delegator + name: knative-eventing-operator-aggregated subjects: - kind: ServiceAccount - name: union-clustersync-system - namespace: union + name: knative-operator + namespace: "union" --- -# Source: dataplane/templates/nodeexecutor/serviceaccount.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: - name: union-executor + name: knative-eventing-operator-aggregated-stable labels: - app: executor + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole - name: union-executor + name: knative-eventing-operator-aggregated-stable subjects: -- kind: ServiceAccount - name: executor - namespace: union + - kind: ServiceAccount + name: knative-operator + namespace: "union" --- -# Source: dataplane/templates/operator/serviceaccount-proxy.yaml -apiVersion: rbac.authorization.k8s.io/v1 +# Source: dataplane/charts/monitoring/charts/grafana/templates/clusterrolebinding.yaml kind: ClusterRoleBinding 
+apiVersion: rbac.authorization.k8s.io/v1 metadata: - name: proxy-system + name: release-name-grafana-clusterrolebinding labels: - app.kubernetes.io/name: operator-proxy + helm.sh/chart: grafana-10.4.0 + app.kubernetes.io/name: grafana app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: proxy-system + app.kubernetes.io/version: "12.3.0" subjects: - kind: ServiceAccount - name: proxy-system + name: release-name-grafana namespace: union +roleRef: + kind: ClusterRole + name: release-name-grafana-clusterrole + apiGroup: rbac.authorization.k8s.io --- -# Source: dataplane/templates/operator/serviceaccount.yaml +# Source: dataplane/charts/monitoring/charts/kube-state-metrics/templates/clusterrolebinding.yaml apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: - name: operator-system - labels: - app.kubernetes.io/name: union-operator - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name + labels: + helm.sh/chart: kube-state-metrics-7.0.0 app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: metrics + app.kubernetes.io/part-of: monitoring-kube-state-metrics + app.kubernetes.io/name: monitoring-kube-state-metrics + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2.17.0" + release: release-name + name: monitoring-kube-state-metrics roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole - name: operator-system + name: monitoring-kube-state-metrics subjects: - - kind: ServiceAccount - name: operator-system - namespace: union +- kind: ServiceAccount + name: monitoring-kube-state-metrics + namespace: union --- -# Source: dataplane/templates/prometheus/rbac.yaml +# Source: dataplane/charts/monitoring/templates/prometheus-operator/clusterrolebinding.yaml apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: - 
name: union-operator-prometheus + name: monitoring-operator labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "80.8.0" + app.kubernetes.io/part-of: monitoring + chart: monitoring-80.8.0 + release: "release-name" + heritage: "Helm" + app: monitoring-operator + app.kubernetes.io/name: monitoring-prometheus-operator + app.kubernetes.io/component: prometheus-operator roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole - name: union-operator-prometheus + name: monitoring-operator subjects: - - kind: ServiceAccount - name: union-operator-prometheus - namespace: union ---- -# Source: dataplane/templates/propeller/serviceaccount-webhook.yaml -# Create a binding from Role -> ServiceAccount -kind: ClusterRoleBinding -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: flytepropeller-webhook-binding +- kind: ServiceAccount + name: monitoring-operator namespace: union -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: flytepropeller-webhook-role -subjects: - - kind: ServiceAccount - name: flytepropeller-webhook-system - namespace: union --- -# Source: dataplane/templates/propeller/serviceaccount.yaml +# Source: dataplane/charts/monitoring/templates/prometheus/clusterrolebinding.yaml apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: - name: flytepropeller-binding + name: monitoring-prometheus + labels: + app: monitoring-prometheus + + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "80.8.0" + app.kubernetes.io/part-of: monitoring + chart: monitoring-80.8.0 + release: "release-name" + heritage: "Helm" roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole - name: 
flytepropeller-role + name: monitoring-prometheus subjects: - kind: ServiceAccount - name: flytepropeller-system + name: monitoring-prometheus namespace: union --- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + namespace: "union" + name: knative-operator-webhook + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +rules: + # For manipulating certs into secrets. 
+ - apiGroups: + - "" + resources: + - "secrets" + verbs: + - "get" + - "create" + - "update" + - "list" + - "watch" + - "patch" +--- # Source: dataplane/charts/monitoring/charts/grafana/templates/role.yaml apiVersion: rbac.authorization.k8s.io/v1 kind: Role @@ -5863,11 +5680,189 @@ metadata: app.kubernetes.io/version: "12.3.0" rules: [] --- +# Source: dataplane/templates/monitoring/prometheus-rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: union-operator-prometheus-rbac + namespace: union + labels: + release: release-name +rules: + # Prometheus server scrape permissions + - apiGroups: [""] + resources: + - nodes + - nodes/proxy + - nodes/metrics + - services + - endpoints + - pods + - ingresses + - configmaps + verbs: + - get + - list + - watch + - apiGroups: ["extensions", "networking.k8s.io"] + resources: + - ingresses/status + - ingresses + verbs: + - get + - list + - watch + - apiGroups: ["discovery.k8s.io"] + resources: + - endpointslices + verbs: + - get + - list + - watch + # kube-state-metrics permissions (rbac.create=false on the subchart; managed here instead) + - apiGroups: ["certificates.k8s.io"] + resources: ["certificatesigningrequests"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["configmaps"] + verbs: ["list", "watch"] + - apiGroups: ["batch"] + resources: ["cronjobs"] + verbs: ["list", "watch"] + - apiGroups: ["extensions", "apps"] + resources: ["daemonsets"] + verbs: ["list", "watch"] + - apiGroups: ["extensions", "apps"] + resources: ["deployments"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["endpoints"] + verbs: ["list", "watch"] + - apiGroups: ["discovery.k8s.io"] + resources: ["endpointslices"] + verbs: ["list", "watch"] + - apiGroups: ["autoscaling"] + resources: ["horizontalpodautoscalers"] + verbs: ["list", "watch"] + - apiGroups: ["extensions", "networking.k8s.io"] + resources: ["ingresses"] + verbs: ["list", "watch"] + - apiGroups: ["batch"] + resources: ["jobs"] + 
verbs: ["list", "watch"] + - apiGroups: ["coordination.k8s.io"] + resources: ["leases"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["limitranges"] + verbs: ["list", "watch"] + - apiGroups: ["admissionregistration.k8s.io"] + resources: ["mutatingwebhookconfigurations"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["namespaces"] + verbs: ["list", "watch"] + - apiGroups: ["networking.k8s.io"] + resources: ["networkpolicies"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["persistentvolumeclaims"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["persistentvolumes"] + verbs: ["list", "watch"] + - apiGroups: ["policy"] + resources: ["poddisruptionbudgets"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["pods"] + verbs: ["list", "watch"] + - apiGroups: ["extensions", "apps"] + resources: ["replicasets"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["replicationcontrollers"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["resourcequotas"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["secrets"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["services"] + verbs: ["list", "watch"] + - apiGroups: ["apps"] + resources: ["statefulsets"] + verbs: ["list", "watch"] + - apiGroups: ["storage.k8s.io"] + resources: ["storageclasses"] + verbs: ["list", "watch"] + - apiGroups: ["admissionregistration.k8s.io"] + resources: ["validatingwebhookconfigurations"] + verbs: ["list", "watch"] + - apiGroups: ["storage.k8s.io"] + resources: ["volumeattachments"] + verbs: ["list", "watch"] +--- +# Source: dataplane/templates/nodeexecutor/serviceaccount.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: union-executor + labels: + app: executor +rules: +# Allow RO access to PODS +- apiGroups: + - "" + resources: + - pods + verbs: + - get + - list + - watch +# Allow Event recording access +- apiGroups: + - "" + resources: + - events + verbs: + - 
create + - update + - delete + - patch +# Allow Access All plugin objects +- apiGroups: + - '*' + resources: + - '*' + verbs: + - get + - list + - watch + - create + - update + - delete + - patch +# Allow Access to CRD +- apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - get + - list + - watch + - create + - delete + - update +--- # Source: dataplane/templates/operator/serviceaccount-proxy-secret.yaml apiVersion: rbac.authorization.k8s.io/v1 kind: Role metadata: - name: proxy-system-secret + name: union-system-secret namespace: union labels: app.kubernetes.io/name: operator-proxy @@ -5886,6 +5881,31 @@ rules: - update - delete --- +# Source: dataplane/templates/operator/serviceaccount-proxy.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: proxy-system + labels: + app.kubernetes.io/name: operator-proxy + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +rules: + - apiGroups: + - '*' + resources: + - events + - flyteworkflows + - pods/log + - pods + - rayjobs + - resourcequotas + verbs: + - get + - list + - watch +--- # Source: dataplane/templates/operator/serviceaccount.yaml apiVersion: rbac.authorization.k8s.io/v1 kind: Role @@ -5908,32 +5928,180 @@ rules: - watch - create - update + - apiGroups: + - flyte.lyft.com + resources: + - flyteworkflows + - flyteworkflows/finalizers + verbs: + - get + - list + - watch + - create + - update + - delete + - patch + - post + - deletecollection + - apiGroups: + - '*' + resources: + - resourcequotas + - pods + - configmaps + - podtemplates + - secrets + - namespaces + - nodes + verbs: + - get + - list + - watch + - create + - update + - delete + - apiGroups: + - serving.knative.dev + resources: + - revisions + - configurations + - services + verbs: + - get + - list + - watch + - create + - update + - delete +--- +# Source: dataplane/templates/webhook/serviceaccount.yaml +kind: Role 
+apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: union-webhook-role + namespace: union +rules: + - apiGroups: + - "*" + resources: + - mutatingwebhookconfigurations + - secrets + - pods + - replicasets/finalizers + verbs: + - get + - create + - update + - patch +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + namespace: "union" + name: operator-webhook + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +subjects: + - kind: ServiceAccount + name: operator-webhook + namespace: "union" +roleRef: + kind: Role + name: knative-operator-webhook + apiGroup: rbac.authorization.k8s.io +--- +# Source: dataplane/charts/monitoring/charts/grafana/templates/rolebinding.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: release-name-grafana + namespace: union + labels: + helm.sh/chart: grafana-10.4.0 + app.kubernetes.io/name: grafana + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "12.3.0" +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: release-name-grafana +subjects: +- kind: ServiceAccount + name: release-name-grafana + namespace: union +--- +# Source: dataplane/templates/monitoring/prometheus-rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: 
RoleBinding +metadata: + name: union-operator-prometheus-rbac + namespace: union + labels: + release: release-name +subjects: + - kind: ServiceAccount + name: union-operator-prometheus + namespace: union +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: union-operator-prometheus-rbac +--- +# Source: dataplane/templates/monitoring/prometheus-rbac.yaml +# kube-state-metrics service account RoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: release-name-prometheus-kube-state-metrics + namespace: union + labels: + release: release-name +subjects: + - kind: ServiceAccount + name: release-name-prometheus-kube-state-metrics + namespace: union +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: union-operator-prometheus-rbac --- -# Source: dataplane/charts/monitoring/charts/grafana/templates/rolebinding.yaml +# Source: dataplane/templates/nodeexecutor/serviceaccount.yaml apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding metadata: - name: release-name-grafana - namespace: union + name: union-executor labels: - helm.sh/chart: grafana-10.4.0 - app.kubernetes.io/name: grafana - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "12.3.0" + app: executor roleRef: apiGroup: rbac.authorization.k8s.io kind: Role - name: release-name-grafana + name: union-executor subjects: - kind: ServiceAccount - name: release-name-grafana + name: union-system namespace: union --- # Source: dataplane/templates/operator/serviceaccount-proxy-secret.yaml apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding metadata: - name: proxy-system-secret + name: union-system-secret namespace: union labels: app.kubernetes.io/name: operator-proxy @@ -5943,10 +6111,29 @@ metadata: roleRef: apiGroup: rbac.authorization.k8s.io kind: Role - name: proxy-system-secret + name: union-system-secret +subjects: + - kind: ServiceAccount + name: union-system + namespace: union +--- +# Source: 
dataplane/templates/operator/serviceaccount-proxy.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: proxy-system + labels: + app.kubernetes.io/name: operator-proxy + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: proxy-system subjects: - kind: ServiceAccount - name: proxy-system + name: union-system namespace: union --- # Source: dataplane/templates/operator/serviceaccount.yaml @@ -5965,7 +6152,23 @@ roleRef: name: operator-system subjects: - kind: ServiceAccount - name: operator-system + name: union-system + namespace: union +--- +# Source: dataplane/templates/webhook/serviceaccount.yaml +# Create a binding from Role -> ServiceAccount +kind: RoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: union-webhook-binding + namespace: union +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: union-webhook-role +subjects: + - kind: ServiceAccount + name: union-system namespace: union --- # Source: dataplane/charts/fluentbit/templates/service.yaml @@ -5991,33 +6194,45 @@ spec: app.kubernetes.io/name: fluentbit app.kubernetes.io/instance: release-name --- -# Source: dataplane/charts/kube-state-metrics/templates/service.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + apiVersion: v1 kind: Service metadata: - name: release-name-kube-state-metrics - namespace: union - labels: - helm.sh/chart: kube-state-metrics-5.30.1 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: kube-state-metrics - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" - annotations: - prometheus.io/scrape: 'true' + labels: + role: operator-webhook + app.kubernetes.io/component: operator-webhook + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator + name: operator-webhook + namespace: "union" spec: - type: "ClusterIP" ports: - - name: "http" - protocol: TCP - port: 8080 - targetPort: 8080 - - selector: - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: release-name + # Define metrics and profiling for them to be accessible within service meshes. 
+ - name: http-metrics + port: 9090 + targetPort: 9090 + - name: http-profiling + port: 8008 + targetPort: 8008 + - name: https-webhook + port: 443 + targetPort: 8443 + selector: + role: operator-webhook --- # Source: dataplane/charts/monitoring/charts/grafana/templates/service.yaml apiVersion: v1 @@ -6298,28 +6513,60 @@ spec: sessionAffinity: None type: "ClusterIP" --- -# Source: dataplane/charts/opencost/templates/service.yaml +# Source: dataplane/charts/prometheus/charts/kube-state-metrics/templates/service.yaml apiVersion: v1 kind: Service metadata: - name: release-name-opencost + name: release-name-kube-state-metrics namespace: union + labels: + helm.sh/chart: kube-state-metrics-5.25.1 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: metrics + app.kubernetes.io/part-of: kube-state-metrics + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2.13.0" + annotations: + prometheus.io/scrape: 'true' +spec: + type: "ClusterIP" + ports: + - name: "http" + protocol: TCP + port: 8080 + targetPort: 8080 + + selector: + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/instance: release-name +--- +# Source: dataplane/charts/prometheus/templates/service.yaml +apiVersion: v1 +kind: Service +metadata: labels: - helm.sh/chart: opencost-1.42.0 - app.kubernetes.io/name: opencost + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "1.111.0" - app.kubernetes.io/part-of: opencost + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: prometheus + name: union-operator-prometheus + namespace: union spec: + ports: + - name: http + port: 80 + protocol: TCP + targetPort: 9090 selector: - app.kubernetes.io/name: opencost + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus 
app.kubernetes.io/instance: release-name + sessionAffinity: None type: "ClusterIP" - ports: - - name: http - port: 9003 - targetPort: 9003 --- # Source: dataplane/templates/clusterresourcesync/service.yaml apiVersion: v1 @@ -6343,6 +6590,31 @@ spec: app.kubernetes.io/name: clusterresourcesync app.kubernetes.io/instance: release-name --- +# Source: dataplane/templates/flyteconnector/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: flyteconnector + namespace: union + labels: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + app.kubernetes.io/managed-by: Helm +spec: + clusterIP: None + ports: + - name: grpc + port: 8000 + protocol: TCP + targetPort: grpc + - name: metric + port: 9090 + protocol: TCP + targetPort: metric + selector: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name +--- # Source: dataplane/templates/imagebuilder/service.yaml apiVersion: v1 kind: Service @@ -6368,7 +6640,7 @@ spec: apiVersion: v1 kind: Service metadata: - name: release-name-dataplane-executor + name: union-operator-executor labels: platform.union.ai/prometheus-group: "union-services" app: executor @@ -6434,39 +6706,14 @@ spec: app.kubernetes.io/name: union-operator app.kubernetes.io/instance: release-name --- -# Source: dataplane/templates/prometheus/service.yaml -apiVersion: v1 -kind: Service -metadata: - name: union-operator-prometheus - namespace: union - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus -spec: - type: ClusterIP - ports: - - port: 80 - targetPort: 9090 - protocol: TCP - name: http - selector: - app.kubernetes.io/component: prometheus - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name ---- -# Source: 
dataplane/templates/propeller/service-webhook.yaml +# Source: dataplane/templates/webhook/service.yaml apiVersion: v1 kind: Service metadata: - name: flyte-pod-webhook + name: union-pod-webhook namespace: union labels: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm @@ -6474,7 +6721,7 @@ metadata: projectcontour.io/upstream-protocol.h2c: grpc spec: selector: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name ports: - name: https @@ -6486,23 +6733,23 @@ spec: port: 10254 targetPort: 10254 --- -# Source: dataplane/templates/propeller/service-webhook.yaml +# Source: dataplane/templates/webhook/service.yaml # Headless Service for cache invalidation — resolves to all pod IPs so that # we can fan out invalidation requests to every webhook replica. apiVersion: v1 kind: Service metadata: - name: flyte-pod-webhook-headless + name: union-pod-webhook-headless namespace: union labels: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm spec: clusterIP: None selector: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name ports: - name: cache-internal @@ -6510,32 +6757,6 @@ spec: port: 9443 targetPort: 9443 --- -# Source: dataplane/templates/propeller/service.yaml -apiVersion: v1 -kind: Service -metadata: - namespace: union - name: flytepropeller - labels: - platform.union.ai/prometheus-group: "union-services" - app.kubernetes.io/name: flytepropeller - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm -spec: - type: ClusterIP - ports: - - 
name: debug - protocol: TCP - port: 10254 - - name: fasttask - port: 15605 - protocol: TCP - targetPort: 15605 - selector: - app.kubernetes.io/name: flytepropeller - app.kubernetes.io/instance: release-name ---- # Source: dataplane/charts/fluentbit/templates/daemonset.yaml apiVersion: apps/v1 kind: DaemonSet @@ -6561,7 +6782,7 @@ spec: annotations: checksum/config: e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 spec: - serviceAccountName: fluentbit-system + serviceAccountName: union-system hostNetwork: false dnsPolicy: ClusterFirst containers: @@ -6746,90 +6967,202 @@ spec: hostPath: path: / --- -# Source: dataplane/charts/kube-state-metrics/templates/deployment.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ apiVersion: apps/v1 kind: Deployment metadata: - name: release-name-kube-state-metrics - namespace: union - labels: - helm.sh/chart: kube-state-metrics-5.30.1 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: kube-state-metrics - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" + name: operator-webhook + namespace: "union" + labels: + app.kubernetes.io/component: operator-webhook + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator spec: selector: - matchLabels: - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: release-name - replicas: 1 - strategy: - type: RollingUpdate - revisionHistoryLimit: 10 + matchLabels: + app: operator-webhook + role: operator-webhook template: metadata: - labels: - helm.sh/chart: kube-state-metrics-5.30.1 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: kube-state-metrics - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" + labels: + app: operator-webhook + role: operator-webhook + app.kubernetes.io/component: operator-webhook + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator + sidecar.istio.io/inject: "false" spec: - automountServiceAccountToken: true - hostNetwork: false - serviceAccountName: release-name-kube-state-metrics - securityContext: - fsGroup: 65534 - runAsGroup: 65534 - runAsNonRoot: true - runAsUser: 65534 - seccompProfile: - type: RuntimeDefault + # To avoid node becoming SPOF, spread our replicas to different nodes. 
+ affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - podAffinityTerm: + labelSelector: + matchLabels: + app: operator-webhook + topologyKey: kubernetes.io/hostname + weight: 100 + serviceAccountName: operator-webhook containers: - - name: kube-state-metrics - args: - - --port=8080 - - --resources=certificatesigningrequests,configmaps,cronjobs,daemonsets,deployments,endpoints,horizontalpodautoscalers,ingresses,jobs,leases,limitranges,mutatingwebhookconfigurations,namespaces,networkpolicies,nodes,persistentvolumeclaims,persistentvolumes,poddisruptionbudgets,pods,replicasets,replicationcontrollers,resourcequotas,secrets,services,statefulsets,storageclasses,validatingwebhookconfigurations,volumeattachments - imagePullPolicy: IfNotPresent - image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.15.0 - ports: - - containerPort: 8080 - name: "http" - livenessProbe: - failureThreshold: 3 - httpGet: - httpHeaders: - path: /livez - port: 8080 - scheme: HTTP - initialDelaySeconds: 5 - periodSeconds: 10 - successThreshold: 1 - timeoutSeconds: 5 - readinessProbe: - failureThreshold: 3 - httpGet: - httpHeaders: - path: /readyz - port: 8081 - scheme: HTTP - initialDelaySeconds: 5 - periodSeconds: 10 - successThreshold: 1 - timeoutSeconds: 5 - resources: - {} - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - readOnlyRootFilesystem: true + - name: operator-webhook + # This is the Go import path for the binary that is containerized + # and substituted here. 
+ image: gcr.io/knative-releases/knative.dev/operator/cmd/webhook@sha256:d3a7a3304629ccfcaacec620b50736e0b4c902a6328b83369b6cbaf7e94677c9 + resources: + requests: + cpu: 100m + memory: 100Mi + limits: + cpu: 500m + memory: 500Mi + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: SYSTEM_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: CONFIG_LOGGING_NAME + value: config-logging + - name: CONFIG_OBSERVABILITY_NAME + value: config-observability + - name: WEBHOOK_NAME + value: operator-webhook + - name: WEBHOOK_PORT + value: "8443" + - name: WEBHOOK_SECRET_NAME + value: operator-webhook-certs + - name: METRICS_DOMAIN + value: knative.dev/operator + - name: KUBERNETES_MIN_VERSION + value: "" + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + runAsNonRoot: true + capabilities: + drop: + - ALL + ports: + - name: metrics + containerPort: 9090 + - name: profiling + containerPort: 8008 + - name: https-webhook + containerPort: 8443 + readinessProbe: + periodSeconds: 1 + httpGet: + scheme: HTTPS + port: 8443 + httpHeaders: + - name: k-kubelet-probe + value: "webhook" + livenessProbe: + periodSeconds: 1 + httpGet: + scheme: HTTPS + port: 8443 + httpHeaders: + - name: k-kubelet-probe + value: "webhook" + failureThreshold: 6 + initialDelaySeconds: 120 + # Our webhook should gracefully terminate by lame ducking first, set this to a sufficiently + # high value that we respect whatever value it has configured for the lame duck grace period. + terminationGracePeriodSeconds: 300 +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: knative-operator + namespace: "union" + labels: + app.kubernetes.io/name: knative-operator + app.kubernetes.io/version: "1.16.0" +spec: + replicas: 1 + selector: + matchLabels: + name: knative-operator + template: + metadata: + labels: + name: knative-operator + app.kubernetes.io/name: knative-operator + app.kubernetes.io/version: "1.16.0" + sidecar.istio.io/inject: "false" + spec: + serviceAccountName: knative-operator + containers: + - name: knative-operator + image: gcr.io/knative-releases/knative.dev/operator/cmd/operator@sha256:0b5a3532417f9c8e7b6044e23f6f67ad932a74a40a4092ad965b6f173b2fd887 + resources: + requests: + cpu: 100m + memory: 100Mi + limits: + cpu: 1000m + memory: 1000Mi + imagePullPolicy: IfNotPresent + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: SYSTEM_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: METRICS_DOMAIN + value: knative.dev/operator + - name: CONFIG_LOGGING_NAME + value: config-logging + - name: CONFIG_OBSERVABILITY_NAME + value: config-observability + - name: KUBERNETES_MIN_VERSION + value: "" + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + runAsNonRoot: true + capabilities: + drop: + - ALL + ports: + - name: metrics + containerPort: 9090 --- # Source: dataplane/charts/monitoring/charts/grafana/templates/deployment.yaml apiVersion: apps/v1 @@ -7244,201 +7577,290 @@ spec: automountServiceAccountToken: true terminationGracePeriodSeconds: 30 --- -# Source: 
dataplane/charts/opencost/templates/deployment.yaml +# Source: dataplane/charts/prometheus/charts/kube-state-metrics/templates/deployment.yaml apiVersion: apps/v1 kind: Deployment metadata: - name: release-name-opencost + name: release-name-kube-state-metrics namespace: union + labels: + helm.sh/chart: kube-state-metrics-5.25.1 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: metrics + app.kubernetes.io/part-of: kube-state-metrics + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2.13.0" +spec: + selector: + matchLabels: + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/instance: release-name + replicas: 1 + strategy: + type: RollingUpdate + revisionHistoryLimit: 10 + template: + metadata: + labels: + helm.sh/chart: kube-state-metrics-5.25.1 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: metrics + app.kubernetes.io/part-of: kube-state-metrics + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2.13.0" + spec: + automountServiceAccountToken: true + hostNetwork: false + serviceAccountName: release-name-kube-state-metrics + securityContext: + fsGroup: 65534 + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + seccompProfile: + type: RuntimeDefault + containers: + - name: kube-state-metrics + args: + - --port=8080 + - --resources=certificatesigningrequests,configmaps,cronjobs,daemonsets,deployments,endpoints,horizontalpodautoscalers,ingresses,jobs,leases,limitranges,mutatingwebhookconfigurations,namespaces,networkpolicies,nodes,persistentvolumeclaims,persistentvolumes,poddisruptionbudgets,pods,replicasets,replicationcontrollers,resourcequotas,secrets,services,statefulsets,storageclasses,validatingwebhookconfigurations,volumeattachments + - --namespaces=union + imagePullPolicy: IfNotPresent + image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.13.0 + ports: + - 
containerPort: 8080 + name: "http" + livenessProbe: + failureThreshold: 3 + httpGet: + httpHeaders: + path: /livez + port: 8080 + scheme: HTTP + initialDelaySeconds: 5 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 5 + readinessProbe: + failureThreshold: 3 + httpGet: + httpHeaders: + path: /readyz + port: 8080 + scheme: HTTP + initialDelaySeconds: 5 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 5 + resources: + {} + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true +--- +# Source: dataplane/charts/prometheus/templates/deploy.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: labels: - helm.sh/chart: opencost-1.42.0 - app.kubernetes.io/name: opencost + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "1.111.0" - app.kubernetes.io/part-of: opencost + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: prometheus + name: union-operator-prometheus + namespace: union spec: - replicas: 1 selector: matchLabels: - app.kubernetes.io/name: opencost + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name - strategy: - rollingUpdate: - maxSurge: 1 - maxUnavailable: 1 - type: RollingUpdate + replicas: 1 + revisionHistoryLimit: 10 + strategy: + type: Recreate + rollingUpdate: null template: metadata: labels: - app.kubernetes.io/name: opencost + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: prometheus spec: - serviceAccountName: release-name-opencost + enableServiceLinks: true + serviceAccountName: union-operator-prometheus containers: - - name: 
release-name-opencost - image: ghcr.io/opencost/opencost:1.111.0@sha256:6aa68e52a24b14ba41f23db08d1b9db1429a1c0300f4c0381ecc2c61fc311a97 - imagePullPolicy: IfNotPresent + - name: prometheus-server-configmap-reload + image: "quay.io/prometheus-operator/prometheus-config-reloader:v0.76.0" + imagePullPolicy: "IfNotPresent" args: + - --watched-dir=/etc/config + - --listen-address=0.0.0.0:8080 + - --reload-url=http://127.0.0.1:9090/prometheus/-/reload ports: - - containerPort: 9003 - name: http - resources: - limits: - cpu: 1000m - memory: 4Gi - requests: - cpu: 500m - memory: 1Gi - startupProbe: - httpGet: - path: /healthz - port: 9003 - initialDelaySeconds: 10 - periodSeconds: 5 - failureThreshold: 30 + - containerPort: 8080 + name: metrics livenessProbe: httpGet: path: /healthz - port: 9003 - initialDelaySeconds: 10 - periodSeconds: 20 - failureThreshold: 3 + port: metrics + scheme: HTTP + initialDelaySeconds: 2 + periodSeconds: 10 readinessProbe: httpGet: path: /healthz - port: 9003 - initialDelaySeconds: 10 + port: metrics + scheme: HTTP periodSeconds: 10 + volumeMounts: + - name: config-volume + mountPath: /etc/config + readOnly: true + + - name: prometheus-server + image: "quay.io/prometheus/prometheus:v2.54.1" + imagePullPolicy: "IfNotPresent" + args: + - --storage.tsdb.retention.time=3d + - --config.file=/etc/config/prometheus.yml + - --storage.tsdb.path=/data + - --web.console.libraries=/etc/prometheus/console_libraries + - --web.console.templates=/etc/prometheus/consoles + - --web.enable-lifecycle + - --web.route-prefix=/prometheus + - --web.external-url=/prometheus/ + ports: + - containerPort: 9090 + readinessProbe: + httpGet: + path: /prometheus/-/ready + port: 9090 + scheme: HTTP + initialDelaySeconds: 30 + periodSeconds: 5 + timeoutSeconds: 4 failureThreshold: 3 - env: - - name: LOG_LEVEL - value: info - - name: CUSTOM_COST_ENABLED - value: "false" - - name: KUBECOST_NAMESPACE - value: union - - name: API_PORT - value: "9003" - - name: 
PROMETHEUS_SERVER_ENDPOINT - value: "http://union-operator-prometheus.union.svc:80/prometheus" - - name: CLUSTER_ID - value: "default-cluster" - - name: DATA_RETENTION_DAILY_RESOLUTION_DAYS - value: "15" - - name: CLOUD_COST_ENABLED - value: "false" - - name: CLOUD_COST_MONTH_TO_DATE_INTERVAL - value: "6" - - name: CLOUD_COST_REFRESH_RATE_HOURS - value: "6" - - name: CLOUD_COST_QUERY_WINDOW_DAYS - value: "7" - - name: CLOUD_COST_RUN_WINDOW_DAYS - value: "3" - # Add any additional provided variables ---- -# Source: dataplane/templates/clusterresourcesync/deployment.yaml + successThreshold: 1 + livenessProbe: + httpGet: + path: /prometheus/-/healthy + port: 9090 + scheme: HTTP + initialDelaySeconds: 30 + periodSeconds: 15 + timeoutSeconds: 10 + failureThreshold: 3 + successThreshold: 1 + resources: + limits: + cpu: "3" + memory: 3500Mi + requests: + cpu: "1" + memory: 1Gi + volumeMounts: + - name: config-volume + mountPath: /etc/config + - name: storage-volume + mountPath: /data + subPath: "" + - name: server-recording-rules + mountPath: /etc/config/recording + subPath: + readOnly: + dnsPolicy: ClusterFirst + securityContext: + fsGroup: 65534 + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + terminationGracePeriodSeconds: 300 + volumes: + - name: config-volume + configMap: + name: union-operator-prometheus + - name: server-recording-rules + configMap: + name: union-recording-rules + - name: storage-volume + emptyDir: + sizeLimit: 10Gi +--- +# Source: dataplane/templates/flyteconnector/deployment.yaml apiVersion: apps/v1 kind: Deployment metadata: - name: union-syncresources + name: flyteconnector namespace: union - labels: - app.kubernetes.io/name: clusterresourcesync + labels: + app.kubernetes.io/name: flyteconnector app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm spec: - replicas: 1 + replicas: 2 selector: - matchLabels: - app.kubernetes.io/name: clusterresourcesync + 
matchLabels: + app.kubernetes.io/name: flyteconnector app.kubernetes.io/instance: release-name template: metadata: annotations: - configChecksum: "b89195349d21821191ee5fb1fc7860a8b7f731def169473cb1b19bf1ccf3a5b" - labels: platform.union.ai/zone: "dataplane" - app.kubernetes.io/name: clusterresourcesync + app.kubernetes.io/name: flyteconnector app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm spec: containers: - command: - - clusterresource - - --config - - /etc/flyte/config/*.yaml - - clusterresource - - run - env: - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: GOMEMLIMIT - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.memory - - name: GOMAXPROCS - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.cpu - - name: CLUSTER_NAME - valueFrom: - secretKeyRef: - name: operator-cluster-name - key: cluster_name - - name: DEPLOYMENT_NAME - value: operator - - name: PROXY_SERVICE_URL - value: http://union-operator-proxy:8080 - - name: PROMETHEUS_SERVICE_URL - value: http://union-operator-prometheus:80 - - name: KNATIVE_PROXY_SERVICE_URL - value: http://kourier-internal - image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" + - c0 + image: "ghcr.io/flyteorg/flyte-connectors:py3.13-2.0.0b50.dev3-g695bb1db3.d20260122" imagePullPolicy: "IfNotPresent" - name: sync-cluster-resources + name: flyteconnector + readinessProbe: + failureThreshold: 3 + httpGet: + path: / + port: 9090 + scheme: HTTP + initialDelaySeconds: 5 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 5 + env: + - name: AIRFLOW_CONN_GOOGLE_CLOUD_DEFAULT + value: "google-cloud-platform://" + - name: LOG_LEVEL + value: "20" + ports: + - containerPort: 8000 + name: grpc + - containerPort: 9090 + name: metric resources: limits: - cpu: "1" - memory: 500Mi + cpu: "1.5" + ephemeral-storage: 100Mi 
+ memory: 1500Mi requests: - cpu: 500m - memory: 100Mi - volumeMounts: - - name: auth - mountPath: /etc/union/secret - - name: resource-templates - mountPath: /etc/flyte/clusterresource/templates - - name: config-volume - mountPath: /etc/flyte/config - ports: - - name: debug - containerPort: 10254 - protocol: TCP - serviceAccountName: union-clustersync-system - volumes: - - configMap: - name: union-clusterresource-template - name: resource-templates - - configMap: - name: union-clusterresourcesync-config - name: config-volume - - name: auth - secret: - secretName: union-secret-auth + cpu: "1" + ephemeral-storage: 100Mi + memory: 1000Mi + serviceAccountName: flyteconnector --- # Source: dataplane/templates/imagebuilder/deployment.yaml apiVersion: apps/v1 @@ -7460,17 +7882,15 @@ spec: app.kubernetes.io/instance: release-name template: metadata: - annotations: - container.apparmor.security.beta.kubernetes.io/buildkit: unconfined labels: platform.union.ai/zone: "dataplane" app.kubernetes.io/name: imagebuilder-buildkit app.kubernetes.io/instance: release-name spec: - serviceAccountName: "union-imagebuilder" + serviceAccountName: "union-system" containers: - name: "buildkit" - image: "docker.io/moby/buildkit:buildx-stable-1-rootless" + image: "docker.io/moby/buildkit:buildx-stable-1" imagePullPolicy: IfNotPresent env: - name: POD_NAME @@ -7505,18 +7925,15 @@ spec: - name: KNATIVE_PROXY_SERVICE_URL value: http://kourier-internal volumeMounts: - - mountPath: /home/user/.local/share/buildkit - name: buildkitd - mountPath: /etc/buildkit name: buildkit-config args: - --config - /etc/buildkit/buildkitd.toml - --addr - - unix:///run/user/1000/buildkit/buildkitd.sock + - unix:///run//buildkit/buildkitd.sock - --addr - tcp://0.0.0.0:1234 - - --oci-worker-no-process-sandbox ports: - name: tcp containerPort: 1234 @@ -7538,18 +7955,13 @@ spec: initialDelaySeconds: 5 periodSeconds: 30 securityContext: - seccompProfile: # Needs Kubernetes >= 1.19 - type: Unconfined - runAsUser: 1000 - 
runAsGroup: 1000 + privileged: true resources: requests: - cpu: 1 - ephemeral-storage: 20Gi - memory: 1Gi + cpu: 4 + ephemeral-storage: 50Gi + memory: 4Gi volumes: - - name: buildkitd - emptyDir: {} - configMap: name: union-operator-buildkit name: buildkit-config @@ -7578,16 +7990,18 @@ spec: template: metadata: annotations: - configChecksum: "64f28ca5411984cdabd3525b7aa976f59585963e18cdcdb42787bc61f930e8d" + configChecksum: "de0693f00d848b309738a692ce47b9db0d1896df0ddddc1391ca9f50a3f4c5a" labels: platform.union.ai/zone: "dataplane" app: executor + app.kubernetes.io/instance: 'release-name' + app.kubernetes.io/name: executor spec: securityContext: fsGroup: 1337 - serviceAccountName: executor + serviceAccountName: union-system volumes: - name: config-volume configMap: @@ -7603,7 +8017,7 @@ spec: image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" imagePullPolicy: IfNotPresent command: - - executor + - executorv2 - serve - --config - /etc/config/*.yaml @@ -7681,7 +8095,7 @@ spec: template: metadata: annotations: - configChecksum: "62355570fcd9b811440c9e2742ebfc5076dac141f095e26b4f4ce7eaedee597" + configChecksum: "51e140ef07908a5d02c6821f38e9947b5630807e493b9b60fa5cb810beb541c" labels: @@ -7696,12 +8110,10 @@ spec: sources: - configMap: name: union-operator - - configMap: - name: union-clusterresourcesync-config - name: secret-volume secret: secretName: union-secret-auth - serviceAccountName: proxy-system + serviceAccountName: union-system securityContext: {} containers: @@ -7818,7 +8230,7 @@ spec: template: metadata: annotations: - configChecksum: "62355570fcd9b811440c9e2742ebfc5076dac141f095e26b4f4ce7eaedee597" + configChecksum: "51e140ef07908a5d02c6821f38e9947b5630807e493b9b60fa5cb810beb541c" labels: @@ -7827,7 +8239,7 @@ spec: platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm spec: - serviceAccountName: operator-system + serviceAccountName: union-system securityContext: {} volumes: @@ -7908,81 +8320,16 @@ spec: containerPort: 
10254 protocol: TCP --- -# Source: dataplane/templates/prometheus/deployment.yaml -apiVersion: apps/v1 -kind: Deployment -metadata: - name: union-operator-prometheus - namespace: union - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus -spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/component: prometheus - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - template: - metadata: - annotations: - configChecksum: "72a39ca838f02fc499675b5708fa77e2a4938e316a99315b59e33ef7f2d31ae" - labels: - app.kubernetes.io/component: prometheus - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - spec: - priorityClassName: system-cluster-critical - serviceAccountName: union-operator-prometheus - securityContext: - fsGroup: 65534 - runAsNonRoot: true - runAsUser: 65534 - fsGroupChangePolicy: OnRootMismatch - containers: - - name: prometheus - image: "prom/prometheus:v3.3.1" - args: - - --config.file=/etc/prometheus/prometheus.yml - - --web.external-url=/prometheus/ - - --web.route-prefix=/prometheus/ - - --storage.tsdb.retention.time=3d - ports: - - name: http - containerPort: 9090 - protocol: TCP - resources: - limits: - cpu: "3" - memory: 3500Mi - requests: - cpu: "1" - memory: 1Gi - securityContext: - allowPrivilegeEscalation: false - terminationMessagePolicy: FallbackToLogsOnError - volumeMounts: - - mountPath: /etc/prometheus - name: prometheus-config - volumes: - - name: prometheus-config - configMap: - name: union-operator-prometheus ---- -# Source: dataplane/templates/propeller/deployment-webhook.yaml +# Source: dataplane/templates/webhook/deployment.yaml +# Webhook deployment # Create the actual deployment apiVersion: apps/v1 kind: Deployment metadata: - name: 
flytepropeller-webhook + name: union-pod-webhook namespace: union labels: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm @@ -7990,19 +8337,19 @@ spec: replicas: 1 selector: matchLabels: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name template: metadata: labels: platform.union.ai/zone: "dataplane" - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm annotations: - configChecksum: "cbe1beeb4c6ff02d65b3aa8b3bbda0c740fca914d93dbb0bf0fae058c1dae5f" + configChecksum: "77958aee0464eee4cb43c70036cf5ffa89f47f58c908845b9eeaecc7e32bba4" spec: securityContext: @@ -8010,64 +8357,8 @@ spec: fsGroupChangePolicy: Always runAsNonRoot: true runAsUser: 1001 - seLinuxOptions: - type: spc_t - serviceAccountName: flytepropeller-webhook-system - initContainers: - - name: generate-secrets - image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" - imagePullPolicy: "IfNotPresent" - command: - - flytepropeller - args: - - webhook - - init-certs - - --config - - /etc/flyte/config/*.yaml - env: - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: GOMEMLIMIT - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.memory - - name: GOMAXPROCS - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.cpu - - name: CLUSTER_NAME - valueFrom: - secretKeyRef: - name: operator-cluster-name - key: cluster_name - - name: DEPLOYMENT_NAME - value: operator - - name: PROXY_SERVICE_URL - value: http://union-operator-proxy:8080 - - name: PROMETHEUS_SERVICE_URL - value: 
http://union-operator-prometheus:80 - - name: KNATIVE_PROXY_SERVICE_URL - value: http://kourier-internal - volumeMounts: - - name: config-volume - mountPath: /etc/flyte/config - resources: - limits: - cpu: 1 - ephemeral-storage: 500Mi - memory: 500Mi - requests: - cpu: 200m - ephemeral-storage: 500Mi - memory: 500Mi + seLinuxOptions: null + serviceAccountName: union-system containers: - name: webhook image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" @@ -8132,107 +8423,40 @@ spec: volumes: - name: config-volume configMap: - name: flyte-propeller-config + name: union-pod-webhook-config - name: webhook-certs secret: - secretName: flyte-pod-webhook + secretName: union-pod-webhook --- -# Source: dataplane/templates/propeller/deployment.yaml -apiVersion: apps/v1 -kind: Deployment +# Source: dataplane/templates/flyteconnector/hpa.yaml +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler metadata: - namespace: union - name: flytepropeller + name: flyteconnector labels: - app.kubernetes.io/name: flytepropeller + app.kubernetes.io/name: flyteconnector app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/name: flytepropeller - app.kubernetes.io/instance: release-name - template: - metadata: - annotations: - configChecksum: "cbe1beeb4c6ff02d65b3aa8b3bbda0c740fca914d93dbb0bf0fae058c1dae5f" - - labels: - platform.union.ai/zone: "dataplane" - - - app.kubernetes.io/name: flytepropeller - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm - spec: - priorityClassName: system-cluster-critical - containers: - - command: - - flytepropeller - - --config - - /etc/flyte/config/*.yaml - - --propeller.cluster-id - - '' - env: - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_NAMESPACE - valueFrom: - fieldRef: - fieldPath: 
metadata.namespace - - name: GOMEMLIMIT - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.memory - - name: GOMAXPROCS - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.cpu - - name: CLUSTER_NAME - valueFrom: - secretKeyRef: - name: operator-cluster-name - key: cluster_name - - name: DEPLOYMENT_NAME - value: operator - - name: PROXY_SERVICE_URL - value: http://union-operator-proxy:8080 - - name: PROMETHEUS_SERVICE_URL - value: http://union-operator-prometheus:80 - - name: KNATIVE_PROXY_SERVICE_URL - value: http://kourier-internal - image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" - imagePullPolicy: "IfNotPresent" - name: flytepropeller - ports: - - containerPort: 10254 - resources: - limits: - cpu: "3" - memory: 3Gi - requests: - cpu: "1" - memory: 1Gi - volumeMounts: - - name: config-volume - mountPath: /etc/flyte/config - - name: auth - mountPath: /etc/union/secret - serviceAccountName: flytepropeller-system - volumes: - - configMap: - name: flyte-propeller-config - name: config-volume - - name: auth - secret: - secretName: union-secret-auth + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: flyteconnector + minReplicas: 2 + maxReplicas: 5 + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 80 + - type: Resource + resource: + name: memory + target: + type: Utilization + averageUtilization: 80 --- # Source: dataplane/charts/monitoring/templates/prometheus-operator/admission-webhooks/mutatingWebhookConfiguration.yaml apiVersion: admissionregistration.k8s.io/v1 @@ -8275,6 +8499,51 @@ webhooks: admissionReviewVersions: ["v1", "v1beta1"] sideEffects: None --- +# Source: dataplane/templates/webhook/mutatingwebhookconfiguration.yaml +apiVersion: admissionregistration.k8s.io/v1 +kind: MutatingWebhookConfiguration +metadata: + name: union-pod-webhook- + labels: + app.kubernetes.io/name: union-pod-webhook + app.kubernetes.io/instance: release-name + 
platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +webhooks: + - name: union-pod-webhook.flyte.org + admissionReviewVersions: + - v1 + - v1beta1 + clientConfig: + caBundle: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUROekNDQWgrZ0F3SUJBZ0lVSm9kQ09lem4vd1BHQTZjYkw1U3duSlpYT2hJd0RRWUpLb1pJaHZjTkFRRUwKQlFBd0t6RXBNQ2NHQTFVRUF3d2dabXg1ZEdWd2NtOXdaV3hzWlhJdGQyVmlhRzl2YXk1MWJtbHZiaTV6ZG1NdwpIaGNOTWpZd01USXpNREl6TmpVMVdoY05Nell3TVRJeE1ESXpOalUxV2pBck1Ta3dKd1lEVlFRRERDQm1iSGwwClpYQnliM0JsYkd4bGNpMTNaV0pvYjI5ckxuVnVhVzl1TG5OMll6Q0NBU0l3RFFZSktvWklodmNOQVFFQkJRQUQKZ2dFUEFEQ0NBUW9DZ2dFQkFJcjBSWDRGRjFNVTNYdFBmUHErMDUrMGI1Qm0xK3hsTVR5ZUp4bG5mWFJuRlgyegpPcHV6NmsreUpBQk4xRkxmNDVYRnlJclFlaW5DWWJtckZXalFRc3ZDWEloNkpPNGMwdmRMVVU5RERYcldWRFlsCkRNZytTUDlJVnZBMm5USkVFZ2RFREVsKzNWeThUbUJoNlNmcGthcXdHajY5SmhIdy9OeE9yZzM2bUY3TU5VZ1MKWXJVUkxqUFFzTUN6NElTVzhhQnEzS2IwRUFWUTA4V1lZQ0ZEbjY1aHBzVFdxR05rL3BKRnVTYjlsZll6RVdyYwpJeW5wNDhqTnIvNUdKS3NnWG9LeTUwZVo4UkppeTVyOUhENGtnSW15TVNQR2RDMXFZSVppRHRlZW8yOElxNk5nCmd1dWcyTGhlNndYYW9YZXFCQ2d3SHpwWFVXM1hJVzI5eFoxUkFRRUNBd0VBQWFOVE1GRXdIUVlEVlIwT0JCWUUKRktITDQ1RWhQaVg5bnlRN241QUk0YUZVNFlCL01COEdBMVVkSXdRWU1CYUFGS0hMNDVFaFBpWDlueVE3bjVBSQo0YUZVNFlCL01BOEdBMVVkRXdFQi93UUZNQU1CQWY4d0RRWUpLb1pJaHZjTkFRRUxCUUFEZ2dFQkFEeTRac21XCkNjZ3B5TmI4cU1ydzNYclBNejZLMTRlTVhqQVk1bERFYXM1NklLZExoZFM1bi9sWk95bENuUUtOZ3RHS0VrY3UKT05tMEE2Z0xNT3UzcE43dG9aNXh0WktOcnUvS0x6dm44QUNNNndCbUR0bG5oRERDa3ZUTHBVRzh6UTZSNmlaegpzTWlhVmpLSUJiWkJsZXNYSHZZQnQ2YlRTN2o1QWM2aTdYc1RPK3ZqQ2RUQjVYdCtEZ0x2QnBJRWh2ZGJzNXhmClFtc3ZzQmwxenJnR3JXemo4d0txcXNUMzZaOVU1TzdSK1RJMlQrUDVOR25SeHNQOGJ0R0RtMGZ5SW55WlJrSkcKVTdpVFJyMlZRWlRyaTNZQ2dlVjN1UjFtMnJjc2hyU1Avb082T0QyTXR5Q1FMRWtFK1pxVUVJVnpSYmVaNXl2RQptM29YMFV2OUtBUy9pWTA9Ci0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K + service: + name: union-pod-webhook + namespace: union + path: /mutate--v1-pod/secrets + port: 443 + failurePolicy: Fail + matchPolicy: Equivalent + namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: union + 
objectSelector: + matchLabels: + inject-flyte-secrets: "true" + organization: '' + reinvocationPolicy: Never + rules: + - apiGroups: + - '*' + apiVersions: + - v1 + operations: + - CREATE + resources: + - pods + scope: '*' + sideEffects: NoneOnDryRun + timeoutSeconds: 30 +--- # Source: dataplane/charts/monitoring/templates/prometheus-operator/admission-webhooks/validatingWebhookConfiguration.yaml apiVersion: admissionregistration.k8s.io/v1 kind: ValidatingWebhookConfiguration @@ -8336,6 +8605,222 @@ webhooks: admissionReviewVersions: ["v1", "v1beta1"] sideEffects: None --- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Imported via https://github.com/knative/operator/releases/download/knative-v1.16.0/operator.yaml +# Copyright 2024 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +# Source: dataplane/templates/propeller/serviceaccount.yaml +--- +--- +# Source: dataplane/templates/serving/knative-serving.yaml +apiVersion: operator.knative.dev/v1beta1 +kind: KnativeServing +metadata: + name: union-operator-serving + namespace: union + labels: + helm.sh/chart: dataplane-2026.4.4 + app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: serving +spec: + config: + deployment: + progress-deadline: "30m" + queue-sidecar-cpu-request: "25m" + queue-sidecar-cpu-limit: "1000m" + queue-sidecar-memory-request: "400Mi" + queue-sidecar-memory-limit: "800Mi" + queue-sidecar-ephemeral-storage-request: "512Mi" + queue-sidecar-ephemeral-storage-limit: "1024Mi" + registries-skipping-tag-resolving: managed.cr.union.ai + features: + kubernetes.podspec-affinity: "enabled" + kubernetes.podspec-nodeselector: "enabled" + kubernetes.podspec-tolerations: "enabled" + kubernetes.podspec-fieldref: "enabled" + kubernetes.podspec-dnspolicy: "enabled" + kubernetes.podspec-schedulername: "enabled" + kubernetes.podspec-securitycontext: "enabled" + network: + ingress-class: "kourier.ingress.networking.knative.dev" + high-availability: + replicas: 2 + ingress: + kourier: + enabled: true + bootstrap-configmap: "union-operator-serving-envoy-bootstrap" + service-type: ClusterIP + podDisruptionBudgets: + - name: 3scale-kourier-gateway-pdb + minAvailable: 50% + - name: activator-pdb + minAvailable: 50% + - name: webhook-pdb + minAvailable: 50% + registry: + override: + # TODO(jeev): Wire up Union fork of Envoy + 3scale-kourier-gateway/kourier-gateway: ghcr.io/unionai/envoy:456fed84d4ad9a9dfb186d117d9362e9dc0f7c1f + # TODO(jeev): Wire up Union fork of Kourier + net-kourier-controller/controller: ghcr.io/unionai/kourier@sha256:5804c348d15b3959604e3e3ceed216c3a1c7b32cbe254c7d3eb02a35e62ba9c4 + workloads: + - name: 3scale-kourier-gateway + 
labels: + helm.sh/chart: dataplane-2026.4.4 + app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: 3scale-kourier-gateway + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchLabels: + app.kubernetes.io/name: 3scale-kourier-gateway + app.kubernetes.io/instance: release-name + topologyKey: topology.kubernetes.io/zone + annotations: + checksum/bootstrap-config: 6469f6f592eebc9e0c676d8ccc359a05bc799c0988e474b2da942c4cc328f656 + env: + - container: kourier-gateway + envVars: + - name: UNION_AUTHZ_TENANTAUTHURL + value: "https:///me" + - name: UNION_AUTHZ_TENANTAUTHSIGNINURL + value: "https:///login" + - name: UNION_AUTHZ_TENANTCONTROLPLANEURL + value: "https://" + resources: + - container: kourier-gateway + limits: + cpu: "2" + memory: 2Gi + requests: + cpu: "1" + memory: 1Gi + - name: activator + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - activator + topologyKey: topology.kubernetes.io/zone + - name: autoscaler + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - autoscaler + topologyKey: topology.kubernetes.io/zone + - name: autoscaler-hpa + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - autoscaler-hpa + topologyKey: topology.kubernetes.io/zone + - name: controller + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + 
matchExpressions: + - key: app + operator: In + values: + - controller + topologyKey: topology.kubernetes.io/zone + - name: net-kourier-controller + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - net-kourier-controller + topologyKey: topology.kubernetes.io/zone + env: + - container: controller + envVars: + - name: KOURIER_UNION_AUTHZ_ENABLED + value: "true" + resources: + - container: controller + limits: + cpu: "1" + memory: 1Gi + requests: + cpu: 500m + memory: 500Mi + - name: webhook + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - webhook + topologyKey: topology.kubernetes.io/zone +--- +# Source: dataplane/templates/common/task-podtemplate.yaml +apiVersion: v1 +kind: PodTemplate +metadata: + name: task-template + namespace: union +template: + spec: + serviceAccountName: union + containers: + - name: default + image: docker.io/rwgrim/docker-noop +--- # Source: dataplane/charts/monitoring/templates/prometheus/prometheus.yaml apiVersion: monitoring.coreos.com/v1 kind: Prometheus diff --git a/tests/generated/dataplane.nodeobserver.yaml b/tests/generated/dataplane.nodeobserver.yaml index 3ed590ae..77e603c3 100644 --- a/tests/generated/dataplane.nodeobserver.yaml +++ b/tests/generated/dataplane.nodeobserver.yaml @@ -1,162 +1,146 @@ --- -# Source: dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: flytesnacks-development ---- -# Source: dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: flytesnacks-staging ---- -# Source: dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: flytesnacks-production ---- -# Source: 
dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: union-health-monitoring-development ---- -# Source: dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: union-health-monitoring-staging ---- -# Source: dataplane/templates/common/namespaces.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + apiVersion: v1 -kind: Namespace +kind: ServiceAccount metadata: - name: union-health-monitoring-production + name: knative-operator + namespace: "union" + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator --- -# Source: dataplane/charts/fluentbit/templates/serviceaccount.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ apiVersion: v1 kind: ServiceAccount metadata: - name: fluentbit-system - namespace: union + name: operator-webhook + namespace: "union" labels: - helm.sh/chart: fluentbit-0.48.9 - app.kubernetes.io/name: fluentbit - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "3.2.8" - app.kubernetes.io/managed-by: Helm + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator --- -# Source: dataplane/charts/kube-state-metrics/templates/serviceaccount.yaml +# Source: dataplane/charts/prometheus/charts/kube-state-metrics/templates/serviceaccount.yaml apiVersion: v1 kind: ServiceAccount automountServiceAccountToken: true metadata: labels: - helm.sh/chart: kube-state-metrics-5.30.1 + helm.sh/chart: kube-state-metrics-5.25.1 app.kubernetes.io/managed-by: Helm app.kubernetes.io/component: metrics app.kubernetes.io/part-of: kube-state-metrics app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" + app.kubernetes.io/version: "2.13.0" name: release-name-kube-state-metrics namespace: union --- -# Source: dataplane/charts/opencost/templates/serviceaccount.yaml +# Source: dataplane/charts/prometheus/templates/serviceaccount.yaml apiVersion: v1 kind: ServiceAccount metadata: - name: release-name-opencost - namespace: union labels: - helm.sh/chart: opencost-1.42.0 - app.kubernetes.io/name: opencost + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "1.111.0" - app.kubernetes.io/part-of: opencost + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 app.kubernetes.io/managed-by: Helm -automountServiceAccountToken: true ---- -# Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: union-clustersync-system + app.kubernetes.io/part-of: prometheus + name: union-operator-prometheus namespace: union + 
annotations: + {} --- -# Source: dataplane/templates/imagebuilder/serviceaccount.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: union-imagebuilder ---- -# Source: dataplane/templates/nodeexecutor/serviceaccount.yaml +# Source: dataplane/templates/common/system-serviceaccount.yaml apiVersion: v1 kind: ServiceAccount metadata: - name: executor + name: union-system namespace: union - labels: - app: executor + annotations: --- -# Source: dataplane/templates/nodeobserver/serviceaccount.yaml +# Source: dataplane/templates/common/union-serviceaccount.yaml apiVersion: v1 kind: ServiceAccount metadata: - name: nodeobserver-system + name: union namespace: union + annotations: + eks.amazonaws.com/role-arn: arn:aws:iam::ACCOUNT_ID:role/flyte_project_role +automountServiceAccountToken: true --- -# Source: dataplane/templates/operator/serviceaccount-proxy.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: proxy-system - labels: - app.kubernetes.io/name: operator-proxy - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm ---- -# Source: dataplane/templates/operator/serviceaccount.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: operator-system - labels: - app.kubernetes.io/name: union-operator - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm ---- -# Source: dataplane/templates/prometheus/serviceaccount.yaml +# Source: dataplane/templates/flyteconnector/serviceaccount.yaml apiVersion: v1 kind: ServiceAccount metadata: - name: union-operator-prometheus + name: flyteconnector namespace: union - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane + labels: + app.kubernetes.io/name: flyteconnector app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus 
--- -# Source: dataplane/templates/propeller/serviceaccount-webhook.yaml +# Source: dataplane/templates/nodeobserver/serviceaccount.yaml apiVersion: v1 kind: ServiceAccount metadata: - name: flytepropeller-webhook-system + name: nodeobserver-system namespace: union --- -# Source: dataplane/templates/propeller/serviceaccount.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + apiVersion: v1 -kind: ServiceAccount +kind: Secret metadata: - name: flytepropeller-system - namespace: union + name: operator-webhook-certs + namespace: "union" + labels: + app.kubernetes.io/component: operator-webhook + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +# The data is populated at install time. 
--- # Source: dataplane/templates/common/cluster-secret.yaml apiVersion: v1 @@ -167,151 +151,533 @@ type: Opaque data: cluster_name: --- -# Source: dataplane/templates/propeller/deployment-webhook.yaml -# Create an empty secret that the first propeller pod will populate +# Source: dataplane/templates/webhook/mutatingwebhookconfiguration.yaml apiVersion: v1 kind: Secret metadata: - name: flyte-pod-webhook + name: union-pod-webhook namespace: union + labels: + app.kubernetes.io/name: union-pod-webhook + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm type: Opaque +data: + ca.crt: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUROekNDQWgrZ0F3SUJBZ0lVSm9kQ09lem4vd1BHQTZjYkw1U3duSlpYT2hJd0RRWUpLb1pJaHZjTkFRRUwKQlFBd0t6RXBNQ2NHQTFVRUF3d2dabXg1ZEdWd2NtOXdaV3hzWlhJdGQyVmlhRzl2YXk1MWJtbHZiaTV6ZG1NdwpIaGNOTWpZd01USXpNREl6TmpVMVdoY05Nell3TVRJeE1ESXpOalUxV2pBck1Ta3dKd1lEVlFRRERDQm1iSGwwClpYQnliM0JsYkd4bGNpMTNaV0pvYjI5ckxuVnVhVzl1TG5OMll6Q0NBU0l3RFFZSktvWklodmNOQVFFQkJRQUQKZ2dFUEFEQ0NBUW9DZ2dFQkFJcjBSWDRGRjFNVTNYdFBmUHErMDUrMGI1Qm0xK3hsTVR5ZUp4bG5mWFJuRlgyegpPcHV6NmsreUpBQk4xRkxmNDVYRnlJclFlaW5DWWJtckZXalFRc3ZDWEloNkpPNGMwdmRMVVU5RERYcldWRFlsCkRNZytTUDlJVnZBMm5USkVFZ2RFREVsKzNWeThUbUJoNlNmcGthcXdHajY5SmhIdy9OeE9yZzM2bUY3TU5VZ1MKWXJVUkxqUFFzTUN6NElTVzhhQnEzS2IwRUFWUTA4V1lZQ0ZEbjY1aHBzVFdxR05rL3BKRnVTYjlsZll6RVdyYwpJeW5wNDhqTnIvNUdKS3NnWG9LeTUwZVo4UkppeTVyOUhENGtnSW15TVNQR2RDMXFZSVppRHRlZW8yOElxNk5nCmd1dWcyTGhlNndYYW9YZXFCQ2d3SHpwWFVXM1hJVzI5eFoxUkFRRUNBd0VBQWFOVE1GRXdIUVlEVlIwT0JCWUUKRktITDQ1RWhQaVg5bnlRN241QUk0YUZVNFlCL01COEdBMVVkSXdRWU1CYUFGS0hMNDVFaFBpWDlueVE3bjVBSQo0YUZVNFlCL01BOEdBMVVkRXdFQi93UUZNQU1CQWY4d0RRWUpLb1pJaHZjTkFRRUxCUUFEZ2dFQkFEeTRac21XCkNjZ3B5TmI4cU1ydzNYclBNejZLMTRlTVhqQVk1bERFYXM1NklLZExoZFM1bi9sWk95bENuUUtOZ3RHS0VrY3UKT05tMEE2Z0xNT3UzcE43dG9aNXh0WktOcnUvS0x6dm44QUNNNndCbUR0bG5oRERDa3ZUTHBVRzh6UTZSNmlaegpzTWlhVmpLSUJiWkJsZXNYSHZZQnQ2YlRTN2o1QWM2aTdYc1RPK3ZqQ2RUQjVYdCtEZ0x2QnBJRWh2ZGJzNXhmC
lFtc3ZzQmwxenJnR3JXemo4d0txcXNUMzZaOVU1TzdSK1RJMlQrUDVOR25SeHNQOGJ0R0RtMGZ5SW55WlJrSkcKVTdpVFJyMlZRWlRyaTNZQ2dlVjN1UjFtMnJjc2hyU1Avb082T0QyTXR5Q1FMRWtFK1pxVUVJVnpSYmVaNXl2RQptM29YMFV2OUtBUy9pWTA9Ci0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K + tls.crt: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUQxakNDQXI2Z0F3SUJBZ0lVREtRN3A4MFpIdkQyUE5Ya3p3VGRLem9iUjg4d0RRWUpLb1pJaHZjTkFRRUwKQlFBd0t6RXBNQ2NHQTFVRUF3d2dabXg1ZEdWd2NtOXdaV3hzWlhJdGQyVmlhRzl2YXk1MWJtbHZiaTV6ZG1NdwpIaGNOTWpZd01USXpNREl6TmpVMVdoY05NamN3TVRJek1ESXpOalUxV2pBck1Ta3dKd1lEVlFRRERDQm1iSGwwClpYQnliM0JsYkd4bGNpMTNaV0pvYjI5ckxuVnVhVzl1TG5OMll6Q0NBU0l3RFFZSktvWklodmNOQVFFQkJRQUQKZ2dFUEFEQ0NBUW9DZ2dFQkFMNnMxUTRaZ3ZxVVdNdFp5VC9EKytEQWl6WmR0Q1lkVzArSGk4S3phQ1Vjbi9zZwo3a21NTEhDM2ZlZWtZK2ZUNENHUTd6K1l2WjRWWSt6WGljKzdrNndLbUZqdm1acEM1Rm96OWo1Q21NRk5HZmlWCnRobzVvSU9MUkIzNXRYMU1ROHUwNFVzT29QcjdMK016eEdxWXVmTFdtN1NVdjh0eEp3ZFMxZVlzMllNV1pMeU4KekI4akdWY01Qc2ZNVkdxdFhTTVljQ3hNYURKaTM1N0VEZSs3clM2U2ZtbnFtYjJxekdQM1BRRStCUm91Yk5nLwo4NCt4c253VXBLZUFpem95dFM2c2ZlOTdMUEtRTW9oVzd2TjZORFJrLy9xSmpHem9iZmZwUHU3endhWHVXMWtMCmllc2g2cVRrdlZLSTF3WFJ2WHArVU5GbjZzdDIwdEVlRks0ZzMwOENBd0VBQWFPQjhUQ0I3akFKQmdOVkhSTUUKQWpBQU1Bc0dBMVVkRHdRRUF3SUY0RENCa3dZRFZSMFJCSUdMTUlHSWdoWm1iSGwwWlhCeWIzQmxiR3hsY2kxMwpaV0pvYjI5cmdoeG1iSGwwWlhCeWIzQmxiR3hsY2kxM1pXSm9iMjlyTG5WdWFXOXVnaUJtYkhsMFpYQnliM0JsCmJHeGxjaTEzWldKb2IyOXJMblZ1YVc5dUxuTjJZNEl1Wm14NWRHVndjbTl3Wld4c1pYSXRkMlZpYUc5dmF5NTEKYm1sdmJpNXpkbU11WTJ4MWMzUmxjaTVzYjJOaGJEQWRCZ05WSFE0RUZnUVV1d25wcDZ1MTNKL0NsNnI3eDZ1Sgo5YmZ1RDhzd0h3WURWUjBqQkJnd0ZvQVVvY3Zqa1NFK0pmMmZKRHVma0FqaG9WVGhnSDh3RFFZSktvWklodmNOCkFRRUxCUUFEZ2dFQkFFMmlOa1MzczhGT3Z0cDhrVmJNZUN2amxIQytvYTVlZTFGNmowNlZ3bFJPOWdxOXJla1UKMGptWUhNWGNjWjcrZXJkNFRQRXgwK3ZqMzNhRTV3cDFmdVJJL2xIZHhuOUpJRHg2bTFQTHdVa3p1WG9EekYxLwozVXVIWU1xOGkrT2xsY0tqbmdVMUFUNmRWRCtIbFVWeHpMdjlNazdPb05SQUY5dTJHa1hiUHdCN0I1ZGRxdWxlCmtNcXBxbDhES2s4SVNVelVsTjNBb2Q5NmJUelIyV3RCVkxpTi9zRHZtTDZwZThFYm55RnBUb1BZTDlGeWZOcmIKT1ZPZEJwUkZFTG1KOHZaZWY5d1lxRmVXMTE3SGV5VDArZkxGZVNSemZUNVVWUlVraVFBcUVJd
UhHUVRrOVFabgo1WnYvNUF0eEg3WE1rcXR0VE1zMFlSUG0xYjQzY3FBeFdhND0KLS0tLS1FTkQgQ0VSVElGSUNBVEUtLS0tLQo= + tls.key: LS0tLS1CRUdJTiBQUklWQVRFIEtFWS0tLS0tCk1JSUV2UUlCQURBTkJna3Foa2lHOXcwQkFRRUZBQVNDQktjd2dnU2pBZ0VBQW9JQkFRQytyTlVPR1lMNmxGakwKV2NrL3cvdmd3SXMyWGJRbUhWdFBoNHZDczJnbEhKLzdJTzVKakN4d3QzM25wR1BuMCtBaGtPOC9tTDJlRldQcwoxNG5QdTVPc0NwaFk3NW1hUXVSYU0vWStRcGpCVFJuNGxiWWFPYUNEaTBRZCtiVjlURVBMdE9GTERxRDYreS9qCk04UnFtTG55MXB1MGxML0xjU2NIVXRYbUxObURGbVM4amN3Zkl4bFhERDdIekZScXJWMGpHSEFzVEdneVl0K2UKeEEzdnU2MHVrbjVwNnBtOXFzeGo5ejBCUGdVYUxtellQL09Qc2JKOEZLU25nSXM2TXJVdXJIM3ZleXp5a0RLSQpWdTd6ZWpRMFpQLzZpWXhzNkczMzZUN3U4OEdsN2x0WkM0bnJJZXFrNUwxU2lOY0YwYjE2ZmxEUlorckxkdExSCkhoU3VJTjlQQWdNQkFBRUNnZ0VBRjV2SnMzbTRMSE9DdlFic2NwOC9DQmgvQkNlOC92MGlpYW5VUmJLMkFlWlYKN3Arb3NXV1FPUktYSGIvT2VPMVVjb09DQkFOUzh3aGQrM3pDZlB5U0w3cU9HM0RyT1Z6djdqVkIxM3FpZEVpcQpId3ZXWk0vZXpuckhYOWpEdm5SYmJwVUNVaXRKQmxwa2x3S1pYc056UHB5UTRkNkxFdEw3VEo1V2lxM2g3cWQzCjhncmJoeWlWNDhZRFAyYkhpZGt0S2QvMHBNL3piSzZPTS9iMFpLUmRBWVpHQXlOcklMaFJ3eDNsQXE2SDVqL20KUHBuNlZEdVQ3Y1h6UFhmNTgxR0IyNjQxWFJVTXRNWGVEREtaU3JudnIrbG5tV2lsYXZmNkg3ZHp4NlRBMmk0RApybCtIRVdUN1RSQUZGMnZ5SVUrYSt5RzNPVFBKRCtmVTdCWWh4Zld4aFFLQmdRRGVvcXVCbzQxK1dKU255blZnCmNWMHZoUEkwZ0dyRGQzNFdIbCtGNTZuSmxuMVVlbUQ2SUQ0TUhnNVpHWmNmd3NkRlB0aHl1VnB0OVl1SW11SGQKRHE5ZDhpWnpHUzJVbStkSWlsWjhnZXJ5Q3dON3hIbFM3SzJrc0RmRGJwYmZZSXZ1LzRnSExZRHh3YzQ0OG54ZwpWNGpCUFY2QTdYTHRqQzZzdFE2OW9uTlIwd0tCZ1FEYlFBUTRGdWpnUjJUOXh2Y01MdXhMbm1YaHE4U2NoZlM5CkRoWndDUzZIeHB1YlBxblBjQ04vQUZjbDZ1cDRXVEVZdVY4bEpwcGwwRmpYQ2lMdkhuUU15cm5vRjlUOThpanQKZGVHYjF6ZWlwRUpndFpNVHkvOGdERWorZU1wSFBuYTZwQVpMOVpRVUdLR2dDNGQ0SzljdVNHMmRlb1dSU0ViSQpMVGgzeGpXVEZRS0JnQStJRWFidG5nVmVjS0IwQTFSREZGa29VUzFRZUNKQ3g4MExPV2JDRHBvOW9XaXZVT3lpCkt6SDFOdE1JY2Y2SlBCV2NtTVVJSVVMaWltVnhTS2gvU2NTb0MvNmpsd1p6Q2VPSm94YjBpVXR4Y1VERktDR2MKMlZCUDZ0UDdkeE1HVFR6VEhzNUJZbWw3TjhQSlJ0d0J2MHliMTJmdktNRmhzaS9pUWJFQkVFSjVBb0dBUm9NSApHRmJkM0V0NXdsZzcyYkk1a25SRnhkY3RLejIxb1J1bndhNWlSWTV0T3Zkak8zQ1FLZWNkSC9lMklyQmtwdFB2Ck1vNkF0MS9UUW8xakFNNGxlbnUwWUYxU
nhiNGN4WW5VM2Y3UVNNRlZDNjg1dHZNemdNWVNyNngvT1h4d1NNUTUKdGpJcnhtN0poQ1JSRkNmZFUyZzl3SmpIM2hxRmtSbGlBTHRCUGFVQ2dZRUF2QkRHejQvYS9kRzRJT0VpREk4SgpBbkxaRWZaTFJrdnI0ejRyT0p0TGZtWlFjTW5DSUZmKzBaelhTME9XOWhvMjk5RUNtS1l4U2VUWk1WSDgzOFZCCnRvSkNGM3lzb01rSC9meEVXTUpicDlMblJwSFUzUElicHlhRG1nWW1YdmsvK3laL3Y1c0dDTWNtVGkzWVFLRGsKcStlK1o0SHZyaGUxVytUaHpwOCtoNGs9Ci0tLS0tRU5EIFBSSVZBVEUgS0VZLS0tLS0K --- -# Source: dataplane/templates/clusterresourcesync/configmap.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2019 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ apiVersion: v1 kind: ConfigMap metadata: - name: union-clusterresourcesync-config - namespace: union + name: config-logging + namespace: "union" labels: - app.kubernetes.io/name: clusterresourcesync - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm -data: - cluster_resources.yaml: | - cluster_resources: - clusterName: '' - customData: - - production: - - projectQuotaCpu: - value: "4096" - - projectQuotaMemory: - value: 2Ti - - projectQuotaNvidiaGpu: - value: "256" - - defaultUserRoleKey: - value: 'eks.amazonaws.com/role-arn' - - defaultUserRoleValue: - value: 'arn:aws:iam::ACCOUNT_ID:role/flyte_project_role' - - staging: - - projectQuotaCpu: - value: "4096" - - projectQuotaMemory: - value: 2Ti - - projectQuotaNvidiaGpu: - value: "256" - - defaultUserRoleKey: - value: 'eks.amazonaws.com/role-arn' - - defaultUserRoleValue: - value: 'arn:aws:iam::ACCOUNT_ID:role/flyte_project_role' - - development: - - projectQuotaCpu: - value: "4096" - - projectQuotaMemory: - value: 2Ti - - projectQuotaNvidiaGpu: - value: "256" - - defaultUserRoleKey: - value: 'eks.amazonaws.com/role-arn' - - defaultUserRoleValue: - value: 'arn:aws:iam::ACCOUNT_ID:role/flyte_project_role' - refreshInterval: 5m - standaloneDeployment: true - templatePath: /etc/flyte/clusterresource/templates - clusterResourcesPrivate: - app: - isServerless: false - union: - auth: - authorizationMetadataKey: flyte-authorization - clientId: 'dataplane-operator' - clientSecretLocation: /etc/union/secret/client_secret - tokenRefreshWindow: 5m - type: ClientSecret - connection: - host: dns:/// - admin.yaml: | - admin: - clientId: 'dataplane-operator' - clientSecretLocation: /etc/union/secret/client_secret - endpoint: dns:/// - insecure: false - event: - capacity: 1000 - rate: 500 - type: admin - domain.yaml: | - domains: - - id: development - name: development - - id: staging - name: staging - - id: production - name: production - 
clusters.yaml: | - clusters: - clusterConfigs: [] - labelClusterMap: {} - logger.yaml: | - logger: - formatter: - type: json - level: 4 - show-source: true + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +data: {} --- -# Source: dataplane/templates/clusterresourcesync/configmap.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2019 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + apiVersion: v1 kind: ConfigMap metadata: - name: union-clusterresource-template - namespace: union + name: config-observability + namespace: "union" labels: - app.kubernetes.io/name: clusterresourcesync + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +data: {} +--- +# Source: dataplane/charts/prometheus/templates/cm.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + labels: + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: prometheus + name: union-operator-prometheus + namespace: union data: - a_namespace.yaml: | - apiVersion: v1 - kind: Namespace - metadata: - name: {{ namespace }} - labels: - union.ai/namespace-type: flyte - spec: - finalizers: - - kubernetes - - b_default_service_account.yaml: | - apiVersion: v1 - 
kind: ServiceAccount - metadata: - name: default - namespace: {{ namespace }} - annotations: - {{ defaultUserRoleKey }}: {{ defaultUserRoleValue }} + allow-snippet-annotations: "false" + alerting_rules.yml: | + {} + alerts: | + {} + prometheus.yml: | + global: + evaluation_interval: 1m + scrape_interval: 1m + scrape_timeout: 10s + rule_files: + - /etc/config/recording_rules.yml + - /etc/config/alerting_rules.yml + - /etc/config/rules + - /etc/config/alerts + scrape_configs: + - job_name: prometheus + static_configs: + - targets: + - localhost:9090 + - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + job_name: kubernetes-apiservers + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - action: keep + regex: default;kubernetes;https + source_labels: + - __meta_kubernetes_namespace + - __meta_kubernetes_service_name + - __meta_kubernetes_endpoint_port_name + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + job_name: kubernetes-nodes + kubernetes_sd_configs: + - role: node + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - replacement: kubernetes.default.svc:443 + target_label: __address__ + - regex: (.+) + replacement: /api/v1/nodes/$1/proxy/metrics + source_labels: + - __meta_kubernetes_node_name + target_label: __metrics_path__ + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + job_name: kubernetes-nodes-cadvisor + kubernetes_sd_configs: + - role: node + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - replacement: kubernetes.default.svc:443 + target_label: __address__ + - regex: (.+) + replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor + source_labels: + - 
__meta_kubernetes_node_name + target_label: __metrics_path__ + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + - honor_labels: true + job_name: kubernetes-service-endpoints + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scrape + - action: drop + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (.+?)(?::\d+)?;(\d+) + replacement: $1:$2 + source_labels: + - __address__ + - __meta_kubernetes_service_annotation_prometheus_io_port + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_service_name + target_label: service + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: node + - honor_labels: true + job_name: kubernetes-service-endpoints-slow + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) 
+ source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (.+?)(?::\d+)?;(\d+) + replacement: $1:$2 + source_labels: + - __address__ + - __meta_kubernetes_service_annotation_prometheus_io_port + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_service_name + target_label: service + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: node + scrape_interval: 5m + scrape_timeout: 30s + - honor_labels: true + job_name: prometheus-pushgateway + kubernetes_sd_configs: + - role: service + relabel_configs: + - action: keep + regex: pushgateway + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_probe + - honor_labels: true + job_name: kubernetes-services + kubernetes_sd_configs: + - role: service + metrics_path: /probe + params: + module: + - http_2xx + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_probe + - source_labels: + - __address__ + target_label: __param_target + - replacement: blackbox + target_label: __address__ + - source_labels: + - __param_target + target_label: instance + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - source_labels: + - __meta_kubernetes_service_name + target_label: service + - honor_labels: true + job_name: kubernetes-pods + kubernetes_sd_configs: + - role: pod + relabel_configs: + - action: 
keep + regex: true + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scrape + - action: drop + regex: true + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) + replacement: '[$2]:$1' + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: replace + regex: (\d+);((([0-9]+?)(\.|$)){4}) + replacement: $2:$1 + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_pod_name + target_label: pod + - action: drop + regex: Pending|Succeeded|Failed|Completed + source_labels: + - __meta_kubernetes_pod_phase + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: node + - honor_labels: true + job_name: kubernetes-pods-slow + kubernetes_sd_configs: + - role: pod + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) 
+ source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) + replacement: '[$2]:$1' + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: replace + regex: (\d+);((([0-9]+?)(\.|$)){4}) + replacement: $2:$1 + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_pod_name + target_label: pod + - action: drop + regex: Pending|Succeeded|Failed|Completed + source_labels: + - __meta_kubernetes_pod_phase + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: node + scrape_interval: 5m + scrape_timeout: 30s + - job_name: 'kube-state-metrics' + metrics_path: /metrics + scrape_interval: 1m + honor_labels: true + kubernetes_sd_configs: + - role: endpoints + namespaces: + own_namespace: true + relabel_configs: + - source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_name] + regex: kube-state-metrics + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + regex: http + action: keep + metric_relabel_configs: + - source_labels: [__name__] + regex: 
"kube_pod_container_resource_(limits|requests)|kube_pod_status_phase|kube_node_(labels|status_allocatable|status_condition|status_capacity)|kube_namespace_labels|kube_pod_container_status_(waiting|terminated|last_terminated).*_reason|kube_daemonset_status_number_unavailable|kube_deployment_status_replicas_unavailable|kube_resourcequota|kube_pod_info|kube_node_info|kube_pod_container_status_restarts_total" + action: keep + - source_labels: [__name__, phase] + separator: ";" + regex: "kube_pod_status_phase;(Succeeded|Failed)" + action: drop + - source_labels: [node] + target_label: nodename + regex: "(.*)" + action: replace + - source_labels: [label_node_group_name] + action: replace + regex: "(.+)" + target_label: label_node_pool_name + - job_name: 'opencost' + metrics_path: /metrics + scrape_interval: 1m + honor_labels: true + kubernetes_sd_configs: + - role: endpoints + namespaces: + own_namespace: true + relabel_configs: + - source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_name] + regex: opencost + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + regex: http + action: keep + metric_relabel_configs: + - source_labels: [__name__] + regex: "kube_node_labels|kube_pod_labels|node_total_hourly_cost|node_ram_hourly_cost|node_cpu_hourly_cost|node_gpu_hourly_cost" + action: keep + - job_name: 'union-services' + metrics_path: /metrics + scrape_interval: 1m + honor_labels: true + kubernetes_sd_configs: + - role: endpoints + namespaces: + own_namespace: true + relabel_configs: + - source_labels: [__meta_kubernetes_service_label_platform_union_ai_service_group] + regex: .+ + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + regex: debug + action: keep + # Flyte propeller metrics for execution info and fast task duration. + # No-op when propeller is not deployed (no pods match the label selector). 
+ - job_name: 'flytepropeller' + kubernetes_sd_configs: + - role: pod + namespaces: + own_namespace: true + selectors: + - role: pod + label: app.kubernetes.io/name=flytepropeller + metric_relabel_configs: + - source_labels: [__name__] + regex: "flyte:propeller:all:round:execution_info|flyte:propeller:all:node:fast_task:fast_task_execution_duration" + action: keep + relabel_configs: + - source_labels: [__meta_kubernetes_pod_container_name] + regex: flytepropeller + action: keep + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: pod + # Kourier gateway (envoy) metrics for app serving. + # No-op when serving is not enabled (no pods match the label selector). + - job_name: 'serving-envoy' + kubernetes_sd_configs: + - role: pod + namespaces: + own_namespace: true + selectors: + - role: pod + label: app=3scale-kourier-gateway + metrics_path: /stats/prometheus + metric_relabel_configs: + - source_labels: [__name__] + regex: "envoy_cluster_upstream_rq_xx|envoy_cluster_upstream_rq_time_bucket" + action: keep + relabel_configs: + - source_labels: [__meta_kubernetes_pod_container_port_name] + regex: metrics + action: keep + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: pod + # DCGM GPU metrics. + # No-op when dcgm-exporter is not deployed (no pods match the label selector). 
+ - job_name: 'gpu-metrics' + kubernetes_sd_configs: + - role: pod + namespaces: + names: + - kube-system + selectors: + - role: pod + label: app.kubernetes.io/name=dcgm-exporter - c_project_resource_quota.yaml: | - apiVersion: v1 - kind: ResourceQuota - metadata: - name: project-quota - namespace: {{ namespace }} - spec: - hard: - limits.cpu: {{ projectQuotaCpu }} - limits.memory: {{ projectQuotaMemory }} - requests.nvidia.com/gpu: {{ projectQuotaNvidiaGpu }} + recording_rules.yml: | + {} + rules: | + {} --- # Source: dataplane/templates/fluent-bit/configmap.yaml apiVersion: v1 @@ -359,6 +725,40 @@ data: region us-east-1 bucket --- +# Source: dataplane/templates/imagebuilder/build-image-configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: build-image-config + labels: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +data: + storage.yaml: | + storage: + container: "" + type: stow + stow: + kind: s3 + config: + auth_type: accesskey + access_key_id: + secret_key: + disable_ssl: false + endpoint: + region: us-east-1 + enable-multicontainer: false + limits: + maxDownloadMBs: 1024 + cache: + max_size_mbs: 0 + target_gc_percent: 70 + buildkit-uri: "tcp://union-operator-buildkit.union.svc.cluster.local:1234" + default-repository: "union-dataplane" + authentication-type: "noop" +--- # Source: dataplane/templates/imagebuilder/configmap.yaml apiVersion: v1 kind: ConfigMap @@ -2515,6 +2915,18 @@ data: "version": 1 } --- +# Source: dataplane/templates/monitoring/prometheusrule.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: union-recording-rules + namespace: union + labels: + release: release-name +data: + recording_rules.yml: | + groups: [] +--- # Source: dataplane/templates/nodeexecutor/configmap.yaml apiVersion: v1 kind: ConfigMap @@ -2569,8 +2981,13 @@ data: templateUris: - https://scale.neptune.ai/{{ .taskConfig.project 
}}/-/run/?customId={{ .taskConfig.id }} - kubernetes-enabled: true + kubernetes-enabled: false enabled_plugins.yaml: | + plugins: + connector-service: + defaultConnector: + defaultServiceConfig: '{"loadBalancingConfig": [{"round_robin":{}}]}' + endpoint: k8s:///flyteconnector.union:8000 tasks: task-plugins: default-for-task-types: @@ -2603,6 +3020,8 @@ data: cpu: 4096 gpu: 256 memory: 2Ti + namespace_mapping: + template: union union: connection: host: dns:/// @@ -2623,7 +3042,7 @@ data: cache-endpoint: dns:/// endpoint: dns:/// insecure: false - type: fallback + type: cacheservicev2 use-admin-auth: true logger: formatter: @@ -2643,6 +3062,7 @@ data: secure: false useAuth: false propeller: + limit-namespace: union node-config: disable-input-file-writes: true plugins: @@ -2650,7 +3070,7 @@ data: additional-worker-args: - --last-ack-grace-period-seconds - "120" - callback-uri: http://unionai-dataplane-executor.union.svc.cluster.local:15605 + callback-uri: http://union-operator-executor.union.svc.cluster.local:15605 grace-period-status-not-found: 2m ioutils: remoteFileOutputPaths: @@ -2660,6 +3080,7 @@ data: default-cpus: 100m default-env-vars: [] default-memory: 100Mi + default-pod-template-name: task-template co-pilot: image: 'cr.flyte.org/flyteorg/flytecopilot:v1.14.1' name: flyte-copilot- @@ -2710,6 +3131,7 @@ data: default-cpus: 100m default-env-vars: [] default-memory: 100Mi + default-pod-template-name: task-template config.yaml: | union: connection: @@ -2729,11 +3151,14 @@ data: operator: enabled: true enableTunnelService: true + # enableDepot: false tunnel: - enableDirectToAppIngress: false + enableDirectToAppIngress: true deploymentToRestart: union-operator-proxy + limitNamespace: union + disableClusterPermissions: true apps: - enabled: 'false' + enabled: 'true' syncClusterConfig: enabled: false clusterId: @@ -2748,19 +3173,19 @@ data: userRole: 'arn:aws:iam::ACCOUNT_ID:role/flyte_project_role' userRoleKey: 'eks.amazonaws.com/role-arn' collectUsages: - 
enabled: true + enabled: false billing: model: Legacy dependenciesHeartbeat: - prometheus: - endpoint: 'http://union-operator-prometheus:80/-/healthy' - propeller: - endpoint: 'http://flytepropeller:10254' + executor: + endpoint: 'http://union-operator-executor:10254' proxy: endpoint: 'http://union-operator-proxy:10254' secretsWatcher: dryRun: true enabled: false + org: + namespaceTemplate: union imageBuilder: enabled: true executionNamespaceLabels: @@ -2768,9 +3193,6 @@ data: referenceConfigmapName: union-operator targetConfigMapName: "build-image-config" proxy: - imageBuilderConfig: - authenticationType: 'noop' - defaultRepository: '' persistedLogs: objectStore: pathTemplate: namespace-{{.KubernetesNamespace}}.pod-{{.KubernetesPodName}}.cont-{{.KubernetesContainerName}} @@ -2824,989 +3246,149 @@ data: endpoint: region: us-east-1 image-builder.buildkit-uri: "tcp://union-operator-buildkit.union.svc.cluster.local:1234" - image-builder.default-repository: "" + image-builder.default-repository: "union-dataplane" image-builder.authentication-type: "noop" --- -# Source: dataplane/templates/prometheus/configmap.yaml +# Source: dataplane/templates/serving/bootstrap-configmap.yaml +# We need to copy the default configmap here since knative-operator does not automatically update the +# address of the `net-kourier-controller` endpoint to use the release namespace. It assumes that the +# resources are being installed into the `knative-serving` namespace instead. 
apiVersion: v1 kind: ConfigMap metadata: - name: union-operator-prometheus + name: union-operator-serving-envoy-bootstrap namespace: union - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus data: - prometheus.yml: | - global: - scrape_interval: 15s - evaluation_interval: 15s - alerting: - alertmanagers: - - static_configs: - - targets: - rule_files: - - rules.yml - scrape_configs: - # Self-monitoring - - job_name: prometheus - metrics_path: /prometheus/metrics - static_configs: - - targets: ['localhost:9090'] - metric_relabel_configs: - - source_labels: [__name__] - regex: prometheus_tsdb_head_chunks_storage_size_bytes|prometheus_tsdb_head_series|prometheus_tsdb_head_series_created_total|prometheus_tsdb_head_series_removed_total|prometheus_tsdb_head_samples_appended_total|prometheus_tsdb_storage_blocks_bytes|prometheus_tsdb_retention_limit_bytes|prometheus_tsdb_wal_(.*)|.+_(limit|failed|missed|failures)_total - action: keep + envoy-bootstrap.yaml: | + dynamic_resources: + ads_config: + transport_api_version: V3 + api_type: GRPC + rate_limit_settings: {} + grpc_services: + - envoy_grpc: {cluster_name: xds_cluster} + cds_config: + resource_api_version: V3 + ads: {} + lds_config: + resource_api_version: V3 + ads: {} + node: + cluster: kourier-knative + id: 3scale-kourier-gateway + static_resources: + listeners: + - name: stats_listener + address: + socket_address: + address: 0.0.0.0 + port_value: 9000 + filter_chains: + - filters: + - name: envoy.filters.network.http_connection_manager + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager + stat_prefix: stats_server + http_filters: + - name: envoy.filters.http.router + typed_config: + "@type": 
type.googleapis.com/envoy.extensions.filters.http.router.v3.Router + route_config: + virtual_hosts: + - name: admin_interface + domains: + - "*" + routes: + - match: + safe_regex: + regex: '/(certs|stats(/prometheus)?|server_info|clusters|listeners|ready)?' + headers: + - name: ':method' + string_match: + exact: GET + route: + cluster: service_stats + - match: + safe_regex: + regex: '/drain_listeners' + headers: + - name: ':method' + string_match: + exact: POST + route: + cluster: service_stats + clusters: + - name: service_stats + connect_timeout: 0.250s + type: static + load_assignment: + cluster_name: service_stats + endpoints: + lb_endpoints: + endpoint: + address: + socket_address: + address: 127.0.0.1 + port_value: 9901 + - name: xds_cluster + # This keepalive is recommended by envoy docs. + # https://www.envoyproxy.io/docs/envoy/latest/api-docs/xds_protocol + typed_extension_protocol_options: + envoy.extensions.upstreams.http.v3.HttpProtocolOptions: + "@type": type.googleapis.com/envoy.extensions.upstreams.http.v3.HttpProtocolOptions + explicit_http_config: + http2_protocol_options: + connection_keepalive: + interval: 30s + timeout: 5s + connect_timeout: 1s + load_assignment: + cluster_name: xds_cluster + endpoints: + lb_endpoints: + endpoint: + address: + socket_address: + address: "net-kourier-controller" + port_value: 18000 + type: STRICT_DNS + admin: + access_log: + - name: envoy.access_loggers.stdout + typed_config: + "@type": type.googleapis.com/envoy.extensions.access_loggers.stream.v3.StdoutAccessLog + address: + socket_address: + address: 127.0.0.1 + port_value: 9901 + stats_config: + stats_tags: + - tag_name: name + regex: '^.*?\.u/.*?n=(.*?)/.*?u\..*$' + - tag_name: domain + regex: '^.*?\.u/.*?d=(.*?)/.*?u\..*$' + - tag_name: org + regex: '^.*?\.u/.*?o=(.*?)/.*?u\..*$' + - tag_name: project + regex: '^.*?\.u/.*?p=(.*?)/.*?u\..*$' + - tag_name: target + regex: '^.*?\.u/.*?t=(.*?)/.*?u\..*$' + - tag_name: target_namespace + regex: 
'^.*?\.u/.*?tns=(.*?)/.*?u\..*$' +--- +# Source: dataplane/templates/webhook/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: union-pod-webhook-config + namespace: union +data: + core.yaml: | - # Kube state metrics for pod/node resource tracking and cost calculations - - job_name: kube-state-metrics - static_configs: - - targets: ['release-name-kube-state-metrics:8080'] - metric_relabel_configs: - - separator: ; - source_labels: [__name__] - regex: kube_pod_container_resource_(limits|requests)|kube_pod_status_phase|kube_node_(labels|status_allocatable|status_condition|status_capacity)|kube_namespace_labels|kube_pod_container_status_(waiting|terminated|last_terminated).*_reason|kube_daemonset_status_number_unavailable|kube_deployment_status_replicas_unavailable|kube_resourcequota|kube_pod_info|kube_node_info|kube_pod_container_status_restarts_total - action: keep - - separator: ; - source_labels: [__name__, phase] - regex: kube_pod_status_phase;(Succeeded|Failed) - action: drop - - source_labels: [node] - target_label: nodename - regex: '(.*)' - action: replace - - source_labels: [label_node_group_name] - action: replace - regex: (.+) - target_label: label_node_pool_name - - # cAdvisor container metrics for CPU and memory tracking - - job_name: kubernetes-cadvisor - metrics_path: /metrics - scheme: https - kubernetes_sd_configs: - - role: node - namespaces: - names: [] - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - insecure_skip_verify: false - metric_relabel_configs: - - separator: ; - source_labels: [__name__] - regex: container_cpu_usage_seconds_total|container_memory_working_set_bytes - action: keep - relabel_configs: - - separator: ; - regex: __meta_kubernetes_node_label_(.+) - replacement: $1 - action: labelmap - - separator: ; - regex: (.*) - target_label: __address__ - replacement: kubernetes.default.svc:443 - action: replace - - 
source_labels: [__meta_kubernetes_node_name] - separator: ; - regex: (.+) - target_label: __metrics_path__ - replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor - action: replace - - # Flyte propeller metrics for execution info and fast task duration - - job_name: flytepropeller - kubernetes_sd_configs: - - role: pod - namespaces: - names: - - union - selectors: - - role: pod - label: app.kubernetes.io/name=flytepropeller - metric_relabel_configs: - - source_labels: [__name__] - regex: "flyte:propeller:all:round:execution_info|flyte:propeller:all:node:fast_task:fast_task_execution_duration" - action: keep - relabel_configs: - - source_labels: [__meta_kubernetes_pod_container_name] - regex: flytepropeller - action: keep - - source_labels: [__meta_kubernetes_namespace] - action: replace - target_label: namespace - - source_labels: [__meta_kubernetes_pod_name] - action: replace - target_label: pod - # OpenCost metrics for cost tracking - - job_name: opencost - static_configs: - - targets: ['release-name-opencost:9003'] - metric_relabel_configs: - - source_labels: [__name__] - regex: "kube_node_labels|kube_pod_labels|node_total_hourly_cost|node_ram_hourly_cost|node_cpu_hourly_cost|node_gpu_hourly_cost" - action: keep - rules.yml: | - groups: - - name: cost_calculations_15s - interval: 15s - rules: - - record: pod_gpu_allocation - expr: | - sum by (namespace, pod) (DCGM_FI_DEV_GPU_UTIL >= bool 0) * on (namespace, pod) group_left() (max by (namespace, pod) (kube_pod_status_phase{phase=~"Running|Pending"} == 1)) - - record: execution_info # A join metric to look up execution-level info. Used to disambiguate workflow/task executions, apps, and workspaces. 
- expr: | - max by (label_domain, label_project, label_entity_name, label_execution_id, label_entity_id)( - label_replace( - label_replace( - label_replace( - label_replace( - label_replace( - flyte:propeller:all:round:execution_info{domain!="", project!="", workflow_name!="", execution_id!=""}, # filter for workflow/task executions - "label_entity_id", "$1", "execution_id", "(.*)" # join key - ), "label_entity_name", "$1", "workflow_name", "(.*)" # set label_entity_name to the workflow/task name from the workflow_execution_id - ), - "label_execution_id", "$1", "execution_id", "(.*)" - ), - "label_project", "$1", "project", "(.*)" # project - ), - "label_domain", "$1", "domain", "(.*)" # domain - ) - ) - - record: app_info # A join metric to look up app-level info. Used to disambiguate workflow/task executions, apps, and workspaces. - expr: | - max by (label_domain, label_project, label_app_name, label_app_version, label_entity_id)( - label_replace( - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps - "label_app_name", "$1", "label_serving_unionai_dev_app_name", "(.*)" # rename to cleanup - ), - "label_app_version", "$1", "label_serving_knative_dev_revision", "(.*)" # the app_version is equivalent to an execution_id for workflows (lowest level of granularity) - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # join key - ) - ) - - record: workspace_info # A join metric to look up workspace info. Used to disambiguate workflow/task executions, apps, and workspaces. 
- expr: | - max by (label_domain, label_project, label_workspace_name, label_entity_id)( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # filter for workspaces - "label_entity_id", "$1", "label_node_id", "(.*)" # join key - ), "label_workspace_name", "$1", "label_node_id", "(.*)" # set label_workspace_name to the workspace name from the kube_pod_labels - ) - ) - - record: fast_task_execution_duration - expr: | - max by (label_domain, label_project, label_execution_id, label_entity_id, namespace, pod)( - label_replace( - label_replace( - label_replace( - label_replace( - label_replace( - label_replace( - flyte:propeller:all:node:fast_task:fast_task_execution_duration{domain!="", project!="", execution_id!="", exported_namespace!="", exported_pod!=""}, - "label_entity_id", "$1", "execution_id", "(.*)" # join key - ), - "label_execution_id", "$1", "execution_id", "(.*)" - ), - "label_project", "$1", "project", "(.*)" # project - ), - "label_domain", "$1", "domain", "(.*)" # domain - ), - "namespace", "$1", "exported_namespace", "(.*)" - ), - "pod", "$1", "exported_pod", "(.*)" - ) - ) - - record: fast_task_execution_duration_rate - expr: | - irate(fast_task_execution_duration[1m]) > 0 or absent(irate(fast_task_execution_duration[1m])) * fast_task_execution_duration - - record: entity_id:mem_usage_bytes_total_per_node:sum # Allocated memory (max(requested, consumed)) aggregated per node and entity, where entity is either a task/workflow execution or an app. 
- expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( # aggregate up to entity - # First, calculate the allocated memory for each pod - max by (namespace, pod) ( # this is the case where consumed (the memory working set) exceeds requested memory - ( - sum by (namespace, pod) ( - container_memory_working_set_bytes{namespace!="",pod!="",image!=""} - ) - > sum by (namespace, pod) ( - kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="memory"} - ) - ) - or sum by (namespace, pod) ( # this is the case where memory requests are <= consumed memory - kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="memory"} # needed to add node!="" to dedupe - ) - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). Note that this is repetitive but we do not want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - 
label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), - "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="",label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - # Now join in node identifiers which are used for subsequent overhead calculations - * on (namespace, pod) group_left(node) ( - max by (namespace, pod, node) (kube_pod_info{node!=""}) # needed to add node!="" to dedupe - ) - ) - - record: entity_id:cpu_usage_per_node:sum # Allocated cpu (max(requested, consumed)) aggregated per node and entity, where entity is either a task/workflow execution or an app. 
- expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( - # First, calculate the allocated cpu for each pod - max by (namespace, pod) ( # this is the case where consumed (the cpu usage seconds total) exceeds requested cpu - ( - sum by (namespace, pod) ( - irate(container_cpu_usage_seconds_total{namespace!="",pod!="",image!=""}[5m]) - ) - > sum by (namespace, pod) ( - kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="cpu"} - ) - ) - or sum by (namespace, pod) ( # this is the case where cpu requests are <= consumed cpu - kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="cpu"} - ) - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - 
label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), - "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - # Now join in node identifiers which are used for subsequent overhead calculations - * on (namespace, pod) group_left(node) ( - max by (namespace, pod, node) (kube_pod_info{node!=""}) # needed to add node!="" to dedupe - ) - ) - - record: entity_id:gpu_usage_per_node:sum # Allocated gpu aggregated per node and entity, where entity is either a task/workflow execution or an app. 
- expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( - # First, grab the allocated gpu for each pod (which is always either 1 or zero, since k8s can't split gpus the way it can with cpu/memory) - max by (namespace, pod) ( - pod_gpu_allocation - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, 
label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), - "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - # Now join in node identifiers which are used for subsequent overhead calculations - * on (namespace, pod) group_left(node) ( - max by (namespace, pod, node) (kube_pod_info{node!=""}) # needed to add node!="" to dedupe - ) - ) - - record: entity_id:used_mem_bytes:sum # the sum of used memory across all containers in an entity (numerator for aggregate utilization calculations) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # aggregate up to entity - # First, calculate the used memory for each pod - sum by (namespace, pod) ( - container_memory_working_set_bytes{namespace!="",pod!="",image!=""} - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but we do not want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - ) - - record: entity_id:allocated_mem_bytes:sum # the sum of allocated memory across all containers in an entity (denominator for aggregate utilization calculations) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # aggregate up to entity (remove node) - entity_id:mem_usage_bytes_total_per_node:sum - ) - - record: entity_id:used_cpu:sum # the sum of used cpu across all containers in an entity (numerator for aggregate utilization calculations) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( - sum by (namespace, pod) ( - irate(container_cpu_usage_seconds_total{namespace!="",pod!="",image!=""}[5m]) - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - ) - - record: entity_id:allocated_cpu:sum # the sum of allocated cpu across all containers in an entity (denominator for aggregate utilization calculations) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # aggregate up to entity (remove node) - entity_id:cpu_usage_per_node:sum - ) - - record: entity_id:sm_occupancy:avg # the simple average of SM occupancy (a good generic measure of GPU utilization) per entity - expr: | - avg by (label_entity_type, label_domain, label_project, label_entity_id) ( - # First, grab the SM occupancy for each pod - max by (namespace, pod) ( - DCGM_FI_PROF_SM_OCCUPANCY # SM occupancy is a good proxy for actual GPU usage - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - ) - - record: entity_id:gpu_count:sum # the count of running gpu pods per entity (need this to weight the gpu utilization when aggregating upwards - i.e. project-level) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( - # First, grab the allocated gpu for each pod (which is always either 1 or zero, since k8s can't split gpus the way it can with cpu/memory) - max by (namespace, pod) ( - pod_gpu_allocation - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - ) - - record: entity_id:weighted_sm_occupancy:sum # product of SM occupancy and allocated GPU count (something like "used memory", numerator of weighted calcs) - expr: | - entity_id:sm_occupancy:avg - * on (label_domain, label_project, label_entity_type, label_entity_id) entity_id:gpu_count:sum - - record: entity_id:allocated_mem_cost:sum # Allocated cost of memory for each workflow/task execution and app. - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, type) ( - entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB - * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type - ) - - record: entity_id:allocated_cpu_cost:sum # Allocated cost of cpu for each workflow/task execution and app. 
- expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, type)( - entity_id:cpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - - record: entity_id:allocated_gpu_cost:sum # Allocated cost of gpu for each workflow/task execution and app. - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, type)( - entity_id:gpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - - record: entity_id:allocated_cost:sum # Allocated cost of memory, cpu, and gpu for each workflow/task execution and app. - expr: | - label_replace( - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # for the sum to work, the labels need to be different on each "or" element (type label) - entity_id:allocated_mem_cost:sum - or - entity_id:allocated_cpu_cost:sum - or - entity_id:allocated_gpu_cost:sum - ), - "type", "allocated", "", "" # add type info - ) - - record: entity_id:overhead_cost:sum # The amount of overhead costs (node costs that we can't allocate with container resources) to allocate to each entity (workflow/task execution or app) - expr: | - label_replace( - sum by (label_entity_type, label_entity_id, label_domain, label_project)( # Aggregate the per-node metrics up to workflow/task execution or app (label_entity_id) - # Start with each execution's and app's allocated cost per node - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( # for the sum to work, the labels need to be different on each "or" element (type label) - entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB - * on (node) group_left(type) label_replace(avg by (node) 
(node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:cpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:gpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - # Then divide out the total allocated cost per node to get the proportion of allocated cost associated with each entity - / on (node) group_left()( - sum by (node) ( # for the sum to work, the labels need to be different on each "or" element (type label) - entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB - * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:cpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:gpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - > 0 # need to avoid dividing by zero, or gaps in the data can cause NaNs to proliferate, borking all charts - ) - # Then multiply by the overhead cost per node - * on (node) group_left() ( - # To calculate overhead, start with the true cost of running each node - avg by (node)(kube_node_labels{label_flyte_org_node_role="worker"}) # only look at worker nodes - * on (node) max by (node) ( - node_total_hourly_cost{instance_type!=""} # sometimes, the instance_type can be null, causing an 
unlabeled label to show up in the Compute Costs dashboard charts - ) * (15 / 3600) # convert hourly cost to 15-secondly cost - # Then subtract out the total allocated cost on each node - - on (node) group_left()( - sum by (node) ( # for the sum to work, the labels need to be different on each "or" element (type label) - entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB - * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:cpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:gpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - ) - ) - ), - "type", "overhead", "", "" # add type info - ) - - record: entity_id:total_cost:sum # Total cost of each entity (workflow/task execution or app), including allocated (from container resources) and overhead (proportion of unallocated node costs) - expr: | - label_replace( - sum by (label_domain, label_project, label_entity_id, label_entity_type) ( - entity_id:allocated_cost:sum - or - entity_id:overhead_cost:sum - ), - "type", "total", "", "" # add type info - ) - - record: node:total_cost:sum # Total cost of all nodes - expr: | - sum ( - avg by (node)(kube_node_labels{label_flyte_org_node_role="worker", label_node_kubernetes_io_instance_type!=""}) # only look at worker nodes - * on (node) group_left() node_total_hourly_cost{instance_type!=""} * (15 / 3600) # convert hourly cost to 15-secondly cost - ) - - record: node_type:total_cost:sum # Total cost of nodes grouped by node type - expr: | - sum by (node_type)( - avg by 
(node)(kube_node_labels{label_flyte_org_node_role="worker", label_node_kubernetes_io_instance_type!=""}) # only look at worker nodes - * on (node) group_left(node_type) label_replace(node_total_hourly_cost{instance_type!=""}, "node_type", "$1", "instance_type", "(.*)") * (15 / 3600) # convert hourly cost to 15-secondly cost and rename label - ) - - record: node_type:uptime_hours:sum # Total uptime of nodes grouped by node type - expr: | - sum by (node_type)( - avg by (node, node_type)( # dedupe - label_replace(kube_node_labels{label_flyte_org_node_role="worker", label_node_kubernetes_io_instance_type!=""}, "node_type", "$1", "label_node_kubernetes_io_instance_type", "(.*)") # relabel - ) - ) * (15 / 3600) # convert to number of hours per 15-second observation # Aggregate the above into visible metrics - - name: cost_rollup_15m - interval: 15m - rules: - - record: execution_info15m - expr: | - max_over_time(execution_info[15m:15s]) - - record: app_info15m - expr: | - max_over_time(app_info[15m:15s]) - - record: workspace_info15m - expr: | - max_over_time(workspace_info[15m:15s]) - - record: entity_id:allocated_mem_bytes:sum15m - expr: | - sum_over_time(entity_id:allocated_mem_bytes:sum[15m:15s]) - - record: entity_id:used_mem_bytes:sum15m - expr: | - sum_over_time(entity_id:used_mem_bytes:sum[15m:15s]) - - record: entity_id:allocated_cpu:sum15m - expr: | - sum_over_time(entity_id:allocated_cpu:sum[15m:15s]) - - record: entity_id:used_cpu:sum15m - expr: | - sum_over_time(entity_id:used_cpu:sum[15m:15s]) - - record: entity_id:weighted_sm_occupancy:sum15m - expr: | - sum_over_time(entity_id:weighted_sm_occupancy:sum[15m:15s]) - - record: entity_id:gpu_count:sum15m - expr: | - sum_over_time(entity_id:gpu_count:sum[15m:15s]) - - record: entity_id:allocated_mem_cost:sum15m - expr: | - sum_over_time(entity_id:allocated_mem_cost:sum[15m:15s]) - - record: entity_id:allocated_cpu_cost:sum15m - expr: | - sum_over_time(entity_id:allocated_cpu_cost:sum[15m:15s]) - - record: 
entity_id:allocated_gpu_cost:sum15m - expr: | - sum_over_time(entity_id:allocated_gpu_cost:sum[15m:15s]) - - record: entity_id:allocated_cost:sum15m - expr: | - sum_over_time(entity_id:allocated_cost:sum[15m:15s]) - - record: entity_id:overhead_cost:sum15m - expr: | - sum_over_time(entity_id:overhead_cost:sum[15m:15s]) - - record: entity_id:total_cost:sum15m - expr: | - sum_over_time(entity_id:total_cost:sum[15m:15s]) - - record: node:total_cost:sum15m - expr: | - sum_over_time(node:total_cost:sum[15m:15s]) - - record: node_type:total_cost:sum15m - expr: | - sum_over_time(node_type:total_cost:sum[15m:15s]) - - record: node_type:uptime_hours:sum15m - expr: | - sum_over_time(node_type:uptime_hours:sum[15m:15s]) ---- -# Source: dataplane/templates/propeller/configmap.yaml -apiVersion: v1 -kind: ConfigMap -metadata: - name: flyte-propeller-config - namespace: union -data: - admin.yaml: | - admin: - clientId: 'dataplane-operator' - clientSecretLocation: /etc/union/secret/client_secret - endpoint: dns:/// - insecure: false - event: - capacity: 1000 - rate: 500 - type: admin - catalog.yaml: | - catalog-cache: - cache-endpoint: dns:/// - endpoint: dns:/// - insecure: false - type: fallback - use-admin-auth: true - copilot.yaml: | - plugins: - k8s: - co-pilot: - image: 'cr.flyte.org/flyteorg/flytecopilot:v1.14.1' - name: flyte-copilot- - start-timeout: 30s - core.yaml: | - propeller: - downstream-eval-duration: 30s - enable-admin-launcher: true - leader-election: - enabled: true - lease-duration: 15s - lock-config-map: - name: propeller-leader - namespace: 'union' - renew-deadline: 10s - retry-period: 2s - limit-namespace: all - literal-offloading-config: - enabled: true - max-workflow-retries: 30 - metadata-prefix: metadata/propeller - metrics-prefix: flyte - prof-port: 10254 - queue: - batch-size: -1 - batching-interval: 2s - queue: - base-delay: 5s - capacity: 1000 - max-delay: 120s - rate: 100 - type: maxof - sub-queue: - capacity: 100 - rate: 10 - type: bucket - type: 
batch - rawoutput-prefix: 's3://' - workers: 4 - workflow-reeval-duration: 30s webhook: certDir: /etc/webhook/certs + disableCreateMutatingWebhookConfig: true embeddedSecretManagerConfig: imagePullSecrets: enabled: true @@ -3814,81 +3396,13 @@ data: namespace: 'union' type: 'K8s' listenPort: '9443' + localCert: true secretManagerTypes: - Embedded - K8s - serviceName: flyte-pod-webhook + secretName: union-pod-webhook + serviceName: union-pod-webhook servicePort: '443' - enabled_plugins.yaml: | - plugins: - connector-service: - defaultConnector: - defaultServiceConfig: '{"loadBalancingConfig": [{"round_robin":{}}]}' - endpoint: k8s:///flyteconnector.union:8000 - tasks: - task-plugins: - default-for-task-types: - actor: fast-task - container: container - container_array: k8s-array - sidecar: sidecar - enabled-plugins: - - container - - sidecar - - k8s-array - - echo - - fast-task - - connector-service - k8s.yaml: | - plugins: - k8s: - default-cpus: 100m - default-env-vars: [] - default-memory: 100Mi - logger.yaml: | - logger: - formatter: - type: json - level: 4 - show-source: true - resource_manager.yaml: | - propeller: - resourcemanager: - type: noop - task_logs.yaml: | - plugins: - logs: - cloudwatch-enabled: false - dynamic-log-links: - - vscode: - displayName: VS Code Debugger - templateUris: - - /dataplane/pod/v1/generated_name/task/{{.executionProject}}/{{.executionDomain}}/{{.executionName}}/{{.nodeID}}/{{.taskRetryAttempt}}/{{.taskProject}}/{{.taskDomain}}/{{.taskID}}/{{.taskVersion}}/ - kubernetes-enabled: false - templates: - - displayName: Task Logs - scheme: TaskExecution - templateUris: - - /console/projects/{{.executionProject}}/domains/{{.executionDomain}}/executions/{{.executionName}}/nodeId/{{.nodeID}}/taskId/{{.taskID}}/attempt/{{.taskRetryAttempt}}/view/logs?duration=all&fromExecutionNav=true - storage.yaml: | - storage: - container: "" - type: stow - stow: - kind: s3 - config: - auth_type: accesskey - access_key_id: - secret_key: - disable_ssl: 
false - endpoint: - region: us-east-1 - enable-multicontainer: false - limits: - maxDownloadMBs: 1024 - cache: - max_size_mbs: 0 - target_gc_percent: 70 --- # Source: dataplane/charts/fluentbit/templates/clusterrole.yaml apiVersion: rbac.authorization.k8s.io/v1 @@ -3912,275 +3426,1143 @@ rules: - list - watch --- -# Source: dataplane/charts/kube-state-metrics/templates/role.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: - labels: - helm.sh/chart: kube-state-metrics-5.30.1 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: kube-state-metrics - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" - name: release-name-kube-state-metrics -rules: - -- apiGroups: ["certificates.k8s.io"] - resources: - - certificatesigningrequests - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - configmaps - verbs: ["list", "watch"] - -- apiGroups: ["batch"] - resources: - - cronjobs - verbs: ["list", "watch"] - -- apiGroups: ["extensions", "apps"] - resources: - - daemonsets - verbs: ["list", "watch"] - -- apiGroups: ["extensions", "apps"] - resources: - - deployments - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - endpoints - verbs: ["list", "watch"] - -- apiGroups: ["autoscaling"] - resources: - - horizontalpodautoscalers - verbs: ["list", "watch"] - -- apiGroups: ["extensions", "networking.k8s.io"] - resources: - - ingresses - verbs: ["list", "watch"] - -- apiGroups: ["batch"] - resources: - - jobs - verbs: ["list", "watch"] - -- apiGroups: ["coordination.k8s.io"] - resources: - - leases - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - limitranges - verbs: ["list", "watch"] - -- apiGroups: ["admissionregistration.k8s.io"] - resources: - - mutatingwebhookconfigurations - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - namespaces - verbs: ["list", "watch"] - -- apiGroups: ["networking.k8s.io"] - resources: - - networkpolicies - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - nodes - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - persistentvolumeclaims - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - persistentvolumes - verbs: ["list", "watch"] - -- apiGroups: ["policy"] - resources: - - 
poddisruptionbudgets - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - pods - verbs: ["list", "watch"] - -- apiGroups: ["extensions", "apps"] - resources: - - replicasets - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - replicationcontrollers - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - resourcequotas - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - secrets - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - services - verbs: ["list", "watch"] - -- apiGroups: ["apps"] - resources: - - statefulsets - verbs: ["list", "watch"] + name: knative-serving-operator-aggregated + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +aggregationRule: + clusterRoleSelectors: + # This (along with escalate below) allows the Operator to pick up any + # roles that are provided to the admin of the cluster by knative serving + # automatically. + - matchExpressions: + - {key: serving.knative.dev/release, operator: Exists} +rules: [] # Rules are automatically filled in by the controller manager. +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: knative-serving-operator-aggregated-stable + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +aggregationRule: + clusterRoleSelectors: + # This (along with escalate below) allows the Operator to pick up any + # roles that are provided to the admin of the cluster by knative serving + # automatically. + - matchExpressions: + - {key: app.kubernetes.io/name, operator: In, values: ["knative-serving"]} +rules: [] # Rules are automatically filled in by the controller manager. 
+--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: knative-eventing-operator-aggregated + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +aggregationRule: + clusterRoleSelectors: + # This (along with escalate below) allows the Operator to pick up any + # roles that are provided to the admin of the cluster by knative eventing + # automatically. + - matchExpressions: + - {key: eventing.knative.dev/release, operator: Exists} +rules: [] # Rules are automatically filled in by the controller manager. +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: knative-eventing-operator-aggregated-stable + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +aggregationRule: + clusterRoleSelectors: + # This (along with escalate below) allows the Operator to pick up any + # roles that are provided to the admin of the cluster by knative eventing + # automatically. + - matchExpressions: + - {key: app.kubernetes.io/name, operator: In, values: ["knative-eventing"]} +rules: [] # Rules are automatically filled in by the controller manager. +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. -- apiGroups: ["storage.k8s.io"] - resources: - - storageclasses - verbs: ["list", "watch"] +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: knative-serving-operator + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +rules: + - apiGroups: + - operator.knative.dev + resources: + - '*' + verbs: + - '*' + # Bootstrapping permissions. + # Roles that are explicitly bound buch which are specified by this Operator + # MUST be specified here with 'get' and 'bind'. + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterroles + resourceNames: + - system:auth-delegator + verbs: + - bind + - get + - apiGroups: + - rbac.authorization.k8s.io + resources: + - roles + resourceNames: + - extension-apiserver-authentication-reader + verbs: + - bind + - get + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterroles + - roles + verbs: + - create + - delete + # Escalate is necessary in order to create a role using cluster role aggregation, + # and to allow the Operator to bootstrap itself into the necessary set of + # permissions, even as those continue to evolve upstream. + - escalate + - get + - list + - update + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterrolebindings + - rolebindings + verbs: + - create + - delete + - list + - get + - update + # Permissions required for Knative controller + # infra. 
+ - apiGroups: + - apiregistration.k8s.io + resources: + - apiservices + verbs: + - update + - apiGroups: + - "" + resources: + - services + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - caching.internal.knative.dev + resources: + - images + verbs: + - '*' + - apiGroups: + - "" + resources: + - namespaces + verbs: + - get + - update + - watch + - apiGroups: + - '' + resources: + - events + verbs: + - create + - update + - patch + - apiGroups: + - '' + resources: + - configmaps + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - security.istio.io + - apps + - policy + resources: + - poddisruptionbudgets + - peerauthentications + - deployments + - daemonsets + - replicasets + - statefulsets + verbs: + - create + - delete + - get + - list + - watch + - update + - apiGroups: + - apiregistration.k8s.io + resources: + - apiservices + verbs: + - create + - delete + - get + - list + - apiGroups: + - autoscaling + resources: + - horizontalpodautoscalers + verbs: + - create + - delete + - get + - list + - apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - '*' + - apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - '*' + # Old resources that need cleaning up that are not in the knative-serving + # namespace. 
+ - apiGroups: + - "" + resources: + - services + - deployments + - horizontalpodautoscalers + resourceNames: + - knative-ingressgateway + verbs: + - delete + - apiGroups: + - "" + resources: + - configmaps + resourceNames: + - config-controller + verbs: + - delete + - apiGroups: + - "" + resources: + - serviceaccounts + resourceNames: + - knative-serving-operator + verbs: + - delete + # for contour TLS + - apiGroups: + - projectcontour.io + resources: + - httpproxies + - tlscertificatedelegations + verbs: + - get + - list + - watch + - update + - create + - delete + - deletecollection + - patch + # for security-guard + - apiGroups: + - guard.security.knative.dev + resources: + - guardians + verbs: + - get + - list + - watch + - create + - update + - patch + - delete + - apiGroups: + - "" + resources: + - configmaps + verbs: + - get + - list + - watch + - create + - update + - patch + - delete + - apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + verbs: + - create + - apiGroups: + - "" + resources: + - pods + verbs: + - get +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: knative-eventing-operator + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +rules: + - apiGroups: + - operator.knative.dev + resources: + - '*' + verbs: + - '*' + # Bootstrapping permissions. + # Roles that are explicitly bound buch which are specified by this Operator + # MUST be specified here with 'get' and 'bind'. + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterroles + - roles + verbs: + - create + - delete + # Escalate is necessary in order to create a role using cluster role aggregation, + # and to allow the Operator to bootstrap itself into the necessary set of + # permissions, even as those continue to evolve upstream. 
+ - escalate + - get + - list + - update + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterrolebindings + - rolebindings + verbs: + - create + - delete + - list + - get + - update + # Permissions required for Knative controller + # infra. + - apiGroups: + - apiregistration.k8s.io + resources: + - apiservices + verbs: + - update + - apiGroups: + - "" + resources: + - services + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - caching.internal.knative.dev + resources: + - images + verbs: + - '*' + - apiGroups: + - "" + resources: + - namespaces + verbs: + - get + - update + - watch + - apiGroups: + - '' + resources: + - events + verbs: + - create + - update + - patch + - apiGroups: + - '' + resources: + - configmaps + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - apps + resources: + - deployments + - daemonsets + - replicasets + - statefulsets + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - apiregistration.k8s.io + resources: + - apiservices + verbs: + - create + - delete + - get + - list + - apiGroups: + - autoscaling + resources: + - horizontalpodautoscalers + verbs: + - create + - delete + - update + - get + - list + - apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - '*' + - apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - '*' + - apiGroups: + - batch + resources: + - jobs + verbs: + - create + - delete + - update + - get + - list + - watch + # Old resources that need cleaning up that are not in the knative-eventing + # namespace. 
+ - apiGroups: + - "" + resources: + - serviceaccounts + resourceNames: + - knative-eventing-operator + verbs: + - delete + # for RabbitMQ messaging topology objects + - apiGroups: + - rabbitmq.com + resources: + - rabbitmqclusters + verbs: + - "get" + - "list" + - "watch" + - apiGroups: + - rabbitmq.com + resources: + - bindings + - queues + - exchanges + verbs: + - create + - delete + - get + - list + - patch + - update + - watch + - apiGroups: + - rabbitmq.com + resources: + - bindings/status + - queues/status + - exchanges/status + verbs: + - get + # for Kafka eventing source + - apiGroups: + - keda.sh + resources: + - scaledobjects + - scaledobjects/finalizers + - scaledobjects/status + - triggerauthentications + - triggerauthentications/status + verbs: + - get + - list + - watch + - update + - create + - delete + # Internal APIs + - apiGroups: + - "internal.kafka.eventing.knative.dev" + resources: + - "consumers" + - "consumers/status" + - "consumergroups" + - "consumergroups/status" + verbs: + - create + - get + - list + - watch + - patch + - update + - delete + - apiGroups: + - "internal.kafka.eventing.knative.dev" + resources: + - "consumers/finalizers" + - "consumergroups/finalizers" + verbs: + - update + - delete + - apiGroups: + - apps + resources: + - statefulsets/scale + verbs: + - get + - list + - watch + - update + - patch + - create + - delete + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterrolebindings + verbs: + - watch + - apiGroups: + - "*" + resources: + - configmaps + verbs: + - delete + - apiGroups: + - "*" + resources: + - configmaps + - services + verbs: + - get + - list + - watch + - update + - create + - delete + - apiGroups: + - "*" + resources: + - pods + verbs: + - list + - update + - get + - watch + - apiGroups: + - "*" + resources: + - pods/finalizers + verbs: + - get + - list + - create + - update + - delete + - apiGroups: + - "*" + resources: + - events + verbs: + - patch + - create + - apiGroups: + - "*" + 
resources: + - secrets + verbs: + - get + - list + - watch + - update + - create + - delete + - apiGroups: + - "*" + resources: + - nodes + verbs: + - get + - list + - watch + - apiGroups: + - "*" + resources: + - serviceaccounts + verbs: + - get + - list + - watch + - update + - create + - delete + - apiGroups: + - "*" + resources: + - configmaps + resourceNames: + - kafka-channel-config + verbs: + - patch + - apiGroups: + - "*" + resources: + - horizontalpodautoscalers + resourceNames: + - kafka-webhook + verbs: + - delete + - apiGroups: + - "*" + resources: + - leases + verbs: + - delete + - apiGroups: + - "*" + resources: + - poddisruptionbudgets + resourceNames: + - kafka-webhook + verbs: + - delete + - apiGroups: + - "*" + resources: + - services + verbs: + - patch + - apiGroups: + - "apps" + resources: + - deployments + verbs: + - deletecollection + # Eventing TLS + - apiGroups: + - "cert-manager.io" + resources: + - certificates + - issuers + - clusterissuers + verbs: + - create + - delete + - update + - list + - get + - watch + - apiGroups: + - "trust.cert-manager.io" + resources: + - bundles + verbs: + - create + - delete + - update + - list + - get + - watch +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: knative-operator-webhook + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +rules: + # For watching logging configuration and getting certs. + - apiGroups: + - "" + resources: + - "configmaps" + verbs: + - "get" + - "list" + - "watch" + # For manipulating certs into secrets. 
+ - apiGroups: + - "" + resources: + - "namespaces" + verbs: + - "get" + - "create" + - "update" + - "list" + - "watch" + - "patch" + # finalizers are needed for the owner reference of the webhook + - apiGroups: + - "" + resources: + - "namespaces/finalizers" + verbs: + - "update" + # For getting our Deployment so we can decorate with ownerref. + - apiGroups: + - "apps" + resources: + - "deployments" + verbs: + - "get" + - apiGroups: + - "apps" + resources: + - "deployments/finalizers" + verbs: + - update + # For actually registering our webhook. + - apiGroups: + - "admissionregistration.k8s.io" + resources: + - "mutatingwebhookconfigurations" + - "validatingwebhookconfigurations" + verbs: &everything + - "get" + - "list" + - "create" + - "update" + - "delete" + - "patch" + - "watch" + # For leader election + - apiGroups: + - "coordination.k8s.io" + resources: + - "leases" + verbs: *everything + # Necessary for conversion webhook. These are copied from the serving + # TODO: Do we really need all these permissions? 
+ - apiGroups: ["apiextensions.k8s.io"] + resources: ["customresourcedefinitions"] + verbs: ["get", "list", "create", "update", "delete", "patch", "watch"] +--- +# Source: dataplane/charts/fluentbit/templates/clusterrolebinding.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: release-name-fluentbit + labels: + helm.sh/chart: fluentbit-0.48.9 + app.kubernetes.io/name: fluentbit + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "3.2.8" + app.kubernetes.io/managed-by: Helm +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: release-name-fluentbit +subjects: + - kind: ServiceAccount + name: union-system + namespace: union +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# TODO: Consider restriction of non-aggregated role to knativeservings namespaces. 
+apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: knative-serving-operator + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-serving-operator +subjects: + - kind: ServiceAccount + name: knative-operator + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# TODO: Consider restriction of non-aggregated role to knativeeventing namespaces. +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: knative-eventing-operator + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-eventing-operator +subjects: + - kind: ServiceAccount + name: knative-operator + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: operator-webhook + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-operator-webhook +subjects: + - kind: ServiceAccount + name: operator-webhook + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. -- apiGroups: ["admissionregistration.k8s.io"] - resources: - - validatingwebhookconfigurations - verbs: ["list", "watch"] +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: knative-serving-operator-aggregated + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-serving-operator-aggregated +subjects: + - kind: ServiceAccount + name: knative-operator + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: knative-serving-operator-aggregated-stable + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-serving-operator-aggregated-stable +subjects: + - kind: ServiceAccount + name: knative-operator + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: knative-eventing-operator-aggregated + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-eventing-operator-aggregated +subjects: + - kind: ServiceAccount + name: knative-operator + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: knative-eventing-operator-aggregated-stable + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: 
ClusterRole + name: knative-eventing-operator-aggregated-stable +subjects: + - kind: ServiceAccount + name: knative-operator + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. -- apiGroups: ["storage.k8s.io"] - resources: - - volumeattachments - verbs: ["list", "watch"] +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + namespace: "union" + name: knative-operator-webhook + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +rules: + # For manipulating certs into secrets. 
+ - apiGroups: + - "" + resources: + - "secrets" + verbs: + - "get" + - "create" + - "update" + - "list" + - "watch" + - "patch" --- -# Source: dataplane/charts/opencost/templates/clusterrole.yaml -# Cluster role giving opencost to get, list, watch required resources -# No write permissions are required +# Source: dataplane/templates/monitoring/prometheus-rbac.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole +kind: Role metadata: - name: release-name-opencost + name: union-operator-prometheus-rbac + namespace: union labels: - helm.sh/chart: opencost-1.42.0 - app.kubernetes.io/name: opencost - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "1.111.0" - app.kubernetes.io/part-of: opencost - app.kubernetes.io/managed-by: Helm + release: release-name rules: + # Prometheus server scrape permissions - apiGroups: [""] resources: - - configmaps - - deployments - nodes - - pods + - nodes/proxy + - nodes/metrics - services - - resourcequotas - - replicationcontrollers - - limitranges - - persistentvolumeclaims - - persistentvolumes - - namespaces - endpoints + - pods + - ingresses + - configmaps verbs: - get - list - watch - - apiGroups: - - extensions - resources: - - daemonsets - - deployments - - replicasets - verbs: - - get - - list - - watch - - apiGroups: - - apps - resources: - - statefulsets - - deployments - - daemonsets - - replicasets - verbs: - - list - - watch - - apiGroups: - - batch - resources: - - cronjobs - - jobs - verbs: - - get - - list - - watch - - apiGroups: - - autoscaling - resources: - - horizontalpodautoscalers - verbs: - - get - - list - - watch - - apiGroups: - - policy + - apiGroups: ["extensions", "networking.k8s.io"] resources: - - poddisruptionbudgets + - ingresses/status + - ingresses verbs: - get - list - watch - - apiGroups: - - storage.k8s.io + - apiGroups: ["discovery.k8s.io"] resources: - - storageclasses + - endpointslices verbs: - get - list - watch ---- -# Source: 
dataplane/templates/clusterresourcesync/serviceaccount.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: union-clustersync-resource -rules: - - apiGroups: - - "" - - rbac.authorization.k8s.io - resources: - - configmaps - - namespaces - - pods - - resourcequotas - - roles - - rolebindings - - secrets - - services - - serviceaccounts - - clusterrolebindings - - podtemplates - verbs: - - '*' + # kube-state-metrics permissions (rbac.create=false on the subchart; managed here instead) + - apiGroups: ["certificates.k8s.io"] + resources: ["certificatesigningrequests"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["configmaps"] + verbs: ["list", "watch"] + - apiGroups: ["batch"] + resources: ["cronjobs"] + verbs: ["list", "watch"] + - apiGroups: ["extensions", "apps"] + resources: ["daemonsets"] + verbs: ["list", "watch"] + - apiGroups: ["extensions", "apps"] + resources: ["deployments"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["endpoints"] + verbs: ["list", "watch"] + - apiGroups: ["discovery.k8s.io"] + resources: ["endpointslices"] + verbs: ["list", "watch"] + - apiGroups: ["autoscaling"] + resources: ["horizontalpodautoscalers"] + verbs: ["list", "watch"] + - apiGroups: ["extensions", "networking.k8s.io"] + resources: ["ingresses"] + verbs: ["list", "watch"] + - apiGroups: ["batch"] + resources: ["jobs"] + verbs: ["list", "watch"] + - apiGroups: ["coordination.k8s.io"] + resources: ["leases"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["limitranges"] + verbs: ["list", "watch"] + - apiGroups: ["admissionregistration.k8s.io"] + resources: ["mutatingwebhookconfigurations"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["namespaces"] + verbs: ["list", "watch"] + - apiGroups: ["networking.k8s.io"] + resources: ["networkpolicies"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["persistentvolumeclaims"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: 
["persistentvolumes"] + verbs: ["list", "watch"] + - apiGroups: ["policy"] + resources: ["poddisruptionbudgets"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["pods"] + verbs: ["list", "watch"] + - apiGroups: ["extensions", "apps"] + resources: ["replicasets"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["replicationcontrollers"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["resourcequotas"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["secrets"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["services"] + verbs: ["list", "watch"] + - apiGroups: ["apps"] + resources: ["statefulsets"] + verbs: ["list", "watch"] + - apiGroups: ["storage.k8s.io"] + resources: ["storageclasses"] + verbs: ["list", "watch"] + - apiGroups: ["admissionregistration.k8s.io"] + resources: ["validatingwebhookconfigurations"] + verbs: ["list", "watch"] + - apiGroups: ["storage.k8s.io"] + resources: ["volumeattachments"] + verbs: ["list", "watch"] --- # Source: dataplane/templates/nodeexecutor/serviceaccount.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole +kind: Role metadata: name: union-executor labels: @@ -4233,7 +4615,7 @@ rules: --- # Source: dataplane/templates/nodeobserver/serviceaccount.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole +kind: Role metadata: name: union-nodeobserver rules: @@ -4244,11 +4626,12 @@ rules: resources: ["nodes"] verbs: ["get", "update"] --- -# Source: dataplane/templates/operator/serviceaccount-proxy.yaml +# Source: dataplane/templates/operator/serviceaccount-proxy-secret.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole +kind: Role metadata: - name: proxy-system + name: union-system-secret + namespace: union labels: app.kubernetes.io/name: operator-proxy app.kubernetes.io/instance: release-name @@ -4258,146 +4641,86 @@ rules: - apiGroups: - '*' resources: - - events - - flyteworkflows - - pods/log - - pods - - rayjobs - - resourcequotas 
+ - secrets verbs: - get - list - - watch + - create + - update + - delete --- -# Source: dataplane/templates/operator/serviceaccount.yaml +# Source: dataplane/templates/operator/serviceaccount-proxy.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole +kind: Role metadata: - name: operator-system + name: proxy-system labels: - app.kubernetes.io/name: union-operator + app.kubernetes.io/name: operator-proxy app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm rules: - # Allow Access to all resources under flyte.lyft.com - - apiGroups: - - flyte.lyft.com - resources: - - flyteworkflows - - flyteworkflows/finalizers - verbs: - - get - - list - - watch - - create - - update - - delete - - patch - - post - - deletecollection - apiGroups: - '*' resources: - - resourcequotas + - events + - flyteworkflows + - pods/log - pods - - configmaps - - podtemplates - - secrets - - namespaces - - nodes + - rayjobs + - resourcequotas verbs: - get - list - watch - - create - - update - - delete - - nonResourceURLs: - - /metrics - verbs: - - get --- -# Source: dataplane/templates/prometheus/rbac.yaml +# Source: dataplane/templates/operator/serviceaccount.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole +kind: Role metadata: - name: union-operator-prometheus + name: operator-system labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/name: union-operator app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus -rules: - - apiGroups: [""] - resources: - - nodes - - nodes/proxy - - pods - - endpoints - - services - verbs: - - get - - list - - watch - - nonResourceURLs: - - /metrics - - /metrics/cadvisor - verbs: - - get ---- -# Source: dataplane/templates/propeller/serviceaccount-webhook.yaml -kind: ClusterRole -apiVersion: 
rbac.authorization.k8s.io/v1 -metadata: - name: flytepropeller-webhook-role - namespace: union -rules: - - apiGroups: - - "*" - resources: - - mutatingwebhookconfigurations - - secrets - - pods - - replicasets/finalizers - verbs: - - get - - create - - update - - patch ---- -# Source: dataplane/templates/propeller/serviceaccount.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: flytepropeller-role + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm rules: - # Allow RO access to PODS - apiGroups: - - "" + - '*' resources: - - pods + - secrets + - deployments verbs: - get - list - watch - # Allow Event recording access + - create + - update - apiGroups: - - "" + - flyte.lyft.com resources: - - events + - flyteworkflows + - flyteworkflows/finalizers verbs: + - get + - list + - watch - create - update - delete - patch - # Allow Access All plugin objects + - post + - deletecollection - apiGroups: - '*' resources: - - '*' + - resourcequotas + - pods + - configmaps + - podtemplates + - secrets + - namespaces + - nodes verbs: - get - list @@ -4405,146 +4728,126 @@ rules: - create - update - delete - - patch - # Allow Access to CRD - apiGroups: - - apiextensions.k8s.io + - serving.knative.dev resources: - - customresourcedefinitions + - revisions + - configurations + - services verbs: - get - list - watch - create - - delete - update - # Allow Access to all resources under flyte.lyft.com + - delete +--- +# Source: dataplane/templates/webhook/serviceaccount.yaml +kind: Role +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: union-webhook-role + namespace: union +rules: - apiGroups: - - flyte.lyft.com + - "*" resources: - - flyteworkflows - - flyteworkflows/finalizers + - mutatingwebhookconfigurations + - secrets + - pods + - replicasets/finalizers verbs: - get - - list - - watch - create - update - - delete - patch - - post - - deletecollection --- -# Source: 
dataplane/charts/fluentbit/templates/clusterrolebinding.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding +kind: RoleBinding metadata: - name: release-name-fluentbit + namespace: "union" + name: operator-webhook labels: - helm.sh/chart: fluentbit-0.48.9 - app.kubernetes.io/name: fluentbit - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "3.2.8" - app.kubernetes.io/managed-by: Helm -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: release-name-fluentbit + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator subjects: - kind: ServiceAccount - name: fluentbit-system - namespace: union ---- -# Source: dataplane/charts/kube-state-metrics/templates/clusterrolebinding.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - labels: - helm.sh/chart: kube-state-metrics-5.30.1 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: kube-state-metrics - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" - name: release-name-kube-state-metrics + name: operator-webhook + namespace: "union" roleRef: + kind: Role + name: knative-operator-webhook apiGroup: 
rbac.authorization.k8s.io - kind: ClusterRole - name: release-name-kube-state-metrics -subjects: -- kind: ServiceAccount - name: release-name-kube-state-metrics - namespace: union --- -# Source: dataplane/charts/opencost/templates/clusterrolebinding.yaml +# Source: dataplane/templates/monitoring/prometheus-rbac.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding +kind: RoleBinding metadata: - name: release-name-opencost + name: union-operator-prometheus-rbac + namespace: union labels: - helm.sh/chart: opencost-1.42.0 - app.kubernetes.io/name: opencost - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "1.111.0" - app.kubernetes.io/part-of: opencost - app.kubernetes.io/managed-by: Helm -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: release-name-opencost + release: release-name subjects: - kind: ServiceAccount - name: release-name-opencost + name: union-operator-prometheus namespace: union ---- -# Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: union-clustersync-resource roleRef: apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: union-clustersync-resource -subjects: - - kind: ServiceAccount - name: union-clustersync-system - namespace: union + kind: Role + name: union-operator-prometheus-rbac --- -# Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml +# Source: dataplane/templates/monitoring/prometheus-rbac.yaml +# kube-state-metrics service account RoleBinding apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding +kind: RoleBinding metadata: - name: union-clustersync-auth-delegator -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: system:auth-delegator + name: release-name-prometheus-kube-state-metrics + namespace: union + labels: + release: release-name subjects: - kind: ServiceAccount - name: union-clustersync-system + 
name: release-name-prometheus-kube-state-metrics namespace: union +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: union-operator-prometheus-rbac --- # Source: dataplane/templates/nodeexecutor/serviceaccount.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding +kind: RoleBinding metadata: name: union-executor labels: app: executor roleRef: apiGroup: rbac.authorization.k8s.io - kind: ClusterRole + kind: Role name: union-executor subjects: - kind: ServiceAccount - name: executor + name: union-system namespace: union --- # Source: dataplane/templates/nodeobserver/serviceaccount.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding +kind: RoleBinding metadata: name: union-nodeobserver subjects: @@ -4552,15 +4855,16 @@ subjects: name: nodeobserver-system namespace: union roleRef: - kind: ClusterRole + kind: Role apiGroup: rbac.authorization.k8s.io name: union-nodeobserver --- -# Source: dataplane/templates/operator/serviceaccount-proxy.yaml +# Source: dataplane/templates/operator/serviceaccount-proxy-secret.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding +kind: RoleBinding metadata: - name: proxy-system + name: union-system-secret + namespace: union labels: app.kubernetes.io/name: operator-proxy app.kubernetes.io/instance: release-name @@ -4568,109 +4872,35 @@ metadata: app.kubernetes.io/managed-by: Helm roleRef: apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: proxy-system + kind: Role + name: union-system-secret subjects: - kind: ServiceAccount - name: proxy-system + name: union-system namespace: union --- -# Source: dataplane/templates/operator/serviceaccount.yaml +# Source: dataplane/templates/operator/serviceaccount-proxy.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding +kind: RoleBinding metadata: - name: operator-system + name: proxy-system labels: - app.kubernetes.io/name: union-operator + app.kubernetes.io/name: operator-proxy 
app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm roleRef: apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: operator-system -subjects: - - kind: ServiceAccount - name: operator-system - namespace: union ---- -# Source: dataplane/templates/prometheus/rbac.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: union-operator-prometheus - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: union-operator-prometheus -subjects: - - kind: ServiceAccount - name: union-operator-prometheus - namespace: union ---- -# Source: dataplane/templates/propeller/serviceaccount-webhook.yaml -# Create a binding from Role -> ServiceAccount -kind: ClusterRoleBinding -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: flytepropeller-webhook-binding - namespace: union -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: flytepropeller-webhook-role -subjects: - - kind: ServiceAccount - name: flytepropeller-webhook-system - namespace: union ---- -# Source: dataplane/templates/propeller/serviceaccount.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: flytepropeller-binding -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: flytepropeller-role + kind: Role + name: proxy-system subjects: - kind: ServiceAccount - name: flytepropeller-system + name: union-system namespace: union --- -# Source: dataplane/templates/operator/serviceaccount-proxy-secret.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - name: proxy-system-secret - namespace: union - labels: - app.kubernetes.io/name: 
operator-proxy - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm -rules: - - apiGroups: - - '*' - resources: - - secrets - verbs: - - get - - list - - create - - update - - delete ---- # Source: dataplane/templates/operator/serviceaccount.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: Role +kind: RoleBinding metadata: name: operator-system labels: @@ -4678,56 +4908,29 @@ metadata: app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm -rules: - - apiGroups: - - '*' - resources: - - secrets - - deployments - verbs: - - get - - list - - watch - - create - - update ---- -# Source: dataplane/templates/operator/serviceaccount-proxy-secret.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: proxy-system-secret - namespace: union - labels: - app.kubernetes.io/name: operator-proxy - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm roleRef: apiGroup: rbac.authorization.k8s.io kind: Role - name: proxy-system-secret + name: operator-system subjects: - kind: ServiceAccount - name: proxy-system + name: union-system namespace: union --- -# Source: dataplane/templates/operator/serviceaccount.yaml -apiVersion: rbac.authorization.k8s.io/v1 +# Source: dataplane/templates/webhook/serviceaccount.yaml +# Create a binding from Role -> ServiceAccount kind: RoleBinding +apiVersion: rbac.authorization.k8s.io/v1 metadata: - name: operator-system - labels: - app.kubernetes.io/name: union-operator - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm + name: union-webhook-binding + namespace: union roleRef: apiGroup: rbac.authorization.k8s.io kind: Role - name: operator-system + name: union-webhook-role subjects: - kind: ServiceAccount - name: operator-system + 
name: union-system namespace: union --- # Source: dataplane/charts/fluentbit/templates/service.yaml @@ -4753,20 +4956,60 @@ spec: app.kubernetes.io/name: fluentbit app.kubernetes.io/instance: release-name --- -# Source: dataplane/charts/kube-state-metrics/templates/service.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v1 +kind: Service +metadata: + labels: + role: operator-webhook + app.kubernetes.io/component: operator-webhook + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator + name: operator-webhook + namespace: "union" +spec: + ports: + # Define metrics and profiling for them to be accessible within service meshes. 
+ - name: http-metrics + port: 9090 + targetPort: 9090 + - name: http-profiling + port: 8008 + targetPort: 8008 + - name: https-webhook + port: 443 + targetPort: 8443 + selector: + role: operator-webhook +--- +# Source: dataplane/charts/prometheus/charts/kube-state-metrics/templates/service.yaml apiVersion: v1 kind: Service metadata: name: release-name-kube-state-metrics namespace: union labels: - helm.sh/chart: kube-state-metrics-5.30.1 + helm.sh/chart: kube-state-metrics-5.25.1 app.kubernetes.io/managed-by: Helm app.kubernetes.io/component: metrics app.kubernetes.io/part-of: kube-state-metrics app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" + app.kubernetes.io/version: "2.13.0" annotations: prometheus.io/scrape: 'true' spec: @@ -4781,28 +5024,32 @@ spec: app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/instance: release-name --- -# Source: dataplane/charts/opencost/templates/service.yaml +# Source: dataplane/charts/prometheus/templates/service.yaml apiVersion: v1 kind: Service metadata: - name: release-name-opencost - namespace: union labels: - helm.sh/chart: opencost-1.42.0 - app.kubernetes.io/name: opencost + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "1.111.0" - app.kubernetes.io/part-of: opencost + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: prometheus + name: union-operator-prometheus + namespace: union spec: + ports: + - name: http + port: 80 + protocol: TCP + targetPort: 9090 selector: - app.kubernetes.io/name: opencost + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name + sessionAffinity: None type: "ClusterIP" - ports: - - name: http - port: 9003 - targetPort: 9003 --- # Source: 
dataplane/templates/clusterresourcesync/service.yaml apiVersion: v1 @@ -4826,6 +5073,31 @@ spec: app.kubernetes.io/name: clusterresourcesync app.kubernetes.io/instance: release-name --- +# Source: dataplane/templates/flyteconnector/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: flyteconnector + namespace: union + labels: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + app.kubernetes.io/managed-by: Helm +spec: + clusterIP: None + ports: + - name: grpc + port: 8000 + protocol: TCP + targetPort: grpc + - name: metric + port: 9090 + protocol: TCP + targetPort: metric + selector: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name +--- # Source: dataplane/templates/imagebuilder/service.yaml apiVersion: v1 kind: Service @@ -4851,7 +5123,7 @@ spec: apiVersion: v1 kind: Service metadata: - name: release-name-dataplane-executor + name: union-operator-executor labels: platform.union.ai/prometheus-group: "union-services" app: executor @@ -4917,39 +5189,14 @@ spec: app.kubernetes.io/name: union-operator app.kubernetes.io/instance: release-name --- -# Source: dataplane/templates/prometheus/service.yaml -apiVersion: v1 -kind: Service -metadata: - name: union-operator-prometheus - namespace: union - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus -spec: - type: ClusterIP - ports: - - port: 80 - targetPort: 9090 - protocol: TCP - name: http - selector: - app.kubernetes.io/component: prometheus - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name ---- -# Source: dataplane/templates/propeller/service-webhook.yaml +# Source: dataplane/templates/webhook/service.yaml apiVersion: v1 kind: Service metadata: - name: flyte-pod-webhook + name: union-pod-webhook 
namespace: union labels: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm @@ -4957,7 +5204,7 @@ metadata: projectcontour.io/upstream-protocol.h2c: grpc spec: selector: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name ports: - name: https @@ -4969,23 +5216,23 @@ spec: port: 10254 targetPort: 10254 --- -# Source: dataplane/templates/propeller/service-webhook.yaml +# Source: dataplane/templates/webhook/service.yaml # Headless Service for cache invalidation — resolves to all pod IPs so that # we can fan out invalidation requests to every webhook replica. apiVersion: v1 kind: Service metadata: - name: flyte-pod-webhook-headless + name: union-pod-webhook-headless namespace: union labels: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm spec: clusterIP: None selector: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name ports: - name: cache-internal @@ -4993,32 +5240,6 @@ spec: port: 9443 targetPort: 9443 --- -# Source: dataplane/templates/propeller/service.yaml -apiVersion: v1 -kind: Service -metadata: - namespace: union - name: flytepropeller - labels: - platform.union.ai/prometheus-group: "union-services" - app.kubernetes.io/name: flytepropeller - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm -spec: - type: ClusterIP - ports: - - name: debug - protocol: TCP - port: 10254 - - name: fasttask - port: 15605 - protocol: TCP - targetPort: 15605 - selector: - app.kubernetes.io/name: flytepropeller - app.kubernetes.io/instance: 
release-name ---- # Source: dataplane/charts/fluentbit/templates/daemonset.yaml apiVersion: apps/v1 kind: DaemonSet @@ -5044,7 +5265,7 @@ spec: annotations: checksum/config: e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 spec: - serviceAccountName: fluentbit-system + serviceAccountName: union-system hostNetwork: false dnsPolicy: ClusterFirst containers: @@ -5197,20 +5418,217 @@ spec: - --config - /etc/union/config.yaml --- -# Source: dataplane/charts/kube-state-metrics/templates/deployment.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: operator-webhook + namespace: "union" + labels: + app.kubernetes.io/component: operator-webhook + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +spec: + selector: + matchLabels: + app: operator-webhook + role: operator-webhook + template: + metadata: + labels: + app: operator-webhook + role: operator-webhook + app.kubernetes.io/component: operator-webhook + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator + sidecar.istio.io/inject: "false" + spec: + # To avoid node becoming SPOF, spread our replicas to different nodes. 
+ affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - podAffinityTerm: + labelSelector: + matchLabels: + app: operator-webhook + topologyKey: kubernetes.io/hostname + weight: 100 + serviceAccountName: operator-webhook + containers: + - name: operator-webhook + # This is the Go import path for the binary that is containerized + # and substituted here. + image: gcr.io/knative-releases/knative.dev/operator/cmd/webhook@sha256:d3a7a3304629ccfcaacec620b50736e0b4c902a6328b83369b6cbaf7e94677c9 + resources: + requests: + cpu: 100m + memory: 100Mi + limits: + cpu: 500m + memory: 500Mi + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: SYSTEM_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: CONFIG_LOGGING_NAME + value: config-logging + - name: CONFIG_OBSERVABILITY_NAME + value: config-observability + - name: WEBHOOK_NAME + value: operator-webhook + - name: WEBHOOK_PORT + value: "8443" + - name: WEBHOOK_SECRET_NAME + value: operator-webhook-certs + - name: METRICS_DOMAIN + value: knative.dev/operator + - name: KUBERNETES_MIN_VERSION + value: "" + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + runAsNonRoot: true + capabilities: + drop: + - ALL + ports: + - name: metrics + containerPort: 9090 + - name: profiling + containerPort: 8008 + - name: https-webhook + containerPort: 8443 + readinessProbe: + periodSeconds: 1 + httpGet: + scheme: HTTPS + port: 8443 + httpHeaders: + - name: k-kubelet-probe + value: "webhook" + livenessProbe: + periodSeconds: 1 + httpGet: + scheme: HTTPS + port: 8443 + httpHeaders: + - name: k-kubelet-probe + value: "webhook" + failureThreshold: 6 + initialDelaySeconds: 120 + # Our webhook should gracefully terminate by lame ducking first, set this to a sufficiently + # high value that we respect whatever value it has configured for the lame duck grace period. 
+ terminationGracePeriodSeconds: 300 +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: knative-operator + namespace: "union" + labels: + app.kubernetes.io/name: knative-operator + app.kubernetes.io/version: "1.16.0" +spec: + replicas: 1 + selector: + matchLabels: + name: knative-operator + template: + metadata: + labels: + name: knative-operator + app.kubernetes.io/name: knative-operator + app.kubernetes.io/version: "1.16.0" + sidecar.istio.io/inject: "false" + spec: + serviceAccountName: knative-operator + containers: + - name: knative-operator + image: gcr.io/knative-releases/knative.dev/operator/cmd/operator@sha256:0b5a3532417f9c8e7b6044e23f6f67ad932a74a40a4092ad965b6f173b2fd887 + resources: + requests: + cpu: 100m + memory: 100Mi + limits: + cpu: 1000m + memory: 1000Mi + imagePullPolicy: IfNotPresent + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: SYSTEM_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: METRICS_DOMAIN + value: knative.dev/operator + - name: CONFIG_LOGGING_NAME + value: config-logging + - name: CONFIG_OBSERVABILITY_NAME + value: config-observability + - name: KUBERNETES_MIN_VERSION + value: "" + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + runAsNonRoot: true + 
capabilities: + drop: + - ALL + ports: + - name: metrics + containerPort: 9090 +--- +# Source: dataplane/charts/prometheus/charts/kube-state-metrics/templates/deployment.yaml apiVersion: apps/v1 kind: Deployment metadata: name: release-name-kube-state-metrics namespace: union labels: - helm.sh/chart: kube-state-metrics-5.30.1 + helm.sh/chart: kube-state-metrics-5.25.1 app.kubernetes.io/managed-by: Helm app.kubernetes.io/component: metrics app.kubernetes.io/part-of: kube-state-metrics app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" + app.kubernetes.io/version: "2.13.0" spec: selector: matchLabels: @@ -5223,13 +5641,13 @@ spec: template: metadata: labels: - helm.sh/chart: kube-state-metrics-5.30.1 + helm.sh/chart: kube-state-metrics-5.25.1 app.kubernetes.io/managed-by: Helm app.kubernetes.io/component: metrics app.kubernetes.io/part-of: kube-state-metrics app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" + app.kubernetes.io/version: "2.13.0" spec: automountServiceAccountToken: true hostNetwork: false @@ -5246,8 +5664,9 @@ spec: args: - --port=8080 - --resources=certificatesigningrequests,configmaps,cronjobs,daemonsets,deployments,endpoints,horizontalpodautoscalers,ingresses,jobs,leases,limitranges,mutatingwebhookconfigurations,namespaces,networkpolicies,nodes,persistentvolumeclaims,persistentvolumes,poddisruptionbudgets,pods,replicasets,replicationcontrollers,resourcequotas,secrets,services,statefulsets,storageclasses,validatingwebhookconfigurations,volumeattachments + - --namespaces=union imagePullPolicy: IfNotPresent - image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.15.0 + image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.13.0 ports: - containerPort: 8080 name: "http" @@ -5267,7 +5686,7 @@ spec: httpGet: httpHeaders: path: /readyz - port: 8081 + port: 8080 scheme: HTTP initialDelaySeconds: 5 
periodSeconds: 10 @@ -5282,201 +5701,204 @@ spec: - ALL readOnlyRootFilesystem: true --- -# Source: dataplane/charts/opencost/templates/deployment.yaml +# Source: dataplane/charts/prometheus/templates/deploy.yaml apiVersion: apps/v1 kind: Deployment metadata: - name: release-name-opencost - namespace: union labels: - helm.sh/chart: opencost-1.42.0 - app.kubernetes.io/name: opencost + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "1.111.0" - app.kubernetes.io/part-of: opencost + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: prometheus + name: union-operator-prometheus + namespace: union spec: - replicas: 1 selector: matchLabels: - app.kubernetes.io/name: opencost + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name - strategy: - rollingUpdate: - maxSurge: 1 - maxUnavailable: 1 - type: RollingUpdate + replicas: 1 + revisionHistoryLimit: 10 + strategy: + type: Recreate + rollingUpdate: null template: metadata: labels: - app.kubernetes.io/name: opencost + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: prometheus spec: - serviceAccountName: release-name-opencost + enableServiceLinks: true + serviceAccountName: union-operator-prometheus containers: - - name: release-name-opencost - image: ghcr.io/opencost/opencost:1.111.0@sha256:6aa68e52a24b14ba41f23db08d1b9db1429a1c0300f4c0381ecc2c61fc311a97 - imagePullPolicy: IfNotPresent + - name: prometheus-server-configmap-reload + image: "quay.io/prometheus-operator/prometheus-config-reloader:v0.76.0" + imagePullPolicy: "IfNotPresent" args: + - --watched-dir=/etc/config + - 
--listen-address=0.0.0.0:8080 + - --reload-url=http://127.0.0.1:9090/prometheus/-/reload ports: - - containerPort: 9003 - name: http - resources: - limits: - cpu: 1000m - memory: 4Gi - requests: - cpu: 500m - memory: 1Gi - startupProbe: - httpGet: - path: /healthz - port: 9003 - initialDelaySeconds: 10 - periodSeconds: 5 - failureThreshold: 30 + - containerPort: 8080 + name: metrics livenessProbe: httpGet: path: /healthz - port: 9003 - initialDelaySeconds: 10 - periodSeconds: 20 - failureThreshold: 3 + port: metrics + scheme: HTTP + initialDelaySeconds: 2 + periodSeconds: 10 readinessProbe: httpGet: path: /healthz - port: 9003 - initialDelaySeconds: 10 + port: metrics + scheme: HTTP periodSeconds: 10 + volumeMounts: + - name: config-volume + mountPath: /etc/config + readOnly: true + + - name: prometheus-server + image: "quay.io/prometheus/prometheus:v2.54.1" + imagePullPolicy: "IfNotPresent" + args: + - --storage.tsdb.retention.time=3d + - --config.file=/etc/config/prometheus.yml + - --storage.tsdb.path=/data + - --web.console.libraries=/etc/prometheus/console_libraries + - --web.console.templates=/etc/prometheus/consoles + - --web.enable-lifecycle + - --web.route-prefix=/prometheus + - --web.external-url=/prometheus/ + ports: + - containerPort: 9090 + readinessProbe: + httpGet: + path: /prometheus/-/ready + port: 9090 + scheme: HTTP + initialDelaySeconds: 30 + periodSeconds: 5 + timeoutSeconds: 4 failureThreshold: 3 - env: - - name: LOG_LEVEL - value: info - - name: CUSTOM_COST_ENABLED - value: "false" - - name: KUBECOST_NAMESPACE - value: union - - name: API_PORT - value: "9003" - - name: PROMETHEUS_SERVER_ENDPOINT - value: "http://union-operator-prometheus.union.svc:80/prometheus" - - name: CLUSTER_ID - value: "default-cluster" - - name: DATA_RETENTION_DAILY_RESOLUTION_DAYS - value: "15" - - name: CLOUD_COST_ENABLED - value: "false" - - name: CLOUD_COST_MONTH_TO_DATE_INTERVAL - value: "6" - - name: CLOUD_COST_REFRESH_RATE_HOURS - value: "6" - - name: 
CLOUD_COST_QUERY_WINDOW_DAYS - value: "7" - - name: CLOUD_COST_RUN_WINDOW_DAYS - value: "3" - # Add any additional provided variables + successThreshold: 1 + livenessProbe: + httpGet: + path: /prometheus/-/healthy + port: 9090 + scheme: HTTP + initialDelaySeconds: 30 + periodSeconds: 15 + timeoutSeconds: 10 + failureThreshold: 3 + successThreshold: 1 + resources: + limits: + cpu: "3" + memory: 3500Mi + requests: + cpu: "1" + memory: 1Gi + volumeMounts: + - name: config-volume + mountPath: /etc/config + - name: storage-volume + mountPath: /data + subPath: "" + - name: server-recording-rules + mountPath: /etc/config/recording + subPath: + readOnly: + dnsPolicy: ClusterFirst + securityContext: + fsGroup: 65534 + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + terminationGracePeriodSeconds: 300 + volumes: + - name: config-volume + configMap: + name: union-operator-prometheus + - name: server-recording-rules + configMap: + name: union-recording-rules + - name: storage-volume + emptyDir: + sizeLimit: 10Gi --- -# Source: dataplane/templates/clusterresourcesync/deployment.yaml +# Source: dataplane/templates/flyteconnector/deployment.yaml apiVersion: apps/v1 kind: Deployment metadata: - name: union-syncresources + name: flyteconnector namespace: union - labels: - app.kubernetes.io/name: clusterresourcesync + labels: + app.kubernetes.io/name: flyteconnector app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/name: clusterresourcesync - app.kubernetes.io/instance: release-name - template: - metadata: - annotations: - configChecksum: "7ff0169854ce83fe5e5cb0ec550944a512e1ea0ebc47177b32a5bf3f7fadf9f" - - labels: - platform.union.ai/zone: "dataplane" - - app.kubernetes.io/name: clusterresourcesync - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm - spec: 
- containers: - - command: - - clusterresource - - --config - - /etc/flyte/config/*.yaml - - clusterresource - - run - env: - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: GOMEMLIMIT - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.memory - - name: GOMAXPROCS - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.cpu - - name: CLUSTER_NAME - valueFrom: - secretKeyRef: - name: operator-cluster-name - key: cluster_name - - name: DEPLOYMENT_NAME - value: operator - - name: PROXY_SERVICE_URL - value: http://union-operator-proxy:8080 - - name: PROMETHEUS_SERVICE_URL - value: http://union-operator-prometheus:80 - - name: KNATIVE_PROXY_SERVICE_URL - value: http://kourier-internal - image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" + replicas: 2 + selector: + matchLabels: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + template: + metadata: + annotations: + labels: + platform.union.ai/zone: "dataplane" + + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + app.kubernetes.io/managed-by: Helm + spec: + containers: + - command: + - c0 + image: "ghcr.io/flyteorg/flyte-connectors:py3.13-2.0.0b50.dev3-g695bb1db3.d20260122" imagePullPolicy: "IfNotPresent" - name: sync-cluster-resources + name: flyteconnector + readinessProbe: + failureThreshold: 3 + httpGet: + path: / + port: 9090 + scheme: HTTP + initialDelaySeconds: 5 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 5 + env: + - name: AIRFLOW_CONN_GOOGLE_CLOUD_DEFAULT + value: "google-cloud-platform://" + - name: LOG_LEVEL + value: "20" + ports: + - containerPort: 8000 + name: grpc + - containerPort: 9090 + name: metric resources: limits: - cpu: "1" - memory: 500Mi + cpu: "1.5" + ephemeral-storage: 100Mi + memory: 1500Mi requests: - cpu: 500m - memory: 100Mi - volumeMounts: - - name: auth - mountPath: 
/etc/union/secret - - name: resource-templates - mountPath: /etc/flyte/clusterresource/templates - - name: config-volume - mountPath: /etc/flyte/config - ports: - - name: debug - containerPort: 10254 - protocol: TCP - serviceAccountName: union-clustersync-system - volumes: - - configMap: - name: union-clusterresource-template - name: resource-templates - - configMap: - name: union-clusterresourcesync-config - name: config-volume - - name: auth - secret: - secretName: union-secret-auth + cpu: "1" + ephemeral-storage: 100Mi + memory: 1000Mi + serviceAccountName: flyteconnector --- # Source: dataplane/templates/imagebuilder/deployment.yaml apiVersion: apps/v1 @@ -5498,17 +5920,15 @@ spec: app.kubernetes.io/instance: release-name template: metadata: - annotations: - container.apparmor.security.beta.kubernetes.io/buildkit: unconfined labels: platform.union.ai/zone: "dataplane" app.kubernetes.io/name: imagebuilder-buildkit app.kubernetes.io/instance: release-name spec: - serviceAccountName: "union-imagebuilder" + serviceAccountName: "union-system" containers: - name: "buildkit" - image: "docker.io/moby/buildkit:buildx-stable-1-rootless" + image: "docker.io/moby/buildkit:buildx-stable-1" imagePullPolicy: IfNotPresent env: - name: POD_NAME @@ -5543,18 +5963,15 @@ spec: - name: KNATIVE_PROXY_SERVICE_URL value: http://kourier-internal volumeMounts: - - mountPath: /home/user/.local/share/buildkit - name: buildkitd - mountPath: /etc/buildkit name: buildkit-config args: - --config - /etc/buildkit/buildkitd.toml - --addr - - unix:///run/user/1000/buildkit/buildkitd.sock + - unix:///run//buildkit/buildkitd.sock - --addr - tcp://0.0.0.0:1234 - - --oci-worker-no-process-sandbox ports: - name: tcp containerPort: 1234 @@ -5576,18 +5993,13 @@ spec: initialDelaySeconds: 5 periodSeconds: 30 securityContext: - seccompProfile: # Needs Kubernetes >= 1.19 - type: Unconfined - runAsUser: 1000 - runAsGroup: 1000 + privileged: true resources: requests: - cpu: 1 - ephemeral-storage: 20Gi - 
memory: 1Gi + cpu: 4 + ephemeral-storage: 50Gi + memory: 4Gi volumes: - - name: buildkitd - emptyDir: {} - configMap: name: union-operator-buildkit name: buildkit-config @@ -5616,16 +6028,18 @@ spec: template: metadata: annotations: - configChecksum: "807825e2bc1d1dc69164fbea82af93461fcf79e35ad2c0929e03df2f1e14935" + configChecksum: "841817ea8873e0592d093d9f564f22eb6a316a7cac8d611afc2582a4856dc7a" labels: platform.union.ai/zone: "dataplane" app: executor + app.kubernetes.io/instance: 'release-name' + app.kubernetes.io/name: executor spec: securityContext: fsGroup: 1337 - serviceAccountName: executor + serviceAccountName: union-system volumes: - name: config-volume configMap: @@ -5641,7 +6055,7 @@ spec: image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" imagePullPolicy: IfNotPresent command: - - executor + - executorv2 - serve - --config - /etc/config/*.yaml @@ -5719,7 +6133,7 @@ spec: template: metadata: annotations: - configChecksum: "0d3d3093adf1cf4b2bb08dfec3df8021cea0b4660395075f002f399062a3485" + configChecksum: "1cbf9bb44767576a0ce2cfe30facbef94a1d54963db0d92bea7b5e1c1df350d" labels: @@ -5734,12 +6148,10 @@ spec: sources: - configMap: name: union-operator - - configMap: - name: union-clusterresourcesync-config - name: secret-volume secret: secretName: union-secret-auth - serviceAccountName: proxy-system + serviceAccountName: union-system securityContext: {} containers: @@ -5856,7 +6268,7 @@ spec: template: metadata: annotations: - configChecksum: "0d3d3093adf1cf4b2bb08dfec3df8021cea0b4660395075f002f399062a3485" + configChecksum: "1cbf9bb44767576a0ce2cfe30facbef94a1d54963db0d92bea7b5e1c1df350d" labels: @@ -5865,7 +6277,7 @@ spec: platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm spec: - serviceAccountName: operator-system + serviceAccountName: union-system securityContext: {} volumes: @@ -5946,81 +6358,16 @@ spec: containerPort: 10254 protocol: TCP --- -# Source: dataplane/templates/prometheus/deployment.yaml -apiVersion: 
apps/v1 -kind: Deployment -metadata: - name: union-operator-prometheus - namespace: union - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus -spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/component: prometheus - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - template: - metadata: - annotations: - configChecksum: "72a39ca838f02fc499675b5708fa77e2a4938e316a99315b59e33ef7f2d31ae" - labels: - app.kubernetes.io/component: prometheus - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - spec: - priorityClassName: system-cluster-critical - serviceAccountName: union-operator-prometheus - securityContext: - fsGroup: 65534 - runAsNonRoot: true - runAsUser: 65534 - fsGroupChangePolicy: OnRootMismatch - containers: - - name: prometheus - image: "prom/prometheus:v3.3.1" - args: - - --config.file=/etc/prometheus/prometheus.yml - - --web.external-url=/prometheus/ - - --web.route-prefix=/prometheus/ - - --storage.tsdb.retention.time=3d - ports: - - name: http - containerPort: 9090 - protocol: TCP - resources: - limits: - cpu: "3" - memory: 3500Mi - requests: - cpu: "1" - memory: 1Gi - securityContext: - allowPrivilegeEscalation: false - terminationMessagePolicy: FallbackToLogsOnError - volumeMounts: - - mountPath: /etc/prometheus - name: prometheus-config - volumes: - - name: prometheus-config - configMap: - name: union-operator-prometheus ---- -# Source: dataplane/templates/propeller/deployment-webhook.yaml +# Source: dataplane/templates/webhook/deployment.yaml +# Webhook deployment # Create the actual deployment apiVersion: apps/v1 kind: Deployment metadata: - name: flytepropeller-webhook + name: union-pod-webhook namespace: union labels: - app.kubernetes.io/name: 
flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm @@ -6028,19 +6375,19 @@ spec: replicas: 1 selector: matchLabels: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name template: metadata: labels: platform.union.ai/zone: "dataplane" - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm annotations: - configChecksum: "f99369e37d4a20dded730f6900a334aaa4b1b1c3e62f534e17d7967785c62a8" + configChecksum: "77958aee0464eee4cb43c70036cf5ffa89f47f58c908845b9eeaecc7e32bba4" spec: securityContext: @@ -6048,64 +6395,8 @@ spec: fsGroupChangePolicy: Always runAsNonRoot: true runAsUser: 1001 - seLinuxOptions: - type: spc_t - serviceAccountName: flytepropeller-webhook-system - initContainers: - - name: generate-secrets - image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" - imagePullPolicy: "IfNotPresent" - command: - - flytepropeller - args: - - webhook - - init-certs - - --config - - /etc/flyte/config/*.yaml - env: - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: GOMEMLIMIT - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.memory - - name: GOMAXPROCS - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.cpu - - name: CLUSTER_NAME - valueFrom: - secretKeyRef: - name: operator-cluster-name - key: cluster_name - - name: DEPLOYMENT_NAME - value: operator - - name: PROXY_SERVICE_URL - value: http://union-operator-proxy:8080 - - name: PROMETHEUS_SERVICE_URL - value: http://union-operator-prometheus:80 - - name: KNATIVE_PROXY_SERVICE_URL - value: http://kourier-internal - volumeMounts: - 
- name: config-volume - mountPath: /etc/flyte/config - resources: - limits: - cpu: 1 - ephemeral-storage: 500Mi - memory: 500Mi - requests: - cpu: 200m - ephemeral-storage: 500Mi - memory: 500Mi + seLinuxOptions: null + serviceAccountName: union-system containers: - name: webhook image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" @@ -6170,107 +6461,301 @@ spec: volumes: - name: config-volume configMap: - name: flyte-propeller-config + name: union-pod-webhook-config - name: webhook-certs secret: - secretName: flyte-pod-webhook + secretName: union-pod-webhook --- -# Source: dataplane/templates/propeller/deployment.yaml -apiVersion: apps/v1 -kind: Deployment +# Source: dataplane/templates/flyteconnector/hpa.yaml +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler metadata: - namespace: union - name: flytepropeller + name: flyteconnector + labels: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + app.kubernetes.io/managed-by: Helm +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: flyteconnector + minReplicas: 2 + maxReplicas: 5 + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 80 + - type: Resource + resource: + name: memory + target: + type: Utilization + averageUtilization: 80 +--- +# Source: dataplane/templates/webhook/mutatingwebhookconfiguration.yaml +apiVersion: admissionregistration.k8s.io/v1 +kind: MutatingWebhookConfiguration +metadata: + name: union-pod-webhook- labels: - app.kubernetes.io/name: flytepropeller + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm +webhooks: + - name: union-pod-webhook.flyte.org + admissionReviewVersions: + - v1 + - v1beta1 + clientConfig: + caBundle: 
LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUROekNDQWgrZ0F3SUJBZ0lVSm9kQ09lem4vd1BHQTZjYkw1U3duSlpYT2hJd0RRWUpLb1pJaHZjTkFRRUwKQlFBd0t6RXBNQ2NHQTFVRUF3d2dabXg1ZEdWd2NtOXdaV3hzWlhJdGQyVmlhRzl2YXk1MWJtbHZiaTV6ZG1NdwpIaGNOTWpZd01USXpNREl6TmpVMVdoY05Nell3TVRJeE1ESXpOalUxV2pBck1Ta3dKd1lEVlFRRERDQm1iSGwwClpYQnliM0JsYkd4bGNpMTNaV0pvYjI5ckxuVnVhVzl1TG5OMll6Q0NBU0l3RFFZSktvWklodmNOQVFFQkJRQUQKZ2dFUEFEQ0NBUW9DZ2dFQkFJcjBSWDRGRjFNVTNYdFBmUHErMDUrMGI1Qm0xK3hsTVR5ZUp4bG5mWFJuRlgyegpPcHV6NmsreUpBQk4xRkxmNDVYRnlJclFlaW5DWWJtckZXalFRc3ZDWEloNkpPNGMwdmRMVVU5RERYcldWRFlsCkRNZytTUDlJVnZBMm5USkVFZ2RFREVsKzNWeThUbUJoNlNmcGthcXdHajY5SmhIdy9OeE9yZzM2bUY3TU5VZ1MKWXJVUkxqUFFzTUN6NElTVzhhQnEzS2IwRUFWUTA4V1lZQ0ZEbjY1aHBzVFdxR05rL3BKRnVTYjlsZll6RVdyYwpJeW5wNDhqTnIvNUdKS3NnWG9LeTUwZVo4UkppeTVyOUhENGtnSW15TVNQR2RDMXFZSVppRHRlZW8yOElxNk5nCmd1dWcyTGhlNndYYW9YZXFCQ2d3SHpwWFVXM1hJVzI5eFoxUkFRRUNBd0VBQWFOVE1GRXdIUVlEVlIwT0JCWUUKRktITDQ1RWhQaVg5bnlRN241QUk0YUZVNFlCL01COEdBMVVkSXdRWU1CYUFGS0hMNDVFaFBpWDlueVE3bjVBSQo0YUZVNFlCL01BOEdBMVVkRXdFQi93UUZNQU1CQWY4d0RRWUpLb1pJaHZjTkFRRUxCUUFEZ2dFQkFEeTRac21XCkNjZ3B5TmI4cU1ydzNYclBNejZLMTRlTVhqQVk1bERFYXM1NklLZExoZFM1bi9sWk95bENuUUtOZ3RHS0VrY3UKT05tMEE2Z0xNT3UzcE43dG9aNXh0WktOcnUvS0x6dm44QUNNNndCbUR0bG5oRERDa3ZUTHBVRzh6UTZSNmlaegpzTWlhVmpLSUJiWkJsZXNYSHZZQnQ2YlRTN2o1QWM2aTdYc1RPK3ZqQ2RUQjVYdCtEZ0x2QnBJRWh2ZGJzNXhmClFtc3ZzQmwxenJnR3JXemo4d0txcXNUMzZaOVU1TzdSK1RJMlQrUDVOR25SeHNQOGJ0R0RtMGZ5SW55WlJrSkcKVTdpVFJyMlZRWlRyaTNZQ2dlVjN1UjFtMnJjc2hyU1Avb082T0QyTXR5Q1FMRWtFK1pxVUVJVnpSYmVaNXl2RQptM29YMFV2OUtBUy9pWTA9Ci0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K + service: + name: union-pod-webhook + namespace: union + path: /mutate--v1-pod/secrets + port: 443 + failurePolicy: Fail + matchPolicy: Equivalent + namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: union + objectSelector: + matchLabels: + inject-flyte-secrets: "true" + organization: '' + reinvocationPolicy: Never + rules: + - apiGroups: + - '*' + apiVersions: + - v1 + operations: + - CREATE + resources: + - 
pods + scope: '*' + sideEffects: NoneOnDryRun + timeoutSeconds: 30 +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Imported via https://github.com/knative/operator/releases/download/knative-v1.16.0/operator.yaml +# Copyright 2024 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +# Source: dataplane/templates/propeller/serviceaccount.yaml +--- +--- +# Source: dataplane/templates/serving/knative-serving.yaml +apiVersion: operator.knative.dev/v1beta1 +kind: KnativeServing +metadata: + name: union-operator-serving + namespace: union + labels: + helm.sh/chart: dataplane-2026.4.4 + app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: serving spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/name: flytepropeller + config: + deployment: + progress-deadline: "30m" + queue-sidecar-cpu-request: "25m" + queue-sidecar-cpu-limit: "1000m" + queue-sidecar-memory-request: "400Mi" + queue-sidecar-memory-limit: "800Mi" + queue-sidecar-ephemeral-storage-request: "512Mi" + queue-sidecar-ephemeral-storage-limit: "1024Mi" + registries-skipping-tag-resolving: managed.cr.union.ai + features: + kubernetes.podspec-affinity: "enabled" + kubernetes.podspec-nodeselector: "enabled" + kubernetes.podspec-tolerations: "enabled" + kubernetes.podspec-fieldref: "enabled" + 
kubernetes.podspec-dnspolicy: "enabled" + kubernetes.podspec-schedulername: "enabled" + kubernetes.podspec-securitycontext: "enabled" + network: + ingress-class: "kourier.ingress.networking.knative.dev" + high-availability: + replicas: 2 + ingress: + kourier: + enabled: true + bootstrap-configmap: "union-operator-serving-envoy-bootstrap" + service-type: ClusterIP + podDisruptionBudgets: + - name: 3scale-kourier-gateway-pdb + minAvailable: 50% + - name: activator-pdb + minAvailable: 50% + - name: webhook-pdb + minAvailable: 50% + registry: + override: + # TODO(jeev): Wire up Union fork of Envoy + 3scale-kourier-gateway/kourier-gateway: ghcr.io/unionai/envoy:456fed84d4ad9a9dfb186d117d9362e9dc0f7c1f + # TODO(jeev): Wire up Union fork of Kourier + net-kourier-controller/controller: ghcr.io/unionai/kourier@sha256:5804c348d15b3959604e3e3ceed216c3a1c7b32cbe254c7d3eb02a35e62ba9c4 + workloads: + - name: 3scale-kourier-gateway + labels: + helm.sh/chart: dataplane-2026.4.4 + app.kubernetes.io/name: release-name-dataplane app.kubernetes.io/instance: release-name - template: - metadata: - annotations: - configChecksum: "f99369e37d4a20dded730f6900a334aaa4b1b1c3e62f534e17d7967785c62a8" - - labels: - platform.union.ai/zone: "dataplane" - - - app.kubernetes.io/name: flytepropeller - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm - spec: - priorityClassName: system-cluster-critical - containers: - - command: - - flytepropeller - - --config - - /etc/flyte/config/*.yaml - - --propeller.cluster-id - - '' - env: - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: GOMEMLIMIT - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.memory - - name: GOMAXPROCS - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.cpu - - name: CLUSTER_NAME - valueFrom: - secretKeyRef: - name: 
operator-cluster-name - key: cluster_name - - name: DEPLOYMENT_NAME - value: operator - - name: PROXY_SERVICE_URL - value: http://union-operator-proxy:8080 - - name: PROMETHEUS_SERVICE_URL - value: http://union-operator-prometheus:80 - - name: KNATIVE_PROXY_SERVICE_URL - value: http://kourier-internal - image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" - imagePullPolicy: "IfNotPresent" - name: flytepropeller - ports: - - containerPort: 10254 - resources: - limits: - cpu: "3" - memory: 3Gi - requests: - cpu: "1" - memory: 1Gi - volumeMounts: - - name: config-volume - mountPath: /etc/flyte/config - - name: auth - mountPath: /etc/union/secret - serviceAccountName: flytepropeller-system - volumes: - - configMap: - name: flyte-propeller-config - name: config-volume - - name: auth - secret: - secretName: union-secret-auth + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: 3scale-kourier-gateway + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchLabels: + app.kubernetes.io/name: 3scale-kourier-gateway + app.kubernetes.io/instance: release-name + topologyKey: topology.kubernetes.io/zone + annotations: + checksum/bootstrap-config: 6469f6f592eebc9e0c676d8ccc359a05bc799c0988e474b2da942c4cc328f656 + env: + - container: kourier-gateway + envVars: + - name: UNION_AUTHZ_TENANTAUTHURL + value: "https:///me" + - name: UNION_AUTHZ_TENANTAUTHSIGNINURL + value: "https:///login" + - name: UNION_AUTHZ_TENANTCONTROLPLANEURL + value: "https://" + resources: + - container: kourier-gateway + limits: + cpu: "2" + memory: 2Gi + requests: + cpu: "1" + memory: 1Gi + - name: activator + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - activator + topologyKey: topology.kubernetes.io/zone + - name: 
autoscaler + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - autoscaler + topologyKey: topology.kubernetes.io/zone + - name: autoscaler-hpa + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - autoscaler-hpa + topologyKey: topology.kubernetes.io/zone + - name: controller + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - controller + topologyKey: topology.kubernetes.io/zone + - name: net-kourier-controller + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - net-kourier-controller + topologyKey: topology.kubernetes.io/zone + env: + - container: controller + envVars: + - name: KOURIER_UNION_AUTHZ_ENABLED + value: "true" + resources: + - container: controller + limits: + cpu: "1" + memory: 1Gi + requests: + cpu: 500m + memory: 500Mi + - name: webhook + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - webhook + topologyKey: topology.kubernetes.io/zone +--- +# Source: dataplane/templates/common/task-podtemplate.yaml +apiVersion: v1 +kind: PodTemplate +metadata: + name: task-template + namespace: union +template: + spec: + serviceAccountName: union + containers: + - name: default + image: docker.io/rwgrim/docker-noop --- # Source: dataplane/charts/fluentbit/templates/tests/test-connection.yaml apiVersion: v1 diff --git a/tests/generated/dataplane.oci.yaml 
b/tests/generated/dataplane.oci.yaml index 62418af6..17e7181e 100644 --- a/tests/generated/dataplane.oci.yaml +++ b/tests/generated/dataplane.oci.yaml @@ -1,155 +1,139 @@ --- -# Source: dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: flytesnacks-development ---- -# Source: dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: flytesnacks-staging ---- -# Source: dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: flytesnacks-production ---- -# Source: dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: union-health-monitoring-development ---- -# Source: dataplane/templates/common/namespaces.yaml -apiVersion: v1 -kind: Namespace -metadata: - name: union-health-monitoring-staging ---- -# Source: dataplane/templates/common/namespaces.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ apiVersion: v1 -kind: Namespace +kind: ServiceAccount metadata: - name: union-health-monitoring-production + name: knative-operator + namespace: "union" + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator --- -# Source: dataplane/charts/fluentbit/templates/serviceaccount.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + apiVersion: v1 kind: ServiceAccount metadata: - name: fluentbit-system - namespace: union + name: operator-webhook + namespace: "union" labels: - helm.sh/chart: fluentbit-0.48.9 - app.kubernetes.io/name: fluentbit - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "3.2.8" - app.kubernetes.io/managed-by: Helm + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator --- -# Source: dataplane/charts/kube-state-metrics/templates/serviceaccount.yaml +# Source: dataplane/charts/prometheus/charts/kube-state-metrics/templates/serviceaccount.yaml apiVersion: v1 kind: ServiceAccount automountServiceAccountToken: true metadata: labels: - helm.sh/chart: kube-state-metrics-5.30.1 + helm.sh/chart: kube-state-metrics-5.25.1 app.kubernetes.io/managed-by: Helm app.kubernetes.io/component: metrics app.kubernetes.io/part-of: kube-state-metrics app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" + 
app.kubernetes.io/version: "2.13.0" name: release-name-kube-state-metrics namespace: union --- -# Source: dataplane/charts/opencost/templates/serviceaccount.yaml +# Source: dataplane/charts/prometheus/templates/serviceaccount.yaml apiVersion: v1 kind: ServiceAccount metadata: - name: release-name-opencost - namespace: union labels: - helm.sh/chart: opencost-1.42.0 - app.kubernetes.io/name: opencost + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "1.111.0" - app.kubernetes.io/part-of: opencost + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 app.kubernetes.io/managed-by: Helm -automountServiceAccountToken: true ---- -# Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: union-clustersync-system + app.kubernetes.io/part-of: prometheus + name: union-operator-prometheus namespace: union + annotations: + {} --- -# Source: dataplane/templates/imagebuilder/serviceaccount.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: union-imagebuilder ---- -# Source: dataplane/templates/nodeexecutor/serviceaccount.yaml +# Source: dataplane/templates/common/system-serviceaccount.yaml apiVersion: v1 kind: ServiceAccount metadata: - name: executor + name: union-system namespace: union - labels: - app: executor ---- -# Source: dataplane/templates/operator/serviceaccount-proxy.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: proxy-system - labels: - app.kubernetes.io/name: operator-proxy - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm + annotations: --- -# Source: dataplane/templates/operator/serviceaccount.yaml +# Source: dataplane/templates/common/union-serviceaccount.yaml apiVersion: v1 kind: ServiceAccount metadata: - name: operator-system - labels: - app.kubernetes.io/name: union-operator - 
app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm + name: union + namespace: union + annotations: + eks.amazonaws.com/role-arn: arn:aws:iam::ACCOUNT_ID:role/flyte_project_role +automountServiceAccountToken: true --- -# Source: dataplane/templates/prometheus/serviceaccount.yaml +# Source: dataplane/templates/flyteconnector/serviceaccount.yaml apiVersion: v1 kind: ServiceAccount metadata: - name: union-operator-prometheus + name: flyteconnector namespace: union - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane + labels: + app.kubernetes.io/name: flyteconnector app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus ---- -# Source: dataplane/templates/propeller/serviceaccount-webhook.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: flytepropeller-webhook-system - namespace: union --- -# Source: dataplane/templates/propeller/serviceaccount.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ apiVersion: v1 -kind: ServiceAccount +kind: Secret metadata: - name: flytepropeller-system - namespace: union + name: operator-webhook-certs + namespace: "union" + labels: + app.kubernetes.io/component: operator-webhook + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +# The data is populated at install time. --- # Source: dataplane/templates/common/auth-secret.yaml apiVersion: v1 @@ -172,151 +156,533 @@ type: Opaque data: cluster_name: dW5pb24tb2Np --- -# Source: dataplane/templates/propeller/deployment-webhook.yaml -# Create an empty secret that the first propeller pod will populate +# Source: dataplane/templates/webhook/mutatingwebhookconfiguration.yaml apiVersion: v1 kind: Secret metadata: - name: flyte-pod-webhook + name: union-pod-webhook namespace: union + labels: + app.kubernetes.io/name: union-pod-webhook + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm type: Opaque +data: + ca.crt: 
LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUROekNDQWgrZ0F3SUJBZ0lVSm9kQ09lem4vd1BHQTZjYkw1U3duSlpYT2hJd0RRWUpLb1pJaHZjTkFRRUwKQlFBd0t6RXBNQ2NHQTFVRUF3d2dabXg1ZEdWd2NtOXdaV3hzWlhJdGQyVmlhRzl2YXk1MWJtbHZiaTV6ZG1NdwpIaGNOTWpZd01USXpNREl6TmpVMVdoY05Nell3TVRJeE1ESXpOalUxV2pBck1Ta3dKd1lEVlFRRERDQm1iSGwwClpYQnliM0JsYkd4bGNpMTNaV0pvYjI5ckxuVnVhVzl1TG5OMll6Q0NBU0l3RFFZSktvWklodmNOQVFFQkJRQUQKZ2dFUEFEQ0NBUW9DZ2dFQkFJcjBSWDRGRjFNVTNYdFBmUHErMDUrMGI1Qm0xK3hsTVR5ZUp4bG5mWFJuRlgyegpPcHV6NmsreUpBQk4xRkxmNDVYRnlJclFlaW5DWWJtckZXalFRc3ZDWEloNkpPNGMwdmRMVVU5RERYcldWRFlsCkRNZytTUDlJVnZBMm5USkVFZ2RFREVsKzNWeThUbUJoNlNmcGthcXdHajY5SmhIdy9OeE9yZzM2bUY3TU5VZ1MKWXJVUkxqUFFzTUN6NElTVzhhQnEzS2IwRUFWUTA4V1lZQ0ZEbjY1aHBzVFdxR05rL3BKRnVTYjlsZll6RVdyYwpJeW5wNDhqTnIvNUdKS3NnWG9LeTUwZVo4UkppeTVyOUhENGtnSW15TVNQR2RDMXFZSVppRHRlZW8yOElxNk5nCmd1dWcyTGhlNndYYW9YZXFCQ2d3SHpwWFVXM1hJVzI5eFoxUkFRRUNBd0VBQWFOVE1GRXdIUVlEVlIwT0JCWUUKRktITDQ1RWhQaVg5bnlRN241QUk0YUZVNFlCL01COEdBMVVkSXdRWU1CYUFGS0hMNDVFaFBpWDlueVE3bjVBSQo0YUZVNFlCL01BOEdBMVVkRXdFQi93UUZNQU1CQWY4d0RRWUpLb1pJaHZjTkFRRUxCUUFEZ2dFQkFEeTRac21XCkNjZ3B5TmI4cU1ydzNYclBNejZLMTRlTVhqQVk1bERFYXM1NklLZExoZFM1bi9sWk95bENuUUtOZ3RHS0VrY3UKT05tMEE2Z0xNT3UzcE43dG9aNXh0WktOcnUvS0x6dm44QUNNNndCbUR0bG5oRERDa3ZUTHBVRzh6UTZSNmlaegpzTWlhVmpLSUJiWkJsZXNYSHZZQnQ2YlRTN2o1QWM2aTdYc1RPK3ZqQ2RUQjVYdCtEZ0x2QnBJRWh2ZGJzNXhmClFtc3ZzQmwxenJnR3JXemo4d0txcXNUMzZaOVU1TzdSK1RJMlQrUDVOR25SeHNQOGJ0R0RtMGZ5SW55WlJrSkcKVTdpVFJyMlZRWlRyaTNZQ2dlVjN1UjFtMnJjc2hyU1Avb082T0QyTXR5Q1FMRWtFK1pxVUVJVnpSYmVaNXl2RQptM29YMFV2OUtBUy9pWTA9Ci0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K + tls.crt: 
LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUQxakNDQXI2Z0F3SUJBZ0lVREtRN3A4MFpIdkQyUE5Ya3p3VGRLem9iUjg4d0RRWUpLb1pJaHZjTkFRRUwKQlFBd0t6RXBNQ2NHQTFVRUF3d2dabXg1ZEdWd2NtOXdaV3hzWlhJdGQyVmlhRzl2YXk1MWJtbHZiaTV6ZG1NdwpIaGNOTWpZd01USXpNREl6TmpVMVdoY05NamN3TVRJek1ESXpOalUxV2pBck1Ta3dKd1lEVlFRRERDQm1iSGwwClpYQnliM0JsYkd4bGNpMTNaV0pvYjI5ckxuVnVhVzl1TG5OMll6Q0NBU0l3RFFZSktvWklodmNOQVFFQkJRQUQKZ2dFUEFEQ0NBUW9DZ2dFQkFMNnMxUTRaZ3ZxVVdNdFp5VC9EKytEQWl6WmR0Q1lkVzArSGk4S3phQ1Vjbi9zZwo3a21NTEhDM2ZlZWtZK2ZUNENHUTd6K1l2WjRWWSt6WGljKzdrNndLbUZqdm1acEM1Rm96OWo1Q21NRk5HZmlWCnRobzVvSU9MUkIzNXRYMU1ROHUwNFVzT29QcjdMK016eEdxWXVmTFdtN1NVdjh0eEp3ZFMxZVlzMllNV1pMeU4KekI4akdWY01Qc2ZNVkdxdFhTTVljQ3hNYURKaTM1N0VEZSs3clM2U2ZtbnFtYjJxekdQM1BRRStCUm91Yk5nLwo4NCt4c253VXBLZUFpem95dFM2c2ZlOTdMUEtRTW9oVzd2TjZORFJrLy9xSmpHem9iZmZwUHU3endhWHVXMWtMCmllc2g2cVRrdlZLSTF3WFJ2WHArVU5GbjZzdDIwdEVlRks0ZzMwOENBd0VBQWFPQjhUQ0I3akFKQmdOVkhSTUUKQWpBQU1Bc0dBMVVkRHdRRUF3SUY0RENCa3dZRFZSMFJCSUdMTUlHSWdoWm1iSGwwWlhCeWIzQmxiR3hsY2kxMwpaV0pvYjI5cmdoeG1iSGwwWlhCeWIzQmxiR3hsY2kxM1pXSm9iMjlyTG5WdWFXOXVnaUJtYkhsMFpYQnliM0JsCmJHeGxjaTEzWldKb2IyOXJMblZ1YVc5dUxuTjJZNEl1Wm14NWRHVndjbTl3Wld4c1pYSXRkMlZpYUc5dmF5NTEKYm1sdmJpNXpkbU11WTJ4MWMzUmxjaTVzYjJOaGJEQWRCZ05WSFE0RUZnUVV1d25wcDZ1MTNKL0NsNnI3eDZ1Sgo5YmZ1RDhzd0h3WURWUjBqQkJnd0ZvQVVvY3Zqa1NFK0pmMmZKRHVma0FqaG9WVGhnSDh3RFFZSktvWklodmNOCkFRRUxCUUFEZ2dFQkFFMmlOa1MzczhGT3Z0cDhrVmJNZUN2amxIQytvYTVlZTFGNmowNlZ3bFJPOWdxOXJla1UKMGptWUhNWGNjWjcrZXJkNFRQRXgwK3ZqMzNhRTV3cDFmdVJJL2xIZHhuOUpJRHg2bTFQTHdVa3p1WG9EekYxLwozVXVIWU1xOGkrT2xsY0tqbmdVMUFUNmRWRCtIbFVWeHpMdjlNazdPb05SQUY5dTJHa1hiUHdCN0I1ZGRxdWxlCmtNcXBxbDhES2s4SVNVelVsTjNBb2Q5NmJUelIyV3RCVkxpTi9zRHZtTDZwZThFYm55RnBUb1BZTDlGeWZOcmIKT1ZPZEJwUkZFTG1KOHZaZWY5d1lxRmVXMTE3SGV5VDArZkxGZVNSemZUNVVWUlVraVFBcUVJdUhHUVRrOVFabgo1WnYvNUF0eEg3WE1rcXR0VE1zMFlSUG0xYjQzY3FBeFdhND0KLS0tLS1FTkQgQ0VSVElGSUNBVEUtLS0tLQo= + tls.key: 
LS0tLS1CRUdJTiBQUklWQVRFIEtFWS0tLS0tCk1JSUV2UUlCQURBTkJna3Foa2lHOXcwQkFRRUZBQVNDQktjd2dnU2pBZ0VBQW9JQkFRQytyTlVPR1lMNmxGakwKV2NrL3cvdmd3SXMyWGJRbUhWdFBoNHZDczJnbEhKLzdJTzVKakN4d3QzM25wR1BuMCtBaGtPOC9tTDJlRldQcwoxNG5QdTVPc0NwaFk3NW1hUXVSYU0vWStRcGpCVFJuNGxiWWFPYUNEaTBRZCtiVjlURVBMdE9GTERxRDYreS9qCk04UnFtTG55MXB1MGxML0xjU2NIVXRYbUxObURGbVM4amN3Zkl4bFhERDdIekZScXJWMGpHSEFzVEdneVl0K2UKeEEzdnU2MHVrbjVwNnBtOXFzeGo5ejBCUGdVYUxtellQL09Qc2JKOEZLU25nSXM2TXJVdXJIM3ZleXp5a0RLSQpWdTd6ZWpRMFpQLzZpWXhzNkczMzZUN3U4OEdsN2x0WkM0bnJJZXFrNUwxU2lOY0YwYjE2ZmxEUlorckxkdExSCkhoU3VJTjlQQWdNQkFBRUNnZ0VBRjV2SnMzbTRMSE9DdlFic2NwOC9DQmgvQkNlOC92MGlpYW5VUmJLMkFlWlYKN3Arb3NXV1FPUktYSGIvT2VPMVVjb09DQkFOUzh3aGQrM3pDZlB5U0w3cU9HM0RyT1Z6djdqVkIxM3FpZEVpcQpId3ZXWk0vZXpuckhYOWpEdm5SYmJwVUNVaXRKQmxwa2x3S1pYc056UHB5UTRkNkxFdEw3VEo1V2lxM2g3cWQzCjhncmJoeWlWNDhZRFAyYkhpZGt0S2QvMHBNL3piSzZPTS9iMFpLUmRBWVpHQXlOcklMaFJ3eDNsQXE2SDVqL20KUHBuNlZEdVQ3Y1h6UFhmNTgxR0IyNjQxWFJVTXRNWGVEREtaU3JudnIrbG5tV2lsYXZmNkg3ZHp4NlRBMmk0RApybCtIRVdUN1RSQUZGMnZ5SVUrYSt5RzNPVFBKRCtmVTdCWWh4Zld4aFFLQmdRRGVvcXVCbzQxK1dKU255blZnCmNWMHZoUEkwZ0dyRGQzNFdIbCtGNTZuSmxuMVVlbUQ2SUQ0TUhnNVpHWmNmd3NkRlB0aHl1VnB0OVl1SW11SGQKRHE5ZDhpWnpHUzJVbStkSWlsWjhnZXJ5Q3dON3hIbFM3SzJrc0RmRGJwYmZZSXZ1LzRnSExZRHh3YzQ0OG54ZwpWNGpCUFY2QTdYTHRqQzZzdFE2OW9uTlIwd0tCZ1FEYlFBUTRGdWpnUjJUOXh2Y01MdXhMbm1YaHE4U2NoZlM5CkRoWndDUzZIeHB1YlBxblBjQ04vQUZjbDZ1cDRXVEVZdVY4bEpwcGwwRmpYQ2lMdkhuUU15cm5vRjlUOThpanQKZGVHYjF6ZWlwRUpndFpNVHkvOGdERWorZU1wSFBuYTZwQVpMOVpRVUdLR2dDNGQ0SzljdVNHMmRlb1dSU0ViSQpMVGgzeGpXVEZRS0JnQStJRWFidG5nVmVjS0IwQTFSREZGa29VUzFRZUNKQ3g4MExPV2JDRHBvOW9XaXZVT3lpCkt6SDFOdE1JY2Y2SlBCV2NtTVVJSVVMaWltVnhTS2gvU2NTb0MvNmpsd1p6Q2VPSm94YjBpVXR4Y1VERktDR2MKMlZCUDZ0UDdkeE1HVFR6VEhzNUJZbWw3TjhQSlJ0d0J2MHliMTJmdktNRmhzaS9pUWJFQkVFSjVBb0dBUm9NSApHRmJkM0V0NXdsZzcyYkk1a25SRnhkY3RLejIxb1J1bndhNWlSWTV0T3Zkak8zQ1FLZWNkSC9lMklyQmtwdFB2Ck1vNkF0MS9UUW8xakFNNGxlbnUwWUYxUnhiNGN4WW5VM2Y3UVNNRlZDNjg1dHZNemdNWVNyNngvT1h4d1NNUTUKdGpJcnhtN0poQ1JSRkNmZFUyZzl3SmpIM2hxRmtSbGlBTHRCUGFVQ2dZ
RUF2QkRHejQvYS9kRzRJT0VpREk4SgpBbkxaRWZaTFJrdnI0ejRyT0p0TGZtWlFjTW5DSUZmKzBaelhTME9XOWhvMjk5RUNtS1l4U2VUWk1WSDgzOFZCCnRvSkNGM3lzb01rSC9meEVXTUpicDlMblJwSFUzUElicHlhRG1nWW1YdmsvK3laL3Y1c0dDTWNtVGkzWVFLRGsKcStlK1o0SHZyaGUxVytUaHpwOCtoNGs9Ci0tLS0tRU5EIFBSSVZBVEUgS0VZLS0tLS0K --- -# Source: dataplane/templates/clusterresourcesync/configmap.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2019 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ apiVersion: v1 kind: ConfigMap metadata: - name: union-clusterresourcesync-config - namespace: union + name: config-logging + namespace: "union" labels: - app.kubernetes.io/name: clusterresourcesync - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm -data: - cluster_resources.yaml: | - cluster_resources: - clusterName: 'union-oci' - customData: - - production: - - projectQuotaCpu: - value: "4096" - - projectQuotaMemory: - value: 2Ti - - projectQuotaNvidiaGpu: - value: "256" - - defaultUserRoleKey: - value: 'eks.amazonaws.com/role-arn' - - defaultUserRoleValue: - value: 'arn:aws:iam::ACCOUNT_ID:role/flyte_project_role' - - staging: - - projectQuotaCpu: - value: "4096" - - projectQuotaMemory: - value: 2Ti - - projectQuotaNvidiaGpu: - value: "256" - - defaultUserRoleKey: - value: 'eks.amazonaws.com/role-arn' - - defaultUserRoleValue: - value: 'arn:aws:iam::ACCOUNT_ID:role/flyte_project_role' - - development: - - projectQuotaCpu: - value: "4096" - - projectQuotaMemory: - value: 2Ti - - projectQuotaNvidiaGpu: - value: "256" - - defaultUserRoleKey: - value: 'eks.amazonaws.com/role-arn' - - defaultUserRoleValue: - value: 'arn:aws:iam::ACCOUNT_ID:role/flyte_project_role' - refreshInterval: 5m - standaloneDeployment: true - templatePath: /etc/flyte/clusterresource/templates - clusterResourcesPrivate: - app: - isServerless: false - union: - auth: - authorizationMetadataKey: flyte-authorization - clientId: 'clientId' - clientSecretLocation: /etc/union/secret/client_secret - tokenRefreshWindow: 5m - type: ClientSecret - connection: - host: dns:///union.us-west-2.union.ai - admin.yaml: | - admin: - clientId: 'clientId' - clientSecretLocation: /etc/union/secret/client_secret - endpoint: dns:///union.us-west-2.union.ai - insecure: false - event: - capacity: 1000 - rate: 500 - type: admin - domain.yaml: | - domains: - - id: development - name: development - - id: staging - name: staging - - id: 
production - name: production - clusters.yaml: | - clusters: - clusterConfigs: [] - labelClusterMap: {} - logger.yaml: | - logger: - formatter: - type: json - level: 4 - show-source: true + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +data: {} +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2019 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v1 +kind: ConfigMap +metadata: + name: config-observability + namespace: "union" + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +data: {} --- -# Source: dataplane/templates/clusterresourcesync/configmap.yaml +# Source: dataplane/charts/prometheus/templates/cm.yaml apiVersion: v1 kind: ConfigMap metadata: - name: union-clusterresource-template - namespace: union labels: - app.kubernetes.io/name: clusterresourcesync + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: prometheus + name: union-operator-prometheus + namespace: union data: - a_namespace.yaml: | - apiVersion: v1 - kind: Namespace - metadata: - name: {{ namespace }} - labels: - union.ai/namespace-type: flyte - spec: - finalizers: - - kubernetes - - 
b_default_service_account.yaml: | - apiVersion: v1 - kind: ServiceAccount - metadata: - name: default - namespace: {{ namespace }} - annotations: - {{ defaultUserRoleKey }}: {{ defaultUserRoleValue }} + allow-snippet-annotations: "false" + alerting_rules.yml: | + {} + alerts: | + {} + prometheus.yml: | + global: + evaluation_interval: 1m + scrape_interval: 1m + scrape_timeout: 10s + rule_files: + - /etc/config/recording_rules.yml + - /etc/config/alerting_rules.yml + - /etc/config/rules + - /etc/config/alerts + scrape_configs: + - job_name: prometheus + static_configs: + - targets: + - localhost:9090 + - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + job_name: kubernetes-apiservers + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - action: keep + regex: default;kubernetes;https + source_labels: + - __meta_kubernetes_namespace + - __meta_kubernetes_service_name + - __meta_kubernetes_endpoint_port_name + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + job_name: kubernetes-nodes + kubernetes_sd_configs: + - role: node + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - replacement: kubernetes.default.svc:443 + target_label: __address__ + - regex: (.+) + replacement: /api/v1/nodes/$1/proxy/metrics + source_labels: + - __meta_kubernetes_node_name + target_label: __metrics_path__ + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + job_name: kubernetes-nodes-cadvisor + kubernetes_sd_configs: + - role: node + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - replacement: kubernetes.default.svc:443 + target_label: __address__ + - regex: (.+) + replacement: 
/api/v1/nodes/$1/proxy/metrics/cadvisor + source_labels: + - __meta_kubernetes_node_name + target_label: __metrics_path__ + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + - honor_labels: true + job_name: kubernetes-service-endpoints + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scrape + - action: drop + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (.+?)(?::\d+)?;(\d+) + replacement: $1:$2 + source_labels: + - __address__ + - __meta_kubernetes_service_annotation_prometheus_io_port + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_service_name + target_label: service + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: node + - honor_labels: true + job_name: kubernetes-service-endpoints-slow + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) 
+ source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (.+?)(?::\d+)?;(\d+) + replacement: $1:$2 + source_labels: + - __address__ + - __meta_kubernetes_service_annotation_prometheus_io_port + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_service_name + target_label: service + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: node + scrape_interval: 5m + scrape_timeout: 30s + - honor_labels: true + job_name: prometheus-pushgateway + kubernetes_sd_configs: + - role: service + relabel_configs: + - action: keep + regex: pushgateway + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_probe + - honor_labels: true + job_name: kubernetes-services + kubernetes_sd_configs: + - role: service + metrics_path: /probe + params: + module: + - http_2xx + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_service_annotation_prometheus_io_probe + - source_labels: + - __address__ + target_label: __param_target + - replacement: blackbox + target_label: __address__ + - source_labels: + - __param_target + target_label: instance + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - source_labels: + - __meta_kubernetes_service_name + target_label: service + - honor_labels: true + job_name: kubernetes-pods + kubernetes_sd_configs: + - role: pod + relabel_configs: + - action: 
keep + regex: true + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scrape + - action: drop + regex: true + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) + replacement: '[$2]:$1' + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: replace + regex: (\d+);((([0-9]+?)(\.|$)){4}) + replacement: $2:$1 + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_pod_name + target_label: pod + - action: drop + regex: Pending|Succeeded|Failed|Completed + source_labels: + - __meta_kubernetes_pod_phase + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: node + - honor_labels: true + job_name: kubernetes-pods-slow + kubernetes_sd_configs: + - role: pod + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) 
+ source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) + replacement: '[$2]:$1' + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: replace + regex: (\d+);((([0-9]+?)(\.|$)){4}) + replacement: $2:$1 + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_pod_name + target_label: pod + - action: drop + regex: Pending|Succeeded|Failed|Completed + source_labels: + - __meta_kubernetes_pod_phase + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: node + scrape_interval: 5m + scrape_timeout: 30s + - job_name: 'kube-state-metrics' + metrics_path: /metrics + scrape_interval: 1m + honor_labels: true + kubernetes_sd_configs: + - role: endpoints + namespaces: + own_namespace: true + relabel_configs: + - source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_name] + regex: kube-state-metrics + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + regex: http + action: keep + metric_relabel_configs: + - source_labels: [__name__] + regex: 
"kube_pod_container_resource_(limits|requests)|kube_pod_status_phase|kube_node_(labels|status_allocatable|status_condition|status_capacity)|kube_namespace_labels|kube_pod_container_status_(waiting|terminated|last_terminated).*_reason|kube_daemonset_status_number_unavailable|kube_deployment_status_replicas_unavailable|kube_resourcequota|kube_pod_info|kube_node_info|kube_pod_container_status_restarts_total" + action: keep + - source_labels: [__name__, phase] + separator: ";" + regex: "kube_pod_status_phase;(Succeeded|Failed)" + action: drop + - source_labels: [node] + target_label: nodename + regex: "(.*)" + action: replace + - source_labels: [label_node_group_name] + action: replace + regex: "(.+)" + target_label: label_node_pool_name + - job_name: 'opencost' + metrics_path: /metrics + scrape_interval: 1m + honor_labels: true + kubernetes_sd_configs: + - role: endpoints + namespaces: + own_namespace: true + relabel_configs: + - source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_name] + regex: opencost + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + regex: http + action: keep + metric_relabel_configs: + - source_labels: [__name__] + regex: "kube_node_labels|kube_pod_labels|node_total_hourly_cost|node_ram_hourly_cost|node_cpu_hourly_cost|node_gpu_hourly_cost" + action: keep + - job_name: 'union-services' + metrics_path: /metrics + scrape_interval: 1m + honor_labels: true + kubernetes_sd_configs: + - role: endpoints + namespaces: + own_namespace: true + relabel_configs: + - source_labels: [__meta_kubernetes_service_label_platform_union_ai_service_group] + regex: .+ + action: keep + - source_labels: [__meta_kubernetes_endpoint_port_name] + regex: debug + action: keep + # Flyte propeller metrics for execution info and fast task duration. + # No-op when propeller is not deployed (no pods match the label selector). 
+ - job_name: 'flytepropeller' + kubernetes_sd_configs: + - role: pod + namespaces: + own_namespace: true + selectors: + - role: pod + label: app.kubernetes.io/name=flytepropeller + metric_relabel_configs: + - source_labels: [__name__] + regex: "flyte:propeller:all:round:execution_info|flyte:propeller:all:node:fast_task:fast_task_execution_duration" + action: keep + relabel_configs: + - source_labels: [__meta_kubernetes_pod_container_name] + regex: flytepropeller + action: keep + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: pod + # Kourier gateway (envoy) metrics for app serving. + # No-op when serving is not enabled (no pods match the label selector). + - job_name: 'serving-envoy' + kubernetes_sd_configs: + - role: pod + namespaces: + own_namespace: true + selectors: + - role: pod + label: app=3scale-kourier-gateway + metrics_path: /stats/prometheus + metric_relabel_configs: + - source_labels: [__name__] + regex: "envoy_cluster_upstream_rq_xx|envoy_cluster_upstream_rq_time_bucket" + action: keep + relabel_configs: + - source_labels: [__meta_kubernetes_pod_container_port_name] + regex: metrics + action: keep + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: pod + # DCGM GPU metrics. + # No-op when dcgm-exporter is not deployed (no pods match the label selector). 
+ - job_name: 'gpu-metrics' + kubernetes_sd_configs: + - role: pod + namespaces: + names: + - kube-system + selectors: + - role: pod + label: app.kubernetes.io/name=dcgm-exporter - c_project_resource_quota.yaml: | - apiVersion: v1 - kind: ResourceQuota - metadata: - name: project-quota - namespace: {{ namespace }} - spec: - hard: - limits.cpu: {{ projectQuotaCpu }} - limits.memory: {{ projectQuotaMemory }} - requests.nvidia.com/gpu: {{ projectQuotaNvidiaGpu }} + recording_rules.yml: | + {} + rules: | + {} --- # Source: dataplane/templates/fluent-bit/configmap.yaml apiVersion: v1 @@ -365,6 +731,40 @@ data: bucket bucket endpoint https://xxxxxxxxxxx.compat.objectstorage.us-ashburn-1.oraclecloud.com --- +# Source: dataplane/templates/imagebuilder/build-image-configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: build-image-config + labels: + app.kubernetes.io/name: union-operator + app.kubernetes.io/instance: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +data: + storage.yaml: | + storage: + container: "bucket" + type: stow + stow: + kind: s3 + config: + auth_type: accesskey + access_key_id: accessKey + secret_key: secretKey + disable_ssl: false + endpoint: https://xxxxxxxxxxx.compat.objectstorage.us-ashburn-1.oraclecloud.com + region: us-ashburn-1 + enable-multicontainer: false + limits: + maxDownloadMBs: 1024 + cache: + max_size_mbs: 0 + target_gc_percent: 70 + buildkit-uri: "tcp://union-operator-buildkit.union.svc.cluster.local:1234" + default-repository: "union-dataplane" + authentication-type: "noop" +--- # Source: dataplane/templates/imagebuilder/configmap.yaml apiVersion: v1 kind: ConfigMap @@ -2521,6 +2921,18 @@ data: "version": 1 } --- +# Source: dataplane/templates/monitoring/prometheusrule.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: union-recording-rules + namespace: union + labels: + release: release-name +data: + recording_rules.yml: | + groups: [] +--- # Source: 
dataplane/templates/nodeexecutor/configmap.yaml apiVersion: v1 kind: ConfigMap @@ -2575,8 +2987,13 @@ data: templateUris: - https://scale.neptune.ai/{{ .taskConfig.project }}/-/run/?customId={{ .taskConfig.id }} - kubernetes-enabled: true + kubernetes-enabled: false enabled_plugins.yaml: | + plugins: + connector-service: + defaultConnector: + defaultServiceConfig: '{"loadBalancingConfig": [{"round_robin":{}}]}' + endpoint: k8s:///flyteconnector.union:8000 tasks: task-plugins: default-for-task-types: @@ -2609,6 +3026,8 @@ data: cpu: 4096 gpu: 256 memory: 2Ti + namespace_mapping: + template: union union: connection: host: dns:///union.us-west-2.union.ai @@ -2629,7 +3048,7 @@ data: cache-endpoint: dns:///union.us-west-2.union.ai endpoint: dns:///union.us-west-2.union.ai insecure: false - type: fallback + type: cacheservicev2 use-admin-auth: true logger: formatter: @@ -2649,6 +3068,7 @@ data: secure: false useAuth: false propeller: + limit-namespace: union node-config: disable-input-file-writes: true plugins: @@ -2656,7 +3076,7 @@ data: additional-worker-args: - --last-ack-grace-period-seconds - "120" - callback-uri: http://unionai-dataplane-executor.union.svc.cluster.local:15605 + callback-uri: http://union-operator-executor.union.svc.cluster.local:15605 grace-period-status-not-found: 2m ioutils: remoteFileOutputPaths: @@ -2674,6 +3094,7 @@ data: - MORE: foo - SOME_CONFIG_BOOL: "true" default-memory: 100Mi + default-pod-template-name: task-template co-pilot: image: 'cr.flyte.org/flyteorg/flytecopilot:v1.14.1' name: flyte-copilot- @@ -2722,6 +3143,7 @@ data: - MORE: foo - SOME_CONFIG_BOOL: "true" default-memory: 100Mi + default-pod-template-name: task-template config.yaml: | union: connection: @@ -2741,11 +3163,14 @@ data: operator: enabled: true enableTunnelService: true + # enableDepot: false tunnel: - enableDirectToAppIngress: false + enableDirectToAppIngress: true deploymentToRestart: union-operator-proxy + limitNamespace: union + disableClusterPermissions: true 
apps: - enabled: 'false' + enabled: 'true' syncClusterConfig: enabled: false clusterId: @@ -2760,19 +3185,19 @@ data: userRole: 'arn:aws:iam::ACCOUNT_ID:role/flyte_project_role' userRoleKey: 'eks.amazonaws.com/role-arn' collectUsages: - enabled: true + enabled: false billing: model: Legacy dependenciesHeartbeat: - prometheus: - endpoint: 'http://union-operator-prometheus:80/-/healthy' - propeller: - endpoint: 'http://flytepropeller:10254' + executor: + endpoint: 'http://union-operator-executor:10254' proxy: endpoint: 'http://union-operator-proxy:10254' secretsWatcher: dryRun: true enabled: false + org: + namespaceTemplate: union imageBuilder: enabled: true executionNamespaceLabels: @@ -2780,9 +3205,6 @@ data: referenceConfigmapName: union-operator targetConfigMapName: "build-image-config" proxy: - imageBuilderConfig: - authenticationType: 'noop' - defaultRepository: '' persistedLogs: objectStore: pathTemplate: namespace-{{.KubernetesNamespace}}.pod-{{.KubernetesPodName}}.cont-{{.KubernetesContainerName}} @@ -2836,989 +3258,149 @@ data: endpoint: https://xxxxxxxxxxx.compat.objectstorage.us-ashburn-1.oraclecloud.com region: us-ashburn-1 image-builder.buildkit-uri: "tcp://union-operator-buildkit.union.svc.cluster.local:1234" - image-builder.default-repository: "" + image-builder.default-repository: "union-dataplane" image-builder.authentication-type: "noop" --- -# Source: dataplane/templates/prometheus/configmap.yaml +# Source: dataplane/templates/serving/bootstrap-configmap.yaml +# We need to copy the default configmap here since knative-operator does not automatically update the +# address of the `net-kourier-controller` endpoint to use the release namespace. It assumes that the +# resources are being installed into the `knative-serving` namespace instead. 
apiVersion: v1 kind: ConfigMap metadata: - name: union-operator-prometheus + name: union-operator-serving-envoy-bootstrap namespace: union - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus data: - prometheus.yml: | - global: - scrape_interval: 15s - evaluation_interval: 15s - alerting: - alertmanagers: - - static_configs: - - targets: - rule_files: - - rules.yml - scrape_configs: - # Self-monitoring - - job_name: prometheus - metrics_path: /prometheus/metrics - static_configs: - - targets: ['localhost:9090'] - metric_relabel_configs: - - source_labels: [__name__] - regex: prometheus_tsdb_head_chunks_storage_size_bytes|prometheus_tsdb_head_series|prometheus_tsdb_head_series_created_total|prometheus_tsdb_head_series_removed_total|prometheus_tsdb_head_samples_appended_total|prometheus_tsdb_storage_blocks_bytes|prometheus_tsdb_retention_limit_bytes|prometheus_tsdb_wal_(.*)|.+_(limit|failed|missed|failures)_total - action: keep + envoy-bootstrap.yaml: | + dynamic_resources: + ads_config: + transport_api_version: V3 + api_type: GRPC + rate_limit_settings: {} + grpc_services: + - envoy_grpc: {cluster_name: xds_cluster} + cds_config: + resource_api_version: V3 + ads: {} + lds_config: + resource_api_version: V3 + ads: {} + node: + cluster: kourier-knative + id: 3scale-kourier-gateway + static_resources: + listeners: + - name: stats_listener + address: + socket_address: + address: 0.0.0.0 + port_value: 9000 + filter_chains: + - filters: + - name: envoy.filters.network.http_connection_manager + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager + stat_prefix: stats_server + http_filters: + - name: envoy.filters.http.router + typed_config: + "@type": 
type.googleapis.com/envoy.extensions.filters.http.router.v3.Router + route_config: + virtual_hosts: + - name: admin_interface + domains: + - "*" + routes: + - match: + safe_regex: + regex: '/(certs|stats(/prometheus)?|server_info|clusters|listeners|ready)?' + headers: + - name: ':method' + string_match: + exact: GET + route: + cluster: service_stats + - match: + safe_regex: + regex: '/drain_listeners' + headers: + - name: ':method' + string_match: + exact: POST + route: + cluster: service_stats + clusters: + - name: service_stats + connect_timeout: 0.250s + type: static + load_assignment: + cluster_name: service_stats + endpoints: + lb_endpoints: + endpoint: + address: + socket_address: + address: 127.0.0.1 + port_value: 9901 + - name: xds_cluster + # This keepalive is recommended by envoy docs. + # https://www.envoyproxy.io/docs/envoy/latest/api-docs/xds_protocol + typed_extension_protocol_options: + envoy.extensions.upstreams.http.v3.HttpProtocolOptions: + "@type": type.googleapis.com/envoy.extensions.upstreams.http.v3.HttpProtocolOptions + explicit_http_config: + http2_protocol_options: + connection_keepalive: + interval: 30s + timeout: 5s + connect_timeout: 1s + load_assignment: + cluster_name: xds_cluster + endpoints: + lb_endpoints: + endpoint: + address: + socket_address: + address: "net-kourier-controller" + port_value: 18000 + type: STRICT_DNS + admin: + access_log: + - name: envoy.access_loggers.stdout + typed_config: + "@type": type.googleapis.com/envoy.extensions.access_loggers.stream.v3.StdoutAccessLog + address: + socket_address: + address: 127.0.0.1 + port_value: 9901 + stats_config: + stats_tags: + - tag_name: name + regex: '^.*?\.u/.*?n=(.*?)/.*?u\..*$' + - tag_name: domain + regex: '^.*?\.u/.*?d=(.*?)/.*?u\..*$' + - tag_name: org + regex: '^.*?\.u/.*?o=(.*?)/.*?u\..*$' + - tag_name: project + regex: '^.*?\.u/.*?p=(.*?)/.*?u\..*$' + - tag_name: target + regex: '^.*?\.u/.*?t=(.*?)/.*?u\..*$' + - tag_name: target_namespace + regex: 
'^.*?\.u/.*?tns=(.*?)/.*?u\..*$' +--- +# Source: dataplane/templates/webhook/configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: union-pod-webhook-config + namespace: union +data: + core.yaml: | - # Kube state metrics for pod/node resource tracking and cost calculations - - job_name: kube-state-metrics - static_configs: - - targets: ['release-name-kube-state-metrics:8080'] - metric_relabel_configs: - - separator: ; - source_labels: [__name__] - regex: kube_pod_container_resource_(limits|requests)|kube_pod_status_phase|kube_node_(labels|status_allocatable|status_condition|status_capacity)|kube_namespace_labels|kube_pod_container_status_(waiting|terminated|last_terminated).*_reason|kube_daemonset_status_number_unavailable|kube_deployment_status_replicas_unavailable|kube_resourcequota|kube_pod_info|kube_node_info|kube_pod_container_status_restarts_total - action: keep - - separator: ; - source_labels: [__name__, phase] - regex: kube_pod_status_phase;(Succeeded|Failed) - action: drop - - source_labels: [node] - target_label: nodename - regex: '(.*)' - action: replace - - source_labels: [label_node_group_name] - action: replace - regex: (.+) - target_label: label_node_pool_name - - # cAdvisor container metrics for CPU and memory tracking - - job_name: kubernetes-cadvisor - metrics_path: /metrics - scheme: https - kubernetes_sd_configs: - - role: node - namespaces: - names: [] - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - insecure_skip_verify: false - metric_relabel_configs: - - separator: ; - source_labels: [__name__] - regex: container_cpu_usage_seconds_total|container_memory_working_set_bytes - action: keep - relabel_configs: - - separator: ; - regex: __meta_kubernetes_node_label_(.+) - replacement: $1 - action: labelmap - - separator: ; - regex: (.*) - target_label: __address__ - replacement: kubernetes.default.svc:443 - action: replace - - 
source_labels: [__meta_kubernetes_node_name] - separator: ; - regex: (.+) - target_label: __metrics_path__ - replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor - action: replace - - # Flyte propeller metrics for execution info and fast task duration - - job_name: flytepropeller - kubernetes_sd_configs: - - role: pod - namespaces: - names: - - union - selectors: - - role: pod - label: app.kubernetes.io/name=flytepropeller - metric_relabel_configs: - - source_labels: [__name__] - regex: "flyte:propeller:all:round:execution_info|flyte:propeller:all:node:fast_task:fast_task_execution_duration" - action: keep - relabel_configs: - - source_labels: [__meta_kubernetes_pod_container_name] - regex: flytepropeller - action: keep - - source_labels: [__meta_kubernetes_namespace] - action: replace - target_label: namespace - - source_labels: [__meta_kubernetes_pod_name] - action: replace - target_label: pod - # OpenCost metrics for cost tracking - - job_name: opencost - static_configs: - - targets: ['release-name-opencost:9003'] - metric_relabel_configs: - - source_labels: [__name__] - regex: "kube_node_labels|kube_pod_labels|node_total_hourly_cost|node_ram_hourly_cost|node_cpu_hourly_cost|node_gpu_hourly_cost" - action: keep - rules.yml: | - groups: - - name: cost_calculations_15s - interval: 15s - rules: - - record: pod_gpu_allocation - expr: | - sum by (namespace, pod) (DCGM_FI_DEV_GPU_UTIL >= bool 0) * on (namespace, pod) group_left() (max by (namespace, pod) (kube_pod_status_phase{phase=~"Running|Pending"} == 1)) - - record: execution_info # A join metric to look up execution-level info. Used to disambiguate workflow/task executions, apps, and workspaces. 
- expr: | - max by (label_domain, label_project, label_entity_name, label_execution_id, label_entity_id)( - label_replace( - label_replace( - label_replace( - label_replace( - label_replace( - flyte:propeller:all:round:execution_info{domain!="", project!="", workflow_name!="", execution_id!=""}, # filter for workflow/task executions - "label_entity_id", "$1", "execution_id", "(.*)" # join key - ), "label_entity_name", "$1", "workflow_name", "(.*)" # set label_entity_name to the workflow/task name from the workflow_execution_id - ), - "label_execution_id", "$1", "execution_id", "(.*)" - ), - "label_project", "$1", "project", "(.*)" # project - ), - "label_domain", "$1", "domain", "(.*)" # domain - ) - ) - - record: app_info # A join metric to look up app-level info. Used to disambiguate workflow/task executions, apps, and workspaces. - expr: | - max by (label_domain, label_project, label_app_name, label_app_version, label_entity_id)( - label_replace( - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps - "label_app_name", "$1", "label_serving_unionai_dev_app_name", "(.*)" # rename to cleanup - ), - "label_app_version", "$1", "label_serving_knative_dev_revision", "(.*)" # the app_version is equivalent to an execution_id for workflows (lowest level of granularity) - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # join key - ) - ) - - record: workspace_info # A join metric to look up workspace info. Used to disambiguate workflow/task executions, apps, and workspaces. 
- expr: | - max by (label_domain, label_project, label_workspace_name, label_entity_id)( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # filter for workspaces - "label_entity_id", "$1", "label_node_id", "(.*)" # join key - ), "label_workspace_name", "$1", "label_node_id", "(.*)" # set label_workspace_name to the workspace name from the kube_pod_labels - ) - ) - - record: fast_task_execution_duration - expr: | - max by (label_domain, label_project, label_execution_id, label_entity_id, namespace, pod)( - label_replace( - label_replace( - label_replace( - label_replace( - label_replace( - label_replace( - flyte:propeller:all:node:fast_task:fast_task_execution_duration{domain!="", project!="", execution_id!="", exported_namespace!="", exported_pod!=""}, - "label_entity_id", "$1", "execution_id", "(.*)" # join key - ), - "label_execution_id", "$1", "execution_id", "(.*)" - ), - "label_project", "$1", "project", "(.*)" # project - ), - "label_domain", "$1", "domain", "(.*)" # domain - ), - "namespace", "$1", "exported_namespace", "(.*)" - ), - "pod", "$1", "exported_pod", "(.*)" - ) - ) - - record: fast_task_execution_duration_rate - expr: | - irate(fast_task_execution_duration[1m]) > 0 or absent(irate(fast_task_execution_duration[1m])) * fast_task_execution_duration - - record: entity_id:mem_usage_bytes_total_per_node:sum # Allocated memory (max(requested, consumed)) aggregated per node and entity, where entity is either a task/workflow execution or an app. 
- expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( # aggregate up to entity - # First, calculate the allocated memory for each pod - max by (namespace, pod) ( # this is the case where consumed (the memory working set) exceeds requested memory - ( - sum by (namespace, pod) ( - container_memory_working_set_bytes{namespace!="",pod!="",image!=""} - ) - > sum by (namespace, pod) ( - kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="memory"} - ) - ) - or sum by (namespace, pod) ( # this is the case where memory requests are <= consumed memory - kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="memory"} # needed to add node!="" to dedupe - ) - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). Note that this is repetitive but we do not want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - 
label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), - "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="",label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - # Now join in node identifiers which are used for subsequent overhead calculations - * on (namespace, pod) group_left(node) ( - max by (namespace, pod, node) (kube_pod_info{node!=""}) # needed to add node!="" to dedupe - ) - ) - - record: entity_id:cpu_usage_per_node:sum # Allocated cpu (max(requested, consumed)) aggregated per node and entity, where entity is either a task/workflow execution or an app. 
- expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( - # First, calculate the allocated cpu for each pod - max by (namespace, pod) ( # this is the case where consumed (the cpu usage seconds total) exceeds requested cpu - ( - sum by (namespace, pod) ( - irate(container_cpu_usage_seconds_total{namespace!="",pod!="",image!=""}[5m]) - ) - > sum by (namespace, pod) ( - kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="cpu"} - ) - ) - or sum by (namespace, pod) ( # this is the case where cpu requests are <= consumed cpu - kube_pod_container_resource_requests{namespace!="", pod!="", node!="", resource="cpu"} - ) - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - 
label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), - "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - # Now join in node identifiers which are used for subsequent overhead calculations - * on (namespace, pod) group_left(node) ( - max by (namespace, pod, node) (kube_pod_info{node!=""}) # needed to add node!="" to dedupe - ) - ) - - record: entity_id:gpu_usage_per_node:sum # Allocated gpu aggregated per node and entity, where entity is either a task/workflow execution or an app. 
- expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( - # First, grab the allocated gpu for each pod (which is always either 1 or zero, since k8s can't split gpus the way it can with cpu/memory) - max by (namespace, pod) ( - pod_gpu_allocation - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, 
label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), - "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - # Now join in node identifiers which are used for subsequent overhead calculations - * on (namespace, pod) group_left(node) ( - max by (namespace, pod, node) (kube_pod_info{node!=""}) # needed to add node!="" to dedupe - ) - ) - - record: entity_id:used_mem_bytes:sum # the sum of used memory across all containers in an entity (numerator for aggregate utilization calculations) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # aggregate up to entity - # First, calculate the used memory for each pod - sum by (namespace, pod) ( - container_memory_working_set_bytes{namespace!="",pod!="",image!=""} - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but we do not want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - ) - - record: entity_id:allocated_mem_bytes:sum # the sum of allocated memory across all containers in an entity (denominator for aggregate utilization calculations) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # aggregate up to entity (remove node) - entity_id:mem_usage_bytes_total_per_node:sum - ) - - record: entity_id:used_cpu:sum # the sum of used cpu across all containers in an entity (numerator for aggregate utilization calculations) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( - sum by (namespace, pod) ( - irate(container_cpu_usage_seconds_total{namespace!="",pod!="",image!=""}[5m]) - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - ) - - record: entity_id:allocated_cpu:sum # the sum of allocated cpu across all containers in an entity (denominator for aggregate utilization calculations) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # aggregate up to entity (remove node) - entity_id:cpu_usage_per_node:sum - ) - - record: entity_id:sm_occupancy:avg # the simple average of SM occupancy (a good generic measure of GPU utilization) per entity - expr: | - avg by (label_entity_type, label_domain, label_project, label_entity_id) ( - # First, grab the SM occupancy for each pod - max by (namespace, pod) ( - DCGM_FI_PROF_SM_OCCUPANCY # SM occupancy is a good proxy for actual GPU usage - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - ) - - record: entity_id:gpu_count:sum # the count of running gpu pods per entity (need this to weight the gpu utilization when aggregating upwards - i.e. project-level) - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( - # First, grab the allocated gpu for each pod (which is always either 1 or zero, since k8s can't split gpus the way it can with cpu/memory) - max by (namespace, pod) ( - pod_gpu_allocation - ) - # Next, add labels to each pod that contain the relevant entity information (i.e. workflow/task or app). 
Note that this is repetitive but I didn't want to double the number of pod-level metrics we save - * on (namespace, pod) group_left(label_entity_type, label_domain, label_project, label_entity_id) ( - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_workflow_name!="", label_execution_id!="", label_workspace=""}, # this filters for workflow and task executions only (no apps) - "label_entity_type", "workflow", "", "" # set label_entity_type to "workflow" (note that both workflow and single task executions will say "workflow") - ), - "label_entity_id", "$1", "label_execution_id", "(.*)" # set label_entity_id to the execution id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds app labels - label_replace( - label_replace( - kube_pod_labels{ - label_domain!="", - label_project!="", - label_serving_unionai_dev_app_name!="", - label_serving_knative_dev_revision!="" - }, # this filters for apps only - "label_entity_type", "app", "", "" # set label_entity_type to "app" - ), - "label_entity_id", "$1", "label_serving_knative_dev_revision", "(.*)" # set label_entity_id to the app version (so we have label_entity_id with both execution ids and app versions) - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workspace labels - label_replace( - label_replace( - label_replace( - label_replace( - kube_pod_labels{label_domain!="", label_project!="", label_node_id!="", label_workspace="true"}, # this filters for workspace executions only (no tasks, workflows, or apps) - "label_entity_type", "workspace", "", "" # set label_entity_type to "workspace" - ), 
- "label_entity_id", "$1", "label_node_id", "(.*)" # set label_entity_id to the label_node_id (join key) - ), - "label_domain", "$1", "label_domain", "(.*)" - ), - "label_project", "$1", "label_project", "(.*)" - ) - ) - or - max by (label_entity_type, label_domain, label_project, label_entity_id, namespace, pod)( # adds workflow/task labels - label_replace( - fast_task_execution_duration_rate{label_domain!="", label_project!="", label_execution_id!=""}, - "label_entity_type", "fast_task", "", "" - ) - ) - ) - # Then filter for pods only in the "Running" or "Pending" phase - * on (namespace, pod) group_left() ( - max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Running|Pending"} == 1 - ) - ) - ) - - record: entity_id:weighted_sm_occupancy:sum # product of SM occupancy and allocated GPU count (something like "used memory", numerator of weighted calcs) - expr: | - entity_id:sm_occupancy:avg - * on (label_domain, label_project, label_entity_type, label_entity_id) entity_id:gpu_count:sum - - record: entity_id:allocated_mem_cost:sum # Allocated cost of memory for each workflow/task execution and app. - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, type) ( - entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB - * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type - ) - - record: entity_id:allocated_cpu_cost:sum # Allocated cost of cpu for each workflow/task execution and app. 
- expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, type)( - entity_id:cpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - - record: entity_id:allocated_gpu_cost:sum # Allocated cost of gpu for each workflow/task execution and app. - expr: | - sum by (label_entity_type, label_domain, label_project, label_entity_id, type)( - entity_id:gpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - - record: entity_id:allocated_cost:sum # Allocated cost of memory, cpu, and gpu for each workflow/task execution and app. - expr: | - label_replace( - sum by (label_entity_type, label_domain, label_project, label_entity_id) ( # for the sum to work, the labels need to be different on each "or" element (type label) - entity_id:allocated_mem_cost:sum - or - entity_id:allocated_cpu_cost:sum - or - entity_id:allocated_gpu_cost:sum - ), - "type", "allocated", "", "" # add type info - ) - - record: entity_id:overhead_cost:sum # The amount of overhead costs (node costs that we can't allocate with container resources) to allocate to each entity (workflow/task execution or app) - expr: | - label_replace( - sum by (label_entity_type, label_entity_id, label_domain, label_project)( # Aggregate the per-node metrics up to workflow/task execution or app (label_entity_id) - # Start with each execution's and app's allocated cost per node - sum by (label_entity_type, label_domain, label_project, label_entity_id, node) ( # for the sum to work, the labels need to be different on each "or" element (type label) - entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB - * on (node) group_left(type) label_replace(avg by (node) 
(node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:cpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:gpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - # Then divide out the total allocated cost per node to get the proportion of allocated cost associated with each entity - / on (node) group_left()( - sum by (node) ( # for the sum to work, the labels need to be different on each "or" element (type label) - entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB - * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:cpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:gpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - > 0 # need to avoid dividing by zero, or gaps in the data can cause NaNs to proliferate, borking all charts - ) - # Then multiply by the overhead cost per node - * on (node) group_left() ( - # To calculate overhead, start with the true cost of running each node - avg by (node)(kube_node_labels{label_flyte_org_node_role="worker"}) # only look at worker nodes - * on (node) max by (node) ( - node_total_hourly_cost{instance_type!=""} # sometimes, the instance_type can be null, causing an 
unlabeled label to show up in the Compute Costs dashboard charts - ) * (15 / 3600) # convert hourly cost to 15-secondly cost - # Then subtract out the total allocated cost on each node - - on (node) group_left()( - sum by (node) ( # for the sum to work, the labels need to be different on each "or" element (type label) - entity_id:mem_usage_bytes_total_per_node:sum / (1024 * 1024 * 1024) # convert bytes to GB - * on (node) group_left(type) label_replace(avg by (node) (node_ram_hourly_cost * (15 / 3600)), "type", "mem", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:cpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_cpu_hourly_cost * (15 / 3600)), "type", "cpu", "", "") # convert hourly cost to 15-secondly cost and add type - or - entity_id:gpu_usage_per_node:sum - * on (node) group_left(type) label_replace(avg by (node) (node_gpu_hourly_cost * (15 / 3600)), "type", "gpu", "", "") # convert hourly cost to 15-secondly cost and add type - ) - ) - ) - ), - "type", "overhead", "", "" # add type info - ) - - record: entity_id:total_cost:sum # Total cost of each entity (workflow/task execution or app), including allocated (from container resources) and overhead (proportion of unallocated node costs) - expr: | - label_replace( - sum by (label_domain, label_project, label_entity_id, label_entity_type) ( - entity_id:allocated_cost:sum - or - entity_id:overhead_cost:sum - ), - "type", "total", "", "" # add type info - ) - - record: node:total_cost:sum # Total cost of all nodes - expr: | - sum ( - avg by (node)(kube_node_labels{label_flyte_org_node_role="worker", label_node_kubernetes_io_instance_type!=""}) # only look at worker nodes - * on (node) group_left() node_total_hourly_cost{instance_type!=""} * (15 / 3600) # convert hourly cost to 15-secondly cost - ) - - record: node_type:total_cost:sum # Total cost of nodes grouped by node type - expr: | - sum by (node_type)( - avg by 
(node)(kube_node_labels{label_flyte_org_node_role="worker", label_node_kubernetes_io_instance_type!=""}) # only look at worker nodes - * on (node) group_left(node_type) label_replace(node_total_hourly_cost{instance_type!=""}, "node_type", "$1", "instance_type", "(.*)") * (15 / 3600) # convert hourly cost to 15-secondly cost and rename label - ) - - record: node_type:uptime_hours:sum # Total uptime of nodes grouped by node type - expr: | - sum by (node_type)( - avg by (node, node_type)( # dedupe - label_replace(kube_node_labels{label_flyte_org_node_role="worker", label_node_kubernetes_io_instance_type!=""}, "node_type", "$1", "label_node_kubernetes_io_instance_type", "(.*)") # relabel - ) - ) * (15 / 3600) # convert to number of hours per 15-second observation # Aggregate the above into visible metrics - - name: cost_rollup_15m - interval: 15m - rules: - - record: execution_info15m - expr: | - max_over_time(execution_info[15m:15s]) - - record: app_info15m - expr: | - max_over_time(app_info[15m:15s]) - - record: workspace_info15m - expr: | - max_over_time(workspace_info[15m:15s]) - - record: entity_id:allocated_mem_bytes:sum15m - expr: | - sum_over_time(entity_id:allocated_mem_bytes:sum[15m:15s]) - - record: entity_id:used_mem_bytes:sum15m - expr: | - sum_over_time(entity_id:used_mem_bytes:sum[15m:15s]) - - record: entity_id:allocated_cpu:sum15m - expr: | - sum_over_time(entity_id:allocated_cpu:sum[15m:15s]) - - record: entity_id:used_cpu:sum15m - expr: | - sum_over_time(entity_id:used_cpu:sum[15m:15s]) - - record: entity_id:weighted_sm_occupancy:sum15m - expr: | - sum_over_time(entity_id:weighted_sm_occupancy:sum[15m:15s]) - - record: entity_id:gpu_count:sum15m - expr: | - sum_over_time(entity_id:gpu_count:sum[15m:15s]) - - record: entity_id:allocated_mem_cost:sum15m - expr: | - sum_over_time(entity_id:allocated_mem_cost:sum[15m:15s]) - - record: entity_id:allocated_cpu_cost:sum15m - expr: | - sum_over_time(entity_id:allocated_cpu_cost:sum[15m:15s]) - - record: 
entity_id:allocated_gpu_cost:sum15m - expr: | - sum_over_time(entity_id:allocated_gpu_cost:sum[15m:15s]) - - record: entity_id:allocated_cost:sum15m - expr: | - sum_over_time(entity_id:allocated_cost:sum[15m:15s]) - - record: entity_id:overhead_cost:sum15m - expr: | - sum_over_time(entity_id:overhead_cost:sum[15m:15s]) - - record: entity_id:total_cost:sum15m - expr: | - sum_over_time(entity_id:total_cost:sum[15m:15s]) - - record: node:total_cost:sum15m - expr: | - sum_over_time(node:total_cost:sum[15m:15s]) - - record: node_type:total_cost:sum15m - expr: | - sum_over_time(node_type:total_cost:sum[15m:15s]) - - record: node_type:uptime_hours:sum15m - expr: | - sum_over_time(node_type:uptime_hours:sum[15m:15s]) ---- -# Source: dataplane/templates/propeller/configmap.yaml -apiVersion: v1 -kind: ConfigMap -metadata: - name: flyte-propeller-config - namespace: union -data: - admin.yaml: | - admin: - clientId: 'clientId' - clientSecretLocation: /etc/union/secret/client_secret - endpoint: dns:///union.us-west-2.union.ai - insecure: false - event: - capacity: 1000 - rate: 500 - type: admin - catalog.yaml: | - catalog-cache: - cache-endpoint: dns:///union.us-west-2.union.ai - endpoint: dns:///union.us-west-2.union.ai - insecure: false - type: fallback - use-admin-auth: true - copilot.yaml: | - plugins: - k8s: - co-pilot: - image: 'cr.flyte.org/flyteorg/flytecopilot:v1.14.1' - name: flyte-copilot- - start-timeout: 30s - core.yaml: | - propeller: - downstream-eval-duration: 30s - enable-admin-launcher: true - leader-election: - enabled: true - lease-duration: 15s - lock-config-map: - name: propeller-leader - namespace: 'union' - renew-deadline: 10s - retry-period: 2s - limit-namespace: all - literal-offloading-config: - enabled: true - max-workflow-retries: 30 - metadata-prefix: metadata/propeller - metrics-prefix: flyte - prof-port: 10254 - queue: - batch-size: -1 - batching-interval: 2s - queue: - base-delay: 5s - capacity: 1000 - max-delay: 120s - rate: 100 - type: maxof - 
sub-queue: - capacity: 100 - rate: 10 - type: bucket - type: batch - rawoutput-prefix: 's3://bucket' - workers: 4 - workflow-reeval-duration: 30s webhook: certDir: /etc/webhook/certs + disableCreateMutatingWebhookConfig: true embeddedSecretManagerConfig: imagePullSecrets: enabled: true @@ -3826,89 +3408,13 @@ data: namespace: 'union' type: 'K8s' listenPort: '9443' + localCert: true secretManagerTypes: - Embedded - K8s - serviceName: flyte-pod-webhook + secretName: union-pod-webhook + serviceName: union-pod-webhook servicePort: '443' - enabled_plugins.yaml: | - plugins: - connector-service: - defaultConnector: - defaultServiceConfig: '{"loadBalancingConfig": [{"round_robin":{}}]}' - endpoint: k8s:///flyteconnector.union:8000 - tasks: - task-plugins: - default-for-task-types: - actor: fast-task - container: container - container_array: k8s-array - sidecar: sidecar - enabled-plugins: - - container - - sidecar - - k8s-array - - echo - - fast-task - - connector-service - k8s.yaml: | - plugins: - k8s: - default-cpus: 200m - default-env-vars: - - FLYTE_AWS_ENDPOINT: https://xxxxxxxxxxx.compat.objectstorage.us-ashburn-1.oraclecloud.com - - FLYTE_AWS_ACCESS_KEY_ID: accessKey - - FLYTE_AWS_SECRET_ACCESS_KEY: secretKey - - AWS_REQUEST_CHECKSUM_CALCULATION: when_required - - CREATE_UPRIVER_DATA_SOURCE: "true" - - SOME_NUMERIC_VAR: "42" - - MORE: foo - - SOME_CONFIG_BOOL: "true" - default-memory: 100Mi - logger.yaml: | - logger: - formatter: - type: json - level: 4 - show-source: true - resource_manager.yaml: | - propeller: - resourcemanager: - type: noop - task_logs.yaml: | - plugins: - logs: - cloudwatch-enabled: false - dynamic-log-links: - - vscode: - displayName: VS Code Debugger - templateUris: - - /dataplane/pod/v1/generated_name/task/{{.executionProject}}/{{.executionDomain}}/{{.executionName}}/{{.nodeID}}/{{.taskRetryAttempt}}/{{.taskProject}}/{{.taskDomain}}/{{.taskID}}/{{.taskVersion}}/ - kubernetes-enabled: false - templates: - - displayName: Task Logs - scheme: 
TaskExecution - templateUris: - - /console/projects/{{.executionProject}}/domains/{{.executionDomain}}/executions/{{.executionName}}/nodeId/{{.nodeID}}/taskId/{{.taskID}}/attempt/{{.taskRetryAttempt}}/view/logs?duration=all&fromExecutionNav=true - storage.yaml: | - storage: - container: "bucket" - type: stow - stow: - kind: s3 - config: - auth_type: accesskey - access_key_id: accessKey - secret_key: secretKey - disable_ssl: false - endpoint: https://xxxxxxxxxxx.compat.objectstorage.us-ashburn-1.oraclecloud.com - region: us-ashburn-1 - enable-multicontainer: false - limits: - maxDownloadMBs: 1024 - cache: - max_size_mbs: 0 - target_gc_percent: 70 --- # Source: dataplane/charts/fluentbit/templates/clusterrole.yaml apiVersion: rbac.authorization.k8s.io/v1 @@ -3932,275 +3438,1143 @@ rules: - list - watch --- -# Source: dataplane/charts/kube-state-metrics/templates/role.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: - labels: - helm.sh/chart: kube-state-metrics-5.30.1 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: kube-state-metrics - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" - name: release-name-kube-state-metrics -rules: - -- apiGroups: ["certificates.k8s.io"] - resources: - - certificatesigningrequests - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - configmaps - verbs: ["list", "watch"] - -- apiGroups: ["batch"] - resources: - - cronjobs - verbs: ["list", "watch"] - -- apiGroups: ["extensions", "apps"] - resources: - - daemonsets - verbs: ["list", "watch"] - -- apiGroups: ["extensions", "apps"] - resources: - - deployments - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - endpoints - verbs: ["list", "watch"] - -- apiGroups: ["autoscaling"] - resources: - - horizontalpodautoscalers - verbs: ["list", "watch"] - -- apiGroups: ["extensions", "networking.k8s.io"] - resources: - - ingresses - verbs: ["list", "watch"] - -- apiGroups: ["batch"] - resources: - - jobs - verbs: ["list", "watch"] - -- apiGroups: ["coordination.k8s.io"] - resources: - - leases - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - limitranges - verbs: ["list", "watch"] - -- apiGroups: ["admissionregistration.k8s.io"] - resources: - - mutatingwebhookconfigurations - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - namespaces - verbs: ["list", "watch"] - -- apiGroups: ["networking.k8s.io"] - resources: - - networkpolicies - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - nodes - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - persistentvolumeclaims - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - persistentvolumes - verbs: ["list", "watch"] - -- apiGroups: ["policy"] - resources: - - 
poddisruptionbudgets - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - pods - verbs: ["list", "watch"] - -- apiGroups: ["extensions", "apps"] - resources: - - replicasets - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - replicationcontrollers - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - resourcequotas - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - secrets - verbs: ["list", "watch"] - -- apiGroups: [""] - resources: - - services - verbs: ["list", "watch"] - -- apiGroups: ["apps"] - resources: - - statefulsets - verbs: ["list", "watch"] + name: knative-serving-operator-aggregated + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +aggregationRule: + clusterRoleSelectors: + # This (along with escalate below) allows the Operator to pick up any + # roles that are provided to the admin of the cluster by knative serving + # automatically. + - matchExpressions: + - {key: serving.knative.dev/release, operator: Exists} +rules: [] # Rules are automatically filled in by the controller manager. +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: knative-serving-operator-aggregated-stable + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +aggregationRule: + clusterRoleSelectors: + # This (along with escalate below) allows the Operator to pick up any + # roles that are provided to the admin of the cluster by knative serving + # automatically. + - matchExpressions: + - {key: app.kubernetes.io/name, operator: In, values: ["knative-serving"]} +rules: [] # Rules are automatically filled in by the controller manager. 
+--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: knative-eventing-operator-aggregated + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +aggregationRule: + clusterRoleSelectors: + # This (along with escalate below) allows the Operator to pick up any + # roles that are provided to the admin of the cluster by knative eventing + # automatically. + - matchExpressions: + - {key: eventing.knative.dev/release, operator: Exists} +rules: [] # Rules are automatically filled in by the controller manager. +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: knative-eventing-operator-aggregated-stable + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +aggregationRule: + clusterRoleSelectors: + # This (along with escalate below) allows the Operator to pick up any + # roles that are provided to the admin of the cluster by knative eventing + # automatically. + - matchExpressions: + - {key: app.kubernetes.io/name, operator: In, values: ["knative-eventing"]} +rules: [] # Rules are automatically filled in by the controller manager. +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. -- apiGroups: ["storage.k8s.io"] - resources: - - storageclasses - verbs: ["list", "watch"] +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: knative-serving-operator + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +rules: + - apiGroups: + - operator.knative.dev + resources: + - '*' + verbs: + - '*' + # Bootstrapping permissions. + # Roles that are explicitly bound buch which are specified by this Operator + # MUST be specified here with 'get' and 'bind'. + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterroles + resourceNames: + - system:auth-delegator + verbs: + - bind + - get + - apiGroups: + - rbac.authorization.k8s.io + resources: + - roles + resourceNames: + - extension-apiserver-authentication-reader + verbs: + - bind + - get + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterroles + - roles + verbs: + - create + - delete + # Escalate is necessary in order to create a role using cluster role aggregation, + # and to allow the Operator to bootstrap itself into the necessary set of + # permissions, even as those continue to evolve upstream. + - escalate + - get + - list + - update + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterrolebindings + - rolebindings + verbs: + - create + - delete + - list + - get + - update + # Permissions required for Knative controller + # infra. 
+ - apiGroups: + - apiregistration.k8s.io + resources: + - apiservices + verbs: + - update + - apiGroups: + - "" + resources: + - services + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - caching.internal.knative.dev + resources: + - images + verbs: + - '*' + - apiGroups: + - "" + resources: + - namespaces + verbs: + - get + - update + - watch + - apiGroups: + - '' + resources: + - events + verbs: + - create + - update + - patch + - apiGroups: + - '' + resources: + - configmaps + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - security.istio.io + - apps + - policy + resources: + - poddisruptionbudgets + - peerauthentications + - deployments + - daemonsets + - replicasets + - statefulsets + verbs: + - create + - delete + - get + - list + - watch + - update + - apiGroups: + - apiregistration.k8s.io + resources: + - apiservices + verbs: + - create + - delete + - get + - list + - apiGroups: + - autoscaling + resources: + - horizontalpodautoscalers + verbs: + - create + - delete + - get + - list + - apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - '*' + - apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - '*' + # Old resources that need cleaning up that are not in the knative-serving + # namespace. 
+ - apiGroups: + - "" + resources: + - services + - deployments + - horizontalpodautoscalers + resourceNames: + - knative-ingressgateway + verbs: + - delete + - apiGroups: + - "" + resources: + - configmaps + resourceNames: + - config-controller + verbs: + - delete + - apiGroups: + - "" + resources: + - serviceaccounts + resourceNames: + - knative-serving-operator + verbs: + - delete + # for contour TLS + - apiGroups: + - projectcontour.io + resources: + - httpproxies + - tlscertificatedelegations + verbs: + - get + - list + - watch + - update + - create + - delete + - deletecollection + - patch + # for security-guard + - apiGroups: + - guard.security.knative.dev + resources: + - guardians + verbs: + - get + - list + - watch + - create + - update + - patch + - delete + - apiGroups: + - "" + resources: + - configmaps + verbs: + - get + - list + - watch + - create + - update + - patch + - delete + - apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + verbs: + - create + - apiGroups: + - "" + resources: + - pods + verbs: + - get +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: knative-eventing-operator + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +rules: + - apiGroups: + - operator.knative.dev + resources: + - '*' + verbs: + - '*' + # Bootstrapping permissions. + # Roles that are explicitly bound buch which are specified by this Operator + # MUST be specified here with 'get' and 'bind'. + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterroles + - roles + verbs: + - create + - delete + # Escalate is necessary in order to create a role using cluster role aggregation, + # and to allow the Operator to bootstrap itself into the necessary set of + # permissions, even as those continue to evolve upstream. 
+ - escalate + - get + - list + - update + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterrolebindings + - rolebindings + verbs: + - create + - delete + - list + - get + - update + # Permissions required for Knative controller + # infra. + - apiGroups: + - apiregistration.k8s.io + resources: + - apiservices + verbs: + - update + - apiGroups: + - "" + resources: + - services + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - caching.internal.knative.dev + resources: + - images + verbs: + - '*' + - apiGroups: + - "" + resources: + - namespaces + verbs: + - get + - update + - watch + - apiGroups: + - '' + resources: + - events + verbs: + - create + - update + - patch + - apiGroups: + - '' + resources: + - configmaps + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - apps + resources: + - deployments + - daemonsets + - replicasets + - statefulsets + verbs: + - create + - delete + - get + - list + - watch + - apiGroups: + - apiregistration.k8s.io + resources: + - apiservices + verbs: + - create + - delete + - get + - list + - apiGroups: + - autoscaling + resources: + - horizontalpodautoscalers + verbs: + - create + - delete + - update + - get + - list + - apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - '*' + - apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - '*' + - apiGroups: + - batch + resources: + - jobs + verbs: + - create + - delete + - update + - get + - list + - watch + # Old resources that need cleaning up that are not in the knative-eventing + # namespace. 
+ - apiGroups: + - "" + resources: + - serviceaccounts + resourceNames: + - knative-eventing-operator + verbs: + - delete + # for RabbitMQ messaging topology objects + - apiGroups: + - rabbitmq.com + resources: + - rabbitmqclusters + verbs: + - "get" + - "list" + - "watch" + - apiGroups: + - rabbitmq.com + resources: + - bindings + - queues + - exchanges + verbs: + - create + - delete + - get + - list + - patch + - update + - watch + - apiGroups: + - rabbitmq.com + resources: + - bindings/status + - queues/status + - exchanges/status + verbs: + - get + # for Kafka eventing source + - apiGroups: + - keda.sh + resources: + - scaledobjects + - scaledobjects/finalizers + - scaledobjects/status + - triggerauthentications + - triggerauthentications/status + verbs: + - get + - list + - watch + - update + - create + - delete + # Internal APIs + - apiGroups: + - "internal.kafka.eventing.knative.dev" + resources: + - "consumers" + - "consumers/status" + - "consumergroups" + - "consumergroups/status" + verbs: + - create + - get + - list + - watch + - patch + - update + - delete + - apiGroups: + - "internal.kafka.eventing.knative.dev" + resources: + - "consumers/finalizers" + - "consumergroups/finalizers" + verbs: + - update + - delete + - apiGroups: + - apps + resources: + - statefulsets/scale + verbs: + - get + - list + - watch + - update + - patch + - create + - delete + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterrolebindings + verbs: + - watch + - apiGroups: + - "*" + resources: + - configmaps + verbs: + - delete + - apiGroups: + - "*" + resources: + - configmaps + - services + verbs: + - get + - list + - watch + - update + - create + - delete + - apiGroups: + - "*" + resources: + - pods + verbs: + - list + - update + - get + - watch + - apiGroups: + - "*" + resources: + - pods/finalizers + verbs: + - get + - list + - create + - update + - delete + - apiGroups: + - "*" + resources: + - events + verbs: + - patch + - create + - apiGroups: + - "*" + 
resources: + - secrets + verbs: + - get + - list + - watch + - update + - create + - delete + - apiGroups: + - "*" + resources: + - nodes + verbs: + - get + - list + - watch + - apiGroups: + - "*" + resources: + - serviceaccounts + verbs: + - get + - list + - watch + - update + - create + - delete + - apiGroups: + - "*" + resources: + - configmaps + resourceNames: + - kafka-channel-config + verbs: + - patch + - apiGroups: + - "*" + resources: + - horizontalpodautoscalers + resourceNames: + - kafka-webhook + verbs: + - delete + - apiGroups: + - "*" + resources: + - leases + verbs: + - delete + - apiGroups: + - "*" + resources: + - poddisruptionbudgets + resourceNames: + - kafka-webhook + verbs: + - delete + - apiGroups: + - "*" + resources: + - services + verbs: + - patch + - apiGroups: + - "apps" + resources: + - deployments + verbs: + - deletecollection + # Eventing TLS + - apiGroups: + - "cert-manager.io" + resources: + - certificates + - issuers + - clusterissuers + verbs: + - create + - delete + - update + - list + - get + - watch + - apiGroups: + - "trust.cert-manager.io" + resources: + - bundles + verbs: + - create + - delete + - update + - list + - get + - watch +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: knative-operator-webhook + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +rules: + # For watching logging configuration and getting certs. + - apiGroups: + - "" + resources: + - "configmaps" + verbs: + - "get" + - "list" + - "watch" + # For manipulating certs into secrets. 
+ - apiGroups: + - "" + resources: + - "namespaces" + verbs: + - "get" + - "create" + - "update" + - "list" + - "watch" + - "patch" + # finalizers are needed for the owner reference of the webhook + - apiGroups: + - "" + resources: + - "namespaces/finalizers" + verbs: + - "update" + # For getting our Deployment so we can decorate with ownerref. + - apiGroups: + - "apps" + resources: + - "deployments" + verbs: + - "get" + - apiGroups: + - "apps" + resources: + - "deployments/finalizers" + verbs: + - update + # For actually registering our webhook. + - apiGroups: + - "admissionregistration.k8s.io" + resources: + - "mutatingwebhookconfigurations" + - "validatingwebhookconfigurations" + verbs: &everything + - "get" + - "list" + - "create" + - "update" + - "delete" + - "patch" + - "watch" + # For leader election + - apiGroups: + - "coordination.k8s.io" + resources: + - "leases" + verbs: *everything + # Necessary for conversion webhook. These are copied from the serving + # TODO: Do we really need all these permissions? 
+ - apiGroups: ["apiextensions.k8s.io"] + resources: ["customresourcedefinitions"] + verbs: ["get", "list", "create", "update", "delete", "patch", "watch"] +--- +# Source: dataplane/charts/fluentbit/templates/clusterrolebinding.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: release-name-fluentbit + labels: + helm.sh/chart: fluentbit-0.48.9 + app.kubernetes.io/name: fluentbit + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "3.2.8" + app.kubernetes.io/managed-by: Helm +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: release-name-fluentbit +subjects: + - kind: ServiceAccount + name: union-system + namespace: union +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# TODO: Consider restriction of non-aggregated role to knativeservings namespaces. 
+apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: knative-serving-operator + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-serving-operator +subjects: + - kind: ServiceAccount + name: knative-operator + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# TODO: Consider restriction of non-aggregated role to knativeeventing namespaces. +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: knative-eventing-operator + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-eventing-operator +subjects: + - kind: ServiceAccount + name: knative-operator + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: operator-webhook + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-operator-webhook +subjects: + - kind: ServiceAccount + name: operator-webhook + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. -- apiGroups: ["admissionregistration.k8s.io"] - resources: - - validatingwebhookconfigurations - verbs: ["list", "watch"] +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: knative-serving-operator-aggregated + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-serving-operator-aggregated +subjects: + - kind: ServiceAccount + name: knative-operator + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: knative-serving-operator-aggregated-stable + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-serving-operator-aggregated-stable +subjects: + - kind: ServiceAccount + name: knative-operator + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: knative-eventing-operator-aggregated + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: knative-eventing-operator-aggregated +subjects: + - kind: ServiceAccount + name: knative-operator + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: knative-eventing-operator-aggregated-stable + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: 
ClusterRole + name: knative-eventing-operator-aggregated-stable +subjects: + - kind: ServiceAccount + name: knative-operator + namespace: "union" +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. -- apiGroups: ["storage.k8s.io"] - resources: - - volumeattachments - verbs: ["list", "watch"] +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + namespace: "union" + name: knative-operator-webhook + labels: + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +rules: + # For manipulating certs into secrets. 
+ - apiGroups: + - "" + resources: + - "secrets" + verbs: + - "get" + - "create" + - "update" + - "list" + - "watch" + - "patch" --- -# Source: dataplane/charts/opencost/templates/clusterrole.yaml -# Cluster role giving opencost to get, list, watch required resources -# No write permissions are required +# Source: dataplane/templates/monitoring/prometheus-rbac.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole +kind: Role metadata: - name: release-name-opencost + name: union-operator-prometheus-rbac + namespace: union labels: - helm.sh/chart: opencost-1.42.0 - app.kubernetes.io/name: opencost - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "1.111.0" - app.kubernetes.io/part-of: opencost - app.kubernetes.io/managed-by: Helm + release: release-name rules: + # Prometheus server scrape permissions - apiGroups: [""] resources: - - configmaps - - deployments - nodes - - pods + - nodes/proxy + - nodes/metrics - services - - resourcequotas - - replicationcontrollers - - limitranges - - persistentvolumeclaims - - persistentvolumes - - namespaces - endpoints + - pods + - ingresses + - configmaps verbs: - get - list - watch - - apiGroups: - - extensions - resources: - - daemonsets - - deployments - - replicasets - verbs: - - get - - list - - watch - - apiGroups: - - apps - resources: - - statefulsets - - deployments - - daemonsets - - replicasets - verbs: - - list - - watch - - apiGroups: - - batch - resources: - - cronjobs - - jobs - verbs: - - get - - list - - watch - - apiGroups: - - autoscaling - resources: - - horizontalpodautoscalers - verbs: - - get - - list - - watch - - apiGroups: - - policy + - apiGroups: ["extensions", "networking.k8s.io"] resources: - - poddisruptionbudgets + - ingresses/status + - ingresses verbs: - get - list - watch - - apiGroups: - - storage.k8s.io + - apiGroups: ["discovery.k8s.io"] resources: - - storageclasses + - endpointslices verbs: - get - list - watch ---- -# Source: 
dataplane/templates/clusterresourcesync/serviceaccount.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: union-clustersync-resource -rules: - - apiGroups: - - "" - - rbac.authorization.k8s.io - resources: - - configmaps - - namespaces - - pods - - resourcequotas - - roles - - rolebindings - - secrets - - services - - serviceaccounts - - clusterrolebindings - - podtemplates - verbs: - - '*' + # kube-state-metrics permissions (rbac.create=false on the subchart; managed here instead) + - apiGroups: ["certificates.k8s.io"] + resources: ["certificatesigningrequests"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["configmaps"] + verbs: ["list", "watch"] + - apiGroups: ["batch"] + resources: ["cronjobs"] + verbs: ["list", "watch"] + - apiGroups: ["extensions", "apps"] + resources: ["daemonsets"] + verbs: ["list", "watch"] + - apiGroups: ["extensions", "apps"] + resources: ["deployments"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["endpoints"] + verbs: ["list", "watch"] + - apiGroups: ["discovery.k8s.io"] + resources: ["endpointslices"] + verbs: ["list", "watch"] + - apiGroups: ["autoscaling"] + resources: ["horizontalpodautoscalers"] + verbs: ["list", "watch"] + - apiGroups: ["extensions", "networking.k8s.io"] + resources: ["ingresses"] + verbs: ["list", "watch"] + - apiGroups: ["batch"] + resources: ["jobs"] + verbs: ["list", "watch"] + - apiGroups: ["coordination.k8s.io"] + resources: ["leases"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["limitranges"] + verbs: ["list", "watch"] + - apiGroups: ["admissionregistration.k8s.io"] + resources: ["mutatingwebhookconfigurations"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["namespaces"] + verbs: ["list", "watch"] + - apiGroups: ["networking.k8s.io"] + resources: ["networkpolicies"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["persistentvolumeclaims"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: 
["persistentvolumes"] + verbs: ["list", "watch"] + - apiGroups: ["policy"] + resources: ["poddisruptionbudgets"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["pods"] + verbs: ["list", "watch"] + - apiGroups: ["extensions", "apps"] + resources: ["replicasets"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["replicationcontrollers"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["resourcequotas"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["secrets"] + verbs: ["list", "watch"] + - apiGroups: [""] + resources: ["services"] + verbs: ["list", "watch"] + - apiGroups: ["apps"] + resources: ["statefulsets"] + verbs: ["list", "watch"] + - apiGroups: ["storage.k8s.io"] + resources: ["storageclasses"] + verbs: ["list", "watch"] + - apiGroups: ["admissionregistration.k8s.io"] + resources: ["validatingwebhookconfigurations"] + verbs: ["list", "watch"] + - apiGroups: ["storage.k8s.io"] + resources: ["volumeattachments"] + verbs: ["list", "watch"] --- # Source: dataplane/templates/nodeexecutor/serviceaccount.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole +kind: Role metadata: name: union-executor labels: @@ -4251,11 +4625,12 @@ rules: - delete - update --- -# Source: dataplane/templates/operator/serviceaccount-proxy.yaml +# Source: dataplane/templates/operator/serviceaccount-proxy-secret.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole +kind: Role metadata: - name: proxy-system + name: union-system-secret + namespace: union labels: app.kubernetes.io/name: operator-proxy app.kubernetes.io/instance: release-name @@ -4265,146 +4640,86 @@ rules: - apiGroups: - '*' resources: - - events - - flyteworkflows - - pods/log - - pods - - rayjobs - - resourcequotas + - secrets verbs: - get - list - - watch + - create + - update + - delete --- -# Source: dataplane/templates/operator/serviceaccount.yaml +# Source: dataplane/templates/operator/serviceaccount-proxy.yaml apiVersion: 
rbac.authorization.k8s.io/v1 -kind: ClusterRole +kind: Role metadata: - name: operator-system + name: proxy-system labels: - app.kubernetes.io/name: union-operator + app.kubernetes.io/name: operator-proxy app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm rules: - # Allow Access to all resources under flyte.lyft.com - - apiGroups: - - flyte.lyft.com - resources: - - flyteworkflows - - flyteworkflows/finalizers - verbs: - - get - - list - - watch - - create - - update - - delete - - patch - - post - - deletecollection - apiGroups: - '*' resources: - - resourcequotas + - events + - flyteworkflows + - pods/log - pods - - configmaps - - podtemplates - - secrets - - namespaces - - nodes + - rayjobs + - resourcequotas verbs: - get - list - watch - - create - - update - - delete - - nonResourceURLs: - - /metrics - verbs: - - get --- -# Source: dataplane/templates/prometheus/rbac.yaml +# Source: dataplane/templates/operator/serviceaccount.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole +kind: Role metadata: - name: union-operator-prometheus + name: operator-system labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/name: union-operator app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" + platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus -rules: - - apiGroups: [""] - resources: - - nodes - - nodes/proxy - - pods - - endpoints - - services - verbs: - - get - - list - - watch - - nonResourceURLs: - - /metrics - - /metrics/cadvisor - verbs: - - get ---- -# Source: dataplane/templates/propeller/serviceaccount-webhook.yaml -kind: ClusterRole -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: flytepropeller-webhook-role - namespace: union rules: - apiGroups: - - "*" + - '*' resources: - - mutatingwebhookconfigurations - 
secrets - - pods - - replicasets/finalizers + - deployments verbs: - get + - list + - watch - create - update - - patch ---- -# Source: dataplane/templates/propeller/serviceaccount.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: flytepropeller-role -rules: - # Allow RO access to PODS - apiGroups: - - "" + - flyte.lyft.com resources: - - pods + - flyteworkflows + - flyteworkflows/finalizers verbs: - get - list - watch - # Allow Event recording access - - apiGroups: - - "" - resources: - - events - verbs: - create - update - delete - patch - # Allow Access All plugin objects + - post + - deletecollection - apiGroups: - '*' resources: - - '*' + - resourcequotas + - pods + - configmaps + - podtemplates + - secrets + - namespaces + - nodes verbs: - get - list @@ -4412,148 +4727,129 @@ rules: - create - update - delete - - patch - # Allow Access to CRD - apiGroups: - - apiextensions.k8s.io + - serving.knative.dev resources: - - customresourcedefinitions + - revisions + - configurations + - services verbs: - get - list - watch - create - - delete - update - # Allow Access to all resources under flyte.lyft.com + - delete +--- +# Source: dataplane/templates/webhook/serviceaccount.yaml +kind: Role +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: union-webhook-role + namespace: union +rules: - apiGroups: - - flyte.lyft.com + - "*" resources: - - flyteworkflows - - flyteworkflows/finalizers + - mutatingwebhookconfigurations + - secrets + - pods + - replicasets/finalizers verbs: - get - - list - - watch - create - update - - delete - patch - - post - - deletecollection --- -# Source: dataplane/charts/fluentbit/templates/clusterrolebinding.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding +kind: RoleBinding metadata: - name: release-name-fluentbit + namespace: "union" + name: operator-webhook labels: - helm.sh/chart: fluentbit-0.48.9 - app.kubernetes.io/name: fluentbit - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "3.2.8" - app.kubernetes.io/managed-by: Helm -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: release-name-fluentbit + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator subjects: - kind: ServiceAccount - name: fluentbit-system - namespace: union ---- -# Source: dataplane/charts/kube-state-metrics/templates/clusterrolebinding.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - labels: - helm.sh/chart: kube-state-metrics-5.30.1 - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: metrics - app.kubernetes.io/part-of: kube-state-metrics - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" - name: release-name-kube-state-metrics + name: operator-webhook + namespace: "union" roleRef: + kind: Role + name: knative-operator-webhook apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: release-name-kube-state-metrics -subjects: -- kind: ServiceAccount - name: release-name-kube-state-metrics - namespace: union --- -# Source: dataplane/charts/opencost/templates/clusterrolebinding.yaml +# Source: dataplane/templates/monitoring/prometheus-rbac.yaml apiVersion: 
rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding +kind: RoleBinding metadata: - name: release-name-opencost + name: union-operator-prometheus-rbac + namespace: union labels: - helm.sh/chart: opencost-1.42.0 - app.kubernetes.io/name: opencost - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "1.111.0" - app.kubernetes.io/part-of: opencost - app.kubernetes.io/managed-by: Helm -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: release-name-opencost + release: release-name subjects: - kind: ServiceAccount - name: release-name-opencost + name: union-operator-prometheus namespace: union ---- -# Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: union-clustersync-resource roleRef: apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: union-clustersync-resource -subjects: - - kind: ServiceAccount - name: union-clustersync-system - namespace: union + kind: Role + name: union-operator-prometheus-rbac --- -# Source: dataplane/templates/clusterresourcesync/serviceaccount.yaml +# Source: dataplane/templates/monitoring/prometheus-rbac.yaml +# kube-state-metrics service account RoleBinding apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding +kind: RoleBinding metadata: - name: union-clustersync-auth-delegator -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: system:auth-delegator + name: release-name-prometheus-kube-state-metrics + namespace: union + labels: + release: release-name subjects: - kind: ServiceAccount - name: union-clustersync-system + name: release-name-prometheus-kube-state-metrics namespace: union +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: union-operator-prometheus-rbac --- # Source: dataplane/templates/nodeexecutor/serviceaccount.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding +kind: RoleBinding metadata: 
name: union-executor labels: app: executor roleRef: apiGroup: rbac.authorization.k8s.io - kind: ClusterRole + kind: Role name: union-executor subjects: - kind: ServiceAccount - name: executor + name: union-system namespace: union --- -# Source: dataplane/templates/operator/serviceaccount-proxy.yaml +# Source: dataplane/templates/operator/serviceaccount-proxy-secret.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding +kind: RoleBinding metadata: - name: proxy-system + name: union-system-secret + namespace: union labels: app.kubernetes.io/name: operator-proxy app.kubernetes.io/instance: release-name @@ -4561,109 +4857,35 @@ metadata: app.kubernetes.io/managed-by: Helm roleRef: apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: proxy-system + kind: Role + name: union-system-secret subjects: - kind: ServiceAccount - name: proxy-system + name: union-system namespace: union --- -# Source: dataplane/templates/operator/serviceaccount.yaml +# Source: dataplane/templates/operator/serviceaccount-proxy.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding +kind: RoleBinding metadata: - name: operator-system + name: proxy-system labels: - app.kubernetes.io/name: union-operator + app.kubernetes.io/name: operator-proxy app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm roleRef: apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: operator-system -subjects: - - kind: ServiceAccount - name: operator-system - namespace: union ---- -# Source: dataplane/templates/prometheus/rbac.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: union-operator-prometheus - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus 
-roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: union-operator-prometheus -subjects: - - kind: ServiceAccount - name: union-operator-prometheus - namespace: union ---- -# Source: dataplane/templates/propeller/serviceaccount-webhook.yaml -# Create a binding from Role -> ServiceAccount -kind: ClusterRoleBinding -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: flytepropeller-webhook-binding - namespace: union -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: flytepropeller-webhook-role -subjects: - - kind: ServiceAccount - name: flytepropeller-webhook-system - namespace: union ---- -# Source: dataplane/templates/propeller/serviceaccount.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: flytepropeller-binding -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: flytepropeller-role + kind: Role + name: proxy-system subjects: - kind: ServiceAccount - name: flytepropeller-system + name: union-system namespace: union --- -# Source: dataplane/templates/operator/serviceaccount-proxy-secret.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - name: proxy-system-secret - namespace: union - labels: - app.kubernetes.io/name: operator-proxy - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm -rules: - - apiGroups: - - '*' - resources: - - secrets - verbs: - - get - - list - - create - - update - - delete ---- # Source: dataplane/templates/operator/serviceaccount.yaml apiVersion: rbac.authorization.k8s.io/v1 -kind: Role +kind: RoleBinding metadata: name: operator-system labels: @@ -4671,56 +4893,29 @@ metadata: app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm -rules: - - apiGroups: - - '*' - resources: - - secrets - - deployments - verbs: - - get - - list - - watch - - create - - update 
---- -# Source: dataplane/templates/operator/serviceaccount-proxy-secret.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: proxy-system-secret - namespace: union - labels: - app.kubernetes.io/name: operator-proxy - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm roleRef: apiGroup: rbac.authorization.k8s.io kind: Role - name: proxy-system-secret + name: operator-system subjects: - kind: ServiceAccount - name: proxy-system + name: union-system namespace: union --- -# Source: dataplane/templates/operator/serviceaccount.yaml -apiVersion: rbac.authorization.k8s.io/v1 +# Source: dataplane/templates/webhook/serviceaccount.yaml +# Create a binding from Role -> ServiceAccount kind: RoleBinding +apiVersion: rbac.authorization.k8s.io/v1 metadata: - name: operator-system - labels: - app.kubernetes.io/name: union-operator - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm + name: union-webhook-binding + namespace: union roleRef: apiGroup: rbac.authorization.k8s.io kind: Role - name: operator-system + name: union-webhook-role subjects: - kind: ServiceAccount - name: operator-system + name: union-system namespace: union --- # Source: dataplane/charts/fluentbit/templates/service.yaml @@ -4746,20 +4941,60 @@ spec: app.kubernetes.io/name: fluentbit app.kubernetes.io/instance: release-name --- -# Source: dataplane/charts/kube-state-metrics/templates/service.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v1 +kind: Service +metadata: + labels: + role: operator-webhook + app.kubernetes.io/component: operator-webhook + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator + name: operator-webhook + namespace: "union" +spec: + ports: + # Define metrics and profiling for them to be accessible within service meshes. + - name: http-metrics + port: 9090 + targetPort: 9090 + - name: http-profiling + port: 8008 + targetPort: 8008 + - name: https-webhook + port: 443 + targetPort: 8443 + selector: + role: operator-webhook +--- +# Source: dataplane/charts/prometheus/charts/kube-state-metrics/templates/service.yaml apiVersion: v1 kind: Service metadata: name: release-name-kube-state-metrics namespace: union labels: - helm.sh/chart: kube-state-metrics-5.30.1 + helm.sh/chart: kube-state-metrics-5.25.1 app.kubernetes.io/managed-by: Helm app.kubernetes.io/component: metrics app.kubernetes.io/part-of: kube-state-metrics app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" + app.kubernetes.io/version: "2.13.0" annotations: prometheus.io/scrape: 'true' spec: @@ -4774,28 +5009,32 @@ spec: app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/instance: release-name --- -# Source: dataplane/charts/opencost/templates/service.yaml +# Source: dataplane/charts/prometheus/templates/service.yaml apiVersion: v1 kind: Service metadata: - name: release-name-opencost - namespace: union labels: - helm.sh/chart: opencost-1.42.0 - app.kubernetes.io/name: opencost + 
app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "1.111.0" - app.kubernetes.io/part-of: opencost + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: prometheus + name: union-operator-prometheus + namespace: union spec: + ports: + - name: http + port: 80 + protocol: TCP + targetPort: 9090 selector: - app.kubernetes.io/name: opencost + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name + sessionAffinity: None type: "ClusterIP" - ports: - - name: http - port: 9003 - targetPort: 9003 --- # Source: dataplane/templates/clusterresourcesync/service.yaml apiVersion: v1 @@ -4806,18 +5045,43 @@ metadata: platform.union.ai/prometheus-group: "union-services" app.kubernetes.io/name: clusterresourcesync app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name + platform.union.ai/service-group: release-name + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - port: 10254 + targetPort: debug + protocol: TCP + name: debug + selector: + app.kubernetes.io/name: clusterresourcesync + app.kubernetes.io/instance: release-name +--- +# Source: dataplane/templates/flyteconnector/service.yaml +apiVersion: v1 +kind: Service +metadata: + name: flyteconnector + namespace: union + labels: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name app.kubernetes.io/managed-by: Helm spec: - type: ClusterIP + clusterIP: None ports: - - port: 10254 - targetPort: debug + - name: grpc + port: 8000 protocol: TCP - name: debug - selector: - app.kubernetes.io/name: clusterresourcesync - app.kubernetes.io/instance: release-name + targetPort: grpc + - name: metric + port: 9090 + protocol: TCP + targetPort: metric + selector: + app.kubernetes.io/name: flyteconnector + 
app.kubernetes.io/instance: release-name --- # Source: dataplane/templates/imagebuilder/service.yaml apiVersion: v1 @@ -4844,7 +5108,7 @@ spec: apiVersion: v1 kind: Service metadata: - name: release-name-dataplane-executor + name: union-operator-executor labels: platform.union.ai/prometheus-group: "union-services" app: executor @@ -4910,39 +5174,14 @@ spec: app.kubernetes.io/name: union-operator app.kubernetes.io/instance: release-name --- -# Source: dataplane/templates/prometheus/service.yaml -apiVersion: v1 -kind: Service -metadata: - name: union-operator-prometheus - namespace: union - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2026.4.5" - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus -spec: - type: ClusterIP - ports: - - port: 80 - targetPort: 9090 - protocol: TCP - name: http - selector: - app.kubernetes.io/component: prometheus - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name ---- -# Source: dataplane/templates/propeller/service-webhook.yaml +# Source: dataplane/templates/webhook/service.yaml apiVersion: v1 kind: Service metadata: - name: flyte-pod-webhook + name: union-pod-webhook namespace: union labels: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm @@ -4950,7 +5189,7 @@ metadata: projectcontour.io/upstream-protocol.h2c: grpc spec: selector: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name ports: - name: https @@ -4962,23 +5201,23 @@ spec: port: 10254 targetPort: 10254 --- -# Source: dataplane/templates/propeller/service-webhook.yaml +# Source: dataplane/templates/webhook/service.yaml # Headless Service for cache 
invalidation — resolves to all pod IPs so that # we can fan out invalidation requests to every webhook replica. apiVersion: v1 kind: Service metadata: - name: flyte-pod-webhook-headless + name: union-pod-webhook-headless namespace: union labels: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm spec: clusterIP: None selector: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name ports: - name: cache-internal @@ -4986,32 +5225,6 @@ spec: port: 9443 targetPort: 9443 --- -# Source: dataplane/templates/propeller/service.yaml -apiVersion: v1 -kind: Service -metadata: - namespace: union - name: flytepropeller - labels: - platform.union.ai/prometheus-group: "union-services" - app.kubernetes.io/name: flytepropeller - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm -spec: - type: ClusterIP - ports: - - name: debug - protocol: TCP - port: 10254 - - name: fasttask - port: 15605 - protocol: TCP - targetPort: 15605 - selector: - app.kubernetes.io/name: flytepropeller - app.kubernetes.io/instance: release-name ---- # Source: dataplane/charts/fluentbit/templates/daemonset.yaml apiVersion: apps/v1 kind: DaemonSet @@ -5037,7 +5250,7 @@ spec: annotations: checksum/config: e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 spec: - serviceAccountName: fluentbit-system + serviceAccountName: union-system hostNetwork: false dnsPolicy: ClusterFirst containers: @@ -5089,20 +5302,217 @@ spec: tolerations: - operator: Exists --- -# Source: dataplane/charts/kube-state-metrics/templates/deployment.yaml +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2022 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the 
"License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: operator-webhook + namespace: "union" + labels: + app.kubernetes.io/component: operator-webhook + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator +spec: + selector: + matchLabels: + app: operator-webhook + role: operator-webhook + template: + metadata: + labels: + app: operator-webhook + role: operator-webhook + app.kubernetes.io/component: operator-webhook + app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/name: knative-operator + sidecar.istio.io/inject: "false" + spec: + # To avoid node becoming SPOF, spread our replicas to different nodes. + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - podAffinityTerm: + labelSelector: + matchLabels: + app: operator-webhook + topologyKey: kubernetes.io/hostname + weight: 100 + serviceAccountName: operator-webhook + containers: + - name: operator-webhook + # This is the Go import path for the binary that is containerized + # and substituted here. 
+ image: gcr.io/knative-releases/knative.dev/operator/cmd/webhook@sha256:d3a7a3304629ccfcaacec620b50736e0b4c902a6328b83369b6cbaf7e94677c9 + resources: + requests: + cpu: 100m + memory: 100Mi + limits: + cpu: 500m + memory: 500Mi + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: SYSTEM_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: CONFIG_LOGGING_NAME + value: config-logging + - name: CONFIG_OBSERVABILITY_NAME + value: config-observability + - name: WEBHOOK_NAME + value: operator-webhook + - name: WEBHOOK_PORT + value: "8443" + - name: WEBHOOK_SECRET_NAME + value: operator-webhook-certs + - name: METRICS_DOMAIN + value: knative.dev/operator + - name: KUBERNETES_MIN_VERSION + value: "" + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + runAsNonRoot: true + capabilities: + drop: + - ALL + ports: + - name: metrics + containerPort: 9090 + - name: profiling + containerPort: 8008 + - name: https-webhook + containerPort: 8443 + readinessProbe: + periodSeconds: 1 + httpGet: + scheme: HTTPS + port: 8443 + httpHeaders: + - name: k-kubelet-probe + value: "webhook" + livenessProbe: + periodSeconds: 1 + httpGet: + scheme: HTTPS + port: 8443 + httpHeaders: + - name: k-kubelet-probe + value: "webhook" + failureThreshold: 6 + initialDelaySeconds: 120 + # Our webhook should gracefully terminate by lame ducking first, set this to a sufficiently + # high value that we respect whatever value it has configured for the lame duck grace period. + terminationGracePeriodSeconds: 300 +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Copyright 2020 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: knative-operator + namespace: "union" + labels: + app.kubernetes.io/name: knative-operator + app.kubernetes.io/version: "1.16.0" +spec: + replicas: 1 + selector: + matchLabels: + name: knative-operator + template: + metadata: + labels: + name: knative-operator + app.kubernetes.io/name: knative-operator + app.kubernetes.io/version: "1.16.0" + sidecar.istio.io/inject: "false" + spec: + serviceAccountName: knative-operator + containers: + - name: knative-operator + image: gcr.io/knative-releases/knative.dev/operator/cmd/operator@sha256:0b5a3532417f9c8e7b6044e23f6f67ad932a74a40a4092ad965b6f173b2fd887 + resources: + requests: + cpu: 100m + memory: 100Mi + limits: + cpu: 1000m + memory: 1000Mi + imagePullPolicy: IfNotPresent + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: SYSTEM_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: METRICS_DOMAIN + value: knative.dev/operator + - name: CONFIG_LOGGING_NAME + value: config-logging + - name: CONFIG_OBSERVABILITY_NAME + value: config-observability + - name: KUBERNETES_MIN_VERSION + value: "" + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + runAsNonRoot: true + capabilities: + drop: + - ALL + ports: + - name: metrics + containerPort: 9090 +--- +# Source: dataplane/charts/prometheus/charts/kube-state-metrics/templates/deployment.yaml apiVersion: apps/v1 kind: Deployment metadata: name: release-name-kube-state-metrics namespace: union labels: - helm.sh/chart: 
kube-state-metrics-5.30.1 + helm.sh/chart: kube-state-metrics-5.25.1 app.kubernetes.io/managed-by: Helm app.kubernetes.io/component: metrics app.kubernetes.io/part-of: kube-state-metrics app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" + app.kubernetes.io/version: "2.13.0" spec: selector: matchLabels: @@ -5115,13 +5525,13 @@ spec: template: metadata: labels: - helm.sh/chart: kube-state-metrics-5.30.1 + helm.sh/chart: kube-state-metrics-5.25.1 app.kubernetes.io/managed-by: Helm app.kubernetes.io/component: metrics app.kubernetes.io/part-of: kube-state-metrics app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "2.15.0" + app.kubernetes.io/version: "2.13.0" spec: automountServiceAccountToken: true hostNetwork: false @@ -5138,8 +5548,9 @@ spec: args: - --port=8080 - --resources=certificatesigningrequests,configmaps,cronjobs,daemonsets,deployments,endpoints,horizontalpodautoscalers,ingresses,jobs,leases,limitranges,mutatingwebhookconfigurations,namespaces,networkpolicies,nodes,persistentvolumeclaims,persistentvolumes,poddisruptionbudgets,pods,replicasets,replicationcontrollers,resourcequotas,secrets,services,statefulsets,storageclasses,validatingwebhookconfigurations,volumeattachments + - --namespaces=union imagePullPolicy: IfNotPresent - image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.15.0 + image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.13.0 ports: - containerPort: 8080 name: "http" @@ -5159,7 +5570,7 @@ spec: httpGet: httpHeaders: path: /readyz - port: 8081 + port: 8080 scheme: HTTP initialDelaySeconds: 5 periodSeconds: 10 @@ -5174,215 +5585,204 @@ spec: - ALL readOnlyRootFilesystem: true --- -# Source: dataplane/charts/opencost/templates/deployment.yaml +# Source: dataplane/charts/prometheus/templates/deploy.yaml apiVersion: apps/v1 kind: Deployment metadata: - name: release-name-opencost - namespace: 
union labels: - helm.sh/chart: opencost-1.42.0 - app.kubernetes.io/name: opencost + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name - app.kubernetes.io/version: "1.111.0" - app.kubernetes.io/part-of: opencost + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: prometheus + name: union-operator-prometheus + namespace: union spec: - replicas: 1 selector: matchLabels: - app.kubernetes.io/name: opencost + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name - strategy: - rollingUpdate: - maxSurge: 1 - maxUnavailable: 1 - type: RollingUpdate + replicas: 1 + revisionHistoryLimit: 10 + strategy: + type: Recreate + rollingUpdate: null template: metadata: labels: - app.kubernetes.io/name: opencost + app.kubernetes.io/component: server + app.kubernetes.io/name: prometheus app.kubernetes.io/instance: release-name + app.kubernetes.io/version: v2.54.1 + helm.sh/chart: prometheus-25.27.0 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: prometheus spec: - serviceAccountName: release-name-opencost + enableServiceLinks: true + serviceAccountName: union-operator-prometheus containers: - - name: release-name-opencost - image: ghcr.io/opencost/opencost:1.111.0@sha256:6aa68e52a24b14ba41f23db08d1b9db1429a1c0300f4c0381ecc2c61fc311a97 - imagePullPolicy: IfNotPresent + - name: prometheus-server-configmap-reload + image: "quay.io/prometheus-operator/prometheus-config-reloader:v0.76.0" + imagePullPolicy: "IfNotPresent" args: + - --watched-dir=/etc/config + - --listen-address=0.0.0.0:8080 + - --reload-url=http://127.0.0.1:9090/prometheus/-/reload ports: - - containerPort: 9003 - name: http - resources: - limits: - cpu: 1000m - memory: 4Gi - requests: - cpu: 500m - memory: 1Gi - startupProbe: - httpGet: - path: /healthz - port: 9003 - initialDelaySeconds: 10 - 
periodSeconds: 5 - failureThreshold: 30 + - containerPort: 8080 + name: metrics livenessProbe: httpGet: path: /healthz - port: 9003 - initialDelaySeconds: 10 - periodSeconds: 20 - failureThreshold: 3 + port: metrics + scheme: HTTP + initialDelaySeconds: 2 + periodSeconds: 10 readinessProbe: httpGet: path: /healthz - port: 9003 - initialDelaySeconds: 10 + port: metrics + scheme: HTTP periodSeconds: 10 + volumeMounts: + - name: config-volume + mountPath: /etc/config + readOnly: true + + - name: prometheus-server + image: "quay.io/prometheus/prometheus:v2.54.1" + imagePullPolicy: "IfNotPresent" + args: + - --storage.tsdb.retention.time=3d + - --config.file=/etc/config/prometheus.yml + - --storage.tsdb.path=/data + - --web.console.libraries=/etc/prometheus/console_libraries + - --web.console.templates=/etc/prometheus/consoles + - --web.enable-lifecycle + - --web.route-prefix=/prometheus + - --web.external-url=/prometheus/ + ports: + - containerPort: 9090 + readinessProbe: + httpGet: + path: /prometheus/-/ready + port: 9090 + scheme: HTTP + initialDelaySeconds: 30 + periodSeconds: 5 + timeoutSeconds: 4 failureThreshold: 3 - env: - - name: LOG_LEVEL - value: info - - name: CUSTOM_COST_ENABLED - value: "false" - - name: KUBECOST_NAMESPACE - value: union - - name: API_PORT - value: "9003" - - name: PROMETHEUS_SERVER_ENDPOINT - value: "http://union-operator-prometheus.union.svc:80/prometheus" - - name: CLUSTER_ID - value: "default-cluster" - - name: DATA_RETENTION_DAILY_RESOLUTION_DAYS - value: "15" - - name: CLOUD_COST_ENABLED - value: "false" - - name: CLOUD_COST_MONTH_TO_DATE_INTERVAL - value: "6" - - name: CLOUD_COST_REFRESH_RATE_HOURS - value: "6" - - name: CLOUD_COST_QUERY_WINDOW_DAYS - value: "7" - - name: CLOUD_COST_RUN_WINDOW_DAYS - value: "3" - # Add any additional provided variables + successThreshold: 1 + livenessProbe: + httpGet: + path: /prometheus/-/healthy + port: 9090 + scheme: HTTP + initialDelaySeconds: 30 + periodSeconds: 15 + timeoutSeconds: 10 + 
failureThreshold: 3 + successThreshold: 1 + resources: + limits: + cpu: "3" + memory: 3500Mi + requests: + cpu: "1" + memory: 1Gi + volumeMounts: + - name: config-volume + mountPath: /etc/config + - name: storage-volume + mountPath: /data + subPath: "" + - name: server-recording-rules + mountPath: /etc/config/recording + subPath: + readOnly: + dnsPolicy: ClusterFirst + securityContext: + fsGroup: 65534 + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + terminationGracePeriodSeconds: 300 + volumes: + - name: config-volume + configMap: + name: union-operator-prometheus + - name: server-recording-rules + configMap: + name: union-recording-rules + - name: storage-volume + emptyDir: + sizeLimit: 10Gi --- -# Source: dataplane/templates/clusterresourcesync/deployment.yaml +# Source: dataplane/templates/flyteconnector/deployment.yaml apiVersion: apps/v1 kind: Deployment metadata: - name: union-syncresources + name: flyteconnector namespace: union - labels: - app.kubernetes.io/name: clusterresourcesync + labels: + app.kubernetes.io/name: flyteconnector app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm spec: - replicas: 1 + replicas: 2 selector: - matchLabels: - app.kubernetes.io/name: clusterresourcesync + matchLabels: + app.kubernetes.io/name: flyteconnector app.kubernetes.io/instance: release-name template: metadata: annotations: - configChecksum: "1bd09818f02c1912f9c75c474a35e8edb80667a3f7942753afb237355631f6f" - labels: platform.union.ai/zone: "dataplane" - app.kubernetes.io/name: clusterresourcesync + app.kubernetes.io/name: flyteconnector app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm spec: containers: - command: - - clusterresource - - --config - - /etc/flyte/config/*.yaml - - clusterresource - - run - env: - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_NAMESPACE - 
valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: GOMEMLIMIT - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.memory - - name: GOMAXPROCS - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.cpu - - name: CLUSTER_NAME - valueFrom: - secretKeyRef: - name: operator-cluster-name - key: cluster_name - - name: DEPLOYMENT_NAME - value: operator - - name: PROXY_SERVICE_URL - value: http://union-operator-proxy:8080 - - name: PROMETHEUS_SERVICE_URL - value: http://union-operator-prometheus:80 - - name: KNATIVE_PROXY_SERVICE_URL - value: http://kourier-internal - - name: AWS_REQUEST_CHECKSUM_CALCULATION - value: "when_required" - - name: CREATE_UPRIVER_DATA_SOURCE - value: "true" - - name: SOME_NUMERIC_VAR - value: "42" - image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" + - c0 + image: "ghcr.io/flyteorg/flyte-connectors:py3.13-2.0.0b50.dev3-g695bb1db3.d20260122" imagePullPolicy: "IfNotPresent" - name: sync-cluster-resources - resources: - limits: - cpu: "1" - memory: 500Mi - requests: - cpu: 500m - memory: 100Mi - volumeMounts: - - name: auth - mountPath: /etc/union/secret - - name: resource-templates - mountPath: /etc/flyte/clusterresource/templates - - name: config-volume - mountPath: /etc/flyte/config + name: flyteconnector + readinessProbe: + failureThreshold: 3 + httpGet: + path: / + port: 9090 + scheme: HTTP + initialDelaySeconds: 5 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 5 + env: + - name: AIRFLOW_CONN_GOOGLE_CLOUD_DEFAULT + value: "google-cloud-platform://" + - name: LOG_LEVEL + value: "20" ports: - - name: debug - containerPort: 10254 - protocol: TCP - serviceAccountName: union-clustersync-system - volumes: - - configMap: - name: union-clusterresource-template - name: resource-templates - - configMap: - name: union-clusterresourcesync-config - name: config-volume - - name: auth - secret: - secretName: union-secret-auth - - nodeSelector: - flyte.org/node-role: worker - tolerations: - - effect: 
NoSchedule - key: flyte.org/node-role - operator: Equal - value: worker + - containerPort: 8000 + name: grpc + - containerPort: 9090 + name: metric + resources: + limits: + cpu: "1.5" + ephemeral-storage: 100Mi + memory: 1500Mi + requests: + cpu: "1" + ephemeral-storage: 100Mi + memory: 1000Mi + serviceAccountName: flyteconnector --- # Source: dataplane/templates/imagebuilder/deployment.yaml apiVersion: apps/v1 @@ -5404,17 +5804,15 @@ spec: app.kubernetes.io/instance: release-name template: metadata: - annotations: - container.apparmor.security.beta.kubernetes.io/buildkit: unconfined labels: platform.union.ai/zone: "dataplane" app.kubernetes.io/name: imagebuilder-buildkit app.kubernetes.io/instance: release-name spec: - serviceAccountName: "union-imagebuilder" + serviceAccountName: "union-system" containers: - name: "buildkit" - image: "docker.io/moby/buildkit:buildx-stable-1-rootless" + image: "docker.io/moby/buildkit:buildx-stable-1" imagePullPolicy: IfNotPresent env: - name: POD_NAME @@ -5455,18 +5853,15 @@ spec: - name: SOME_NUMERIC_VAR value: "42" volumeMounts: - - mountPath: /home/user/.local/share/buildkit - name: buildkitd - mountPath: /etc/buildkit name: buildkit-config args: - --config - /etc/buildkit/buildkitd.toml - --addr - - unix:///run/user/1000/buildkit/buildkitd.sock + - unix:///run//buildkit/buildkitd.sock - --addr - tcp://0.0.0.0:1234 - - --oci-worker-no-process-sandbox ports: - name: tcp containerPort: 1234 @@ -5488,18 +5883,13 @@ spec: initialDelaySeconds: 5 periodSeconds: 30 securityContext: - seccompProfile: # Needs Kubernetes >= 1.19 - type: Unconfined - runAsUser: 1000 - runAsGroup: 1000 + privileged: true resources: requests: - cpu: 1 - ephemeral-storage: 20Gi - memory: 1Gi + cpu: 4 + ephemeral-storage: 50Gi + memory: 4Gi volumes: - - name: buildkitd - emptyDir: {} - configMap: name: union-operator-buildkit name: buildkit-config @@ -5528,16 +5918,18 @@ spec: template: metadata: annotations: - configChecksum: 
"5518a4d42a7f64a68f4856df98a6be10e932a27e2faf82a4fcc9375c4f1e290" + configChecksum: "ab417e9c80ea8e0680ba7733a11f4fd2cc2edc0ec72da30177b267fcec1da54" labels: platform.union.ai/zone: "dataplane" app: executor + app.kubernetes.io/instance: 'release-name' + app.kubernetes.io/name: executor spec: securityContext: fsGroup: 1337 - serviceAccountName: executor + serviceAccountName: union-system volumes: - name: config-volume configMap: @@ -5553,7 +5945,7 @@ spec: image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" imagePullPolicy: IfNotPresent command: - - executor + - executorv2 - serve - --config - /etc/config/*.yaml @@ -5645,7 +6037,7 @@ spec: template: metadata: annotations: - configChecksum: "7f52c9e14faa1e4b045aefd2b6b01472e62a3941cf6b1889d00ae80c4e42a6e" + configChecksum: "0770e2f78b475354622f69d9572955275f4b84354e7ad382a3b6c94736a36d5" labels: @@ -5660,12 +6052,10 @@ spec: sources: - configMap: name: union-operator - - configMap: - name: union-clusterresourcesync-config - name: secret-volume secret: secretName: union-secret-auth - serviceAccountName: proxy-system + serviceAccountName: union-system securityContext: {} containers: @@ -5796,7 +6186,7 @@ spec: template: metadata: annotations: - configChecksum: "7f52c9e14faa1e4b045aefd2b6b01472e62a3941cf6b1889d00ae80c4e42a6e" + configChecksum: "0770e2f78b475354622f69d9572955275f4b84354e7ad382a3b6c94736a36d5" labels: @@ -5805,7 +6195,7 @@ spec: platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm spec: - serviceAccountName: operator-system + serviceAccountName: union-system securityContext: {} volumes: @@ -5900,81 +6290,16 @@ spec: operator: Equal value: worker --- -# Source: dataplane/templates/prometheus/deployment.yaml -apiVersion: apps/v1 -kind: Deployment -metadata: - name: union-operator-prometheus - namespace: union - labels: - helm.sh/chart: dataplane-2026.4.2 - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - app.kubernetes.io/version: 
"2026.4.5" - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/component: prometheus -spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/component: prometheus - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - template: - metadata: - annotations: - configChecksum: "72a39ca838f02fc499675b5708fa77e2a4938e316a99315b59e33ef7f2d31ae" - labels: - app.kubernetes.io/component: prometheus - app.kubernetes.io/name: release-name-dataplane - app.kubernetes.io/instance: release-name - spec: - priorityClassName: system-cluster-critical - serviceAccountName: union-operator-prometheus - securityContext: - fsGroup: 65534 - runAsNonRoot: true - runAsUser: 65534 - fsGroupChangePolicy: OnRootMismatch - containers: - - name: prometheus - image: "prom/prometheus:v3.3.1" - args: - - --config.file=/etc/prometheus/prometheus.yml - - --web.external-url=/prometheus/ - - --web.route-prefix=/prometheus/ - - --storage.tsdb.retention.time=3d - ports: - - name: http - containerPort: 9090 - protocol: TCP - resources: - limits: - cpu: "3" - memory: 3500Mi - requests: - cpu: "1" - memory: 1Gi - securityContext: - allowPrivilegeEscalation: false - terminationMessagePolicy: FallbackToLogsOnError - volumeMounts: - - mountPath: /etc/prometheus - name: prometheus-config - volumes: - - name: prometheus-config - configMap: - name: union-operator-prometheus ---- -# Source: dataplane/templates/propeller/deployment-webhook.yaml +# Source: dataplane/templates/webhook/deployment.yaml +# Webhook deployment # Create the actual deployment apiVersion: apps/v1 kind: Deployment metadata: - name: flytepropeller-webhook + name: union-pod-webhook namespace: union labels: - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm @@ -5982,19 +6307,19 @@ spec: replicas: 1 selector: matchLabels: - 
app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name template: metadata: labels: platform.union.ai/zone: "dataplane" - app.kubernetes.io/name: flyte-pod-webhook + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm annotations: - configChecksum: "f3931dbcb3b33014ce22a4a663e84dbaadd5fb0aa0728492a9042bc8bbac1b9" + configChecksum: "77958aee0464eee4cb43c70036cf5ffa89f47f58c908845b9eeaecc7e32bba4" spec: securityContext: @@ -6002,70 +6327,8 @@ spec: fsGroupChangePolicy: Always runAsNonRoot: true runAsUser: 1001 - seLinuxOptions: - type: spc_t - serviceAccountName: flytepropeller-webhook-system - initContainers: - - name: generate-secrets - image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" - imagePullPolicy: "IfNotPresent" - command: - - flytepropeller - args: - - webhook - - init-certs - - --config - - /etc/flyte/config/*.yaml - env: - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: GOMEMLIMIT - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.memory - - name: GOMAXPROCS - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.cpu - - name: CLUSTER_NAME - valueFrom: - secretKeyRef: - name: operator-cluster-name - key: cluster_name - - name: DEPLOYMENT_NAME - value: operator - - name: PROXY_SERVICE_URL - value: http://union-operator-proxy:8080 - - name: PROMETHEUS_SERVICE_URL - value: http://union-operator-prometheus:80 - - name: KNATIVE_PROXY_SERVICE_URL - value: http://kourier-internal - - name: AWS_REQUEST_CHECKSUM_CALCULATION - value: "when_required" - - name: CREATE_UPRIVER_DATA_SOURCE - value: "true" - - name: SOME_NUMERIC_VAR - value: "42" - volumeMounts: - - name: config-volume - mountPath: /etc/flyte/config - resources: - limits: - cpu: 1 - 
ephemeral-storage: 500Mi - memory: 500Mi - requests: - cpu: 200m - ephemeral-storage: 500Mi - memory: 500Mi + seLinuxOptions: null + serviceAccountName: union-system containers: - name: webhook image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" @@ -6136,10 +6399,10 @@ spec: volumes: - name: config-volume configMap: - name: flyte-propeller-config + name: union-pod-webhook-config - name: webhook-certs secret: - secretName: flyte-pod-webhook + secretName: union-pod-webhook nodeSelector: flyte.org/node-role: worker @@ -6149,116 +6412,296 @@ spec: operator: Equal value: worker --- -# Source: dataplane/templates/propeller/deployment.yaml -apiVersion: apps/v1 -kind: Deployment +# Source: dataplane/templates/flyteconnector/hpa.yaml +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler metadata: - namespace: union - name: flytepropeller + name: flyteconnector + labels: + app.kubernetes.io/name: flyteconnector + app.kubernetes.io/instance: release-name + app.kubernetes.io/managed-by: Helm +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: flyteconnector + minReplicas: 2 + maxReplicas: 5 + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 80 + - type: Resource + resource: + name: memory + target: + type: Utilization + averageUtilization: 80 +--- +# Source: dataplane/templates/webhook/mutatingwebhookconfiguration.yaml +apiVersion: admissionregistration.k8s.io/v1 +kind: MutatingWebhookConfiguration +metadata: + name: union-pod-webhook-union labels: - app.kubernetes.io/name: flytepropeller + app.kubernetes.io/name: union-pod-webhook app.kubernetes.io/instance: release-name platform.union.ai/service-group: release-name app.kubernetes.io/managed-by: Helm +webhooks: + - name: union-pod-webhook.flyte.org + admissionReviewVersions: + - v1 + - v1beta1 + clientConfig: + caBundle: 
LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUROekNDQWgrZ0F3SUJBZ0lVSm9kQ09lem4vd1BHQTZjYkw1U3duSlpYT2hJd0RRWUpLb1pJaHZjTkFRRUwKQlFBd0t6RXBNQ2NHQTFVRUF3d2dabXg1ZEdWd2NtOXdaV3hzWlhJdGQyVmlhRzl2YXk1MWJtbHZiaTV6ZG1NdwpIaGNOTWpZd01USXpNREl6TmpVMVdoY05Nell3TVRJeE1ESXpOalUxV2pBck1Ta3dKd1lEVlFRRERDQm1iSGwwClpYQnliM0JsYkd4bGNpMTNaV0pvYjI5ckxuVnVhVzl1TG5OMll6Q0NBU0l3RFFZSktvWklodmNOQVFFQkJRQUQKZ2dFUEFEQ0NBUW9DZ2dFQkFJcjBSWDRGRjFNVTNYdFBmUHErMDUrMGI1Qm0xK3hsTVR5ZUp4bG5mWFJuRlgyegpPcHV6NmsreUpBQk4xRkxmNDVYRnlJclFlaW5DWWJtckZXalFRc3ZDWEloNkpPNGMwdmRMVVU5RERYcldWRFlsCkRNZytTUDlJVnZBMm5USkVFZ2RFREVsKzNWeThUbUJoNlNmcGthcXdHajY5SmhIdy9OeE9yZzM2bUY3TU5VZ1MKWXJVUkxqUFFzTUN6NElTVzhhQnEzS2IwRUFWUTA4V1lZQ0ZEbjY1aHBzVFdxR05rL3BKRnVTYjlsZll6RVdyYwpJeW5wNDhqTnIvNUdKS3NnWG9LeTUwZVo4UkppeTVyOUhENGtnSW15TVNQR2RDMXFZSVppRHRlZW8yOElxNk5nCmd1dWcyTGhlNndYYW9YZXFCQ2d3SHpwWFVXM1hJVzI5eFoxUkFRRUNBd0VBQWFOVE1GRXdIUVlEVlIwT0JCWUUKRktITDQ1RWhQaVg5bnlRN241QUk0YUZVNFlCL01COEdBMVVkSXdRWU1CYUFGS0hMNDVFaFBpWDlueVE3bjVBSQo0YUZVNFlCL01BOEdBMVVkRXdFQi93UUZNQU1CQWY4d0RRWUpLb1pJaHZjTkFRRUxCUUFEZ2dFQkFEeTRac21XCkNjZ3B5TmI4cU1ydzNYclBNejZLMTRlTVhqQVk1bERFYXM1NklLZExoZFM1bi9sWk95bENuUUtOZ3RHS0VrY3UKT05tMEE2Z0xNT3UzcE43dG9aNXh0WktOcnUvS0x6dm44QUNNNndCbUR0bG5oRERDa3ZUTHBVRzh6UTZSNmlaegpzTWlhVmpLSUJiWkJsZXNYSHZZQnQ2YlRTN2o1QWM2aTdYc1RPK3ZqQ2RUQjVYdCtEZ0x2QnBJRWh2ZGJzNXhmClFtc3ZzQmwxenJnR3JXemo4d0txcXNUMzZaOVU1TzdSK1RJMlQrUDVOR25SeHNQOGJ0R0RtMGZ5SW55WlJrSkcKVTdpVFJyMlZRWlRyaTNZQ2dlVjN1UjFtMnJjc2hyU1Avb082T0QyTXR5Q1FMRWtFK1pxVUVJVnpSYmVaNXl2RQptM29YMFV2OUtBUy9pWTA9Ci0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K + service: + name: union-pod-webhook + namespace: union + path: /mutate--v1-pod/secrets + port: 443 + failurePolicy: Fail + matchPolicy: Equivalent + namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: union + objectSelector: + matchLabels: + inject-flyte-secrets: "true" + organization: 'union' + reinvocationPolicy: Never + rules: + - apiGroups: + - '*' + apiVersions: + - v1 + operations: + - CREATE + resources: + 
- pods + scope: '*' + sideEffects: NoneOnDryRun + timeoutSeconds: 30 +--- +# Source: dataplane/charts/knative-operator/templates/knative-operator.yaml +# Imported via https://github.com/knative/operator/releases/download/knative-v1.16.0/operator.yaml +# Copyright 2024 The Knative Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +# Source: dataplane/templates/propeller/serviceaccount.yaml +--- +--- +# Source: dataplane/templates/serving/knative-serving.yaml +apiVersion: operator.knative.dev/v1beta1 +kind: KnativeServing +metadata: + name: union-operator-serving + namespace: union + labels: + helm.sh/chart: dataplane-2026.4.4 + app.kubernetes.io/name: release-name-dataplane + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: serving spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/name: flytepropeller + config: + deployment: + progress-deadline: "30m" + queue-sidecar-cpu-request: "25m" + queue-sidecar-cpu-limit: "1000m" + queue-sidecar-memory-request: "400Mi" + queue-sidecar-memory-limit: "800Mi" + queue-sidecar-ephemeral-storage-request: "512Mi" + queue-sidecar-ephemeral-storage-limit: "1024Mi" + registries-skipping-tag-resolving: managed.cr.union.ai + features: + kubernetes.podspec-affinity: "enabled" + kubernetes.podspec-nodeselector: "enabled" + kubernetes.podspec-tolerations: "enabled" + kubernetes.podspec-fieldref: "enabled" + 
kubernetes.podspec-dnspolicy: "enabled" + kubernetes.podspec-schedulername: "enabled" + kubernetes.podspec-securitycontext: "enabled" + network: + ingress-class: "kourier.ingress.networking.knative.dev" + high-availability: + replicas: 2 + ingress: + kourier: + enabled: true + bootstrap-configmap: "union-operator-serving-envoy-bootstrap" + service-type: ClusterIP + podDisruptionBudgets: + - name: 3scale-kourier-gateway-pdb + minAvailable: 50% + - name: activator-pdb + minAvailable: 50% + - name: webhook-pdb + minAvailable: 50% + registry: + override: + # TODO(jeev): Wire up Union fork of Envoy + 3scale-kourier-gateway/kourier-gateway: ghcr.io/unionai/envoy:456fed84d4ad9a9dfb186d117d9362e9dc0f7c1f + # TODO(jeev): Wire up Union fork of Kourier + net-kourier-controller/controller: ghcr.io/unionai/kourier@sha256:5804c348d15b3959604e3e3ceed216c3a1c7b32cbe254c7d3eb02a35e62ba9c4 + workloads: + - name: 3scale-kourier-gateway + labels: + helm.sh/chart: dataplane-2026.4.4 + app.kubernetes.io/name: release-name-dataplane app.kubernetes.io/instance: release-name - template: - metadata: - annotations: - configChecksum: "f3931dbcb3b33014ce22a4a663e84dbaadd5fb0aa0728492a9042bc8bbac1b9" - - labels: - platform.union.ai/zone: "dataplane" - - - app.kubernetes.io/name: flytepropeller - app.kubernetes.io/instance: release-name - platform.union.ai/service-group: release-name - app.kubernetes.io/managed-by: Helm - spec: - priorityClassName: system-cluster-critical - containers: - - command: - - flytepropeller - - --config - - /etc/flyte/config/*.yaml - - --propeller.cluster-id - - union-oci - env: - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: GOMEMLIMIT - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.memory - - name: GOMAXPROCS - valueFrom: - resourceFieldRef: - divisor: 1 - resource: limits.cpu - - name: CLUSTER_NAME - valueFrom: - secretKeyRef: - name: 
operator-cluster-name - key: cluster_name - - name: DEPLOYMENT_NAME - value: operator - - name: PROXY_SERVICE_URL - value: http://union-operator-proxy:8080 - - name: PROMETHEUS_SERVICE_URL - value: http://union-operator-prometheus:80 - - name: KNATIVE_PROXY_SERVICE_URL - value: http://kourier-internal - - name: AWS_REQUEST_CHECKSUM_CALCULATION - value: "when_required" - - name: CREATE_UPRIVER_DATA_SOURCE - value: "true" - - name: SOME_NUMERIC_VAR - value: "42" - image: "public.ecr.aws/p0i0a9q8/unionoperator:2026.4.5" - imagePullPolicy: "IfNotPresent" - name: flytepropeller - ports: - - containerPort: 10254 - resources: - limits: - cpu: "3" - memory: 3Gi - requests: - cpu: "1" - memory: 1Gi - volumeMounts: - - name: config-volume - mountPath: /etc/flyte/config - - name: auth - mountPath: /etc/union/secret - serviceAccountName: flytepropeller-system - volumes: - - configMap: - name: flyte-propeller-config - name: config-volume - - name: auth - secret: - secretName: union-secret-auth - - nodeSelector: - flyte.org/node-role: worker - tolerations: - - effect: NoSchedule - key: flyte.org/node-role - operator: Equal - value: worker + app.kubernetes.io/version: "2026.4.5" + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/component: 3scale-kourier-gateway + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchLabels: + app.kubernetes.io/name: 3scale-kourier-gateway + app.kubernetes.io/instance: release-name + topologyKey: topology.kubernetes.io/zone + annotations: + checksum/bootstrap-config: 6469f6f592eebc9e0c676d8ccc359a05bc799c0988e474b2da942c4cc328f656 + env: + - container: kourier-gateway + envVars: + - name: UNION_AUTHZ_TENANTAUTHURL + value: "https://union.us-west-2.union.ai/me" + - name: UNION_AUTHZ_TENANTAUTHSIGNINURL + value: "https://union.us-west-2.union.ai/login" + - name: UNION_AUTHZ_TENANTCONTROLPLANEURL + value: "https://union.us-west-2.union.ai" + resources: + - 
container: kourier-gateway + limits: + cpu: "2" + memory: 2Gi + requests: + cpu: "1" + memory: 1Gi + - name: activator + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - activator + topologyKey: topology.kubernetes.io/zone + - name: autoscaler + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - autoscaler + topologyKey: topology.kubernetes.io/zone + - name: autoscaler-hpa + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - autoscaler-hpa + topologyKey: topology.kubernetes.io/zone + - name: controller + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - controller + topologyKey: topology.kubernetes.io/zone + - name: net-kourier-controller + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - net-kourier-controller + topologyKey: topology.kubernetes.io/zone + env: + - container: controller + envVars: + - name: KOURIER_UNION_AUTHZ_ENABLED + value: "true" + resources: + - container: controller + limits: + cpu: "1" + memory: 1Gi + requests: + cpu: 500m + memory: 500Mi + - name: webhook + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - webhook + topologyKey: topology.kubernetes.io/zone +--- +# Source: 
dataplane/templates/common/task-podtemplate.yaml +apiVersion: v1 +kind: PodTemplate +metadata: + name: task-template + namespace: union +template: + spec: + serviceAccountName: union + containers: + - name: default + image: docker.io/rwgrim/docker-noop --- # Source: dataplane/charts/fluentbit/templates/tests/test-connection.yaml apiVersion: v1 diff --git a/tests/generated/knative-operator.crds-disabled.yaml b/tests/generated/knative-operator.crds-disabled.yaml index 35883d95..9a648aaf 100644 --- a/tests/generated/knative-operator.crds-disabled.yaml +++ b/tests/generated/knative-operator.crds-disabled.yaml @@ -14,7 +14,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - apiVersion: v1 kind: Namespace metadata: @@ -120,45 +119,7 @@ metadata: labels: app.kubernetes.io/version: "1.16.0" app.kubernetes.io/name: knative-operator -data: - _example: | - ################################ - # # - # EXAMPLE CONFIGURATION # - # # - ################################ - - # This block is not actually functional configuration, - # but serves to illustrate the available configuration - # options and document them in a way that is accessible - # to users that `kubectl edit` this config map. - # - # These sample configuration options may be copied out of - # this example block and unindented to be in the data block - # to actually change the configuration. 
- - # Common configuration for all Knative codebase - zap-logger-config: | - { - "level": "info", - "development": false, - "outputPaths": ["stdout"], - "errorOutputPaths": ["stderr"], - "encoding": "json", - "encoderConfig": { - "timeKey": "ts", - "levelKey": "level", - "nameKey": "logger", - "callerKey": "caller", - "messageKey": "msg", - "stacktraceKey": "stacktrace", - "lineEnding": "", - "levelEncoder": "", - "timeEncoder": "iso8601", - "durationEncoder": "", - "callerEncoder": "" - } - } +data: {} --- # Source: knative-operator/templates/knative-operator.yaml # Copyright 2019 The Knative Authors @@ -183,56 +144,7 @@ metadata: labels: app.kubernetes.io/version: "1.16.0" app.kubernetes.io/name: knative-operator -data: - _example: | - ################################ - # # - # EXAMPLE CONFIGURATION # - # # - ################################ - - # This block is not actually functional configuration, - # but serves to illustrate the available configuration - # options and document them in a way that is accessible - # to users that `kubectl edit` this config map. - # - # These sample configuration options may be copied out of - # this example block and unindented to be in the data block - # to actually change the configuration. - - # logging.enable-var-log-collection defaults to false. - # The fluentd daemon set will be set up to collect /var/log if - # this flag is true. - logging.enable-var-log-collection: false - - # logging.revision-url-template provides a template to use for producing the - # logging URL that is injected into the status of each Revision. - # This value is what you might use the the Knative monitoring bundle, and provides - # access to Kibana after setting up kubectl proxy. 
- logging.revision-url-template: | - http://localhost:8001/api/v1/namespaces/knative-monitoring/services/kibana-logging/proxy/app/kibana#/discover?_a=(query:(match:(kubernetes.labels.serving-knative-dev%2FrevisionUID:(query:'${REVISION_UID}',type:phrase)))) - - # metrics.backend-destination field specifies the system metrics destination. - # It supports either prometheus (the default) or stackdriver. - # Note: Using stackdriver will incur additional charges - metrics.backend-destination: prometheus - - # metrics.request-metrics-backend-destination specifies the request metrics - # destination. If non-empty, it enables queue proxy to send request metrics. - # Currently supported values: prometheus, stackdriver. - metrics.request-metrics-backend-destination: prometheus - - # metrics.stackdriver-project-id field specifies the stackdriver project ID. This - # field is optional. When running on GCE, application default credentials will be - # used if this field is not provided. - metrics.stackdriver-project-id: "" - - # metrics.allow-stackdriver-custom-metrics indicates whether it is allowed to send metrics to - # Stackdriver using "global" resource type and custom metric type if the - # metrics are not supported by "knative_revision" resource type. Setting this - # flag to "true" could cause extra Stackdriver charge. - # If metrics.backend-destination is not Stackdriver, this is ignored. - metrics.allow-stackdriver-custom-metrics: "false" +data: {} --- # Source: knative-operator/templates/knative-operator.yaml # Copyright 2020 The Knative Authors @@ -330,6 +242,7 @@ rules: [] # Rules are automatically filled in by the controller manager. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+ kind: ClusterRole apiVersion: rbac.authorization.k8s.io/v1 metadata: @@ -1117,6 +1030,7 @@ subjects: # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: diff --git a/tests/generated/knative-operator.default.yaml b/tests/generated/knative-operator.default.yaml index 2d352b30..9a648aaf 100644 --- a/tests/generated/knative-operator.default.yaml +++ b/tests/generated/knative-operator.default.yaml @@ -14,7 +14,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - apiVersion: v1 kind: Namespace metadata: @@ -120,45 +119,7 @@ metadata: labels: app.kubernetes.io/version: "1.16.0" app.kubernetes.io/name: knative-operator -data: - _example: | - ################################ - # # - # EXAMPLE CONFIGURATION # - # # - ################################ - - # This block is not actually functional configuration, - # but serves to illustrate the available configuration - # options and document them in a way that is accessible - # to users that `kubectl edit` this config map. - # - # These sample configuration options may be copied out of - # this example block and unindented to be in the data block - # to actually change the configuration. 
- - # Common configuration for all Knative codebase - zap-logger-config: | - { - "level": "info", - "development": false, - "outputPaths": ["stdout"], - "errorOutputPaths": ["stderr"], - "encoding": "json", - "encoderConfig": { - "timeKey": "ts", - "levelKey": "level", - "nameKey": "logger", - "callerKey": "caller", - "messageKey": "msg", - "stacktraceKey": "stacktrace", - "lineEnding": "", - "levelEncoder": "", - "timeEncoder": "iso8601", - "durationEncoder": "", - "callerEncoder": "" - } - } +data: {} --- # Source: knative-operator/templates/knative-operator.yaml # Copyright 2019 The Knative Authors @@ -183,10136 +144,7 @@ metadata: labels: app.kubernetes.io/version: "1.16.0" app.kubernetes.io/name: knative-operator -data: - _example: | - ################################ - # # - # EXAMPLE CONFIGURATION # - # # - ################################ - - # This block is not actually functional configuration, - # but serves to illustrate the available configuration - # options and document them in a way that is accessible - # to users that `kubectl edit` this config map. - # - # These sample configuration options may be copied out of - # this example block and unindented to be in the data block - # to actually change the configuration. - - # logging.enable-var-log-collection defaults to false. - # The fluentd daemon set will be set up to collect /var/log if - # this flag is true. - logging.enable-var-log-collection: false - - # logging.revision-url-template provides a template to use for producing the - # logging URL that is injected into the status of each Revision. - # This value is what you might use the the Knative monitoring bundle, and provides - # access to Kibana after setting up kubectl proxy. 
- logging.revision-url-template: | - http://localhost:8001/api/v1/namespaces/knative-monitoring/services/kibana-logging/proxy/app/kibana#/discover?_a=(query:(match:(kubernetes.labels.serving-knative-dev%2FrevisionUID:(query:'${REVISION_UID}',type:phrase)))) - - # metrics.backend-destination field specifies the system metrics destination. - # It supports either prometheus (the default) or stackdriver. - # Note: Using stackdriver will incur additional charges - metrics.backend-destination: prometheus - - # metrics.request-metrics-backend-destination specifies the request metrics - # destination. If non-empty, it enables queue proxy to send request metrics. - # Currently supported values: prometheus, stackdriver. - metrics.request-metrics-backend-destination: prometheus - - # metrics.stackdriver-project-id field specifies the stackdriver project ID. This - # field is optional. When running on GCE, application default credentials will be - # used if this field is not provided. - metrics.stackdriver-project-id: "" - - # metrics.allow-stackdriver-custom-metrics indicates whether it is allowed to send metrics to - # Stackdriver using "global" resource type and custom metric type if the - # metrics are not supported by "knative_revision" resource type. Setting this - # flag to "true" could cause extra Stackdriver charge. - # If metrics.backend-destination is not Stackdriver, this is ignored. - metrics.allow-stackdriver-custom-metrics: "false" ---- -# Source: knative-operator/templates/knative-crds.yaml -# Imported from https://github.com/knative/serving/releases/download/knative-v1.16.0/serving-crds.yaml - -# Copyright 2020 The Knative Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -apiVersion: apiextensions.k8s.io/v1 -kind: CustomResourceDefinition -metadata: - name: certificates.networking.internal.knative.dev - labels: - app.kubernetes.io/name: knative-serving - app.kubernetes.io/component: networking - app.kubernetes.io/version: "1.16.0" - knative.dev/crd-install: "true" -spec: - group: networking.internal.knative.dev - versions: - - name: v1alpha1 - served: true - storage: true - subresources: - status: {} - schema: - openAPIV3Schema: - description: |- - Certificate is responsible for provisioning a SSL certificate for the - given hosts. It is a Knative abstraction for various SSL certificate - provisioning solutions (such as cert-manager or self-signed SSL certificate). - type: object - properties: - apiVersion: - description: |- - APIVersion defines the versioned schema of this representation of an object. - Servers should convert recognized schemas to the latest internal value, and - may reject unrecognized values. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources - type: string - kind: - description: |- - Kind is a string value representing the REST resource this object represents. - Servers may infer this from the endpoint the client submits requests to. - Cannot be updated. - In CamelCase. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds - type: string - metadata: - type: object - spec: - description: |- - Spec is the desired state of the Certificate. 
- More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#spec-and-status - type: object - required: - - dnsNames - - secretName - properties: - dnsNames: - description: |- - DNSNames is a list of DNS names the Certificate could support. - The wildcard format of DNSNames (e.g. *.default.example.com) is supported. - type: array - items: - type: string - domain: - description: Domain is the top level domain of the values for DNSNames. - type: string - secretName: - description: SecretName is the name of the secret resource to store the SSL certificate in. - type: string - status: - description: |- - Status is the current state of the Certificate. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#spec-and-status - type: object - properties: - annotations: - description: |- - Annotations is additional Status fields for the Resource to save some - additional State as well as convey more information to the user. This is - roughly akin to Annotations on any k8s resource, just the reconciler conveying - richer information outwards. - type: object - additionalProperties: - type: string - conditions: - description: Conditions the latest available observations of a resource's current state. - type: array - items: - description: |- - Condition defines a readiness condition for a Knative resource. - See: https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/api-conventions.md#typical-status-properties - type: object - required: - - status - - type - properties: - lastTransitionTime: - description: |- - LastTransitionTime is the last time the condition transitioned from one status to another. - We use VolatileTime in place of metav1.Time to exclude this from creating equality.Semantic - differences (all other things held constant). - type: string - message: - description: A human readable message indicating details about the transition. 
- type: string - reason: - description: The reason for the condition's last transition. - type: string - severity: - description: |- - Severity with which to treat failures of this type of condition. - When this is not specified, it defaults to Error. - type: string - status: - description: Status of the condition, one of True, False, Unknown. - type: string - type: - description: Type of condition. - type: string - http01Challenges: - description: |- - HTTP01Challenges is a list of HTTP01 challenges that need to be fulfilled - in order to get the TLS certificate.. - type: array - items: - description: |- - HTTP01Challenge defines the status of a HTTP01 challenge that a certificate needs - to fulfill. - type: object - properties: - serviceName: - description: ServiceName is the name of the service to serve HTTP01 challenge requests. - type: string - serviceNamespace: - description: ServiceNamespace is the namespace of the service to serve HTTP01 challenge requests. - type: string - servicePort: - description: ServicePort is the port of the service to serve HTTP01 challenge requests. - anyOf: - - type: integer - - type: string - x-kubernetes-int-or-string: true - url: - description: URL is the URL that the HTTP01 challenge is expected to serve on. - type: string - notAfter: - description: |- - The expiration time of the TLS certificate stored in the secret named - by this resource in spec.secretName. - type: string - format: date-time - observedGeneration: - description: |- - ObservedGeneration is the 'Generation' of the Service that - was last processed by the controller. 
- type: integer - format: int64 - additionalPrinterColumns: - - name: Ready - type: string - jsonPath: ".status.conditions[?(@.type==\"Ready\")].status" - - name: Reason - type: string - jsonPath: ".status.conditions[?(@.type==\"Ready\")].reason" - names: - kind: Certificate - plural: certificates - singular: certificate - categories: - - knative-internal - - networking - shortNames: - - kcert - scope: Namespaced ---- -# Source: knative-operator/templates/knative-crds.yaml -# Copyright 2019 The Knative Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Note: The schema part of the spec is auto-generated by hack/update-schemas.sh. 
- -apiVersion: apiextensions.k8s.io/v1 -kind: CustomResourceDefinition -metadata: - name: configurations.serving.knative.dev - labels: - app.kubernetes.io/name: knative-serving - app.kubernetes.io/version: "1.16.0" - knative.dev/crd-install: "true" - duck.knative.dev/podspecable: "true" -spec: - group: serving.knative.dev - names: - kind: Configuration - plural: configurations - singular: configuration - categories: - - all - - knative - - serving - shortNames: - - config - - cfg - scope: Namespaced - versions: - - name: v1 - served: true - storage: true - subresources: - status: {} - additionalPrinterColumns: - - name: LatestCreated - type: string - jsonPath: .status.latestCreatedRevisionName - - name: LatestReady - type: string - jsonPath: .status.latestReadyRevisionName - - name: Ready - type: string - jsonPath: ".status.conditions[?(@.type=='Ready')].status" - - name: Reason - type: string - jsonPath: ".status.conditions[?(@.type=='Ready')].reason" - schema: - openAPIV3Schema: - description: |- - Configuration represents the "floating HEAD" of a linear history of Revisions. - Users create new Revisions by updating the Configuration's spec. - The "latest created" revision's name is available under status, as is the - "latest ready" revision's name. - See also: https://github.com/knative/serving/blob/main/docs/spec/overview.md#configuration - type: object - properties: - apiVersion: - description: |- - APIVersion defines the versioned schema of this representation of an object. - Servers should convert recognized schemas to the latest internal value, and - may reject unrecognized values. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources - type: string - kind: - description: |- - Kind is a string value representing the REST resource this object represents. - Servers may infer this from the endpoint the client submits requests to. - Cannot be updated. - In CamelCase. 
- More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds - type: string - metadata: - type: object - spec: - description: ConfigurationSpec holds the desired state of the Configuration (from the client). - type: object - properties: - template: - description: Template holds the latest specification for the Revision to be stamped out. - type: object - properties: - metadata: - type: object - properties: - annotations: - type: object - additionalProperties: - type: string - finalizers: - type: array - items: - type: string - labels: - type: object - additionalProperties: - type: string - name: - type: string - namespace: - type: string - x-kubernetes-preserve-unknown-fields: true - spec: - description: RevisionSpec holds the desired state of the Revision (from the client). - type: object - required: - - containers - properties: - affinity: - description: This is accessible behind a feature flag - kubernetes.podspec-affinity - type: object - x-kubernetes-preserve-unknown-fields: true - automountServiceAccountToken: - description: AutomountServiceAccountToken indicates whether a service account token should be automatically mounted. - type: boolean - containerConcurrency: - description: |- - ContainerConcurrency specifies the maximum allowed in-flight (concurrent) - requests per container of the Revision. Defaults to `0` which means - concurrency to the application is not limited, and the system decides the - target concurrency for the autoscaler. - type: integer - format: int64 - containers: - description: |- - List of containers belonging to the pod. - Containers cannot currently be added or removed. - There must be at least one container in a Pod. - Cannot be updated. - type: array - items: - description: A single application container that you want to run within a pod. - type: object - properties: - args: - description: |- - Arguments to the entrypoint. - The container image's CMD is used if this is not provided. 
- Variable references $(VAR_NAME) are expanded using the container's environment. If a variable - cannot be resolved, the reference in the input string will be unchanged. Double $$ are reduced - to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" will - produce the string literal "$(VAR_NAME)". Escaped references will never be expanded, regardless - of whether the variable exists or not. Cannot be updated. - More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell - type: array - items: - type: string - x-kubernetes-list-type: atomic - command: - description: |- - Entrypoint array. Not executed within a shell. - The container image's ENTRYPOINT is used if this is not provided. - Variable references $(VAR_NAME) are expanded using the container's environment. If a variable - cannot be resolved, the reference in the input string will be unchanged. Double $$ are reduced - to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" will - produce the string literal "$(VAR_NAME)". Escaped references will never be expanded, regardless - of whether the variable exists or not. Cannot be updated. - More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell - type: array - items: - type: string - x-kubernetes-list-type: atomic - env: - description: |- - List of environment variables to set in the container. - Cannot be updated. - type: array - items: - description: EnvVar represents an environment variable present in a Container. - type: object - required: - - name - properties: - name: - description: Name of the environment variable. Must be a C_IDENTIFIER. - type: string - value: - description: |- - Variable references $(VAR_NAME) are expanded - using the previously defined environment variables in the container and - any service environment variables. 
If a variable cannot be resolved, - the reference in the input string will be unchanged. Double $$ are reduced - to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. - "$$(VAR_NAME)" will produce the string literal "$(VAR_NAME)". - Escaped references will never be expanded, regardless of whether the variable - exists or not. - Defaults to "". - type: string - valueFrom: - description: Source for the environment variable's value. Cannot be used if value is not empty. - type: object - properties: - configMapKeyRef: - description: Selects a key of a ConfigMap. - type: object - required: - - key - properties: - key: - description: The key to select. - type: string - name: - description: |- - Name of the referent. - This field is effectively required, but due to backwards compatibility is - allowed to be empty. Instances of this type with an empty value here are - almost certainly wrong. - TODO: Add other useful fields. apiVersion, kind, uid? - More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Drop `kubebuilder:default` when controller-gen doesn't need it https://github.com/kubernetes-sigs/kubebuilder/issues/3896. - type: string - default: "" - optional: - description: Specify whether the ConfigMap or its key must be defined - type: boolean - x-kubernetes-map-type: atomic - fieldRef: - description: This is accessible behind a feature flag - kubernetes.podspec-fieldref - type: object - x-kubernetes-preserve-unknown-fields: true - x-kubernetes-map-type: atomic - resourceFieldRef: - description: This is accessible behind a feature flag - kubernetes.podspec-fieldref - type: object - x-kubernetes-preserve-unknown-fields: true - x-kubernetes-map-type: atomic - secretKeyRef: - description: Selects a key of a secret in the pod's namespace - type: object - required: - - key - properties: - key: - description: The key of the secret to select from. Must be a valid secret key. 
- type: string - name: - description: |- - Name of the referent. - This field is effectively required, but due to backwards compatibility is - allowed to be empty. Instances of this type with an empty value here are - almost certainly wrong. - TODO: Add other useful fields. apiVersion, kind, uid? - More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Drop `kubebuilder:default` when controller-gen doesn't need it https://github.com/kubernetes-sigs/kubebuilder/issues/3896. - type: string - default: "" - optional: - description: Specify whether the Secret or its key must be defined - type: boolean - x-kubernetes-map-type: atomic - x-kubernetes-list-map-keys: - - name - x-kubernetes-list-type: map - envFrom: - description: |- - List of sources to populate environment variables in the container. - The keys defined within a source must be a C_IDENTIFIER. All invalid keys - will be reported as an event when the container is starting. When a key exists in multiple - sources, the value associated with the last source will take precedence. - Values defined by an Env with a duplicate key will take precedence. - Cannot be updated. - type: array - items: - description: EnvFromSource represents the source of a set of ConfigMaps - type: object - properties: - configMapRef: - description: The ConfigMap to select from - type: object - properties: - name: - description: |- - Name of the referent. - This field is effectively required, but due to backwards compatibility is - allowed to be empty. Instances of this type with an empty value here are - almost certainly wrong. - TODO: Add other useful fields. apiVersion, kind, uid? - More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Drop `kubebuilder:default` when controller-gen doesn't need it https://github.com/kubernetes-sigs/kubebuilder/issues/3896. 
- type: string - default: "" - optional: - description: Specify whether the ConfigMap must be defined - type: boolean - x-kubernetes-map-type: atomic - prefix: - description: An optional identifier to prepend to each key in the ConfigMap. Must be a C_IDENTIFIER. - type: string - secretRef: - description: The Secret to select from - type: object - properties: - name: - description: |- - Name of the referent. - This field is effectively required, but due to backwards compatibility is - allowed to be empty. Instances of this type with an empty value here are - almost certainly wrong. - TODO: Add other useful fields. apiVersion, kind, uid? - More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Drop `kubebuilder:default` when controller-gen doesn't need it https://github.com/kubernetes-sigs/kubebuilder/issues/3896. - type: string - default: "" - optional: - description: Specify whether the Secret must be defined - type: boolean - x-kubernetes-map-type: atomic - x-kubernetes-list-type: atomic - image: - description: |- - Container image name. - More info: https://kubernetes.io/docs/concepts/containers/images - This field is optional to allow higher level config management to default or override - container images in workload controllers like Deployments and StatefulSets. - type: string - imagePullPolicy: - description: |- - Image pull policy. - One of Always, Never, IfNotPresent. - Defaults to Always if :latest tag is specified, or IfNotPresent otherwise. - Cannot be updated. - More info: https://kubernetes.io/docs/concepts/containers/images#updating-images - type: string - livenessProbe: - description: |- - Periodic probe of container liveness. - Container will be restarted if the probe fails. - Cannot be updated. - More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes - type: object - properties: - exec: - description: Exec specifies the action to take. 
- type: object - properties: - command: - description: |- - Command is the command line to execute inside the container, the working directory for the - command is root ('/') in the container's filesystem. The command is simply exec'd, it is - not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use - a shell, you need to explicitly call out to that shell. - Exit status of 0 is treated as live/healthy and non-zero is unhealthy. - type: array - items: - type: string - x-kubernetes-list-type: atomic - failureThreshold: - description: |- - Minimum consecutive failures for the probe to be considered failed after having succeeded. - Defaults to 3. Minimum value is 1. - type: integer - format: int32 - grpc: - description: GRPC specifies an action involving a GRPC port. - type: object - required: - - port - properties: - port: - description: Port number of the gRPC service. Number must be in the range 1 to 65535. - type: integer - format: int32 - service: - description: |- - Service is the name of the service to place in the gRPC HealthCheckRequest - (see https://github.com/grpc/grpc/blob/master/doc/health-checking.md). - - - If this is not specified, the default behavior is defined by gRPC. - type: string - httpGet: - description: HTTPGet specifies the http request to perform. - type: object - properties: - host: - description: |- - Host name to connect to, defaults to the pod IP. You probably want to set - "Host" in httpHeaders instead. - type: string - httpHeaders: - description: Custom headers to set in the request. HTTP allows repeated headers. - type: array - items: - description: HTTPHeader describes a custom header to be used in HTTP probes - type: object - required: - - name - - value - properties: - name: - description: |- - The header field name. - This will be canonicalized upon output, so case-variant names will be understood as the same header. 
- type: string - value: - description: The header field value - type: string - x-kubernetes-list-type: atomic - path: - description: Path to access on the HTTP server. - type: string - port: - description: |- - Name or number of the port to access on the container. - Number must be in the range 1 to 65535. - Name must be an IANA_SVC_NAME. - anyOf: - - type: integer - - type: string - x-kubernetes-int-or-string: true - scheme: - description: |- - Scheme to use for connecting to the host. - Defaults to HTTP. - type: string - initialDelaySeconds: - description: |- - Number of seconds after the container has started before liveness probes are initiated. - More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes - type: integer - format: int32 - periodSeconds: - description: How often (in seconds) to perform the probe. - type: integer - format: int32 - successThreshold: - description: |- - Minimum consecutive successes for the probe to be considered successful after having failed. - Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1. - type: integer - format: int32 - tcpSocket: - description: TCPSocket specifies an action involving a TCP port. - type: object - properties: - host: - description: 'Optional: Host name to connect to, defaults to the pod IP.' - type: string - port: - description: |- - Number or name of the port to access on the container. - Number must be in the range 1 to 65535. - Name must be an IANA_SVC_NAME. - anyOf: - - type: integer - - type: string - x-kubernetes-int-or-string: true - timeoutSeconds: - description: |- - Number of seconds after which the probe times out. - Defaults to 1 second. Minimum value is 1. - More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes - type: integer - format: int32 - name: - description: |- - Name of the container specified as a DNS_LABEL. - Each container in a pod must have a unique name (DNS_LABEL). - Cannot be updated. 
- type: string - ports: - description: |- - List of ports to expose from the container. Not specifying a port here - DOES NOT prevent that port from being exposed. Any port which is - listening on the default "0.0.0.0" address inside a container will be - accessible from the network. - Modifying this array with strategic merge patch may corrupt the data. - For more information See https://github.com/kubernetes/kubernetes/issues/108255. - Cannot be updated. - type: array - items: - description: ContainerPort represents a network port in a single container. - type: object - required: - - containerPort - properties: - containerPort: - description: |- - Number of port to expose on the pod's IP address. - This must be a valid port number, 0 < x < 65536. - type: integer - format: int32 - name: - description: |- - If specified, this must be an IANA_SVC_NAME and unique within the pod. Each - named port in a pod must have a unique name. Name for the port that can be - referred to by services. - type: string - protocol: - description: |- - Protocol for port. Must be UDP, TCP, or SCTP. - Defaults to "TCP". - type: string - default: TCP - x-kubernetes-list-map-keys: - - containerPort - - protocol - x-kubernetes-list-type: map - readinessProbe: - description: |- - Periodic probe of container service readiness. - Container will be removed from service endpoints if the probe fails. - Cannot be updated. - More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes - type: object - properties: - exec: - description: Exec specifies the action to take. - type: object - properties: - command: - description: |- - Command is the command line to execute inside the container, the working directory for the - command is root ('/') in the container's filesystem. The command is simply exec'd, it is - not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use - a shell, you need to explicitly call out to that shell. 
- Exit status of 0 is treated as live/healthy and non-zero is unhealthy. - type: array - items: - type: string - x-kubernetes-list-type: atomic - failureThreshold: - description: |- - Minimum consecutive failures for the probe to be considered failed after having succeeded. - Defaults to 3. Minimum value is 1. - type: integer - format: int32 - grpc: - description: GRPC specifies an action involving a GRPC port. - type: object - required: - - port - properties: - port: - description: Port number of the gRPC service. Number must be in the range 1 to 65535. - type: integer - format: int32 - service: - description: |- - Service is the name of the service to place in the gRPC HealthCheckRequest - (see https://github.com/grpc/grpc/blob/master/doc/health-checking.md). - - - If this is not specified, the default behavior is defined by gRPC. - type: string - httpGet: - description: HTTPGet specifies the http request to perform. - type: object - properties: - host: - description: |- - Host name to connect to, defaults to the pod IP. You probably want to set - "Host" in httpHeaders instead. - type: string - httpHeaders: - description: Custom headers to set in the request. HTTP allows repeated headers. - type: array - items: - description: HTTPHeader describes a custom header to be used in HTTP probes - type: object - required: - - name - - value - properties: - name: - description: |- - The header field name. - This will be canonicalized upon output, so case-variant names will be understood as the same header. - type: string - value: - description: The header field value - type: string - x-kubernetes-list-type: atomic - path: - description: Path to access on the HTTP server. - type: string - port: - description: |- - Name or number of the port to access on the container. - Number must be in the range 1 to 65535. - Name must be an IANA_SVC_NAME. 
- anyOf: - - type: integer - - type: string - x-kubernetes-int-or-string: true - scheme: - description: |- - Scheme to use for connecting to the host. - Defaults to HTTP. - type: string - initialDelaySeconds: - description: |- - Number of seconds after the container has started before liveness probes are initiated. - More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes - type: integer - format: int32 - periodSeconds: - description: How often (in seconds) to perform the probe. - type: integer - format: int32 - successThreshold: - description: |- - Minimum consecutive successes for the probe to be considered successful after having failed. - Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1. - type: integer - format: int32 - tcpSocket: - description: TCPSocket specifies an action involving a TCP port. - type: object - properties: - host: - description: 'Optional: Host name to connect to, defaults to the pod IP.' - type: string - port: - description: |- - Number or name of the port to access on the container. - Number must be in the range 1 to 65535. - Name must be an IANA_SVC_NAME. - anyOf: - - type: integer - - type: string - x-kubernetes-int-or-string: true - timeoutSeconds: - description: |- - Number of seconds after which the probe times out. - Defaults to 1 second. Minimum value is 1. - More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes - type: integer - format: int32 - resources: - description: |- - Compute Resources required by this container. - Cannot be updated. - More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ - type: object - properties: - claims: - description: |- - Claims lists the names of resources, defined in spec.resourceClaims, - that are used by this container. - - - This is an alpha field and requires enabling the - DynamicResourceAllocation feature gate. - - - This field is immutable. 
It can only be set for containers. - type: array - items: - description: ResourceClaim references one entry in PodSpec.ResourceClaims. - type: object - required: - - name - properties: - name: - description: |- - Name must match the name of one entry in pod.spec.resourceClaims of - the Pod where this field is used. It makes that resource available - inside a container. - type: string - x-kubernetes-list-map-keys: - - name - x-kubernetes-list-type: map - limits: - description: |- - Limits describes the maximum amount of compute resources allowed. - More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ - type: object - additionalProperties: - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - anyOf: - - type: integer - - type: string - x-kubernetes-int-or-string: true - requests: - description: |- - Requests describes the minimum amount of compute resources required. - If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, - otherwise to an implementation-defined value. Requests cannot exceed Limits. - More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ - type: object - additionalProperties: - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - anyOf: - - type: integer - - type: string - x-kubernetes-int-or-string: true - securityContext: - description: |- - SecurityContext defines the security options the container should be run with. - If set, the fields of SecurityContext override the equivalent fields of PodSecurityContext. - More info: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/ - type: object - properties: - allowPrivilegeEscalation: - description: |- - AllowPrivilegeEscalation controls whether a process can gain more - privileges than its parent process. 
This bool directly controls if - the no_new_privs flag will be set on the container process. - AllowPrivilegeEscalation is true always when the container is: - 1) run as Privileged - 2) has CAP_SYS_ADMIN - Note that this field cannot be set when spec.os.name is windows. - type: boolean - capabilities: - description: |- - The capabilities to add/drop when running containers. - Defaults to the default set of capabilities granted by the container runtime. - Note that this field cannot be set when spec.os.name is windows. - type: object - properties: - add: - description: This is accessible behind a feature flag - kubernetes.containerspec-addcapabilities - type: array - items: - description: Capability represent POSIX capabilities type - type: string - x-kubernetes-list-type: atomic - drop: - description: Removed capabilities - type: array - items: - description: Capability represent POSIX capabilities type - type: string - x-kubernetes-list-type: atomic - readOnlyRootFilesystem: - description: |- - Whether this container has a read-only root filesystem. - Default is false. - Note that this field cannot be set when spec.os.name is windows. - type: boolean - runAsGroup: - description: |- - The GID to run the entrypoint of the container process. - Uses runtime default if unset. - May also be set in PodSecurityContext. If set in both SecurityContext and - PodSecurityContext, the value specified in SecurityContext takes precedence. - Note that this field cannot be set when spec.os.name is windows. - type: integer - format: int64 - runAsNonRoot: - description: |- - Indicates that the container must run as a non-root user. - If true, the Kubelet will validate the image at runtime to ensure that it - does not run as UID 0 (root) and fail to start the container if it does. - If unset or false, no such validation will be performed. - May also be set in PodSecurityContext. 
If set in both SecurityContext and - PodSecurityContext, the value specified in SecurityContext takes precedence. - type: boolean - runAsUser: - description: |- - The UID to run the entrypoint of the container process. - Defaults to user specified in image metadata if unspecified. - May also be set in PodSecurityContext. If set in both SecurityContext and - PodSecurityContext, the value specified in SecurityContext takes precedence. - Note that this field cannot be set when spec.os.name is windows. - type: integer - format: int64 - seccompProfile: - description: |- - The seccomp options to use by this container. If seccomp options are - provided at both the pod & container level, the container options - override the pod options. - Note that this field cannot be set when spec.os.name is windows. - type: object - required: - - type - properties: - localhostProfile: - description: |- - localhostProfile indicates a profile defined in a file on the node should be used. - The profile must be preconfigured on the node to work. - Must be a descending path, relative to the kubelet's configured seccomp profile location. - Must be set if type is "Localhost". Must NOT be set for any other type. - type: string - type: - description: |- - type indicates which kind of seccomp profile will be applied. - Valid options are: - - - Localhost - a profile defined in a file on the node should be used. - RuntimeDefault - the container runtime default profile should be used. - Unconfined - no profile should be applied. - type: string - startupProbe: - description: |- - StartupProbe indicates that the Pod has successfully initialized. - If specified, no other probes are executed until this completes successfully. - If this probe fails, the Pod will be restarted, just as if the livenessProbe failed. - This can be used to provide different probe parameters at the beginning of a Pod's lifecycle, - when it might take a long time to load data or warm a cache, than during steady-state operation. 
- This cannot be updated. - More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes - type: object - properties: - exec: - description: Exec specifies the action to take. - type: object - properties: - command: - description: |- - Command is the command line to execute inside the container, the working directory for the - command is root ('/') in the container's filesystem. The command is simply exec'd, it is - not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use - a shell, you need to explicitly call out to that shell. - Exit status of 0 is treated as live/healthy and non-zero is unhealthy. - type: array - items: - type: string - x-kubernetes-list-type: atomic - failureThreshold: - description: |- - Minimum consecutive failures for the probe to be considered failed after having succeeded. - Defaults to 3. Minimum value is 1. - type: integer - format: int32 - grpc: - description: GRPC specifies an action involving a GRPC port. - type: object - required: - - port - properties: - port: - description: Port number of the gRPC service. Number must be in the range 1 to 65535. - type: integer - format: int32 - service: - description: |- - Service is the name of the service to place in the gRPC HealthCheckRequest - (see https://github.com/grpc/grpc/blob/master/doc/health-checking.md). - - - If this is not specified, the default behavior is defined by gRPC. - type: string - httpGet: - description: HTTPGet specifies the http request to perform. - type: object - properties: - host: - description: |- - Host name to connect to, defaults to the pod IP. You probably want to set - "Host" in httpHeaders instead. - type: string - httpHeaders: - description: Custom headers to set in the request. HTTP allows repeated headers. 
- type: array - items: - description: HTTPHeader describes a custom header to be used in HTTP probes - type: object - required: - - name - - value - properties: - name: - description: |- - The header field name. - This will be canonicalized upon output, so case-variant names will be understood as the same header. - type: string - value: - description: The header field value - type: string - x-kubernetes-list-type: atomic - path: - description: Path to access on the HTTP server. - type: string - port: - description: |- - Name or number of the port to access on the container. - Number must be in the range 1 to 65535. - Name must be an IANA_SVC_NAME. - anyOf: - - type: integer - - type: string - x-kubernetes-int-or-string: true - scheme: - description: |- - Scheme to use for connecting to the host. - Defaults to HTTP. - type: string - initialDelaySeconds: - description: |- - Number of seconds after the container has started before liveness probes are initiated. - More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes - type: integer - format: int32 - periodSeconds: - description: How often (in seconds) to perform the probe. - type: integer - format: int32 - successThreshold: - description: |- - Minimum consecutive successes for the probe to be considered successful after having failed. - Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1. - type: integer - format: int32 - tcpSocket: - description: TCPSocket specifies an action involving a TCP port. - type: object - properties: - host: - description: 'Optional: Host name to connect to, defaults to the pod IP.' - type: string - port: - description: |- - Number or name of the port to access on the container. - Number must be in the range 1 to 65535. - Name must be an IANA_SVC_NAME. - anyOf: - - type: integer - - type: string - x-kubernetes-int-or-string: true - timeoutSeconds: - description: |- - Number of seconds after which the probe times out. 
- Defaults to 1 second. Minimum value is 1. - More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes - type: integer - format: int32 - terminationMessagePath: - description: |- - Optional: Path at which the file to which the container's termination message - will be written is mounted into the container's filesystem. - Message written is intended to be brief final status, such as an assertion failure message. - Will be truncated by the node if greater than 4096 bytes. The total message length across - all containers will be limited to 12kb. - Defaults to /dev/termination-log. - Cannot be updated. - type: string - terminationMessagePolicy: - description: |- - Indicate how the termination message should be populated. File will use the contents of - terminationMessagePath to populate the container status message on both success and failure. - FallbackToLogsOnError will use the last chunk of container log output if the termination - message file is empty and the container exited with an error. - The log output is limited to 2048 bytes or 80 lines, whichever is smaller. - Defaults to File. - Cannot be updated. - type: string - volumeMounts: - description: |- - Pod volumes to mount into the container's filesystem. - Cannot be updated. - type: array - items: - description: VolumeMount describes a mounting of a Volume within a container. - type: object - required: - - mountPath - - name - properties: - mountPath: - description: |- - Path within the container at which the volume should be mounted. Must - not contain ':'. - type: string - name: - description: This must match the Name of a Volume. - type: string - readOnly: - description: |- - Mounted read-only if true, read-write otherwise (false or unspecified). - Defaults to false. - type: boolean - subPath: - description: |- - Path within the volume from which the container's volume should be mounted. - Defaults to "" (volume's root). 
- type: string - x-kubernetes-list-map-keys: - - mountPath - x-kubernetes-list-type: map - workingDir: - description: |- - Container's working directory. - If not specified, the container runtime's default will be used, which - might be configured in the container image. - Cannot be updated. - type: string - dnsConfig: - description: This is accessible behind a feature flag - kubernetes.podspec-dnsconfig - type: object - x-kubernetes-preserve-unknown-fields: true - dnsPolicy: - description: This is accessible behind a feature flag - kubernetes.podspec-dnspolicy - type: string - enableServiceLinks: - description: 'EnableServiceLinks indicates whether information about services should be injected into pod''s environment variables, matching the syntax of Docker links. Optional: Knative defaults this to false.' - type: boolean - hostAliases: - description: This is accessible behind a feature flag - kubernetes.podspec-hostaliases - type: array - items: - description: This is accessible behind a feature flag - kubernetes.podspec-hostaliases - type: object - x-kubernetes-preserve-unknown-fields: true - hostIPC: - description: This is accessible behind a feature flag - kubernetes.podspec-hostipc - type: boolean - x-kubernetes-preserve-unknown-fields: true - hostNetwork: - description: This is accessible behind a feature flag - kubernetes.podspec-hostnetwork - type: boolean - x-kubernetes-preserve-unknown-fields: true - hostPID: - description: This is accessible behind a feature flag - kubernetes.podspec-hostpid - type: boolean - x-kubernetes-preserve-unknown-fields: true - idleTimeoutSeconds: - description: |- - IdleTimeoutSeconds is the maximum duration in seconds a request will be allowed - to stay open while not receiving any bytes from the user's application. If - unspecified, a system default will be provided. 
- type: integer - format: int64 - imagePullSecrets: - description: |- - ImagePullSecrets is an optional list of references to secrets in the same namespace to use for pulling any of the images used by this PodSpec. - If specified, these secrets will be passed to individual puller implementations for them to use. - More info: https://kubernetes.io/docs/concepts/containers/images#specifying-imagepullsecrets-on-a-pod - type: array - items: - description: |- - LocalObjectReference contains enough information to let you locate the - referenced object inside the same namespace. - type: object - properties: - name: - description: |- - Name of the referent. - This field is effectively required, but due to backwards compatibility is - allowed to be empty. Instances of this type with an empty value here are - almost certainly wrong. - TODO: Add other useful fields. apiVersion, kind, uid? - More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Drop `kubebuilder:default` when controller-gen doesn't need it https://github.com/kubernetes-sigs/kubebuilder/issues/3896. - type: string - default: "" - x-kubernetes-map-type: atomic - x-kubernetes-list-map-keys: - - name - x-kubernetes-list-type: map - initContainers: - description: |- - List of initialization containers belonging to the pod. - Init containers are executed in order prior to containers being started. If any - init container fails, the pod is considered to have failed and is handled according - to its restartPolicy. The name for an init container or normal container must be - unique among all containers. - Init containers may not have Lifecycle actions, Readiness probes, Liveness probes, or Startup probes. - The resourceRequirements of an init container are taken into account during scheduling - by finding the highest request/limit for each resource type, and then using the max of - of that value or the sum of the normal containers. 
Limits are applied to init containers - in a similar fashion. - Init containers cannot currently be added or removed. - Cannot be updated. - More info: https://kubernetes.io/docs/concepts/workloads/pods/init-containers/ - type: array - items: - description: This is accessible behind a feature flag - kubernetes.podspec-init-containers - type: object - x-kubernetes-preserve-unknown-fields: true - nodeSelector: - description: This is accessible behind a feature flag - kubernetes.podspec-nodeselector - type: object - x-kubernetes-preserve-unknown-fields: true - x-kubernetes-map-type: atomic - priorityClassName: - description: This is accessible behind a feature flag - kubernetes.podspec-priorityclassname - type: string - x-kubernetes-preserve-unknown-fields: true - responseStartTimeoutSeconds: - description: |- - ResponseStartTimeoutSeconds is the maximum duration in seconds that the request - routing layer will wait for a request delivered to a container to begin - sending any network traffic. - type: integer - format: int64 - runtimeClassName: - description: This is accessible behind a feature flag - kubernetes.podspec-runtimeclassname - type: string - x-kubernetes-preserve-unknown-fields: true - schedulerName: - description: This is accessible behind a feature flag - kubernetes.podspec-schedulername - type: string - x-kubernetes-preserve-unknown-fields: true - securityContext: - description: This is accessible behind a feature flag - kubernetes.podspec-securitycontext - type: object - x-kubernetes-preserve-unknown-fields: true - serviceAccountName: - description: |- - ServiceAccountName is the name of the ServiceAccount to use to run this pod. 
- More info: https://kubernetes.io/docs/tasks/configure-pod-container/configure-service-account/ - type: string - shareProcessNamespace: - description: This is accessible behind a feature flag - kubernetes.podspec-shareproccessnamespace - type: boolean - x-kubernetes-preserve-unknown-fields: true - timeoutSeconds: - description: |- - TimeoutSeconds is the maximum duration in seconds that the request instance - is allowed to respond to a request. If unspecified, a system default will - be provided. - type: integer - format: int64 - tolerations: - description: This is accessible behind a feature flag - kubernetes.podspec-tolerations - type: array - items: - description: This is accessible behind a feature flag - kubernetes.podspec-tolerations - type: object - x-kubernetes-preserve-unknown-fields: true - x-kubernetes-list-type: atomic - topologySpreadConstraints: - description: This is accessible behind a feature flag - kubernetes.podspec-topologyspreadconstraints - type: array - items: - description: This is accessible behind a feature flag - kubernetes.podspec-topologyspreadconstraints - type: object - x-kubernetes-preserve-unknown-fields: true - volumes: - description: |- - List of volumes that can be mounted by containers belonging to the pod. - More info: https://kubernetes.io/docs/concepts/storage/volumes - type: array - items: - description: Volume represents a named volume in a pod that may be accessed by any container in the pod. - type: object - required: - - name - properties: - configMap: - description: configMap represents a configMap that should populate this volume - type: object - properties: - defaultMode: - description: |- - defaultMode is optional: mode bits used to set permissions on created files by default. - Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511. - YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. - Defaults to 0644. 
- Directories within the path are not affected by this setting. - This might be in conflict with other options that affect the file - mode, like fsGroup, and the result can be other mode bits set. - type: integer - format: int32 - items: - description: |- - items if unspecified, each key-value pair in the Data field of the referenced - ConfigMap will be projected into the volume as a file whose name is the - key and content is the value. If specified, the listed keys will be - projected into the specified paths, and unlisted keys will not be - present. If a key is specified which is not present in the ConfigMap, - the volume setup will error unless it is marked optional. Paths must be - relative and may not contain the '..' path or start with '..'. - type: array - items: - description: Maps a string key to a path within a volume. - type: object - required: - - key - - path - properties: - key: - description: key is the key to project. - type: string - mode: - description: |- - mode is Optional: mode bits used to set permissions on this file. - Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511. - YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. - If not specified, the volume defaultMode will be used. - This might be in conflict with other options that affect the file - mode, like fsGroup, and the result can be other mode bits set. - type: integer - format: int32 - path: - description: |- - path is the relative path of the file to map the key to. - May not be an absolute path. - May not contain the path element '..'. - May not start with the string '..'. - type: string - x-kubernetes-list-type: atomic - name: - description: |- - Name of the referent. - This field is effectively required, but due to backwards compatibility is - allowed to be empty. Instances of this type with an empty value here are - almost certainly wrong. - TODO: Add other useful fields. apiVersion, kind, uid? 
- More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Drop `kubebuilder:default` when controller-gen doesn't need it https://github.com/kubernetes-sigs/kubebuilder/issues/3896. - type: string - default: "" - optional: - description: optional specify whether the ConfigMap or its keys must be defined - type: boolean - x-kubernetes-map-type: atomic - emptyDir: - description: This is accessible behind a feature flag - kubernetes.podspec-emptydir - type: object - x-kubernetes-preserve-unknown-fields: true - name: - description: |- - name of the volume. - Must be a DNS_LABEL and unique within the pod. - More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - type: string - persistentVolumeClaim: - description: This is accessible behind a feature flag - kubernetes.podspec-persistent-volume-claim - type: object - x-kubernetes-preserve-unknown-fields: true - projected: - description: projected items for all in one resources secrets, configmaps, and downward API - type: object - properties: - defaultMode: - description: |- - defaultMode are the mode bits used to set permissions on created files by default. - Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511. - YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. - Directories within the path are not affected by this setting. - This might be in conflict with other options that affect the file - mode, like fsGroup, and the result can be other mode bits set. 
- type: integer - format: int32 - sources: - description: sources is the list of volume projections - type: array - items: - description: Projection that may be projected along with other supported volume types - type: object - properties: - configMap: - description: configMap information about the configMap data to project - type: object - properties: - items: - description: |- - items if unspecified, each key-value pair in the Data field of the referenced - ConfigMap will be projected into the volume as a file whose name is the - key and content is the value. If specified, the listed keys will be - projected into the specified paths, and unlisted keys will not be - present. If a key is specified which is not present in the ConfigMap, - the volume setup will error unless it is marked optional. Paths must be - relative and may not contain the '..' path or start with '..'. - type: array - items: - description: Maps a string key to a path within a volume. - type: object - required: - - key - - path - properties: - key: - description: key is the key to project. - type: string - mode: - description: |- - mode is Optional: mode bits used to set permissions on this file. - Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511. - YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. - If not specified, the volume defaultMode will be used. - This might be in conflict with other options that affect the file - mode, like fsGroup, and the result can be other mode bits set. - type: integer - format: int32 - path: - description: |- - path is the relative path of the file to map the key to. - May not be an absolute path. - May not contain the path element '..'. - May not start with the string '..'. - type: string - x-kubernetes-list-type: atomic - name: - description: |- - Name of the referent. - This field is effectively required, but due to backwards compatibility is - allowed to be empty. 
Instances of this type with an empty value here are - almost certainly wrong. - TODO: Add other useful fields. apiVersion, kind, uid? - More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Drop `kubebuilder:default` when controller-gen doesn't need it https://github.com/kubernetes-sigs/kubebuilder/issues/3896. - type: string - default: "" - optional: - description: optional specify whether the ConfigMap or its keys must be defined - type: boolean - x-kubernetes-map-type: atomic - downwardAPI: - description: downwardAPI information about the downwardAPI data to project - type: object - properties: - items: - description: Items is a list of DownwardAPIVolume file - type: array - items: - description: DownwardAPIVolumeFile represents information to create the file containing the pod field - type: object - required: - - path - properties: - fieldRef: - description: 'Required: Selects a field of the pod: only annotations, labels, name, namespace and uid are supported.' - type: object - required: - - fieldPath - properties: - apiVersion: - description: Version of the schema the FieldPath is written in terms of, defaults to "v1". - type: string - fieldPath: - description: Path of the field to select in the specified API version. - type: string - x-kubernetes-map-type: atomic - mode: - description: |- - Optional: mode bits used to set permissions on this file, must be an octal value - between 0000 and 0777 or a decimal value between 0 and 511. - YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. - If not specified, the volume defaultMode will be used. - This might be in conflict with other options that affect the file - mode, like fsGroup, and the result can be other mode bits set. - type: integer - format: int32 - path: - description: 'Required: Path is the relative path name of the file to be created. Must not be absolute or contain the ''..'' path. Must be utf-8 encoded. 
The first item of the relative path must not start with ''..''' - type: string - resourceFieldRef: - description: |- - Selects a resource of the container: only resources limits and requests - (limits.cpu, limits.memory, requests.cpu and requests.memory) are currently supported. - type: object - required: - - resource - properties: - containerName: - description: 'Container name: required for volumes, optional for env vars' - type: string - divisor: - description: Specifies the output format of the exposed resources, defaults to "1" - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - anyOf: - - type: integer - - type: string - x-kubernetes-int-or-string: true - resource: - description: 'Required: resource to select' - type: string - x-kubernetes-map-type: atomic - x-kubernetes-list-type: atomic - secret: - description: secret information about the secret data to project - type: object - properties: - items: - description: |- - items if unspecified, each key-value pair in the Data field of the referenced - Secret will be projected into the volume as a file whose name is the - key and content is the value. If specified, the listed keys will be - projected into the specified paths, and unlisted keys will not be - present. If a key is specified which is not present in the Secret, - the volume setup will error unless it is marked optional. Paths must be - relative and may not contain the '..' path or start with '..'. - type: array - items: - description: Maps a string key to a path within a volume. - type: object - required: - - key - - path - properties: - key: - description: key is the key to project. - type: string - mode: - description: |- - mode is Optional: mode bits used to set permissions on this file. - Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511. - YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. 
- If not specified, the volume defaultMode will be used. - This might be in conflict with other options that affect the file - mode, like fsGroup, and the result can be other mode bits set. - type: integer - format: int32 - path: - description: |- - path is the relative path of the file to map the key to. - May not be an absolute path. - May not contain the path element '..'. - May not start with the string '..'. - type: string - x-kubernetes-list-type: atomic - name: - description: |- - Name of the referent. - This field is effectively required, but due to backwards compatibility is - allowed to be empty. Instances of this type with an empty value here are - almost certainly wrong. - TODO: Add other useful fields. apiVersion, kind, uid? - More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Drop `kubebuilder:default` when controller-gen doesn't need it https://github.com/kubernetes-sigs/kubebuilder/issues/3896. - type: string - default: "" - optional: - description: optional field specify whether the Secret or its key must be defined - type: boolean - x-kubernetes-map-type: atomic - serviceAccountToken: - description: serviceAccountToken is information about the serviceAccountToken data to project - type: object - required: - - path - properties: - audience: - description: |- - audience is the intended audience of the token. A recipient of a token - must identify itself with an identifier specified in the audience of the - token, and otherwise should reject the token. The audience defaults to the - identifier of the apiserver. - type: string - expirationSeconds: - description: |- - expirationSeconds is the requested duration of validity of the service - account token. As the token approaches expiration, the kubelet volume - plugin will proactively rotate the service account token. 
The kubelet will - start trying to rotate the token if the token is older than 80 percent of - its time to live or if the token is older than 24 hours.Defaults to 1 hour - and must be at least 10 minutes. - type: integer - format: int64 - path: - description: |- - path is the path relative to the mount point of the file to project the - token into. - type: string - x-kubernetes-list-type: atomic - secret: - description: |- - secret represents a secret that should populate this volume. - More info: https://kubernetes.io/docs/concepts/storage/volumes#secret - type: object - properties: - defaultMode: - description: |- - defaultMode is Optional: mode bits used to set permissions on created files by default. - Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511. - YAML accepts both octal and decimal values, JSON requires decimal values - for mode bits. Defaults to 0644. - Directories within the path are not affected by this setting. - This might be in conflict with other options that affect the file - mode, like fsGroup, and the result can be other mode bits set. - type: integer - format: int32 - items: - description: |- - items If unspecified, each key-value pair in the Data field of the referenced - Secret will be projected into the volume as a file whose name is the - key and content is the value. If specified, the listed keys will be - projected into the specified paths, and unlisted keys will not be - present. If a key is specified which is not present in the Secret, - the volume setup will error unless it is marked optional. Paths must be - relative and may not contain the '..' path or start with '..'. - type: array - items: - description: Maps a string key to a path within a volume. - type: object - required: - - key - - path - properties: - key: - description: key is the key to project. - type: string - mode: - description: |- - mode is Optional: mode bits used to set permissions on this file. 
- Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511. - YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. - If not specified, the volume defaultMode will be used. - This might be in conflict with other options that affect the file - mode, like fsGroup, and the result can be other mode bits set. - type: integer - format: int32 - path: - description: |- - path is the relative path of the file to map the key to. - May not be an absolute path. - May not contain the path element '..'. - May not start with the string '..'. - type: string - x-kubernetes-list-type: atomic - optional: - description: optional field specify whether the Secret or its keys must be defined - type: boolean - secretName: - description: |- - secretName is the name of the secret in the pod's namespace to use. - More info: https://kubernetes.io/docs/concepts/storage/volumes#secret - type: string - x-kubernetes-list-map-keys: - - name - x-kubernetes-list-type: map - status: - description: ConfigurationStatus communicates the observed state of the Configuration (from the controller). - type: object - properties: - annotations: - description: |- - Annotations is additional Status fields for the Resource to save some - additional State as well as convey more information to the user. This is - roughly akin to Annotations on any k8s resource, just the reconciler conveying - richer information outwards. - type: object - additionalProperties: - type: string - conditions: - description: Conditions the latest available observations of a resource's current state. - type: array - items: - description: |- - Condition defines a readiness condition for a Knative resource. 
- See: https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/api-conventions.md#typical-status-properties - type: object - required: - - status - - type - properties: - lastTransitionTime: - description: |- - LastTransitionTime is the last time the condition transitioned from one status to another. - We use VolatileTime in place of metav1.Time to exclude this from creating equality.Semantic - differences (all other things held constant). - type: string - message: - description: A human readable message indicating details about the transition. - type: string - reason: - description: The reason for the condition's last transition. - type: string - severity: - description: |- - Severity with which to treat failures of this type of condition. - When this is not specified, it defaults to Error. - type: string - status: - description: Status of the condition, one of True, False, Unknown. - type: string - type: - description: Type of condition. - type: string - latestCreatedRevisionName: - description: |- - LatestCreatedRevisionName is the last revision that was created from this - Configuration. It might not be ready yet, for that use LatestReadyRevisionName. - type: string - latestReadyRevisionName: - description: |- - LatestReadyRevisionName holds the name of the latest Revision stamped out - from this Configuration that has had its "Ready" condition become "True". - type: string - observedGeneration: - description: |- - ObservedGeneration is the 'Generation' of the Service that - was last processed by the controller. - type: integer - format: int64 ---- -# Source: knative-operator/templates/knative-crds.yaml -# Copyright 2020 The Knative Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -apiVersion: apiextensions.k8s.io/v1 -kind: CustomResourceDefinition -metadata: - name: clusterdomainclaims.networking.internal.knative.dev - labels: - app.kubernetes.io/name: knative-serving - app.kubernetes.io/component: networking - app.kubernetes.io/version: "1.16.0" - knative.dev/crd-install: "true" -spec: - group: networking.internal.knative.dev - versions: - - name: v1alpha1 - served: true - storage: true - subresources: - status: {} - schema: - openAPIV3Schema: - description: ClusterDomainClaim is a cluster-wide reservation for a particular domain name. - type: object - properties: - apiVersion: - description: |- - APIVersion defines the versioned schema of this representation of an object. - Servers should convert recognized schemas to the latest internal value, and - may reject unrecognized values. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources - type: string - kind: - description: |- - Kind is a string value representing the REST resource this object represents. - Servers may infer this from the endpoint the client submits requests to. - Cannot be updated. - In CamelCase. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds - type: string - metadata: - type: object - spec: - description: |- - Spec is the desired state of the ClusterDomainClaim. 
- More info: https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/api-conventions.md#spec-and-status - type: object - required: - - namespace - properties: - namespace: - description: |- - Namespace is the namespace which is allowed to create a DomainMapping - using this ClusterDomainClaim's name. - type: string - names: - kind: ClusterDomainClaim - plural: clusterdomainclaims - singular: clusterdomainclaim - categories: - - knative-internal - - networking - shortNames: - - cdc - scope: Cluster ---- -# Source: knative-operator/templates/knative-crds.yaml -# Copyright 2020 The Knative Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -apiVersion: apiextensions.k8s.io/v1 -kind: CustomResourceDefinition -metadata: - name: domainmappings.serving.knative.dev - labels: - app.kubernetes.io/name: knative-serving - app.kubernetes.io/version: "1.16.0" - knative.dev/crd-install: "true" -spec: - group: serving.knative.dev - versions: - - name: v1beta1 - served: true - storage: true - subresources: - status: {} - additionalPrinterColumns: - - name: URL - type: string - jsonPath: .status.url - - name: Ready - type: string - jsonPath: ".status.conditions[?(@.type=='Ready')].status" - - name: Reason - type: string - jsonPath: ".status.conditions[?(@.type=='Ready')].reason" - "schema": - "openAPIV3Schema": - description: DomainMapping is a mapping from a custom hostname to an Addressable. 
- type: object - properties: - apiVersion: - description: |- - APIVersion defines the versioned schema of this representation of an object. - Servers should convert recognized schemas to the latest internal value, and - may reject unrecognized values. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources - type: string - kind: - description: |- - Kind is a string value representing the REST resource this object represents. - Servers may infer this from the endpoint the client submits requests to. - Cannot be updated. - In CamelCase. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds - type: string - metadata: - type: object - spec: - description: |- - Spec is the desired state of the DomainMapping. - More info: https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/api-conventions.md#spec-and-status - type: object - required: - - ref - properties: - ref: - description: |- - Ref specifies the target of the Domain Mapping. - - - The object identified by the Ref must be an Addressable with a URL of the - form `{name}.{namespace}.{domain}` where `{domain}` is the cluster domain, - and `{name}` and `{namespace}` are the name and namespace of a Kubernetes - Service. - - - This contract is satisfied by Knative types such as Knative Services and - Knative Routes, and by Kubernetes Services. - type: object - required: - - kind - - name - properties: - address: - description: Address points to a specific Address Name. - type: string - apiVersion: - description: API version of the referent. - type: string - group: - description: |- - Group of the API, without the version of the group. This can be used as an alternative to the APIVersion, and then resolved using ResolveGroup. - Note: This API is EXPERIMENTAL and might break anytime. 
For more details: https://github.com/knative/eventing/issues/5086 - type: string - kind: - description: |- - Kind of the referent. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds - type: string - name: - description: |- - Name of the referent. - More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - type: string - namespace: - description: |- - Namespace of the referent. - More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/namespaces/ - This is optional field, it gets defaulted to the object holding it if left out. - type: string - tls: - description: TLS allows the DomainMapping to terminate TLS traffic with an existing secret. - type: object - required: - - secretName - properties: - secretName: - description: SecretName is the name of the existing secret used to terminate TLS traffic. - type: string - status: - description: |- - Status is the current state of the DomainMapping. - More info: https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/api-conventions.md#spec-and-status - type: object - properties: - address: - description: Address holds the information needed for a DomainMapping to be the target of an event. - type: object - properties: - CACerts: - description: |- - CACerts is the Certification Authority (CA) certificates in PEM format - according to https://www.rfc-editor.org/rfc/rfc7468. - type: string - audience: - description: Audience is the OIDC audience for this address. - type: string - name: - description: Name is the name of the address. - type: string - url: - type: string - annotations: - description: |- - Annotations is additional Status fields for the Resource to save some - additional State as well as convey more information to the user. This is - roughly akin to Annotations on any k8s resource, just the reconciler conveying - richer information outwards. 
- type: object - additionalProperties: - type: string - conditions: - description: Conditions the latest available observations of a resource's current state. - type: array - items: - description: |- - Condition defines a readiness condition for a Knative resource. - See: https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/api-conventions.md#typical-status-properties - type: object - required: - - status - - type - properties: - lastTransitionTime: - description: |- - LastTransitionTime is the last time the condition transitioned from one status to another. - We use VolatileTime in place of metav1.Time to exclude this from creating equality.Semantic - differences (all other things held constant). - type: string - message: - description: A human readable message indicating details about the transition. - type: string - reason: - description: The reason for the condition's last transition. - type: string - severity: - description: |- - Severity with which to treat failures of this type of condition. - When this is not specified, it defaults to Error. - type: string - status: - description: Status of the condition, one of True, False, Unknown. - type: string - type: - description: Type of condition. - type: string - observedGeneration: - description: |- - ObservedGeneration is the 'Generation' of the Service that - was last processed by the controller. - type: integer - format: int64 - url: - description: URL is the URL of this DomainMapping. - type: string - names: - kind: DomainMapping - plural: domainmappings - singular: domainmapping - categories: - - all - - knative - - serving - shortNames: - - dm - scope: Namespaced ---- -# Source: knative-operator/templates/knative-crds.yaml -# Copyright 2020 The Knative Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -apiVersion: apiextensions.k8s.io/v1 -kind: CustomResourceDefinition -metadata: - name: ingresses.networking.internal.knative.dev - labels: - app.kubernetes.io/name: knative-serving - app.kubernetes.io/component: networking - app.kubernetes.io/version: "1.16.0" - knative.dev/crd-install: "true" -spec: - group: networking.internal.knative.dev - versions: - - name: v1alpha1 - served: true - storage: true - subresources: - status: {} - schema: - openAPIV3Schema: - description: |- - Ingress is a collection of rules that allow inbound connections to reach the endpoints defined - by a backend. An Ingress can be configured to give services externally-reachable URLs, load - balance traffic, offer name based virtual hosting, etc. - - - This is heavily based on K8s Ingress https://godoc.org/k8s.io/api/networking/v1beta1#Ingress - which some highlighted modifications. - type: object - properties: - apiVersion: - description: |- - APIVersion defines the versioned schema of this representation of an object. - Servers should convert recognized schemas to the latest internal value, and - may reject unrecognized values. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources - type: string - kind: - description: |- - Kind is a string value representing the REST resource this object represents. - Servers may infer this from the endpoint the client submits requests to. - Cannot be updated. - In CamelCase. 
- More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds - type: string - metadata: - type: object - spec: - description: |- - Spec is the desired state of the Ingress. - More info: https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/api-conventions.md#spec-and-status - type: object - properties: - httpOption: - description: |- - HTTPOption is the option of HTTP. It has the following two values: - `HTTPOptionEnabled`, `HTTPOptionRedirected` - type: string - rules: - description: A list of host rules used to configure the Ingress. - type: array - items: - description: |- - IngressRule represents the rules mapping the paths under a specified host to - the related backend services. Incoming requests are first evaluated for a host - match, then routed to the backend associated with the matching IngressRuleValue. - type: object - properties: - hosts: - description: |- - Host is the fully qualified domain name of a network host, as defined - by RFC 3986. Note the following deviations from the "host" part of the - URI as defined in the RFC: - 1. IPs are not allowed. Currently a rule value can only apply to the - IP in the Spec of the parent . - 2. The `:` delimiter is not respected because ports are not allowed. - Currently the port of an Ingress is implicitly :80 for http and - :443 for https. - Both these may change in the future. - If the host is unspecified, the Ingress routes all traffic based on the - specified IngressRuleValue. - If multiple matching Hosts were provided, the first rule will take precedent. - type: array - items: - type: string - http: - description: |- - HTTP represents a rule to apply against incoming requests. If the - rule is satisfied, the request is routed to the specified backend. - type: object - required: - - paths - properties: - paths: - description: |- - A collection of paths that map requests to backends. 
- - - If they are multiple matching paths, the first match takes precedence. - type: array - items: - description: |- - HTTPIngressPath associates a path regex with a backend. Incoming URLs matching - the path are forwarded to the backend. - type: object - required: - - splits - properties: - appendHeaders: - description: |- - AppendHeaders allow specifying additional HTTP headers to add - before forwarding a request to the destination service. - - - NOTE: This differs from K8s Ingress which doesn't allow header appending. - type: object - additionalProperties: - type: string - headers: - description: |- - Headers defines header matching rules which is a map from a header name - to HeaderMatch which specify a matching condition. - When a request matched with all the header matching rules, - the request is routed by the corresponding ingress rule. - If it is empty, the headers are not used for matching - type: object - additionalProperties: - description: |- - HeaderMatch represents a matching value of Headers in HTTPIngressPath. - Currently, only the exact matching is supported. - type: object - required: - - exact - properties: - exact: - type: string - path: - description: |- - Path represents a literal prefix to which this rule should apply. - Currently it can contain characters disallowed from the conventional - "path" part of a URL as defined by RFC 3986. Paths must begin with - a '/'. If unspecified, the path defaults to a catch all sending - traffic to the backend. - type: string - rewriteHost: - description: |- - RewriteHost rewrites the incoming request's host header. - - - This field is currently experimental and not supported by all Ingress - implementations. - type: string - splits: - description: |- - Splits defines the referenced service endpoints to which the traffic - will be forwarded to. - type: array - items: - description: IngressBackendSplit describes all endpoints for a given service and port. 
- type: object - required: - - serviceName - - serviceNamespace - - servicePort - properties: - appendHeaders: - description: |- - AppendHeaders allow specifying additional HTTP headers to add - before forwarding a request to the destination service. - - - NOTE: This differs from K8s Ingress which doesn't allow header appending. - type: object - additionalProperties: - type: string - percent: - description: |- - Specifies the split percentage, a number between 0 and 100. If - only one split is specified, we default to 100. - - - NOTE: This differs from K8s Ingress to allow percentage split. - type: integer - serviceName: - description: Specifies the name of the referenced service. - type: string - serviceNamespace: - description: |- - Specifies the namespace of the referenced service. - - - NOTE: This differs from K8s Ingress to allow routing to different namespaces. - type: string - servicePort: - description: Specifies the port of the referenced service. - anyOf: - - type: integer - - type: string - x-kubernetes-int-or-string: true - visibility: - description: |- - Visibility signifies whether this rule should `ClusterLocal`. If it's not - specified then it defaults to `ExternalIP`. - type: string - tls: - description: |- - TLS configuration. Currently Ingress only supports a single TLS - port: 443. If multiple members of this list specify different hosts, they - will be multiplexed on the same port according to the hostname specified - through the SNI TLS extension, if the ingress controller fulfilling the - ingress supports SNI. - type: array - items: - description: IngressTLS describes the transport layer security associated with an Ingress. - type: object - properties: - hosts: - description: |- - Hosts is a list of hosts included in the TLS certificate. The values in - this list must match the name/s used in the tlsSecret. Defaults to the - wildcard host setting for the loadbalancer controller fulfilling this - Ingress, if left unspecified. 
- type: array - items: - type: string - secretName: - description: SecretName is the name of the secret used to terminate SSL traffic. - type: string - secretNamespace: - description: |- - SecretNamespace is the namespace of the secret used to terminate SSL traffic. - If not set the namespace should be assumed to be the same as the Ingress. - If set the secret should have the same namespace as the Ingress otherwise - the behaviour is undefined and not supported. - type: string - status: - description: |- - Status is the current state of the Ingress. - More info: https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/api-conventions.md#spec-and-status - type: object - properties: - annotations: - description: |- - Annotations is additional Status fields for the Resource to save some - additional State as well as convey more information to the user. This is - roughly akin to Annotations on any k8s resource, just the reconciler conveying - richer information outwards. - type: object - additionalProperties: - type: string - conditions: - description: Conditions the latest available observations of a resource's current state. - type: array - items: - description: |- - Condition defines a readiness condition for a Knative resource. - See: https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/api-conventions.md#typical-status-properties - type: object - required: - - status - - type - properties: - lastTransitionTime: - description: |- - LastTransitionTime is the last time the condition transitioned from one status to another. - We use VolatileTime in place of metav1.Time to exclude this from creating equality.Semantic - differences (all other things held constant). - type: string - message: - description: A human readable message indicating details about the transition. - type: string - reason: - description: The reason for the condition's last transition. 
- type: string - severity: - description: |- - Severity with which to treat failures of this type of condition. - When this is not specified, it defaults to Error. - type: string - status: - description: Status of the condition, one of True, False, Unknown. - type: string - type: - description: Type of condition. - type: string - observedGeneration: - description: |- - ObservedGeneration is the 'Generation' of the Service that - was last processed by the controller. - type: integer - format: int64 - privateLoadBalancer: - description: PrivateLoadBalancer contains the current status of the load-balancer. - type: object - properties: - ingress: - description: |- - Ingress is a list containing ingress points for the load-balancer. - Traffic intended for the service should be sent to these ingress points. - type: array - items: - description: |- - LoadBalancerIngressStatus represents the status of a load-balancer ingress point: - traffic intended for the service should be sent to an ingress point. - type: object - properties: - domain: - description: |- - Domain is set for load-balancer ingress points that are DNS based - (typically AWS load-balancers) - type: string - domainInternal: - description: |- - DomainInternal is set if there is a cluster-local DNS name to access the Ingress. - - - NOTE: This differs from K8s Ingress, since we also desire to have a cluster-local - DNS name to allow routing in case of not having a mesh. - type: string - ip: - description: |- - IP is set for load-balancer ingress points that are IP based - (typically GCE or OpenStack load-balancers) - type: string - meshOnly: - description: MeshOnly is set if the Ingress is only load-balanced through a Service mesh. - type: boolean - publicLoadBalancer: - description: PublicLoadBalancer contains the current status of the load-balancer. - type: object - properties: - ingress: - description: |- - Ingress is a list containing ingress points for the load-balancer. 
- Traffic intended for the service should be sent to these ingress points. - type: array - items: - description: |- - LoadBalancerIngressStatus represents the status of a load-balancer ingress point: - traffic intended for the service should be sent to an ingress point. - type: object - properties: - domain: - description: |- - Domain is set for load-balancer ingress points that are DNS based - (typically AWS load-balancers) - type: string - domainInternal: - description: |- - DomainInternal is set if there is a cluster-local DNS name to access the Ingress. - - - NOTE: This differs from K8s Ingress, since we also desire to have a cluster-local - DNS name to allow routing in case of not having a mesh. - type: string - ip: - description: |- - IP is set for load-balancer ingress points that are IP based - (typically GCE or OpenStack load-balancers) - type: string - meshOnly: - description: MeshOnly is set if the Ingress is only load-balanced through a Service mesh. - type: boolean - additionalPrinterColumns: - - name: Ready - type: string - jsonPath: ".status.conditions[?(@.type=='Ready')].status" - - name: Reason - type: string - jsonPath: ".status.conditions[?(@.type=='Ready')].reason" - names: - kind: Ingress - plural: ingresses - singular: ingress - categories: - - knative-internal - - networking - shortNames: - - kingress - - king - scope: Namespaced ---- -# Source: knative-operator/templates/knative-crds.yaml -# Copyright 2019 The Knative Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -# Note: The schema part of the spec is auto-generated by hack/update-schemas.sh. - -apiVersion: apiextensions.k8s.io/v1 -kind: CustomResourceDefinition -metadata: - name: metrics.autoscaling.internal.knative.dev - labels: - app.kubernetes.io/name: knative-serving - app.kubernetes.io/version: "1.16.0" - knative.dev/crd-install: "true" -spec: - group: autoscaling.internal.knative.dev - names: - kind: Metric - plural: metrics - singular: metric - categories: - - knative-internal - - autoscaling - scope: Namespaced - versions: - - name: v1alpha1 - served: true - storage: true - subresources: - status: {} - additionalPrinterColumns: - - name: Ready - type: string - jsonPath: ".status.conditions[?(@.type=='Ready')].status" - - name: Reason - type: string - jsonPath: ".status.conditions[?(@.type=='Ready')].reason" - schema: - openAPIV3Schema: - description: Metric represents a resource to configure the metric collector with. - type: object - properties: - apiVersion: - description: |- - APIVersion defines the versioned schema of this representation of an object. - Servers should convert recognized schemas to the latest internal value, and - may reject unrecognized values. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources - type: string - kind: - description: |- - Kind is a string value representing the REST resource this object represents. - Servers may infer this from the endpoint the client submits requests to. - Cannot be updated. - In CamelCase. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds - type: string - metadata: - type: object - spec: - description: Spec holds the desired state of the Metric (from the client). 
- type: object - required: - - panicWindow - - scrapeTarget - - stableWindow - properties: - panicWindow: - description: PanicWindow is the aggregation window for metrics where quick reactions are needed. - type: integer - format: int64 - scrapeTarget: - description: ScrapeTarget is the K8s service that publishes the metric endpoint. - type: string - stableWindow: - description: StableWindow is the aggregation window for metrics in a stable state. - type: integer - format: int64 - status: - description: Status communicates the observed state of the Metric (from the controller). - type: object - properties: - annotations: - description: |- - Annotations is additional Status fields for the Resource to save some - additional State as well as convey more information to the user. This is - roughly akin to Annotations on any k8s resource, just the reconciler conveying - richer information outwards. - type: object - additionalProperties: - type: string - conditions: - description: Conditions the latest available observations of a resource's current state. - type: array - items: - description: |- - Condition defines a readiness condition for a Knative resource. - See: https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/api-conventions.md#typical-status-properties - type: object - required: - - status - - type - properties: - lastTransitionTime: - description: |- - LastTransitionTime is the last time the condition transitioned from one status to another. - We use VolatileTime in place of metav1.Time to exclude this from creating equality.Semantic - differences (all other things held constant). - type: string - message: - description: A human readable message indicating details about the transition. - type: string - reason: - description: The reason for the condition's last transition. - type: string - severity: - description: |- - Severity with which to treat failures of this type of condition. 
- When this is not specified, it defaults to Error. - type: string - status: - description: Status of the condition, one of True, False, Unknown. - type: string - type: - description: Type of condition. - type: string - observedGeneration: - description: |- - ObservedGeneration is the 'Generation' of the Service that - was last processed by the controller. - type: integer - format: int64 ---- -# Source: knative-operator/templates/knative-crds.yaml -# Copyright 2018 The Knative Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Note: The schema part of the spec is auto-generated by hack/update-schemas.sh. 
- -apiVersion: apiextensions.k8s.io/v1 -kind: CustomResourceDefinition -metadata: - name: podautoscalers.autoscaling.internal.knative.dev - labels: - app.kubernetes.io/name: knative-serving - app.kubernetes.io/version: "1.16.0" - knative.dev/crd-install: "true" -spec: - group: autoscaling.internal.knative.dev - names: - kind: PodAutoscaler - plural: podautoscalers - singular: podautoscaler - categories: - - knative-internal - - autoscaling - shortNames: - - kpa - - pa - scope: Namespaced - versions: - - name: v1alpha1 - served: true - storage: true - subresources: - status: {} - additionalPrinterColumns: - - name: DesiredScale - type: integer - jsonPath: ".status.desiredScale" - - name: ActualScale - type: integer - jsonPath: ".status.actualScale" - - name: Ready - type: string - jsonPath: ".status.conditions[?(@.type=='Ready')].status" - - name: Reason - type: string - jsonPath: ".status.conditions[?(@.type=='Ready')].reason" - schema: - openAPIV3Schema: - description: |- - PodAutoscaler is a Knative abstraction that encapsulates the interface by which Knative - components instantiate autoscalers. This definition is an abstraction that may be backed - by multiple definitions. For more information, see the Knative Pluggability presentation: - https://docs.google.com/presentation/d/19vW9HFZ6Puxt31biNZF3uLRejDmu82rxJIk1cWmxF7w/edit - type: object - properties: - apiVersion: - description: |- - APIVersion defines the versioned schema of this representation of an object. - Servers should convert recognized schemas to the latest internal value, and - may reject unrecognized values. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources - type: string - kind: - description: |- - Kind is a string value representing the REST resource this object represents. - Servers may infer this from the endpoint the client submits requests to. - Cannot be updated. - In CamelCase. 
- More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds - type: string - metadata: - type: object - spec: - description: Spec holds the desired state of the PodAutoscaler (from the client). - type: object - required: - - protocolType - - scaleTargetRef - properties: - containerConcurrency: - description: |- - ContainerConcurrency specifies the maximum allowed - in-flight (concurrent) requests per container of the Revision. - Defaults to `0` which means unlimited concurrency. - type: integer - format: int64 - protocolType: - description: The application-layer protocol. Matches `ProtocolType` inferred from the revision spec. - type: string - reachability: - description: |- - Reachability specifies whether or not the `ScaleTargetRef` can be reached (ie. has a route). - Defaults to `ReachabilityUnknown` - type: string - scaleTargetRef: - description: |- - ScaleTargetRef defines the /scale-able resource that this PodAutoscaler - is responsible for quickly right-sizing. - type: object - properties: - apiVersion: - description: API version of the referent. - type: string - kind: - description: |- - Kind of the referent. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds - type: string - name: - description: |- - Name of the referent. - More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - type: string - x-kubernetes-map-type: atomic - status: - description: Status communicates the observed state of the PodAutoscaler (from the controller). - type: object - required: - - metricsServiceName - - serviceName - properties: - actualScale: - description: ActualScale shows the actual number of replicas for the revision. - type: integer - format: int32 - annotations: - description: |- - Annotations is additional Status fields for the Resource to save some - additional State as well as convey more information to the user. 
This is - roughly akin to Annotations on any k8s resource, just the reconciler conveying - richer information outwards. - type: object - additionalProperties: - type: string - conditions: - description: Conditions the latest available observations of a resource's current state. - type: array - items: - description: |- - Condition defines a readiness condition for a Knative resource. - See: https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/api-conventions.md#typical-status-properties - type: object - required: - - status - - type - properties: - lastTransitionTime: - description: |- - LastTransitionTime is the last time the condition transitioned from one status to another. - We use VolatileTime in place of metav1.Time to exclude this from creating equality.Semantic - differences (all other things held constant). - type: string - message: - description: A human readable message indicating details about the transition. - type: string - reason: - description: The reason for the condition's last transition. - type: string - severity: - description: |- - Severity with which to treat failures of this type of condition. - When this is not specified, it defaults to Error. - type: string - status: - description: Status of the condition, one of True, False, Unknown. - type: string - type: - description: Type of condition. - type: string - desiredScale: - description: DesiredScale shows the current desired number of replicas for the revision. - type: integer - format: int32 - metricsServiceName: - description: |- - MetricsServiceName is the K8s Service name that provides revision metrics. - The service is managed by the PA object. - type: string - observedGeneration: - description: |- - ObservedGeneration is the 'Generation' of the Service that - was last processed by the controller. - type: integer - format: int64 - serviceName: - description: |- - ServiceName is the K8s Service name that serves the revision, scaled by this PA. 
- The service is created and owned by the ServerlessService object owned by this PA. - type: string ---- -# Source: knative-operator/templates/knative-crds.yaml -# Copyright 2019 The Knative Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Note: The schema part of the spec is auto-generated by hack/update-schemas.sh. - -apiVersion: apiextensions.k8s.io/v1 -kind: CustomResourceDefinition -metadata: - name: revisions.serving.knative.dev - labels: - app.kubernetes.io/name: knative-serving - app.kubernetes.io/version: "1.16.0" - knative.dev/crd-install: "true" -spec: - group: serving.knative.dev - names: - kind: Revision - plural: revisions - singular: revision - categories: - - all - - knative - - serving - shortNames: - - rev - scope: Namespaced - versions: - - name: v1 - served: true - storage: true - subresources: - status: {} - additionalPrinterColumns: - - name: Config Name - type: string - jsonPath: ".metadata.labels['serving\\.knative\\.dev/configuration']" - - name: Generation - type: string # int in string form :( - jsonPath: ".metadata.labels['serving\\.knative\\.dev/configurationGeneration']" - - name: Ready - type: string - jsonPath: ".status.conditions[?(@.type=='Ready')].status" - - name: Reason - type: string - jsonPath: ".status.conditions[?(@.type=='Ready')].reason" - - name: Actual Replicas - type: integer - jsonPath: ".status.actualReplicas" - - name: Desired Replicas - type: integer - jsonPath: ".status.desiredReplicas" - schema: - 
openAPIV3Schema: - description: |- - Revision is an immutable snapshot of code and configuration. A revision - references a container image. Revisions are created by updates to a - Configuration. - - - See also: https://github.com/knative/serving/blob/main/docs/spec/overview.md#revision - type: object - properties: - apiVersion: - description: |- - APIVersion defines the versioned schema of this representation of an object. - Servers should convert recognized schemas to the latest internal value, and - may reject unrecognized values. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources - type: string - kind: - description: |- - Kind is a string value representing the REST resource this object represents. - Servers may infer this from the endpoint the client submits requests to. - Cannot be updated. - In CamelCase. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds - type: string - metadata: - type: object - spec: - description: RevisionSpec holds the desired state of the Revision (from the client). - type: object - required: - - containers - properties: - affinity: - description: This is accessible behind a feature flag - kubernetes.podspec-affinity - type: object - x-kubernetes-preserve-unknown-fields: true - automountServiceAccountToken: - description: AutomountServiceAccountToken indicates whether a service account token should be automatically mounted. - type: boolean - containerConcurrency: - description: |- - ContainerConcurrency specifies the maximum allowed in-flight (concurrent) - requests per container of the Revision. Defaults to `0` which means - concurrency to the application is not limited, and the system decides the - target concurrency for the autoscaler. - type: integer - format: int64 - containers: - description: |- - List of containers belonging to the pod. - Containers cannot currently be added or removed. 
- There must be at least one container in a Pod. - Cannot be updated. - type: array - items: - description: A single application container that you want to run within a pod. - type: object - properties: - args: - description: |- - Arguments to the entrypoint. - The container image's CMD is used if this is not provided. - Variable references $(VAR_NAME) are expanded using the container's environment. If a variable - cannot be resolved, the reference in the input string will be unchanged. Double $$ are reduced - to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" will - produce the string literal "$(VAR_NAME)". Escaped references will never be expanded, regardless - of whether the variable exists or not. Cannot be updated. - More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell - type: array - items: - type: string - x-kubernetes-list-type: atomic - command: - description: |- - Entrypoint array. Not executed within a shell. - The container image's ENTRYPOINT is used if this is not provided. - Variable references $(VAR_NAME) are expanded using the container's environment. If a variable - cannot be resolved, the reference in the input string will be unchanged. Double $$ are reduced - to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" will - produce the string literal "$(VAR_NAME)". Escaped references will never be expanded, regardless - of whether the variable exists or not. Cannot be updated. - More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell - type: array - items: - type: string - x-kubernetes-list-type: atomic - env: - description: |- - List of environment variables to set in the container. - Cannot be updated. - type: array - items: - description: EnvVar represents an environment variable present in a Container. 
- type: object - required: - - name - properties: - name: - description: Name of the environment variable. Must be a C_IDENTIFIER. - type: string - value: - description: |- - Variable references $(VAR_NAME) are expanded - using the previously defined environment variables in the container and - any service environment variables. If a variable cannot be resolved, - the reference in the input string will be unchanged. Double $$ are reduced - to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. - "$$(VAR_NAME)" will produce the string literal "$(VAR_NAME)". - Escaped references will never be expanded, regardless of whether the variable - exists or not. - Defaults to "". - type: string - valueFrom: - description: Source for the environment variable's value. Cannot be used if value is not empty. - type: object - properties: - configMapKeyRef: - description: Selects a key of a ConfigMap. - type: object - required: - - key - properties: - key: - description: The key to select. - type: string - name: - description: |- - Name of the referent. - This field is effectively required, but due to backwards compatibility is - allowed to be empty. Instances of this type with an empty value here are - almost certainly wrong. - TODO: Add other useful fields. apiVersion, kind, uid? - More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Drop `kubebuilder:default` when controller-gen doesn't need it https://github.com/kubernetes-sigs/kubebuilder/issues/3896. 
- type: string - default: "" - optional: - description: Specify whether the ConfigMap or its key must be defined - type: boolean - x-kubernetes-map-type: atomic - fieldRef: - description: This is accessible behind a feature flag - kubernetes.podspec-fieldref - type: object - x-kubernetes-preserve-unknown-fields: true - x-kubernetes-map-type: atomic - resourceFieldRef: - description: This is accessible behind a feature flag - kubernetes.podspec-fieldref - type: object - x-kubernetes-preserve-unknown-fields: true - x-kubernetes-map-type: atomic - secretKeyRef: - description: Selects a key of a secret in the pod's namespace - type: object - required: - - key - properties: - key: - description: The key of the secret to select from. Must be a valid secret key. - type: string - name: - description: |- - Name of the referent. - This field is effectively required, but due to backwards compatibility is - allowed to be empty. Instances of this type with an empty value here are - almost certainly wrong. - TODO: Add other useful fields. apiVersion, kind, uid? - More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Drop `kubebuilder:default` when controller-gen doesn't need it https://github.com/kubernetes-sigs/kubebuilder/issues/3896. - type: string - default: "" - optional: - description: Specify whether the Secret or its key must be defined - type: boolean - x-kubernetes-map-type: atomic - x-kubernetes-list-map-keys: - - name - x-kubernetes-list-type: map - envFrom: - description: |- - List of sources to populate environment variables in the container. - The keys defined within a source must be a C_IDENTIFIER. All invalid keys - will be reported as an event when the container is starting. When a key exists in multiple - sources, the value associated with the last source will take precedence. - Values defined by an Env with a duplicate key will take precedence. - Cannot be updated. 
- type: array - items: - description: EnvFromSource represents the source of a set of ConfigMaps - type: object - properties: - configMapRef: - description: The ConfigMap to select from - type: object - properties: - name: - description: |- - Name of the referent. - This field is effectively required, but due to backwards compatibility is - allowed to be empty. Instances of this type with an empty value here are - almost certainly wrong. - TODO: Add other useful fields. apiVersion, kind, uid? - More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Drop `kubebuilder:default` when controller-gen doesn't need it https://github.com/kubernetes-sigs/kubebuilder/issues/3896. - type: string - default: "" - optional: - description: Specify whether the ConfigMap must be defined - type: boolean - x-kubernetes-map-type: atomic - prefix: - description: An optional identifier to prepend to each key in the ConfigMap. Must be a C_IDENTIFIER. - type: string - secretRef: - description: The Secret to select from - type: object - properties: - name: - description: |- - Name of the referent. - This field is effectively required, but due to backwards compatibility is - allowed to be empty. Instances of this type with an empty value here are - almost certainly wrong. - TODO: Add other useful fields. apiVersion, kind, uid? - More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Drop `kubebuilder:default` when controller-gen doesn't need it https://github.com/kubernetes-sigs/kubebuilder/issues/3896. - type: string - default: "" - optional: - description: Specify whether the Secret must be defined - type: boolean - x-kubernetes-map-type: atomic - x-kubernetes-list-type: atomic - image: - description: |- - Container image name. 
- More info: https://kubernetes.io/docs/concepts/containers/images - This field is optional to allow higher level config management to default or override - container images in workload controllers like Deployments and StatefulSets. - type: string - imagePullPolicy: - description: |- - Image pull policy. - One of Always, Never, IfNotPresent. - Defaults to Always if :latest tag is specified, or IfNotPresent otherwise. - Cannot be updated. - More info: https://kubernetes.io/docs/concepts/containers/images#updating-images - type: string - livenessProbe: - description: |- - Periodic probe of container liveness. - Container will be restarted if the probe fails. - Cannot be updated. - More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes - type: object - properties: - exec: - description: Exec specifies the action to take. - type: object - properties: - command: - description: |- - Command is the command line to execute inside the container, the working directory for the - command is root ('/') in the container's filesystem. The command is simply exec'd, it is - not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use - a shell, you need to explicitly call out to that shell. - Exit status of 0 is treated as live/healthy and non-zero is unhealthy. - type: array - items: - type: string - x-kubernetes-list-type: atomic - failureThreshold: - description: |- - Minimum consecutive failures for the probe to be considered failed after having succeeded. - Defaults to 3. Minimum value is 1. - type: integer - format: int32 - grpc: - description: GRPC specifies an action involving a GRPC port. - type: object - required: - - port - properties: - port: - description: Port number of the gRPC service. Number must be in the range 1 to 65535. 
- type: integer - format: int32 - service: - description: |- - Service is the name of the service to place in the gRPC HealthCheckRequest - (see https://github.com/grpc/grpc/blob/master/doc/health-checking.md). - - - If this is not specified, the default behavior is defined by gRPC. - type: string - httpGet: - description: HTTPGet specifies the http request to perform. - type: object - properties: - host: - description: |- - Host name to connect to, defaults to the pod IP. You probably want to set - "Host" in httpHeaders instead. - type: string - httpHeaders: - description: Custom headers to set in the request. HTTP allows repeated headers. - type: array - items: - description: HTTPHeader describes a custom header to be used in HTTP probes - type: object - required: - - name - - value - properties: - name: - description: |- - The header field name. - This will be canonicalized upon output, so case-variant names will be understood as the same header. - type: string - value: - description: The header field value - type: string - x-kubernetes-list-type: atomic - path: - description: Path to access on the HTTP server. - type: string - port: - description: |- - Name or number of the port to access on the container. - Number must be in the range 1 to 65535. - Name must be an IANA_SVC_NAME. - anyOf: - - type: integer - - type: string - x-kubernetes-int-or-string: true - scheme: - description: |- - Scheme to use for connecting to the host. - Defaults to HTTP. - type: string - initialDelaySeconds: - description: |- - Number of seconds after the container has started before liveness probes are initiated. - More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes - type: integer - format: int32 - periodSeconds: - description: How often (in seconds) to perform the probe. - type: integer - format: int32 - successThreshold: - description: |- - Minimum consecutive successes for the probe to be considered successful after having failed. 
- Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1. - type: integer - format: int32 - tcpSocket: - description: TCPSocket specifies an action involving a TCP port. - type: object - properties: - host: - description: 'Optional: Host name to connect to, defaults to the pod IP.' - type: string - port: - description: |- - Number or name of the port to access on the container. - Number must be in the range 1 to 65535. - Name must be an IANA_SVC_NAME. - anyOf: - - type: integer - - type: string - x-kubernetes-int-or-string: true - timeoutSeconds: - description: |- - Number of seconds after which the probe times out. - Defaults to 1 second. Minimum value is 1. - More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes - type: integer - format: int32 - name: - description: |- - Name of the container specified as a DNS_LABEL. - Each container in a pod must have a unique name (DNS_LABEL). - Cannot be updated. - type: string - ports: - description: |- - List of ports to expose from the container. Not specifying a port here - DOES NOT prevent that port from being exposed. Any port which is - listening on the default "0.0.0.0" address inside a container will be - accessible from the network. - Modifying this array with strategic merge patch may corrupt the data. - For more information See https://github.com/kubernetes/kubernetes/issues/108255. - Cannot be updated. - type: array - items: - description: ContainerPort represents a network port in a single container. - type: object - required: - - containerPort - properties: - containerPort: - description: |- - Number of port to expose on the pod's IP address. - This must be a valid port number, 0 < x < 65536. - type: integer - format: int32 - name: - description: |- - If specified, this must be an IANA_SVC_NAME and unique within the pod. Each - named port in a pod must have a unique name. Name for the port that can be - referred to by services. 
- type: string - protocol: - description: |- - Protocol for port. Must be UDP, TCP, or SCTP. - Defaults to "TCP". - type: string - default: TCP - x-kubernetes-list-map-keys: - - containerPort - - protocol - x-kubernetes-list-type: map - readinessProbe: - description: |- - Periodic probe of container service readiness. - Container will be removed from service endpoints if the probe fails. - Cannot be updated. - More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes - type: object - properties: - exec: - description: Exec specifies the action to take. - type: object - properties: - command: - description: |- - Command is the command line to execute inside the container, the working directory for the - command is root ('/') in the container's filesystem. The command is simply exec'd, it is - not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use - a shell, you need to explicitly call out to that shell. - Exit status of 0 is treated as live/healthy and non-zero is unhealthy. - type: array - items: - type: string - x-kubernetes-list-type: atomic - failureThreshold: - description: |- - Minimum consecutive failures for the probe to be considered failed after having succeeded. - Defaults to 3. Minimum value is 1. - type: integer - format: int32 - grpc: - description: GRPC specifies an action involving a GRPC port. - type: object - required: - - port - properties: - port: - description: Port number of the gRPC service. Number must be in the range 1 to 65535. - type: integer - format: int32 - service: - description: |- - Service is the name of the service to place in the gRPC HealthCheckRequest - (see https://github.com/grpc/grpc/blob/master/doc/health-checking.md). - - - If this is not specified, the default behavior is defined by gRPC. - type: string - httpGet: - description: HTTPGet specifies the http request to perform. 
- type: object - properties: - host: - description: |- - Host name to connect to, defaults to the pod IP. You probably want to set - "Host" in httpHeaders instead. - type: string - httpHeaders: - description: Custom headers to set in the request. HTTP allows repeated headers. - type: array - items: - description: HTTPHeader describes a custom header to be used in HTTP probes - type: object - required: - - name - - value - properties: - name: - description: |- - The header field name. - This will be canonicalized upon output, so case-variant names will be understood as the same header. - type: string - value: - description: The header field value - type: string - x-kubernetes-list-type: atomic - path: - description: Path to access on the HTTP server. - type: string - port: - description: |- - Name or number of the port to access on the container. - Number must be in the range 1 to 65535. - Name must be an IANA_SVC_NAME. - anyOf: - - type: integer - - type: string - x-kubernetes-int-or-string: true - scheme: - description: |- - Scheme to use for connecting to the host. - Defaults to HTTP. - type: string - initialDelaySeconds: - description: |- - Number of seconds after the container has started before liveness probes are initiated. - More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes - type: integer - format: int32 - periodSeconds: - description: How often (in seconds) to perform the probe. - type: integer - format: int32 - successThreshold: - description: |- - Minimum consecutive successes for the probe to be considered successful after having failed. - Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1. - type: integer - format: int32 - tcpSocket: - description: TCPSocket specifies an action involving a TCP port. - type: object - properties: - host: - description: 'Optional: Host name to connect to, defaults to the pod IP.' 
- type: string - port: - description: |- - Number or name of the port to access on the container. - Number must be in the range 1 to 65535. - Name must be an IANA_SVC_NAME. - anyOf: - - type: integer - - type: string - x-kubernetes-int-or-string: true - timeoutSeconds: - description: |- - Number of seconds after which the probe times out. - Defaults to 1 second. Minimum value is 1. - More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes - type: integer - format: int32 - resources: - description: |- - Compute Resources required by this container. - Cannot be updated. - More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ - type: object - properties: - claims: - description: |- - Claims lists the names of resources, defined in spec.resourceClaims, - that are used by this container. - - - This is an alpha field and requires enabling the - DynamicResourceAllocation feature gate. - - - This field is immutable. It can only be set for containers. - type: array - items: - description: ResourceClaim references one entry in PodSpec.ResourceClaims. - type: object - required: - - name - properties: - name: - description: |- - Name must match the name of one entry in pod.spec.resourceClaims of - the Pod where this field is used. It makes that resource available - inside a container. - type: string - x-kubernetes-list-map-keys: - - name - x-kubernetes-list-type: map - limits: - description: |- - Limits describes the maximum amount of compute resources allowed. - More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ - type: object - additionalProperties: - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - anyOf: - - type: integer - - type: string - x-kubernetes-int-or-string: true - requests: - description: |- - Requests describes the minimum amount of compute resources required. 
- If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, - otherwise to an implementation-defined value. Requests cannot exceed Limits. - More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ - type: object - additionalProperties: - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - anyOf: - - type: integer - - type: string - x-kubernetes-int-or-string: true - securityContext: - description: |- - SecurityContext defines the security options the container should be run with. - If set, the fields of SecurityContext override the equivalent fields of PodSecurityContext. - More info: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/ - type: object - properties: - allowPrivilegeEscalation: - description: |- - AllowPrivilegeEscalation controls whether a process can gain more - privileges than its parent process. This bool directly controls if - the no_new_privs flag will be set on the container process. - AllowPrivilegeEscalation is true always when the container is: - 1) run as Privileged - 2) has CAP_SYS_ADMIN - Note that this field cannot be set when spec.os.name is windows. - type: boolean - capabilities: - description: |- - The capabilities to add/drop when running containers. - Defaults to the default set of capabilities granted by the container runtime. - Note that this field cannot be set when spec.os.name is windows. 
- type: object - properties: - add: - description: This is accessible behind a feature flag - kubernetes.containerspec-addcapabilities - type: array - items: - description: Capability represent POSIX capabilities type - type: string - x-kubernetes-list-type: atomic - drop: - description: Removed capabilities - type: array - items: - description: Capability represent POSIX capabilities type - type: string - x-kubernetes-list-type: atomic - readOnlyRootFilesystem: - description: |- - Whether this container has a read-only root filesystem. - Default is false. - Note that this field cannot be set when spec.os.name is windows. - type: boolean - runAsGroup: - description: |- - The GID to run the entrypoint of the container process. - Uses runtime default if unset. - May also be set in PodSecurityContext. If set in both SecurityContext and - PodSecurityContext, the value specified in SecurityContext takes precedence. - Note that this field cannot be set when spec.os.name is windows. - type: integer - format: int64 - runAsNonRoot: - description: |- - Indicates that the container must run as a non-root user. - If true, the Kubelet will validate the image at runtime to ensure that it - does not run as UID 0 (root) and fail to start the container if it does. - If unset or false, no such validation will be performed. - May also be set in PodSecurityContext. If set in both SecurityContext and - PodSecurityContext, the value specified in SecurityContext takes precedence. - type: boolean - runAsUser: - description: |- - The UID to run the entrypoint of the container process. - Defaults to user specified in image metadata if unspecified. - May also be set in PodSecurityContext. If set in both SecurityContext and - PodSecurityContext, the value specified in SecurityContext takes precedence. - Note that this field cannot be set when spec.os.name is windows. - type: integer - format: int64 - seccompProfile: - description: |- - The seccomp options to use by this container. 
If seccomp options are - provided at both the pod & container level, the container options - override the pod options. - Note that this field cannot be set when spec.os.name is windows. - type: object - required: - - type - properties: - localhostProfile: - description: |- - localhostProfile indicates a profile defined in a file on the node should be used. - The profile must be preconfigured on the node to work. - Must be a descending path, relative to the kubelet's configured seccomp profile location. - Must be set if type is "Localhost". Must NOT be set for any other type. - type: string - type: - description: |- - type indicates which kind of seccomp profile will be applied. - Valid options are: - - - Localhost - a profile defined in a file on the node should be used. - RuntimeDefault - the container runtime default profile should be used. - Unconfined - no profile should be applied. - type: string - startupProbe: - description: |- - StartupProbe indicates that the Pod has successfully initialized. - If specified, no other probes are executed until this completes successfully. - If this probe fails, the Pod will be restarted, just as if the livenessProbe failed. - This can be used to provide different probe parameters at the beginning of a Pod's lifecycle, - when it might take a long time to load data or warm a cache, than during steady-state operation. - This cannot be updated. - More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes - type: object - properties: - exec: - description: Exec specifies the action to take. - type: object - properties: - command: - description: |- - Command is the command line to execute inside the container, the working directory for the - command is root ('/') in the container's filesystem. The command is simply exec'd, it is - not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use - a shell, you need to explicitly call out to that shell. 
- Exit status of 0 is treated as live/healthy and non-zero is unhealthy. - type: array - items: - type: string - x-kubernetes-list-type: atomic - failureThreshold: - description: |- - Minimum consecutive failures for the probe to be considered failed after having succeeded. - Defaults to 3. Minimum value is 1. - type: integer - format: int32 - grpc: - description: GRPC specifies an action involving a GRPC port. - type: object - required: - - port - properties: - port: - description: Port number of the gRPC service. Number must be in the range 1 to 65535. - type: integer - format: int32 - service: - description: |- - Service is the name of the service to place in the gRPC HealthCheckRequest - (see https://github.com/grpc/grpc/blob/master/doc/health-checking.md). - - - If this is not specified, the default behavior is defined by gRPC. - type: string - httpGet: - description: HTTPGet specifies the http request to perform. - type: object - properties: - host: - description: |- - Host name to connect to, defaults to the pod IP. You probably want to set - "Host" in httpHeaders instead. - type: string - httpHeaders: - description: Custom headers to set in the request. HTTP allows repeated headers. - type: array - items: - description: HTTPHeader describes a custom header to be used in HTTP probes - type: object - required: - - name - - value - properties: - name: - description: |- - The header field name. - This will be canonicalized upon output, so case-variant names will be understood as the same header. - type: string - value: - description: The header field value - type: string - x-kubernetes-list-type: atomic - path: - description: Path to access on the HTTP server. - type: string - port: - description: |- - Name or number of the port to access on the container. - Number must be in the range 1 to 65535. - Name must be an IANA_SVC_NAME. 
- anyOf: - - type: integer - - type: string - x-kubernetes-int-or-string: true - scheme: - description: |- - Scheme to use for connecting to the host. - Defaults to HTTP. - type: string - initialDelaySeconds: - description: |- - Number of seconds after the container has started before liveness probes are initiated. - More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes - type: integer - format: int32 - periodSeconds: - description: How often (in seconds) to perform the probe. - type: integer - format: int32 - successThreshold: - description: |- - Minimum consecutive successes for the probe to be considered successful after having failed. - Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1. - type: integer - format: int32 - tcpSocket: - description: TCPSocket specifies an action involving a TCP port. - type: object - properties: - host: - description: 'Optional: Host name to connect to, defaults to the pod IP.' - type: string - port: - description: |- - Number or name of the port to access on the container. - Number must be in the range 1 to 65535. - Name must be an IANA_SVC_NAME. - anyOf: - - type: integer - - type: string - x-kubernetes-int-or-string: true - timeoutSeconds: - description: |- - Number of seconds after which the probe times out. - Defaults to 1 second. Minimum value is 1. - More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes - type: integer - format: int32 - terminationMessagePath: - description: |- - Optional: Path at which the file to which the container's termination message - will be written is mounted into the container's filesystem. - Message written is intended to be brief final status, such as an assertion failure message. - Will be truncated by the node if greater than 4096 bytes. The total message length across - all containers will be limited to 12kb. - Defaults to /dev/termination-log. - Cannot be updated. 
- type: string - terminationMessagePolicy: - description: |- - Indicate how the termination message should be populated. File will use the contents of - terminationMessagePath to populate the container status message on both success and failure. - FallbackToLogsOnError will use the last chunk of container log output if the termination - message file is empty and the container exited with an error. - The log output is limited to 2048 bytes or 80 lines, whichever is smaller. - Defaults to File. - Cannot be updated. - type: string - volumeMounts: - description: |- - Pod volumes to mount into the container's filesystem. - Cannot be updated. - type: array - items: - description: VolumeMount describes a mounting of a Volume within a container. - type: object - required: - - mountPath - - name - properties: - mountPath: - description: |- - Path within the container at which the volume should be mounted. Must - not contain ':'. - type: string - name: - description: This must match the Name of a Volume. - type: string - readOnly: - description: |- - Mounted read-only if true, read-write otherwise (false or unspecified). - Defaults to false. - type: boolean - subPath: - description: |- - Path within the volume from which the container's volume should be mounted. - Defaults to "" (volume's root). - type: string - x-kubernetes-list-map-keys: - - mountPath - x-kubernetes-list-type: map - workingDir: - description: |- - Container's working directory. - If not specified, the container runtime's default will be used, which - might be configured in the container image. - Cannot be updated. 
- type: string - dnsConfig: - description: This is accessible behind a feature flag - kubernetes.podspec-dnsconfig - type: object - x-kubernetes-preserve-unknown-fields: true - dnsPolicy: - description: This is accessible behind a feature flag - kubernetes.podspec-dnspolicy - type: string - enableServiceLinks: - description: 'EnableServiceLinks indicates whether information about services should be injected into pod''s environment variables, matching the syntax of Docker links. Optional: Knative defaults this to false.' - type: boolean - hostAliases: - description: This is accessible behind a feature flag - kubernetes.podspec-hostaliases - type: array - items: - description: This is accessible behind a feature flag - kubernetes.podspec-hostaliases - type: object - x-kubernetes-preserve-unknown-fields: true - hostIPC: - description: This is accessible behind a feature flag - kubernetes.podspec-hostipc - type: boolean - x-kubernetes-preserve-unknown-fields: true - hostNetwork: - description: This is accessible behind a feature flag - kubernetes.podspec-hostnetwork - type: boolean - x-kubernetes-preserve-unknown-fields: true - hostPID: - description: This is accessible behind a feature flag - kubernetes.podspec-hostpid - type: boolean - x-kubernetes-preserve-unknown-fields: true - idleTimeoutSeconds: - description: |- - IdleTimeoutSeconds is the maximum duration in seconds a request will be allowed - to stay open while not receiving any bytes from the user's application. If - unspecified, a system default will be provided. - type: integer - format: int64 - imagePullSecrets: - description: |- - ImagePullSecrets is an optional list of references to secrets in the same namespace to use for pulling any of the images used by this PodSpec. - If specified, these secrets will be passed to individual puller implementations for them to use. 
- More info: https://kubernetes.io/docs/concepts/containers/images#specifying-imagepullsecrets-on-a-pod - type: array - items: - description: |- - LocalObjectReference contains enough information to let you locate the - referenced object inside the same namespace. - type: object - properties: - name: - description: |- - Name of the referent. - This field is effectively required, but due to backwards compatibility is - allowed to be empty. Instances of this type with an empty value here are - almost certainly wrong. - TODO: Add other useful fields. apiVersion, kind, uid? - More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Drop `kubebuilder:default` when controller-gen doesn't need it https://github.com/kubernetes-sigs/kubebuilder/issues/3896. - type: string - default: "" - x-kubernetes-map-type: atomic - x-kubernetes-list-map-keys: - - name - x-kubernetes-list-type: map - initContainers: - description: |- - List of initialization containers belonging to the pod. - Init containers are executed in order prior to containers being started. If any - init container fails, the pod is considered to have failed and is handled according - to its restartPolicy. The name for an init container or normal container must be - unique among all containers. - Init containers may not have Lifecycle actions, Readiness probes, Liveness probes, or Startup probes. - The resourceRequirements of an init container are taken into account during scheduling - by finding the highest request/limit for each resource type, and then using the max of - of that value or the sum of the normal containers. Limits are applied to init containers - in a similar fashion. - Init containers cannot currently be added or removed. - Cannot be updated. 
- More info: https://kubernetes.io/docs/concepts/workloads/pods/init-containers/ - type: array - items: - description: This is accessible behind a feature flag - kubernetes.podspec-init-containers - type: object - x-kubernetes-preserve-unknown-fields: true - nodeSelector: - description: This is accessible behind a feature flag - kubernetes.podspec-nodeselector - type: object - x-kubernetes-preserve-unknown-fields: true - x-kubernetes-map-type: atomic - priorityClassName: - description: This is accessible behind a feature flag - kubernetes.podspec-priorityclassname - type: string - x-kubernetes-preserve-unknown-fields: true - responseStartTimeoutSeconds: - description: |- - ResponseStartTimeoutSeconds is the maximum duration in seconds that the request - routing layer will wait for a request delivered to a container to begin - sending any network traffic. - type: integer - format: int64 - runtimeClassName: - description: This is accessible behind a feature flag - kubernetes.podspec-runtimeclassname - type: string - x-kubernetes-preserve-unknown-fields: true - schedulerName: - description: This is accessible behind a feature flag - kubernetes.podspec-schedulername - type: string - x-kubernetes-preserve-unknown-fields: true - securityContext: - description: This is accessible behind a feature flag - kubernetes.podspec-securitycontext - type: object - x-kubernetes-preserve-unknown-fields: true - serviceAccountName: - description: |- - ServiceAccountName is the name of the ServiceAccount to use to run this pod. - More info: https://kubernetes.io/docs/tasks/configure-pod-container/configure-service-account/ - type: string - shareProcessNamespace: - description: This is accessible behind a feature flag - kubernetes.podspec-shareproccessnamespace - type: boolean - x-kubernetes-preserve-unknown-fields: true - timeoutSeconds: - description: |- - TimeoutSeconds is the maximum duration in seconds that the request instance - is allowed to respond to a request. 
If unspecified, a system default will - be provided. - type: integer - format: int64 - tolerations: - description: This is accessible behind a feature flag - kubernetes.podspec-tolerations - type: array - items: - description: This is accessible behind a feature flag - kubernetes.podspec-tolerations - type: object - x-kubernetes-preserve-unknown-fields: true - x-kubernetes-list-type: atomic - topologySpreadConstraints: - description: This is accessible behind a feature flag - kubernetes.podspec-topologyspreadconstraints - type: array - items: - description: This is accessible behind a feature flag - kubernetes.podspec-topologyspreadconstraints - type: object - x-kubernetes-preserve-unknown-fields: true - volumes: - description: |- - List of volumes that can be mounted by containers belonging to the pod. - More info: https://kubernetes.io/docs/concepts/storage/volumes - type: array - items: - description: Volume represents a named volume in a pod that may be accessed by any container in the pod. - type: object - required: - - name - properties: - configMap: - description: configMap represents a configMap that should populate this volume - type: object - properties: - defaultMode: - description: |- - defaultMode is optional: mode bits used to set permissions on created files by default. - Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511. - YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. - Defaults to 0644. - Directories within the path are not affected by this setting. - This might be in conflict with other options that affect the file - mode, like fsGroup, and the result can be other mode bits set. - type: integer - format: int32 - items: - description: |- - items if unspecified, each key-value pair in the Data field of the referenced - ConfigMap will be projected into the volume as a file whose name is the - key and content is the value. 
If specified, the listed keys will be - projected into the specified paths, and unlisted keys will not be - present. If a key is specified which is not present in the ConfigMap, - the volume setup will error unless it is marked optional. Paths must be - relative and may not contain the '..' path or start with '..'. - type: array - items: - description: Maps a string key to a path within a volume. - type: object - required: - - key - - path - properties: - key: - description: key is the key to project. - type: string - mode: - description: |- - mode is Optional: mode bits used to set permissions on this file. - Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511. - YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. - If not specified, the volume defaultMode will be used. - This might be in conflict with other options that affect the file - mode, like fsGroup, and the result can be other mode bits set. - type: integer - format: int32 - path: - description: |- - path is the relative path of the file to map the key to. - May not be an absolute path. - May not contain the path element '..'. - May not start with the string '..'. - type: string - x-kubernetes-list-type: atomic - name: - description: |- - Name of the referent. - This field is effectively required, but due to backwards compatibility is - allowed to be empty. Instances of this type with an empty value here are - almost certainly wrong. - TODO: Add other useful fields. apiVersion, kind, uid? - More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Drop `kubebuilder:default` when controller-gen doesn't need it https://github.com/kubernetes-sigs/kubebuilder/issues/3896. 
- type: string - default: "" - optional: - description: optional specify whether the ConfigMap or its keys must be defined - type: boolean - x-kubernetes-map-type: atomic - emptyDir: - description: This is accessible behind a feature flag - kubernetes.podspec-emptydir - type: object - x-kubernetes-preserve-unknown-fields: true - name: - description: |- - name of the volume. - Must be a DNS_LABEL and unique within the pod. - More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - type: string - persistentVolumeClaim: - description: This is accessible behind a feature flag - kubernetes.podspec-persistent-volume-claim - type: object - x-kubernetes-preserve-unknown-fields: true - projected: - description: projected items for all in one resources secrets, configmaps, and downward API - type: object - properties: - defaultMode: - description: |- - defaultMode are the mode bits used to set permissions on created files by default. - Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511. - YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. - Directories within the path are not affected by this setting. - This might be in conflict with other options that affect the file - mode, like fsGroup, and the result can be other mode bits set. - type: integer - format: int32 - sources: - description: sources is the list of volume projections - type: array - items: - description: Projection that may be projected along with other supported volume types - type: object - properties: - configMap: - description: configMap information about the configMap data to project - type: object - properties: - items: - description: |- - items if unspecified, each key-value pair in the Data field of the referenced - ConfigMap will be projected into the volume as a file whose name is the - key and content is the value. 
If specified, the listed keys will be - projected into the specified paths, and unlisted keys will not be - present. If a key is specified which is not present in the ConfigMap, - the volume setup will error unless it is marked optional. Paths must be - relative and may not contain the '..' path or start with '..'. - type: array - items: - description: Maps a string key to a path within a volume. - type: object - required: - - key - - path - properties: - key: - description: key is the key to project. - type: string - mode: - description: |- - mode is Optional: mode bits used to set permissions on this file. - Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511. - YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. - If not specified, the volume defaultMode will be used. - This might be in conflict with other options that affect the file - mode, like fsGroup, and the result can be other mode bits set. - type: integer - format: int32 - path: - description: |- - path is the relative path of the file to map the key to. - May not be an absolute path. - May not contain the path element '..'. - May not start with the string '..'. - type: string - x-kubernetes-list-type: atomic - name: - description: |- - Name of the referent. - This field is effectively required, but due to backwards compatibility is - allowed to be empty. Instances of this type with an empty value here are - almost certainly wrong. - TODO: Add other useful fields. apiVersion, kind, uid? - More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Drop `kubebuilder:default` when controller-gen doesn't need it https://github.com/kubernetes-sigs/kubebuilder/issues/3896. 
- type: string - default: "" - optional: - description: optional specify whether the ConfigMap or its keys must be defined - type: boolean - x-kubernetes-map-type: atomic - downwardAPI: - description: downwardAPI information about the downwardAPI data to project - type: object - properties: - items: - description: Items is a list of DownwardAPIVolume file - type: array - items: - description: DownwardAPIVolumeFile represents information to create the file containing the pod field - type: object - required: - - path - properties: - fieldRef: - description: 'Required: Selects a field of the pod: only annotations, labels, name, namespace and uid are supported.' - type: object - required: - - fieldPath - properties: - apiVersion: - description: Version of the schema the FieldPath is written in terms of, defaults to "v1". - type: string - fieldPath: - description: Path of the field to select in the specified API version. - type: string - x-kubernetes-map-type: atomic - mode: - description: |- - Optional: mode bits used to set permissions on this file, must be an octal value - between 0000 and 0777 or a decimal value between 0 and 511. - YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. - If not specified, the volume defaultMode will be used. - This might be in conflict with other options that affect the file - mode, like fsGroup, and the result can be other mode bits set. - type: integer - format: int32 - path: - description: 'Required: Path is the relative path name of the file to be created. Must not be absolute or contain the ''..'' path. Must be utf-8 encoded. The first item of the relative path must not start with ''..''' - type: string - resourceFieldRef: - description: |- - Selects a resource of the container: only resources limits and requests - (limits.cpu, limits.memory, requests.cpu and requests.memory) are currently supported. 
- type: object - required: - - resource - properties: - containerName: - description: 'Container name: required for volumes, optional for env vars' - type: string - divisor: - description: Specifies the output format of the exposed resources, defaults to "1" - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - anyOf: - - type: integer - - type: string - x-kubernetes-int-or-string: true - resource: - description: 'Required: resource to select' - type: string - x-kubernetes-map-type: atomic - x-kubernetes-list-type: atomic - secret: - description: secret information about the secret data to project - type: object - properties: - items: - description: |- - items if unspecified, each key-value pair in the Data field of the referenced - Secret will be projected into the volume as a file whose name is the - key and content is the value. If specified, the listed keys will be - projected into the specified paths, and unlisted keys will not be - present. If a key is specified which is not present in the Secret, - the volume setup will error unless it is marked optional. Paths must be - relative and may not contain the '..' path or start with '..'. - type: array - items: - description: Maps a string key to a path within a volume. - type: object - required: - - key - - path - properties: - key: - description: key is the key to project. - type: string - mode: - description: |- - mode is Optional: mode bits used to set permissions on this file. - Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511. - YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. - If not specified, the volume defaultMode will be used. - This might be in conflict with other options that affect the file - mode, like fsGroup, and the result can be other mode bits set. 
- type: integer - format: int32 - path: - description: |- - path is the relative path of the file to map the key to. - May not be an absolute path. - May not contain the path element '..'. - May not start with the string '..'. - type: string - x-kubernetes-list-type: atomic - name: - description: |- - Name of the referent. - This field is effectively required, but due to backwards compatibility is - allowed to be empty. Instances of this type with an empty value here are - almost certainly wrong. - TODO: Add other useful fields. apiVersion, kind, uid? - More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Drop `kubebuilder:default` when controller-gen doesn't need it https://github.com/kubernetes-sigs/kubebuilder/issues/3896. - type: string - default: "" - optional: - description: optional field specify whether the Secret or its key must be defined - type: boolean - x-kubernetes-map-type: atomic - serviceAccountToken: - description: serviceAccountToken is information about the serviceAccountToken data to project - type: object - required: - - path - properties: - audience: - description: |- - audience is the intended audience of the token. A recipient of a token - must identify itself with an identifier specified in the audience of the - token, and otherwise should reject the token. The audience defaults to the - identifier of the apiserver. - type: string - expirationSeconds: - description: |- - expirationSeconds is the requested duration of validity of the service - account token. As the token approaches expiration, the kubelet volume - plugin will proactively rotate the service account token. The kubelet will - start trying to rotate the token if the token is older than 80 percent of - its time to live or if the token is older than 24 hours.Defaults to 1 hour - and must be at least 10 minutes. 
- type: integer - format: int64 - path: - description: |- - path is the path relative to the mount point of the file to project the - token into. - type: string - x-kubernetes-list-type: atomic - secret: - description: |- - secret represents a secret that should populate this volume. - More info: https://kubernetes.io/docs/concepts/storage/volumes#secret - type: object - properties: - defaultMode: - description: |- - defaultMode is Optional: mode bits used to set permissions on created files by default. - Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511. - YAML accepts both octal and decimal values, JSON requires decimal values - for mode bits. Defaults to 0644. - Directories within the path are not affected by this setting. - This might be in conflict with other options that affect the file - mode, like fsGroup, and the result can be other mode bits set. - type: integer - format: int32 - items: - description: |- - items If unspecified, each key-value pair in the Data field of the referenced - Secret will be projected into the volume as a file whose name is the - key and content is the value. If specified, the listed keys will be - projected into the specified paths, and unlisted keys will not be - present. If a key is specified which is not present in the Secret, - the volume setup will error unless it is marked optional. Paths must be - relative and may not contain the '..' path or start with '..'. - type: array - items: - description: Maps a string key to a path within a volume. - type: object - required: - - key - - path - properties: - key: - description: key is the key to project. - type: string - mode: - description: |- - mode is Optional: mode bits used to set permissions on this file. - Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511. - YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. - If not specified, the volume defaultMode will be used. 
- This might be in conflict with other options that affect the file - mode, like fsGroup, and the result can be other mode bits set. - type: integer - format: int32 - path: - description: |- - path is the relative path of the file to map the key to. - May not be an absolute path. - May not contain the path element '..'. - May not start with the string '..'. - type: string - x-kubernetes-list-type: atomic - optional: - description: optional field specify whether the Secret or its keys must be defined - type: boolean - secretName: - description: |- - secretName is the name of the secret in the pod's namespace to use. - More info: https://kubernetes.io/docs/concepts/storage/volumes#secret - type: string - x-kubernetes-list-map-keys: - - name - x-kubernetes-list-type: map - status: - description: RevisionStatus communicates the observed state of the Revision (from the controller). - type: object - properties: - actualReplicas: - description: ActualReplicas reflects the amount of ready pods running this revision. - type: integer - format: int32 - annotations: - description: |- - Annotations is additional Status fields for the Resource to save some - additional State as well as convey more information to the user. This is - roughly akin to Annotations on any k8s resource, just the reconciler conveying - richer information outwards. - type: object - additionalProperties: - type: string - conditions: - description: Conditions the latest available observations of a resource's current state. - type: array - items: - description: |- - Condition defines a readiness condition for a Knative resource. - See: https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/api-conventions.md#typical-status-properties - type: object - required: - - status - - type - properties: - lastTransitionTime: - description: |- - LastTransitionTime is the last time the condition transitioned from one status to another. 
- We use VolatileTime in place of metav1.Time to exclude this from creating equality.Semantic - differences (all other things held constant). - type: string - message: - description: A human readable message indicating details about the transition. - type: string - reason: - description: The reason for the condition's last transition. - type: string - severity: - description: |- - Severity with which to treat failures of this type of condition. - When this is not specified, it defaults to Error. - type: string - status: - description: Status of the condition, one of True, False, Unknown. - type: string - type: - description: Type of condition. - type: string - containerStatuses: - description: |- - ContainerStatuses is a slice of images present in .Spec.Container[*].Image - to their respective digests and their container name. - The digests are resolved during the creation of Revision. - ContainerStatuses holds the container name and image digests - for both serving and non serving containers. - ref: http://bit.ly/image-digests - type: array - items: - description: ContainerStatus holds the information of container name and image digest value - type: object - properties: - imageDigest: - type: string - name: - type: string - desiredReplicas: - description: DesiredReplicas reflects the desired amount of pods running this revision. - type: integer - format: int32 - initContainerStatuses: - description: |- - InitContainerStatuses is a slice of images present in .Spec.InitContainer[*].Image - to their respective digests and their container name. - The digests are resolved during the creation of Revision. - ContainerStatuses holds the container name and image digests - for both serving and non serving containers. 
- ref: http://bit.ly/image-digests - type: array - items: - description: ContainerStatus holds the information of container name and image digest value - type: object - properties: - imageDigest: - type: string - name: - type: string - logUrl: - description: |- - LogURL specifies the generated logging url for this particular revision - based on the revision url template specified in the controller's config. - type: string - observedGeneration: - description: |- - ObservedGeneration is the 'Generation' of the Service that - was last processed by the controller. - type: integer - format: int64 ---- -# Source: knative-operator/templates/knative-crds.yaml -# Copyright 2019 The Knative Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Note: The schema part of the spec is auto-generated by hack/update-schemas.sh. 
- -apiVersion: apiextensions.k8s.io/v1 -kind: CustomResourceDefinition -metadata: - name: routes.serving.knative.dev - labels: - app.kubernetes.io/name: knative-serving - app.kubernetes.io/version: "1.16.0" - knative.dev/crd-install: "true" - duck.knative.dev/addressable: "true" -spec: - group: serving.knative.dev - names: - kind: Route - plural: routes - singular: route - categories: - - all - - knative - - serving - shortNames: - - rt - scope: Namespaced - versions: - - name: v1 - served: true - storage: true - subresources: - status: {} - additionalPrinterColumns: - - name: URL - type: string - jsonPath: .status.url - - name: Ready - type: string - jsonPath: ".status.conditions[?(@.type=='Ready')].status" - - name: Reason - type: string - jsonPath: ".status.conditions[?(@.type=='Ready')].reason" - schema: - openAPIV3Schema: - description: |- - Route is responsible for configuring ingress over a collection of Revisions. - Some of the Revisions a Route distributes traffic over may be specified by - referencing the Configuration responsible for creating them; in these cases - the Route is additionally responsible for monitoring the Configuration for - "latest ready revision" changes, and smoothly rolling out latest revisions. - See also: https://github.com/knative/serving/blob/main/docs/spec/overview.md#route - type: object - properties: - apiVersion: - description: |- - APIVersion defines the versioned schema of this representation of an object. - Servers should convert recognized schemas to the latest internal value, and - may reject unrecognized values. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources - type: string - kind: - description: |- - Kind is a string value representing the REST resource this object represents. - Servers may infer this from the endpoint the client submits requests to. - Cannot be updated. - In CamelCase. 
- More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds - type: string - metadata: - type: object - spec: - description: Spec holds the desired state of the Route (from the client). - type: object - properties: - traffic: - description: |- - Traffic specifies how to distribute traffic over a collection of - revisions and configurations. - type: array - items: - description: TrafficTarget holds a single entry of the routing table for a Route. - type: object - properties: - configurationName: - description: |- - ConfigurationName of a configuration to whose latest revision we will send - this portion of traffic. When the "status.latestReadyRevisionName" of the - referenced configuration changes, we will automatically migrate traffic - from the prior "latest ready" revision to the new one. This field is never - set in Route's status, only its spec. This is mutually exclusive with - RevisionName. - type: string - latestRevision: - description: |- - LatestRevision may be optionally provided to indicate that the latest - ready Revision of the Configuration should be used for this traffic - target. When provided LatestRevision must be true if RevisionName is - empty; it must be false when RevisionName is non-empty. - type: boolean - percent: - description: |- - Percent indicates that percentage based routing should be used and - the value indicates the percent of traffic that is be routed to this - Revision or Configuration. `0` (zero) mean no traffic, `100` means all - traffic. - When percentage based routing is being used the follow rules apply: - - the sum of all percent values must equal 100 - - when not specified, the implied value for `percent` is zero for - that particular Revision or Configuration - type: integer - format: int64 - revisionName: - description: |- - RevisionName of a specific revision to which to send this portion of - traffic. This is mutually exclusive with ConfigurationName. 
- type: string - tag: - description: |- - Tag is optionally used to expose a dedicated url for referencing - this target exclusively. - type: string - url: - description: |- - URL displays the URL for accessing named traffic targets. URL is displayed in - status, and is disallowed on spec. URL must contain a scheme (e.g. http://) and - a hostname, but may not contain anything else (e.g. basic auth, url path, etc.) - type: string - status: - description: Status communicates the observed state of the Route (from the controller). - type: object - properties: - address: - description: Address holds the information needed for a Route to be the target of an event. - type: object - properties: - CACerts: - description: |- - CACerts is the Certification Authority (CA) certificates in PEM format - according to https://www.rfc-editor.org/rfc/rfc7468. - type: string - audience: - description: Audience is the OIDC audience for this address. - type: string - name: - description: Name is the name of the address. - type: string - url: - type: string - annotations: - description: |- - Annotations is additional Status fields for the Resource to save some - additional State as well as convey more information to the user. This is - roughly akin to Annotations on any k8s resource, just the reconciler conveying - richer information outwards. - type: object - additionalProperties: - type: string - conditions: - description: Conditions the latest available observations of a resource's current state. - type: array - items: - description: |- - Condition defines a readiness condition for a Knative resource. - See: https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/api-conventions.md#typical-status-properties - type: object - required: - - status - - type - properties: - lastTransitionTime: - description: |- - LastTransitionTime is the last time the condition transitioned from one status to another. 
- We use VolatileTime in place of metav1.Time to exclude this from creating equality.Semantic - differences (all other things held constant). - type: string - message: - description: A human readable message indicating details about the transition. - type: string - reason: - description: The reason for the condition's last transition. - type: string - severity: - description: |- - Severity with which to treat failures of this type of condition. - When this is not specified, it defaults to Error. - type: string - status: - description: Status of the condition, one of True, False, Unknown. - type: string - type: - description: Type of condition. - type: string - observedGeneration: - description: |- - ObservedGeneration is the 'Generation' of the Service that - was last processed by the controller. - type: integer - format: int64 - traffic: - description: |- - Traffic holds the configured traffic distribution. - These entries will always contain RevisionName references. - When ConfigurationName appears in the spec, this will hold the - LatestReadyRevisionName that we last observed. - type: array - items: - description: TrafficTarget holds a single entry of the routing table for a Route. - type: object - properties: - configurationName: - description: |- - ConfigurationName of a configuration to whose latest revision we will send - this portion of traffic. When the "status.latestReadyRevisionName" of the - referenced configuration changes, we will automatically migrate traffic - from the prior "latest ready" revision to the new one. This field is never - set in Route's status, only its spec. This is mutually exclusive with - RevisionName. - type: string - latestRevision: - description: |- - LatestRevision may be optionally provided to indicate that the latest - ready Revision of the Configuration should be used for this traffic - target. When provided LatestRevision must be true if RevisionName is - empty; it must be false when RevisionName is non-empty. 
- type: boolean - percent: - description: |- - Percent indicates that percentage based routing should be used and - the value indicates the percent of traffic that is be routed to this - Revision or Configuration. `0` (zero) mean no traffic, `100` means all - traffic. - When percentage based routing is being used the follow rules apply: - - the sum of all percent values must equal 100 - - when not specified, the implied value for `percent` is zero for - that particular Revision or Configuration - type: integer - format: int64 - revisionName: - description: |- - RevisionName of a specific revision to which to send this portion of - traffic. This is mutually exclusive with ConfigurationName. - type: string - tag: - description: |- - Tag is optionally used to expose a dedicated url for referencing - this target exclusively. - type: string - url: - description: |- - URL displays the URL for accessing named traffic targets. URL is displayed in - status, and is disallowed on spec. URL must contain a scheme (e.g. http://) and - a hostname, but may not contain anything else (e.g. basic auth, url path, etc.) - type: string - url: - description: |- - URL holds the url that will distribute traffic over the provided traffic targets. - It generally has the form http[s]://{route-name}.{route-namespace}.{cluster-level-suffix} - type: string ---- -# Source: knative-operator/templates/knative-crds.yaml -# Copyright 2019 The Knative Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -apiVersion: apiextensions.k8s.io/v1 -kind: CustomResourceDefinition -metadata: - name: serverlessservices.networking.internal.knative.dev - labels: - app.kubernetes.io/name: knative-serving - app.kubernetes.io/component: networking - app.kubernetes.io/version: "1.16.0" - knative.dev/crd-install: "true" -spec: - group: networking.internal.knative.dev - versions: - - name: v1alpha1 - served: true - storage: true - subresources: - status: {} - schema: - openAPIV3Schema: - description: |- - ServerlessService is a proxy for the K8s service objects containing the - endpoints for the revision, whether those are endpoints of the activator or - revision pods. - See: https://knative.page.link/naxz for details. - type: object - properties: - apiVersion: - description: |- - APIVersion defines the versioned schema of this representation of an object. - Servers should convert recognized schemas to the latest internal value, and - may reject unrecognized values. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources - type: string - kind: - description: |- - Kind is a string value representing the REST resource this object represents. - Servers may infer this from the endpoint the client submits requests to. - Cannot be updated. - In CamelCase. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds - type: string - metadata: - type: object - spec: - description: |- - Spec is the desired state of the ServerlessService. - More info: https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/api-conventions.md#spec-and-status - type: object - required: - - objectRef - - protocolType - properties: - mode: - description: Mode describes the mode of operation of the ServerlessService. 
- type: string - numActivators: - description: |- - NumActivators contains number of Activators that this revision should be - assigned. - O means — assign all. - type: integer - format: int32 - objectRef: - description: |- - ObjectRef defines the resource that this ServerlessService - is responsible for making "serverless". - type: object - properties: - apiVersion: - description: API version of the referent. - type: string - fieldPath: - description: |- - If referring to a piece of an object instead of an entire object, this string - should contain a valid JSON/Go field access statement, such as desiredState.manifest.containers[2]. - For example, if the object reference is to a container within a pod, this would take on a value like: - "spec.containers{name}" (where "name" refers to the name of the container that triggered - the event) or if no container name is specified "spec.containers[2]" (container with - index 2 in this pod). This syntax is chosen only to have some well-defined way of - referencing a part of an object. - TODO: this design is not final and this field is subject to change in the future. - type: string - kind: - description: |- - Kind of the referent. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds - type: string - name: - description: |- - Name of the referent. - More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - type: string - namespace: - description: |- - Namespace of the referent. - More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/namespaces/ - type: string - resourceVersion: - description: |- - Specific resourceVersion to which this reference is made, if any. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#concurrency-control-and-consistency - type: string - uid: - description: |- - UID of the referent. 
- More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#uids - type: string - x-kubernetes-map-type: atomic - protocolType: - description: |- - The application-layer protocol. Matches `RevisionProtocolType` set on the owning pa/revision. - serving imports networking, so just use string. - type: string - status: - description: |- - Status is the current state of the ServerlessService. - More info: https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/api-conventions.md#spec-and-status - type: object - properties: - annotations: - description: |- - Annotations is additional Status fields for the Resource to save some - additional State as well as convey more information to the user. This is - roughly akin to Annotations on any k8s resource, just the reconciler conveying - richer information outwards. - type: object - additionalProperties: - type: string - conditions: - description: Conditions the latest available observations of a resource's current state. - type: array - items: - description: |- - Condition defines a readiness condition for a Knative resource. - See: https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/api-conventions.md#typical-status-properties - type: object - required: - - status - - type - properties: - lastTransitionTime: - description: |- - LastTransitionTime is the last time the condition transitioned from one status to another. - We use VolatileTime in place of metav1.Time to exclude this from creating equality.Semantic - differences (all other things held constant). - type: string - message: - description: A human readable message indicating details about the transition. - type: string - reason: - description: The reason for the condition's last transition. - type: string - severity: - description: |- - Severity with which to treat failures of this type of condition. - When this is not specified, it defaults to Error. 
- type: string - status: - description: Status of the condition, one of True, False, Unknown. - type: string - type: - description: Type of condition. - type: string - observedGeneration: - description: |- - ObservedGeneration is the 'Generation' of the Service that - was last processed by the controller. - type: integer - format: int64 - privateServiceName: - description: |- - PrivateServiceName holds the name of a core K8s Service resource that - load balances over the user service pods backing this Revision. - type: string - serviceName: - description: |- - ServiceName holds the name of a core K8s Service resource that - load balances over the pods backing this Revision (activator or revision). - type: string - additionalPrinterColumns: - - name: Mode - type: string - jsonPath: ".spec.mode" - - name: Activators - type: integer - jsonPath: ".spec.numActivators" - - name: ServiceName - type: string - jsonPath: ".status.serviceName" - - name: PrivateServiceName - type: string - jsonPath: ".status.privateServiceName" - - name: Ready - type: string - jsonPath: ".status.conditions[?(@.type=='Ready')].status" - - name: Reason - type: string - jsonPath: ".status.conditions[?(@.type=='Ready')].reason" - names: - kind: ServerlessService - plural: serverlessservices - singular: serverlessservice - categories: - - knative-internal - - networking - shortNames: - - sks - scope: Namespaced ---- -# Source: knative-operator/templates/knative-crds.yaml -# Copyright 2019 The Knative Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -# Note: The schema part of the spec is auto-generated by hack/update-schemas.sh. - -apiVersion: apiextensions.k8s.io/v1 -kind: CustomResourceDefinition -metadata: - name: services.serving.knative.dev - labels: - app.kubernetes.io/name: knative-serving - app.kubernetes.io/version: "1.16.0" - knative.dev/crd-install: "true" - duck.knative.dev/addressable: "true" - duck.knative.dev/podspecable: "true" -spec: - group: serving.knative.dev - names: - kind: Service - plural: services - singular: service - categories: - - all - - knative - - serving - shortNames: - - kservice - - ksvc - scope: Namespaced - versions: - - name: v1 - served: true - storage: true - subresources: - status: {} - additionalPrinterColumns: - - name: URL - type: string - jsonPath: .status.url - - name: LatestCreated - type: string - jsonPath: .status.latestCreatedRevisionName - - name: LatestReady - type: string - jsonPath: .status.latestReadyRevisionName - - name: Ready - type: string - jsonPath: ".status.conditions[?(@.type=='Ready')].status" - - name: Reason - type: string - jsonPath: ".status.conditions[?(@.type=='Ready')].reason" - schema: - openAPIV3Schema: - description: |- - Service acts as a top-level container that manages a Route and Configuration - which implement a network service. Service exists to provide a singular - abstraction which can be access controlled, reasoned about, and which - encapsulates software lifecycle decisions such as rollout policy and - team resource ownership. Service acts only as an orchestrator of the - underlying Routes and Configurations (much as a kubernetes Deployment - orchestrates ReplicaSets), and its usage is optional but recommended. - - - The Service's controller will track the statuses of its owned Configuration - and Route, reflecting their statuses and conditions as its own. 
- - - See also: https://github.com/knative/serving/blob/main/docs/spec/overview.md#service - type: object - properties: - apiVersion: - description: |- - APIVersion defines the versioned schema of this representation of an object. - Servers should convert recognized schemas to the latest internal value, and - may reject unrecognized values. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources - type: string - kind: - description: |- - Kind is a string value representing the REST resource this object represents. - Servers may infer this from the endpoint the client submits requests to. - Cannot be updated. - In CamelCase. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds - type: string - metadata: - type: object - spec: - description: |- - ServiceSpec represents the configuration for the Service object. - A Service's specification is the union of the specifications for a Route - and Configuration. The Service restricts what can be expressed in these - fields, e.g. the Route must reference the provided Configuration; - however, these limitations also enable friendlier defaulting, - e.g. Route never needs a Configuration name, and may be defaulted to - the appropriate "run latest" spec. - type: object - properties: - template: - description: Template holds the latest specification for the Revision to be stamped out. - type: object - properties: - metadata: - type: object - properties: - annotations: - type: object - additionalProperties: - type: string - finalizers: - type: array - items: - type: string - labels: - type: object - additionalProperties: - type: string - name: - type: string - namespace: - type: string - x-kubernetes-preserve-unknown-fields: true - spec: - description: RevisionSpec holds the desired state of the Revision (from the client). 
- type: object - required: - - containers - properties: - affinity: - description: This is accessible behind a feature flag - kubernetes.podspec-affinity - type: object - x-kubernetes-preserve-unknown-fields: true - automountServiceAccountToken: - description: AutomountServiceAccountToken indicates whether a service account token should be automatically mounted. - type: boolean - containerConcurrency: - description: |- - ContainerConcurrency specifies the maximum allowed in-flight (concurrent) - requests per container of the Revision. Defaults to `0` which means - concurrency to the application is not limited, and the system decides the - target concurrency for the autoscaler. - type: integer - format: int64 - containers: - description: |- - List of containers belonging to the pod. - Containers cannot currently be added or removed. - There must be at least one container in a Pod. - Cannot be updated. - type: array - items: - description: A single application container that you want to run within a pod. - type: object - properties: - args: - description: |- - Arguments to the entrypoint. - The container image's CMD is used if this is not provided. - Variable references $(VAR_NAME) are expanded using the container's environment. If a variable - cannot be resolved, the reference in the input string will be unchanged. Double $$ are reduced - to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" will - produce the string literal "$(VAR_NAME)". Escaped references will never be expanded, regardless - of whether the variable exists or not. Cannot be updated. - More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell - type: array - items: - type: string - x-kubernetes-list-type: atomic - command: - description: |- - Entrypoint array. Not executed within a shell. - The container image's ENTRYPOINT is used if this is not provided. 
- Variable references $(VAR_NAME) are expanded using the container's environment. If a variable - cannot be resolved, the reference in the input string will be unchanged. Double $$ are reduced - to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" will - produce the string literal "$(VAR_NAME)". Escaped references will never be expanded, regardless - of whether the variable exists or not. Cannot be updated. - More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell - type: array - items: - type: string - x-kubernetes-list-type: atomic - env: - description: |- - List of environment variables to set in the container. - Cannot be updated. - type: array - items: - description: EnvVar represents an environment variable present in a Container. - type: object - required: - - name - properties: - name: - description: Name of the environment variable. Must be a C_IDENTIFIER. - type: string - value: - description: |- - Variable references $(VAR_NAME) are expanded - using the previously defined environment variables in the container and - any service environment variables. If a variable cannot be resolved, - the reference in the input string will be unchanged. Double $$ are reduced - to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. - "$$(VAR_NAME)" will produce the string literal "$(VAR_NAME)". - Escaped references will never be expanded, regardless of whether the variable - exists or not. - Defaults to "". - type: string - valueFrom: - description: Source for the environment variable's value. Cannot be used if value is not empty. - type: object - properties: - configMapKeyRef: - description: Selects a key of a ConfigMap. - type: object - required: - - key - properties: - key: - description: The key to select. - type: string - name: - description: |- - Name of the referent. 
- This field is effectively required, but due to backwards compatibility is - allowed to be empty. Instances of this type with an empty value here are - almost certainly wrong. - TODO: Add other useful fields. apiVersion, kind, uid? - More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Drop `kubebuilder:default` when controller-gen doesn't need it https://github.com/kubernetes-sigs/kubebuilder/issues/3896. - type: string - default: "" - optional: - description: Specify whether the ConfigMap or its key must be defined - type: boolean - x-kubernetes-map-type: atomic - fieldRef: - description: This is accessible behind a feature flag - kubernetes.podspec-fieldref - type: object - x-kubernetes-preserve-unknown-fields: true - x-kubernetes-map-type: atomic - resourceFieldRef: - description: This is accessible behind a feature flag - kubernetes.podspec-fieldref - type: object - x-kubernetes-preserve-unknown-fields: true - x-kubernetes-map-type: atomic - secretKeyRef: - description: Selects a key of a secret in the pod's namespace - type: object - required: - - key - properties: - key: - description: The key of the secret to select from. Must be a valid secret key. - type: string - name: - description: |- - Name of the referent. - This field is effectively required, but due to backwards compatibility is - allowed to be empty. Instances of this type with an empty value here are - almost certainly wrong. - TODO: Add other useful fields. apiVersion, kind, uid? - More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Drop `kubebuilder:default` when controller-gen doesn't need it https://github.com/kubernetes-sigs/kubebuilder/issues/3896. 
- type: string - default: "" - optional: - description: Specify whether the Secret or its key must be defined - type: boolean - x-kubernetes-map-type: atomic - x-kubernetes-list-map-keys: - - name - x-kubernetes-list-type: map - envFrom: - description: |- - List of sources to populate environment variables in the container. - The keys defined within a source must be a C_IDENTIFIER. All invalid keys - will be reported as an event when the container is starting. When a key exists in multiple - sources, the value associated with the last source will take precedence. - Values defined by an Env with a duplicate key will take precedence. - Cannot be updated. - type: array - items: - description: EnvFromSource represents the source of a set of ConfigMaps - type: object - properties: - configMapRef: - description: The ConfigMap to select from - type: object - properties: - name: - description: |- - Name of the referent. - This field is effectively required, but due to backwards compatibility is - allowed to be empty. Instances of this type with an empty value here are - almost certainly wrong. - TODO: Add other useful fields. apiVersion, kind, uid? - More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Drop `kubebuilder:default` when controller-gen doesn't need it https://github.com/kubernetes-sigs/kubebuilder/issues/3896. - type: string - default: "" - optional: - description: Specify whether the ConfigMap must be defined - type: boolean - x-kubernetes-map-type: atomic - prefix: - description: An optional identifier to prepend to each key in the ConfigMap. Must be a C_IDENTIFIER. - type: string - secretRef: - description: The Secret to select from - type: object - properties: - name: - description: |- - Name of the referent. - This field is effectively required, but due to backwards compatibility is - allowed to be empty. Instances of this type with an empty value here are - almost certainly wrong. 
- TODO: Add other useful fields. apiVersion, kind, uid? - More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Drop `kubebuilder:default` when controller-gen doesn't need it https://github.com/kubernetes-sigs/kubebuilder/issues/3896. - type: string - default: "" - optional: - description: Specify whether the Secret must be defined - type: boolean - x-kubernetes-map-type: atomic - x-kubernetes-list-type: atomic - image: - description: |- - Container image name. - More info: https://kubernetes.io/docs/concepts/containers/images - This field is optional to allow higher level config management to default or override - container images in workload controllers like Deployments and StatefulSets. - type: string - imagePullPolicy: - description: |- - Image pull policy. - One of Always, Never, IfNotPresent. - Defaults to Always if :latest tag is specified, or IfNotPresent otherwise. - Cannot be updated. - More info: https://kubernetes.io/docs/concepts/containers/images#updating-images - type: string - livenessProbe: - description: |- - Periodic probe of container liveness. - Container will be restarted if the probe fails. - Cannot be updated. - More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes - type: object - properties: - exec: - description: Exec specifies the action to take. - type: object - properties: - command: - description: |- - Command is the command line to execute inside the container, the working directory for the - command is root ('/') in the container's filesystem. The command is simply exec'd, it is - not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use - a shell, you need to explicitly call out to that shell. - Exit status of 0 is treated as live/healthy and non-zero is unhealthy. 
- type: array - items: - type: string - x-kubernetes-list-type: atomic - failureThreshold: - description: |- - Minimum consecutive failures for the probe to be considered failed after having succeeded. - Defaults to 3. Minimum value is 1. - type: integer - format: int32 - grpc: - description: GRPC specifies an action involving a GRPC port. - type: object - required: - - port - properties: - port: - description: Port number of the gRPC service. Number must be in the range 1 to 65535. - type: integer - format: int32 - service: - description: |- - Service is the name of the service to place in the gRPC HealthCheckRequest - (see https://github.com/grpc/grpc/blob/master/doc/health-checking.md). - - - If this is not specified, the default behavior is defined by gRPC. - type: string - httpGet: - description: HTTPGet specifies the http request to perform. - type: object - properties: - host: - description: |- - Host name to connect to, defaults to the pod IP. You probably want to set - "Host" in httpHeaders instead. - type: string - httpHeaders: - description: Custom headers to set in the request. HTTP allows repeated headers. - type: array - items: - description: HTTPHeader describes a custom header to be used in HTTP probes - type: object - required: - - name - - value - properties: - name: - description: |- - The header field name. - This will be canonicalized upon output, so case-variant names will be understood as the same header. - type: string - value: - description: The header field value - type: string - x-kubernetes-list-type: atomic - path: - description: Path to access on the HTTP server. - type: string - port: - description: |- - Name or number of the port to access on the container. - Number must be in the range 1 to 65535. - Name must be an IANA_SVC_NAME. - anyOf: - - type: integer - - type: string - x-kubernetes-int-or-string: true - scheme: - description: |- - Scheme to use for connecting to the host. - Defaults to HTTP. 
- type: string - initialDelaySeconds: - description: |- - Number of seconds after the container has started before liveness probes are initiated. - More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes - type: integer - format: int32 - periodSeconds: - description: How often (in seconds) to perform the probe. - type: integer - format: int32 - successThreshold: - description: |- - Minimum consecutive successes for the probe to be considered successful after having failed. - Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1. - type: integer - format: int32 - tcpSocket: - description: TCPSocket specifies an action involving a TCP port. - type: object - properties: - host: - description: 'Optional: Host name to connect to, defaults to the pod IP.' - type: string - port: - description: |- - Number or name of the port to access on the container. - Number must be in the range 1 to 65535. - Name must be an IANA_SVC_NAME. - anyOf: - - type: integer - - type: string - x-kubernetes-int-or-string: true - timeoutSeconds: - description: |- - Number of seconds after which the probe times out. - Defaults to 1 second. Minimum value is 1. - More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes - type: integer - format: int32 - name: - description: |- - Name of the container specified as a DNS_LABEL. - Each container in a pod must have a unique name (DNS_LABEL). - Cannot be updated. - type: string - ports: - description: |- - List of ports to expose from the container. Not specifying a port here - DOES NOT prevent that port from being exposed. Any port which is - listening on the default "0.0.0.0" address inside a container will be - accessible from the network. - Modifying this array with strategic merge patch may corrupt the data. - For more information See https://github.com/kubernetes/kubernetes/issues/108255. - Cannot be updated. 
- type: array - items: - description: ContainerPort represents a network port in a single container. - type: object - required: - - containerPort - properties: - containerPort: - description: |- - Number of port to expose on the pod's IP address. - This must be a valid port number, 0 < x < 65536. - type: integer - format: int32 - name: - description: |- - If specified, this must be an IANA_SVC_NAME and unique within the pod. Each - named port in a pod must have a unique name. Name for the port that can be - referred to by services. - type: string - protocol: - description: |- - Protocol for port. Must be UDP, TCP, or SCTP. - Defaults to "TCP". - type: string - default: TCP - x-kubernetes-list-map-keys: - - containerPort - - protocol - x-kubernetes-list-type: map - readinessProbe: - description: |- - Periodic probe of container service readiness. - Container will be removed from service endpoints if the probe fails. - Cannot be updated. - More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes - type: object - properties: - exec: - description: Exec specifies the action to take. - type: object - properties: - command: - description: |- - Command is the command line to execute inside the container, the working directory for the - command is root ('/') in the container's filesystem. The command is simply exec'd, it is - not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use - a shell, you need to explicitly call out to that shell. - Exit status of 0 is treated as live/healthy and non-zero is unhealthy. - type: array - items: - type: string - x-kubernetes-list-type: atomic - failureThreshold: - description: |- - Minimum consecutive failures for the probe to be considered failed after having succeeded. - Defaults to 3. Minimum value is 1. - type: integer - format: int32 - grpc: - description: GRPC specifies an action involving a GRPC port. 
- type: object - required: - - port - properties: - port: - description: Port number of the gRPC service. Number must be in the range 1 to 65535. - type: integer - format: int32 - service: - description: |- - Service is the name of the service to place in the gRPC HealthCheckRequest - (see https://github.com/grpc/grpc/blob/master/doc/health-checking.md). - - - If this is not specified, the default behavior is defined by gRPC. - type: string - httpGet: - description: HTTPGet specifies the http request to perform. - type: object - properties: - host: - description: |- - Host name to connect to, defaults to the pod IP. You probably want to set - "Host" in httpHeaders instead. - type: string - httpHeaders: - description: Custom headers to set in the request. HTTP allows repeated headers. - type: array - items: - description: HTTPHeader describes a custom header to be used in HTTP probes - type: object - required: - - name - - value - properties: - name: - description: |- - The header field name. - This will be canonicalized upon output, so case-variant names will be understood as the same header. - type: string - value: - description: The header field value - type: string - x-kubernetes-list-type: atomic - path: - description: Path to access on the HTTP server. - type: string - port: - description: |- - Name or number of the port to access on the container. - Number must be in the range 1 to 65535. - Name must be an IANA_SVC_NAME. - anyOf: - - type: integer - - type: string - x-kubernetes-int-or-string: true - scheme: - description: |- - Scheme to use for connecting to the host. - Defaults to HTTP. - type: string - initialDelaySeconds: - description: |- - Number of seconds after the container has started before liveness probes are initiated. - More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes - type: integer - format: int32 - periodSeconds: - description: How often (in seconds) to perform the probe. 
- type: integer - format: int32 - successThreshold: - description: |- - Minimum consecutive successes for the probe to be considered successful after having failed. - Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1. - type: integer - format: int32 - tcpSocket: - description: TCPSocket specifies an action involving a TCP port. - type: object - properties: - host: - description: 'Optional: Host name to connect to, defaults to the pod IP.' - type: string - port: - description: |- - Number or name of the port to access on the container. - Number must be in the range 1 to 65535. - Name must be an IANA_SVC_NAME. - anyOf: - - type: integer - - type: string - x-kubernetes-int-or-string: true - timeoutSeconds: - description: |- - Number of seconds after which the probe times out. - Defaults to 1 second. Minimum value is 1. - More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes - type: integer - format: int32 - resources: - description: |- - Compute Resources required by this container. - Cannot be updated. - More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ - type: object - properties: - claims: - description: |- - Claims lists the names of resources, defined in spec.resourceClaims, - that are used by this container. - - - This is an alpha field and requires enabling the - DynamicResourceAllocation feature gate. - - - This field is immutable. It can only be set for containers. - type: array - items: - description: ResourceClaim references one entry in PodSpec.ResourceClaims. - type: object - required: - - name - properties: - name: - description: |- - Name must match the name of one entry in pod.spec.resourceClaims of - the Pod where this field is used. It makes that resource available - inside a container. 
- type: string - x-kubernetes-list-map-keys: - - name - x-kubernetes-list-type: map - limits: - description: |- - Limits describes the maximum amount of compute resources allowed. - More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ - type: object - additionalProperties: - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - anyOf: - - type: integer - - type: string - x-kubernetes-int-or-string: true - requests: - description: |- - Requests describes the minimum amount of compute resources required. - If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, - otherwise to an implementation-defined value. Requests cannot exceed Limits. - More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ - type: object - additionalProperties: - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - anyOf: - - type: integer - - type: string - x-kubernetes-int-or-string: true - securityContext: - description: |- - SecurityContext defines the security options the container should be run with. - If set, the fields of SecurityContext override the equivalent fields of PodSecurityContext. - More info: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/ - type: object - properties: - allowPrivilegeEscalation: - description: |- - AllowPrivilegeEscalation controls whether a process can gain more - privileges than its parent process. This bool directly controls if - the no_new_privs flag will be set on the container process. - AllowPrivilegeEscalation is true always when the container is: - 1) run as Privileged - 2) has CAP_SYS_ADMIN - Note that this field cannot be set when spec.os.name is windows. - type: boolean - capabilities: - description: |- - The capabilities to add/drop when running containers. 
- Defaults to the default set of capabilities granted by the container runtime. - Note that this field cannot be set when spec.os.name is windows. - type: object - properties: - add: - description: This is accessible behind a feature flag - kubernetes.containerspec-addcapabilities - type: array - items: - description: Capability represent POSIX capabilities type - type: string - x-kubernetes-list-type: atomic - drop: - description: Removed capabilities - type: array - items: - description: Capability represent POSIX capabilities type - type: string - x-kubernetes-list-type: atomic - readOnlyRootFilesystem: - description: |- - Whether this container has a read-only root filesystem. - Default is false. - Note that this field cannot be set when spec.os.name is windows. - type: boolean - runAsGroup: - description: |- - The GID to run the entrypoint of the container process. - Uses runtime default if unset. - May also be set in PodSecurityContext. If set in both SecurityContext and - PodSecurityContext, the value specified in SecurityContext takes precedence. - Note that this field cannot be set when spec.os.name is windows. - type: integer - format: int64 - runAsNonRoot: - description: |- - Indicates that the container must run as a non-root user. - If true, the Kubelet will validate the image at runtime to ensure that it - does not run as UID 0 (root) and fail to start the container if it does. - If unset or false, no such validation will be performed. - May also be set in PodSecurityContext. If set in both SecurityContext and - PodSecurityContext, the value specified in SecurityContext takes precedence. - type: boolean - runAsUser: - description: |- - The UID to run the entrypoint of the container process. - Defaults to user specified in image metadata if unspecified. - May also be set in PodSecurityContext. If set in both SecurityContext and - PodSecurityContext, the value specified in SecurityContext takes precedence. 
- Note that this field cannot be set when spec.os.name is windows. - type: integer - format: int64 - seccompProfile: - description: |- - The seccomp options to use by this container. If seccomp options are - provided at both the pod & container level, the container options - override the pod options. - Note that this field cannot be set when spec.os.name is windows. - type: object - required: - - type - properties: - localhostProfile: - description: |- - localhostProfile indicates a profile defined in a file on the node should be used. - The profile must be preconfigured on the node to work. - Must be a descending path, relative to the kubelet's configured seccomp profile location. - Must be set if type is "Localhost". Must NOT be set for any other type. - type: string - type: - description: |- - type indicates which kind of seccomp profile will be applied. - Valid options are: - - - Localhost - a profile defined in a file on the node should be used. - RuntimeDefault - the container runtime default profile should be used. - Unconfined - no profile should be applied. - type: string - startupProbe: - description: |- - StartupProbe indicates that the Pod has successfully initialized. - If specified, no other probes are executed until this completes successfully. - If this probe fails, the Pod will be restarted, just as if the livenessProbe failed. - This can be used to provide different probe parameters at the beginning of a Pod's lifecycle, - when it might take a long time to load data or warm a cache, than during steady-state operation. - This cannot be updated. - More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes - type: object - properties: - exec: - description: Exec specifies the action to take. - type: object - properties: - command: - description: |- - Command is the command line to execute inside the container, the working directory for the - command is root ('/') in the container's filesystem. 
The command is simply exec'd, it is - not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use - a shell, you need to explicitly call out to that shell. - Exit status of 0 is treated as live/healthy and non-zero is unhealthy. - type: array - items: - type: string - x-kubernetes-list-type: atomic - failureThreshold: - description: |- - Minimum consecutive failures for the probe to be considered failed after having succeeded. - Defaults to 3. Minimum value is 1. - type: integer - format: int32 - grpc: - description: GRPC specifies an action involving a GRPC port. - type: object - required: - - port - properties: - port: - description: Port number of the gRPC service. Number must be in the range 1 to 65535. - type: integer - format: int32 - service: - description: |- - Service is the name of the service to place in the gRPC HealthCheckRequest - (see https://github.com/grpc/grpc/blob/master/doc/health-checking.md). - - - If this is not specified, the default behavior is defined by gRPC. - type: string - httpGet: - description: HTTPGet specifies the http request to perform. - type: object - properties: - host: - description: |- - Host name to connect to, defaults to the pod IP. You probably want to set - "Host" in httpHeaders instead. - type: string - httpHeaders: - description: Custom headers to set in the request. HTTP allows repeated headers. - type: array - items: - description: HTTPHeader describes a custom header to be used in HTTP probes - type: object - required: - - name - - value - properties: - name: - description: |- - The header field name. - This will be canonicalized upon output, so case-variant names will be understood as the same header. - type: string - value: - description: The header field value - type: string - x-kubernetes-list-type: atomic - path: - description: Path to access on the HTTP server. - type: string - port: - description: |- - Name or number of the port to access on the container. 
- Number must be in the range 1 to 65535. - Name must be an IANA_SVC_NAME. - anyOf: - - type: integer - - type: string - x-kubernetes-int-or-string: true - scheme: - description: |- - Scheme to use for connecting to the host. - Defaults to HTTP. - type: string - initialDelaySeconds: - description: |- - Number of seconds after the container has started before liveness probes are initiated. - More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes - type: integer - format: int32 - periodSeconds: - description: How often (in seconds) to perform the probe. - type: integer - format: int32 - successThreshold: - description: |- - Minimum consecutive successes for the probe to be considered successful after having failed. - Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1. - type: integer - format: int32 - tcpSocket: - description: TCPSocket specifies an action involving a TCP port. - type: object - properties: - host: - description: 'Optional: Host name to connect to, defaults to the pod IP.' - type: string - port: - description: |- - Number or name of the port to access on the container. - Number must be in the range 1 to 65535. - Name must be an IANA_SVC_NAME. - anyOf: - - type: integer - - type: string - x-kubernetes-int-or-string: true - timeoutSeconds: - description: |- - Number of seconds after which the probe times out. - Defaults to 1 second. Minimum value is 1. - More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes - type: integer - format: int32 - terminationMessagePath: - description: |- - Optional: Path at which the file to which the container's termination message - will be written is mounted into the container's filesystem. - Message written is intended to be brief final status, such as an assertion failure message. - Will be truncated by the node if greater than 4096 bytes. The total message length across - all containers will be limited to 12kb. 
- Defaults to /dev/termination-log. - Cannot be updated. - type: string - terminationMessagePolicy: - description: |- - Indicate how the termination message should be populated. File will use the contents of - terminationMessagePath to populate the container status message on both success and failure. - FallbackToLogsOnError will use the last chunk of container log output if the termination - message file is empty and the container exited with an error. - The log output is limited to 2048 bytes or 80 lines, whichever is smaller. - Defaults to File. - Cannot be updated. - type: string - volumeMounts: - description: |- - Pod volumes to mount into the container's filesystem. - Cannot be updated. - type: array - items: - description: VolumeMount describes a mounting of a Volume within a container. - type: object - required: - - mountPath - - name - properties: - mountPath: - description: |- - Path within the container at which the volume should be mounted. Must - not contain ':'. - type: string - name: - description: This must match the Name of a Volume. - type: string - readOnly: - description: |- - Mounted read-only if true, read-write otherwise (false or unspecified). - Defaults to false. - type: boolean - subPath: - description: |- - Path within the volume from which the container's volume should be mounted. - Defaults to "" (volume's root). - type: string - x-kubernetes-list-map-keys: - - mountPath - x-kubernetes-list-type: map - workingDir: - description: |- - Container's working directory. - If not specified, the container runtime's default will be used, which - might be configured in the container image. - Cannot be updated. 
- type: string - dnsConfig: - description: This is accessible behind a feature flag - kubernetes.podspec-dnsconfig - type: object - x-kubernetes-preserve-unknown-fields: true - dnsPolicy: - description: This is accessible behind a feature flag - kubernetes.podspec-dnspolicy - type: string - enableServiceLinks: - description: 'EnableServiceLinks indicates whether information about services should be injected into pod''s environment variables, matching the syntax of Docker links. Optional: Knative defaults this to false.' - type: boolean - hostAliases: - description: This is accessible behind a feature flag - kubernetes.podspec-hostaliases - type: array - items: - description: This is accessible behind a feature flag - kubernetes.podspec-hostaliases - type: object - x-kubernetes-preserve-unknown-fields: true - hostIPC: - description: This is accessible behind a feature flag - kubernetes.podspec-hostipc - type: boolean - x-kubernetes-preserve-unknown-fields: true - hostNetwork: - description: This is accessible behind a feature flag - kubernetes.podspec-hostnetwork - type: boolean - x-kubernetes-preserve-unknown-fields: true - hostPID: - description: This is accessible behind a feature flag - kubernetes.podspec-hostpid - type: boolean - x-kubernetes-preserve-unknown-fields: true - idleTimeoutSeconds: - description: |- - IdleTimeoutSeconds is the maximum duration in seconds a request will be allowed - to stay open while not receiving any bytes from the user's application. If - unspecified, a system default will be provided. - type: integer - format: int64 - imagePullSecrets: - description: |- - ImagePullSecrets is an optional list of references to secrets in the same namespace to use for pulling any of the images used by this PodSpec. - If specified, these secrets will be passed to individual puller implementations for them to use. 
- More info: https://kubernetes.io/docs/concepts/containers/images#specifying-imagepullsecrets-on-a-pod - type: array - items: - description: |- - LocalObjectReference contains enough information to let you locate the - referenced object inside the same namespace. - type: object - properties: - name: - description: |- - Name of the referent. - This field is effectively required, but due to backwards compatibility is - allowed to be empty. Instances of this type with an empty value here are - almost certainly wrong. - TODO: Add other useful fields. apiVersion, kind, uid? - More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Drop `kubebuilder:default` when controller-gen doesn't need it https://github.com/kubernetes-sigs/kubebuilder/issues/3896. - type: string - default: "" - x-kubernetes-map-type: atomic - x-kubernetes-list-map-keys: - - name - x-kubernetes-list-type: map - initContainers: - description: |- - List of initialization containers belonging to the pod. - Init containers are executed in order prior to containers being started. If any - init container fails, the pod is considered to have failed and is handled according - to its restartPolicy. The name for an init container or normal container must be - unique among all containers. - Init containers may not have Lifecycle actions, Readiness probes, Liveness probes, or Startup probes. - The resourceRequirements of an init container are taken into account during scheduling - by finding the highest request/limit for each resource type, and then using the max of - of that value or the sum of the normal containers. Limits are applied to init containers - in a similar fashion. - Init containers cannot currently be added or removed. - Cannot be updated. 
- More info: https://kubernetes.io/docs/concepts/workloads/pods/init-containers/ - type: array - items: - description: This is accessible behind a feature flag - kubernetes.podspec-init-containers - type: object - x-kubernetes-preserve-unknown-fields: true - nodeSelector: - description: This is accessible behind a feature flag - kubernetes.podspec-nodeselector - type: object - x-kubernetes-preserve-unknown-fields: true - x-kubernetes-map-type: atomic - priorityClassName: - description: This is accessible behind a feature flag - kubernetes.podspec-priorityclassname - type: string - x-kubernetes-preserve-unknown-fields: true - responseStartTimeoutSeconds: - description: |- - ResponseStartTimeoutSeconds is the maximum duration in seconds that the request - routing layer will wait for a request delivered to a container to begin - sending any network traffic. - type: integer - format: int64 - runtimeClassName: - description: This is accessible behind a feature flag - kubernetes.podspec-runtimeclassname - type: string - x-kubernetes-preserve-unknown-fields: true - schedulerName: - description: This is accessible behind a feature flag - kubernetes.podspec-schedulername - type: string - x-kubernetes-preserve-unknown-fields: true - securityContext: - description: This is accessible behind a feature flag - kubernetes.podspec-securitycontext - type: object - x-kubernetes-preserve-unknown-fields: true - serviceAccountName: - description: |- - ServiceAccountName is the name of the ServiceAccount to use to run this pod. - More info: https://kubernetes.io/docs/tasks/configure-pod-container/configure-service-account/ - type: string - shareProcessNamespace: - description: This is accessible behind a feature flag - kubernetes.podspec-shareproccessnamespace - type: boolean - x-kubernetes-preserve-unknown-fields: true - timeoutSeconds: - description: |- - TimeoutSeconds is the maximum duration in seconds that the request instance - is allowed to respond to a request. 
If unspecified, a system default will - be provided. - type: integer - format: int64 - tolerations: - description: This is accessible behind a feature flag - kubernetes.podspec-tolerations - type: array - items: - description: This is accessible behind a feature flag - kubernetes.podspec-tolerations - type: object - x-kubernetes-preserve-unknown-fields: true - x-kubernetes-list-type: atomic - topologySpreadConstraints: - description: This is accessible behind a feature flag - kubernetes.podspec-topologyspreadconstraints - type: array - items: - description: This is accessible behind a feature flag - kubernetes.podspec-topologyspreadconstraints - type: object - x-kubernetes-preserve-unknown-fields: true - volumes: - description: |- - List of volumes that can be mounted by containers belonging to the pod. - More info: https://kubernetes.io/docs/concepts/storage/volumes - type: array - items: - description: Volume represents a named volume in a pod that may be accessed by any container in the pod. - type: object - required: - - name - properties: - configMap: - description: configMap represents a configMap that should populate this volume - type: object - properties: - defaultMode: - description: |- - defaultMode is optional: mode bits used to set permissions on created files by default. - Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511. - YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. - Defaults to 0644. - Directories within the path are not affected by this setting. - This might be in conflict with other options that affect the file - mode, like fsGroup, and the result can be other mode bits set. - type: integer - format: int32 - items: - description: |- - items if unspecified, each key-value pair in the Data field of the referenced - ConfigMap will be projected into the volume as a file whose name is the - key and content is the value. 
If specified, the listed keys will be - projected into the specified paths, and unlisted keys will not be - present. If a key is specified which is not present in the ConfigMap, - the volume setup will error unless it is marked optional. Paths must be - relative and may not contain the '..' path or start with '..'. - type: array - items: - description: Maps a string key to a path within a volume. - type: object - required: - - key - - path - properties: - key: - description: key is the key to project. - type: string - mode: - description: |- - mode is Optional: mode bits used to set permissions on this file. - Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511. - YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. - If not specified, the volume defaultMode will be used. - This might be in conflict with other options that affect the file - mode, like fsGroup, and the result can be other mode bits set. - type: integer - format: int32 - path: - description: |- - path is the relative path of the file to map the key to. - May not be an absolute path. - May not contain the path element '..'. - May not start with the string '..'. - type: string - x-kubernetes-list-type: atomic - name: - description: |- - Name of the referent. - This field is effectively required, but due to backwards compatibility is - allowed to be empty. Instances of this type with an empty value here are - almost certainly wrong. - TODO: Add other useful fields. apiVersion, kind, uid? - More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Drop `kubebuilder:default` when controller-gen doesn't need it https://github.com/kubernetes-sigs/kubebuilder/issues/3896. 
- type: string - default: "" - optional: - description: optional specify whether the ConfigMap or its keys must be defined - type: boolean - x-kubernetes-map-type: atomic - emptyDir: - description: This is accessible behind a feature flag - kubernetes.podspec-emptydir - type: object - x-kubernetes-preserve-unknown-fields: true - name: - description: |- - name of the volume. - Must be a DNS_LABEL and unique within the pod. - More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - type: string - persistentVolumeClaim: - description: This is accessible behind a feature flag - kubernetes.podspec-persistent-volume-claim - type: object - x-kubernetes-preserve-unknown-fields: true - projected: - description: projected items for all in one resources secrets, configmaps, and downward API - type: object - properties: - defaultMode: - description: |- - defaultMode are the mode bits used to set permissions on created files by default. - Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511. - YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. - Directories within the path are not affected by this setting. - This might be in conflict with other options that affect the file - mode, like fsGroup, and the result can be other mode bits set. - type: integer - format: int32 - sources: - description: sources is the list of volume projections - type: array - items: - description: Projection that may be projected along with other supported volume types - type: object - properties: - configMap: - description: configMap information about the configMap data to project - type: object - properties: - items: - description: |- - items if unspecified, each key-value pair in the Data field of the referenced - ConfigMap will be projected into the volume as a file whose name is the - key and content is the value. 
If specified, the listed keys will be - projected into the specified paths, and unlisted keys will not be - present. If a key is specified which is not present in the ConfigMap, - the volume setup will error unless it is marked optional. Paths must be - relative and may not contain the '..' path or start with '..'. - type: array - items: - description: Maps a string key to a path within a volume. - type: object - required: - - key - - path - properties: - key: - description: key is the key to project. - type: string - mode: - description: |- - mode is Optional: mode bits used to set permissions on this file. - Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511. - YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. - If not specified, the volume defaultMode will be used. - This might be in conflict with other options that affect the file - mode, like fsGroup, and the result can be other mode bits set. - type: integer - format: int32 - path: - description: |- - path is the relative path of the file to map the key to. - May not be an absolute path. - May not contain the path element '..'. - May not start with the string '..'. - type: string - x-kubernetes-list-type: atomic - name: - description: |- - Name of the referent. - This field is effectively required, but due to backwards compatibility is - allowed to be empty. Instances of this type with an empty value here are - almost certainly wrong. - TODO: Add other useful fields. apiVersion, kind, uid? - More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Drop `kubebuilder:default` when controller-gen doesn't need it https://github.com/kubernetes-sigs/kubebuilder/issues/3896. 
- type: string - default: "" - optional: - description: optional specify whether the ConfigMap or its keys must be defined - type: boolean - x-kubernetes-map-type: atomic - downwardAPI: - description: downwardAPI information about the downwardAPI data to project - type: object - properties: - items: - description: Items is a list of DownwardAPIVolume file - type: array - items: - description: DownwardAPIVolumeFile represents information to create the file containing the pod field - type: object - required: - - path - properties: - fieldRef: - description: 'Required: Selects a field of the pod: only annotations, labels, name, namespace and uid are supported.' - type: object - required: - - fieldPath - properties: - apiVersion: - description: Version of the schema the FieldPath is written in terms of, defaults to "v1". - type: string - fieldPath: - description: Path of the field to select in the specified API version. - type: string - x-kubernetes-map-type: atomic - mode: - description: |- - Optional: mode bits used to set permissions on this file, must be an octal value - between 0000 and 0777 or a decimal value between 0 and 511. - YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. - If not specified, the volume defaultMode will be used. - This might be in conflict with other options that affect the file - mode, like fsGroup, and the result can be other mode bits set. - type: integer - format: int32 - path: - description: 'Required: Path is the relative path name of the file to be created. Must not be absolute or contain the ''..'' path. Must be utf-8 encoded. The first item of the relative path must not start with ''..''' - type: string - resourceFieldRef: - description: |- - Selects a resource of the container: only resources limits and requests - (limits.cpu, limits.memory, requests.cpu and requests.memory) are currently supported. 
- type: object - required: - - resource - properties: - containerName: - description: 'Container name: required for volumes, optional for env vars' - type: string - divisor: - description: Specifies the output format of the exposed resources, defaults to "1" - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - anyOf: - - type: integer - - type: string - x-kubernetes-int-or-string: true - resource: - description: 'Required: resource to select' - type: string - x-kubernetes-map-type: atomic - x-kubernetes-list-type: atomic - secret: - description: secret information about the secret data to project - type: object - properties: - items: - description: |- - items if unspecified, each key-value pair in the Data field of the referenced - Secret will be projected into the volume as a file whose name is the - key and content is the value. If specified, the listed keys will be - projected into the specified paths, and unlisted keys will not be - present. If a key is specified which is not present in the Secret, - the volume setup will error unless it is marked optional. Paths must be - relative and may not contain the '..' path or start with '..'. - type: array - items: - description: Maps a string key to a path within a volume. - type: object - required: - - key - - path - properties: - key: - description: key is the key to project. - type: string - mode: - description: |- - mode is Optional: mode bits used to set permissions on this file. - Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511. - YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. - If not specified, the volume defaultMode will be used. - This might be in conflict with other options that affect the file - mode, like fsGroup, and the result can be other mode bits set. 
- type: integer - format: int32 - path: - description: |- - path is the relative path of the file to map the key to. - May not be an absolute path. - May not contain the path element '..'. - May not start with the string '..'. - type: string - x-kubernetes-list-type: atomic - name: - description: |- - Name of the referent. - This field is effectively required, but due to backwards compatibility is - allowed to be empty. Instances of this type with an empty value here are - almost certainly wrong. - TODO: Add other useful fields. apiVersion, kind, uid? - More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Drop `kubebuilder:default` when controller-gen doesn't need it https://github.com/kubernetes-sigs/kubebuilder/issues/3896. - type: string - default: "" - optional: - description: optional field specify whether the Secret or its key must be defined - type: boolean - x-kubernetes-map-type: atomic - serviceAccountToken: - description: serviceAccountToken is information about the serviceAccountToken data to project - type: object - required: - - path - properties: - audience: - description: |- - audience is the intended audience of the token. A recipient of a token - must identify itself with an identifier specified in the audience of the - token, and otherwise should reject the token. The audience defaults to the - identifier of the apiserver. - type: string - expirationSeconds: - description: |- - expirationSeconds is the requested duration of validity of the service - account token. As the token approaches expiration, the kubelet volume - plugin will proactively rotate the service account token. The kubelet will - start trying to rotate the token if the token is older than 80 percent of - its time to live or if the token is older than 24 hours.Defaults to 1 hour - and must be at least 10 minutes. 
- type: integer - format: int64 - path: - description: |- - path is the path relative to the mount point of the file to project the - token into. - type: string - x-kubernetes-list-type: atomic - secret: - description: |- - secret represents a secret that should populate this volume. - More info: https://kubernetes.io/docs/concepts/storage/volumes#secret - type: object - properties: - defaultMode: - description: |- - defaultMode is Optional: mode bits used to set permissions on created files by default. - Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511. - YAML accepts both octal and decimal values, JSON requires decimal values - for mode bits. Defaults to 0644. - Directories within the path are not affected by this setting. - This might be in conflict with other options that affect the file - mode, like fsGroup, and the result can be other mode bits set. - type: integer - format: int32 - items: - description: |- - items If unspecified, each key-value pair in the Data field of the referenced - Secret will be projected into the volume as a file whose name is the - key and content is the value. If specified, the listed keys will be - projected into the specified paths, and unlisted keys will not be - present. If a key is specified which is not present in the Secret, - the volume setup will error unless it is marked optional. Paths must be - relative and may not contain the '..' path or start with '..'. - type: array - items: - description: Maps a string key to a path within a volume. - type: object - required: - - key - - path - properties: - key: - description: key is the key to project. - type: string - mode: - description: |- - mode is Optional: mode bits used to set permissions on this file. - Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511. - YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. - If not specified, the volume defaultMode will be used. 
- This might be in conflict with other options that affect the file - mode, like fsGroup, and the result can be other mode bits set. - type: integer - format: int32 - path: - description: |- - path is the relative path of the file to map the key to. - May not be an absolute path. - May not contain the path element '..'. - May not start with the string '..'. - type: string - x-kubernetes-list-type: atomic - optional: - description: optional field specify whether the Secret or its keys must be defined - type: boolean - secretName: - description: |- - secretName is the name of the secret in the pod's namespace to use. - More info: https://kubernetes.io/docs/concepts/storage/volumes#secret - type: string - x-kubernetes-list-map-keys: - - name - x-kubernetes-list-type: map - traffic: - description: |- - Traffic specifies how to distribute traffic over a collection of - revisions and configurations. - type: array - items: - description: TrafficTarget holds a single entry of the routing table for a Route. - type: object - properties: - configurationName: - description: |- - ConfigurationName of a configuration to whose latest revision we will send - this portion of traffic. When the "status.latestReadyRevisionName" of the - referenced configuration changes, we will automatically migrate traffic - from the prior "latest ready" revision to the new one. This field is never - set in Route's status, only its spec. This is mutually exclusive with - RevisionName. - type: string - latestRevision: - description: |- - LatestRevision may be optionally provided to indicate that the latest - ready Revision of the Configuration should be used for this traffic - target. When provided LatestRevision must be true if RevisionName is - empty; it must be false when RevisionName is non-empty. 
- type: boolean - percent: - description: |- - Percent indicates that percentage based routing should be used and - the value indicates the percent of traffic that is be routed to this - Revision or Configuration. `0` (zero) mean no traffic, `100` means all - traffic. - When percentage based routing is being used the follow rules apply: - - the sum of all percent values must equal 100 - - when not specified, the implied value for `percent` is zero for - that particular Revision or Configuration - type: integer - format: int64 - revisionName: - description: |- - RevisionName of a specific revision to which to send this portion of - traffic. This is mutually exclusive with ConfigurationName. - type: string - tag: - description: |- - Tag is optionally used to expose a dedicated url for referencing - this target exclusively. - type: string - url: - description: |- - URL displays the URL for accessing named traffic targets. URL is displayed in - status, and is disallowed on spec. URL must contain a scheme (e.g. http://) and - a hostname, but may not contain anything else (e.g. basic auth, url path, etc.) - type: string - status: - description: ServiceStatus represents the Status stanza of the Service resource. - type: object - properties: - address: - description: Address holds the information needed for a Route to be the target of an event. - type: object - properties: - CACerts: - description: |- - CACerts is the Certification Authority (CA) certificates in PEM format - according to https://www.rfc-editor.org/rfc/rfc7468. - type: string - audience: - description: Audience is the OIDC audience for this address. - type: string - name: - description: Name is the name of the address. - type: string - url: - type: string - annotations: - description: |- - Annotations is additional Status fields for the Resource to save some - additional State as well as convey more information to the user. 
This is - roughly akin to Annotations on any k8s resource, just the reconciler conveying - richer information outwards. - type: object - additionalProperties: - type: string - conditions: - description: Conditions the latest available observations of a resource's current state. - type: array - items: - description: |- - Condition defines a readiness condition for a Knative resource. - See: https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/api-conventions.md#typical-status-properties - type: object - required: - - status - - type - properties: - lastTransitionTime: - description: |- - LastTransitionTime is the last time the condition transitioned from one status to another. - We use VolatileTime in place of metav1.Time to exclude this from creating equality.Semantic - differences (all other things held constant). - type: string - message: - description: A human readable message indicating details about the transition. - type: string - reason: - description: The reason for the condition's last transition. - type: string - severity: - description: |- - Severity with which to treat failures of this type of condition. - When this is not specified, it defaults to Error. - type: string - status: - description: Status of the condition, one of True, False, Unknown. - type: string - type: - description: Type of condition. - type: string - latestCreatedRevisionName: - description: |- - LatestCreatedRevisionName is the last revision that was created from this - Configuration. It might not be ready yet, for that use LatestReadyRevisionName. - type: string - latestReadyRevisionName: - description: |- - LatestReadyRevisionName holds the name of the latest Revision stamped out - from this Configuration that has had its "Ready" condition become "True". - type: string - observedGeneration: - description: |- - ObservedGeneration is the 'Generation' of the Service that - was last processed by the controller. 
- type: integer - format: int64 - traffic: - description: |- - Traffic holds the configured traffic distribution. - These entries will always contain RevisionName references. - When ConfigurationName appears in the spec, this will hold the - LatestReadyRevisionName that we last observed. - type: array - items: - description: TrafficTarget holds a single entry of the routing table for a Route. - type: object - properties: - configurationName: - description: |- - ConfigurationName of a configuration to whose latest revision we will send - this portion of traffic. When the "status.latestReadyRevisionName" of the - referenced configuration changes, we will automatically migrate traffic - from the prior "latest ready" revision to the new one. This field is never - set in Route's status, only its spec. This is mutually exclusive with - RevisionName. - type: string - latestRevision: - description: |- - LatestRevision may be optionally provided to indicate that the latest - ready Revision of the Configuration should be used for this traffic - target. When provided LatestRevision must be true if RevisionName is - empty; it must be false when RevisionName is non-empty. - type: boolean - percent: - description: |- - Percent indicates that percentage based routing should be used and - the value indicates the percent of traffic that is be routed to this - Revision or Configuration. `0` (zero) mean no traffic, `100` means all - traffic. - When percentage based routing is being used the follow rules apply: - - the sum of all percent values must equal 100 - - when not specified, the implied value for `percent` is zero for - that particular Revision or Configuration - type: integer - format: int64 - revisionName: - description: |- - RevisionName of a specific revision to which to send this portion of - traffic. This is mutually exclusive with ConfigurationName. 
- type: string - tag: - description: |- - Tag is optionally used to expose a dedicated url for referencing - this target exclusively. - type: string - url: - description: |- - URL displays the URL for accessing named traffic targets. URL is displayed in - status, and is disallowed on spec. URL must contain a scheme (e.g. http://) and - a hostname, but may not contain anything else (e.g. basic auth, url path, etc.) - type: string - url: - description: |- - URL holds the url that will distribute traffic over the provided traffic targets. - It generally has the form http[s]://{route-name}.{route-namespace}.{cluster-level-suffix} - type: string ---- -# Source: knative-operator/templates/knative-crds.yaml -# Copyright 2018 The Knative Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -apiVersion: apiextensions.k8s.io/v1 -kind: CustomResourceDefinition -metadata: - name: images.caching.internal.knative.dev - labels: - app.kubernetes.io/name: knative-serving - app.kubernetes.io/version: "1.16.0" - knative.dev/crd-install: "true" -spec: - group: caching.internal.knative.dev - names: - kind: Image - plural: images - singular: image - categories: - - knative-internal - - caching - scope: Namespaced - versions: - - name: v1alpha1 - served: true - storage: true - subresources: - status: {} - schema: - openAPIV3Schema: - description: |- - Image is a Knative abstraction that encapsulates the interface by which Knative - components express a desire to have a particular image cached. - type: object - properties: - apiVersion: - description: |- - APIVersion defines the versioned schema of this representation of an object. - Servers should convert recognized schemas to the latest internal value, and - may reject unrecognized values. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources - type: string - kind: - description: |- - Kind is a string value representing the REST resource this object represents. - Servers may infer this from the endpoint the client submits requests to. - Cannot be updated. - In CamelCase. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds - type: string - metadata: - type: object - spec: - description: Spec holds the desired state of the Image (from the client). - type: object - required: - - image - properties: - image: - description: Image is the name of the container image url to cache across the cluster. - type: string - imagePullSecrets: - description: |- - ImagePullSecrets contains the names of the Kubernetes Secrets containing login - information used by the Pods which will run this container. 
- type: array - items: - description: |- - LocalObjectReference contains enough information to let you locate the - referenced object inside the same namespace. - type: object - properties: - name: - description: |- - Name of the referent. - This field is effectively required, but due to backwards compatibility is - allowed to be empty. Instances of this type with an empty value here are - almost certainly wrong. - TODO: Add other useful fields. apiVersion, kind, uid? - More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Drop `kubebuilder:default` when controller-gen doesn't need it https://github.com/kubernetes-sigs/kubebuilder/issues/3896. - type: string - default: "" - x-kubernetes-map-type: atomic - serviceAccountName: - description: |- - ServiceAccountName is the name of the Kubernetes ServiceAccount as which the Pods - will run this container. This is potentially used to authenticate the image pull - if the service account has attached pull secrets. For more information: - https://kubernetes.io/docs/tasks/configure-pod-container/configure-service-account/#add-imagepullsecrets-to-a-service-account - type: string - status: - description: Status communicates the observed state of the Image (from the controller). - type: object - properties: - annotations: - description: |- - Annotations is additional Status fields for the Resource to save some - additional State as well as convey more information to the user. This is - roughly akin to Annotations on any k8s resource, just the reconciler conveying - richer information outwards. - type: object - additionalProperties: - type: string - conditions: - description: Conditions the latest available observations of a resource's current state. - type: array - items: - description: |- - Condition defines a readiness condition for a Knative resource. 
- See: https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/api-conventions.md#typical-status-properties - type: object - required: - - status - - type - properties: - lastTransitionTime: - description: |- - LastTransitionTime is the last time the condition transitioned from one status to another. - We use VolatileTime in place of metav1.Time to exclude this from creating equality.Semantic - differences (all other things held constant). - type: string - message: - description: A human readable message indicating details about the transition. - type: string - reason: - description: The reason for the condition's last transition. - type: string - severity: - description: |- - Severity with which to treat failures of this type of condition. - When this is not specified, it defaults to Error. - type: string - status: - description: Status of the condition, one of True, False, Unknown. - type: string - type: - description: Type of condition. - type: string - observedGeneration: - description: |- - ObservedGeneration is the 'Generation' of the Service that - was last processed by the controller. - type: integer - format: int64 - additionalPrinterColumns: - - name: Image - type: string - jsonPath: .spec.image ---- -# Source: knative-operator/templates/knative-crds.yaml -# Copyright 2021 The Knative Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -apiVersion: apiextensions.k8s.io/v1 -kind: CustomResourceDefinition -metadata: - name: knativeeventings.operator.knative.dev - labels: - app.kubernetes.io/version: "1.16.0" - app.kubernetes.io/name: knative-operator -spec: - group: operator.knative.dev - versions: - - name: v1beta1 - served: true - storage: true - subresources: - status: {} - schema: - openAPIV3Schema: - description: Schema for the knativeeventings API - properties: - apiVersion: - description: 'APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources' - type: string - kind: - description: 'Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds' - type: string - metadata: - type: object - spec: - description: Spec defines the desired state of KnativeEventing - properties: - additionalManifests: - description: A list of the additional eventing manifests, which will be installed by the operator - items: - properties: - URL: - description: The link of the additional manifest URL - type: string - type: object - type: array - config: - additionalProperties: - additionalProperties: - type: string - type: object - description: A means to override the corresponding entries in the upstream configmaps - type: object - defaultBrokerClass: - description: The default broker type to use for the brokers Knative creates. If no value is provided, MTChannelBasedBroker will be used. 
- type: string - high-availability: - description: Allows specification of HA control plane - properties: - replicas: - description: The number of replicas that HA parts of the control plane will be scaled to - minimum: 0 - type: integer - type: object - workloads: - description: A mapping of deployment or statefulset name to override - type: array - items: - type: object - properties: - name: - description: The name of the deployment - type: string - labels: - additionalProperties: - type: string - description: Labels overrides labels for the deployment and its template. - type: object - livenessProbes: - description: LivenessProbes overrides liveness probes for the containers. - items: - description: ProbesRequirementsOverride enables the user to override any container's env vars. - properties: - container: - description: The container name - type: string - failureThreshold: - description: Minimum consecutive failures for the probe to be considered failed after having succeeded. Defaults to 3. Minimum value is 1. - format: int32 - type: integer - initialDelaySeconds: - description: 'Number of seconds after the container has started before liveness probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' - format: int32 - type: integer - periodSeconds: - description: How often (in seconds) to perform the probe. Default to 10 seconds. Minimum value is 1. - format: int32 - type: integer - successThreshold: - description: Minimum consecutive successes for the probe to be considered successful after having failed. Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1. - format: int32 - type: integer - terminationGracePeriodSeconds: - description: Optional duration in seconds the pod needs to terminate gracefully upon probe failure. 
The grace period is the duration in seconds after the processes running in the pod are sent a termination signal and the time when the processes are forcibly halted with a kill signal. Set this value longer than the expected cleanup time for your process. If this value is nil, the pod's terminationGracePeriodSeconds will be used. Otherwise, this value overrides the value provided by the pod spec. Value must be non-negative integer. The value zero indicates stop immediately via the kill signal (no opportunity to shut down). This is a beta field and requires enabling ProbeTerminationGracePeriod feature gate. Minimum value is 1. spec.terminationGracePeriodSeconds is used if unset. - format: int64 - type: integer - timeoutSeconds: - description: 'Number of seconds after which the probe times out. Defaults to 1 second. Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' - format: int32 - type: integer - required: - - container - type: object - type: array - annotations: - additionalProperties: - type: string - description: Annotations overrides labels for the deployment and its template. - type: object - env: - description: Env overrides env vars for the containers. - items: - properties: - container: - description: The container name - type: string - envVars: - description: The desired EnvVarRequirements - items: - description: EnvVar represents an environment variable present in a Container. - properties: - name: - description: Name of the environment variable. Must be a C_IDENTIFIER. - type: string - value: - description: 'Variable references $(VAR_NAME) are expanded using the previously defined environment variables in the container and any service environment variables. If a variable cannot be resolved, the reference in the input string will be unchanged. Double $$ are reduced to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. 
"$$(VAR_NAME)" will produce the string literal "$(VAR_NAME)". Escaped references will never be expanded, regardless of whether the variable exists or not. Defaults to "".' - type: string - valueFrom: - description: Source for the environment variable's value. Cannot be used if value is not empty. - properties: - configMapKeyRef: - description: Selects a key of a ConfigMap. - properties: - key: - description: The key to select. - type: string - name: - description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' - type: string - optional: - description: Specify whether the ConfigMap or its key must be defined - type: boolean - required: - - key - type: object - fieldRef: - description: 'Selects a field of the pod: supports metadata.name, metadata.namespace, `metadata.labels['''']`, `metadata.annotations['''']`, spec.nodeName, spec.serviceAccountName, status.hostIP, status.podIP, status.podIPs.' - properties: - apiVersion: - description: Version of the schema the FieldPath is written in terms of, defaults to "v1". - type: string - fieldPath: - description: Path of the field to select in the specified API version. - type: string - required: - - fieldPath - type: object - resourceFieldRef: - description: 'Selects a resource of the container: only resources limits and requests (limits.cpu, limits.memory, limits.ephemeral-storage, requests.cpu, requests.memory and requests.ephemeral-storage) are currently supported.' 
- properties: - containerName: - description: 'Container name: required for volumes, optional for env vars' - type: string - divisor: - anyOf: - - type: integer - - type: string - description: Specifies the output format of the exposed resources, defaults to "1" - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - resource: - description: 'Required: resource to select' - type: string - required: - - resource - type: object - secretKeyRef: - description: Selects a key of a secret in the pod's namespace - properties: - key: - description: The key of the secret to select from. Must be a valid secret key. - type: string - name: - description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' - type: string - optional: - description: Specify whether the Secret or its key must be defined - type: boolean - required: - - key - type: object - type: object - required: - - name - type: object - type: array - required: - - container - type: object - type: array - replicas: - description: The number of replicas that HA parts of the control plane will be scaled to - type: integer - minimum: 0 - nodeSelector: - additionalProperties: - type: string - description: NodeSelector overrides nodeSelector for the deployment. - type: object - readinessProbes: - description: ReadinessProbes overrides readiness probes for the containers. - items: - description: ProbesRequirementsOverride enables the user to override any container's env vars. - properties: - container: - description: The container name - type: string - failureThreshold: - description: Minimum consecutive failures for the probe to be considered failed after having succeeded. Defaults to 3. Minimum value is 1. 
- format: int32 - type: integer - initialDelaySeconds: - description: 'Number of seconds after the container has started before liveness probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' - format: int32 - type: integer - periodSeconds: - description: How often (in seconds) to perform the probe. Default to 10 seconds. Minimum value is 1. - format: int32 - type: integer - successThreshold: - description: Minimum consecutive successes for the probe to be considered successful after having failed. Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1. - format: int32 - type: integer - terminationGracePeriodSeconds: - description: Optional duration in seconds the pod needs to terminate gracefully upon probe failure. The grace period is the duration in seconds after the processes running in the pod are sent a termination signal and the time when the processes are forcibly halted with a kill signal. Set this value longer than the expected cleanup time for your process. If this value is nil, the pod's terminationGracePeriodSeconds will be used. Otherwise, this value overrides the value provided by the pod spec. Value must be non-negative integer. The value zero indicates stop immediately via the kill signal (no opportunity to shut down). This is a beta field and requires enabling ProbeTerminationGracePeriod feature gate. Minimum value is 1. spec.terminationGracePeriodSeconds is used if unset. - format: int64 - type: integer - timeoutSeconds: - description: 'Number of seconds after which the probe times out. Defaults to 1 second. Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' - format: int32 - type: integer - required: - - container - type: object - type: array - tolerations: - description: If specified, the pod's tolerations. 
- items: - description: The pod this Toleration is attached to tolerates any taint that matches the triple using the matching operator . - properties: - effect: - description: Effect indicates the taint effect to match. Empty means match all taint effects. When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute. - type: string - key: - description: Key is the taint key that the toleration applies to. Empty means match all taint keys. If the key is empty, operator must be Exists; this combination means to match all values and all keys. - type: string - operator: - description: Operator represents a key's relationship to the value. Valid operators are Exists and Equal. Defaults to Equal. Exists is equivalent to wildcard for value, so that a pod can tolerate all taints of a particular category. - type: string - tolerationSeconds: - description: TolerationSeconds represents the period of time the toleration (which must be of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default, it is not set, which means tolerate the taint forever (do not evict). Zero and negative values will be treated as 0 (evict immediately) by the system. - format: int64 - type: integer - value: - description: Value is the taint value the toleration matches to. If the operator is Exists, the value should be empty, otherwise just a regular string. - type: string - type: object - type: array - hostNetwork: - description: Use the host's network namespace if true. Make sure to understand the security implications if you want to enable it. When hostNetwork is enabled, this will set dnsPolicy to ClusterFirstWithHostNet automatically. - type: boolean - topologySpreadConstraints: - description: If specified, the pod's topology spread constraints. - items: - description: TopologySpreadConstraint specifies how to spread matching pods among the given topology. - properties: - labelSelector: - description: LabelSelector is used to find matching pods. 
Pods that match this label selector are counted to determine the number of pods in their corresponding topology domain. - properties: - matchExpressions: - description: matchExpressions is a list of label selector requirements. The requirements are ANDed. - items: - description: A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values. - properties: - key: - description: key is the label key that the selector applies to. - type: string - operator: - description: operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist. - type: string - values: - description: values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch. - items: - type: string - type: array - required: - - key - - operator - type: object - type: array - matchLabels: - additionalProperties: - type: string - description: matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is "key", the operator is "In", and the values array contains only "value". The requirements are ANDed. - type: object - type: object - maxSkew: - description: 'MaxSkew describes the degree to which pods may be unevenly distributed. It''s the maximum permitted difference between the number of matching pods in any two topology domains of a given topology type. For example, in a 3-zone cluster, MaxSkew is set to 1, and pods with the same labelSelector spread as 1/1/0: | zone1 | zone2 | zone3 | | P | P | | - if MaxSkew is 1, incoming pod can only be scheduled to zone3 to become 1/1/1; scheduling it onto zone1(zone2) would make the ActualSkew(2-0) on zone1(zone2) violate MaxSkew(1). - if MaxSkew is 2, incoming pod can be scheduled onto any zone. 
It''s a required field. Default value is 1 and 0 is not allowed.' - format: int32 - type: integer - topologyKey: - description: TopologyKey is the key of node labels. Nodes that have a label with this key and identical values are considered to be in the same topology. We consider each as a "bucket", and try to put balanced number of pods into each bucket. It's a required field. - type: string - whenUnsatisfiable: - description: 'WhenUnsatisfiable indicates how to deal with a pod if it doesn''t satisfy the spread constraint. - DoNotSchedule (default) tells the scheduler not to schedule it - ScheduleAnyway tells the scheduler to still schedule it It''s considered as "Unsatisfiable" if and only if placing incoming pod on any topology violates "MaxSkew". For example, in a 3-zone cluster, MaxSkew is set to 1, and pods with the same labelSelector spread as 3/1/1: | zone1 | zone2 | zone3 | | P P P | P | P | If WhenUnsatisfiable is set to DoNotSchedule, incoming pod can only be scheduled to zone2(zone3) to become 3/2/1(3/1/2) as ActualSkew(2-1) on zone2(zone3) satisfies MaxSkew(1). In other words, the cluster can still be imbalanced, but scheduler won''t make it *more* imbalanced. It''s a required field.' - type: string - required: - - maxSkew - - topologyKey - - whenUnsatisfiable - type: object - type: array - version: - description: Version the cluster should be on. - type: string - volumeMounts: - description: VolumeMounts allows configuration of additional VolumeMounts on the output StatefulSet definition. VolumeMounts specified will be appended to other VolumeMounts in the alertmanager container, that are generated as a result of StorageSpec objects. - items: - description: VolumeMount describes a mounting of a Volume within a container. - properties: - mountPath: - description: Path within the container at which the volume should be mounted. Must not contain ':'. 
- type: string - mountPropagation: - description: mountPropagation determines how mounts are propagated from the host to container and the other way around. When not set, MountPropagationNone is used. This field is beta in 1.10. - type: string - name: - description: This must match the Name of a Volume. - type: string - readOnly: - description: Mounted read-only if true, read-write otherwise (false or unspecified). Defaults to false. - type: boolean - subPath: - description: Path within the volume from which the container's volume should be mounted. Defaults to "" (volume's root). - type: string - subPathExpr: - description: Expanded path within the volume from which the container's volume should be mounted. Behaves similarly to SubPath but environment variable references $(VAR_NAME) are expanded using the container's environment. Defaults to "" (volume's root). SubPathExpr and SubPath are mutually exclusive. - type: string - required: - - mountPath - - name - type: object - type: array - affinity: - description: If specified, the pod's scheduling constraints. - properties: - nodeAffinity: - description: Describes node affinity scheduling rules for the pod. - properties: - preferredDuringSchedulingIgnoredDuringExecution: - description: The scheduler will prefer to schedule pods to nodes that satisfy the affinity expressions specified by this field, but it may choose a node that violates one or more of the expressions. The node that is most preferred is the one with the greatest sum of weights, i.e. for each node that meets all of the scheduling requirements (resource request, requiredDuringScheduling affinity expressions, etc.), compute a sum by iterating through the elements of this field and adding "weight" to the sum if the node matches the corresponding matchExpressions; the node(s) with the highest sum are the most preferred. - items: - description: An empty preferred scheduling term matches all objects with implicit weight 0 (i.e. it's a no-op). 
A null preferred scheduling term matches no objects (i.e. is also a no-op). - properties: - preference: - description: A node selector term, associated with the corresponding weight. - properties: - matchExpressions: - description: A list of node selector requirements by node's labels. - items: - description: A node selector requirement is a selector that contains values, a key, and an operator that relates the key and values. - properties: - key: - description: The label key that the selector applies to. - type: string - operator: - description: Represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt. - type: string - values: - description: An array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. If the operator is Gt or Lt, the values array must have a single element, which will be interpreted as an integer. This array is replaced during a strategic merge patch. - items: - type: string - type: array - required: - - key - - operator - type: object - type: array - matchFields: - description: A list of node selector requirements by node's fields. - items: - description: A node selector requirement is a selector that contains values, a key, and an operator that relates the key and values. - properties: - key: - description: The label key that the selector applies to. - type: string - operator: - description: Represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt. - type: string - values: - description: An array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. If the operator is Gt or Lt, the values array must have a single element, which will be interpreted as an integer. This array is replaced during a strategic merge patch. 
- items: - type: string - type: array - required: - - key - - operator - type: object - type: array - type: object - weight: - description: Weight associated with matching the corresponding nodeSelectorTerm, in the range 1-100. - format: int32 - type: integer - required: - - preference - - weight - type: object - type: array - requiredDuringSchedulingIgnoredDuringExecution: - description: If the affinity requirements specified by this field are not met at scheduling time, the pod will not be scheduled onto the node. If the affinity requirements specified by this field cease to be met at some point during pod execution (e.g. due to an update), the system may or may not try to eventually evict the pod from its node. - properties: - nodeSelectorTerms: - description: Required. A list of node selector terms. The terms are ORed. - items: - description: A null or empty node selector term matches no objects. The requirements of them are ANDed. The TopologySelectorTerm type implements a subset of the NodeSelectorTerm. - properties: - matchExpressions: - description: A list of node selector requirements by node's labels. - items: - description: A node selector requirement is a selector that contains values, a key, and an operator that relates the key and values. - properties: - key: - description: The label key that the selector applies to. - type: string - operator: - description: Represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt. - type: string - values: - description: An array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. If the operator is Gt or Lt, the values array must have a single element, which will be interpreted as an integer. This array is replaced during a strategic merge patch. 
- items: - type: string - type: array - required: - - key - - operator - type: object - type: array - matchFields: - description: A list of node selector requirements by node's fields. - items: - description: A node selector requirement is a selector that contains values, a key, and an operator that relates the key and values. - properties: - key: - description: The label key that the selector applies to. - type: string - operator: - description: Represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt. - type: string - values: - description: An array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. If the operator is Gt or Lt, the values array must have a single element, which will be interpreted as an integer. This array is replaced during a strategic merge patch. - items: - type: string - type: array - required: - - key - - operator - type: object - type: array - type: object - type: array - required: - - nodeSelectorTerms - type: object - type: object - podAffinity: - description: Describes pod affinity scheduling rules (e.g. co-locate this pod in the same node, zone, etc. as some other pod(s)). - properties: - preferredDuringSchedulingIgnoredDuringExecution: - description: The scheduler will prefer to schedule pods to nodes that satisfy the affinity expressions specified by this field, but it may choose a node that violates one or more of the expressions. The node that is most preferred is the one with the greatest sum of weights, i.e. for each node that meets all of the scheduling requirements (resource request, requiredDuringScheduling affinity expressions, etc.), compute a sum by iterating through the elements of this field and adding "weight" to the sum if the node has pods which matches the corresponding podAffinityTerm; the node(s) with the highest sum are the most preferred. 
- items: - description: The weights of all of the matched WeightedPodAffinityTerm fields are added per-node to find the most preferred node(s) - properties: - podAffinityTerm: - description: Required. A pod affinity term, associated with the corresponding weight. - properties: - labelSelector: - description: A label query over a set of resources, in this case pods. - properties: - matchExpressions: - description: matchExpressions is a list of label selector requirements. The requirements are ANDed. - items: - description: A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values. - properties: - key: - description: key is the label key that the selector applies to. - type: string - operator: - description: operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist. - type: string - values: - description: values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch. - items: - type: string - type: array - required: - - key - - operator - type: object - type: array - matchLabels: - additionalProperties: - type: string - description: matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is "key", the operator is "In", and the values array contains only "value". The requirements are ANDed. 
- type: object - type: object - namespaces: - description: namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means "this pod's namespace" - items: - type: string - type: array - topologyKey: - description: This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed. - type: string - required: - - topologyKey - type: object - weight: - description: weight associated with matching the corresponding podAffinityTerm, in the range 1-100. - format: int32 - type: integer - required: - - podAffinityTerm - - weight - type: object - type: array - requiredDuringSchedulingIgnoredDuringExecution: - description: If the affinity requirements specified by this field are not met at scheduling time, the pod will not be scheduled onto the node. If the affinity requirements specified by this field cease to be met at some point during pod execution (e.g. due to a pod label update), the system may or may not try to eventually evict the pod from its node. When there are multiple elements, the lists of nodes corresponding to each podAffinityTerm are intersected, i.e. all terms must be satisfied. - items: - description: Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key matches that of any node on which a pod of the set of pods is running - properties: - labelSelector: - description: A label query over a set of resources, in this case pods. 
- properties: - matchExpressions: - description: matchExpressions is a list of label selector requirements. The requirements are ANDed. - items: - description: A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values. - properties: - key: - description: key is the label key that the selector applies to. - type: string - operator: - description: operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist. - type: string - values: - description: values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch. - items: - type: string - type: array - required: - - key - - operator - type: object - type: array - matchLabels: - additionalProperties: - type: string - description: matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is "key", the operator is "In", and the values array contains only "value". The requirements are ANDed. - type: object - type: object - namespaces: - description: namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means "this pod's namespace" - items: - type: string - type: array - topologyKey: - description: This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed. 
- type: string - required: - - topologyKey - type: object - type: array - type: object - podAntiAffinity: - description: Describes pod anti-affinity scheduling rules (e.g. avoid putting this pod in the same node, zone, etc. as some other pod(s)). - properties: - preferredDuringSchedulingIgnoredDuringExecution: - description: The scheduler will prefer to schedule pods to nodes that satisfy the anti-affinity expressions specified by this field, but it may choose a node that violates one or more of the expressions. The node that is most preferred is the one with the greatest sum of weights, i.e. for each node that meets all of the scheduling requirements (resource request, requiredDuringScheduling anti-affinity expressions, etc.), compute a sum by iterating through the elements of this field and adding "weight" to the sum if the node has pods which matches the corresponding podAffinityTerm; the node(s) with the highest sum are the most preferred. - items: - description: The weights of all of the matched WeightedPodAffinityTerm fields are added per-node to find the most preferred node(s) - properties: - podAffinityTerm: - description: Required. A pod affinity term, associated with the corresponding weight. - properties: - labelSelector: - description: A label query over a set of resources, in this case pods. - properties: - matchExpressions: - description: matchExpressions is a list of label selector requirements. The requirements are ANDed. - items: - description: A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values. - properties: - key: - description: key is the label key that the selector applies to. - type: string - operator: - description: operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist. - type: string - values: - description: values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. 
If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch. - items: - type: string - type: array - required: - - key - - operator - type: object - type: array - matchLabels: - additionalProperties: - type: string - description: matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is "key", the operator is "In", and the values array contains only "value". The requirements are ANDed. - type: object - type: object - namespaces: - description: namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means "this pod's namespace" - items: - type: string - type: array - topologyKey: - description: This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed. - type: string - required: - - topologyKey - type: object - weight: - description: weight associated with matching the corresponding podAffinityTerm, in the range 1-100. - format: int32 - type: integer - required: - - podAffinityTerm - - weight - type: object - type: array - requiredDuringSchedulingIgnoredDuringExecution: - description: If the anti-affinity requirements specified by this field are not met at scheduling time, the pod will not be scheduled onto the node. If the anti-affinity requirements specified by this field cease to be met at some point during pod execution (e.g. due to a pod label update), the system may or may not try to eventually evict the pod from its node. When there are multiple elements, the lists of nodes corresponding to each podAffinityTerm are intersected, i.e. all terms must be satisfied. 
- items: - description: Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key matches that of any node on which a pod of the set of pods is running - properties: - labelSelector: - description: A label query over a set of resources, in this case pods. - properties: - matchExpressions: - description: matchExpressions is a list of label selector requirements. The requirements are ANDed. - items: - description: A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values. - properties: - key: - description: key is the label key that the selector applies to. - type: string - operator: - description: operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist. - type: string - values: - description: values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch. - items: - type: string - type: array - required: - - key - - operator - type: object - type: array - matchLabels: - additionalProperties: - type: string - description: matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is "key", the operator is "In", and the values array contains only "value". The requirements are ANDed. 
- type: object - type: object - namespaces: - description: namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means "this pod's namespace" - items: - type: string - type: array - topologyKey: - description: This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed. - type: string - required: - - topologyKey - type: object - type: array - type: object - type: object - resources: - description: If specified, the container's resources. - items: - description: The pod this Resource is used to specify the requests and limits for a certain container based on the name. - properties: - container: - description: The name of the container - type: string - limits: - properties: - cpu: - pattern: ^([+-]?[0-9.]+)([eEinumkKMGTP]*[-+]?[0-9]*)$ - type: string - memory: - pattern: ^([+-]?[0-9.]+)([eEinumkKMGTP]*[-+]?[0-9]*)$ - type: string - type: object - requests: - properties: - cpu: - pattern: ^([+-]?[0-9.]+)([eEinumkKMGTP]*[-+]?[0-9]*)$ - type: string - memory: - pattern: ^([+-]?[0-9.]+)([eEinumkKMGTP]*[-+]?[0-9]*)$ - type: string - type: object - type: object - type: array - namespace: - description: A field of namespace name to override the labels and annotations - type: object - properties: - labels: - additionalProperties: - type: string - description: Labels overrides labels for the namespace and its template. - type: object - annotations: - additionalProperties: - type: string - description: Annotations overrides labels for the namespace and its template. 
- type: object - deployments: - description: A mapping of deployment name to override - type: array - items: - type: object - properties: - name: - description: The name of the deployment - type: string - labels: - additionalProperties: - type: string - description: Labels overrides labels for the deployment and its template. - type: object - annotations: - additionalProperties: - type: string - description: Annotations overrides labels for the deployment and its template. - type: object - env: - description: Env overrides env vars for the containers. - items: - properties: - container: - description: The container name - type: string - envVars: - description: The desired EnvVarRequirements - items: - description: EnvVar represents an environment variable present in a Container. - properties: - name: - description: Name of the environment variable. Must be a C_IDENTIFIER. - type: string - value: - description: 'Variable references $(VAR_NAME) are expanded using the previously defined environment variables in the container and any service environment variables. If a variable cannot be resolved, the reference in the input string will be unchanged. Double $$ are reduced to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" will produce the string literal "$(VAR_NAME)". Escaped references will never be expanded, regardless of whether the variable exists or not. Defaults to "".' - type: string - valueFrom: - description: Source for the environment variable's value. Cannot be used if value is not empty. - properties: - configMapKeyRef: - description: Selects a key of a ConfigMap. - properties: - key: - description: The key to select. - type: string - name: - description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' 
- type: string - optional: - description: Specify whether the ConfigMap or its key must be defined - type: boolean - required: - - key - type: object - fieldRef: - description: 'Selects a field of the pod: supports metadata.name, metadata.namespace, `metadata.labels['''']`, `metadata.annotations['''']`, spec.nodeName, spec.serviceAccountName, status.hostIP, status.podIP, status.podIPs.' - properties: - apiVersion: - description: Version of the schema the FieldPath is written in terms of, defaults to "v1". - type: string - fieldPath: - description: Path of the field to select in the specified API version. - type: string - required: - - fieldPath - type: object - resourceFieldRef: - description: 'Selects a resource of the container: only resources limits and requests (limits.cpu, limits.memory, limits.ephemeral-storage, requests.cpu, requests.memory and requests.ephemeral-storage) are currently supported.' - properties: - containerName: - description: 'Container name: required for volumes, optional for env vars' - type: string - divisor: - anyOf: - - type: integer - - type: string - description: Specifies the output format of the exposed resources, defaults to "1" - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - resource: - description: 'Required: resource to select' - type: string - required: - - resource - type: object - secretKeyRef: - description: Selects a key of a secret in the pod's namespace - properties: - key: - description: The key of the secret to select from. Must be a valid secret key. - type: string - name: - description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' 
- type: string - optional: - description: Specify whether the Secret or its key must be defined - type: boolean - required: - - key - type: object - type: object - required: - - name - type: object - type: array - required: - - container - type: object - type: array - livenessProbes: - description: LivenessProbes overrides liveness probes for the containers. - items: - description: ProbesRequirementsOverride enables the user to override any container's env vars. - properties: - container: - description: The container name - type: string - failureThreshold: - description: Minimum consecutive failures for the probe to be considered failed after having succeeded. Defaults to 3. Minimum value is 1. - format: int32 - type: integer - initialDelaySeconds: - description: 'Number of seconds after the container has started before liveness probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' - format: int32 - type: integer - periodSeconds: - description: How often (in seconds) to perform the probe. Default to 10 seconds. Minimum value is 1. - format: int32 - type: integer - successThreshold: - description: Minimum consecutive successes for the probe to be considered successful after having failed. Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1. - format: int32 - type: integer - terminationGracePeriodSeconds: - description: Optional duration in seconds the pod needs to terminate gracefully upon probe failure. The grace period is the duration in seconds after the processes running in the pod are sent a termination signal and the time when the processes are forcibly halted with a kill signal. Set this value longer than the expected cleanup time for your process. If this value is nil, the pod's terminationGracePeriodSeconds will be used. Otherwise, this value overrides the value provided by the pod spec. Value must be non-negative integer. 
The value zero indicates stop immediately via the kill signal (no opportunity to shut down). This is a beta field and requires enabling ProbeTerminationGracePeriod feature gate. Minimum value is 1. spec.terminationGracePeriodSeconds is used if unset. - format: int64 - type: integer - timeoutSeconds: - description: 'Number of seconds after which the probe times out. Defaults to 1 second. Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' - format: int32 - type: integer - required: - - container - type: object - type: array - replicas: - description: The number of replicas that HA parts of the control plane will be scaled to - type: integer - minimum: 0 - nodeSelector: - additionalProperties: - type: string - description: NodeSelector overrides nodeSelector for the deployment. - type: object - readinessProbes: - description: ReadinessProbes overrides readiness probes for the containers. - items: - description: ProbesRequirementsOverride enables the user to override any container's env vars. - properties: - container: - description: The container name - type: string - failureThreshold: - description: Minimum consecutive failures for the probe to be considered failed after having succeeded. Defaults to 3. Minimum value is 1. - format: int32 - type: integer - initialDelaySeconds: - description: 'Number of seconds after the container has started before liveness probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' - format: int32 - type: integer - periodSeconds: - description: How often (in seconds) to perform the probe. Default to 10 seconds. Minimum value is 1. - format: int32 - type: integer - successThreshold: - description: Minimum consecutive successes for the probe to be considered successful after having failed. Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1. 
- format: int32 - type: integer - terminationGracePeriodSeconds: - description: Optional duration in seconds the pod needs to terminate gracefully upon probe failure. The grace period is the duration in seconds after the processes running in the pod are sent a termination signal and the time when the processes are forcibly halted with a kill signal. Set this value longer than the expected cleanup time for your process. If this value is nil, the pod's terminationGracePeriodSeconds will be used. Otherwise, this value overrides the value provided by the pod spec. Value must be non-negative integer. The value zero indicates stop immediately via the kill signal (no opportunity to shut down). This is a beta field and requires enabling ProbeTerminationGracePeriod feature gate. Minimum value is 1. spec.terminationGracePeriodSeconds is used if unset. - format: int64 - type: integer - timeoutSeconds: - description: 'Number of seconds after which the probe times out. Defaults to 1 second. Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' - format: int32 - type: integer - required: - - container - type: object - type: array - tolerations: - description: If specified, the pod's tolerations. - items: - description: The pod this Toleration is attached to tolerates any taint that matches the triple using the matching operator . - properties: - effect: - description: Effect indicates the taint effect to match. Empty means match all taint effects. When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute. - type: string - key: - description: Key is the taint key that the toleration applies to. Empty means match all taint keys. If the key is empty, operator must be Exists; this combination means to match all values and all keys. - type: string - operator: - description: Operator represents a key's relationship to the value. Valid operators are Exists and Equal. Defaults to Equal. 
Exists is equivalent to wildcard for value, so that a pod can tolerate all taints of a particular category. - type: string - tolerationSeconds: - description: TolerationSeconds represents the period of time the toleration (which must be of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default, it is not set, which means tolerate the taint forever (do not evict). Zero and negative values will be treated as 0 (evict immediately) by the system. - format: int64 - type: integer - value: - description: Value is the taint value the toleration matches to. If the operator is Exists, the value should be empty, otherwise just a regular string. - type: string - type: object - type: array - hostNetwork: - description: Use the host's network namespace if true. Make sure to understand the security implications if you want to enable it. When hostNetwork is enabled, this will set dnsPolicy to ClusterFirstWithHostNet automatically. - type: boolean - topologySpreadConstraints: - description: If specified, the pod's topology spread constraints. - items: - description: TopologySpreadConstraint specifies how to spread matching pods among the given topology. - properties: - labelSelector: - description: LabelSelector is used to find matching pods. Pods that match this label selector are counted to determine the number of pods in their corresponding topology domain. - properties: - matchExpressions: - description: matchExpressions is a list of label selector requirements. The requirements are ANDed. - items: - description: A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values. - properties: - key: - description: key is the label key that the selector applies to. - type: string - operator: - description: operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist. - type: string - values: - description: values is an array of string values. 
If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch. - items: - type: string - type: array - required: - - key - - operator - type: object - type: array - matchLabels: - additionalProperties: - type: string - description: matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is "key", the operator is "In", and the values array contains only "value". The requirements are ANDed. - type: object - type: object - maxSkew: - description: 'MaxSkew describes the degree to which pods may be unevenly distributed. It''s the maximum permitted difference between the number of matching pods in any two topology domains of a given topology type. For example, in a 3-zone cluster, MaxSkew is set to 1, and pods with the same labelSelector spread as 1/1/0: | zone1 | zone2 | zone3 | | P | P | | - if MaxSkew is 1, incoming pod can only be scheduled to zone3 to become 1/1/1; scheduling it onto zone1(zone2) would make the ActualSkew(2-0) on zone1(zone2) violate MaxSkew(1). - if MaxSkew is 2, incoming pod can be scheduled onto any zone. It''s a required field. Default value is 1 and 0 is not allowed.' - format: int32 - type: integer - topologyKey: - description: TopologyKey is the key of node labels. Nodes that have a label with this key and identical values are considered to be in the same topology. We consider each as a "bucket", and try to put balanced number of pods into each bucket. It's a required field. - type: string - whenUnsatisfiable: - description: 'WhenUnsatisfiable indicates how to deal with a pod if it doesn''t satisfy the spread constraint. 
- DoNotSchedule (default) tells the scheduler not to schedule it - ScheduleAnyway tells the scheduler to still schedule it It''s considered as "Unsatisfiable" if and only if placing incoming pod on any topology violates "MaxSkew". For example, in a 3-zone cluster, MaxSkew is set to 1, and pods with the same labelSelector spread as 3/1/1: | zone1 | zone2 | zone3 | | P P P | P | P | If WhenUnsatisfiable is set to DoNotSchedule, incoming pod can only be scheduled to zone2(zone3) to become 3/2/1(3/1/2) as ActualSkew(2-1) on zone2(zone3) satisfies MaxSkew(1). In other words, the cluster can still be imbalanced, but scheduler won''t make it *more* imbalanced. It''s a required field.' - type: string - required: - - maxSkew - - topologyKey - - whenUnsatisfiable - type: object - type: array - affinity: - description: If specified, the pod's scheduling constraints. - properties: - nodeAffinity: - description: Describes node affinity scheduling rules for the pod. - properties: - preferredDuringSchedulingIgnoredDuringExecution: - description: The scheduler will prefer to schedule pods to nodes that satisfy the affinity expressions specified by this field, but it may choose a node that violates one or more of the expressions. The node that is most preferred is the one with the greatest sum of weights, i.e. for each node that meets all of the scheduling requirements (resource request, requiredDuringScheduling affinity expressions, etc.), compute a sum by iterating through the elements of this field and adding "weight" to the sum if the node matches the corresponding matchExpressions; the node(s) with the highest sum are the most preferred. - items: - description: An empty preferred scheduling term matches all objects with implicit weight 0 (i.e. it's a no-op). A null preferred scheduling term matches no objects (i.e. is also a no-op). - properties: - preference: - description: A node selector term, associated with the corresponding weight. 
- properties: - matchExpressions: - description: A list of node selector requirements by node's labels. - items: - description: A node selector requirement is a selector that contains values, a key, and an operator that relates the key and values. - properties: - key: - description: The label key that the selector applies to. - type: string - operator: - description: Represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt. - type: string - values: - description: An array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. If the operator is Gt or Lt, the values array must have a single element, which will be interpreted as an integer. This array is replaced during a strategic merge patch. - items: - type: string - type: array - required: - - key - - operator - type: object - type: array - matchFields: - description: A list of node selector requirements by node's fields. - items: - description: A node selector requirement is a selector that contains values, a key, and an operator that relates the key and values. - properties: - key: - description: The label key that the selector applies to. - type: string - operator: - description: Represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt. - type: string - values: - description: An array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. If the operator is Gt or Lt, the values array must have a single element, which will be interpreted as an integer. This array is replaced during a strategic merge patch. 
- items: - type: string - type: array - required: - - key - - operator - type: object - type: array - type: object - weight: - description: Weight associated with matching the corresponding nodeSelectorTerm, in the range 1-100. - format: int32 - type: integer - required: - - preference - - weight - type: object - type: array - requiredDuringSchedulingIgnoredDuringExecution: - description: If the affinity requirements specified by this field are not met at scheduling time, the pod will not be scheduled onto the node. If the affinity requirements specified by this field cease to be met at some point during pod execution (e.g. due to an update), the system may or may not try to eventually evict the pod from its node. - properties: - nodeSelectorTerms: - description: Required. A list of node selector terms. The terms are ORed. - items: - description: A null or empty node selector term matches no objects. The requirements of them are ANDed. The TopologySelectorTerm type implements a subset of the NodeSelectorTerm. - properties: - matchExpressions: - description: A list of node selector requirements by node's labels. - items: - description: A node selector requirement is a selector that contains values, a key, and an operator that relates the key and values. - properties: - key: - description: The label key that the selector applies to. - type: string - operator: - description: Represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt. - type: string - values: - description: An array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. If the operator is Gt or Lt, the values array must have a single element, which will be interpreted as an integer. This array is replaced during a strategic merge patch. 
- items: - type: string - type: array - required: - - key - - operator - type: object - type: array - matchFields: - description: A list of node selector requirements by node's fields. - items: - description: A node selector requirement is a selector that contains values, a key, and an operator that relates the key and values. - properties: - key: - description: The label key that the selector applies to. - type: string - operator: - description: Represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt. - type: string - values: - description: An array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. If the operator is Gt or Lt, the values array must have a single element, which will be interpreted as an integer. This array is replaced during a strategic merge patch. - items: - type: string - type: array - required: - - key - - operator - type: object - type: array - type: object - type: array - required: - - nodeSelectorTerms - type: object - type: object - podAffinity: - description: Describes pod affinity scheduling rules (e.g. co-locate this pod in the same node, zone, etc. as some other pod(s)). - properties: - preferredDuringSchedulingIgnoredDuringExecution: - description: The scheduler will prefer to schedule pods to nodes that satisfy the affinity expressions specified by this field, but it may choose a node that violates one or more of the expressions. The node that is most preferred is the one with the greatest sum of weights, i.e. for each node that meets all of the scheduling requirements (resource request, requiredDuringScheduling affinity expressions, etc.), compute a sum by iterating through the elements of this field and adding "weight" to the sum if the node has pods which matches the corresponding podAffinityTerm; the node(s) with the highest sum are the most preferred. 
- items: - description: The weights of all of the matched WeightedPodAffinityTerm fields are added per-node to find the most preferred node(s) - properties: - podAffinityTerm: - description: Required. A pod affinity term, associated with the corresponding weight. - properties: - labelSelector: - description: A label query over a set of resources, in this case pods. - properties: - matchExpressions: - description: matchExpressions is a list of label selector requirements. The requirements are ANDed. - items: - description: A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values. - properties: - key: - description: key is the label key that the selector applies to. - type: string - operator: - description: operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist. - type: string - values: - description: values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch. - items: - type: string - type: array - required: - - key - - operator - type: object - type: array - matchLabels: - additionalProperties: - type: string - description: matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is "key", the operator is "In", and the values array contains only "value". The requirements are ANDed. 
- type: object - type: object - namespaces: - description: namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means "this pod's namespace" - items: - type: string - type: array - topologyKey: - description: This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed. - type: string - required: - - topologyKey - type: object - weight: - description: weight associated with matching the corresponding podAffinityTerm, in the range 1-100. - format: int32 - type: integer - required: - - podAffinityTerm - - weight - type: object - type: array - requiredDuringSchedulingIgnoredDuringExecution: - description: If the affinity requirements specified by this field are not met at scheduling time, the pod will not be scheduled onto the node. If the affinity requirements specified by this field cease to be met at some point during pod execution (e.g. due to a pod label update), the system may or may not try to eventually evict the pod from its node. When there are multiple elements, the lists of nodes corresponding to each podAffinityTerm are intersected, i.e. all terms must be satisfied. - items: - description: Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key matches that of any node on which a pod of the set of pods is running - properties: - labelSelector: - description: A label query over a set of resources, in this case pods. 
- properties: - matchExpressions: - description: matchExpressions is a list of label selector requirements. The requirements are ANDed. - items: - description: A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values. - properties: - key: - description: key is the label key that the selector applies to. - type: string - operator: - description: operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist. - type: string - values: - description: values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch. - items: - type: string - type: array - required: - - key - - operator - type: object - type: array - matchLabels: - additionalProperties: - type: string - description: matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is "key", the operator is "In", and the values array contains only "value". The requirements are ANDed. - type: object - type: object - namespaces: - description: namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means "this pod's namespace" - items: - type: string - type: array - topologyKey: - description: This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed. 
- type: string - required: - - topologyKey - type: object - type: array - type: object - podAntiAffinity: - description: Describes pod anti-affinity scheduling rules (e.g. avoid putting this pod in the same node, zone, etc. as some other pod(s)). - properties: - preferredDuringSchedulingIgnoredDuringExecution: - description: The scheduler will prefer to schedule pods to nodes that satisfy the anti-affinity expressions specified by this field, but it may choose a node that violates one or more of the expressions. The node that is most preferred is the one with the greatest sum of weights, i.e. for each node that meets all of the scheduling requirements (resource request, requiredDuringScheduling anti-affinity expressions, etc.), compute a sum by iterating through the elements of this field and adding "weight" to the sum if the node has pods which matches the corresponding podAffinityTerm; the node(s) with the highest sum are the most preferred. - items: - description: The weights of all of the matched WeightedPodAffinityTerm fields are added per-node to find the most preferred node(s) - properties: - podAffinityTerm: - description: Required. A pod affinity term, associated with the corresponding weight. - properties: - labelSelector: - description: A label query over a set of resources, in this case pods. - properties: - matchExpressions: - description: matchExpressions is a list of label selector requirements. The requirements are ANDed. - items: - description: A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values. - properties: - key: - description: key is the label key that the selector applies to. - type: string - operator: - description: operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist. - type: string - values: - description: values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. 
If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch. - items: - type: string - type: array - required: - - key - - operator - type: object - type: array - matchLabels: - additionalProperties: - type: string - description: matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is "key", the operator is "In", and the values array contains only "value". The requirements are ANDed. - type: object - type: object - namespaces: - description: namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means "this pod's namespace" - items: - type: string - type: array - topologyKey: - description: This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed. - type: string - required: - - topologyKey - type: object - weight: - description: weight associated with matching the corresponding podAffinityTerm, in the range 1-100. - format: int32 - type: integer - required: - - podAffinityTerm - - weight - type: object - type: array - requiredDuringSchedulingIgnoredDuringExecution: - description: If the anti-affinity requirements specified by this field are not met at scheduling time, the pod will not be scheduled onto the node. If the anti-affinity requirements specified by this field cease to be met at some point during pod execution (e.g. due to a pod label update), the system may or may not try to eventually evict the pod from its node. When there are multiple elements, the lists of nodes corresponding to each podAffinityTerm are intersected, i.e. all terms must be satisfied. 
- items: - description: Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key matches that of any node on which a pod of the set of pods is running - properties: - labelSelector: - description: A label query over a set of resources, in this case pods. - properties: - matchExpressions: - description: matchExpressions is a list of label selector requirements. The requirements are ANDed. - items: - description: A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values. - properties: - key: - description: key is the label key that the selector applies to. - type: string - operator: - description: operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist. - type: string - values: - description: values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch. - items: - type: string - type: array - required: - - key - - operator - type: object - type: array - matchLabels: - additionalProperties: - type: string - description: matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is "key", the operator is "In", and the values array contains only "value". The requirements are ANDed. 
- type: object - type: object - namespaces: - description: namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means "this pod's namespace" - items: - type: string - type: array - topologyKey: - description: This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed. - type: string - required: - - topologyKey - type: object - type: array - type: object - type: object - resources: - description: If specified, the container's resources. - items: - description: The pod this Resource is used to specify the requests and limits for a certain container based on the name. - properties: - container: - description: The name of the container - type: string - limits: - properties: - cpu: - pattern: ^([+-]?[0-9.]+)([eEinumkKMGTP]*[-+]?[0-9]*)$ - type: string - memory: - pattern: ^([+-]?[0-9.]+)([eEinumkKMGTP]*[-+]?[0-9]*)$ - type: string - type: object - requests: - properties: - cpu: - pattern: ^([+-]?[0-9.]+)([eEinumkKMGTP]*[-+]?[0-9]*)$ - type: string - memory: - pattern: ^([+-]?[0-9.]+)([eEinumkKMGTP]*[-+]?[0-9]*)$ - type: string - type: object - type: object - type: array - services: - description: A mapping of service name to override - type: array - items: - type: object - properties: - name: - description: The name of the service - type: string - labels: - additionalProperties: - type: string - description: Labels overrides labels for the service - type: object - annotations: - additionalProperties: - type: string - description: Annotations overrides labels for the service - type: object - selector: - additionalProperties: - type: string - description: Selector overrides selector for the service - type: object - podDisruptionBudgets: - 
description: A mapping of podDisruptionBudget name to override - type: array - items: - type: object - properties: - name: - description: The name of the podDisruptionBudget - type: string - minAvailable: - anyOf: - - type: integer - - type: string - description: An eviction is allowed if at least "minAvailable" pods selected by "selector" will still be available after the eviction, i.e. even in the absence of the evicted pod. So for example you can prevent all voluntary evictions by specifying "100%". - x-kubernetes-int-or-string: true - maxUnavailable: - anyOf: - - type: integer - - type: string - description: An eviction is allowed if at most "maxUnavailable" pods selected by "selector" are unavailable after the eviction, i.e. even in absence of the evicted pod. For example, one can prevent all voluntary evictions by specifying 0. This is a mutually exclusive setting with "minAvailable". - x-kubernetes-int-or-string: true - source: - description: The source configuration for Knative Eventing - properties: - ceph: - description: Ceph settings - properties: - enabled: - type: boolean - type: object - github: - description: GitHub settings - properties: - enabled: - type: boolean - type: object - gitlab: - description: GitLab settings - properties: - enabled: - type: boolean - type: object - kafka: - description: Apache Kafka settings - properties: - enabled: - type: boolean - type: object - rabbitmq: - description: RabbitMQ settings - properties: - enabled: - type: boolean - type: object - redis: - description: Redis settings - properties: - enabled: - type: boolean - type: object - type: object - manifests: - description: A list of eventing manifests, which will be installed by the operator - items: - properties: - URL: - description: The link of the manifest URL - type: string - type: object - type: array - registry: - description: A means to override the corresponding deployment images in the upstream. 
This affects both apps/v1.Deployment and caching.internal.knative.dev/v1alpha1.Image. - properties: - default: - description: The default image reference template to use for all knative images. Takes the form of example-registry.io/custom/path/${NAME}:custom-tag - type: string - imagePullSecrets: - description: A list of secrets to be used when pulling the knative images. The secret must be created in the same namespace as the knative-eventing deployments, and not the namespace of this resource. - items: - properties: - name: - description: The name of the secret. - type: string - type: object - type: array - override: - additionalProperties: - type: string - description: A map of a container name or image name to the full image location of the individual knative image. - type: object - type: object - sinkBindingSelectionMode: - description: Specifies the selection mode for the sinkbinding webhook. If the value is `inclusion`, only namespaces/objects labelled as `bindings.knative.dev/include:true` will be considered. If `exclusion` is selected, only `bindings.knative.dev/exclude:true` label is checked and these will NOT be considered. The default is `exclusion`. - type: string - version: - description: The version of Knative Eventing to be installed - type: string - type: object - status: - properties: - conditions: - description: The latest available observations of a resource's current state. - items: - properties: - lastTransitionTime: - description: LastTransitionTime is the last time the condition transitioned from one status to another. We use VolatileTime in place of metav1.Time to exclude this from creating equality.Semantic differences (all other things held constant). - type: string - message: - description: A human readable message indicating details about the transition. - type: string - reason: - description: The reason for the condition's last transition. 
- type: string - severity: - description: Severity with which to treat failures of this type of condition. When this is not specified, it defaults to Error. - type: string - status: - description: Status of the condition, one of True, False, Unknown. - type: string - type: - description: Type of condition. - type: string - required: - - type - - status - type: object - type: array - manifests: - description: The list of eventing manifests, which have been installed by the operator - items: - type: string - type: array - observedGeneration: - description: The generation last processed by the controller - type: integer - version: - description: The version of the installed release - type: string - type: object - type: object - additionalPrinterColumns: - - jsonPath: .status.version - name: Version - type: string - - jsonPath: .status.conditions[?(@.type=="Ready")].status - name: Ready - type: string - - jsonPath: .status.conditions[?(@.type=="Ready")].reason - name: Reason - type: string - names: - kind: KnativeEventing - listKind: KnativeEventingList - plural: knativeeventings - singular: knativeeventing - scope: Namespaced - conversion: - strategy: Webhook - webhook: - conversionReviewVersions: ["v1beta1"] - clientConfig: - service: - name: operator-webhook - namespace: union - path: /resource-conversion ---- -# Source: knative-operator/templates/knative-crds.yaml -# Copyright 2021 The Knative Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -apiVersion: apiextensions.k8s.io/v1 -kind: CustomResourceDefinition -metadata: - name: knativeservings.operator.knative.dev - labels: - app.kubernetes.io/version: "1.16.0" - app.kubernetes.io/name: knative-operator -spec: - group: operator.knative.dev - versions: - - name: v1beta1 - served: true - storage: true - subresources: - status: {} - schema: - openAPIV3Schema: - description: Schema for the knativeservings API - properties: - apiVersion: - description: 'APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources' - type: string - kind: - description: 'Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds' - type: string - metadata: - type: object - spec: - description: Spec defines the desired state of KnativeServing - properties: - additionalManifests: - description: A list of the additional serving manifests, which will be installed by the operator - items: - properties: - URL: - description: The link of the additional manifest URL - type: string - type: object - type: array - config: - additionalProperties: - additionalProperties: - type: string - type: object - description: A means to override the corresponding entries in the upstream configmaps - type: object - controller-custom-certs: - description: Enabling the controller to trust registries with self-signed certificates - properties: - name: - description: The name of the ConfigMap or Secret - type: string - type: - description: One of ConfigMap or Secret - enum: - - ConfigMap - - Secret - - "" - type: string - type: object - high-availability: - description: Allows 
specification of HA control plane - properties: - replicas: - description: The number of replicas that HA parts of the control plane will be scaled to - minimum: 0 - type: integer - type: object - workloads: - description: A mapping of deployment or statefulset name to override - type: array - items: - type: object - properties: - name: - description: The name of the deployment - type: string - labels: - additionalProperties: - type: string - description: Labels overrides labels for the deployment and its template. - type: object - livenessProbes: - description: LivenessProbes overrides liveness probes for the containers. - items: - description: ProbesRequirementsOverride enables the user to override any container's env vars. - properties: - container: - description: The container name - type: string - failureThreshold: - description: Minimum consecutive failures for the probe to be considered failed after having succeeded. Defaults to 3. Minimum value is 1. - format: int32 - type: integer - initialDelaySeconds: - description: 'Number of seconds after the container has started before liveness probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' - format: int32 - type: integer - periodSeconds: - description: How often (in seconds) to perform the probe. Default to 10 seconds. Minimum value is 1. - format: int32 - type: integer - successThreshold: - description: Minimum consecutive successes for the probe to be considered successful after having failed. Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1. - format: int32 - type: integer - terminationGracePeriodSeconds: - description: Optional duration in seconds the pod needs to terminate gracefully upon probe failure. The grace period is the duration in seconds after the processes running in the pod are sent a termination signal and the time when the processes are forcibly halted with a kill signal. 
Set this value longer than the expected cleanup time for your process. If this value is nil, the pod's terminationGracePeriodSeconds will be used. Otherwise, this value overrides the value provided by the pod spec. Value must be non-negative integer. The value zero indicates stop immediately via the kill signal (no opportunity to shut down). This is a beta field and requires enabling ProbeTerminationGracePeriod feature gate. Minimum value is 1. spec.terminationGracePeriodSeconds is used if unset. - format: int64 - type: integer - timeoutSeconds: - description: 'Number of seconds after which the probe times out. Defaults to 1 second. Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' - format: int32 - type: integer - required: - - container - type: object - type: array - annotations: - additionalProperties: - type: string - description: Annotations overrides labels for the deployment and its template. - type: object - env: - description: Env overrides env vars for the containers. - items: - properties: - container: - description: The container name - type: string - envVars: - description: The desired EnvVarRequirements - items: - description: EnvVar represents an environment variable present in a Container. - properties: - name: - description: Name of the environment variable. Must be a C_IDENTIFIER. - type: string - value: - description: 'Variable references $(VAR_NAME) are expanded using the previously defined environment variables in the container and any service environment variables. If a variable cannot be resolved, the reference in the input string will be unchanged. Double $$ are reduced to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" will produce the string literal "$(VAR_NAME)". Escaped references will never be expanded, regardless of whether the variable exists or not. Defaults to "".' 
- type: string - valueFrom: - description: Source for the environment variable's value. Cannot be used if value is not empty. - properties: - configMapKeyRef: - description: Selects a key of a ConfigMap. - properties: - key: - description: The key to select. - type: string - name: - description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' - type: string - optional: - description: Specify whether the ConfigMap or its key must be defined - type: boolean - required: - - key - type: object - fieldRef: - description: 'Selects a field of the pod: supports metadata.name, metadata.namespace, `metadata.labels['''']`, `metadata.annotations['''']`, spec.nodeName, spec.serviceAccountName, status.hostIP, status.podIP, status.podIPs.' - properties: - apiVersion: - description: Version of the schema the FieldPath is written in terms of, defaults to "v1". - type: string - fieldPath: - description: Path of the field to select in the specified API version. - type: string - required: - - fieldPath - type: object - resourceFieldRef: - description: 'Selects a resource of the container: only resources limits and requests (limits.cpu, limits.memory, limits.ephemeral-storage, requests.cpu, requests.memory and requests.ephemeral-storage) are currently supported.' 
- properties: - containerName: - description: 'Container name: required for volumes, optional for env vars' - type: string - divisor: - anyOf: - - type: integer - - type: string - description: Specifies the output format of the exposed resources, defaults to "1" - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - resource: - description: 'Required: resource to select' - type: string - required: - - resource - type: object - secretKeyRef: - description: Selects a key of a secret in the pod's namespace - properties: - key: - description: The key of the secret to select from. Must be a valid secret key. - type: string - name: - description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' - type: string - optional: - description: Specify whether the Secret or its key must be defined - type: boolean - required: - - key - type: object - type: object - required: - - name - type: object - type: array - required: - - container - type: object - type: array - replicas: - description: The number of replicas that HA parts of the control plane will be scaled to - type: integer - minimum: 0 - nodeSelector: - additionalProperties: - type: string - description: NodeSelector overrides nodeSelector for the deployment. - type: object - readinessProbes: - description: ReadinessProbes overrides readiness probes for the containers. - items: - description: ProbesRequirementsOverride enables the user to override any container's env vars. - properties: - container: - description: The container name - type: string - failureThreshold: - description: Minimum consecutive failures for the probe to be considered failed after having succeeded. Defaults to 3. Minimum value is 1. 
- format: int32 - type: integer - initialDelaySeconds: - description: 'Number of seconds after the container has started before liveness probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' - format: int32 - type: integer - periodSeconds: - description: How often (in seconds) to perform the probe. Default to 10 seconds. Minimum value is 1. - format: int32 - type: integer - successThreshold: - description: Minimum consecutive successes for the probe to be considered successful after having failed. Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1. - format: int32 - type: integer - terminationGracePeriodSeconds: - description: Optional duration in seconds the pod needs to terminate gracefully upon probe failure. The grace period is the duration in seconds after the processes running in the pod are sent a termination signal and the time when the processes are forcibly halted with a kill signal. Set this value longer than the expected cleanup time for your process. If this value is nil, the pod's terminationGracePeriodSeconds will be used. Otherwise, this value overrides the value provided by the pod spec. Value must be non-negative integer. The value zero indicates stop immediately via the kill signal (no opportunity to shut down). This is a beta field and requires enabling ProbeTerminationGracePeriod feature gate. Minimum value is 1. spec.terminationGracePeriodSeconds is used if unset. - format: int64 - type: integer - timeoutSeconds: - description: 'Number of seconds after which the probe times out. Defaults to 1 second. Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' - format: int32 - type: integer - required: - - container - type: object - type: array - tolerations: - description: If specified, the pod's tolerations. 
- items: - description: The pod this Toleration is attached to tolerates any taint that matches the triple using the matching operator . - properties: - effect: - description: Effect indicates the taint effect to match. Empty means match all taint effects. When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute. - type: string - key: - description: Key is the taint key that the toleration applies to. Empty means match all taint keys. If the key is empty, operator must be Exists; this combination means to match all values and all keys. - type: string - operator: - description: Operator represents a key's relationship to the value. Valid operators are Exists and Equal. Defaults to Equal. Exists is equivalent to wildcard for value, so that a pod can tolerate all taints of a particular category. - type: string - tolerationSeconds: - description: TolerationSeconds represents the period of time the toleration (which must be of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default, it is not set, which means tolerate the taint forever (do not evict). Zero and negative values will be treated as 0 (evict immediately) by the system. - format: int64 - type: integer - value: - description: Value is the taint value the toleration matches to. If the operator is Exists, the value should be empty, otherwise just a regular string. - type: string - type: object - type: array - hostNetwork: - description: Use the host's network namespace if true. Make sure to understand the security implications if you want to enable it. When hostNetwork is enabled, this will set dnsPolicy to ClusterFirstWithHostNet automatically. - type: boolean - topologySpreadConstraints: - description: If specified, the pod's topology spread constraints. - items: - description: TopologySpreadConstraint specifies how to spread matching pods among the given topology. - properties: - labelSelector: - description: LabelSelector is used to find matching pods. 
Pods that match this label selector are counted to determine the number of pods in their corresponding topology domain. - properties: - matchExpressions: - description: matchExpressions is a list of label selector requirements. The requirements are ANDed. - items: - description: A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values. - properties: - key: - description: key is the label key that the selector applies to. - type: string - operator: - description: operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist. - type: string - values: - description: values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch. - items: - type: string - type: array - required: - - key - - operator - type: object - type: array - matchLabels: - additionalProperties: - type: string - description: matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is "key", the operator is "In", and the values array contains only "value". The requirements are ANDed. - type: object - type: object - maxSkew: - description: 'MaxSkew describes the degree to which pods may be unevenly distributed. It''s the maximum permitted difference between the number of matching pods in any two topology domains of a given topology type. For example, in a 3-zone cluster, MaxSkew is set to 1, and pods with the same labelSelector spread as 1/1/0: | zone1 | zone2 | zone3 | | P | P | | - if MaxSkew is 1, incoming pod can only be scheduled to zone3 to become 1/1/1; scheduling it onto zone1(zone2) would make the ActualSkew(2-0) on zone1(zone2) violate MaxSkew(1). - if MaxSkew is 2, incoming pod can be scheduled onto any zone. 
It''s a required field. Default value is 1 and 0 is not allowed.' - format: int32 - type: integer - topologyKey: - description: TopologyKey is the key of node labels. Nodes that have a label with this key and identical values are considered to be in the same topology. We consider each as a "bucket", and try to put balanced number of pods into each bucket. It's a required field. - type: string - whenUnsatisfiable: - description: 'WhenUnsatisfiable indicates how to deal with a pod if it doesn''t satisfy the spread constraint. - DoNotSchedule (default) tells the scheduler not to schedule it - ScheduleAnyway tells the scheduler to still schedule it It''s considered as "Unsatisfiable" if and only if placing incoming pod on any topology violates "MaxSkew". For example, in a 3-zone cluster, MaxSkew is set to 1, and pods with the same labelSelector spread as 3/1/1: | zone1 | zone2 | zone3 | | P P P | P | P | If WhenUnsatisfiable is set to DoNotSchedule, incoming pod can only be scheduled to zone2(zone3) to become 3/2/1(3/1/2) as ActualSkew(2-1) on zone2(zone3) satisfies MaxSkew(1). In other words, the cluster can still be imbalanced, but scheduler won''t make it *more* imbalanced. It''s a required field.' - type: string - required: - - maxSkew - - topologyKey - - whenUnsatisfiable - type: object - type: array - version: - description: Version the cluster should be on. - type: string - volumeMounts: - description: VolumeMounts allows configuration of additional VolumeMounts on the output StatefulSet definition. VolumeMounts specified will be appended to other VolumeMounts in the alertmanager container, that are generated as a result of StorageSpec objects. - items: - description: VolumeMount describes a mounting of a Volume within a container. - properties: - mountPath: - description: Path within the container at which the volume should be mounted. Must not contain ':'. 
- type: string - mountPropagation: - description: mountPropagation determines how mounts are propagated from the host to container and the other way around. When not set, MountPropagationNone is used. This field is beta in 1.10. - type: string - name: - description: This must match the Name of a Volume. - type: string - readOnly: - description: Mounted read-only if true, read-write otherwise (false or unspecified). Defaults to false. - type: boolean - subPath: - description: Path within the volume from which the container's volume should be mounted. Defaults to "" (volume's root). - type: string - subPathExpr: - description: Expanded path within the volume from which the container's volume should be mounted. Behaves similarly to SubPath but environment variable references $(VAR_NAME) are expanded using the container's environment. Defaults to "" (volume's root). SubPathExpr and SubPath are mutually exclusive. - type: string - required: - - mountPath - - name - type: object - type: array - affinity: - description: If specified, the pod's scheduling constraints. - properties: - nodeAffinity: - description: Describes node affinity scheduling rules for the pod. - properties: - preferredDuringSchedulingIgnoredDuringExecution: - description: The scheduler will prefer to schedule pods to nodes that satisfy the affinity expressions specified by this field, but it may choose a node that violates one or more of the expressions. The node that is most preferred is the one with the greatest sum of weights, i.e. for each node that meets all of the scheduling requirements (resource request, requiredDuringScheduling affinity expressions, etc.), compute a sum by iterating through the elements of this field and adding "weight" to the sum if the node matches the corresponding matchExpressions; the node(s) with the highest sum are the most preferred. - items: - description: An empty preferred scheduling term matches all objects with implicit weight 0 (i.e. it's a no-op). 
A null preferred scheduling term matches no objects (i.e. is also a no-op). - properties: - preference: - description: A node selector term, associated with the corresponding weight. - properties: - matchExpressions: - description: A list of node selector requirements by node's labels. - items: - description: A node selector requirement is a selector that contains values, a key, and an operator that relates the key and values. - properties: - key: - description: The label key that the selector applies to. - type: string - operator: - description: Represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt. - type: string - values: - description: An array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. If the operator is Gt or Lt, the values array must have a single element, which will be interpreted as an integer. This array is replaced during a strategic merge patch. - items: - type: string - type: array - required: - - key - - operator - type: object - type: array - matchFields: - description: A list of node selector requirements by node's fields. - items: - description: A node selector requirement is a selector that contains values, a key, and an operator that relates the key and values. - properties: - key: - description: The label key that the selector applies to. - type: string - operator: - description: Represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt. - type: string - values: - description: An array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. If the operator is Gt or Lt, the values array must have a single element, which will be interpreted as an integer. This array is replaced during a strategic merge patch. 
- items: - type: string - type: array - required: - - key - - operator - type: object - type: array - type: object - weight: - description: Weight associated with matching the corresponding nodeSelectorTerm, in the range 1-100. - format: int32 - type: integer - required: - - preference - - weight - type: object - type: array - requiredDuringSchedulingIgnoredDuringExecution: - description: If the affinity requirements specified by this field are not met at scheduling time, the pod will not be scheduled onto the node. If the affinity requirements specified by this field cease to be met at some point during pod execution (e.g. due to an update), the system may or may not try to eventually evict the pod from its node. - properties: - nodeSelectorTerms: - description: Required. A list of node selector terms. The terms are ORed. - items: - description: A null or empty node selector term matches no objects. The requirements of them are ANDed. The TopologySelectorTerm type implements a subset of the NodeSelectorTerm. - properties: - matchExpressions: - description: A list of node selector requirements by node's labels. - items: - description: A node selector requirement is a selector that contains values, a key, and an operator that relates the key and values. - properties: - key: - description: The label key that the selector applies to. - type: string - operator: - description: Represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt. - type: string - values: - description: An array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. If the operator is Gt or Lt, the values array must have a single element, which will be interpreted as an integer. This array is replaced during a strategic merge patch. 
- items: - type: string - type: array - required: - - key - - operator - type: object - type: array - matchFields: - description: A list of node selector requirements by node's fields. - items: - description: A node selector requirement is a selector that contains values, a key, and an operator that relates the key and values. - properties: - key: - description: The label key that the selector applies to. - type: string - operator: - description: Represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt. - type: string - values: - description: An array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. If the operator is Gt or Lt, the values array must have a single element, which will be interpreted as an integer. This array is replaced during a strategic merge patch. - items: - type: string - type: array - required: - - key - - operator - type: object - type: array - type: object - type: array - required: - - nodeSelectorTerms - type: object - type: object - podAffinity: - description: Describes pod affinity scheduling rules (e.g. co-locate this pod in the same node, zone, etc. as some other pod(s)). - properties: - preferredDuringSchedulingIgnoredDuringExecution: - description: The scheduler will prefer to schedule pods to nodes that satisfy the affinity expressions specified by this field, but it may choose a node that violates one or more of the expressions. The node that is most preferred is the one with the greatest sum of weights, i.e. for each node that meets all of the scheduling requirements (resource request, requiredDuringScheduling affinity expressions, etc.), compute a sum by iterating through the elements of this field and adding "weight" to the sum if the node has pods which matches the corresponding podAffinityTerm; the node(s) with the highest sum are the most preferred. 
- items: - description: The weights of all of the matched WeightedPodAffinityTerm fields are added per-node to find the most preferred node(s) - properties: - podAffinityTerm: - description: Required. A pod affinity term, associated with the corresponding weight. - properties: - labelSelector: - description: A label query over a set of resources, in this case pods. - properties: - matchExpressions: - description: matchExpressions is a list of label selector requirements. The requirements are ANDed. - items: - description: A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values. - properties: - key: - description: key is the label key that the selector applies to. - type: string - operator: - description: operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist. - type: string - values: - description: values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch. - items: - type: string - type: array - required: - - key - - operator - type: object - type: array - matchLabels: - additionalProperties: - type: string - description: matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is "key", the operator is "In", and the values array contains only "value". The requirements are ANDed. 
- type: object - type: object - namespaces: - description: namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means "this pod's namespace" - items: - type: string - type: array - topologyKey: - description: This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed. - type: string - required: - - topologyKey - type: object - weight: - description: weight associated with matching the corresponding podAffinityTerm, in the range 1-100. - format: int32 - type: integer - required: - - podAffinityTerm - - weight - type: object - type: array - requiredDuringSchedulingIgnoredDuringExecution: - description: If the affinity requirements specified by this field are not met at scheduling time, the pod will not be scheduled onto the node. If the affinity requirements specified by this field cease to be met at some point during pod execution (e.g. due to a pod label update), the system may or may not try to eventually evict the pod from its node. When there are multiple elements, the lists of nodes corresponding to each podAffinityTerm are intersected, i.e. all terms must be satisfied. - items: - description: Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key matches that of any node on which a pod of the set of pods is running - properties: - labelSelector: - description: A label query over a set of resources, in this case pods. 
- properties: - matchExpressions: - description: matchExpressions is a list of label selector requirements. The requirements are ANDed. - items: - description: A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values. - properties: - key: - description: key is the label key that the selector applies to. - type: string - operator: - description: operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist. - type: string - values: - description: values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch. - items: - type: string - type: array - required: - - key - - operator - type: object - type: array - matchLabels: - additionalProperties: - type: string - description: matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is "key", the operator is "In", and the values array contains only "value". The requirements are ANDed. - type: object - type: object - namespaces: - description: namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means "this pod's namespace" - items: - type: string - type: array - topologyKey: - description: This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed. 
- type: string - required: - - topologyKey - type: object - type: array - type: object - podAntiAffinity: - description: Describes pod anti-affinity scheduling rules (e.g. avoid putting this pod in the same node, zone, etc. as some other pod(s)). - properties: - preferredDuringSchedulingIgnoredDuringExecution: - description: The scheduler will prefer to schedule pods to nodes that satisfy the anti-affinity expressions specified by this field, but it may choose a node that violates one or more of the expressions. The node that is most preferred is the one with the greatest sum of weights, i.e. for each node that meets all of the scheduling requirements (resource request, requiredDuringScheduling anti-affinity expressions, etc.), compute a sum by iterating through the elements of this field and adding "weight" to the sum if the node has pods which matches the corresponding podAffinityTerm; the node(s) with the highest sum are the most preferred. - items: - description: The weights of all of the matched WeightedPodAffinityTerm fields are added per-node to find the most preferred node(s) - properties: - podAffinityTerm: - description: Required. A pod affinity term, associated with the corresponding weight. - properties: - labelSelector: - description: A label query over a set of resources, in this case pods. - properties: - matchExpressions: - description: matchExpressions is a list of label selector requirements. The requirements are ANDed. - items: - description: A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values. - properties: - key: - description: key is the label key that the selector applies to. - type: string - operator: - description: operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist. - type: string - values: - description: values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. 
If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch. - items: - type: string - type: array - required: - - key - - operator - type: object - type: array - matchLabels: - additionalProperties: - type: string - description: matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is "key", the operator is "In", and the values array contains only "value". The requirements are ANDed. - type: object - type: object - namespaces: - description: namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means "this pod's namespace" - items: - type: string - type: array - topologyKey: - description: This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed. - type: string - required: - - topologyKey - type: object - weight: - description: weight associated with matching the corresponding podAffinityTerm, in the range 1-100. - format: int32 - type: integer - required: - - podAffinityTerm - - weight - type: object - type: array - requiredDuringSchedulingIgnoredDuringExecution: - description: If the anti-affinity requirements specified by this field are not met at scheduling time, the pod will not be scheduled onto the node. If the anti-affinity requirements specified by this field cease to be met at some point during pod execution (e.g. due to a pod label update), the system may or may not try to eventually evict the pod from its node. When there are multiple elements, the lists of nodes corresponding to each podAffinityTerm are intersected, i.e. all terms must be satisfied. 
- items: - description: Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key matches that of any node on which a pod of the set of pods is running - properties: - labelSelector: - description: A label query over a set of resources, in this case pods. - properties: - matchExpressions: - description: matchExpressions is a list of label selector requirements. The requirements are ANDed. - items: - description: A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values. - properties: - key: - description: key is the label key that the selector applies to. - type: string - operator: - description: operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist. - type: string - values: - description: values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch. - items: - type: string - type: array - required: - - key - - operator - type: object - type: array - matchLabels: - additionalProperties: - type: string - description: matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is "key", the operator is "In", and the values array contains only "value". The requirements are ANDed. 
- type: object - type: object - namespaces: - description: namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means "this pod's namespace" - items: - type: string - type: array - topologyKey: - description: This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed. - type: string - required: - - topologyKey - type: object - type: array - type: object - type: object - resources: - description: If specified, the container's resources. - items: - description: The pod this Resource is used to specify the requests and limits for a certain container based on the name. - properties: - container: - description: The name of the container - type: string - limits: - properties: - cpu: - pattern: ^([+-]?[0-9.]+)([eEinumkKMGTP]*[-+]?[0-9]*)$ - type: string - memory: - pattern: ^([+-]?[0-9.]+)([eEinumkKMGTP]*[-+]?[0-9]*)$ - type: string - type: object - requests: - properties: - cpu: - pattern: ^([+-]?[0-9.]+)([eEinumkKMGTP]*[-+]?[0-9]*)$ - type: string - memory: - pattern: ^([+-]?[0-9.]+)([eEinumkKMGTP]*[-+]?[0-9]*)$ - type: string - type: object - type: object - type: array - namespace: - description: A field of namespace name to override the labels and annotations - type: object - properties: - labels: - additionalProperties: - type: string - description: Labels overrides labels for the namespace and its template. - type: object - annotations: - additionalProperties: - type: string - description: Annotations overrides labels for the namespace and its template. 
- type: object - deployments: - description: A mapping of deployment name to override - type: array - items: - type: object - properties: - name: - description: The name of the deployment - type: string - labels: - additionalProperties: - type: string - description: Labels overrides labels for the deployment and its template. - type: object - annotations: - additionalProperties: - type: string - description: Annotations overrides labels for the deployment and its template. - type: object - env: - description: Env overrides env vars for the containers. - items: - properties: - container: - description: The container name - type: string - envVars: - description: The desired EnvVarRequirements - items: - description: EnvVar represents an environment variable present in a Container. - properties: - name: - description: Name of the environment variable. Must be a C_IDENTIFIER. - type: string - value: - description: 'Variable references $(VAR_NAME) are expanded using the previously defined environment variables in the container and any service environment variables. If a variable cannot be resolved, the reference in the input string will be unchanged. Double $$ are reduced to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" will produce the string literal "$(VAR_NAME)". Escaped references will never be expanded, regardless of whether the variable exists or not. Defaults to "".' - type: string - valueFrom: - description: Source for the environment variable's value. Cannot be used if value is not empty. - properties: - configMapKeyRef: - description: Selects a key of a ConfigMap. - properties: - key: - description: The key to select. - type: string - name: - description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' 
- type: string - optional: - description: Specify whether the ConfigMap or its key must be defined - type: boolean - required: - - key - type: object - fieldRef: - description: 'Selects a field of the pod: supports metadata.name, metadata.namespace, `metadata.labels['''']`, `metadata.annotations['''']`, spec.nodeName, spec.serviceAccountName, status.hostIP, status.podIP, status.podIPs.' - properties: - apiVersion: - description: Version of the schema the FieldPath is written in terms of, defaults to "v1". - type: string - fieldPath: - description: Path of the field to select in the specified API version. - type: string - required: - - fieldPath - type: object - resourceFieldRef: - description: 'Selects a resource of the container: only resources limits and requests (limits.cpu, limits.memory, limits.ephemeral-storage, requests.cpu, requests.memory and requests.ephemeral-storage) are currently supported.' - properties: - containerName: - description: 'Container name: required for volumes, optional for env vars' - type: string - divisor: - anyOf: - - type: integer - - type: string - description: Specifies the output format of the exposed resources, defaults to "1" - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - resource: - description: 'Required: resource to select' - type: string - required: - - resource - type: object - secretKeyRef: - description: Selects a key of a secret in the pod's namespace - properties: - key: - description: The key of the secret to select from. Must be a valid secret key. - type: string - name: - description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names TODO: Add other useful fields. apiVersion, kind, uid?' 
- type: string - optional: - description: Specify whether the Secret or its key must be defined - type: boolean - required: - - key - type: object - type: object - required: - - name - type: object - type: array - required: - - container - type: object - type: array - livenessProbes: - description: LivenessProbes overrides liveness probes for the containers. - items: - description: ProbesRequirementsOverride enables the user to override any container's env vars. - properties: - container: - description: The container name - type: string - failureThreshold: - description: Minimum consecutive failures for the probe to be considered failed after having succeeded. Defaults to 3. Minimum value is 1. - format: int32 - type: integer - initialDelaySeconds: - description: 'Number of seconds after the container has started before liveness probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' - format: int32 - type: integer - periodSeconds: - description: How often (in seconds) to perform the probe. Default to 10 seconds. Minimum value is 1. - format: int32 - type: integer - successThreshold: - description: Minimum consecutive successes for the probe to be considered successful after having failed. Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1. - format: int32 - type: integer - terminationGracePeriodSeconds: - description: Optional duration in seconds the pod needs to terminate gracefully upon probe failure. The grace period is the duration in seconds after the processes running in the pod are sent a termination signal and the time when the processes are forcibly halted with a kill signal. Set this value longer than the expected cleanup time for your process. If this value is nil, the pod's terminationGracePeriodSeconds will be used. Otherwise, this value overrides the value provided by the pod spec. Value must be non-negative integer. 
The value zero indicates stop immediately via the kill signal (no opportunity to shut down). This is a beta field and requires enabling ProbeTerminationGracePeriod feature gate. Minimum value is 1. spec.terminationGracePeriodSeconds is used if unset. - format: int64 - type: integer - timeoutSeconds: - description: 'Number of seconds after which the probe times out. Defaults to 1 second. Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' - format: int32 - type: integer - required: - - container - type: object - type: array - replicas: - description: The number of replicas that HA parts of the control plane will be scaled to - type: integer - minimum: 0 - nodeSelector: - additionalProperties: - type: string - description: NodeSelector overrides nodeSelector for the deployment. - type: object - readinessProbes: - description: ReadinessProbes overrides readiness probes for the containers. - items: - description: ProbesRequirementsOverride enables the user to override any container's env vars. - properties: - container: - description: The container name - type: string - failureThreshold: - description: Minimum consecutive failures for the probe to be considered failed after having succeeded. Defaults to 3. Minimum value is 1. - format: int32 - type: integer - initialDelaySeconds: - description: 'Number of seconds after the container has started before liveness probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' - format: int32 - type: integer - periodSeconds: - description: How often (in seconds) to perform the probe. Default to 10 seconds. Minimum value is 1. - format: int32 - type: integer - successThreshold: - description: Minimum consecutive successes for the probe to be considered successful after having failed. Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1. 
- format: int32 - type: integer - terminationGracePeriodSeconds: - description: Optional duration in seconds the pod needs to terminate gracefully upon probe failure. The grace period is the duration in seconds after the processes running in the pod are sent a termination signal and the time when the processes are forcibly halted with a kill signal. Set this value longer than the expected cleanup time for your process. If this value is nil, the pod's terminationGracePeriodSeconds will be used. Otherwise, this value overrides the value provided by the pod spec. Value must be non-negative integer. The value zero indicates stop immediately via the kill signal (no opportunity to shut down). This is a beta field and requires enabling ProbeTerminationGracePeriod feature gate. Minimum value is 1. spec.terminationGracePeriodSeconds is used if unset. - format: int64 - type: integer - timeoutSeconds: - description: 'Number of seconds after which the probe times out. Defaults to 1 second. Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' - format: int32 - type: integer - required: - - container - type: object - type: array - tolerations: - description: If specified, the pod's tolerations. - items: - description: The pod this Toleration is attached to tolerates any taint that matches the triple using the matching operator . - properties: - effect: - description: Effect indicates the taint effect to match. Empty means match all taint effects. When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute. - type: string - key: - description: Key is the taint key that the toleration applies to. Empty means match all taint keys. If the key is empty, operator must be Exists; this combination means to match all values and all keys. - type: string - operator: - description: Operator represents a key's relationship to the value. Valid operators are Exists and Equal. Defaults to Equal. 
Exists is equivalent to wildcard for value, so that a pod can tolerate all taints of a particular category. - type: string - tolerationSeconds: - description: TolerationSeconds represents the period of time the toleration (which must be of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default, it is not set, which means tolerate the taint forever (do not evict). Zero and negative values will be treated as 0 (evict immediately) by the system. - format: int64 - type: integer - value: - description: Value is the taint value the toleration matches to. If the operator is Exists, the value should be empty, otherwise just a regular string. - type: string - type: object - type: array - hostNetwork: - description: Use the host's network namespace if true. Make sure to understand the security implications if you want to enable it. When hostNetwork is enabled, this will set dnsPolicy to ClusterFirstWithHostNet automatically. - type: boolean - topologySpreadConstraints: - description: If specified, the pod's topology spread constraints. - items: - description: TopologySpreadConstraint specifies how to spread matching pods among the given topology. - properties: - labelSelector: - description: LabelSelector is used to find matching pods. Pods that match this label selector are counted to determine the number of pods in their corresponding topology domain. - properties: - matchExpressions: - description: matchExpressions is a list of label selector requirements. The requirements are ANDed. - items: - description: A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values. - properties: - key: - description: key is the label key that the selector applies to. - type: string - operator: - description: operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist. - type: string - values: - description: values is an array of string values. 
If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch. - items: - type: string - type: array - required: - - key - - operator - type: object - type: array - matchLabels: - additionalProperties: - type: string - description: matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is "key", the operator is "In", and the values array contains only "value". The requirements are ANDed. - type: object - type: object - maxSkew: - description: 'MaxSkew describes the degree to which pods may be unevenly distributed. It''s the maximum permitted difference between the number of matching pods in any two topology domains of a given topology type. For example, in a 3-zone cluster, MaxSkew is set to 1, and pods with the same labelSelector spread as 1/1/0: | zone1 | zone2 | zone3 | | P | P | | - if MaxSkew is 1, incoming pod can only be scheduled to zone3 to become 1/1/1; scheduling it onto zone1(zone2) would make the ActualSkew(2-0) on zone1(zone2) violate MaxSkew(1). - if MaxSkew is 2, incoming pod can be scheduled onto any zone. It''s a required field. Default value is 1 and 0 is not allowed.' - format: int32 - type: integer - topologyKey: - description: TopologyKey is the key of node labels. Nodes that have a label with this key and identical values are considered to be in the same topology. We consider each as a "bucket", and try to put balanced number of pods into each bucket. It's a required field. - type: string - whenUnsatisfiable: - description: 'WhenUnsatisfiable indicates how to deal with a pod if it doesn''t satisfy the spread constraint. 
- DoNotSchedule (default) tells the scheduler not to schedule it - ScheduleAnyway tells the scheduler to still schedule it It''s considered as "Unsatisfiable" if and only if placing incoming pod on any topology violates "MaxSkew". For example, in a 3-zone cluster, MaxSkew is set to 1, and pods with the same labelSelector spread as 3/1/1: | zone1 | zone2 | zone3 | | P P P | P | P | If WhenUnsatisfiable is set to DoNotSchedule, incoming pod can only be scheduled to zone2(zone3) to become 3/2/1(3/1/2) as ActualSkew(2-1) on zone2(zone3) satisfies MaxSkew(1). In other words, the cluster can still be imbalanced, but scheduler won''t make it *more* imbalanced. It''s a required field.' - type: string - required: - - maxSkew - - topologyKey - - whenUnsatisfiable - type: object - type: array - affinity: - description: If specified, the pod's scheduling constraints. - properties: - nodeAffinity: - description: Describes node affinity scheduling rules for the pod. - properties: - preferredDuringSchedulingIgnoredDuringExecution: - description: The scheduler will prefer to schedule pods to nodes that satisfy the affinity expressions specified by this field, but it may choose a node that violates one or more of the expressions. The node that is most preferred is the one with the greatest sum of weights, i.e. for each node that meets all of the scheduling requirements (resource request, requiredDuringScheduling affinity expressions, etc.), compute a sum by iterating through the elements of this field and adding "weight" to the sum if the node matches the corresponding matchExpressions; the node(s) with the highest sum are the most preferred. - items: - description: An empty preferred scheduling term matches all objects with implicit weight 0 (i.e. it's a no-op). A null preferred scheduling term matches no objects (i.e. is also a no-op). - properties: - preference: - description: A node selector term, associated with the corresponding weight. 
- properties: - matchExpressions: - description: A list of node selector requirements by node's labels. - items: - description: A node selector requirement is a selector that contains values, a key, and an operator that relates the key and values. - properties: - key: - description: The label key that the selector applies to. - type: string - operator: - description: Represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt. - type: string - values: - description: An array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. If the operator is Gt or Lt, the values array must have a single element, which will be interpreted as an integer. This array is replaced during a strategic merge patch. - items: - type: string - type: array - required: - - key - - operator - type: object - type: array - matchFields: - description: A list of node selector requirements by node's fields. - items: - description: A node selector requirement is a selector that contains values, a key, and an operator that relates the key and values. - properties: - key: - description: The label key that the selector applies to. - type: string - operator: - description: Represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt. - type: string - values: - description: An array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. If the operator is Gt or Lt, the values array must have a single element, which will be interpreted as an integer. This array is replaced during a strategic merge patch. 
- items: - type: string - type: array - required: - - key - - operator - type: object - type: array - type: object - weight: - description: Weight associated with matching the corresponding nodeSelectorTerm, in the range 1-100. - format: int32 - type: integer - required: - - preference - - weight - type: object - type: array - requiredDuringSchedulingIgnoredDuringExecution: - description: If the affinity requirements specified by this field are not met at scheduling time, the pod will not be scheduled onto the node. If the affinity requirements specified by this field cease to be met at some point during pod execution (e.g. due to an update), the system may or may not try to eventually evict the pod from its node. - properties: - nodeSelectorTerms: - description: Required. A list of node selector terms. The terms are ORed. - items: - description: A null or empty node selector term matches no objects. The requirements of them are ANDed. The TopologySelectorTerm type implements a subset of the NodeSelectorTerm. - properties: - matchExpressions: - description: A list of node selector requirements by node's labels. - items: - description: A node selector requirement is a selector that contains values, a key, and an operator that relates the key and values. - properties: - key: - description: The label key that the selector applies to. - type: string - operator: - description: Represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt. - type: string - values: - description: An array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. If the operator is Gt or Lt, the values array must have a single element, which will be interpreted as an integer. This array is replaced during a strategic merge patch. 
- items: - type: string - type: array - required: - - key - - operator - type: object - type: array - matchFields: - description: A list of node selector requirements by node's fields. - items: - description: A node selector requirement is a selector that contains values, a key, and an operator that relates the key and values. - properties: - key: - description: The label key that the selector applies to. - type: string - operator: - description: Represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt. - type: string - values: - description: An array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. If the operator is Gt or Lt, the values array must have a single element, which will be interpreted as an integer. This array is replaced during a strategic merge patch. - items: - type: string - type: array - required: - - key - - operator - type: object - type: array - type: object - type: array - required: - - nodeSelectorTerms - type: object - type: object - podAffinity: - description: Describes pod affinity scheduling rules (e.g. co-locate this pod in the same node, zone, etc. as some other pod(s)). - properties: - preferredDuringSchedulingIgnoredDuringExecution: - description: The scheduler will prefer to schedule pods to nodes that satisfy the affinity expressions specified by this field, but it may choose a node that violates one or more of the expressions. The node that is most preferred is the one with the greatest sum of weights, i.e. for each node that meets all of the scheduling requirements (resource request, requiredDuringScheduling affinity expressions, etc.), compute a sum by iterating through the elements of this field and adding "weight" to the sum if the node has pods which matches the corresponding podAffinityTerm; the node(s) with the highest sum are the most preferred. 
- items: - description: The weights of all of the matched WeightedPodAffinityTerm fields are added per-node to find the most preferred node(s) - properties: - podAffinityTerm: - description: Required. A pod affinity term, associated with the corresponding weight. - properties: - labelSelector: - description: A label query over a set of resources, in this case pods. - properties: - matchExpressions: - description: matchExpressions is a list of label selector requirements. The requirements are ANDed. - items: - description: A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values. - properties: - key: - description: key is the label key that the selector applies to. - type: string - operator: - description: operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist. - type: string - values: - description: values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch. - items: - type: string - type: array - required: - - key - - operator - type: object - type: array - matchLabels: - additionalProperties: - type: string - description: matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is "key", the operator is "In", and the values array contains only "value". The requirements are ANDed. 
- type: object - type: object - namespaces: - description: namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means "this pod's namespace" - items: - type: string - type: array - topologyKey: - description: This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed. - type: string - required: - - topologyKey - type: object - weight: - description: weight associated with matching the corresponding podAffinityTerm, in the range 1-100. - format: int32 - type: integer - required: - - podAffinityTerm - - weight - type: object - type: array - requiredDuringSchedulingIgnoredDuringExecution: - description: If the affinity requirements specified by this field are not met at scheduling time, the pod will not be scheduled onto the node. If the affinity requirements specified by this field cease to be met at some point during pod execution (e.g. due to a pod label update), the system may or may not try to eventually evict the pod from its node. When there are multiple elements, the lists of nodes corresponding to each podAffinityTerm are intersected, i.e. all terms must be satisfied. - items: - description: Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key matches that of any node on which a pod of the set of pods is running - properties: - labelSelector: - description: A label query over a set of resources, in this case pods. 
- properties: - matchExpressions: - description: matchExpressions is a list of label selector requirements. The requirements are ANDed. - items: - description: A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values. - properties: - key: - description: key is the label key that the selector applies to. - type: string - operator: - description: operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist. - type: string - values: - description: values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch. - items: - type: string - type: array - required: - - key - - operator - type: object - type: array - matchLabels: - additionalProperties: - type: string - description: matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is "key", the operator is "In", and the values array contains only "value". The requirements are ANDed. - type: object - type: object - namespaces: - description: namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means "this pod's namespace" - items: - type: string - type: array - topologyKey: - description: This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed. 
- type: string - required: - - topologyKey - type: object - type: array - type: object - podAntiAffinity: - description: Describes pod anti-affinity scheduling rules (e.g. avoid putting this pod in the same node, zone, etc. as some other pod(s)). - properties: - preferredDuringSchedulingIgnoredDuringExecution: - description: The scheduler will prefer to schedule pods to nodes that satisfy the anti-affinity expressions specified by this field, but it may choose a node that violates one or more of the expressions. The node that is most preferred is the one with the greatest sum of weights, i.e. for each node that meets all of the scheduling requirements (resource request, requiredDuringScheduling anti-affinity expressions, etc.), compute a sum by iterating through the elements of this field and adding "weight" to the sum if the node has pods which matches the corresponding podAffinityTerm; the node(s) with the highest sum are the most preferred. - items: - description: The weights of all of the matched WeightedPodAffinityTerm fields are added per-node to find the most preferred node(s) - properties: - podAffinityTerm: - description: Required. A pod affinity term, associated with the corresponding weight. - properties: - labelSelector: - description: A label query over a set of resources, in this case pods. - properties: - matchExpressions: - description: matchExpressions is a list of label selector requirements. The requirements are ANDed. - items: - description: A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values. - properties: - key: - description: key is the label key that the selector applies to. - type: string - operator: - description: operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist. - type: string - values: - description: values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. 
If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch. - items: - type: string - type: array - required: - - key - - operator - type: object - type: array - matchLabels: - additionalProperties: - type: string - description: matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is "key", the operator is "In", and the values array contains only "value". The requirements are ANDed. - type: object - type: object - namespaces: - description: namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means "this pod's namespace" - items: - type: string - type: array - topologyKey: - description: This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed. - type: string - required: - - topologyKey - type: object - weight: - description: weight associated with matching the corresponding podAffinityTerm, in the range 1-100. - format: int32 - type: integer - required: - - podAffinityTerm - - weight - type: object - type: array - requiredDuringSchedulingIgnoredDuringExecution: - description: If the anti-affinity requirements specified by this field are not met at scheduling time, the pod will not be scheduled onto the node. If the anti-affinity requirements specified by this field cease to be met at some point during pod execution (e.g. due to a pod label update), the system may or may not try to eventually evict the pod from its node. When there are multiple elements, the lists of nodes corresponding to each podAffinityTerm are intersected, i.e. all terms must be satisfied. 
- items: - description: Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key matches that of any node on which a pod of the set of pods is running - properties: - labelSelector: - description: A label query over a set of resources, in this case pods. - properties: - matchExpressions: - description: matchExpressions is a list of label selector requirements. The requirements are ANDed. - items: - description: A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values. - properties: - key: - description: key is the label key that the selector applies to. - type: string - operator: - description: operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist. - type: string - values: - description: values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch. - items: - type: string - type: array - required: - - key - - operator - type: object - type: array - matchLabels: - additionalProperties: - type: string - description: matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is "key", the operator is "In", and the values array contains only "value". The requirements are ANDed. 
- type: object - type: object - namespaces: - description: namespaces specifies which namespaces the labelSelector applies to (matches against); null or empty list means "this pod's namespace" - items: - type: string - type: array - topologyKey: - description: This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed. - type: string - required: - - topologyKey - type: object - type: array - type: object - type: object - resources: - description: If specified, the container's resources. - items: - description: The pod this Resource is used to specify the requests and limits for a certain container based on the name. - properties: - container: - description: The name of the container - type: string - limits: - properties: - cpu: - pattern: ^([+-]?[0-9.]+)([eEinumkKMGTP]*[-+]?[0-9]*)$ - type: string - memory: - pattern: ^([+-]?[0-9.]+)([eEinumkKMGTP]*[-+]?[0-9]*)$ - type: string - type: object - requests: - properties: - cpu: - pattern: ^([+-]?[0-9.]+)([eEinumkKMGTP]*[-+]?[0-9]*)$ - type: string - memory: - pattern: ^([+-]?[0-9.]+)([eEinumkKMGTP]*[-+]?[0-9]*)$ - type: string - type: object - type: object - type: array - services: - description: A mapping of service name to override - type: array - items: - type: object - properties: - name: - description: The name of the service - type: string - labels: - additionalProperties: - type: string - description: Labels overrides labels for the service - type: object - annotations: - additionalProperties: - type: string - description: Annotations overrides labels for the service - type: object - selector: - additionalProperties: - type: string - description: Selector overrides selector for the service - type: object - podDisruptionBudgets: - 
description: A mapping of podDisruptionBudget name to override - type: array - items: - type: object - properties: - name: - description: The name of the podDisruptionBudget - type: string - minAvailable: - anyOf: - - type: integer - - type: string - description: An eviction is allowed if at least "minAvailable" pods selected by "selector" will still be available after the eviction, i.e. even in the absence of the evicted pod. So for example you can prevent all voluntary evictions by specifying "100%". - x-kubernetes-int-or-string: true - maxUnavailable: - anyOf: - - type: integer - - type: string - description: An eviction is allowed if at most "maxUnavailable" pods selected by "selector" are unavailable after the eviction, i.e. even in absence of the evicted pod. For example, one can prevent all voluntary evictions by specifying 0. This is a mutually exclusive setting with "minAvailable". - x-kubernetes-int-or-string: true - ingress: - description: The ingress configuration for Knative Serving - properties: - contour: - description: Contour settings - properties: - enabled: - type: boolean - type: object - istio: - description: Istio settings - properties: - enabled: - type: boolean - knative-ingress-gateway: - description: A means to override the knative-ingress-gateway - properties: - selector: - additionalProperties: - type: string - description: The selector for the ingress-gateway. - type: object - servers: - description: A list of server specifications. - items: - properties: - hosts: - description: One or more hosts exposed by this gateway. - items: - format: string - type: string - type: array - port: - properties: - name: - description: Label assigned to the port. - format: string - type: string - number: - description: A valid non-negative integer port number. - type: integer - target_port: - description: A valid non-negative integer target port number. - type: integer - protocol: - description: The protocol exposed on the port. 
- format: string - type: string - type: object - tls: - nullable: true - oneOf: - - required: - - mode - - credentialName - - required: - - httpsRedirect - properties: - mode: - description: TLS mode can be SIMPLE, MUTUAL, ISTIO_MUTUAL. - format: string - type: string - credentialName: - description: TLS certificate name. - format: string - type: string - httpsRedirect: - description: If set to true, the load balancer will send a 301 redirect to HTTPS for all HTTP requests. Should be used only for HTTP listener, is mutually exclusive with all other TLS options. - type: boolean - type: object - type: object - type: array - type: object - knative-local-gateway: - description: A means to override the knative-local-gateway - properties: - selector: - additionalProperties: - type: string - description: The selector for the ingress-gateway. - type: object - servers: - description: A list of server specifications. - items: - properties: - hosts: - description: One or more hosts exposed by this gateway. - items: - format: string - type: string - type: array - port: - properties: - name: - description: Label assigned to the port. - format: string - type: string - number: - description: A valid non-negative integer port number. - type: integer - target_port: - description: A valid non-negative integer target port number. - type: integer - protocol: - description: The protocol exposed on the port. - format: string - type: string - type: object - tls: - nullable: true - oneOf: - - required: - - mode - - credentialName - - required: - - httpsRedirect - properties: - mode: - description: TLS mode can be SIMPLE, MUTUAL, ISTIO_MUTUAL. - format: string - type: string - credentialName: - description: TLS certificate name. - format: string - type: string - httpsRedirect: - description: If set to true, the load balancer will send a 301 redirect to HTTPS for all HTTP requests. Should be used only for HTTP listener, is mutually exclusive with all other TLS options. 
- type: boolean - type: object - type: object - type: array - type: object - type: object - kourier: - description: Kourier settings - properties: - enabled: - type: boolean - service-type: - type: string - service-load-balancer-ip: - type: string - bootstrap-configmap: - type: string - http-port: - type: integer - https-port: - type: integer - type: object - type: object - security: - description: The security configuration for Knative Serving - properties: - securityGuard: - description: Security Guard settings - properties: - enabled: - type: boolean - type: object - type: object - manifests: - description: A list of serving manifests, which will be installed by the operator - items: - properties: - URL: - description: The link of the manifest URL - type: string - type: object - type: array - registry: - description: A means to override the corresponding deployment images in the upstream. This affects both apps/v1.Deployment and caching.internal.knative.dev/v1alpha1.Image. - properties: - default: - description: The default image reference template to use for all knative images. Takes the form of example-registry.io/custom/path/${NAME}:custom-tag - type: string - imagePullSecrets: - description: A list of secrets to be used when pulling the knative images. The secret must be created in the same namespace as the knative-serving deployments, and not the namespace of this resource. - items: - properties: - name: - description: The name of the secret. - type: string - type: object - type: array - override: - additionalProperties: - type: string - description: A map of a container name or image name to the full image location of the individual knative image. 
- type: object - type: object - version: - description: The version of Knative Serving to be installed - type: string - type: object - status: - description: Status defines the observed state of KnativeServing - properties: - conditions: - description: The latest available observations of a resource's current state. - items: - properties: - lastTransitionTime: - description: LastTransitionTime is the last time the condition transitioned from one status to another. We use VolatileTime in place of metav1.Time to exclude this from creating equality.Semantic differences (all other things held constant). - type: string - message: - description: A human readable message indicating details about the transition. - type: string - reason: - description: The reason for the condition's last transition. - type: string - severity: - description: Severity with which to treat failures of this type of condition. When this is not specified, it defaults to Error. - type: string - status: - description: Status of the condition, one of True, False, Unknown. - type: string - type: - description: Type of condition. 
- type: string - required: - - type - - status - type: object - type: array - manifests: - description: The list of serving manifests, which have been installed by the operator - items: - type: string - type: array - observedGeneration: - description: The generation last processed by the controller - type: integer - version: - description: The version of the installed release - type: string - type: object - type: object - additionalPrinterColumns: - - jsonPath: .status.version - name: Version - type: string - - jsonPath: .status.conditions[?(@.type=="Ready")].status - name: Ready - type: string - - jsonPath: .status.conditions[?(@.type=="Ready")].reason - name: Reason - type: string - names: - kind: KnativeServing - listKind: KnativeServingList - plural: knativeservings - singular: knativeserving - scope: Namespaced - conversion: - strategy: Webhook - webhook: - conversionReviewVersions: ["v1beta1"] - clientConfig: - service: - name: operator-webhook - namespace: union - path: /resource-conversion +data: {} --- # Source: knative-operator/templates/knative-operator.yaml # Copyright 2020 The Knative Authors @@ -10410,6 +242,7 @@ rules: [] # Rules are automatically filled in by the controller manager. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + kind: ClusterRole apiVersion: rbac.authorization.k8s.io/v1 metadata: @@ -11197,6 +1030,7 @@ subjects: # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: diff --git a/tests/values/dataplane.additional-podlabels.yaml b/tests/values/dataplane.additional-podlabels.yaml index fd97f115..23b8afaa 100644 --- a/tests/values/dataplane.additional-podlabels.yaml +++ b/tests/values/dataplane.additional-podlabels.yaml @@ -1,5 +1,6 @@ +# helm-values: values-test-certs.yaml # Minimal test to verify additionalPodLabels and additionalPodAnnotations -# This ensures labels/annotations appear on separate lines and don't get concatenated + # This ensures labels/annotations appear on separate lines and don't get concatenated host: union.test.union.ai clusterName: union-test diff --git a/tests/values/dataplane.additional-templates.yaml b/tests/values/dataplane.additional-templates.yaml index 0a08c20e..d63fba42 100644 --- a/tests/values/dataplane.additional-templates.yaml +++ b/tests/values/dataplane.additional-templates.yaml @@ -1,3 +1,5 @@ +# helm-values: values-test-certs.yaml + # Test that additionalTemplates are rendered alongside the default templates # without overwriting the namespace, service account, or resource quota templates. 
namespace_mapping: diff --git a/tests/values/dataplane.aws.eks-automode.yaml b/tests/values/dataplane.aws.eks-automode.yaml index 13699ebd..f20af8c6 100644 --- a/tests/values/dataplane.aws.eks-automode.yaml +++ b/tests/values/dataplane.aws.eks-automode.yaml @@ -1,3 +1,4 @@ +# helm-values: values-test-certs.yaml # Intended to match and test charts/dataplane/values.aws.eks-automode.yaml global: UNION_CONTROL_PLANE_HOST: "test-controlplane-host" diff --git a/tests/values/dataplane.aws.with-ingress.yaml b/tests/values/dataplane.aws.with-ingress.yaml index fef025cb..d999fd62 100644 --- a/tests/values/dataplane.aws.with-ingress.yaml +++ b/tests/values/dataplane.aws.with-ingress.yaml @@ -1,3 +1,4 @@ +# helm-values: values-test-certs.yaml host: union.us-west-2.union.ai clusterName: union-aws orgName: union diff --git a/tests/values/dataplane.aws.yaml b/tests/values/dataplane.aws.yaml index 98a1e9ff..ee3592c7 100644 --- a/tests/values/dataplane.aws.yaml +++ b/tests/values/dataplane.aws.yaml @@ -1,3 +1,4 @@ +# helm-values: values-test-certs.yaml # Intended to match and test charts/dataplane/values.aws.yaml # Test namespace_mapping cascading to all services diff --git a/tests/values/dataplane.azure-custom-storage-prefix.yaml b/tests/values/dataplane.azure-custom-storage-prefix.yaml index ced450cd..577d89a3 100644 --- a/tests/values/dataplane.azure-custom-storage-prefix.yaml +++ b/tests/values/dataplane.azure-custom-storage-prefix.yaml @@ -1,3 +1,4 @@ +# helm-values: values-test-certs.yaml # Test: custom storage.metadataPrefix overrides the auto-generated s3:// prefix # for Azure custom storage providers using ABFS protocol. 
diff --git a/tests/values/dataplane.azure.yaml b/tests/values/dataplane.azure.yaml index 495564e3..708ccb9a 100644 --- a/tests/values/dataplane.azure.yaml +++ b/tests/values/dataplane.azure.yaml @@ -1,3 +1,4 @@ +# helm-values: values-test-certs.yaml # Testing ../../charts/dataplane.values.azure.yaml global: diff --git a/tests/values/dataplane.cost.yaml b/tests/values/dataplane.cost.yaml index f85f8f6f..10d44aa3 100644 --- a/tests/values/dataplane.cost.yaml +++ b/tests/values/dataplane.cost.yaml @@ -1,3 +1,4 @@ +# helm-values: values-test-certs.yaml cost: enabled: true serviceMonitor: diff --git a/tests/values/dataplane.dcgm-exporter.yaml b/tests/values/dataplane.dcgm-exporter.yaml index eca0b3fd..7d9011e9 100644 --- a/tests/values/dataplane.dcgm-exporter.yaml +++ b/tests/values/dataplane.dcgm-exporter.yaml @@ -1,3 +1,4 @@ +# helm-values: values-test-certs.yaml dcgm-exporter: enabled: true diff --git a/tests/values/dataplane.fully-selfhosted.yaml b/tests/values/dataplane.fully-selfhosted.yaml index 778b162d..bc87f450 100644 --- a/tests/values/dataplane.fully-selfhosted.yaml +++ b/tests/values/dataplane.fully-selfhosted.yaml @@ -1,3 +1,4 @@ +# helm-values: values-test-certs.yaml # Test namespace_mapping cascading to all services namespace_mapping: template: '{{`{{ project }}`}}-{{`{{ domain }}`}}' diff --git a/tests/values/dataplane.gcp.yaml b/tests/values/dataplane.gcp.yaml new file mode 100644 index 00000000..31eea929 --- /dev/null +++ b/tests/values/dataplane.gcp.yaml @@ -0,0 +1,26 @@ +# helm-values: values-test-certs.yaml +host: byok.us-west-2.union.ai +clusterName: test-e2e-gcp +orgName: byok +provider: gcp +storage: + provider: gcs + bucketName: test-gcp-bucket + fastRegistrationBucketName: test-gcp-bucket + region: us-central1 + enableMultiContainer: true + gcp: + projectId: test-gcp-project-123 +secrets: + admin: + create: true + clientId: byok-test-e2e-gcp-operator + clientSecret: test-not-real-secret +additionalServiceAccountAnnotations: + 
iam.gke.io/gcp-service-account: 'union-backend@test-gcp-project-123.iam.gserviceaccount.com' +userRoleAnnotationKey: iam.gke.io/gcp-service-account +userRoleAnnotationValue: union-worker@test-gcp-project-123.iam.gserviceaccount.com +fluentbit: + serviceAccount: + annotations: + iam.gke.io/gcp-service-account: union-backend@test-gcp-project-123.iam.gserviceaccount.com diff --git a/tests/values/dataplane.low-priv.yaml b/tests/values/dataplane.low-priv.yaml index 0760b2c8..83bca934 100644 --- a/tests/values/dataplane.low-priv.yaml +++ b/tests/values/dataplane.low-priv.yaml @@ -1,4 +1,4 @@ -# helm-values: values-low-privilege.yaml +# helm-values: values-test-certs.yaml host: union.us-west-2.union.ai clusterName: my-cluster orgName: union @@ -11,6 +11,8 @@ storage: bucketName: bucket fastRegistrationBucketName: bucket #This can be the same as bucketName region: us-west-2 #region where your S3 bucket is configured +imageBuilder: + defaultRepository: "localhost:5000/union-dataplane" secrets: admin: create: true diff --git a/tests/values/dataplane.monitoring.yaml b/tests/values/dataplane.monitoring.yaml index 9922584d..8e6676c0 100644 --- a/tests/values/dataplane.monitoring.yaml +++ b/tests/values/dataplane.monitoring.yaml @@ -1,3 +1,4 @@ +# helm-values: values-test-certs.yaml monitoring: enabled: true alerting: diff --git a/tests/values/dataplane.nodeobserver.yaml b/tests/values/dataplane.nodeobserver.yaml index e9f410ae..6832d3de 100644 --- a/tests/values/dataplane.nodeobserver.yaml +++ b/tests/values/dataplane.nodeobserver.yaml @@ -1,3 +1,4 @@ +# helm-values: values-test-certs.yaml nodeobserver: enabled: true diff --git a/tests/values/dataplane.oci.yaml b/tests/values/dataplane.oci.yaml index e7197c34..6b1d635e 100644 --- a/tests/values/dataplane.oci.yaml +++ b/tests/values/dataplane.oci.yaml @@ -1,3 +1,4 @@ +# helm-values: values-test-certs.yaml host: union.us-west-2.union.ai clusterName: union-oci orgName: union