From e3ca8d725386cc648d491bd8c8e439c40c8f4923 Mon Sep 17 00:00:00 2001 From: Michael Hotan Date: Sat, 2 May 2026 14:51:29 +1000 Subject: [PATCH 1/6] Add v2-focused controlplane and dataplane overview dashboards MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Create separate v2 dashboards that remove v1-only panels and add v2-specific metrics. Existing v1 dashboards are unchanged. Controlplane V2 (union-controlplane-v2-overview): - Executions row: remove v1 handle_create_op, handle_ack_op, workqueue panels. Add CreateRun Rate (connect counter), CreateRun Latency (connect histogram), V2 Run Methods by service. - Relabel apps panels: "Apps — Pending Assignments", "Apps — First Ack Latency" - Keep all shared rows: FlyteAdmin, Cluster Service, Queue, CacheService, Authorizer, Data Proxy, Usage, Infrastructure Dataplane V2 (union-dataplane-v2-overview): - Remove Flyte Propeller (V1) row entirely - Health: remove Active Workflows and Queue Depth (propeller) - SLOs: replace Propeller Latency p99 with Executor Evaluate Duration - Keep: Operator, Executor, gRPC Client, Infrastructure Co-Authored-By: Claude Opus 4.6 (1M context) --- .../union-controlplane-v2-overview.json | 3648 +++++++++++++++++ .../union-dataplane-v2-overview.json | 1441 +++++++ 2 files changed, 5089 insertions(+) create mode 100644 charts/controlplane/dashboards/union-controlplane-v2-overview.json create mode 100644 charts/dataplane/dashboards/union-dataplane-v2-overview.json diff --git a/charts/controlplane/dashboards/union-controlplane-v2-overview.json b/charts/controlplane/dashboards/union-controlplane-v2-overview.json new file mode 100644 index 00000000..64ae63d4 --- /dev/null +++ b/charts/controlplane/dashboards/union-controlplane-v2-overview.json @@ -0,0 +1,3648 @@ +{ + "annotations": { + "list": [] + }, + "description": "Union Controlplane health and service metrics", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "links": [], + 
"panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "title": "Health", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 0.5 + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "percentunit", + "min": 0, + "max": 1 + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 1 + }, + "id": 2, + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "auto" + }, + "title": "Service Availability", + "type": "stat", + "targets": [ + { + "expr": "avg(kube_deployment_status_replicas_available{namespace=\"$namespace\"} / kube_deployment_spec_replicas{namespace=\"$namespace\"})", + "legendFormat": "Availability", + "refId": "A" + } + ], + "description": "Percentage of deployments with all requested replicas available. 1.0 = all healthy." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 3 + }, + { + "color": "red", + "value": 10 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 1 + }, + "id": 3, + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "auto" + }, + "title": "Pod Restarts (1h)", + "type": "stat", + "targets": [ + { + "expr": "sum(increase(kube_pod_container_status_restarts_total{namespace=\"$namespace\"}[1h]))", + "legendFormat": "Restarts", + "refId": "A" + } + ], + "description": "Total container restarts in the last hour. 
Non-zero indicates crashlooping or OOM kills." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 0.01 + }, + { + "color": "red", + "value": 0.05 + } + ] + }, + "unit": "percentunit", + "min": 0, + "max": 1 + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 1 + }, + "id": 7, + "options": { + "colorMode": "background", + "graphMode": "area", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "auto" + }, + "title": "Connect Error Rate", + "type": "stat", + "targets": [ + { + "expr": "sum(rate(connect:server_requests_handled_total{namespace=\"$namespace\", code!~\"0|OK|Canceled|NotFound\"}[$__rate_interval])) / sum(rate(connect:server_requests_handled_total{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "Error %", + "refId": "A" + } + ], + "description": "Fraction of Connect RPC responses with error codes (excluding OK, Canceled, and NotFound) across all CP services."
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never", + "stacking": { + "mode": "none" + } + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 5 + }, + "id": 4, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "title": "Connect Request Rate by Service", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (service) (rate(connect:server_requests_handled_total{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "{{ service }}", + "refId": "A" + } + ], + "description": "Connect protocol request throughput broken down by service (e.g. ExecutionService, ClusterService)." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 5 + }, + "id": 5, + "options": { + "legend": { + "calcs": [ + "mean" + ], + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "title": "Connect Errors by Code", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (code) (rate(connect:server_requests_handled_total{namespace=\"$namespace\", code!~\"0|OK|Canceled|NotFound\"}[$__rate_interval]))", + "legendFormat": "{{ code }}", + "refId": "A" + } + ], + "description": "Connect error responses by gRPC status code (Internal, Unavailable, etc.)." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 1 + }, + "id": 11, + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "value_and_name" + }, + "title": "Handler Panics", + "type": "stat", + "targets": [ + { + "expr": "sum(authorizer:handler_panic{namespace=\"$namespace\"} + cluster:handler_panic{namespace=\"$namespace\"} + dataproxy:handler_panic{namespace=\"$namespace\"} + executions:handler_panic{namespace=\"$namespace\"} + queue:handler_panic{namespace=\"$namespace\"} + usage:handler_panic{namespace=\"$namespace\"})", + "legendFormat": "Total", + "refId": "A" + } + ], + "description": "Total handler panics across all CP services. Any non-zero value indicates a service caught a panic during request handling." 
+ }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 9 + }, + "id": 1200, + "title": "SLOs", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 0.99 + }, + { + "color": "green", + "value": 0.999 + } + ] + }, + "unit": "percentunit", + "decimals": 3 + } + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 10 + }, + "id": 1201, + "title": "Service Availability", + "type": "stat", + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "value" + }, + "targets": [ + { + "expr": "avg(kube_deployment_status_replicas_available{namespace=\"$namespace\"} / kube_deployment_spec_replicas{namespace=\"$namespace\"})", + "refId": "A" + } + ], + "description": "Current service availability across all deployments." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": -999 + }, + { + "color": "orange", + "value": 0 + }, + { + "color": "green", + "value": 0.5 + } + ] + }, + "unit": "percentunit", + "decimals": 1, + "noValue": "N/A" + } + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 6, + "y": 10 + }, + "id": 1202, + "title": "Error Budget Remaining", + "type": "stat", + "options": { + "colorMode": "background", + "graphMode": "area", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "value" + }, + "targets": [ + { + "expr": "union:cp:slo:error_budget_remaining", + "refId": "A" + } + ], + "description": "Fraction of error budget remaining. <0 = budget exhausted. 
Requires monitoring.slos.enabled." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 0 + }, + { + "color": "orange", + "value": 0.95 + }, + { + "color": "green", + "value": 0.999 + } + ] + }, + "unit": "percentunit", + "decimals": 2, + "noValue": "N/A" + } + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 12, + "y": 10 + }, + "id": 1203, + "title": "Ingress Success Rate", + "type": "stat", + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "value" + }, + "targets": [ + { + "expr": "union:cp:slo:ingress_success_rate or (1 - sum(rate(nginx_ingress_controller_request_duration_seconds_count{namespace=\"$namespace\", status=~\"5..\"}[5m])) / sum(rate(nginx_ingress_controller_request_duration_seconds_count{namespace=\"$namespace\"}[5m])))", + "refId": "A" + } + ], + "description": "Ingress success rate (non-5xx). Customer-facing SLO metric. Falls back to raw metric if SLO recording rules are not enabled." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 2 + }, + { + "color": "red", + "value": 5 + } + ] + }, + "unit": "s", + "decimals": 2 + } + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 18, + "y": 10 + }, + "id": 1204, + "title": "Ingress Latency p99", + "type": "stat", + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "value" + }, + "targets": [ + { + "expr": "union:cp:slo:ingress_latency_p99 or histogram_quantile(0.99, sum by (le) (rate(nginx_ingress_controller_request_duration_seconds_bucket{namespace=\"$namespace\"}[5m])))", + "refId": "A" + } + ], + "description": "Ingress p99 latency. Falls back to raw metric if SLO recording rules are not enabled." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "percentunit" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 1205, + "title": "Availability Over Time", + "type": "timeseries", + "targets": [ + { + "expr": "avg(kube_deployment_status_replicas_available{namespace=\"$namespace\"} / kube_deployment_spec_replicas{namespace=\"$namespace\"})", + "legendFormat": "Availability", + "refId": "A" + }, + { + "expr": "vector(0.999)", + "legendFormat": "Target (99.9%)", + "refId": "B" + } + ], + "description": "Service availability over time with SLO target line." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "percentunit", + "max": 1, + "min": -0.5 + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 1206, + "title": "Error Budget Burn Rate", + "type": "timeseries", + "targets": [ + { + "expr": "union:cp:slo:error_budget_remaining", + "legendFormat": "Budget remaining", + "refId": "A" + }, + { + "expr": "vector(0)", + "legendFormat": "Exhausted", + "refId": "B" + } + ], + "description": "Error budget remaining over time. Requires monitoring.slos.enabled." + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 31 + }, + "id": 100, + "title": "Ingress (nginx)", + "type": "row", + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never", + "stacking": { + "mode": "normal" + } + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 10 + }, + "id": 101, + "title": "Request Rate by Path", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (host, path) (rate(nginx_ingress_controller_request_duration_seconds_count{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "{{ host }}{{ path }}", + "refId": "A" + } + ], + "description": "Ingress request rate broken down by host and URL path." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never", + "stacking": { + "mode": "normal" + } + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 10 + }, + "id": 102, + "title": "Error Rate by Status Code", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (status) (rate(nginx_ingress_controller_request_duration_seconds_count{namespace=\"$namespace\", status=~\"[45]..\"}[$__rate_interval]))", + "legendFormat": "{{ status }}", + "refId": "A" + } + ], + "description": "4xx and 5xx error rates from ingress-nginx by HTTP status code." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "s" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 18 + }, + "id": 103, + "title": "Latency p50 / p95 / p99", + "type": "timeseries", + "targets": [ + { + "expr": "histogram_quantile(0.50, sum by (le) (rate(nginx_ingress_controller_request_duration_seconds_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "p50", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.95, sum by (le) (rate(nginx_ingress_controller_request_duration_seconds_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "p95", + "refId": "B" + }, + { + "expr": "histogram_quantile(0.99, sum by (le) (rate(nginx_ingress_controller_request_duration_seconds_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "p99", + "refId": "C" + } + ], + "description": "Ingress request latency percentiles. Includes TLS + routing + upstream response time." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 18 + }, + "id": 104, + "title": "Active Connections", + "type": "timeseries", + "targets": [ + { + "expr": "sum(nginx_ingress_controller_nginx_process_connections{namespace=\"$namespace\"})", + "legendFormat": "Active", + "refId": "A" + } + ], + "description": "Current number of active client connections to ingress-nginx." + } + ] + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 32 + }, + "id": 200, + "title": "Connect / gRPC", + "type": "row", + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never", + "stacking": { + "mode": "normal" + } + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 11 + }, + "id": 201, + "title": "Connect Request Rate by Service", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (service) (rate(connect:server_requests_handled_total{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "{{ service }}", + "refId": "A" + } + ], + "description": "Connect protocol request throughput broken down by service (e.g. ExecutionService, ClusterService)." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 11 + }, + "id": 202, + "title": "Connect Errors by Service & Code", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (service, code) (rate(connect:server_requests_handled_total{namespace=\"$namespace\", code!~\"0|OK|Canceled|NotFound\"}[$__rate_interval]))", + "legendFormat": "{{ service }} {{ code }}", + "refId": "A" + } + ], + "description": "Connect errors broken down by service and gRPC code. Identifies which services are erroring." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 19 + }, + "id": 203, + "title": "gRPC Server Request Rate (CacheService)", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (grpc_service, grpc_method) (rate(grpc_server_handled_total{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "{{ grpc_service }}/{{ grpc_method }}", + "refId": "A" + } + ], + "description": "CacheService is the only CP service using gRPC (not Connect). Shows Get/Put/Delete/Reservation rates." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 19 + }, + "id": 204, + "title": "gRPC Server Errors (CacheService)", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (grpc_method, grpc_code) (rate(grpc_server_handled_total{namespace=\"$namespace\", grpc_code!=\"OK\"}[$__rate_interval]))", + "legendFormat": "{{ grpc_method }} {{ grpc_code }}", + "refId": "A" + } + ], + "description": "CacheService gRPC errors by method and code." + } + ] + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 33 + }, + "id": 300, + "title": "FlyteAdmin", + "type": "row", + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 12 + }, + "id": 301, + "title": "Active Executions", + "type": "timeseries", + "targets": [ + { + "expr": "flyte:admin:execution_manager:active_executions{namespace=\"$namespace\"}", + "legendFormat": "Workflows", + "refId": "A" + }, + { + "expr": "flyte:admin:node_execution_manager:active_node_executions{namespace=\"$namespace\"}", + "legendFormat": "Nodes", + "refId": "B" + }, + { + "expr": "flyte:admin:task_execution_manager:active_executions{namespace=\"$namespace\"}", + "legendFormat": "Tasks", + "refId": "C" + } + ], + "description": "Current count of active workflow, node, and task executions tracked by FlyteAdmin." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 12 + }, + "id": 302, + "title": "Execution Create / Event Rate", + "type": "timeseries", + "targets": [ + { + "expr": "rate(flyte:admin:execution_manager:executions_created{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Executions created", + "refId": "A" + }, + { + "expr": "rate(flyte:admin:execution_manager:execution_events_created{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Workflow events", + "refId": "B" + }, + { + "expr": "rate(flyte:admin:node_execution_manager:node_execution_events_created{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Node events", + "refId": "C" + }, + { + "expr": "rate(flyte:admin:task_execution_manager:task_execution_events_created{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Task events", + "refId": "D" + } + ], + "description": "Rate of execution creations and event ingestion (workflow, node, task events from propeller)." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 12 + }, + "id": 303, + "title": "Errors", + "type": "timeseries", + "targets": [ + { + "expr": "rate(flyte:admin:execution_manager:propeller_failures{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Propeller failures", + "refId": "A" + }, + { + "expr": "rate(flyte:admin:execution_manager:transformer_error{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Transformer errors", + "refId": "B" + }, + { + "expr": "rate(flyte:admin:execution_manager:publish_error{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Publish errors", + "refId": "C" + }, + { + "expr": "rate(flyte:admin:execution_manager:execution_termination_failure{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Termination failures", + "refId": "D" + } + ], + "description": "FlyteAdmin error rates: propeller communication failures, model transform errors, notification publish failures, execution termination failures."
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "s" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 20 + }, + "id": 304, + "title": "Endpoint Latency (p95)", + "type": "timeseries", + "targets": [ + { + "expr": "flyte:admin:create_execution:duration_ms{namespace=\"$namespace\", quantile=\"0.95\"} / 1000", + "legendFormat": "CreateExecution", + "refId": "A" + }, + { + "expr": "flyte:admin:create_execution_event:duration_ms{namespace=\"$namespace\", quantile=\"0.95\"} / 1000", + "legendFormat": "CreateExecutionEvent", + "refId": "B" + }, + { + "expr": "flyte:admin:get_execution:duration_ms{namespace=\"$namespace\", quantile=\"0.95\"} / 1000", + "legendFormat": "GetExecution", + "refId": "C" + }, + { + "expr": "flyte:admin:list_execution:duration_ms{namespace=\"$namespace\", quantile=\"0.95\"} / 1000", + "legendFormat": "ListExecution", + "refId": "D" + } + ], + "description": "FlyteAdmin gRPC endpoint latency at p95. Key endpoints: CreateExecution, CreateExecutionEvent, GetExecution." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 20 + }, + "id": 305, + "title": "Auth Middleware Decisions", + "type": "timeseries", + "targets": [ + { + "expr": "rate(flyte:middleware:authorization:authz_approved{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Approved", + "refId": "A" + }, + { + "expr": "rate(flyte:middleware:authorization:authz_denied{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Denied", + "refId": "B" + } + ], + "description": "Authorization approve/deny rate from the FlyteAdmin auth middleware. High deny rate may indicate auth misconfiguration." + } + ] + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 34 + }, + "id": 400, + "title": "Executions (V2)", + "type": "row", + "panels": [ + { + "title": "CreateRun Rate", + "type": "timeseries", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "unit": "reqps" + } + }, + "targets": [ + { + "expr": "sum(rate(connect:server_requests_handled_total{namespace=\"$namespace\", method=\"CreateRun\"}[$__rate_interval]))", + "legendFormat": "CreateRun", + "refId": "A" + } + ], + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 0 + } + }, + { + "title": "CreateRun Latency (p50 / p95 / p99)", + "type": "timeseries", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "unit": "s" + } + }, + "targets": [ + { + "expr": "histogram_quantile(0.50, sum by (le) (rate(connect:server_request_duration_seconds_bucket{namespace=\"$namespace\", method=\"CreateRun\"}[$__rate_interval])))", + "legendFormat": "p50", + "refId": "A" + }, + { + "expr":
"histogram_quantile(0.95, sum by (le) (rate(connect:server_request_duration_seconds_bucket{namespace=\"$namespace\", method=\"CreateRun\"}[$__rate_interval])))", + "legendFormat": "p95", + "refId": "B" + }, + { + "expr": "histogram_quantile(0.99, sum by (le) (rate(connect:server_request_duration_seconds_bucket{namespace=\"$namespace\", method=\"CreateRun\"}[$__rate_interval])))", + "legendFormat": "p99", + "refId": "C" + } + ], + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 0 + } + }, + { + "title": "V2 Run Methods", + "type": "timeseries", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "unit": "reqps" + } + }, + "targets": [ + { + "expr": "sum by (method) (rate(connect:server_requests_handled_total{namespace=\"$namespace\", service=~\".*RunService.*\"}[$__rate_interval]))", + "legendFormat": "{{method}}", + "refId": "A" + } + ], + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 0 + } + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 20 + }, + "id": 405, + "title": "DB Operation Rate", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (op) (rate(label_replace({__name__=~\"executions:database:postgres:repositories:execution_ops:.*_count\", namespace=\"$namespace\"}, \"op\", \"$1\", \"__name__\", \"executions:database:postgres:repositories:execution_ops:(.*)_count\")[$__rate_interval:]))", + "legendFormat": "{{ op }}", + "refId": "A" + } + ], + "description": "Execution operations DB call rate by operation: create, ack, claim, unclaim, get, update."
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 20 + }, + "id": 406, + "title": "DB Errors", + "type": "timeseries", + "targets": [ + { + "expr": "rate(executions:database:postgres:errors:gorm_error{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "gorm_error", + "refId": "A" + }, + { + "expr": "rate(executions:database:postgres:errors:postgres_error{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "postgres_error", + "refId": "B" + }, + { + "expr": "rate(executions:database:postgres:errors:not_found{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "not_found", + "refId": "C" + } + ], + "description": "Executions service Postgres error rates by type: gorm errors, native postgres errors, not-found." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 28 + }, + "id": 407, + "title": "Cluster Cache Hit/Miss", + "type": "timeseries", + "targets": [ + { + "expr": "rate(executions:executions:list_clusters:hits{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Cluster hits", + "refId": "A" + }, + { + "expr": "rate(executions:executions:list_clusters:miss{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Cluster miss", + "refId": "B" + }, + { + "expr": "rate(executions:executions:list_nodepools:hits{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Nodepool hits", + "refId": "C" + }, + { + "expr": "rate(executions:executions:list_nodepools:miss{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Nodepool miss", + "refId": "D" + } + ], + "description": "Cluster and nodepool list cache effectiveness. High miss rate = excessive DB queries." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 28 + }, + "id": 408, + "title": "Apps \u2014 Pending Assignments", + "type": "timeseries", + "targets": [ + { + "expr": "executions:app:leaser:pending_assignment_unlabeled{namespace=\"$namespace\"}", + "legendFormat": "Pending", + "refId": "A" + } + ], + "description": "Number of apps waiting for cluster assignment. Growing backlog = scheduling bottleneck." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "s" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 36 + }, + "id": 409, + "title": "Apps \u2014 First Ack Latency", + "type": "timeseries", + "targets": [ + { + "expr": "histogram_quantile(0.50, sum by (le) (rate(executions:app:service:first_ack_latency_unlabeled_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "p50", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.95, sum by (le) (rate(executions:app:service:first_ack_latency_unlabeled_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "p95", + "refId": "B" + }, + { + "expr": "histogram_quantile(0.99, sum by (le) (rate(executions:app:service:first_ack_latency_unlabeled_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "p99", + "refId": "C" + } + ], + "description": "Key V2 SLI: time to deliver an app to the dataplane. Measures end-to-end scheduling latency." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 36 + }, + "id": 410, + "title": "V2 Run Dispatch", + "type": "timeseries", + "targets": [ + { + "expr": "rate(executions:run:runs_sent{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Runs sent", + "refId": "A" + }, + { + "expr": "rate(executions:run:actions_sent{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Actions sent", + "refId": "B" + }, + { + "expr": "rate(executions:run:enqueue_action_failures{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Enqueue failures", + "refId": "C" + } + ], + "description": "V2 run/action dispatch throughput. Enqueue failures indicate queue service issues." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 36 + }, + "id": 411, + "title": "V2 Run Notifier", + "type": "timeseries", + "targets": [ + { + "expr": "rate(executions:run_notifier:notifications_sent{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Notifications/s", + "refId": "A" + }, + { + "expr": "executions:run_notifier:subscribers{namespace=\"$namespace\"}", + "legendFormat": "Subscribers", + "refId": "B" + }, + { + "expr": "rate(executions:run:logs:tail_logs_bytes_read{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Log bytes/s", + "refId": "C" + } + ], + "description": "V2 notification pipeline: notifications sent per second, active subscribers, log 
bytes streamed." + } + ] + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 34 + }, + "id": 500, + "title": "Queue / Run-Scheduler", + "type": "row", + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 13 + }, + "id": 501, + "title": "Metadata Store Counts", + "type": "timeseries", + "targets": [ + { + "expr": "queue:metadata_store:total_run_count{namespace=\"$namespace\"}", + "legendFormat": "Total runs", + "refId": "A" + }, + { + "expr": "queue:metadata_store:total_action_count{namespace=\"$namespace\"}", + "legendFormat": "Total actions", + "refId": "B" + }, + { + "expr": "queue:metadata_store:scheduled_run_count{namespace=\"$namespace\"}", + "legendFormat": "Scheduled runs", + "refId": "C" + }, + { + "expr": "queue:metadata_store:scheduled_action_count{namespace=\"$namespace\"}", + "legendFormat": "Scheduled actions", + "refId": "D" + } + ], + "description": "Total and scheduled run/action counts in the queue. Shows system load." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 13 + }, + "id": 502, + "title": "Scheduler / Runner / Aborter Throughput", + "type": "timeseries", + "targets": [ + { + "expr": "rate(queue:scheduler:enqueued_leases{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Enqueued", + "refId": "A" + }, + { + "expr": "rate(queue:runner:completed_leases{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Completed", + "refId": "B" + }, + { + "expr": "rate(queue:aborter:aborted_leases{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Aborted", + "refId": "C" + } + ], + "description": "Lease lifecycle throughput: enqueued (new), completed (done), aborted (cancelled)." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 13 + }, + "id": 503, + "title": "Queue Lengths", + "type": "timeseries", + "targets": [ + { + "expr": "queue:scheduler:input_queue_length{namespace=\"$namespace\"}", + "legendFormat": "Scheduler input", + "refId": "A" + }, + { + "expr": "queue:runner:input_queue_length{namespace=\"$namespace\"}", + "legendFormat": "Runner input", + "refId": "B" + }, + { + "expr": "queue:aborter:input_queue_length{namespace=\"$namespace\"}", + "legendFormat": "Aborter input", + "refId": "C" + }, + { + "expr": "queue:dispatcher:chain_queue_length{namespace=\"$namespace\"}", + "legendFormat": "Dispatcher chain", + "refId": "D" + }, + { + "expr": "queue:db:queue_length{namespace=\"$namespace\"}", + "legendFormat": "DB queue", + "refId": "E" + } + ], + "description": "Internal queue depths across scheduler, runner, aborter, dispatcher, and DB worker pool. Growing = backpressure." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "s" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 21 + }, + "id": 504, + "title": "Dispatcher Operation Duration (p99)", + "type": "timeseries", + "targets": [ + { + "expr": "histogram_quantile(0.99, sum by (type, le) (rate(queue:dispatcher:operation_duration_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "{{ type }}", + "refId": "A" + } + ], + "description": "Dispatcher multi-step operation chain execution time at p99, by operation type." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "s" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 21 + }, + "id": 505, + "title": "State Get/Put Duration (p99)", + "type": "timeseries", + "targets": [ + { + "expr": "histogram_quantile(0.99, sum by (le) (rate(queue:state:get_duration_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "Get p99", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.99, sum by (le) (rate(queue:state:put_duration_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "Put p99", + "refId": "B" + } + ], + "description": "In-memory state store operation latency. Backed by ScyllaDB persistence." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 21 + }, + "id": 506, + "title": "State Cache & Eventer", + "type": "timeseries", + "targets": [ + { + "expr": "queue:state:active_states{namespace=\"$namespace\"}", + "legendFormat": "Active states", + "refId": "A" + }, + { + "expr": "queue:state:terminal_states{namespace=\"$namespace\"}", + "legendFormat": "Terminal states", + "refId": "B" + }, + { + "expr": "rate(queue:eventer:record_action_errors{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Eventer errors", + "refId": "C" + } + ], + "description": "Active/terminal state counts and eventer error rate. Eventer reports action status to executions service." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 29 + }, + "id": 507, + "title": "Worker Capacity", + "type": "timeseries", + "targets": [ + { + "expr": "queue:scheduler:worker_capacity{namespace=\"$namespace\"}", + "legendFormat": "{{ worker_name }}", + "refId": "A" + } + ], + "description": "Remaining execution capacity per connected DP worker. Zero = worker saturated." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 29 + }, + "id": 508, + "title": "Dispatcher Failures by Type", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (type) (rate(queue:dispatcher:operation_failures{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "{{ type }}", + "refId": "A" + } + ], + "description": "Failed dispatcher operations by Go type. Indicates internal queue service errors." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 29 + }, + "id": 509, + "title": "DB & Client Thread Pool", + "type": "timeseries", + "targets": [ + { + "expr": "queue:db:free_threads{namespace=\"$namespace\"}", + "legendFormat": "DB free threads", + "refId": "A" + }, + { + "expr": "queue:queue_client:free_threads{namespace=\"$namespace\"}", + "legendFormat": "Queue client free", + "refId": "B" + }, + { + "expr": "queue:state_client:free_threads{namespace=\"$namespace\"}", + "legendFormat": "State client free", + "refId": "C" + } + ], + "description": "Idle worker goroutines in DB, queue-client, and state-client pools. Zero = all threads busy." + } + ] + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 35 + }, + "id": 600, + "title": "Cluster Service", + "type": "row", + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 14 + }, + "id": 601, + "title": "UpdateStatus / Heartbeat Rate", + "type": "timeseries", + "targets": [ + { + "expr": "rate(cluster:svc:update_status:updates_total{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "UpdateStatus", + "refId": "A" + }, + { + "expr": "rate(cluster:svc:heartbeat:success_ms_count{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Heartbeat", + "refId": "B" + } + ], + "description": "Rate of DP cluster status updates and heartbeats received by the cluster service." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "s" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 14 + }, + "id": 602, + "title": "Cluster API Latency (p95)", + "type": "timeseries", + "targets": [ + { + "expr": "cluster:svc:update_status:success_ms{namespace=\"$namespace\", quantile=\"0.95\"} / 1000", + "legendFormat": "UpdateStatus p95", + "refId": "A" + }, + { + "expr": "cluster:svc:heartbeat:success_ms{namespace=\"$namespace\", quantile=\"0.95\"} / 1000", + "legendFormat": "Heartbeat p95", + "refId": "B" + } + ], + "description": "Cluster service RPC latency for UpdateStatus and Heartbeat calls." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 14 + }, + "id": 603, + "title": "Operator / Propeller Restarts (from DP)", + "type": "timeseries", + "targets": [ + { + "expr": "cluster:svc:update_status:operator_restarts{namespace=\"$namespace\"}", + "legendFormat": "Operator restarts", + "refId": "A" + }, + { + "expr": "cluster:svc:update_status:propeller_restarts{namespace=\"$namespace\"}", + "legendFormat": "Propeller restarts", + "refId": "B" + } + ], + "description": "DP-reported restart counts for operator and propeller pods. Set by DP on each UpdateStatus call." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 22 + }, + "id": 604, + "title": "DB Errors by Type", + "type": "timeseries", + "targets": [ + { + "expr": "rate(cluster:database:postgres:errors:gorm_error{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "gorm_error", + "refId": "A" + }, + { + "expr": "rate(cluster:database:postgres:errors:postgres_error{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "postgres_error", + "refId": "B" + }, + { + "expr": "rate(cluster:database:postgres:errors:not_found{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "not_found", + "refId": "C" + } + ], + "description": "Cluster service Postgres error rates by type: gorm errors, native postgres errors, not-found." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "short", + "mappings": [ + { + "type": "value", + "options": { + "0": { + "text": "Healthy", + "color": "green" + }, + "1": { + "text": "Unhealthy", + "color": "red" + } + } + } + ] + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 22 + }, + "id": 605, + "title": "Cluster Health Status", + "type": "timeseries", + "targets": [ + { + "expr": "cluster:cluster_sync:health:unhealthy{namespace=\"$namespace\", subsystem=\"\"}", + "legendFormat": "{{ org }}/{{ cluster_name }}", + "refId": "A" + } + ], + "description": "Cluster health collector: 1=unhealthy, 0=healthy. Emitted per cluster on every Prometheus scrape." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "s" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 30 + }, + "id": 606, + "title": "Last Heartbeat Age (stale cluster detection)", + "type": "timeseries", + "targets": [ + { + "expr": "cluster:cluster_sync:health:last_update_age{namespace=\"$namespace\"}", + "legendFormat": "{{ org }}/{{ cluster_name }}", + "refId": "A" + } + ], + "description": "Seconds since each cluster last sent a heartbeat. High values = stale/disconnected cluster." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 30 + }, + "id": 607, + "title": "Managed Cluster Cache", + "type": "timeseries", + "targets": [ + { + "expr": "rate(cluster:managed_cluster_client_cache:get:hits{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Cache hits", + "refId": "A" + }, + { + "expr": "rate(cluster:managed_cluster_client_cache:get:miss{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Cache miss", + "refId": "B" + } + ], + "description": "LRU cache hit/miss rate for managed cluster lookups. High miss rate = excessive DB queries." 
+ } + ] + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 36 + }, + "id": 900, + "title": "CacheService", + "type": "row", + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 15 + }, + "id": 901, + "title": "Cache Hit / Miss Rate", + "type": "timeseries", + "targets": [ + { + "expr": "rate(flyte:cacheservice:cache:cache_hit_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Hits", + "refId": "A" + }, + { + "expr": "rate(flyte:cacheservice:cache:not_found_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Misses", + "refId": "B" + }, + { + "expr": "rate(flyte:cacheservice:cache:get_failure_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Get failures", + "refId": "C" + } + ], + "description": "CacheService hit/miss rate. Hits = cached task output reused. Misses = task must execute. Get failures = storage errors. 
[Metrics pending: requires cloud service instrumentation to be deployed]" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 15 + }, + "id": 902, + "title": "Reservation Contention & Operations", + "type": "timeseries", + "targets": [ + { + "expr": "rate(flyte:cacheservice:cache:reservation_contention_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Contention", + "refId": "A" + }, + { + "expr": "rate(flyte:cacheservice:cache:get_reservation_success_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Reservation acquired", + "refId": "B" + }, + { + "expr": "rate(flyte:cacheservice:cache:release_reservation_success_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Reservation released", + "refId": "C" + } + ], + "description": "Cache reservation contention: how often workers are blocked waiting for another worker's cache computation. High contention = many workers computing the same task. 
[Metrics pending: requires cloud service instrumentation to be deployed]" + } + ] + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 36 + }, + "id": 750, + "title": "Authorizer", + "type": "row", + "panels": [ + { + "id": 760, + "title": "Authorizer Mode", + "type": "stat", + "gridPos": { + "h": 4, + "w": 4, + "x": 0, + "y": 37 + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "textMode": "name", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "targets": [ + { + "expr": "authorizer:authorizer:cloudauthorizer:connect:authz_type_info{namespace=\"$namespace\"} == 1", + "legendFormat": "{{type}}", + "refId": "A" + } + ], + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + } + }, + { + "id": 751, + "title": "Allow / Deny Rate", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 10, + "x": 4, + "y": 37 + }, + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "spanNulls": false + }, + "noValue": "0", + "unit": "ops", + "decimals": 2 + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": ".*denied.*" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": ".*allowed.*" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + } + ] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "mean" + ] + }, + "tooltip": { + "mode": "multi" + } + }, + "targets": [ + { + "expr": "sum by (identity_type) 
(rate(authorizer:authorizer:cloudauthorizer:connect:authz_allowed{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "allowed ({{identity_type}})", + "refId": "A" + }, + { + "expr": "sum by (identity_type) (rate(authorizer:authorizer:cloudauthorizer:connect:authz_denied{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "denied ({{identity_type}})", + "refId": "B" + } + ], + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + } + }, + { + "id": 753, + "title": "Deny Rate (%)", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 10, + "x": 14, + "y": 37 + }, + "fieldConfig": { + "defaults": { + "unit": "percentunit", + "custom": { + "drawStyle": "line", + "fillOpacity": 10 + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 0.1 + }, + { + "color": "red", + "value": 0.5 + } + ] + }, + "noValue": "0", + "decimals": 1, + "min": 0, + "max": 1 + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "targets": [ + { + "expr": "sum by (identity_type) (rate(authorizer:authorizer:cloudauthorizer:connect:authz_denied{namespace=\"$namespace\"}[$__rate_interval])) / clamp_min(sum by (identity_type) (rate(authorizer:authorizer:cloudauthorizer:connect:authz_denied{namespace=\"$namespace\"}[$__rate_interval])) + (sum by (identity_type) (rate(authorizer:authorizer:cloudauthorizer:connect:authz_allowed{namespace=\"$namespace\"}[$__rate_interval])) or sum by (identity_type) (rate(authorizer:authorizer:cloudauthorizer:connect:authz_denied{namespace=\"$namespace\"}[$__rate_interval])) * 0), 1e-10)", + "legendFormat": "{{identity_type}}", + "refId": "A" + } + ], + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + } + }, + { + "id": 752, + "title": "Authorize Latency (service)", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 45 + }, + "fieldConfig": { + "defaults": { + "unit": "ms", + "custom": { + "drawStyle": "line", + 
"fillOpacity": 10 + }, + "noValue": "0", + "decimals": 1, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 50 + }, + { + "color": "red", + "value": 200 + } + ] + } + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "targets": [ + { + "expr": "authorizer:authorizer:cloudauthorizer:connect:authorize_duration_ms{namespace=\"$namespace\", quantile=\"0.5\"}", + "legendFormat": "p50", + "refId": "A" + }, + { + "expr": "authorizer:authorizer:cloudauthorizer:connect:authorize_duration_ms{namespace=\"$namespace\", quantile=\"0.9\"}", + "legendFormat": "p90", + "refId": "B" + }, + { + "expr": "authorizer:authorizer:cloudauthorizer:connect:authorize_duration_ms{namespace=\"$namespace\", quantile=\"0.99\"}", + "legendFormat": "p99", + "refId": "C" + } + ], + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + } + }, + { + "id": 761, + "title": "Backend Latency", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 45 + }, + "fieldConfig": { + "defaults": { + "unit": "ms", + "custom": { + "drawStyle": "line", + "fillOpacity": 10 + }, + "noValue": "0", + "decimals": 1, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 50 + }, + { + "color": "red", + "value": 200 + } + ] + } + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "targets": [ + { + "expr": "histogram_quantile(0.50, sum by (le) (rate(authorizer:authorizer:cloudauthorizer:connect:backend_authorize_duration_ms_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "p50", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.95, sum by (le) 
(rate(authorizer:authorizer:cloudauthorizer:connect:backend_authorize_duration_ms_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "p95", + "refId": "B" + }, + { + "expr": "histogram_quantile(0.99, sum by (le) (rate(authorizer:authorizer:cloudauthorizer:connect:backend_authorize_duration_ms_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "p99", + "refId": "C" + } + ], + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + } + }, + { + "id": 764, + "title": "Decisions by Action", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 45 + }, + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "bars", + "fillOpacity": 50, + "stacking": { + "mode": "normal" + } + }, + "noValue": "0", + "unit": "ops", + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "sum" + ] + }, + "tooltip": { + "mode": "multi" + } + }, + "targets": [ + { + "expr": "sum by (action, identity_type) (rate(authorizer:authorizer:cloudauthorizer:connect:authz_allowed{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "{{action}} {{identity_type}} (allowed)", + "refId": "A" + }, + { + "expr": "sum by (action, identity_type) (rate(authorizer:authorizer:cloudauthorizer:connect:authz_denied{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "{{action}} {{identity_type}} (denied)", + "refId": "B" + } + ], + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + } + }, + { + "id": 762, + "title": "Backend Errors", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 53 + }, + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 10 + }, + "noValue": "0", + "unit": "ops", + "decimals": 2, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 0.01 + } + ] + 
} + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "targets": [ + { + "expr": "sum by (error_type) (rate(authorizer:authorizer:cloudauthorizer:connect:backend_authorize_errors{namespace=\"$namespace\"}[$__rate_interval])) or vector(0)", + "legendFormat": "{{error_type}}", + "refId": "A" + } + ], + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + } + }, + { + "id": 765, + "title": "Error Attribution", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 53 + }, + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 10 + }, + "noValue": "0", + "unit": "ops", + "decimals": 2, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 0.01 + } + ] + } + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "targets": [ + { + "expr": "sum by (error_source) (rate(authorizer:authorizer:cloudauthorizer:connect:authorize_errors_total{namespace=\"$namespace\"}[$__rate_interval])) or vector(0)", + "legendFormat": "{{error_source}}", + "refId": "A" + } + ], + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + } + }, + { + "id": 763, + "title": "Fail-Open Activations", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 53 + }, + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 10 + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 0.001 + } + ] + }, + "noValue": "0", + "unit": "ops", + "decimals": 2 + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "targets": [ + { + 
"expr": "rate(authorizer:authorizer:cloudauthorizer:connect:external:fail_open_activated{namespace=\"$namespace\"}[$__rate_interval]) or vector(0)", + "legendFormat": "fail-open", + "refId": "A" + } + ], + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + } + } + ] + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 36 + }, + "id": 700, + "title": "Data Proxy", + "type": "row", + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 15 + }, + "id": 701, + "title": "Cache Hit/Miss Rates", + "type": "timeseries", + "targets": [ + { + "expr": "rate(dataproxy:domains:hits{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Domain hits", + "refId": "A" + }, + { + "expr": "rate(dataproxy:domains:miss{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Domain miss", + "refId": "B" + }, + { + "expr": "rate(dataproxy:clusterpoolcache:hits{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "ClusterPool hits", + "refId": "C" + }, + { + "expr": "rate(dataproxy:clusterpoolcache:miss{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "ClusterPool miss", + "refId": "D" + } + ], + "description": "DataProxy internal cache effectiveness for domain resolution, cluster pool routing, and namespace mapping." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "s" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 15 + }, + "id": 702, + "title": "Image Read Latency (p95)", + "type": "timeseries", + "targets": [ + { + "expr": "histogram_quantile(0.95, sum by (le) (rate(dataproxy:images:read:success_ms_count{namespace=\"$namespace\"}[$__rate_interval]))) / 1000", + "legendFormat": "Success p95", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.95, sum by (le) (rate(dataproxy:images:read:failure_ms_count{namespace=\"$namespace\"}[$__rate_interval]))) / 1000", + "legendFormat": "Failure p95", + "refId": "B" + } + ], + "description": "Time to read image metadata from the dataplane, proxied through DataProxy." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 23 + }, + "id": 703, + "title": "Secret Proxy Errors by Cluster", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (cluster, operation) (rate(dataproxy:secrets_service:cluster_errors{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "{{ cluster }} {{ operation }}", + "refId": "A" + } + ], + "description": "Per-cluster secret proxy errors during fan-out operations. Identifies which dataplane cluster is causing failures. 
[Metrics pending: requires cloud service instrumentation to be deployed]" + } + ] + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 37 + }, + "id": 800, + "title": "Usage Service", + "type": "row", + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 801, + "title": "Billable Usage Reports", + "type": "timeseries", + "targets": [ + { + "expr": "rate(usage:svc:report_billable_usage{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Reports/s", + "refId": "A" + } + ], + "description": "Rate of ReportBillableUsage calls from DP clusters. Each call reports resource consumption for billing." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 802, + "title": "Message Pipeline", + "type": "timeseries", + "targets": [ + { + "expr": "rate(usage:messages:messages_received{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Received", + "refId": "A" + }, + { + "expr": "rate(usage:messages:messages_sent{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Sent", + "refId": "B" + }, + { + "expr": "rate(usage:messages:messages_dropped{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Dropped", + "refId": "C" + }, + { + "expr": "rate(usage:messages:messages_failed{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Failed", + "refId": "D" + } + ], + 
"description": "SQS/queue message processing: received, sent (success), failed, dropped (max retries exceeded)." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never", + "stacking": { + "mode": "normal" + } + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 24 + }, + "id": 803, + "title": "Messages by Type (success)", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (message_type) (rate(usage:messages:messages_processed{namespace=\"$namespace\", outcome=\"success\"}[$__rate_interval]))", + "legendFormat": "{{ message_type }}", + "refId": "A" + } + ], + "description": "Successful message processing rate by type: node_execution, workflow_execution, billable_usage, serverless_billable_usage. [Metrics pending: requires cloud service instrumentation to be deployed]" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ms" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 24 + }, + "id": 804, + "title": "Message Processing Latency", + "type": "timeseries", + "targets": [ + { + "expr": "usage:messages:processing_time_ms{namespace=\"$namespace\", quantile=\"0.5\"}", + "legendFormat": "p50", + "refId": "A" + }, + { + "expr": "usage:messages:processing_time_ms{namespace=\"$namespace\", quantile=\"0.9\"}", + "legendFormat": "p90", + "refId": "B" + }, + { + "expr": "usage:messages:processing_time_ms{namespace=\"$namespace\", quantile=\"0.99\"}", + "legendFormat": "p99", + "refId": "C" + } + ], + "description": "Time to process individual queue messages. 
Slow processing may indicate backend API issues (Metronome, timestream). [Metrics pending: requires cloud service instrumentation to be deployed]" + } + ] + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 38 + }, + "id": 1100, + "title": "Infrastructure", + "type": "row", + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never", + "stacking": { + "mode": "normal" + } + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 17 + }, + "id": 1101, + "title": "CPU Usage by Service", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (container) (rate(container_cpu_usage_seconds_total{namespace=\"$namespace\", container!=\"\", container!=\"POD\"}[$__rate_interval]))", + "legendFormat": "{{ container }}", + "refId": "A" + } + ], + "description": "CPU usage in cores per container, stacked. Identifies resource-heavy services." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never", + "stacking": { + "mode": "normal" + } + }, + "unit": "bytes" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 17 + }, + "id": 1102, + "title": "Memory Usage by Service", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (container) (container_memory_working_set_bytes{namespace=\"$namespace\", container!=\"\", container!=\"POD\"})", + "legendFormat": "{{ container }}", + "refId": "A" + } + ], + "description": "Working set memory per container, stacked. Watch for approaching limits." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "bars", + "fillOpacity": 80, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 25 + }, + "id": 1103, + "title": "Pod Restart Count by Container", + "type": "timeseries", + "targets": [ + { + "expr": "increase(kube_pod_container_status_restarts_total{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "{{ pod }}/{{ container }}", + "refId": "A" + } + ], + "description": "Per-container restart events. Spikes indicate crashes or OOM kills." + } + ] + } + ], + "schemaVersion": 39, + "tags": [ + "union", + "controlplane" + ], + "templating": { + "list": [ + { + "current": {}, + "hide": 0, + "includeAll": false, + "label": "Data Source", + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "type": "datasource" + }, + { + "current": { + "selected": true, + "text": "__NAMESPACE__", + "value": "__NAMESPACE__" + }, + "hide": 2, + "label": "Namespace", + "name": "namespace", + "options": [ + { + "selected": true, + "text": "__NAMESPACE__", + "value": "__NAMESPACE__" + } + ], + "query": "__NAMESPACE__", + "type": "constant" + } + ] + }, + "time": { + "from": "now-3h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Union Controlplane V2 Overview", + "uid": "union-controlplane-v2-overview", + "version": 2 +} diff --git a/charts/dataplane/dashboards/union-dataplane-v2-overview.json b/charts/dataplane/dashboards/union-dataplane-v2-overview.json new file mode 100644 index 00000000..84ac227c --- /dev/null +++ b/charts/dataplane/dashboards/union-dataplane-v2-overview.json @@ -0,0 +1,1441 @@ +{ + "annotations": { + "list": [] + }, + "description": "Union Dataplane health and service metrics", + "editable": true, + 
"fiscalYearStartMonth": 0, + "graphTooltip": 1, + "links": [], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "title": "Health", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 0.5 + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "percentunit", + "min": 0, + "max": 1 + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 1 + }, + "id": 2, + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "auto" + }, + "title": "Service Availability", + "type": "stat", + "targets": [ + { + "expr": "avg(kube_deployment_status_replicas_available{namespace=\"$namespace\"} / kube_deployment_spec_replicas{namespace=\"$namespace\"})", + "legendFormat": "Availability", + "refId": "A" + } + ], + "description": "Percentage of deployments with all requested replicas available. 1.0 = all healthy." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 3 + }, + { + "color": "red", + "value": 10 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 1 + }, + "id": 3, + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "auto" + }, + "title": "Pod Restarts (1h)", + "type": "stat", + "targets": [ + { + "expr": "sum(increase(kube_pod_container_status_restarts_total{namespace=\"$namespace\"}[1h]))", + "legendFormat": "Restarts", + "refId": "A" + } + ], + "description": "Total container restarts in the last hour. Non-zero indicates crashlooping or OOM kills." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 5 + }, + "id": 10, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "title": "Active Executions", + "type": "timeseries", + "targets": [ + { + "expr": "sum(flyte:propeller:all:execstats:active_workflow_executions{namespace=\"$namespace\"})", + "legendFormat": "Workflows", + "refId": "A" + }, + { + "expr": "sum(flyte:propeller:all:execstats:active_node_executions{namespace=\"$namespace\"})", + "legendFormat": "Nodes", + "refId": "B" + }, + { + "expr": "sum(flyte:propeller:all:execstats:active_task_executions{namespace=\"$namespace\"})", + "legendFormat": "Tasks", + "refId": "C" + } + ], + "description": 
"Current count of active workflow, node, and task executions tracked by FlyteAdmin." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 1 + }, + "id": 11, + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "value_and_name" + }, + "title": "Handler Panics", + "type": "stat", + "targets": [ + { + "expr": "sum(executor:handler_panic{namespace=\"$namespace\"})", + "legendFormat": "Total", + "refId": "A" + } + ], + "description": "Total handler panics in DP services. Any non-zero value indicates a service caught a panic during request handling." + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 9 + }, + "id": 1200, + "title": "SLOs", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 0.99 + }, + { + "color": "green", + "value": 0.999 + } + ] + }, + "unit": "percentunit", + "decimals": 3 + } + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 10 + }, + "id": 1201, + "title": "Service Availability", + "type": "stat", + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "value" + }, + "targets": [ + { + "expr": "avg(kube_deployment_status_replicas_available{namespace=\"$namespace\"} / kube_deployment_spec_replicas{namespace=\"$namespace\"})", + "refId": "A" + } + ], + "description": "Current service availability across all DP 
deployments." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": -999 + }, + { + "color": "orange", + "value": 0 + }, + { + "color": "green", + "value": 0.5 + } + ] + }, + "unit": "percentunit", + "decimals": 1, + "noValue": "N/A" + } + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 6, + "y": 10 + }, + "id": 1202, + "title": "Error Budget Remaining", + "type": "stat", + "options": { + "colorMode": "background", + "graphMode": "area", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "value" + }, + "targets": [ + { + "expr": "union:dp:slo:error_budget_remaining", + "refId": "A" + } + ], + "description": "Fraction of error budget remaining. Requires monitoring.slos.enabled." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 0 + }, + { + "color": "orange", + "value": 0.95 + }, + { + "color": "green", + "value": 0.999 + } + ] + }, + "unit": "percentunit", + "decimals": 2, + "noValue": "N/A" + } + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 12, + "y": 10 + }, + "id": 1203, + "title": "Execution Success Rate", + "type": "stat", + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "value" + }, + "targets": [ + { + "expr": "union:dp:slo:execution_success_rate or (union:dp:slo:propeller_success_rate + union:dp:slo:executor_success_rate) / 2 or union:dp:slo:propeller_success_rate or vector(1)", + "refId": "A" + } + ], + "description": "Combined V1 (propeller) and V2 (executor) task success rate. 
Falls back to propeller-only or 100% when idle." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 2 + }, + { + "color": "red", + "value": 5 + } + ] + }, + "unit": "s", + "decimals": 2 + } + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 18, + "y": 10 + }, + "id": 1204, + "title": "Executor Evaluate Duration p99", + "type": "stat", + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "value" + }, + "targets": [ + { + "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.99\"}", + "legendFormat": "p99", + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + } + } + ], + "description": "V2 executor evaluate duration at p99, read directly from the raw executor:evaluator:evaluate_duration series (quantile 0.99)." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "percentunit" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 1205, + "title": "Availability Over Time", + "type": "timeseries", + "targets": [ + { + "expr": "avg(kube_deployment_status_replicas_available{namespace=\"$namespace\"} / kube_deployment_spec_replicas{namespace=\"$namespace\"})", + "legendFormat": "Availability", + "refId": "A" + }, + { + "expr": "vector(0.999)", + "legendFormat": "Target (99.9%)", + "refId": "B" + } + ], + "description": "DP service availability over time with SLO target line."
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "percentunit", + "max": 1, + "min": -0.5 + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 1206, + "title": "Error Budget Burn Rate", + "type": "timeseries", + "targets": [ + { + "expr": "union:dp:slo:error_budget_remaining", + "legendFormat": "Budget remaining", + "refId": "A" + }, + { + "expr": "vector(0)", + "legendFormat": "Exhausted", + "refId": "B" + } + ], + "description": "DP error budget remaining over time. Requires monitoring.slos.enabled." + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 31 + }, + "id": 200, + "title": "Union Operator", + "type": "row", + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 10 + }, + "id": 201, + "title": "Work Queue Operations", + "type": "timeseries", + "targets": [ + { + "expr": "rate(union_operator:work_queue:operations_processed{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Processed", + "refId": "A" + }, + { + "expr": "rate(union_operator:work_queue:operations_failed{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Failed", + "refId": "B" + } + ], + "description": "Operator execution operation processing rate and failure rate." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 10 + }, + "id": 202, + "title": "Background Process Runs / Errors", + "type": "timeseries", + "targets": [ + { + "expr": "rate(union_operator:heartbeat_updater:runs{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Heartbeat runs", + "refId": "A" + }, + { + "expr": "rate(union_operator:heartbeat_updater:run_errors{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Heartbeat errors", + "refId": "B" + }, + { + "expr": "rate(union_operator:status_updater:runs{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Status runs", + "refId": "C" + }, + { + "expr": "rate(union_operator:status_updater:run_errors{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Status errors", + "refId": "D" + }, + { + "expr": "rate(union_operator:prometheus_health_checker:run_errors{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Prom health errors", + "refId": "E" + } + ], + "description": "Operator background process run and error rates: heartbeat updater, status updater, and Prometheus health checker (errors only)."
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ms" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 10 + }, + "id": 203, + "title": "Heartbeat Latency", + "type": "timeseries", + "targets": [ + { + "expr": "union_operator:heartbeat:compute_capabilities_ms{namespace=\"$namespace\", quantile=\"0.9\"}", + "legendFormat": "Capabilities p90", + "refId": "A" + }, + { + "expr": "union_operator:heartbeat:compute_usages_ms{namespace=\"$namespace\", quantile=\"0.9\"}", + "legendFormat": "Usages p90", + "refId": "B" + }, + { + "expr": "union_operator:heartbeat:list_workflows_ms{namespace=\"$namespace\", quantile=\"0.9\"}", + "legendFormat": "List WFs p90", + "refId": "C" + } + ], + "description": "Breakdown of operator heartbeat computation: capabilities, usages, list workflows." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 18 + }, + "id": 204, + "title": "Config Syncer", + "type": "timeseries", + "targets": [ + { + "expr": "rate(union_operator:config_syncer:runs{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Sync runs", + "refId": "A" + }, + { + "expr": "rate(union_operator:config_syncer:run_errors{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Sync errors", + "refId": "B" + }, + { + "expr": "rate(union_operator:config_syncer:propeller_configmap_updated{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Propeller CM updated", + "refId": "C" + } + ], + "description": "Config sync cycle rate, errors, and propeller ConfigMap update count." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 18 + }, + "id": 205, + "title": "Billable Usage Collector", + "type": "timeseries", + "targets": [ + { + "expr": "rate(union_operator:billable_usage_collector:runs{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Runs", + "refId": "A" + }, + { + "expr": "rate(union_operator:billable_usage_collector:run_errors{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Errors", + "refId": "B" + } + ], + "description": "Billing collection cycle rate and errors. Failures mean billing data may be delayed." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "bool_yes_no" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 18 + }, + "id": 206, + "title": "Work Queue Paused", + "type": "stat", + "options": { + "colorMode": "background", + "graphMode": "area", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "textMode": "auto" + }, + "targets": [ + { + "expr": "union_operator:work_queue:paused{namespace=\"$namespace\"}", + "legendFormat": "Paused", + "refId": "A" + } + ], + "description": "1 when operator paused due to resource limits (FlyteWorkflow count or storage exceeded)." + } + ] + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 32 + }, + "id": 300, + "title": "Executor (V2)", + "type": "row", + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 11 + }, + "id": 301, + "title": "Active Actions & Capacity", + "type": "timeseries", + "targets": [ + { + "expr": "executor:active_actions_count{namespace=\"$namespace\"}", + "legendFormat": "Active actions", + "refId": "A" + }, + { + "expr": "executor:available_capacity{namespace=\"$namespace\"}", + "legendFormat": "Available capacity", + "refId": "B" + } + ], + "description": "V2 executor active actions vs available capacity. Capacity=0 means executor is saturated." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 11 + }, + "id": 302, + "title": "Cache Discovery", + "type": "timeseries", + "targets": [ + { + "expr": "rate(executor:discovery_miss_count{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Miss", + "refId": "A" + }, + { + "expr": "rate(executor:discovery_put_success_count{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Put success", + "refId": "B" + } + ], + "description": "V2 executor cache discovery miss/put rates for task output caching." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never", + "stacking": { + "mode": "normal" + } + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 19 + }, + "id": 303, + "title": "Actions Terminated by Phase", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (phase) (rate(executor:actions_terminated{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "{{ phase }}", + "refId": "A" + } + ], + "description": "Task completion rate by outcome: succeeded, failed, aborted. Key V2 SLI for task health. 
[Metrics pending: requires cloud service instrumentation to be deployed]" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ms" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 19 + }, + "id": 304, + "title": "Evaluator Duration (pod creation)", + "type": "timeseries", + "targets": [ + { + "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.5\"}", + "legendFormat": "Evaluate p50", + "refId": "A" + }, + { + "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.9\"}", + "legendFormat": "Evaluate p90", + "refId": "B" + }, + { + "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.99\"}", + "legendFormat": "Evaluate p99", + "refId": "C" + } + ], + "description": "Time spent in RecursiveNodeHandler (pod creation). Dominant component of V2 task latency. 
[Metrics pending: requires cloud service instrumentation to be deployed]" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 19 + }, + "id": 305, + "title": "System Failures & Invalid Leases", + "type": "timeseries", + "targets": [ + { + "expr": "rate(executor:system_failures{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "System failures", + "refId": "A" + }, + { + "expr": "rate(executor:system_failures_exhausted{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Exhausted retries", + "refId": "B" + }, + { + "expr": "rate(executor:invalid_leases{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Invalid leases", + "refId": "C" + }, + { + "expr": "rate(executor:evaluator:evaluate_errors{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "Evaluate errors", + "refId": "D" + } + ], + "description": "Executor error rates. System failures retry; exhausted means task permanently failed. Invalid leases = malformed from queue service. 
[Metrics pending: requires cloud service instrumentation to be deployed]" + } + ] + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 34 + }, + "id": 400, + "title": "gRPC Client (DP \u2192 CP)", + "type": "row", + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 13 + }, + "id": 401, + "title": "gRPC Client Request Rate", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (grpc_service, grpc_method) (rate(grpc_client_handled_total{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "{{ grpc_service }}/{{ grpc_method }}", + "refId": "A" + } + ], + "description": "DP\u2192CP gRPC client request rate by service and method (e.g. CreateWorkflowEvent)." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 13 + }, + "id": 402, + "title": "gRPC Client Error Rate", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (grpc_service, grpc_method, grpc_code) (rate(grpc_client_handled_total{namespace=\"$namespace\", grpc_code!=\"OK\"}[$__rate_interval]))", + "legendFormat": "{{ grpc_method }} {{ grpc_code }}", + "refId": "A" + } + ], + "description": "DP\u2192CP gRPC errors by method and code. Non-OK responses from the control plane." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "s" + } + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 21 + }, + "id": 403, + "title": "gRPC Client Latency (p95)", + "type": "timeseries", + "targets": [ + { + "expr": "histogram_quantile(0.95, sum by (le, grpc_method) (rate(grpc_client_handling_seconds_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "{{ grpc_method }} p95", + "refId": "A" + } + ], + "description": "DP\u2192CP gRPC call latency at p95. High latency = slow control plane or network issues." + } + ] + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 35 + }, + "id": 1100, + "title": "Infrastructure", + "type": "row", + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never", + "stacking": { + "mode": "none" + } + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 14 + }, + "id": 1101, + "title": "CPU Usage by Service", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (container) (rate(container_cpu_usage_seconds_total{namespace=\"$namespace\", container!=\"\", container!=\"POD\"}[$__rate_interval]))", + "legendFormat": "{{ container }}", + "refId": "A" + } + ], + "description": "CPU usage in cores per container, stacked. Identifies resource-heavy services." 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 1, + "showPoints": "never", + "stacking": { + "mode": "none" + } + }, + "unit": "bytes" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 14 + }, + "id": 1102, + "title": "Memory Usage by Service", + "type": "timeseries", + "targets": [ + { + "expr": "sum by (container) (container_memory_working_set_bytes{namespace=\"$namespace\", container!=\"\", container!=\"POD\"})", + "legendFormat": "{{ container }}", + "refId": "A" + } + ], + "description": "Working set memory per container. Watch for approaching limits." + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "drawStyle": "bars", + "fillOpacity": 80, + "lineWidth": 1, + "showPoints": "never" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 22 + }, + "id": 1103, + "title": "Pod Restart Count by Container", + "type": "timeseries", + "targets": [ + { + "expr": "increase(kube_pod_container_status_restarts_total{namespace=\"$namespace\"}[$__rate_interval])", + "legendFormat": "{{ pod }}/{{ container }}", + "refId": "A" + } + ], + "description": "Per-container restart events. Spikes indicate crashes or OOM kills." 
+ } + ] + } + ], + "schemaVersion": 39, + "tags": [ + "union", + "dataplane" + ], + "templating": { + "list": [ + { + "current": {}, + "hide": 0, + "includeAll": false, + "label": "Data Source", + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "type": "datasource" + }, + { + "current": { + "selected": true, + "text": "__NAMESPACE__", + "value": "__NAMESPACE__" + }, + "hide": 2, + "label": "Namespace", + "name": "namespace", + "options": [ + { + "selected": true, + "text": "__NAMESPACE__", + "value": "__NAMESPACE__" + } + ], + "query": "__NAMESPACE__", + "type": "constant" + } + ] + }, + "time": { + "from": "now-3h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Union Dataplane V2 Overview", + "uid": "union-dataplane-v2-overview", + "version": 1 +} From 6e95540635814e652fadfcf3b258f42f9e07b6e3 Mon Sep 17 00:00:00 2001 From: Michael Hotan Date: Sat, 2 May 2026 18:50:30 +1000 Subject: [PATCH 2/6] Replace v1 dashboards with v2 content Remove v1-only panels from the shipped dashboards rather than maintaining separate v1 and v2 files. Keeps the same filenames and dashboard titles so existing references continue to work. Note that the controlplane dashboard UID changes from union-cp-overview to union-controlplane-overview, so bookmarks and Grafana links that embed the old UID must be updated. The v1 dashboard triage (whether to create a separate legacy dashboard) is tracked as a separate Linear issue.
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../union-controlplane-overview.json | 170 +- .../union-controlplane-v2-overview.json | 3648 ----------------- .../dashboards/union-dataplane-overview.json | 681 +-- .../union-dataplane-v2-overview.json | 1441 ------- 4 files changed, 49 insertions(+), 5891 deletions(-) delete mode 100644 charts/controlplane/dashboards/union-controlplane-v2-overview.json delete mode 100644 charts/dataplane/dashboards/union-dataplane-v2-overview.json diff --git a/charts/controlplane/dashboards/union-controlplane-overview.json b/charts/controlplane/dashboards/union-controlplane-overview.json index e37e0cb9..a044a3f0 100644 --- a/charts/controlplane/dashboards/union-controlplane-overview.json +++ b/charts/controlplane/dashboards/union-controlplane-overview.json @@ -1301,186 +1301,96 @@ "y": 34 }, "id": 400, - "title": "Executions", + "title": "Executions (V2)", "type": "row", "panels": [ { + "title": "CreateRun Rate", + "type": "timeseries", "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ops" + "unit": "reqps" } }, - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 12 - }, - "id": 401, - "title": "Execution Create / Ack Rate", - "type": "timeseries", "targets": [ { - "expr": "rate(executions:executions:handle_create_op_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Create", + "expr": "sum(rate(connect:server_requests_handled_total{namespace=\"$namespace\", method=\"CreateRun\"}[$__rate_interval]))", + "legendFormat": "CreateRun", "refId": "A" - }, - { - "expr": "rate(executions:executions:handle_ack_op_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Ack", - "refId": "B" } ], - "description": "Rate of execution operation creates and acknowledgements. 
Create = new execution request, Ack = DP confirmed receipt." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "s" - } - }, "gridPos": { "h": 8, "w": 8, - "x": 8, - "y": 12 - }, - "id": 402, - "title": "Execution Create / Ack Latency (p95)", - "type": "timeseries", - "targets": [ - { - "expr": "histogram_quantile(0.95, sum by (le) (rate(executions:executions:handle_create_op_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "Create p95", - "refId": "A" - }, - { - "expr": "histogram_quantile(0.95, sum by (le) (rate(executions:executions:handle_ack_op_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "Ack p95", - "refId": "B" - } - ], - "description": "Time to prepare create/ack execution requests at p95." + "x": 0, + "y": 0 + } }, { + "title": "CreateRun Latency (p50 / p95 / p99)", + "type": "timeseries", "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, "unit": "s" } }, - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 12 - }, - "id": 403, - "title": "Assignment Duration (p50 / p90)", - "type": "timeseries", "targets": [ { - "expr": "histogram_quantile(0.50, sum by (le) (rate(executions:workqueue:announce_cluster_assignment_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "expr": "histogram_quantile(0.50, sum by (le) (rate(connect:server_request_duration_seconds_bucket{namespace=\"$namespace\", method=\"CreateRun\"}[$__rate_interval])))", "legendFormat": "p50", "refId": "A" }, { - "expr": "histogram_quantile(0.90, sum by (le) 
(rate(executions:workqueue:announce_cluster_assignment_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "p90", + "expr": "histogram_quantile(0.95, sum by (le) (rate(connect:server_request_duration_seconds_bucket{namespace=\"$namespace\", method=\"CreateRun\"}[$__rate_interval])))", + "legendFormat": "p95", "refId": "B" + }, + { + "expr": "histogram_quantile(0.99, sum by (le) (rate(connect:server_request_duration_seconds_bucket{namespace=\"$namespace\", method=\"CreateRun\"}[$__rate_interval])))", + "legendFormat": "p99", + "refId": "C" } ], - "description": "Key SLI: end-to-end time from execution create to cluster assignment. Custom buckets from 10ms to 20min." + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 0 + } }, { + "title": "V2 Run Methods", + "type": "timeseries", "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ops" + "unit": "reqps" } }, - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 20 - }, - "id": 404, - "title": "Workqueue Operations", - "type": "timeseries", "targets": [ { - "expr": "rate(executions:workqueue:send_operation_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Send ops", + "expr": "sum by (method) (rate(connect:server_requests_handled_total{namespace=\"$namespace\", service=~\".*RunService.*\"}[$__rate_interval]))", + "legendFormat": "{{method}}", "refId": "A" - }, - { - "expr": "rate(executions:workqueue:claim_operations{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Claims", - "refId": "B" - }, - { - "expr": "rate(executions:workqueue:send_operation_failures{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Send failures", - "refId": "C" - }, - { - "expr": 
"rate(executions:workqueue:claim_operation_failures{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Claim failures", - "refId": "D" } ], - "description": "Execution operation send/claim rates and failures. Send = dispatch to DP, Claim = pick up from DB." + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 0 + } }, { "datasource": { @@ -1644,7 +1554,7 @@ "y": 28 }, "id": 408, - "title": "Pending Assignments", + "title": "Apps \u2014 Pending Assignments", "type": "timeseries", "targets": [ { @@ -1681,7 +1591,7 @@ "y": 36 }, "id": 409, - "title": "First Ack Latency (V2 SLI)", + "title": "Apps \u2014 First Ack Latency", "type": "timeseries", "targets": [ { @@ -3733,6 +3643,6 @@ "timepicker": {}, "timezone": "browser", "title": "Union Controlplane Overview", - "uid": "union-cp-overview", + "uid": "union-controlplane-overview", "version": 2 -} \ No newline at end of file +} diff --git a/charts/controlplane/dashboards/union-controlplane-v2-overview.json b/charts/controlplane/dashboards/union-controlplane-v2-overview.json deleted file mode 100644 index 64ae63d4..00000000 --- a/charts/controlplane/dashboards/union-controlplane-v2-overview.json +++ /dev/null @@ -1,3648 +0,0 @@ -{ - "annotations": { - "list": [] - }, - "description": "Union Controlplane health and service metrics", - "editable": true, - "fiscalYearStartMonth": 0, - "graphTooltip": 1, - "links": [], - "panels": [ - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 0 - }, - "id": 1, - "title": "Health", - "type": "row" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "thresholds": { - "steps": [ - { - "color": "red", - "value": null - }, - { - "color": "orange", - "value": 0.5 - }, - { - "color": "green", - "value": 1 - } - ] - }, - "unit": "percentunit", - "min": 0, - "max": 1 - } - }, - "gridPos": { - "h": 4, - "w": 6, - "x": 0, - "y": 1 - }, - "id": 2, - 
"options": { - "colorMode": "background", - "graphMode": "none", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ] - }, - "textMode": "auto" - }, - "title": "Service Availability", - "type": "stat", - "targets": [ - { - "expr": "avg(kube_deployment_status_replicas_available{namespace=\"$namespace\"} / kube_deployment_spec_replicas{namespace=\"$namespace\"})", - "legendFormat": "Availability", - "refId": "A" - } - ], - "description": "Percentage of deployments with all requested replicas available. 1.0 = all healthy." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "thresholds": { - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "orange", - "value": 3 - }, - { - "color": "red", - "value": 10 - } - ] - }, - "unit": "short" - } - }, - "gridPos": { - "h": 4, - "w": 6, - "x": 6, - "y": 1 - }, - "id": 3, - "options": { - "colorMode": "background", - "graphMode": "none", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ] - }, - "textMode": "auto" - }, - "title": "Pod Restarts (1h)", - "type": "stat", - "targets": [ - { - "expr": "sum(increase(kube_pod_container_status_restarts_total{namespace=\"$namespace\"}[1h]))", - "legendFormat": "Restarts", - "refId": "A" - } - ], - "description": "Total container restarts in the last hour. Non-zero indicates crashlooping or OOM kills." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "thresholds": { - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "orange", - "value": 0.01 - }, - { - "color": "red", - "value": 0.05 - } - ] - }, - "unit": "percentunit", - "min": 0, - "max": 1 - } - }, - "gridPos": { - "h": 4, - "w": 6, - "x": 18, - "y": 1 - }, - "id": 7, - "options": { - "colorMode": "background", - "graphMode": "area", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ] - }, - "textMode": "auto" - }, - "title": "Connect Error Rate", - "type": "stat", - "targets": [ - { - "expr": "sum(rate(connect:server_requests_handled_total{namespace=\"$namespace\", code!~\"0|OK|Canceled|NotFound\"}[$__rate_interval])) / sum(rate(connect:server_requests_handled_total{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Error %", - "refId": "A" - } - ], - "description": "Fraction of Connect RPC responses with non-OK/non-Canceled codes across all CP services." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never", - "stacking": { - "mode": "none" - } - }, - "unit": "reqps" - } - }, - "gridPos": { - "h": 6, - "w": 12, - "x": 0, - "y": 5 - }, - "id": 4, - "options": { - "legend": { - "calcs": [ - "mean", - "max" - ], - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "multi", - "sort": "desc" - } - }, - "title": "Connect Request Rate by Service", - "type": "timeseries", - "targets": [ - { - "expr": "sum by (service) (rate(connect:server_requests_handled_total{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "{{ service }}", - "refId": "A" - } - ], - "description": "Connect protocol request throughput broken down by service (e.g. ExecutionService, ClusterService)." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "reqps" - } - }, - "gridPos": { - "h": 6, - "w": 12, - "x": 12, - "y": 5 - }, - "id": 5, - "options": { - "legend": { - "calcs": [ - "mean" - ], - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "multi", - "sort": "desc" - } - }, - "title": "Connect Errors by Code", - "type": "timeseries", - "targets": [ - { - "expr": "sum by (code) (rate(connect:server_requests_handled_total{namespace=\"$namespace\", code!~\"0|OK|Canceled|NotFound\"}[$__rate_interval]))", - "legendFormat": "{{ code }}", - "refId": "A" - } - ], - "description": "Connect error responses by gRPC status code (Internal, Unavailable, etc.)." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "thresholds": { - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 1 - } - ] - }, - "unit": "short" - } - }, - "gridPos": { - "h": 4, - "w": 6, - "x": 12, - "y": 1 - }, - "id": 11, - "options": { - "colorMode": "background", - "graphMode": "none", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ] - }, - "textMode": "value_and_name" - }, - "title": "Handler Panics", - "type": "stat", - "targets": [ - { - "expr": "sum(authorizer:handler_panic{namespace=\"$namespace\"} + cluster:handler_panic{namespace=\"$namespace\"} + dataproxy:handler_panic{namespace=\"$namespace\"} + executions:handler_panic{namespace=\"$namespace\"} + queue:handler_panic{namespace=\"$namespace\"} + usage:handler_panic{namespace=\"$namespace\"})", - "legendFormat": "Total", - "refId": "A" - } - ], - "description": "Total handler panics across all CP services. Any non-zero value indicates a service caught a panic during request handling." 
- }, - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 9 - }, - "id": 1200, - "title": "SLOs", - "type": "row" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "thresholds": { - "steps": [ - { - "color": "red", - "value": null - }, - { - "color": "orange", - "value": 0.99 - }, - { - "color": "green", - "value": 0.999 - } - ] - }, - "unit": "percentunit", - "decimals": 3 - } - }, - "gridPos": { - "h": 6, - "w": 6, - "x": 0, - "y": 10 - }, - "id": 1201, - "title": "Service Availability", - "type": "stat", - "options": { - "colorMode": "background", - "graphMode": "none", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ] - }, - "textMode": "value" - }, - "targets": [ - { - "expr": "avg(kube_deployment_status_replicas_available{namespace=\"$namespace\"} / kube_deployment_spec_replicas{namespace=\"$namespace\"})", - "refId": "A" - } - ], - "description": "Current service availability across all deployments." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": -999 - }, - { - "color": "orange", - "value": 0 - }, - { - "color": "green", - "value": 0.5 - } - ] - }, - "unit": "percentunit", - "decimals": 1, - "noValue": "N/A" - } - }, - "gridPos": { - "h": 6, - "w": 6, - "x": 6, - "y": 10 - }, - "id": 1202, - "title": "Error Budget Remaining", - "type": "stat", - "options": { - "colorMode": "background", - "graphMode": "area", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ] - }, - "textMode": "value" - }, - "targets": [ - { - "expr": "union:cp:slo:error_budget_remaining", - "refId": "A" - } - ], - "description": "Fraction of error budget remaining. <0 = budget exhausted. 
Requires monitoring.slos.enabled." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 0 - }, - { - "color": "orange", - "value": 0.95 - }, - { - "color": "green", - "value": 0.999 - } - ] - }, - "unit": "percentunit", - "decimals": 2, - "noValue": "N/A" - } - }, - "gridPos": { - "h": 6, - "w": 6, - "x": 12, - "y": 10 - }, - "id": 1203, - "title": "Ingress Success Rate", - "type": "stat", - "options": { - "colorMode": "background", - "graphMode": "none", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ] - }, - "textMode": "value" - }, - "targets": [ - { - "expr": "union:cp:slo:ingress_success_rate or (1 - sum(rate(nginx_ingress_controller_request_duration_seconds_count{namespace=\"$namespace\", status=~\"5..\"}[5m])) / sum(rate(nginx_ingress_controller_request_duration_seconds_count{namespace=\"$namespace\"}[5m])))", - "refId": "A" - } - ], - "description": "Ingress success rate (non-5xx). Customer-facing SLO metric. Falls back to raw metric if SLO recording rules are not enabled." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "thresholds": { - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "orange", - "value": 2 - }, - { - "color": "red", - "value": 5 - } - ] - }, - "unit": "s", - "decimals": 2 - } - }, - "gridPos": { - "h": 6, - "w": 6, - "x": 18, - "y": 10 - }, - "id": 1204, - "title": "Ingress Latency p99", - "type": "stat", - "options": { - "colorMode": "background", - "graphMode": "none", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ] - }, - "textMode": "value" - }, - "targets": [ - { - "expr": "union:cp:slo:ingress_latency_p99 or histogram_quantile(0.99, sum by (le) (rate(nginx_ingress_controller_request_duration_seconds_bucket{namespace=\"$namespace\"}[5m])))", - "refId": "A" - } - ], - "description": "Ingress p99 latency. Falls back to raw metric if SLO recording rules are not enabled." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "percentunit" - } - }, - "gridPos": { - "h": 6, - "w": 12, - "x": 0, - "y": 16 - }, - "id": 1205, - "title": "Availability Over Time", - "type": "timeseries", - "targets": [ - { - "expr": "avg(kube_deployment_status_replicas_available{namespace=\"$namespace\"} / kube_deployment_spec_replicas{namespace=\"$namespace\"})", - "legendFormat": "Availability", - "refId": "A" - }, - { - "expr": "vector(0.999)", - "legendFormat": "Target (99.9%)", - "refId": "B" - } - ], - "description": "Service availability over time with SLO target line." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "percentunit", - "max": 1, - "min": -0.5 - } - }, - "gridPos": { - "h": 6, - "w": 12, - "x": 12, - "y": 16 - }, - "id": 1206, - "title": "Error Budget Burn Rate", - "type": "timeseries", - "targets": [ - { - "expr": "union:cp:slo:error_budget_remaining", - "legendFormat": "Budget remaining", - "refId": "A" - }, - { - "expr": "vector(0)", - "legendFormat": "Exhausted", - "refId": "B" - } - ], - "description": "Error budget remaining over time. Requires monitoring.slos.enabled." - }, - { - "collapsed": true, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 31 - }, - "id": 100, - "title": "Ingress (nginx)", - "type": "row", - "panels": [ - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never", - "stacking": { - "mode": "normal" - } - }, - "unit": "reqps" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 10 - }, - "id": 101, - "title": "Request Rate by Path", - "type": "timeseries", - "targets": [ - { - "expr": "sum by (host, path) (rate(nginx_ingress_controller_request_duration_seconds_count{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "{{ host }}{{ path }}", - "refId": "A" - } - ], - "description": "Ingress request rate broken down by host and URL path." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never", - "stacking": { - "mode": "normal" - } - }, - "unit": "reqps" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 10 - }, - "id": 102, - "title": "Error Rate by Status Code", - "type": "timeseries", - "targets": [ - { - "expr": "sum by (status) (rate(nginx_ingress_controller_request_duration_seconds_count{namespace=\"$namespace\", status=~\"[45]..\"}[$__rate_interval]))", - "legendFormat": "{{ status }}", - "refId": "A" - } - ], - "description": "4xx and 5xx error rates from ingress-nginx by HTTP status code." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "s" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 18 - }, - "id": 103, - "title": "Latency p50 / p95 / p99", - "type": "timeseries", - "targets": [ - { - "expr": "histogram_quantile(0.50, sum by (le) (rate(nginx_ingress_controller_request_duration_seconds_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "p50", - "refId": "A" - }, - { - "expr": "histogram_quantile(0.95, sum by (le) (rate(nginx_ingress_controller_request_duration_seconds_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "p95", - "refId": "B" - }, - { - "expr": "histogram_quantile(0.99, sum by (le) (rate(nginx_ingress_controller_request_duration_seconds_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "p99", - "refId": "C" - } - ], - "description": "Ingress request latency percentiles. Includes TLS + routing + upstream response time." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 18 - }, - "id": 104, - "title": "Active Connections", - "type": "timeseries", - "targets": [ - { - "expr": "sum(nginx_ingress_controller_nginx_process_connections{namespace=\"$namespace\"})", - "legendFormat": "Active", - "refId": "A" - } - ], - "description": "Current number of active client connections to ingress-nginx." - } - ] - }, - { - "collapsed": true, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 32 - }, - "id": 200, - "title": "Connect / gRPC", - "type": "row", - "panels": [ - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never", - "stacking": { - "mode": "normal" - } - }, - "unit": "reqps" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 11 - }, - "id": 201, - "title": "Connect Request Rate by Service", - "type": "timeseries", - "targets": [ - { - "expr": "sum by (service) (rate(connect:server_requests_handled_total{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "{{ service }}", - "refId": "A" - } - ], - "description": "Connect protocol request throughput broken down by service (e.g. ExecutionService, ClusterService)." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "reqps" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 11 - }, - "id": 202, - "title": "Connect Errors by Service & Code", - "type": "timeseries", - "targets": [ - { - "expr": "sum by (service, code) (rate(connect:server_requests_handled_total{namespace=\"$namespace\", code!~\"0|OK|Canceled|NotFound\"}[$__rate_interval]))", - "legendFormat": "{{ service }} {{ code }}", - "refId": "A" - } - ], - "description": "Connect errors broken down by service and gRPC code. Identifies which services are erroring." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "reqps" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 19 - }, - "id": 203, - "title": "gRPC Server Request Rate (CacheService)", - "type": "timeseries", - "targets": [ - { - "expr": "sum by (grpc_service, grpc_method) (rate(grpc_server_handled_total{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "{{ grpc_service }}/{{ grpc_method }}", - "refId": "A" - } - ], - "description": "CacheService is the only CP service using gRPC (not Connect). Shows Get/Put/Delete/Reservation rates." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "reqps" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 19 - }, - "id": 204, - "title": "gRPC Server Errors (CacheService)", - "type": "timeseries", - "targets": [ - { - "expr": "sum by (grpc_method, grpc_code) (rate(grpc_server_handled_total{namespace=\"$namespace\", grpc_code!=\"OK\"}[$__rate_interval]))", - "legendFormat": "{{ grpc_method }} {{ grpc_code }}", - "refId": "A" - } - ], - "description": "CacheService gRPC errors by method and code." - } - ] - }, - { - "collapsed": true, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 33 - }, - "id": 300, - "title": "FlyteAdmin", - "type": "row", - "panels": [ - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 12 - }, - "id": 301, - "title": "Active Executions", - "type": "timeseries", - "targets": [ - { - "expr": "flyte:admin:execution_manager:active_executions{namespace=\"$namespace\"}", - "legendFormat": "Workflows", - "refId": "A" - }, - { - "expr": "flyte:admin:node_execution_manager:active_node_executions{namespace=\"$namespace\"}", - "legendFormat": "Nodes", - "refId": "B" - }, - { - "expr": "flyte:admin:task_execution_manager:active_executions{namespace=\"$namespace\"}", - "legendFormat": "Tasks", - "refId": "C" - } - ], - "description": "Current count of active workflow, node, and task executions tracked by FlyteAdmin." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ops" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 8, - "y": 12 - }, - "id": 302, - "title": "Execution Create / Event Rate", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:admin:execution_manager:executions_created{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Executions created", - "refId": "A" - }, - { - "expr": "rate(flyte:admin:execution_manager:execution_events_created{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Workflow events", - "refId": "B" - }, - { - "expr": "rate(flyte:admin:node_execution_manager:node_execution_events_created{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Node events", - "refId": "C" - }, - { - "expr": "rate(flyte:admin:task_execution_manager:task_execution_events_created{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Task events", - "refId": "D" - } - ], - "description": "Rate of execution creations and event ingestion (workflow, node, task events from propeller)." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ops" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 12 - }, - "id": 303, - "title": "Errors", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:admin:execution_manager:propeller_failures{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Propeller failures", - "refId": "A" - }, - { - "expr": "rate(flyte:admin:execution_manager:transformer_error{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Transformer errors", - "refId": "B" - }, - { - "expr": "rate(flyte:admin:execution_manager:publish_error{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Publish errors", - "refId": "C" - }, - { - "expr": "rate(flyte:admin:execution_manager:execution_termination_failure{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Termination failures", - "refId": "D" - } - ], - "description": "FlyteAdmin error rates: propeller communication failures, model transform errors, notification publish failures, execution termination failures."
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "s" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 20 - }, - "id": 304, - "title": "Endpoint Latency (p95)", - "type": "timeseries", - "targets": [ - { - "expr": "flyte:admin:create_execution:duration_ms{namespace=\"$namespace\", quantile=\"0.95\"} / 1000", - "legendFormat": "CreateExecution", - "refId": "A" - }, - { - "expr": "flyte:admin:create_execution_event:duration_ms{namespace=\"$namespace\", quantile=\"0.95\"} / 1000", - "legendFormat": "CreateExecutionEvent", - "refId": "B" - }, - { - "expr": "flyte:admin:get_execution:duration_ms{namespace=\"$namespace\", quantile=\"0.95\"} / 1000", - "legendFormat": "GetExecution", - "refId": "C" - }, - { - "expr": "flyte:admin:list_execution:duration_ms{namespace=\"$namespace\", quantile=\"0.95\"} / 1000", - "legendFormat": "ListExecution", - "refId": "D" - } - ], - "description": "FlyteAdmin gRPC endpoint latency at p95. Key endpoints: CreateExecution, CreateExecutionEvent, GetExecution, ListExecution."
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ops" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 20 - }, - "id": 305, - "title": "Auth Middleware Decisions", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:middleware:authorization:authz_approved{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Approved", - "refId": "A" - }, - { - "expr": "rate(flyte:middleware:authorization:authz_denied{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Denied", - "refId": "B" - } - ], - "description": "Authorization approve/deny rate from the FlyteAdmin auth middleware. High deny rate may indicate auth misconfiguration." - } - ] - }, - { - "collapsed": true, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 34 - }, - "id": 400, - "title": "Executions (V2)", - "type": "row", - "panels": [ - { - "title": "CreateRun Rate", - "type": "timeseries", - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "unit": "reqps" - } - }, - "targets": [ - { - "expr": "sum(rate(connect:server_requests_handled_total{namespace=\"$namespace\", method=\"CreateRun\"}[$__rate_interval]))", - "legendFormat": "CreateRun", - "refId": "A" - } - ], - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 0 - } - }, - { - "title": "CreateRun Latency (p50 / p95 / p99)", - "type": "timeseries", - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "unit": "s" - } - }, - "targets": [ - { - "expr": "histogram_quantile(0.50, sum by (le) (rate(connect:server_request_duration_seconds_bucket{namespace=\"$namespace\", method=\"CreateRun\"}[$__rate_interval])))", - "legendFormat": "p50", - "refId": "A" - }, - { - "expr":
"histogram_quantile(0.95, sum by (le) (rate(connect:server_request_duration_seconds_bucket{namespace=\"$namespace\", method=\"CreateRun\"}[$__rate_interval])))", - "legendFormat": "p95", - "refId": "B" - }, - { - "expr": "histogram_quantile(0.99, sum by (le) (rate(connect:server_request_duration_seconds_bucket{namespace=\"$namespace\", method=\"CreateRun\"}[$__rate_interval])))", - "legendFormat": "p99", - "refId": "C" - } - ], - "gridPos": { - "h": 8, - "w": 8, - "x": 8, - "y": 0 - } - }, - { - "title": "V2 Run Methods", - "type": "timeseries", - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "unit": "reqps" - } - }, - "targets": [ - { - "expr": "sum by (method) (rate(connect:server_requests_handled_total{namespace=\"$namespace\", service=~\".*RunService.*\"}[$__rate_interval]))", - "legendFormat": "{{method}}", - "refId": "A" - } - ], - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 0 - } - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ops" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 8, - "y": 20 - }, - "id": 405, - "title": "DB Operation Rate", - "type": "timeseries", - "targets": [ - { - "expr": "sum by (op) (rate(label_replace({__name__=~\"executions:database:postgres:repositories:execution_ops:.*_count\", namespace=\"$namespace\"}, \"op\", \"$1\", \"__name__\", \"executions:database:postgres:repositories:execution_ops:(.*)_count\")[$__rate_interval:]))", - "legendFormat": "{{ op }}", - "refId": "A" - } - ], - "description": "Execution operations DB rate (ops/s) by operation: create, ack, claim, unclaim, get, update."
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 20 - }, - "id": 406, - "title": "DB Errors", - "type": "timeseries", - "targets": [ - { - "expr": "rate(executions:database:postgres:errors:gorm_error{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "gorm_error", - "refId": "A" - }, - { - "expr": "rate(executions:database:postgres:errors:postgres_error{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "postgres_error", - "refId": "B" - }, - { - "expr": "rate(executions:database:postgres:errors:not_found{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "not_found", - "refId": "C" - } - ], - "description": "Executions service Postgres error rates by type: gorm errors, native postgres errors, not-found." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 28 - }, - "id": 407, - "title": "Cluster Cache Hit/Miss", - "type": "timeseries", - "targets": [ - { - "expr": "rate(executions:executions:list_clusters:hits{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Cluster hits", - "refId": "A" - }, - { - "expr": "rate(executions:executions:list_clusters:miss{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Cluster miss", - "refId": "B" - }, - { - "expr": "rate(executions:executions:list_nodepools:hits{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Nodepool hits", - "refId": "C" - }, - { - "expr": "rate(executions:executions:list_nodepools:miss{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Nodepool miss", - "refId": "D" - } - ], - "description": "Cluster and nodepool list cache effectiveness. High miss rate = excessive DB queries." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 28 - }, - "id": 408, - "title": "Apps \u2014 Pending Assignments", - "type": "timeseries", - "targets": [ - { - "expr": "executions:app:leaser:pending_assignment_unlabeled{namespace=\"$namespace\"}", - "legendFormat": "Pending", - "refId": "A" - } - ], - "description": "Number of apps waiting for cluster assignment. Growing backlog = scheduling bottleneck." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "s" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 36 - }, - "id": 409, - "title": "Apps \u2014 First Ack Latency", - "type": "timeseries", - "targets": [ - { - "expr": "histogram_quantile(0.50, sum by (le) (rate(executions:app:service:first_ack_latency_unlabeled_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "p50", - "refId": "A" - }, - { - "expr": "histogram_quantile(0.95, sum by (le) (rate(executions:app:service:first_ack_latency_unlabeled_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "p95", - "refId": "B" - }, - { - "expr": "histogram_quantile(0.99, sum by (le) (rate(executions:app:service:first_ack_latency_unlabeled_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "p99", - "refId": "C" - } - ], - "description": "Key V2 SLI: time to deliver an app to the dataplane. Measures end-to-end scheduling latency." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ops" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 8, - "y": 36 - }, - "id": 410, - "title": "V2 Run Dispatch", - "type": "timeseries", - "targets": [ - { - "expr": "rate(executions:run:runs_sent{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Runs sent", - "refId": "A" - }, - { - "expr": "rate(executions:run:actions_sent{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Actions sent", - "refId": "B" - }, - { - "expr": "rate(executions:run:enqueue_action_failures{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Enqueue failures", - "refId": "C" - } - ], - "description": "V2 run/action dispatch throughput. Enqueue failures indicate queue service issues." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 36 - }, - "id": 411, - "title": "V2 Run Notifier", - "type": "timeseries", - "targets": [ - { - "expr": "rate(executions:run_notifier:notifications_sent{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Notifications/s", - "refId": "A" - }, - { - "expr": "executions:run_notifier:subscribers{namespace=\"$namespace\"}", - "legendFormat": "Subscribers", - "refId": "B" - }, - { - "expr": "rate(executions:run:logs:tail_logs_bytes_read{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Log bytes/s", - "refId": "C" - } - ], - "description": "V2 notification pipeline: notifications sent per second, active subscribers, log 
bytes streamed." - } - ] - }, - { - "collapsed": true, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 34 - }, - "id": 500, - "title": "Queue / Run-Scheduler", - "type": "row", - "panels": [ - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 13 - }, - "id": 501, - "title": "Metadata Store Counts", - "type": "timeseries", - "targets": [ - { - "expr": "queue:metadata_store:total_run_count{namespace=\"$namespace\"}", - "legendFormat": "Total runs", - "refId": "A" - }, - { - "expr": "queue:metadata_store:total_action_count{namespace=\"$namespace\"}", - "legendFormat": "Total actions", - "refId": "B" - }, - { - "expr": "queue:metadata_store:scheduled_run_count{namespace=\"$namespace\"}", - "legendFormat": "Scheduled runs", - "refId": "C" - }, - { - "expr": "queue:metadata_store:scheduled_action_count{namespace=\"$namespace\"}", - "legendFormat": "Scheduled actions", - "refId": "D" - } - ], - "description": "Total and scheduled run/action counts in the queue. Shows system load." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ops" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 8, - "y": 13 - }, - "id": 502, - "title": "Scheduler / Runner / Aborter Throughput", - "type": "timeseries", - "targets": [ - { - "expr": "rate(queue:scheduler:enqueued_leases{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Enqueued", - "refId": "A" - }, - { - "expr": "rate(queue:runner:completed_leases{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Completed", - "refId": "B" - }, - { - "expr": "rate(queue:aborter:aborted_leases{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Aborted", - "refId": "C" - } - ], - "description": "Lease lifecycle throughput: enqueued (new), completed (done), aborted (cancelled)." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 13 - }, - "id": 503, - "title": "Queue Lengths", - "type": "timeseries", - "targets": [ - { - "expr": "queue:scheduler:input_queue_length{namespace=\"$namespace\"}", - "legendFormat": "Scheduler input", - "refId": "A" - }, - { - "expr": "queue:runner:input_queue_length{namespace=\"$namespace\"}", - "legendFormat": "Runner input", - "refId": "B" - }, - { - "expr": "queue:aborter:input_queue_length{namespace=\"$namespace\"}", - "legendFormat": "Aborter input", - "refId": "C" - }, - { - "expr": "queue:dispatcher:chain_queue_length{namespace=\"$namespace\"}", - "legendFormat": "Dispatcher chain", - "refId": "D" - }, - { - "expr": "queue:db:queue_length{namespace=\"$namespace\"}", - "legendFormat": "DB queue", - "refId": "E" - } - ], - "description": "Internal queue depths across scheduler, runner, aborter, dispatcher, and DB worker pool. Growing = backpressure." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "s" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 21 - }, - "id": 504, - "title": "Dispatcher Operation Duration (p99)", - "type": "timeseries", - "targets": [ - { - "expr": "histogram_quantile(0.99, sum by (type, le) (rate(queue:dispatcher:operation_duration_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "{{ type }}", - "refId": "A" - } - ], - "description": "Dispatcher multi-step operation chain execution time at p99, by operation type." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "s" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 8, - "y": 21 - }, - "id": 505, - "title": "State Get/Put Duration (p99)", - "type": "timeseries", - "targets": [ - { - "expr": "histogram_quantile(0.99, sum by (le) (rate(queue:state:get_duration_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "Get p99", - "refId": "A" - }, - { - "expr": "histogram_quantile(0.99, sum by (le) (rate(queue:state:put_duration_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "Put p99", - "refId": "B" - } - ], - "description": "In-memory state store operation latency. Backed by ScyllaDB persistence." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 21 - }, - "id": 506, - "title": "State Cache & Eventer", - "type": "timeseries", - "targets": [ - { - "expr": "queue:state:active_states{namespace=\"$namespace\"}", - "legendFormat": "Active states", - "refId": "A" - }, - { - "expr": "queue:state:terminal_states{namespace=\"$namespace\"}", - "legendFormat": "Terminal states", - "refId": "B" - }, - { - "expr": "rate(queue:eventer:record_action_errors{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Eventer errors", - "refId": "C" - } - ], - "description": "Active/terminal state counts and eventer error rate. Eventer reports action status to executions service." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 29 - }, - "id": 507, - "title": "Worker Capacity", - "type": "timeseries", - "targets": [ - { - "expr": "queue:scheduler:worker_capacity{namespace=\"$namespace\"}", - "legendFormat": "{{ worker_name }}", - "refId": "A" - } - ], - "description": "Remaining execution capacity per connected DP worker. Zero = worker saturated." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ops" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 8, - "y": 29 - }, - "id": 508, - "title": "Dispatcher Failures by Type", - "type": "timeseries", - "targets": [ - { - "expr": "sum by (type) (rate(queue:dispatcher:operation_failures{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "{{ type }}", - "refId": "A" - } - ], - "description": "Failed dispatcher operations by Go type. Indicates internal queue service errors." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 29 - }, - "id": 509, - "title": "DB & Client Thread Pool", - "type": "timeseries", - "targets": [ - { - "expr": "queue:db:free_threads{namespace=\"$namespace\"}", - "legendFormat": "DB free threads", - "refId": "A" - }, - { - "expr": "queue:queue_client:free_threads{namespace=\"$namespace\"}", - "legendFormat": "Queue client free", - "refId": "B" - }, - { - "expr": "queue:state_client:free_threads{namespace=\"$namespace\"}", - "legendFormat": "State client free", - "refId": "C" - } - ], - "description": "Idle worker goroutines in DB, queue-client, and state-client pools. Zero = all threads busy." - } - ] - }, - { - "collapsed": true, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 35 - }, - "id": 600, - "title": "Cluster Service", - "type": "row", - "panels": [ - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ops" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 14 - }, - "id": 601, - "title": "UpdateStatus / Heartbeat Rate", - "type": "timeseries", - "targets": [ - { - "expr": "rate(cluster:svc:update_status:updates_total{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "UpdateStatus", - "refId": "A" - }, - { - "expr": "rate(cluster:svc:heartbeat:success_ms_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Heartbeat", - "refId": "B" - } - ], - "description": "Rate of DP cluster status updates and heartbeats received by the cluster service." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "s" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 8, - "y": 14 - }, - "id": 602, - "title": "Cluster API Latency (p95)", - "type": "timeseries", - "targets": [ - { - "expr": "cluster:svc:update_status:success_ms{namespace=\"$namespace\", quantile=\"0.95\"} / 1000", - "legendFormat": "UpdateStatus p95", - "refId": "A" - }, - { - "expr": "cluster:svc:heartbeat:success_ms{namespace=\"$namespace\", quantile=\"0.95\"} / 1000", - "legendFormat": "Heartbeat p95", - "refId": "B" - } - ], - "description": "Cluster service RPC latency for UpdateStatus and Heartbeat calls." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 14 - }, - "id": 603, - "title": "Operator / Propeller Restarts (from DP)", - "type": "timeseries", - "targets": [ - { - "expr": "cluster:svc:update_status:operator_restarts{namespace=\"$namespace\"}", - "legendFormat": "Operator restarts", - "refId": "A" - }, - { - "expr": "cluster:svc:update_status:propeller_restarts{namespace=\"$namespace\"}", - "legendFormat": "Propeller restarts", - "refId": "B" - } - ], - "description": "DP-reported restart counts for operator and propeller pods. Set by DP on each UpdateStatus call." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 22 - }, - "id": 604, - "title": "DB Errors by Type", - "type": "timeseries", - "targets": [ - { - "expr": "rate(cluster:database:postgres:errors:gorm_error{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "gorm_error", - "refId": "A" - }, - { - "expr": "rate(cluster:database:postgres:errors:postgres_error{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "postgres_error", - "refId": "B" - }, - { - "expr": "rate(cluster:database:postgres:errors:not_found{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "not_found", - "refId": "C" - } - ], - "description": "Cluster service Postgres error rates by type: gorm errors, native postgres errors, not-found." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "thresholds": { - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 1 - } - ] - }, - "unit": "short", - "mappings": [ - { - "type": "value", - "options": { - "0": { - "text": "Healthy", - "color": "green" - }, - "1": { - "text": "Unhealthy", - "color": "red" - } - } - } - ] - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 22 - }, - "id": 605, - "title": "Cluster Health Status", - "type": "timeseries", - "targets": [ - { - "expr": "cluster:cluster_sync:health:unhealthy{namespace=\"$namespace\", subsystem=\"\"}", - "legendFormat": "{{ org }}/{{ cluster_name }}", - "refId": "A" - } - ], - "description": "Cluster health collector: 1=unhealthy, 0=healthy. Emitted per cluster on every Prometheus scrape." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "s" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 30 - }, - "id": 606, - "title": "Last Heartbeat Age (stale cluster detection)", - "type": "timeseries", - "targets": [ - { - "expr": "cluster:cluster_sync:health:last_update_age{namespace=\"$namespace\"}", - "legendFormat": "{{ org }}/{{ cluster_name }}", - "refId": "A" - } - ], - "description": "Seconds since each cluster last sent a heartbeat. High values = stale/disconnected cluster." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 30 - }, - "id": 607, - "title": "Managed Cluster Cache", - "type": "timeseries", - "targets": [ - { - "expr": "rate(cluster:managed_cluster_client_cache:get:hits{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Cache hits", - "refId": "A" - }, - { - "expr": "rate(cluster:managed_cluster_client_cache:get:miss{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Cache miss", - "refId": "B" - } - ], - "description": "LRU cache hit/miss rate for managed cluster lookups. High miss rate = excessive DB queries." 
- } - ] - }, - { - "collapsed": true, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 36 - }, - "id": 900, - "title": "CacheService", - "type": "row", - "panels": [ - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ops" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 15 - }, - "id": 901, - "title": "Cache Hit / Miss Rate", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:cacheservice:cache:cache_hit_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Hits", - "refId": "A" - }, - { - "expr": "rate(flyte:cacheservice:cache:not_found_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Misses", - "refId": "B" - }, - { - "expr": "rate(flyte:cacheservice:cache:get_failure_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Get failures", - "refId": "C" - } - ], - "description": "CacheService hit/miss rate. Hits = cached task output reused. Misses = task must execute. Get failures = storage errors. 
[Metrics pending: requires cloud service instrumentation to be deployed]" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ops" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 15 - }, - "id": 902, - "title": "Reservation Contention & Operations", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:cacheservice:cache:reservation_contention_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Contention", - "refId": "A" - }, - { - "expr": "rate(flyte:cacheservice:cache:get_reservation_success_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Reservation acquired", - "refId": "B" - }, - { - "expr": "rate(flyte:cacheservice:cache:release_reservation_success_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Reservation released", - "refId": "C" - } - ], - "description": "Cache reservation contention: how often workers are blocked waiting for another worker's cache computation. High contention = many workers computing the same task. 
[Metrics pending: requires cloud service instrumentation to be deployed]" - } - ] - }, - { - "collapsed": true, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 36 - }, - "id": 750, - "title": "Authorizer", - "type": "row", - "panels": [ - { - "id": 760, - "title": "Authorizer Mode", - "type": "stat", - "gridPos": { - "h": 4, - "w": 4, - "x": 0, - "y": 37 - }, - "fieldConfig": { - "defaults": { - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - } - }, - "overrides": [] - }, - "options": { - "colorMode": "value", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "auto", - "textMode": "name", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - } - }, - "targets": [ - { - "expr": "authorizer:authorizer:cloudauthorizer:connect:authz_type_info{namespace=\"$namespace\"} == 1", - "legendFormat": "{{type}}", - "refId": "A" - } - ], - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - } - }, - { - "id": 751, - "title": "Allow / Deny Rate", - "type": "timeseries", - "gridPos": { - "h": 8, - "w": 10, - "x": 4, - "y": 37 - }, - "fieldConfig": { - "defaults": { - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "spanNulls": false - }, - "noValue": "0", - "unit": "ops", - "decimals": 2 - }, - "overrides": [ - { - "matcher": { - "id": "byRegexp", - "options": ".*denied.*" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "red", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": ".*allowed.*" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "green", - "mode": "fixed" - } - } - ] - } - ] - }, - "options": { - "legend": { - "displayMode": "table", - "placement": "bottom", - "calcs": [ - "mean" - ] - }, - "tooltip": { - "mode": "multi" - } - }, - "targets": [ - { - "expr": "sum by (identity_type) 
(rate(authorizer:authorizer:cloudauthorizer:connect:authz_allowed{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "allowed ({{identity_type}})", - "refId": "A" - }, - { - "expr": "sum by (identity_type) (rate(authorizer:authorizer:cloudauthorizer:connect:authz_denied{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "denied ({{identity_type}})", - "refId": "B" - } - ], - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - } - }, - { - "id": 753, - "title": "Deny Rate (%)", - "type": "timeseries", - "gridPos": { - "h": 8, - "w": 10, - "x": 14, - "y": 37 - }, - "fieldConfig": { - "defaults": { - "unit": "percentunit", - "custom": { - "drawStyle": "line", - "fillOpacity": 10 - }, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 0.1 - }, - { - "color": "red", - "value": 0.5 - } - ] - }, - "noValue": "0", - "decimals": 1, - "min": 0, - "max": 1 - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "multi" - } - }, - "targets": [ - { - "expr": "(sum by (identity_type) (rate(authorizer:authorizer:cloudauthorizer:connect:authz_denied{namespace=\"$namespace\"}[$__rate_interval])) or vector(0)) / clamp_min((sum by (identity_type) (rate(authorizer:authorizer:cloudauthorizer:connect:authz_allowed{namespace=\"$namespace\"}[$__rate_interval])) + sum by (identity_type) (rate(authorizer:authorizer:cloudauthorizer:connect:authz_denied{namespace=\"$namespace\"}[$__rate_interval]))), 1e-10)", - "legendFormat": "{{identity_type}}", - "refId": "A" - } - ], - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - } - }, - { - "id": 752, - "title": "Authorize Latency (service)", - "type": "timeseries", - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 45 - }, - "fieldConfig": { - "defaults": { - "unit": "ms", - "custom": { - "drawStyle": "line", - 
"fillOpacity": 10 - }, - "noValue": "0", - "decimals": 1, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 50 - }, - { - "color": "red", - "value": 200 - } - ] - } - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "list", - "placement": "bottom" - }, - "tooltip": { - "mode": "multi" - } - }, - "targets": [ - { - "expr": "authorizer:authorizer:cloudauthorizer:connect:authorize_duration_ms{namespace=\"$namespace\", quantile=\"0.5\"}", - "legendFormat": "p50", - "refId": "A" - }, - { - "expr": "authorizer:authorizer:cloudauthorizer:connect:authorize_duration_ms{namespace=\"$namespace\", quantile=\"0.9\"}", - "legendFormat": "p90", - "refId": "B" - }, - { - "expr": "authorizer:authorizer:cloudauthorizer:connect:authorize_duration_ms{namespace=\"$namespace\", quantile=\"0.99\"}", - "legendFormat": "p99", - "refId": "C" - } - ], - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - } - }, - { - "id": 761, - "title": "Backend Latency", - "type": "timeseries", - "gridPos": { - "h": 8, - "w": 8, - "x": 8, - "y": 45 - }, - "fieldConfig": { - "defaults": { - "unit": "ms", - "custom": { - "drawStyle": "line", - "fillOpacity": 10 - }, - "noValue": "0", - "decimals": 1, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 50 - }, - { - "color": "red", - "value": 200 - } - ] - } - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "list", - "placement": "bottom" - }, - "tooltip": { - "mode": "multi" - } - }, - "targets": [ - { - "expr": "histogram_quantile(0.50, sum by (le) (rate(authorizer:authorizer:cloudauthorizer:connect:backend_authorize_duration_ms_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "p50", - "refId": "A" - }, - { - "expr": "histogram_quantile(0.95, sum by (le) 
(rate(authorizer:authorizer:cloudauthorizer:connect:backend_authorize_duration_ms_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "p95", - "refId": "B" - }, - { - "expr": "histogram_quantile(0.99, sum by (le) (rate(authorizer:authorizer:cloudauthorizer:connect:backend_authorize_duration_ms_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "p99", - "refId": "C" - } - ], - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - } - }, - { - "id": 764, - "title": "Decisions by Action", - "type": "timeseries", - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 45 - }, - "fieldConfig": { - "defaults": { - "custom": { - "drawStyle": "bars", - "fillOpacity": 50, - "stacking": { - "mode": "normal" - } - }, - "noValue": "0", - "unit": "ops", - "decimals": 2 - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "table", - "placement": "bottom", - "calcs": [ - "sum" - ] - }, - "tooltip": { - "mode": "multi" - } - }, - "targets": [ - { - "expr": "sum by (action, identity_type) (rate(authorizer:authorizer:cloudauthorizer:connect:authz_allowed{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "{{action}} {{identity_type}} (allowed)", - "refId": "A" - }, - { - "expr": "sum by (action, identity_type) (rate(authorizer:authorizer:cloudauthorizer:connect:authz_denied{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "{{action}} {{identity_type}} (denied)", - "refId": "B" - } - ], - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - } - }, - { - "id": 762, - "title": "Backend Errors", - "type": "timeseries", - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 53 - }, - "fieldConfig": { - "defaults": { - "custom": { - "drawStyle": "line", - "fillOpacity": 10 - }, - "noValue": "0", - "unit": "ops", - "decimals": 2, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 0.01 - } - ] - 
} - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "multi" - } - }, - "targets": [ - { - "expr": "sum by (error_type) (rate(authorizer:authorizer:cloudauthorizer:connect:backend_authorize_errors{namespace=\"$namespace\"}[$__rate_interval])) or vector(0)", - "legendFormat": "{{error_type}}", - "refId": "A" - } - ], - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - } - }, - { - "id": 765, - "title": "Error Attribution", - "type": "timeseries", - "gridPos": { - "h": 8, - "w": 8, - "x": 8, - "y": 53 - }, - "fieldConfig": { - "defaults": { - "custom": { - "drawStyle": "line", - "fillOpacity": 10 - }, - "noValue": "0", - "unit": "ops", - "decimals": 2, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 0.01 - } - ] - } - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "multi" - } - }, - "targets": [ - { - "expr": "sum by (error_source) (rate(authorizer:authorizer:cloudauthorizer:connect:authorize_errors_total{namespace=\"$namespace\"}[$__rate_interval])) or vector(0)", - "legendFormat": "{{error_source}}", - "refId": "A" - } - ], - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - } - }, - { - "id": 763, - "title": "Fail-Open Activations", - "type": "timeseries", - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 53 - }, - "fieldConfig": { - "defaults": { - "custom": { - "drawStyle": "line", - "fillOpacity": 10 - }, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 0.001 - } - ] - }, - "noValue": "0", - "unit": "ops", - "decimals": 2 - }, - "overrides": [] - }, - "options": { - "legend": { - "displayMode": "list", - "placement": "bottom" - }, - "tooltip": { - "mode": "multi" - } - }, - "targets": [ - { - 
"expr": "rate(authorizer:authorizer:cloudauthorizer:connect:external:fail_open_activated{namespace=\"$namespace\"}[$__rate_interval]) or vector(0)", - "legendFormat": "fail-open", - "refId": "A" - } - ], - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - } - } - ] - }, - { - "collapsed": true, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 36 - }, - "id": 700, - "title": "Data Proxy", - "type": "row", - "panels": [ - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 15 - }, - "id": 701, - "title": "Cache Hit/Miss Rates", - "type": "timeseries", - "targets": [ - { - "expr": "rate(dataproxy:domains:hits{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Domain hits", - "refId": "A" - }, - { - "expr": "rate(dataproxy:domains:miss{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Domain miss", - "refId": "B" - }, - { - "expr": "rate(dataproxy:clusterpoolcache:hits{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "ClusterPool hits", - "refId": "C" - }, - { - "expr": "rate(dataproxy:clusterpoolcache:miss{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "ClusterPool miss", - "refId": "D" - } - ], - "description": "DataProxy internal cache effectiveness for domain resolution, cluster pool routing, and namespace mapping." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "s" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 15 - }, - "id": 702, - "title": "Image Read Latency (p95)", - "type": "timeseries", - "targets": [ - { - "expr": "histogram_quantile(0.95, sum by (le) (rate(dataproxy:images:read:success_ms_count{namespace=\"$namespace\"}[$__rate_interval]))) / 1000", - "legendFormat": "Success p95", - "refId": "A" - }, - { - "expr": "histogram_quantile(0.95, sum by (le) (rate(dataproxy:images:read:failure_ms_count{namespace=\"$namespace\"}[$__rate_interval]))) / 1000", - "legendFormat": "Failure p95", - "refId": "B" - } - ], - "description": "Time to read image metadata from the dataplane, proxied through DataProxy." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ops" - } - }, - "gridPos": { - "h": 8, - "w": 24, - "x": 0, - "y": 23 - }, - "id": 703, - "title": "Secret Proxy Errors by Cluster", - "type": "timeseries", - "targets": [ - { - "expr": "sum by (cluster, operation) (rate(dataproxy:secrets_service:cluster_errors{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "{{ cluster }} {{ operation }}", - "refId": "A" - } - ], - "description": "Per-cluster secret proxy errors during fan-out operations. Identifies which dataplane cluster is causing failures. 
[Metrics pending: requires cloud service instrumentation to be deployed]" - } - ] - }, - { - "collapsed": true, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 37 - }, - "id": 800, - "title": "Usage Service", - "type": "row", - "panels": [ - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ops" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 16 - }, - "id": 801, - "title": "Billable Usage Reports", - "type": "timeseries", - "targets": [ - { - "expr": "rate(usage:svc:report_billable_usage{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Reports/s", - "refId": "A" - } - ], - "description": "Rate of ReportBillableUsage calls from DP clusters. Each call reports resource consumption for billing." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ops" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 16 - }, - "id": 802, - "title": "Message Pipeline", - "type": "timeseries", - "targets": [ - { - "expr": "rate(usage:messages:messages_received{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Received", - "refId": "A" - }, - { - "expr": "rate(usage:messages:messages_sent{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Sent", - "refId": "B" - }, - { - "expr": "rate(usage:messages:messages_dropped{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Dropped", - "refId": "C" - }, - { - "expr": "rate(usage:messages:messages_failed{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Failed", - "refId": "D" - } - ], - 
"description": "SQS/queue message processing: received, sent (success), failed, dropped (max retries exceeded)." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never", - "stacking": { - "mode": "normal" - } - }, - "unit": "ops" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 24 - }, - "id": 803, - "title": "Messages by Type (success)", - "type": "timeseries", - "targets": [ - { - "expr": "sum by (message_type) (rate(usage:messages:messages_processed{namespace=\"$namespace\", outcome=\"success\"}[$__rate_interval]))", - "legendFormat": "{{ message_type }}", - "refId": "A" - } - ], - "description": "Successful message processing rate by type: node_execution, workflow_execution, billable_usage, serverless_billable_usage. [Metrics pending: requires cloud service instrumentation to be deployed]" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ms" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 24 - }, - "id": 804, - "title": "Message Processing Latency", - "type": "timeseries", - "targets": [ - { - "expr": "usage:messages:processing_time_ms{namespace=\"$namespace\", quantile=\"0.5\"}", - "legendFormat": "p50", - "refId": "A" - }, - { - "expr": "usage:messages:processing_time_ms{namespace=\"$namespace\", quantile=\"0.9\"}", - "legendFormat": "p90", - "refId": "B" - }, - { - "expr": "usage:messages:processing_time_ms{namespace=\"$namespace\", quantile=\"0.99\"}", - "legendFormat": "p99", - "refId": "C" - } - ], - "description": "Time to process individual queue messages. 
Slow processing may indicate backend API issues (Metronome, timestream). [Metrics pending: requires cloud service instrumentation to be deployed]" - } - ] - }, - { - "collapsed": true, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 38 - }, - "id": 1100, - "title": "Infrastructure", - "type": "row", - "panels": [ - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never", - "stacking": { - "mode": "normal" - } - }, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 17 - }, - "id": 1101, - "title": "CPU Usage by Service", - "type": "timeseries", - "targets": [ - { - "expr": "sum by (container) (rate(container_cpu_usage_seconds_total{namespace=\"$namespace\", container!=\"\", container!=\"POD\"}[$__rate_interval]))", - "legendFormat": "{{ container }}", - "refId": "A" - } - ], - "description": "CPU usage in cores per container, stacked. Identifies resource-heavy services." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never", - "stacking": { - "mode": "normal" - } - }, - "unit": "bytes" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 17 - }, - "id": 1102, - "title": "Memory Usage by Service", - "type": "timeseries", - "targets": [ - { - "expr": "sum by (container) (container_memory_working_set_bytes{namespace=\"$namespace\", container!=\"\", container!=\"POD\"})", - "legendFormat": "{{ container }}", - "refId": "A" - } - ], - "description": "Working set memory per container, stacked. Watch for approaching limits." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "bars", - "fillOpacity": 80, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 24, - "x": 0, - "y": 25 - }, - "id": 1103, - "title": "Pod Restart Count by Container", - "type": "timeseries", - "targets": [ - { - "expr": "increase(kube_pod_container_status_restarts_total{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "{{ pod }}/{{ container }}", - "refId": "A" - } - ], - "description": "Per-container restart events. Spikes indicate crashes or OOM kills." - } - ] - } - ], - "schemaVersion": 39, - "tags": [ - "union", - "controlplane" - ], - "templating": { - "list": [ - { - "current": {}, - "hide": 0, - "includeAll": false, - "label": "Data Source", - "multi": false, - "name": "datasource", - "options": [], - "query": "prometheus", - "refresh": 1, - "type": "datasource" - }, - { - "current": { - "selected": true, - "text": "__NAMESPACE__", - "value": "__NAMESPACE__" - }, - "hide": 2, - "label": "Namespace", - "name": "namespace", - "options": [ - { - "selected": true, - "text": "__NAMESPACE__", - "value": "__NAMESPACE__" - } - ], - "query": "__NAMESPACE__", - "type": "constant" - } - ] - }, - "time": { - "from": "now-3h", - "to": "now" - }, - "timepicker": {}, - "timezone": "browser", - "title": "Union Controlplane V2 Overview", - "uid": "union-controlplane-v2-overview", - "version": 2 -} diff --git a/charts/dataplane/dashboards/union-dataplane-overview.json b/charts/dataplane/dashboards/union-dataplane-overview.json index 84a6798c..4e89bbc8 100644 --- a/charts/dataplane/dashboards/union-dataplane-overview.json +++ b/charts/dataplane/dashboards/union-dataplane-overview.json @@ -136,63 +136,6 @@ ], "description": "Total container restarts in the last hour. Non-zero indicates crashlooping or OOM kills." 
}, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "thresholds": { - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 10 - }, - { - "color": "red", - "value": 50 - } - ] - }, - "unit": "short" - } - }, - "gridPos": { - "h": 4, - "w": 6, - "x": 18, - "y": 1 - }, - "id": 8, - "options": { - "colorMode": "background", - "graphMode": "area", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ] - }, - "textMode": "auto" - }, - "title": "Active Workflows", - "type": "stat", - "targets": [ - { - "expr": "sum(flyte:propeller:all:execstats:active_workflow_executions{namespace=\"$namespace\"})", - "legendFormat": "Workflows", - "refId": "A" - } - ], - "description": "Current active FlyteWorkflow CRD count managed by Propeller." - }, { "datasource": { "type": "prometheus", @@ -254,59 +197,6 @@ ], "description": "Current count of active workflow, node, and task executions tracked by FlyteAdmin." }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 6, - "w": 12, - "x": 12, - "y": 5 - }, - "id": 7, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom" - }, - "tooltip": { - "mode": "multi", - "sort": "desc" - } - }, - "title": "Queue Depth", - "type": "timeseries", - "targets": [ - { - "expr": "sum(flyte:propeller:all:main_depth{namespace=\"$namespace\"})", - "legendFormat": "Main", - "refId": "A" - }, - { - "expr": "sum(flyte:propeller:all:sub_depth{namespace=\"$namespace\"})", - "legendFormat": "Sub", - "refId": "B" - } - ], - "description": "Main and sub workqueue depth over time." 
- }, { "datasource": { "type": "prometheus", @@ -592,7 +482,7 @@ "y": 10 }, "id": 1204, - "title": "Propeller Latency p99", + "title": "Executor Evaluate Duration p99", "type": "stat", "options": { "colorMode": "background", @@ -606,8 +496,13 @@ }, "targets": [ { - "expr": "(union:dp:slo:propeller_round_latency_p99 or flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.99\"} / 1000)", - "refId": "A" + "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.99\"}", + "legendFormat": "p99", + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + } } ], "description": "Propeller round p99 latency in seconds. Falls back to raw metric if SLO rules not enabled." @@ -1237,564 +1132,6 @@ } ] }, - { - "collapsed": true, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 33 - }, - "id": 100, - "title": "Flyte Propeller (V1)", - "type": "row", - "panels": [ - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ms" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 12 - }, - "id": 101, - "title": "Round Time (p50 / p90 / p99)", - "type": "timeseries", - "targets": [ - { - "expr": "flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.5\"}", - "legendFormat": "p50", - "refId": "A" - }, - { - "expr": "flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.9\"}", - "legendFormat": "p90", - "refId": "B" - }, - { - "expr": "flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.99\"}", - "legendFormat": "p99", - "refId": "C" - } - ], - "description": "Propeller reconciliation round time. One round = one FlyteWorkflow CRD processed." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ops" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 8, - "y": 12 - }, - "id": 102, - "title": "Round Success / Error Rate", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:propeller:all:round:success_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Success", - "refId": "A" - }, - { - "expr": "rate(flyte:propeller:all:round:error_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Errors", - "refId": "B" - }, - { - "expr": "rate(flyte:propeller:all:round:panic_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Panics", - "refId": "C" - } - ], - "description": "Propeller round outcomes: success, errors, and panics per second." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 12 - }, - "id": 103, - "title": "Free Workers", - "type": "timeseries", - "targets": [ - { - "expr": "sum(flyte:propeller:all:free_workers_count{namespace=\"$namespace\"})", - "legendFormat": "Free workers", - "refId": "A" - } - ], - "description": "Idle propeller worker goroutines. Zero = all workers busy processing workflows." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ops" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 20 - }, - "id": 104, - "title": "Queue Add Rate", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:propeller:all:main_adds{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Main adds", - "refId": "A" - }, - { - "expr": "rate(flyte:propeller:all:sub_adds{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Sub adds", - "refId": "B" - }, - { - "expr": "rate(flyte:propeller:all:main_retries{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Main retries", - "refId": "C" - } - ], - "description": "Items enqueued to propeller's main/sub workqueues, plus retry rate." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 8, - "y": 20 - }, - "id": 105, - "title": "Workflow Updates", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:propeller:all:wf_updated{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Updated", - "refId": "A" - }, - { - "expr": "rate(flyte:propeller:all:wf_update_failed{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Failed", - "refId": "B" - }, - { - "expr": "rate(flyte:propeller:all:wf_too_large{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Too large", - "refId": "C" - }, - { - "expr": "rate(flyte:propeller:all:wf_update_conflict{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Conflict", - 
"refId": "D" - } - ], - "description": "etcd write outcomes: successful updates, failures, too-large (>1.5MB), optimistic concurrency conflicts." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ms" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 20 - }, - "id": 106, - "title": "Workflow Update Latency", - "type": "timeseries", - "targets": [ - { - "expr": "flyte:propeller:all:wf_update_latency_ms{namespace=\"$namespace\", quantile=\"0.5\"}", - "legendFormat": "p50", - "refId": "A" - }, - { - "expr": "flyte:propeller:all:wf_update_latency_ms{namespace=\"$namespace\", quantile=\"0.9\"}", - "legendFormat": "p90", - "refId": "B" - }, - { - "expr": "flyte:propeller:all:wf_update_latency_ms{namespace=\"$namespace\", quantile=\"0.99\"}", - "legendFormat": "p99", - "refId": "C" - } - ], - "description": "etcd write latency for FlyteWorkflow status updates." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ms" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 28 - }, - "id": 107, - "title": "Node Queueing & Execution Latency", - "type": "timeseries", - "targets": [ - { - "expr": "flyte:propeller:all:node:queueing_latency_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.5\"}", - "legendFormat": "Queue p50", - "refId": "A" - }, - { - "expr": "flyte:propeller:all:node:queueing_latency_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.9\"}", - "legendFormat": "Queue p90", - "refId": "B" - }, - { - "expr": "flyte:propeller:all:node:node_exec_latency_unlabeled_us{namespace=\"$namespace\", quantile=\"0.9\"} / 1000", - "legendFormat": "Exec p90 (ms)", - "refId": "C" - } - ], - "description": "Node lifecycle: queueing latency (queued\u2192running) and execution latency (time in handler)." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "percentunit" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 28 - }, - "id": 108, - "title": "Metastore Cache Hit Rate", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:propeller:all:metastore:cache_hit{namespace=\"$namespace\"}[$__rate_interval]) / (rate(flyte:propeller:all:metastore:cache_hit{namespace=\"$namespace\"}[$__rate_interval]) + rate(flyte:propeller:all:metastore:cache_miss{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Hit rate", - "refId": "A" - } - ], - "description": "In-memory cache effectiveness for object store (S3/GCS) reads. 
Low hit rate = excessive storage calls." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ops" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 36 - }, - "id": 109, - "title": "Event Recording (DP \u2192 CP)", - "type": "timeseries", - "targets": [ - { - "expr": "sum(rate(flyte:propeller:all:task:event_recording:success_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Task success", - "refId": "A" - }, - { - "expr": "sum(rate(flyte:propeller:all:node:event_recording:success_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Node success", - "refId": "B" - }, - { - "expr": "sum(rate(flyte:propeller:all:task:event_recording:failure_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Task failure", - "refId": "C" - }, - { - "expr": "sum(rate(flyte:propeller:all:node:event_recording:failure_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Node failure", - "refId": "D" - } - ], - "description": "Task/node event recording rate from propeller to FlyteAdmin. Failures indicate CP connectivity issues." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 36 - }, - "id": 110, - "title": "Cache Discovery (hit/miss/skip)", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:propeller:all:discovery_hit_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Hits", - "refId": "A" - }, - { - "expr": "rate(flyte:propeller:all:discovery_miss_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Misses", - "refId": "B" - }, - { - "expr": "rate(flyte:propeller:all:discovery_skip_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Skips", - "refId": "C" - }, - { - "expr": "rate(flyte:propeller:all:discovery_get_failure_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Get failures", - "refId": "D" - } - ], - "description": "V2 executor cache discovery miss/put rates for task output caching." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "reqps" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 44 - }, - "id": 111, - "title": "K8s API Client Rate", - "type": "timeseries", - "targets": [ - { - "expr": "sum(rate(k8s_client_request_total_unlabeled{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "K8s requests/s", - "refId": "A" - } - ], - "description": "Propeller's K8s API request rate. High rates may indicate excessive pod watches or creates." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "s" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 44 - }, - "id": 112, - "title": "K8s API Client Latency (p90)", - "type": "timeseries", - "targets": [ - { - "expr": "histogram_quantile(0.90, sum by (le) (rate(k8s_client_request_latency_unlabeled_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "Request p90", - "refId": "A" - }, - { - "expr": "histogram_quantile(0.90, sum by (le) (rate(k8s_client_rate_limiter_latency_unlabeled_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "Rate limiter p90", - "refId": "B" - } - ], - "description": "K8s API request latency and client-side rate limiter wait time at p90." - } - ] - }, { "collapsed": true, "gridPos": { @@ -2099,6 +1436,6 @@ "timepicker": {}, "timezone": "browser", "title": "Union Dataplane Overview", - "uid": "union-dp-overview", + "uid": "union-dataplane-overview", "version": 1 } diff --git a/charts/dataplane/dashboards/union-dataplane-v2-overview.json b/charts/dataplane/dashboards/union-dataplane-v2-overview.json deleted file mode 100644 index 84ac227c..00000000 --- a/charts/dataplane/dashboards/union-dataplane-v2-overview.json +++ /dev/null @@ -1,1441 +0,0 @@ -{ - "annotations": { - "list": [] - }, - "description": "Union Dataplane health and service metrics", - "editable": true, - "fiscalYearStartMonth": 0, - "graphTooltip": 1, - "links": [], - "panels": [ - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 0 - }, - "id": 1, - "title": "Health", - "type": "row" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "thresholds": { - "steps": [ 
- { - "color": "red", - "value": null - }, - { - "color": "orange", - "value": 0.5 - }, - { - "color": "green", - "value": 1 - } - ] - }, - "unit": "percentunit", - "min": 0, - "max": 1 - } - }, - "gridPos": { - "h": 4, - "w": 6, - "x": 0, - "y": 1 - }, - "id": 2, - "options": { - "colorMode": "background", - "graphMode": "none", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ] - }, - "textMode": "auto" - }, - "title": "Service Availability", - "type": "stat", - "targets": [ - { - "expr": "avg(kube_deployment_status_replicas_available{namespace=\"$namespace\"} / kube_deployment_spec_replicas{namespace=\"$namespace\"})", - "legendFormat": "Availability", - "refId": "A" - } - ], - "description": "Percentage of deployments with all requested replicas available. 1.0 = all healthy." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "thresholds": { - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "orange", - "value": 3 - }, - { - "color": "red", - "value": 10 - } - ] - }, - "unit": "short" - } - }, - "gridPos": { - "h": 4, - "w": 6, - "x": 6, - "y": 1 - }, - "id": 3, - "options": { - "colorMode": "background", - "graphMode": "none", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ] - }, - "textMode": "auto" - }, - "title": "Pod Restarts (1h)", - "type": "stat", - "targets": [ - { - "expr": "sum(increase(kube_pod_container_status_restarts_total{namespace=\"$namespace\"}[1h]))", - "legendFormat": "Restarts", - "refId": "A" - } - ], - "description": "Total container restarts in the last hour. Non-zero indicates crashlooping or OOM kills." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 6, - "w": 12, - "x": 0, - "y": 5 - }, - "id": 10, - "options": { - "legend": { - "calcs": [ - "mean", - "max" - ], - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "multi", - "sort": "desc" - } - }, - "title": "Active Executions", - "type": "timeseries", - "targets": [ - { - "expr": "sum(flyte:propeller:all:execstats:active_workflow_executions{namespace=\"$namespace\"})", - "legendFormat": "Workflows", - "refId": "A" - }, - { - "expr": "sum(flyte:propeller:all:execstats:active_node_executions{namespace=\"$namespace\"})", - "legendFormat": "Nodes", - "refId": "B" - }, - { - "expr": "sum(flyte:propeller:all:execstats:active_task_executions{namespace=\"$namespace\"})", - "legendFormat": "Tasks", - "refId": "C" - } - ], - "description": "Current count of active workflow, node, and task executions tracked by FlyteAdmin." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "thresholds": { - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 1 - } - ] - }, - "unit": "short" - } - }, - "gridPos": { - "h": 4, - "w": 6, - "x": 12, - "y": 1 - }, - "id": 11, - "options": { - "colorMode": "background", - "graphMode": "none", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ] - }, - "textMode": "value_and_name" - }, - "title": "Handler Panics", - "type": "stat", - "targets": [ - { - "expr": "sum(executor:handler_panic{namespace=\"$namespace\"})", - "legendFormat": "Total", - "refId": "A" - } - ], - "description": "Total handler panics in DP services. 
Any non-zero value indicates a service caught a panic during request handling." - }, - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 9 - }, - "id": 1200, - "title": "SLOs", - "type": "row" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "thresholds": { - "steps": [ - { - "color": "red", - "value": null - }, - { - "color": "orange", - "value": 0.99 - }, - { - "color": "green", - "value": 0.999 - } - ] - }, - "unit": "percentunit", - "decimals": 3 - } - }, - "gridPos": { - "h": 6, - "w": 6, - "x": 0, - "y": 10 - }, - "id": 1201, - "title": "Service Availability", - "type": "stat", - "options": { - "colorMode": "background", - "graphMode": "none", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ] - }, - "textMode": "value" - }, - "targets": [ - { - "expr": "avg(kube_deployment_status_replicas_available{namespace=\"$namespace\"} / kube_deployment_spec_replicas{namespace=\"$namespace\"})", - "refId": "A" - } - ], - "description": "Current service availability across all DP deployments." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": -999 - }, - { - "color": "orange", - "value": 0 - }, - { - "color": "green", - "value": 0.5 - } - ] - }, - "unit": "percentunit", - "decimals": 1, - "noValue": "N/A" - } - }, - "gridPos": { - "h": 6, - "w": 6, - "x": 6, - "y": 10 - }, - "id": 1202, - "title": "Error Budget Remaining", - "type": "stat", - "options": { - "colorMode": "background", - "graphMode": "area", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ] - }, - "textMode": "value" - }, - "targets": [ - { - "expr": "union:dp:slo:error_budget_remaining", - "refId": "A" - } - ], - "description": "Fraction of error budget remaining. Requires monitoring.slos.enabled." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 0 - }, - { - "color": "orange", - "value": 0.95 - }, - { - "color": "green", - "value": 0.999 - } - ] - }, - "unit": "percentunit", - "decimals": 2, - "noValue": "N/A" - } - }, - "gridPos": { - "h": 6, - "w": 6, - "x": 12, - "y": 10 - }, - "id": 1203, - "title": "Execution Success Rate", - "type": "stat", - "options": { - "colorMode": "background", - "graphMode": "none", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ] - }, - "textMode": "value" - }, - "targets": [ - { - "expr": "union:dp:slo:execution_success_rate or (union:dp:slo:propeller_success_rate + union:dp:slo:executor_success_rate) / 2 or union:dp:slo:propeller_success_rate or vector(1)", - "refId": "A" - } - ], - "description": "Combined V1 (propeller) and V2 (executor) task success rate. 
Falls back to propeller-only or 100% when idle." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "thresholds": { - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "orange", - "value": 2 - }, - { - "color": "red", - "value": 5 - } - ] - }, - "unit": "s", - "decimals": 2 - } - }, - "gridPos": { - "h": 6, - "w": 6, - "x": 18, - "y": 10 - }, - "id": 1204, - "title": "Executor Evaluate Duration p99", - "type": "stat", - "options": { - "colorMode": "background", - "graphMode": "none", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ] - }, - "textMode": "value" - }, - "targets": [ - { - "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.99\"}", - "legendFormat": "p99", - "refId": "A", - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - } - } - ], - "description": "Propeller round p99 latency in seconds. Falls back to raw metric if SLO rules not enabled." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "percentunit" - } - }, - "gridPos": { - "h": 6, - "w": 12, - "x": 0, - "y": 16 - }, - "id": 1205, - "title": "Availability Over Time", - "type": "timeseries", - "targets": [ - { - "expr": "avg(kube_deployment_status_replicas_available{namespace=\"$namespace\"} / kube_deployment_spec_replicas{namespace=\"$namespace\"})", - "legendFormat": "Availability", - "refId": "A" - }, - { - "expr": "vector(0.999)", - "legendFormat": "Target (99.9%)", - "refId": "B" - } - ], - "description": "DP service availability over time with SLO target line." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "percentunit", - "max": 1, - "min": -0.5 - } - }, - "gridPos": { - "h": 6, - "w": 12, - "x": 12, - "y": 16 - }, - "id": 1206, - "title": "Error Budget Burn Rate", - "type": "timeseries", - "targets": [ - { - "expr": "union:dp:slo:error_budget_remaining", - "legendFormat": "Budget remaining", - "refId": "A" - }, - { - "expr": "vector(0)", - "legendFormat": "Exhausted", - "refId": "B" - } - ], - "description": "DP error budget remaining over time. Requires monitoring.slos.enabled." - }, - { - "collapsed": true, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 31 - }, - "id": 200, - "title": "Union Operator", - "type": "row", - "panels": [ - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ops" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 10 - }, - "id": 201, - "title": "Work Queue Operations", - "type": "timeseries", - "targets": [ - { - "expr": "rate(union_operator:work_queue:operations_processed{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Processed", - "refId": "A" - }, - { - "expr": "rate(union_operator:work_queue:operations_failed{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Failed", - "refId": "B" - } - ], - "description": "Operator execution operation processing rate and failure rate." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 8, - "y": 10 - }, - "id": 202, - "title": "Background Process Runs / Errors", - "type": "timeseries", - "targets": [ - { - "expr": "rate(union_operator:heartbeat_updater:runs{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Heartbeat runs", - "refId": "A" - }, - { - "expr": "rate(union_operator:heartbeat_updater:run_errors{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Heartbeat errors", - "refId": "B" - }, - { - "expr": "rate(union_operator:status_updater:runs{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Status runs", - "refId": "C" - }, - { - "expr": "rate(union_operator:status_updater:run_errors{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Status errors", - "refId": "D" - }, - { - "expr": "rate(union_operator:prometheus_health_checker:run_errors{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Prom health errors", - "refId": "E" - } - ], - "description": "FlyteAdmin error rates: propeller communication failures, model transform errors, notification publish failures." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ms" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 10 - }, - "id": 203, - "title": "Heartbeat Latency", - "type": "timeseries", - "targets": [ - { - "expr": "union_operator:heartbeat:compute_capabilities_ms{namespace=\"$namespace\", quantile=\"0.9\"}", - "legendFormat": "Capabilities p90", - "refId": "A" - }, - { - "expr": "union_operator:heartbeat:compute_usages_ms{namespace=\"$namespace\", quantile=\"0.9\"}", - "legendFormat": "Usages p90", - "refId": "B" - }, - { - "expr": "union_operator:heartbeat:list_workflows_ms{namespace=\"$namespace\", quantile=\"0.9\"}", - "legendFormat": "List WFs p90", - "refId": "C" - } - ], - "description": "Breakdown of operator heartbeat computation: capabilities, usages, list workflows." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ops" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 18 - }, - "id": 204, - "title": "Config Syncer", - "type": "timeseries", - "targets": [ - { - "expr": "rate(union_operator:config_syncer:runs{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Sync runs", - "refId": "A" - }, - { - "expr": "rate(union_operator:config_syncer:run_errors{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Sync errors", - "refId": "B" - }, - { - "expr": "rate(union_operator:config_syncer:propeller_configmap_updated{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Propeller CM updated", - "refId": "C" - } - ], - "description": "Config sync cycle rate, errors, and propeller ConfigMap update count." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ops" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 8, - "y": 18 - }, - "id": 205, - "title": "Billable Usage Collector", - "type": "timeseries", - "targets": [ - { - "expr": "rate(union_operator:billable_usage_collector:runs{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Runs", - "refId": "A" - }, - { - "expr": "rate(union_operator:billable_usage_collector:run_errors{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Errors", - "refId": "B" - } - ], - "description": "Billing collection cycle rate and errors. Failures mean billing data may be delayed." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "thresholds": { - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 1 - } - ] - }, - "unit": "bool_yes_no" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 18 - }, - "id": 206, - "title": "Work Queue Paused", - "type": "stat", - "options": { - "colorMode": "background", - "graphMode": "area", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ] - }, - "textMode": "auto" - }, - "targets": [ - { - "expr": "union_operator:work_queue:paused{namespace=\"$namespace\"}", - "legendFormat": "Paused", - "refId": "A" - } - ], - "description": "1 when operator paused due to resource limits (FlyteWorkflow count or storage exceeded)." - } - ] - }, - { - "collapsed": true, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 32 - }, - "id": 300, - "title": "Executor (V2)", - "type": "row", - "panels": [ - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 11 - }, - "id": 301, - "title": "Active Actions & Capacity", - "type": "timeseries", - "targets": [ - { - "expr": "executor:active_actions_count{namespace=\"$namespace\"}", - "legendFormat": "Active actions", - "refId": "A" - }, - { - "expr": "executor:available_capacity{namespace=\"$namespace\"}", - "legendFormat": "Available capacity", - "refId": "B" - } - ], - "description": "V2 executor active actions vs available capacity. Capacity=0 means executor is saturated." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 11 - }, - "id": 302, - "title": "Cache Discovery", - "type": "timeseries", - "targets": [ - { - "expr": "rate(executor:discovery_miss_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Miss", - "refId": "A" - }, - { - "expr": "rate(executor:discovery_put_success_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Put success", - "refId": "B" - } - ], - "description": "V2 executor cache discovery miss/put rates for task output caching." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never", - "stacking": { - "mode": "normal" - } - }, - "unit": "ops" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 19 - }, - "id": 303, - "title": "Actions Terminated by Phase", - "type": "timeseries", - "targets": [ - { - "expr": "sum by (phase) (rate(executor:actions_terminated{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "{{ phase }}", - "refId": "A" - } - ], - "description": "Task completion rate by outcome: succeeded, failed, aborted. Key V2 SLI for task health. 
[Metrics pending: requires cloud service instrumentation to be deployed]" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ms" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 8, - "y": 19 - }, - "id": 304, - "title": "Evaluator Duration (pod creation)", - "type": "timeseries", - "targets": [ - { - "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.5\"}", - "legendFormat": "Evaluate p50", - "refId": "A" - }, - { - "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.9\"}", - "legendFormat": "Evaluate p90", - "refId": "B" - }, - { - "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.99\"}", - "legendFormat": "Evaluate p99", - "refId": "C" - } - ], - "description": "Time spent in RecursiveNodeHandler (pod creation). Dominant component of V2 task latency. 
[Metrics pending: requires cloud service instrumentation to be deployed]" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ops" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 19 - }, - "id": 305, - "title": "System Failures & Invalid Leases", - "type": "timeseries", - "targets": [ - { - "expr": "rate(executor:system_failures{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "System failures", - "refId": "A" - }, - { - "expr": "rate(executor:system_failures_exhausted{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Exhausted retries", - "refId": "B" - }, - { - "expr": "rate(executor:invalid_leases{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Invalid leases", - "refId": "C" - }, - { - "expr": "rate(executor:evaluator:evaluate_errors{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Evaluate errors", - "refId": "D" - } - ], - "description": "Executor error rates. System failures retry; exhausted means task permanently failed. Invalid leases = malformed from queue service. 
[Metrics pending: requires cloud service instrumentation to be deployed]" - } - ] - }, - { - "collapsed": true, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 34 - }, - "id": 400, - "title": "gRPC Client (DP \u2192 CP)", - "type": "row", - "panels": [ - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "reqps" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 13 - }, - "id": 401, - "title": "gRPC Client Request Rate", - "type": "timeseries", - "targets": [ - { - "expr": "sum by (grpc_service, grpc_method) (rate(grpc_client_handled_total{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "{{ grpc_service }}/{{ grpc_method }}", - "refId": "A" - } - ], - "description": "DP\u2192CP gRPC client request rate by service and method (e.g. CreateWorkflowEvent)." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "reqps" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 13 - }, - "id": 402, - "title": "gRPC Client Error Rate", - "type": "timeseries", - "targets": [ - { - "expr": "sum by (grpc_service, grpc_method, grpc_code) (rate(grpc_client_handled_total{namespace=\"$namespace\", grpc_code!=\"OK\"}[$__rate_interval]))", - "legendFormat": "{{ grpc_method }} {{ grpc_code }}", - "refId": "A" - } - ], - "description": "DP\u2192CP gRPC errors by method and code. Non-OK responses from the control plane." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "s" - } - }, - "gridPos": { - "h": 8, - "w": 24, - "x": 0, - "y": 21 - }, - "id": 403, - "title": "gRPC Client Latency (p95)", - "type": "timeseries", - "targets": [ - { - "expr": "histogram_quantile(0.95, sum by (le, grpc_method) (rate(grpc_client_handling_seconds_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "{{ grpc_method }} p95", - "refId": "A" - } - ], - "description": "DP\u2192CP gRPC call latency at p95. High latency = slow control plane or network issues." - } - ] - }, - { - "collapsed": true, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 35 - }, - "id": 1100, - "title": "Infrastructure", - "type": "row", - "panels": [ - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never", - "stacking": { - "mode": "none" - } - }, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 14 - }, - "id": 1101, - "title": "CPU Usage by Service", - "type": "timeseries", - "targets": [ - { - "expr": "sum by (container) (rate(container_cpu_usage_seconds_total{namespace=\"$namespace\", container!=\"\", container!=\"POD\"}[$__rate_interval]))", - "legendFormat": "{{ container }}", - "refId": "A" - } - ], - "description": "CPU usage in cores per container, stacked. Identifies resource-heavy services." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never", - "stacking": { - "mode": "none" - } - }, - "unit": "bytes" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 14 - }, - "id": 1102, - "title": "Memory Usage by Service", - "type": "timeseries", - "targets": [ - { - "expr": "sum by (container) (container_memory_working_set_bytes{namespace=\"$namespace\", container!=\"\", container!=\"POD\"})", - "legendFormat": "{{ container }}", - "refId": "A" - } - ], - "description": "Working set memory per container. Watch for approaching limits." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "bars", - "fillOpacity": 80, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 24, - "x": 0, - "y": 22 - }, - "id": 1103, - "title": "Pod Restart Count by Container", - "type": "timeseries", - "targets": [ - { - "expr": "increase(kube_pod_container_status_restarts_total{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "{{ pod }}/{{ container }}", - "refId": "A" - } - ], - "description": "Per-container restart events. Spikes indicate crashes or OOM kills." 
- } - ] - } - ], - "schemaVersion": 39, - "tags": [ - "union", - "dataplane" - ], - "templating": { - "list": [ - { - "current": {}, - "hide": 0, - "includeAll": false, - "label": "Data Source", - "multi": false, - "name": "datasource", - "options": [], - "query": "prometheus", - "refresh": 1, - "type": "datasource" - }, - { - "current": { - "selected": true, - "text": "__NAMESPACE__", - "value": "__NAMESPACE__" - }, - "hide": 2, - "label": "Namespace", - "name": "namespace", - "options": [ - { - "selected": true, - "text": "__NAMESPACE__", - "value": "__NAMESPACE__" - } - ], - "query": "__NAMESPACE__", - "type": "constant" - } - ] - }, - "time": { - "from": "now-3h", - "to": "now" - }, - "timepicker": {}, - "timezone": "browser", - "title": "Union Dataplane V2 Overview", - "uid": "union-dataplane-v2-overview", - "version": 1 -} From c88573869266bfcfe15313c01d3937e38f68c940 Mon Sep 17 00:00:00 2001 From: Michael Hotan Date: Sun, 3 May 2026 15:39:44 +1000 Subject: [PATCH 3/6] Fix executor metric names: executor: -> executor::v2: The executorv2 binary registers metrics with a v2 scope suffix, producing metric names like executor::v2:active_actions_count instead of executor:active_actions_count. Update dashboard panels and PrometheusRule recording rules to match the actual metric names. Note: executor:handler_panic is unchanged (emitted by v1 scope). A separate issue will be filed for the Runtime team to fix the double-colon scope naming in the executor binary. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../dashboards/union-dataplane-overview.json | 26 +++++++++---------- .../templates/monitoring/prometheusrule.yaml | 6 ++--- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/charts/dataplane/dashboards/union-dataplane-overview.json b/charts/dataplane/dashboards/union-dataplane-overview.json index 4e89bbc8..aebe5be4 100644 --- a/charts/dataplane/dashboards/union-dataplane-overview.json +++ b/charts/dataplane/dashboards/union-dataplane-overview.json @@ -496,7 +496,7 @@ }, "targets": [ { - "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.99\"}", + "expr": "executor::v2:evaluator:evaluate_duration_ms{namespace=\"$namespace\", quantile=\"0.99\"}", "legendFormat": "p99", "refId": "A", "datasource": { @@ -937,12 +937,12 @@ "type": "timeseries", "targets": [ { - "expr": "executor:active_actions_count{namespace=\"$namespace\"}", + "expr": "executor::v2:active_actions_count{namespace=\"$namespace\"}", "legendFormat": "Active actions", "refId": "A" }, { - "expr": "executor:available_capacity{namespace=\"$namespace\"}", + "expr": "executor::v2:available_capacity{namespace=\"$namespace\"}", "legendFormat": "Available capacity", "refId": "B" } @@ -979,12 +979,12 @@ "type": "timeseries", "targets": [ { - "expr": "rate(executor:discovery_miss_count{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:discovery_miss_count{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Miss", "refId": "A" }, { - "expr": "rate(executor:discovery_put_success_count{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:discovery_put_success_count{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Put success", "refId": "B" } @@ -1024,7 +1024,7 @@ "type": "timeseries", "targets": [ { - "expr": "sum by (phase) (rate(executor:actions_terminated{namespace=\"$namespace\"}[$__rate_interval]))", + "expr": "sum by (phase) 
(rate(executor::v2:actions_terminated{namespace=\"$namespace\"}[$__rate_interval]))", "legendFormat": "{{ phase }}", "refId": "A" } @@ -1061,17 +1061,17 @@ "type": "timeseries", "targets": [ { - "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.5\"}", + "expr": "executor::v2:evaluator:evaluate_duration_ms{namespace=\"$namespace\", quantile=\"0.5\"}", "legendFormat": "Evaluate p50", "refId": "A" }, { - "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.9\"}", + "expr": "executor::v2:evaluator:evaluate_duration_ms{namespace=\"$namespace\", quantile=\"0.9\"}", "legendFormat": "Evaluate p90", "refId": "B" }, { - "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.99\"}", + "expr": "executor::v2:evaluator:evaluate_duration_ms{namespace=\"$namespace\", quantile=\"0.99\"}", "legendFormat": "Evaluate p99", "refId": "C" } @@ -1108,22 +1108,22 @@ "type": "timeseries", "targets": [ { - "expr": "rate(executor:system_failures{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:system_failures{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "System failures", "refId": "A" }, { - "expr": "rate(executor:system_failures_exhausted{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:system_failures_exhausted{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Exhausted retries", "refId": "B" }, { - "expr": "rate(executor:invalid_leases{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:invalid_leases{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Invalid leases", "refId": "C" }, { - "expr": "rate(executor:evaluator:evaluate_errors{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:evaluator:evaluate_errors{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Evaluate errors", "refId": "D" } diff --git 
a/charts/dataplane/templates/monitoring/prometheusrule.yaml b/charts/dataplane/templates/monitoring/prometheusrule.yaml index b36353b1..f080334e 100644 --- a/charts/dataplane/templates/monitoring/prometheusrule.yaml +++ b/charts/dataplane/templates/monitoring/prometheusrule.yaml @@ -74,7 +74,7 @@ spec: - record: union:dp:executor:active_actions expr: | - executor:active_actions_count{namespace="{{ .Release.Namespace }}"} + executor::v2:active_actions_count{namespace="{{ .Release.Namespace }}"} {{- if .Values.monitoring.alerting.enabled }} # --- Operational alerts (opt-in) --- @@ -143,8 +143,8 @@ spec: - record: union:dp:slo:executor_success_rate expr: | ( - sum(rate(executor:actions_terminated{namespace="{{ .Release.Namespace }}", phase="Succeeded"}[5m])) - / sum(rate(executor:actions_terminated{namespace="{{ .Release.Namespace }}"}[5m])) + sum(rate(executor::v2:actions_terminated{namespace="{{ .Release.Namespace }}", phase="Succeeded"}[5m])) + / sum(rate(executor::v2:actions_terminated{namespace="{{ .Release.Namespace }}"}[5m])) ) or vector(1) - record: union:dp:slo:execution_success_rate From f6132c7713dd45b864ee732ca4c1bb1db313e626 Mon Sep 17 00:00:00 2001 From: Michael Hotan Date: Sun, 3 May 2026 16:59:09 +1000 Subject: [PATCH 4/6] Fix heartbeat latency panel: quantile 0.95 does not exist cluster:svc:heartbeat:success_ms is a summary metric with quantiles 0.5, 0.9, and 0.99. The panel queried for quantile="0.95" which returned empty. Changed to 0.99 and renamed panel to "Cluster API Latency (p99)". 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../controlplane/dashboards/union-controlplane-overview.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/charts/controlplane/dashboards/union-controlplane-overview.json b/charts/controlplane/dashboards/union-controlplane-overview.json index a044a3f0..f53c1ce8 100644 --- a/charts/controlplane/dashboards/union-controlplane-overview.json +++ b/charts/controlplane/dashboards/union-controlplane-overview.json @@ -2205,7 +2205,7 @@ "y": 14 }, "id": 602, - "title": "Cluster API Latency (p95)", + "title": "Cluster API Latency (p99)", "type": "timeseries", "targets": [ { @@ -2214,7 +2214,7 @@ "refId": "A" }, { - "expr": "cluster:svc:heartbeat:success_ms{namespace=\"$namespace\", quantile=\"0.95\"} / 1000", + "expr": "cluster:svc:heartbeat:success_ms{namespace=\"$namespace\", quantile=\"0.99\"} / 1000", "legendFormat": "Heartbeat p95", "refId": "B" } From 796fcea278bbae7b85a04ad7e6ff89f7df09918e Mon Sep 17 00:00:00 2001 From: Michael Hotan Date: Mon, 4 May 2026 09:30:31 +1000 Subject: [PATCH 5/6] Add table legends with min/max/latest to all timeseries panels Set legend displayMode=table with min, max, lastNotNull calcs on all 82 timeseries panels across both CP and DP dashboards. This makes it easier to spot anomalies at a glance without hovering. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../union-controlplane-overview.json | 748 ++++++++++++++++-- .../dashboards/union-dataplane-overview.json | 239 +++++- 2 files changed, 903 insertions(+), 84 deletions(-) diff --git a/charts/controlplane/dashboards/union-controlplane-overview.json b/charts/controlplane/dashboards/union-controlplane-overview.json index f53c1ce8..5dbe2113 100644 --- a/charts/controlplane/dashboards/union-controlplane-overview.json +++ b/charts/controlplane/dashboards/union-controlplane-overview.json @@ -227,8 +227,9 @@ "options": { "legend": { "calcs": [ - "mean", - "max" + "min", + "max", + "lastNotNull" ], "displayMode": "table", "placement": "bottom" @@ -278,7 +279,9 @@ "options": { "legend": { "calcs": [ - "mean" + "min", + "max", + "lastNotNull" ], "displayMode": "table", "placement": "bottom" @@ -644,7 +647,18 @@ "refId": "B" } ], - "description": "Service availability over time with SLO target line." + "description": "Service availability over time with SLO target line.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -688,7 +702,18 @@ "refId": "B" } ], - "description": "Error budget remaining over time. Requires monitoring.slos.enabled." + "description": "Error budget remaining over time. Requires monitoring.slos.enabled.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "collapsed": true, @@ -740,7 +765,18 @@ "refId": "A" } ], - "description": "Ingress request rate broken down by host and URL path." 
+ "description": "Ingress request rate broken down by host and URL path.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -780,7 +816,18 @@ "refId": "A" } ], - "description": "4xx and 5xx error rates from ingress-nginx by HTTP status code." + "description": "4xx and 5xx error rates from ingress-nginx by HTTP status code.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -827,7 +874,18 @@ "refId": "C" } ], - "description": "Ingress request latency percentiles. Includes TLS + routing + upstream response time." + "description": "Ingress request latency percentiles. Includes TLS + routing + upstream response time.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -864,7 +922,18 @@ "refId": "A" } ], - "description": "Current number of active client connections to ingress-nginx." + "description": "Current number of active client connections to ingress-nginx.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -918,7 +987,18 @@ "refId": "A" } ], - "description": "Connect protocol request throughput broken down by service (e.g. ExecutionService, ClusterService)." + "description": "Connect protocol request throughput broken down by service (e.g. ExecutionService, ClusterService).", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -955,7 +1035,18 @@ "refId": "A" } ], - "description": "Connect errors broken down by service and gRPC code. Identifies which services are erroring." 
+ "description": "Connect errors broken down by service and gRPC code. Identifies which services are erroring.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -992,7 +1083,18 @@ "refId": "A" } ], - "description": "CacheService is the only CP service using gRPC (not Connect). Shows Get/Put/Delete/Reservation rates." + "description": "CacheService is the only CP service using gRPC (not Connect). Shows Get/Put/Delete/Reservation rates.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1029,7 +1131,18 @@ "refId": "A" } ], - "description": "CacheService gRPC errors by method and code." + "description": "CacheService gRPC errors by method and code.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -1090,7 +1203,18 @@ "refId": "C" } ], - "description": "Current count of active workflow, node, and task executions tracked by FlyteAdmin." + "description": "Current count of active workflow, node, and task executions tracked by FlyteAdmin.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1142,7 +1266,18 @@ "refId": "D" } ], - "description": "Rate of execution creations and event ingestion (workflow, node, task events from propeller)." 
+ "description": "Rate of execution creations and event ingestion (workflow, node, task events from propeller).", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1194,7 +1329,18 @@ "refId": "D" } ], - "description": "FlyteAdmin error rates: propeller communication failures, model transform errors, notification publish failures." + "description": "FlyteAdmin error rates: propeller communication failures, model transform errors, notification publish failures.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1246,7 +1392,18 @@ "refId": "D" } ], - "description": "FlyteAdmin gRPC endpoint latency at p95. Key endpoints: CreateExecution, CreateExecutionEvent, GetExecution." + "description": "FlyteAdmin gRPC endpoint latency at p95. Key endpoints: CreateExecution, CreateExecutionEvent, GetExecution.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1288,7 +1445,18 @@ "refId": "B" } ], - "description": "Authorization approve/deny rate from the FlyteAdmin auth middleware. High deny rate may indicate auth misconfiguration." + "description": "Authorization approve/deny rate from the FlyteAdmin auth middleware. 
High deny rate may indicate auth misconfiguration.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -1328,6 +1496,17 @@ "w": 8, "x": 0, "y": 0 + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } } }, { @@ -1364,6 +1543,17 @@ "w": 8, "x": 8, "y": 0 + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } } }, { @@ -1390,6 +1580,17 @@ "w": 8, "x": 16, "y": 0 + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } } }, { @@ -1427,7 +1628,18 @@ "refId": "A" } ], - "description": "Execution operations DB latency: create, ack, claim, unclaim, get, update." + "description": "Execution operations DB latency: create, ack, claim, unclaim, get, update.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1474,7 +1686,18 @@ "refId": "C" } ], - "description": "Executions service Postgres error rates by type: gorm errors, native postgres errors, not-found." + "description": "Executions service Postgres error rates by type: gorm errors, native postgres errors, not-found.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1526,7 +1749,18 @@ "refId": "D" } ], - "description": "Cluster and nodepool list cache effectiveness. High miss rate = excessive DB queries." + "description": "Cluster and nodepool list cache effectiveness. 
High miss rate = excessive DB queries.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1563,7 +1797,18 @@ "refId": "A" } ], - "description": "Number of apps waiting for cluster assignment. Growing backlog = scheduling bottleneck." + "description": "Number of apps waiting for cluster assignment. Growing backlog = scheduling bottleneck.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1610,7 +1855,18 @@ "refId": "C" } ], - "description": "Key V2 SLI: time to deliver an app to the dataplane. Measures end-to-end scheduling latency." + "description": "Key V2 SLI: time to deliver an app to the dataplane. Measures end-to-end scheduling latency.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1657,7 +1913,18 @@ "refId": "C" } ], - "description": "V2 run/action dispatch throughput. Enqueue failures indicate queue service issues." + "description": "V2 run/action dispatch throughput. Enqueue failures indicate queue service issues.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1704,7 +1971,18 @@ "refId": "C" } ], - "description": "V2 notification pipeline: notifications sent per second, active subscribers, log bytes streamed." + "description": "V2 notification pipeline: notifications sent per second, active subscribers, log bytes streamed.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -1770,7 +2048,18 @@ "refId": "D" } ], - "description": "Total and scheduled run/action counts in the queue. Shows system load." 
+ "description": "Total and scheduled run/action counts in the queue. Shows system load.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1817,7 +2106,18 @@ "refId": "C" } ], - "description": "Lease lifecycle throughput: enqueued (new), completed (done), aborted (cancelled)." + "description": "Lease lifecycle throughput: enqueued (new), completed (done), aborted (cancelled).", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1874,7 +2174,18 @@ "refId": "E" } ], - "description": "Internal queue depths across scheduler, runner, aborter, dispatcher, and DB worker pool. Growing = backpressure." + "description": "Internal queue depths across scheduler, runner, aborter, dispatcher, and DB worker pool. Growing = backpressure.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1911,7 +2222,18 @@ "refId": "A" } ], - "description": "Dispatcher multi-step operation chain execution time at p99, by operation type." + "description": "Dispatcher multi-step operation chain execution time at p99, by operation type.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1953,7 +2275,18 @@ "refId": "B" } ], - "description": "In-memory state store operation latency. Backed by ScyllaDB persistence." + "description": "In-memory state store operation latency. 
Backed by ScyllaDB persistence.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2000,7 +2333,18 @@ "refId": "C" } ], - "description": "Active/terminal state counts and eventer error rate. Eventer reports action status to executions service." + "description": "Active/terminal state counts and eventer error rate. Eventer reports action status to executions service.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2037,7 +2381,18 @@ "refId": "A" } ], - "description": "Remaining execution capacity per connected DP worker. Zero = worker saturated." + "description": "Remaining execution capacity per connected DP worker. Zero = worker saturated.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2074,7 +2429,18 @@ "refId": "A" } ], - "description": "Failed dispatcher operations by Go type. Indicates internal queue service errors." + "description": "Failed dispatcher operations by Go type. Indicates internal queue service errors.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2121,7 +2487,18 @@ "refId": "C" } ], - "description": "Idle worker goroutines in DB, queue-client, and state-client pools. Zero = all threads busy." + "description": "Idle worker goroutines in DB, queue-client, and state-client pools. 
Zero = all threads busy.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -2177,7 +2554,18 @@ "refId": "B" } ], - "description": "Rate of DP cluster status updates and heartbeats received by the cluster service." + "description": "Rate of DP cluster status updates and heartbeats received by the cluster service.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2219,7 +2607,18 @@ "refId": "B" } ], - "description": "Cluster service RPC latency for UpdateStatus and Heartbeat calls." + "description": "Cluster service RPC latency for UpdateStatus and Heartbeat calls.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2261,7 +2660,18 @@ "refId": "B" } ], - "description": "DP-reported restart counts for operator and propeller pods. Set by DP on each UpdateStatus call." + "description": "DP-reported restart counts for operator and propeller pods. Set by DP on each UpdateStatus call.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2308,7 +2718,18 @@ "refId": "C" } ], - "description": "Cluster service Postgres error rates by type: gorm errors, native postgres errors, not-found." + "description": "Cluster service Postgres error rates by type: gorm errors, native postgres errors, not-found.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2366,7 +2787,18 @@ "refId": "A" } ], - "description": "Cluster health collector: 1=unhealthy, 0=healthy. Emitted per cluster on every Prometheus scrape." 
+ "description": "Cluster health collector: 1=unhealthy, 0=healthy. Emitted per cluster on every Prometheus scrape.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2403,7 +2835,18 @@ "refId": "A" } ], - "description": "Seconds since each cluster last sent a heartbeat. High values = stale/disconnected cluster." + "description": "Seconds since each cluster last sent a heartbeat. High values = stale/disconnected cluster.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2445,7 +2888,18 @@ "refId": "B" } ], - "description": "LRU cache hit/miss rate for managed cluster lookups. High miss rate = excessive DB queries." + "description": "LRU cache hit/miss rate for managed cluster lookups. High miss rate = excessive DB queries.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -2506,7 +2960,18 @@ "refId": "C" } ], - "description": "CacheService hit/miss rate. Hits = cached task output reused. Misses = task must execute. Get failures = storage errors. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "CacheService hit/miss rate. Hits = cached task output reused. Misses = task must execute. Get failures = storage errors. [Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2553,7 +3018,18 @@ "refId": "C" } ], - "description": "Cache reservation contention: how often workers are blocked waiting for another worker's cache computation. High contention = many workers computing the same task. 
[Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Cache reservation contention: how often workers are blocked waiting for another worker's cache computation. High contention = many workers computing the same task. [Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -2679,7 +3155,9 @@ "displayMode": "table", "placement": "bottom", "calcs": [ - "mean" + "min", + "max", + "lastNotNull" ] }, "tooltip": { @@ -2747,7 +3225,12 @@ "options": { "legend": { "displayMode": "table", - "placement": "bottom" + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] }, "tooltip": { "mode": "multi" @@ -2806,8 +3289,13 @@ }, "options": { "legend": { - "displayMode": "list", - "placement": "bottom" + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] }, "tooltip": { "mode": "multi" @@ -2876,8 +3364,13 @@ }, "options": { "legend": { - "displayMode": "list", - "placement": "bottom" + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] }, "tooltip": { "mode": "multi" @@ -2935,7 +3428,9 @@ "displayMode": "table", "placement": "bottom", "calcs": [ - "sum" + "min", + "max", + "lastNotNull" ] }, "tooltip": { @@ -2997,7 +3492,12 @@ "options": { "legend": { "displayMode": "table", - "placement": "bottom" + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] }, "tooltip": { "mode": "multi" @@ -3053,7 +3553,12 @@ "options": { "legend": { "displayMode": "table", - "placement": "bottom" + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] }, "tooltip": { "mode": "multi" @@ -3108,8 +3613,13 @@ }, "options": { "legend": { - "displayMode": "list", - "placement": "bottom" + "displayMode": "table", + "placement": "bottom", + "calcs": 
[ + "min", + "max", + "lastNotNull" + ] }, "tooltip": { "mode": "multi" @@ -3191,7 +3701,18 @@ "refId": "D" } ], - "description": "DataProxy internal cache effectiveness for domain resolution, cluster pool routing, and namespace mapping." + "description": "DataProxy internal cache effectiveness for domain resolution, cluster pool routing, and namespace mapping.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3233,7 +3754,18 @@ "refId": "B" } ], - "description": "Time to read image metadata from the dataplane, proxied through DataProxy." + "description": "Time to read image metadata from the dataplane, proxied through DataProxy.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3270,7 +3802,18 @@ "refId": "A" } ], - "description": "Per-cluster secret proxy errors during fan-out operations. Identifies which dataplane cluster is causing failures. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Per-cluster secret proxy errors during fan-out operations. Identifies which dataplane cluster is causing failures. [Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -3321,7 +3864,18 @@ "refId": "A" } ], - "description": "Rate of ReportBillableUsage calls from DP clusters. Each call reports resource consumption for billing." + "description": "Rate of ReportBillableUsage calls from DP clusters. 
Each call reports resource consumption for billing.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3373,7 +3927,18 @@ "refId": "D" } ], - "description": "SQS/queue message processing: received, sent (success), failed, dropped (max retries exceeded)." + "description": "SQS/queue message processing: received, sent (success), failed, dropped (max retries exceeded).", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3413,7 +3978,18 @@ "refId": "A" } ], - "description": "Successful message processing rate by type: node_execution, workflow_execution, billable_usage, serverless_billable_usage. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Successful message processing rate by type: node_execution, workflow_execution, billable_usage, serverless_billable_usage. [Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3460,7 +4036,18 @@ "refId": "C" } ], - "description": "Time to process individual queue messages. Slow processing may indicate backend API issues (Metronome, timestream). [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Time to process individual queue messages. Slow processing may indicate backend API issues (Metronome, timestream). [Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -3514,7 +4101,18 @@ "refId": "A" } ], - "description": "CPU usage in cores per container, stacked. 
Identifies resource-heavy services." + "description": "CPU usage in cores per container, stacked. Identifies resource-heavy services.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3554,7 +4152,18 @@ "refId": "A" } ], - "description": "Working set memory per container, stacked. Watch for approaching limits." + "description": "Working set memory per container, stacked. Watch for approaching limits.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3591,7 +4200,18 @@ "refId": "A" } ], - "description": "Per-container restart events. Spikes indicate crashes or OOM kills." + "description": "Per-container restart events. Spikes indicate crashes or OOM kills.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] } diff --git a/charts/dataplane/dashboards/union-dataplane-overview.json b/charts/dataplane/dashboards/union-dataplane-overview.json index aebe5be4..15e325ca 100644 --- a/charts/dataplane/dashboards/union-dataplane-overview.json +++ b/charts/dataplane/dashboards/union-dataplane-overview.json @@ -165,8 +165,9 @@ "options": { "legend": { "calcs": [ - "mean", - "max" + "min", + "max", + "lastNotNull" ], "displayMode": "table", "placement": "bottom" @@ -547,7 +548,18 @@ "refId": "B" } ], - "description": "DP service availability over time with SLO target line." + "description": "DP service availability over time with SLO target line.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -591,7 +603,18 @@ "refId": "B" } ], - "description": "DP error budget remaining over time. Requires monitoring.slos.enabled." 
+ "description": "DP error budget remaining over time. Requires monitoring.slos.enabled.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "collapsed": true, @@ -645,7 +668,18 @@ "refId": "B" } ], - "description": "Operator execution operation processing rate and failure rate." + "description": "Operator execution operation processing rate and failure rate.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -702,7 +736,18 @@ "refId": "E" } ], - "description": "FlyteAdmin error rates: propeller communication failures, model transform errors, notification publish failures." + "description": "FlyteAdmin error rates: propeller communication failures, model transform errors, notification publish failures.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -749,7 +794,18 @@ "refId": "C" } ], - "description": "Breakdown of operator heartbeat computation: capabilities, usages, list workflows." + "description": "Breakdown of operator heartbeat computation: capabilities, usages, list workflows.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -796,7 +852,18 @@ "refId": "C" } ], - "description": "Config sync cycle rate, errors, and propeller ConfigMap update count." + "description": "Config sync cycle rate, errors, and propeller ConfigMap update count.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -838,7 +905,18 @@ "refId": "B" } ], - "description": "Billing collection cycle rate and errors. 
Failures mean billing data may be delayed." + "description": "Billing collection cycle rate and errors. Failures mean billing data may be delayed.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -947,7 +1025,18 @@ "refId": "B" } ], - "description": "V2 executor active actions vs available capacity. Capacity=0 means executor is saturated." + "description": "V2 executor active actions vs available capacity. Capacity=0 means executor is saturated.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -989,7 +1078,18 @@ "refId": "B" } ], - "description": "V2 executor cache discovery miss/put rates for task output caching." + "description": "V2 executor cache discovery miss/put rates for task output caching.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1029,7 +1129,18 @@ "refId": "A" } ], - "description": "Task completion rate by outcome: succeeded, failed, aborted. Key V2 SLI for task health. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Task completion rate by outcome: succeeded, failed, aborted. Key V2 SLI for task health. [Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1076,7 +1187,18 @@ "refId": "C" } ], - "description": "Time spent in RecursiveNodeHandler (pod creation). Dominant component of V2 task latency. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Time spent in RecursiveNodeHandler (pod creation). Dominant component of V2 task latency. 
[Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1128,7 +1250,18 @@ "refId": "D" } ], - "description": "Executor error rates. System failures retry; exhausted means task permanently failed. Invalid leases = malformed from queue service. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Executor error rates. System failures retry; exhausted means task permanently failed. Invalid leases = malformed from queue service. [Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -1179,7 +1312,18 @@ "refId": "A" } ], - "description": "DP\u2192CP gRPC client request rate by service and method (e.g. CreateWorkflowEvent)." + "description": "DP\u2192CP gRPC client request rate by service and method (e.g. CreateWorkflowEvent).", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1216,7 +1360,18 @@ "refId": "A" } ], - "description": "DP\u2192CP gRPC errors by method and code. Non-OK responses from the control plane." + "description": "DP\u2192CP gRPC errors by method and code. Non-OK responses from the control plane.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1253,7 +1408,18 @@ "refId": "A" } ], - "description": "DP\u2192CP gRPC call latency at p95. High latency = slow control plane or network issues." + "description": "DP\u2192CP gRPC call latency at p95. 
High latency = slow control plane or network issues.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -1307,7 +1473,18 @@ "refId": "A" } ], - "description": "CPU usage in cores per container, stacked. Identifies resource-heavy services." + "description": "CPU usage in cores per container, stacked. Identifies resource-heavy services.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1347,7 +1524,18 @@ "refId": "A" } ], - "description": "Working set memory per container. Watch for approaching limits." + "description": "Working set memory per container. Watch for approaching limits.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1384,7 +1572,18 @@ "refId": "A" } ], - "description": "Per-container restart events. Spikes indicate crashes or OOM kills." + "description": "Per-container restart events. 
Spikes indicate crashes or OOM kills.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] } From 77cdd98a9a9a00d1a5b363b7c0325556a7eb1b93 Mon Sep 17 00:00:00 2001 From: Michael Hotan Date: Mon, 4 May 2026 09:50:45 +1000 Subject: [PATCH 6/6] Regenerate test snapshots via make generate-expected Co-Authored-By: Claude Opus 4.6 (1M context) --- .../controlplane.aws.billing-enable.yaml | 918 ++++++++++++---- tests/generated/controlplane.aws.yaml | 918 ++++++++++++---- tests/generated/controlplane.custom-oidc.yaml | 918 ++++++++++++---- .../controlplane.external-authz.yaml | 918 ++++++++++++---- tests/generated/controlplane.userclouds.yaml | 918 ++++++++++++---- .../dataplane.additional-podlabels.yaml | 994 +++++------------- .../dataplane.additional-templates.yaml | 994 +++++------------- .../generated/dataplane.aws.eks-automode.yaml | 994 +++++------------- .../generated/dataplane.aws.with-ingress.yaml | 994 +++++------------- tests/generated/dataplane.aws.yaml | 994 +++++------------- ...dataplane.azure-custom-storage-prefix.yaml | 994 +++++------------- tests/generated/dataplane.azure.yaml | 994 +++++------------- tests/generated/dataplane.cost.yaml | 994 +++++------------- tests/generated/dataplane.dcgm-exporter.yaml | 994 +++++------------- .../generated/dataplane.fully-selfhosted.yaml | 994 +++++------------- tests/generated/dataplane.gcp.yaml | 994 +++++------------- tests/generated/dataplane.low-priv.yaml | 994 +++++------------- tests/generated/dataplane.monitoring.yaml | 994 +++++------------- tests/generated/dataplane.nodeobserver.yaml | 994 +++++------------- tests/generated/dataplane.oci.yaml | 994 +++++------------- 20 files changed, 7595 insertions(+), 11905 deletions(-) diff --git a/tests/generated/controlplane.aws.billing-enable.yaml b/tests/generated/controlplane.aws.billing-enable.yaml index 7c314675..eda9ee8b 100644 --- 
a/tests/generated/controlplane.aws.billing-enable.yaml +++ b/tests/generated/controlplane.aws.billing-enable.yaml @@ -1851,8 +1851,9 @@ data: "options": { "legend": { "calcs": [ - "mean", - "max" + "min", + "max", + "lastNotNull" ], "displayMode": "table", "placement": "bottom" @@ -1902,7 +1903,9 @@ data: "options": { "legend": { "calcs": [ - "mean" + "min", + "max", + "lastNotNull" ], "displayMode": "table", "placement": "bottom" @@ -2268,7 +2271,18 @@ data: "refId": "B" } ], - "description": "Service availability over time with SLO target line." + "description": "Service availability over time with SLO target line.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2312,7 +2326,18 @@ data: "refId": "B" } ], - "description": "Error budget remaining over time. Requires monitoring.slos.enabled." + "description": "Error budget remaining over time. Requires monitoring.slos.enabled.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "collapsed": true, @@ -2364,7 +2389,18 @@ data: "refId": "A" } ], - "description": "Ingress request rate broken down by host and URL path." + "description": "Ingress request rate broken down by host and URL path.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2404,7 +2440,18 @@ data: "refId": "A" } ], - "description": "4xx and 5xx error rates from ingress-nginx by HTTP status code." 
+ "description": "4xx and 5xx error rates from ingress-nginx by HTTP status code.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2451,7 +2498,18 @@ data: "refId": "C" } ], - "description": "Ingress request latency percentiles. Includes TLS + routing + upstream response time." + "description": "Ingress request latency percentiles. Includes TLS + routing + upstream response time.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2488,7 +2546,18 @@ data: "refId": "A" } ], - "description": "Current number of active client connections to ingress-nginx." + "description": "Current number of active client connections to ingress-nginx.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -2542,7 +2611,18 @@ data: "refId": "A" } ], - "description": "Connect protocol request throughput broken down by service (e.g. ExecutionService, ClusterService)." + "description": "Connect protocol request throughput broken down by service (e.g. ExecutionService, ClusterService).", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2579,7 +2659,18 @@ data: "refId": "A" } ], - "description": "Connect errors broken down by service and gRPC code. Identifies which services are erroring." + "description": "Connect errors broken down by service and gRPC code. 
Identifies which services are erroring.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2616,7 +2707,18 @@ data: "refId": "A" } ], - "description": "CacheService is the only CP service using gRPC (not Connect). Shows Get/Put/Delete/Reservation rates." + "description": "CacheService is the only CP service using gRPC (not Connect). Shows Get/Put/Delete/Reservation rates.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2653,7 +2755,18 @@ data: "refId": "A" } ], - "description": "CacheService gRPC errors by method and code." + "description": "CacheService gRPC errors by method and code.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -2714,7 +2827,18 @@ data: "refId": "C" } ], - "description": "Current count of active workflow, node, and task executions tracked by FlyteAdmin." + "description": "Current count of active workflow, node, and task executions tracked by FlyteAdmin.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2766,7 +2890,18 @@ data: "refId": "D" } ], - "description": "Rate of execution creations and event ingestion (workflow, node, task events from propeller)." + "description": "Rate of execution creations and event ingestion (workflow, node, task events from propeller).", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2818,7 +2953,18 @@ data: "refId": "D" } ], - "description": "FlyteAdmin error rates: propeller communication failures, model transform errors, notification publish failures." 
+ "description": "FlyteAdmin error rates: propeller communication failures, model transform errors, notification publish failures.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2870,7 +3016,18 @@ data: "refId": "D" } ], - "description": "FlyteAdmin gRPC endpoint latency at p95. Key endpoints: CreateExecution, CreateExecutionEvent, GetExecution." + "description": "FlyteAdmin gRPC endpoint latency at p95. Key endpoints: CreateExecution, CreateExecutionEvent, GetExecution.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2912,7 +3069,18 @@ data: "refId": "B" } ], - "description": "Authorization approve/deny rate from the FlyteAdmin auth middleware. High deny rate may indicate auth misconfiguration." + "description": "Authorization approve/deny rate from the FlyteAdmin auth middleware. 
High deny rate may indicate auth misconfiguration.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -2925,186 +3093,129 @@ data: "y": 34 }, "id": 400, - "title": "Executions", + "title": "Executions (V2)", "type": "row", "panels": [ { + "title": "CreateRun Rate", + "type": "timeseries", "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ops" + "unit": "reqps" } }, - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 12 - }, - "id": 401, - "title": "Execution Create / Ack Rate", - "type": "timeseries", "targets": [ { - "expr": "rate(executions:executions:handle_create_op_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Create", + "expr": "sum(rate(connect:server_requests_handled_total{namespace=\"$namespace\", method=\"CreateRun\"}[$__rate_interval]))", + "legendFormat": "CreateRun", "refId": "A" - }, - { - "expr": "rate(executions:executions:handle_ack_op_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Ack", - "refId": "B" } ], - "description": "Rate of execution operation creates and acknowledgements. Create = new execution request, Ack = DP confirmed receipt." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "s" - } - }, "gridPos": { "h": 8, "w": 8, - "x": 8, - "y": 12 + "x": 0, + "y": 0 }, - "id": 402, - "title": "Execution Create / Ack Latency (p95)", - "type": "timeseries", - "targets": [ - { - "expr": "histogram_quantile(0.95, sum by (le) (rate(executions:executions:handle_create_op_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "Create p95", - "refId": "A" - }, - { - "expr": "histogram_quantile(0.95, sum by (le) (rate(executions:executions:handle_ack_op_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "Ack p95", - "refId": "B" + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] } - ], - "description": "Time to prepare create/ack execution requests at p95." 
+ } }, { + "title": "CreateRun Latency (p50 / p95 / p99)", + "type": "timeseries", "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, "unit": "s" } }, - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 12 - }, - "id": 403, - "title": "Assignment Duration (p50 / p90)", - "type": "timeseries", "targets": [ { - "expr": "histogram_quantile(0.50, sum by (le) (rate(executions:workqueue:announce_cluster_assignment_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "expr": "histogram_quantile(0.50, sum by (le) (rate(connect:server_request_duration_seconds_bucket{namespace=\"$namespace\", method=\"CreateRun\"}[$__rate_interval])))", "legendFormat": "p50", "refId": "A" }, { - "expr": "histogram_quantile(0.90, sum by (le) (rate(executions:workqueue:announce_cluster_assignment_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "p90", + "expr": "histogram_quantile(0.95, sum by (le) (rate(connect:server_request_duration_seconds_bucket{namespace=\"$namespace\", method=\"CreateRun\"}[$__rate_interval])))", + "legendFormat": "p95", "refId": "B" + }, + { + "expr": "histogram_quantile(0.99, sum by (le) (rate(connect:server_request_duration_seconds_bucket{namespace=\"$namespace\", method=\"CreateRun\"}[$__rate_interval])))", + "legendFormat": "p99", + "refId": "C" } ], - "description": "Key SLI: end-to-end time from execution create to cluster assignment. Custom buckets from 10ms to 20min." 
+ "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 0 + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { + "title": "V2 Run Methods", + "type": "timeseries", "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ops" + "unit": "reqps" } }, - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 20 - }, - "id": 404, - "title": "Workqueue Operations", - "type": "timeseries", "targets": [ { - "expr": "rate(executions:workqueue:send_operation_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Send ops", + "expr": "sum by (method) (rate(connect:server_requests_handled_total{namespace=\"$namespace\", service=~\".*RunService.*\"}[$__rate_interval]))", + "legendFormat": "{{method}}", "refId": "A" - }, - { - "expr": "rate(executions:workqueue:claim_operations{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Claims", - "refId": "B" - }, - { - "expr": "rate(executions:workqueue:send_operation_failures{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Send failures", - "refId": "C" - }, - { - "expr": "rate(executions:workqueue:claim_operation_failures{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Claim failures", - "refId": "D" } ], - "description": "Execution operation send/claim rates and failures. Send = dispatch to DP, Claim = pick up from DB." + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 0 + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3141,7 +3252,18 @@ data: "refId": "A" } ], - "description": "Execution operations DB latency: create, ack, claim, unclaim, get, update." 
+ "description": "Execution operations DB latency: create, ack, claim, unclaim, get, update.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3188,7 +3310,18 @@ data: "refId": "C" } ], - "description": "Executions service Postgres error rates by type: gorm errors, native postgres errors, not-found." + "description": "Executions service Postgres error rates by type: gorm errors, native postgres errors, not-found.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3240,7 +3373,18 @@ data: "refId": "D" } ], - "description": "Cluster and nodepool list cache effectiveness. High miss rate = excessive DB queries." + "description": "Cluster and nodepool list cache effectiveness. High miss rate = excessive DB queries.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3268,7 +3412,7 @@ data: "y": 28 }, "id": 408, - "title": "Pending Assignments", + "title": "Apps \u2014 Pending Assignments", "type": "timeseries", "targets": [ { @@ -3277,7 +3421,18 @@ data: "refId": "A" } ], - "description": "Number of apps waiting for cluster assignment. Growing backlog = scheduling bottleneck." + "description": "Number of apps waiting for cluster assignment. Growing backlog = scheduling bottleneck.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3305,7 +3460,7 @@ data: "y": 36 }, "id": 409, - "title": "First Ack Latency (V2 SLI)", + "title": "Apps \u2014 First Ack Latency", "type": "timeseries", "targets": [ { @@ -3324,7 +3479,18 @@ data: "refId": "C" } ], - "description": "Key V2 SLI: time to deliver an app to the dataplane. 
Measures end-to-end scheduling latency." + "description": "Key V2 SLI: time to deliver an app to the dataplane. Measures end-to-end scheduling latency.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3371,7 +3537,18 @@ data: "refId": "C" } ], - "description": "V2 run/action dispatch throughput. Enqueue failures indicate queue service issues." + "description": "V2 run/action dispatch throughput. Enqueue failures indicate queue service issues.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3418,7 +3595,18 @@ data: "refId": "C" } ], - "description": "V2 notification pipeline: notifications sent per second, active subscribers, log bytes streamed." + "description": "V2 notification pipeline: notifications sent per second, active subscribers, log bytes streamed.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -3484,7 +3672,18 @@ data: "refId": "D" } ], - "description": "Total and scheduled run/action counts in the queue. Shows system load." + "description": "Total and scheduled run/action counts in the queue. Shows system load.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3531,7 +3730,18 @@ data: "refId": "C" } ], - "description": "Lease lifecycle throughput: enqueued (new), completed (done), aborted (cancelled)." 
+ "description": "Lease lifecycle throughput: enqueued (new), completed (done), aborted (cancelled).", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3587,8 +3797,19 @@ data: "legendFormat": "DB queue", "refId": "E" } - ], - "description": "Internal queue depths across scheduler, runner, aborter, dispatcher, and DB worker pool. Growing = backpressure." + ], + "description": "Internal queue depths across scheduler, runner, aborter, dispatcher, and DB worker pool. Growing = backpressure.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3625,7 +3846,18 @@ data: "refId": "A" } ], - "description": "Dispatcher multi-step operation chain execution time at p99, by operation type." + "description": "Dispatcher multi-step operation chain execution time at p99, by operation type.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3667,7 +3899,18 @@ data: "refId": "B" } ], - "description": "In-memory state store operation latency. Backed by ScyllaDB persistence." + "description": "In-memory state store operation latency. Backed by ScyllaDB persistence.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3714,7 +3957,18 @@ data: "refId": "C" } ], - "description": "Active/terminal state counts and eventer error rate. Eventer reports action status to executions service." + "description": "Active/terminal state counts and eventer error rate. 
Eventer reports action status to executions service.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3751,7 +4005,18 @@ data: "refId": "A" } ], - "description": "Remaining execution capacity per connected DP worker. Zero = worker saturated." + "description": "Remaining execution capacity per connected DP worker. Zero = worker saturated.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3788,7 +4053,18 @@ data: "refId": "A" } ], - "description": "Failed dispatcher operations by Go type. Indicates internal queue service errors." + "description": "Failed dispatcher operations by Go type. Indicates internal queue service errors.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3835,7 +4111,18 @@ data: "refId": "C" } ], - "description": "Idle worker goroutines in DB, queue-client, and state-client pools. Zero = all threads busy." + "description": "Idle worker goroutines in DB, queue-client, and state-client pools. Zero = all threads busy.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -3891,7 +4178,18 @@ data: "refId": "B" } ], - "description": "Rate of DP cluster status updates and heartbeats received by the cluster service." 
+ "description": "Rate of DP cluster status updates and heartbeats received by the cluster service.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3919,7 +4217,7 @@ data: "y": 14 }, "id": 602, - "title": "Cluster API Latency (p95)", + "title": "Cluster API Latency (p99)", "type": "timeseries", "targets": [ { @@ -3928,12 +4226,23 @@ data: "refId": "A" }, { - "expr": "cluster:svc:heartbeat:success_ms{namespace=\"$namespace\", quantile=\"0.95\"} / 1000", + "expr": "cluster:svc:heartbeat:success_ms{namespace=\"$namespace\", quantile=\"0.99\"} / 1000", "legendFormat": "Heartbeat p95", "refId": "B" } ], - "description": "Cluster service RPC latency for UpdateStatus and Heartbeat calls." + "description": "Cluster service RPC latency for UpdateStatus and Heartbeat calls.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3975,7 +4284,18 @@ data: "refId": "B" } ], - "description": "DP-reported restart counts for operator and propeller pods. Set by DP on each UpdateStatus call." + "description": "DP-reported restart counts for operator and propeller pods. Set by DP on each UpdateStatus call.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -4022,7 +4342,18 @@ data: "refId": "C" } ], - "description": "Cluster service Postgres error rates by type: gorm errors, native postgres errors, not-found." 
+ "description": "Cluster service Postgres error rates by type: gorm errors, native postgres errors, not-found.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -4080,7 +4411,18 @@ data: "refId": "A" } ], - "description": "Cluster health collector: 1=unhealthy, 0=healthy. Emitted per cluster on every Prometheus scrape." + "description": "Cluster health collector: 1=unhealthy, 0=healthy. Emitted per cluster on every Prometheus scrape.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -4117,7 +4459,18 @@ data: "refId": "A" } ], - "description": "Seconds since each cluster last sent a heartbeat. High values = stale/disconnected cluster." + "description": "Seconds since each cluster last sent a heartbeat. High values = stale/disconnected cluster.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -4159,7 +4512,18 @@ data: "refId": "B" } ], - "description": "LRU cache hit/miss rate for managed cluster lookups. High miss rate = excessive DB queries." + "description": "LRU cache hit/miss rate for managed cluster lookups. High miss rate = excessive DB queries.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -4220,7 +4584,18 @@ data: "refId": "C" } ], - "description": "CacheService hit/miss rate. Hits = cached task output reused. Misses = task must execute. Get failures = storage errors. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "CacheService hit/miss rate. Hits = cached task output reused. Misses = task must execute. Get failures = storage errors. 
[Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -4267,7 +4642,18 @@ data: "refId": "C" } ], - "description": "Cache reservation contention: how often workers are blocked waiting for another worker's cache computation. High contention = many workers computing the same task. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Cache reservation contention: how often workers are blocked waiting for another worker's cache computation. High contention = many workers computing the same task. [Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -4393,7 +4779,9 @@ data: "displayMode": "table", "placement": "bottom", "calcs": [ - "mean" + "min", + "max", + "lastNotNull" ] }, "tooltip": { @@ -4461,7 +4849,12 @@ data: "options": { "legend": { "displayMode": "table", - "placement": "bottom" + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] }, "tooltip": { "mode": "multi" @@ -4520,8 +4913,13 @@ data: }, "options": { "legend": { - "displayMode": "list", - "placement": "bottom" + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] }, "tooltip": { "mode": "multi" @@ -4590,8 +4988,13 @@ data: }, "options": { "legend": { - "displayMode": "list", - "placement": "bottom" + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] }, "tooltip": { "mode": "multi" @@ -4649,7 +5052,9 @@ data: "displayMode": "table", "placement": "bottom", "calcs": [ - "sum" + "min", + "max", + "lastNotNull" ] }, "tooltip": { @@ -4711,7 +5116,12 @@ data: "options": { "legend": { "displayMode": 
"table", - "placement": "bottom" + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] }, "tooltip": { "mode": "multi" @@ -4767,7 +5177,12 @@ data: "options": { "legend": { "displayMode": "table", - "placement": "bottom" + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] }, "tooltip": { "mode": "multi" @@ -4822,8 +5237,13 @@ data: }, "options": { "legend": { - "displayMode": "list", - "placement": "bottom" + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] }, "tooltip": { "mode": "multi" @@ -4905,7 +5325,18 @@ data: "refId": "D" } ], - "description": "DataProxy internal cache effectiveness for domain resolution, cluster pool routing, and namespace mapping." + "description": "DataProxy internal cache effectiveness for domain resolution, cluster pool routing, and namespace mapping.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -4947,7 +5378,18 @@ data: "refId": "B" } ], - "description": "Time to read image metadata from the dataplane, proxied through DataProxy." + "description": "Time to read image metadata from the dataplane, proxied through DataProxy.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -4984,7 +5426,18 @@ data: "refId": "A" } ], - "description": "Per-cluster secret proxy errors during fan-out operations. Identifies which dataplane cluster is causing failures. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Per-cluster secret proxy errors during fan-out operations. Identifies which dataplane cluster is causing failures. 
[Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -5035,7 +5488,18 @@ data: "refId": "A" } ], - "description": "Rate of ReportBillableUsage calls from DP clusters. Each call reports resource consumption for billing." + "description": "Rate of ReportBillableUsage calls from DP clusters. Each call reports resource consumption for billing.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -5087,7 +5551,18 @@ data: "refId": "D" } ], - "description": "SQS/queue message processing: received, sent (success), failed, dropped (max retries exceeded)." + "description": "SQS/queue message processing: received, sent (success), failed, dropped (max retries exceeded).", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -5127,7 +5602,18 @@ data: "refId": "A" } ], - "description": "Successful message processing rate by type: node_execution, workflow_execution, billable_usage, serverless_billable_usage. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Successful message processing rate by type: node_execution, workflow_execution, billable_usage, serverless_billable_usage. [Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -5174,7 +5660,18 @@ data: "refId": "C" } ], - "description": "Time to process individual queue messages. Slow processing may indicate backend API issues (Metronome, timestream). 
[Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Time to process individual queue messages. Slow processing may indicate backend API issues (Metronome, timestream). [Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -5228,7 +5725,18 @@ data: "refId": "A" } ], - "description": "CPU usage in cores per container, stacked. Identifies resource-heavy services." + "description": "CPU usage in cores per container, stacked. Identifies resource-heavy services.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -5268,7 +5776,18 @@ data: "refId": "A" } ], - "description": "Working set memory per container, stacked. Watch for approaching limits." + "description": "Working set memory per container, stacked. Watch for approaching limits.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -5305,7 +5824,18 @@ data: "refId": "A" } ], - "description": "Per-container restart events. Spikes indicate crashes or OOM kills." + "description": "Per-container restart events. 
Spikes indicate crashes or OOM kills.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] } @@ -5357,7 +5887,7 @@ data: "timepicker": {}, "timezone": "browser", "title": "Union Controlplane Overview", - "uid": "union-cp-overview", + "uid": "union-controlplane-overview", "version": 2 } --- diff --git a/tests/generated/controlplane.aws.yaml b/tests/generated/controlplane.aws.yaml index f6d784b6..a851ab9f 100644 --- a/tests/generated/controlplane.aws.yaml +++ b/tests/generated/controlplane.aws.yaml @@ -1851,8 +1851,9 @@ data: "options": { "legend": { "calcs": [ - "mean", - "max" + "min", + "max", + "lastNotNull" ], "displayMode": "table", "placement": "bottom" @@ -1902,7 +1903,9 @@ data: "options": { "legend": { "calcs": [ - "mean" + "min", + "max", + "lastNotNull" ], "displayMode": "table", "placement": "bottom" @@ -2268,7 +2271,18 @@ data: "refId": "B" } ], - "description": "Service availability over time with SLO target line." + "description": "Service availability over time with SLO target line.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2312,7 +2326,18 @@ data: "refId": "B" } ], - "description": "Error budget remaining over time. Requires monitoring.slos.enabled." + "description": "Error budget remaining over time. Requires monitoring.slos.enabled.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "collapsed": true, @@ -2364,7 +2389,18 @@ data: "refId": "A" } ], - "description": "Ingress request rate broken down by host and URL path." 
+ "description": "Ingress request rate broken down by host and URL path.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2404,7 +2440,18 @@ data: "refId": "A" } ], - "description": "4xx and 5xx error rates from ingress-nginx by HTTP status code." + "description": "4xx and 5xx error rates from ingress-nginx by HTTP status code.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2451,7 +2498,18 @@ data: "refId": "C" } ], - "description": "Ingress request latency percentiles. Includes TLS + routing + upstream response time." + "description": "Ingress request latency percentiles. Includes TLS + routing + upstream response time.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2488,7 +2546,18 @@ data: "refId": "A" } ], - "description": "Current number of active client connections to ingress-nginx." + "description": "Current number of active client connections to ingress-nginx.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -2542,7 +2611,18 @@ data: "refId": "A" } ], - "description": "Connect protocol request throughput broken down by service (e.g. ExecutionService, ClusterService)." + "description": "Connect protocol request throughput broken down by service (e.g. ExecutionService, ClusterService).", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2579,7 +2659,18 @@ data: "refId": "A" } ], - "description": "Connect errors broken down by service and gRPC code. Identifies which services are erroring." 
+ "description": "Connect errors broken down by service and gRPC code. Identifies which services are erroring.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2616,7 +2707,18 @@ data: "refId": "A" } ], - "description": "CacheService is the only CP service using gRPC (not Connect). Shows Get/Put/Delete/Reservation rates." + "description": "CacheService is the only CP service using gRPC (not Connect). Shows Get/Put/Delete/Reservation rates.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2653,7 +2755,18 @@ data: "refId": "A" } ], - "description": "CacheService gRPC errors by method and code." + "description": "CacheService gRPC errors by method and code.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -2714,7 +2827,18 @@ data: "refId": "C" } ], - "description": "Current count of active workflow, node, and task executions tracked by FlyteAdmin." + "description": "Current count of active workflow, node, and task executions tracked by FlyteAdmin.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2766,7 +2890,18 @@ data: "refId": "D" } ], - "description": "Rate of execution creations and event ingestion (workflow, node, task events from propeller)." 
+ "description": "Rate of execution creations and event ingestion (workflow, node, task events from propeller).", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2818,7 +2953,18 @@ data: "refId": "D" } ], - "description": "FlyteAdmin error rates: propeller communication failures, model transform errors, notification publish failures." + "description": "FlyteAdmin error rates: propeller communication failures, model transform errors, notification publish failures.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2870,7 +3016,18 @@ data: "refId": "D" } ], - "description": "FlyteAdmin gRPC endpoint latency at p95. Key endpoints: CreateExecution, CreateExecutionEvent, GetExecution." + "description": "FlyteAdmin gRPC endpoint latency at p95. Key endpoints: CreateExecution, CreateExecutionEvent, GetExecution.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2912,7 +3069,18 @@ data: "refId": "B" } ], - "description": "Authorization approve/deny rate from the FlyteAdmin auth middleware. High deny rate may indicate auth misconfiguration." + "description": "Authorization approve/deny rate from the FlyteAdmin auth middleware. 
High deny rate may indicate auth misconfiguration.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -2925,186 +3093,129 @@ data: "y": 34 }, "id": 400, - "title": "Executions", + "title": "Executions (V2)", "type": "row", "panels": [ { + "title": "CreateRun Rate", + "type": "timeseries", "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ops" + "unit": "reqps" } }, - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 12 - }, - "id": 401, - "title": "Execution Create / Ack Rate", - "type": "timeseries", "targets": [ { - "expr": "rate(executions:executions:handle_create_op_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Create", + "expr": "sum(rate(connect:server_requests_handled_total{namespace=\"$namespace\", method=\"CreateRun\"}[$__rate_interval]))", + "legendFormat": "CreateRun", "refId": "A" - }, - { - "expr": "rate(executions:executions:handle_ack_op_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Ack", - "refId": "B" } ], - "description": "Rate of execution operation creates and acknowledgements. Create = new execution request, Ack = DP confirmed receipt." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "s" - } - }, "gridPos": { "h": 8, "w": 8, - "x": 8, - "y": 12 + "x": 0, + "y": 0 }, - "id": 402, - "title": "Execution Create / Ack Latency (p95)", - "type": "timeseries", - "targets": [ - { - "expr": "histogram_quantile(0.95, sum by (le) (rate(executions:executions:handle_create_op_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "Create p95", - "refId": "A" - }, - { - "expr": "histogram_quantile(0.95, sum by (le) (rate(executions:executions:handle_ack_op_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "Ack p95", - "refId": "B" + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] } - ], - "description": "Time to prepare create/ack execution requests at p95." 
+ } }, { + "title": "CreateRun Latency (p50 / p95 / p99)", + "type": "timeseries", "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, "unit": "s" } }, - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 12 - }, - "id": 403, - "title": "Assignment Duration (p50 / p90)", - "type": "timeseries", "targets": [ { - "expr": "histogram_quantile(0.50, sum by (le) (rate(executions:workqueue:announce_cluster_assignment_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "expr": "histogram_quantile(0.50, sum by (le) (rate(connect:server_request_duration_seconds_bucket{namespace=\"$namespace\", method=\"CreateRun\"}[$__rate_interval])))", "legendFormat": "p50", "refId": "A" }, { - "expr": "histogram_quantile(0.90, sum by (le) (rate(executions:workqueue:announce_cluster_assignment_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "p90", + "expr": "histogram_quantile(0.95, sum by (le) (rate(connect:server_request_duration_seconds_bucket{namespace=\"$namespace\", method=\"CreateRun\"}[$__rate_interval])))", + "legendFormat": "p95", "refId": "B" + }, + { + "expr": "histogram_quantile(0.99, sum by (le) (rate(connect:server_request_duration_seconds_bucket{namespace=\"$namespace\", method=\"CreateRun\"}[$__rate_interval])))", + "legendFormat": "p99", + "refId": "C" } ], - "description": "Key SLI: end-to-end time from execution create to cluster assignment. Custom buckets from 10ms to 20min." 
+ "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 0 + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { + "title": "V2 Run Methods", + "type": "timeseries", "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ops" + "unit": "reqps" } }, - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 20 - }, - "id": 404, - "title": "Workqueue Operations", - "type": "timeseries", "targets": [ { - "expr": "rate(executions:workqueue:send_operation_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Send ops", + "expr": "sum by (method) (rate(connect:server_requests_handled_total{namespace=\"$namespace\", service=~\".*RunService.*\"}[$__rate_interval]))", + "legendFormat": "{{method}}", "refId": "A" - }, - { - "expr": "rate(executions:workqueue:claim_operations{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Claims", - "refId": "B" - }, - { - "expr": "rate(executions:workqueue:send_operation_failures{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Send failures", - "refId": "C" - }, - { - "expr": "rate(executions:workqueue:claim_operation_failures{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Claim failures", - "refId": "D" } ], - "description": "Execution operation send/claim rates and failures. Send = dispatch to DP, Claim = pick up from DB." + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 0 + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3141,7 +3252,18 @@ data: "refId": "A" } ], - "description": "Execution operations DB latency: create, ack, claim, unclaim, get, update." 
+ "description": "Execution operations DB latency: create, ack, claim, unclaim, get, update.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3188,7 +3310,18 @@ data: "refId": "C" } ], - "description": "Executions service Postgres error rates by type: gorm errors, native postgres errors, not-found." + "description": "Executions service Postgres error rates by type: gorm errors, native postgres errors, not-found.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3240,7 +3373,18 @@ data: "refId": "D" } ], - "description": "Cluster and nodepool list cache effectiveness. High miss rate = excessive DB queries." + "description": "Cluster and nodepool list cache effectiveness. High miss rate = excessive DB queries.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3268,7 +3412,7 @@ data: "y": 28 }, "id": 408, - "title": "Pending Assignments", + "title": "Apps \u2014 Pending Assignments", "type": "timeseries", "targets": [ { @@ -3277,7 +3421,18 @@ data: "refId": "A" } ], - "description": "Number of apps waiting for cluster assignment. Growing backlog = scheduling bottleneck." + "description": "Number of apps waiting for cluster assignment. Growing backlog = scheduling bottleneck.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3305,7 +3460,7 @@ data: "y": 36 }, "id": 409, - "title": "First Ack Latency (V2 SLI)", + "title": "Apps \u2014 First Ack Latency", "type": "timeseries", "targets": [ { @@ -3324,7 +3479,18 @@ data: "refId": "C" } ], - "description": "Key V2 SLI: time to deliver an app to the dataplane. 
Measures end-to-end scheduling latency." + "description": "Key V2 SLI: time to deliver an app to the dataplane. Measures end-to-end scheduling latency.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3371,7 +3537,18 @@ data: "refId": "C" } ], - "description": "V2 run/action dispatch throughput. Enqueue failures indicate queue service issues." + "description": "V2 run/action dispatch throughput. Enqueue failures indicate queue service issues.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3418,7 +3595,18 @@ data: "refId": "C" } ], - "description": "V2 notification pipeline: notifications sent per second, active subscribers, log bytes streamed." + "description": "V2 notification pipeline: notifications sent per second, active subscribers, log bytes streamed.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -3484,7 +3672,18 @@ data: "refId": "D" } ], - "description": "Total and scheduled run/action counts in the queue. Shows system load." + "description": "Total and scheduled run/action counts in the queue. Shows system load.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3531,7 +3730,18 @@ data: "refId": "C" } ], - "description": "Lease lifecycle throughput: enqueued (new), completed (done), aborted (cancelled)." 
+ "description": "Lease lifecycle throughput: enqueued (new), completed (done), aborted (cancelled).", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3587,8 +3797,19 @@ data: "legendFormat": "DB queue", "refId": "E" } - ], - "description": "Internal queue depths across scheduler, runner, aborter, dispatcher, and DB worker pool. Growing = backpressure." + ], + "description": "Internal queue depths across scheduler, runner, aborter, dispatcher, and DB worker pool. Growing = backpressure.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3625,7 +3846,18 @@ data: "refId": "A" } ], - "description": "Dispatcher multi-step operation chain execution time at p99, by operation type." + "description": "Dispatcher multi-step operation chain execution time at p99, by operation type.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3667,7 +3899,18 @@ data: "refId": "B" } ], - "description": "In-memory state store operation latency. Backed by ScyllaDB persistence." + "description": "In-memory state store operation latency. Backed by ScyllaDB persistence.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3714,7 +3957,18 @@ data: "refId": "C" } ], - "description": "Active/terminal state counts and eventer error rate. Eventer reports action status to executions service." + "description": "Active/terminal state counts and eventer error rate. 
Eventer reports action status to executions service.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3751,7 +4005,18 @@ data: "refId": "A" } ], - "description": "Remaining execution capacity per connected DP worker. Zero = worker saturated." + "description": "Remaining execution capacity per connected DP worker. Zero = worker saturated.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3788,7 +4053,18 @@ data: "refId": "A" } ], - "description": "Failed dispatcher operations by Go type. Indicates internal queue service errors." + "description": "Failed dispatcher operations by Go type. Indicates internal queue service errors.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3835,7 +4111,18 @@ data: "refId": "C" } ], - "description": "Idle worker goroutines in DB, queue-client, and state-client pools. Zero = all threads busy." + "description": "Idle worker goroutines in DB, queue-client, and state-client pools. Zero = all threads busy.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -3891,7 +4178,18 @@ data: "refId": "B" } ], - "description": "Rate of DP cluster status updates and heartbeats received by the cluster service." 
+ "description": "Rate of DP cluster status updates and heartbeats received by the cluster service.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3919,7 +4217,7 @@ data: "y": 14 }, "id": 602, - "title": "Cluster API Latency (p95)", + "title": "Cluster API Latency (p99)", "type": "timeseries", "targets": [ { @@ -3928,12 +4226,23 @@ data: "refId": "A" }, { - "expr": "cluster:svc:heartbeat:success_ms{namespace=\"$namespace\", quantile=\"0.95\"} / 1000", + "expr": "cluster:svc:heartbeat:success_ms{namespace=\"$namespace\", quantile=\"0.99\"} / 1000", "legendFormat": "Heartbeat p95", "refId": "B" } ], - "description": "Cluster service RPC latency for UpdateStatus and Heartbeat calls." + "description": "Cluster service RPC latency for UpdateStatus and Heartbeat calls.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3975,7 +4284,18 @@ data: "refId": "B" } ], - "description": "DP-reported restart counts for operator and propeller pods. Set by DP on each UpdateStatus call." + "description": "DP-reported restart counts for operator and propeller pods. Set by DP on each UpdateStatus call.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -4022,7 +4342,18 @@ data: "refId": "C" } ], - "description": "Cluster service Postgres error rates by type: gorm errors, native postgres errors, not-found." 
+ "description": "Cluster service Postgres error rates by type: gorm errors, native postgres errors, not-found.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -4080,7 +4411,18 @@ data: "refId": "A" } ], - "description": "Cluster health collector: 1=unhealthy, 0=healthy. Emitted per cluster on every Prometheus scrape." + "description": "Cluster health collector: 1=unhealthy, 0=healthy. Emitted per cluster on every Prometheus scrape.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -4117,7 +4459,18 @@ data: "refId": "A" } ], - "description": "Seconds since each cluster last sent a heartbeat. High values = stale/disconnected cluster." + "description": "Seconds since each cluster last sent a heartbeat. High values = stale/disconnected cluster.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -4159,7 +4512,18 @@ data: "refId": "B" } ], - "description": "LRU cache hit/miss rate for managed cluster lookups. High miss rate = excessive DB queries." + "description": "LRU cache hit/miss rate for managed cluster lookups. High miss rate = excessive DB queries.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -4220,7 +4584,18 @@ data: "refId": "C" } ], - "description": "CacheService hit/miss rate. Hits = cached task output reused. Misses = task must execute. Get failures = storage errors. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "CacheService hit/miss rate. Hits = cached task output reused. Misses = task must execute. Get failures = storage errors. 
[Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -4267,7 +4642,18 @@ data: "refId": "C" } ], - "description": "Cache reservation contention: how often workers are blocked waiting for another worker's cache computation. High contention = many workers computing the same task. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Cache reservation contention: how often workers are blocked waiting for another worker's cache computation. High contention = many workers computing the same task. [Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -4393,7 +4779,9 @@ data: "displayMode": "table", "placement": "bottom", "calcs": [ - "mean" + "min", + "max", + "lastNotNull" ] }, "tooltip": { @@ -4461,7 +4849,12 @@ data: "options": { "legend": { "displayMode": "table", - "placement": "bottom" + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] }, "tooltip": { "mode": "multi" @@ -4520,8 +4913,13 @@ data: }, "options": { "legend": { - "displayMode": "list", - "placement": "bottom" + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] }, "tooltip": { "mode": "multi" @@ -4590,8 +4988,13 @@ data: }, "options": { "legend": { - "displayMode": "list", - "placement": "bottom" + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] }, "tooltip": { "mode": "multi" @@ -4649,7 +5052,9 @@ data: "displayMode": "table", "placement": "bottom", "calcs": [ - "sum" + "min", + "max", + "lastNotNull" ] }, "tooltip": { @@ -4711,7 +5116,12 @@ data: "options": { "legend": { "displayMode": 
"table", - "placement": "bottom" + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] }, "tooltip": { "mode": "multi" @@ -4767,7 +5177,12 @@ data: "options": { "legend": { "displayMode": "table", - "placement": "bottom" + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] }, "tooltip": { "mode": "multi" @@ -4822,8 +5237,13 @@ data: }, "options": { "legend": { - "displayMode": "list", - "placement": "bottom" + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] }, "tooltip": { "mode": "multi" @@ -4905,7 +5325,18 @@ data: "refId": "D" } ], - "description": "DataProxy internal cache effectiveness for domain resolution, cluster pool routing, and namespace mapping." + "description": "DataProxy internal cache effectiveness for domain resolution, cluster pool routing, and namespace mapping.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -4947,7 +5378,18 @@ data: "refId": "B" } ], - "description": "Time to read image metadata from the dataplane, proxied through DataProxy." + "description": "Time to read image metadata from the dataplane, proxied through DataProxy.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -4984,7 +5426,18 @@ data: "refId": "A" } ], - "description": "Per-cluster secret proxy errors during fan-out operations. Identifies which dataplane cluster is causing failures. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Per-cluster secret proxy errors during fan-out operations. Identifies which dataplane cluster is causing failures. 
[Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -5035,7 +5488,18 @@ data: "refId": "A" } ], - "description": "Rate of ReportBillableUsage calls from DP clusters. Each call reports resource consumption for billing." + "description": "Rate of ReportBillableUsage calls from DP clusters. Each call reports resource consumption for billing.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -5087,7 +5551,18 @@ data: "refId": "D" } ], - "description": "SQS/queue message processing: received, sent (success), failed, dropped (max retries exceeded)." + "description": "SQS/queue message processing: received, sent (success), failed, dropped (max retries exceeded).", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -5127,7 +5602,18 @@ data: "refId": "A" } ], - "description": "Successful message processing rate by type: node_execution, workflow_execution, billable_usage, serverless_billable_usage. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Successful message processing rate by type: node_execution, workflow_execution, billable_usage, serverless_billable_usage. [Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -5174,7 +5660,18 @@ data: "refId": "C" } ], - "description": "Time to process individual queue messages. Slow processing may indicate backend API issues (Metronome, timestream). 
[Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Time to process individual queue messages. Slow processing may indicate backend API issues (Metronome, timestream). [Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -5228,7 +5725,18 @@ data: "refId": "A" } ], - "description": "CPU usage in cores per container, stacked. Identifies resource-heavy services." + "description": "CPU usage in cores per container, stacked. Identifies resource-heavy services.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -5268,7 +5776,18 @@ data: "refId": "A" } ], - "description": "Working set memory per container, stacked. Watch for approaching limits." + "description": "Working set memory per container, stacked. Watch for approaching limits.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -5305,7 +5824,18 @@ data: "refId": "A" } ], - "description": "Per-container restart events. Spikes indicate crashes or OOM kills." + "description": "Per-container restart events. 
Spikes indicate crashes or OOM kills.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] } @@ -5357,7 +5887,7 @@ data: "timepicker": {}, "timezone": "browser", "title": "Union Controlplane Overview", - "uid": "union-cp-overview", + "uid": "union-controlplane-overview", "version": 2 } --- diff --git a/tests/generated/controlplane.custom-oidc.yaml b/tests/generated/controlplane.custom-oidc.yaml index 820c59bc..cd3b8428 100644 --- a/tests/generated/controlplane.custom-oidc.yaml +++ b/tests/generated/controlplane.custom-oidc.yaml @@ -1866,8 +1866,9 @@ data: "options": { "legend": { "calcs": [ - "mean", - "max" + "min", + "max", + "lastNotNull" ], "displayMode": "table", "placement": "bottom" @@ -1917,7 +1918,9 @@ data: "options": { "legend": { "calcs": [ - "mean" + "min", + "max", + "lastNotNull" ], "displayMode": "table", "placement": "bottom" @@ -2283,7 +2286,18 @@ data: "refId": "B" } ], - "description": "Service availability over time with SLO target line." + "description": "Service availability over time with SLO target line.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2327,7 +2341,18 @@ data: "refId": "B" } ], - "description": "Error budget remaining over time. Requires monitoring.slos.enabled." + "description": "Error budget remaining over time. Requires monitoring.slos.enabled.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "collapsed": true, @@ -2379,7 +2404,18 @@ data: "refId": "A" } ], - "description": "Ingress request rate broken down by host and URL path." 
+ "description": "Ingress request rate broken down by host and URL path.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2419,7 +2455,18 @@ data: "refId": "A" } ], - "description": "4xx and 5xx error rates from ingress-nginx by HTTP status code." + "description": "4xx and 5xx error rates from ingress-nginx by HTTP status code.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2466,7 +2513,18 @@ data: "refId": "C" } ], - "description": "Ingress request latency percentiles. Includes TLS + routing + upstream response time." + "description": "Ingress request latency percentiles. Includes TLS + routing + upstream response time.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2503,7 +2561,18 @@ data: "refId": "A" } ], - "description": "Current number of active client connections to ingress-nginx." + "description": "Current number of active client connections to ingress-nginx.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -2557,7 +2626,18 @@ data: "refId": "A" } ], - "description": "Connect protocol request throughput broken down by service (e.g. ExecutionService, ClusterService)." + "description": "Connect protocol request throughput broken down by service (e.g. ExecutionService, ClusterService).", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2594,7 +2674,18 @@ data: "refId": "A" } ], - "description": "Connect errors broken down by service and gRPC code. Identifies which services are erroring." 
+ "description": "Connect errors broken down by service and gRPC code. Identifies which services are erroring.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2631,7 +2722,18 @@ data: "refId": "A" } ], - "description": "CacheService is the only CP service using gRPC (not Connect). Shows Get/Put/Delete/Reservation rates." + "description": "CacheService is the only CP service using gRPC (not Connect). Shows Get/Put/Delete/Reservation rates.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2668,7 +2770,18 @@ data: "refId": "A" } ], - "description": "CacheService gRPC errors by method and code." + "description": "CacheService gRPC errors by method and code.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -2729,7 +2842,18 @@ data: "refId": "C" } ], - "description": "Current count of active workflow, node, and task executions tracked by FlyteAdmin." + "description": "Current count of active workflow, node, and task executions tracked by FlyteAdmin.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2781,7 +2905,18 @@ data: "refId": "D" } ], - "description": "Rate of execution creations and event ingestion (workflow, node, task events from propeller)." 
+ "description": "Rate of execution creations and event ingestion (workflow, node, task events from propeller).", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2833,7 +2968,18 @@ data: "refId": "D" } ], - "description": "FlyteAdmin error rates: propeller communication failures, model transform errors, notification publish failures." + "description": "FlyteAdmin error rates: propeller communication failures, model transform errors, notification publish failures.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2885,7 +3031,18 @@ data: "refId": "D" } ], - "description": "FlyteAdmin gRPC endpoint latency at p95. Key endpoints: CreateExecution, CreateExecutionEvent, GetExecution." + "description": "FlyteAdmin gRPC endpoint latency at p95. Key endpoints: CreateExecution, CreateExecutionEvent, GetExecution.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2927,7 +3084,18 @@ data: "refId": "B" } ], - "description": "Authorization approve/deny rate from the FlyteAdmin auth middleware. High deny rate may indicate auth misconfiguration." + "description": "Authorization approve/deny rate from the FlyteAdmin auth middleware. 
High deny rate may indicate auth misconfiguration.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -2940,186 +3108,129 @@ data: "y": 34 }, "id": 400, - "title": "Executions", + "title": "Executions (V2)", "type": "row", "panels": [ { + "title": "CreateRun Rate", + "type": "timeseries", "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ops" + "unit": "reqps" } }, - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 12 - }, - "id": 401, - "title": "Execution Create / Ack Rate", - "type": "timeseries", "targets": [ { - "expr": "rate(executions:executions:handle_create_op_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Create", + "expr": "sum(rate(connect:server_requests_handled_total{namespace=\"$namespace\", method=\"CreateRun\"}[$__rate_interval]))", + "legendFormat": "CreateRun", "refId": "A" - }, - { - "expr": "rate(executions:executions:handle_ack_op_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Ack", - "refId": "B" } ], - "description": "Rate of execution operation creates and acknowledgements. Create = new execution request, Ack = DP confirmed receipt." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "s" - } - }, "gridPos": { "h": 8, "w": 8, - "x": 8, - "y": 12 + "x": 0, + "y": 0 }, - "id": 402, - "title": "Execution Create / Ack Latency (p95)", - "type": "timeseries", - "targets": [ - { - "expr": "histogram_quantile(0.95, sum by (le) (rate(executions:executions:handle_create_op_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "Create p95", - "refId": "A" - }, - { - "expr": "histogram_quantile(0.95, sum by (le) (rate(executions:executions:handle_ack_op_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "Ack p95", - "refId": "B" + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] } - ], - "description": "Time to prepare create/ack execution requests at p95." 
+ } }, { + "title": "CreateRun Latency (p50 / p95 / p99)", + "type": "timeseries", "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, "unit": "s" } }, - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 12 - }, - "id": 403, - "title": "Assignment Duration (p50 / p90)", - "type": "timeseries", "targets": [ { - "expr": "histogram_quantile(0.50, sum by (le) (rate(executions:workqueue:announce_cluster_assignment_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "expr": "histogram_quantile(0.50, sum by (le) (rate(connect:server_request_duration_seconds_bucket{namespace=\"$namespace\", method=\"CreateRun\"}[$__rate_interval])))", "legendFormat": "p50", "refId": "A" }, { - "expr": "histogram_quantile(0.90, sum by (le) (rate(executions:workqueue:announce_cluster_assignment_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "p90", + "expr": "histogram_quantile(0.95, sum by (le) (rate(connect:server_request_duration_seconds_bucket{namespace=\"$namespace\", method=\"CreateRun\"}[$__rate_interval])))", + "legendFormat": "p95", "refId": "B" + }, + { + "expr": "histogram_quantile(0.99, sum by (le) (rate(connect:server_request_duration_seconds_bucket{namespace=\"$namespace\", method=\"CreateRun\"}[$__rate_interval])))", + "legendFormat": "p99", + "refId": "C" } ], - "description": "Key SLI: end-to-end time from execution create to cluster assignment. Custom buckets from 10ms to 20min." 
+ "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 0 + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { + "title": "V2 Run Methods", + "type": "timeseries", "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ops" + "unit": "reqps" } }, - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 20 - }, - "id": 404, - "title": "Workqueue Operations", - "type": "timeseries", "targets": [ { - "expr": "rate(executions:workqueue:send_operation_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Send ops", + "expr": "sum by (method) (rate(connect:server_requests_handled_total{namespace=\"$namespace\", service=~\".*RunService.*\"}[$__rate_interval]))", + "legendFormat": "{{method}}", "refId": "A" - }, - { - "expr": "rate(executions:workqueue:claim_operations{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Claims", - "refId": "B" - }, - { - "expr": "rate(executions:workqueue:send_operation_failures{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Send failures", - "refId": "C" - }, - { - "expr": "rate(executions:workqueue:claim_operation_failures{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Claim failures", - "refId": "D" } ], - "description": "Execution operation send/claim rates and failures. Send = dispatch to DP, Claim = pick up from DB." + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 0 + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3156,7 +3267,18 @@ data: "refId": "A" } ], - "description": "Execution operations DB latency: create, ack, claim, unclaim, get, update." 
+ "description": "Execution operations DB latency: create, ack, claim, unclaim, get, update.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3203,7 +3325,18 @@ data: "refId": "C" } ], - "description": "Executions service Postgres error rates by type: gorm errors, native postgres errors, not-found." + "description": "Executions service Postgres error rates by type: gorm errors, native postgres errors, not-found.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3255,7 +3388,18 @@ data: "refId": "D" } ], - "description": "Cluster and nodepool list cache effectiveness. High miss rate = excessive DB queries." + "description": "Cluster and nodepool list cache effectiveness. High miss rate = excessive DB queries.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3283,7 +3427,7 @@ data: "y": 28 }, "id": 408, - "title": "Pending Assignments", + "title": "Apps \u2014 Pending Assignments", "type": "timeseries", "targets": [ { @@ -3292,7 +3436,18 @@ data: "refId": "A" } ], - "description": "Number of apps waiting for cluster assignment. Growing backlog = scheduling bottleneck." + "description": "Number of apps waiting for cluster assignment. Growing backlog = scheduling bottleneck.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3320,7 +3475,7 @@ data: "y": 36 }, "id": 409, - "title": "First Ack Latency (V2 SLI)", + "title": "Apps \u2014 First Ack Latency", "type": "timeseries", "targets": [ { @@ -3339,7 +3494,18 @@ data: "refId": "C" } ], - "description": "Key V2 SLI: time to deliver an app to the dataplane. 
Measures end-to-end scheduling latency." + "description": "Key V2 SLI: time to deliver an app to the dataplane. Measures end-to-end scheduling latency.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3386,7 +3552,18 @@ data: "refId": "C" } ], - "description": "V2 run/action dispatch throughput. Enqueue failures indicate queue service issues." + "description": "V2 run/action dispatch throughput. Enqueue failures indicate queue service issues.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3433,7 +3610,18 @@ data: "refId": "C" } ], - "description": "V2 notification pipeline: notifications sent per second, active subscribers, log bytes streamed." + "description": "V2 notification pipeline: notifications sent per second, active subscribers, log bytes streamed.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -3499,7 +3687,18 @@ data: "refId": "D" } ], - "description": "Total and scheduled run/action counts in the queue. Shows system load." + "description": "Total and scheduled run/action counts in the queue. Shows system load.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3546,7 +3745,18 @@ data: "refId": "C" } ], - "description": "Lease lifecycle throughput: enqueued (new), completed (done), aborted (cancelled)." 
+ "description": "Lease lifecycle throughput: enqueued (new), completed (done), aborted (cancelled).", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3602,8 +3812,19 @@ data: "legendFormat": "DB queue", "refId": "E" } - ], - "description": "Internal queue depths across scheduler, runner, aborter, dispatcher, and DB worker pool. Growing = backpressure." + ], + "description": "Internal queue depths across scheduler, runner, aborter, dispatcher, and DB worker pool. Growing = backpressure.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3640,7 +3861,18 @@ data: "refId": "A" } ], - "description": "Dispatcher multi-step operation chain execution time at p99, by operation type." + "description": "Dispatcher multi-step operation chain execution time at p99, by operation type.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3682,7 +3914,18 @@ data: "refId": "B" } ], - "description": "In-memory state store operation latency. Backed by ScyllaDB persistence." + "description": "In-memory state store operation latency. Backed by ScyllaDB persistence.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3729,7 +3972,18 @@ data: "refId": "C" } ], - "description": "Active/terminal state counts and eventer error rate. Eventer reports action status to executions service." + "description": "Active/terminal state counts and eventer error rate. 
Eventer reports action status to executions service.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3766,7 +4020,18 @@ data: "refId": "A" } ], - "description": "Remaining execution capacity per connected DP worker. Zero = worker saturated." + "description": "Remaining execution capacity per connected DP worker. Zero = worker saturated.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3803,7 +4068,18 @@ data: "refId": "A" } ], - "description": "Failed dispatcher operations by Go type. Indicates internal queue service errors." + "description": "Failed dispatcher operations by Go type. Indicates internal queue service errors.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3850,7 +4126,18 @@ data: "refId": "C" } ], - "description": "Idle worker goroutines in DB, queue-client, and state-client pools. Zero = all threads busy." + "description": "Idle worker goroutines in DB, queue-client, and state-client pools. Zero = all threads busy.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -3906,7 +4193,18 @@ data: "refId": "B" } ], - "description": "Rate of DP cluster status updates and heartbeats received by the cluster service." 
+ "description": "Rate of DP cluster status updates and heartbeats received by the cluster service.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3934,7 +4232,7 @@ data: "y": 14 }, "id": 602, - "title": "Cluster API Latency (p95)", + "title": "Cluster API Latency (p99)", "type": "timeseries", "targets": [ { @@ -3943,12 +4241,23 @@ data: "refId": "A" }, { - "expr": "cluster:svc:heartbeat:success_ms{namespace=\"$namespace\", quantile=\"0.95\"} / 1000", + "expr": "cluster:svc:heartbeat:success_ms{namespace=\"$namespace\", quantile=\"0.99\"} / 1000", "legendFormat": "Heartbeat p95", "refId": "B" } ], - "description": "Cluster service RPC latency for UpdateStatus and Heartbeat calls." + "description": "Cluster service RPC latency for UpdateStatus and Heartbeat calls.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3990,7 +4299,18 @@ data: "refId": "B" } ], - "description": "DP-reported restart counts for operator and propeller pods. Set by DP on each UpdateStatus call." + "description": "DP-reported restart counts for operator and propeller pods. Set by DP on each UpdateStatus call.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -4037,7 +4357,18 @@ data: "refId": "C" } ], - "description": "Cluster service Postgres error rates by type: gorm errors, native postgres errors, not-found." 
+ "description": "Cluster service Postgres error rates by type: gorm errors, native postgres errors, not-found.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -4095,7 +4426,18 @@ data: "refId": "A" } ], - "description": "Cluster health collector: 1=unhealthy, 0=healthy. Emitted per cluster on every Prometheus scrape." + "description": "Cluster health collector: 1=unhealthy, 0=healthy. Emitted per cluster on every Prometheus scrape.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -4132,7 +4474,18 @@ data: "refId": "A" } ], - "description": "Seconds since each cluster last sent a heartbeat. High values = stale/disconnected cluster." + "description": "Seconds since each cluster last sent a heartbeat. High values = stale/disconnected cluster.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -4174,7 +4527,18 @@ data: "refId": "B" } ], - "description": "LRU cache hit/miss rate for managed cluster lookups. High miss rate = excessive DB queries." + "description": "LRU cache hit/miss rate for managed cluster lookups. High miss rate = excessive DB queries.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -4235,7 +4599,18 @@ data: "refId": "C" } ], - "description": "CacheService hit/miss rate. Hits = cached task output reused. Misses = task must execute. Get failures = storage errors. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "CacheService hit/miss rate. Hits = cached task output reused. Misses = task must execute. Get failures = storage errors. 
[Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -4282,7 +4657,18 @@ data: "refId": "C" } ], - "description": "Cache reservation contention: how often workers are blocked waiting for another worker's cache computation. High contention = many workers computing the same task. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Cache reservation contention: how often workers are blocked waiting for another worker's cache computation. High contention = many workers computing the same task. [Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -4408,7 +4794,9 @@ data: "displayMode": "table", "placement": "bottom", "calcs": [ - "mean" + "min", + "max", + "lastNotNull" ] }, "tooltip": { @@ -4476,7 +4864,12 @@ data: "options": { "legend": { "displayMode": "table", - "placement": "bottom" + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] }, "tooltip": { "mode": "multi" @@ -4535,8 +4928,13 @@ data: }, "options": { "legend": { - "displayMode": "list", - "placement": "bottom" + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] }, "tooltip": { "mode": "multi" @@ -4605,8 +5003,13 @@ data: }, "options": { "legend": { - "displayMode": "list", - "placement": "bottom" + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] }, "tooltip": { "mode": "multi" @@ -4664,7 +5067,9 @@ data: "displayMode": "table", "placement": "bottom", "calcs": [ - "sum" + "min", + "max", + "lastNotNull" ] }, "tooltip": { @@ -4726,7 +5131,12 @@ data: "options": { "legend": { "displayMode": 
"table", - "placement": "bottom" + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] }, "tooltip": { "mode": "multi" @@ -4782,7 +5192,12 @@ data: "options": { "legend": { "displayMode": "table", - "placement": "bottom" + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] }, "tooltip": { "mode": "multi" @@ -4837,8 +5252,13 @@ data: }, "options": { "legend": { - "displayMode": "list", - "placement": "bottom" + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] }, "tooltip": { "mode": "multi" @@ -4920,7 +5340,18 @@ data: "refId": "D" } ], - "description": "DataProxy internal cache effectiveness for domain resolution, cluster pool routing, and namespace mapping." + "description": "DataProxy internal cache effectiveness for domain resolution, cluster pool routing, and namespace mapping.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -4962,7 +5393,18 @@ data: "refId": "B" } ], - "description": "Time to read image metadata from the dataplane, proxied through DataProxy." + "description": "Time to read image metadata from the dataplane, proxied through DataProxy.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -4999,7 +5441,18 @@ data: "refId": "A" } ], - "description": "Per-cluster secret proxy errors during fan-out operations. Identifies which dataplane cluster is causing failures. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Per-cluster secret proxy errors during fan-out operations. Identifies which dataplane cluster is causing failures. 
[Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -5050,7 +5503,18 @@ data: "refId": "A" } ], - "description": "Rate of ReportBillableUsage calls from DP clusters. Each call reports resource consumption for billing." + "description": "Rate of ReportBillableUsage calls from DP clusters. Each call reports resource consumption for billing.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -5102,7 +5566,18 @@ data: "refId": "D" } ], - "description": "SQS/queue message processing: received, sent (success), failed, dropped (max retries exceeded)." + "description": "SQS/queue message processing: received, sent (success), failed, dropped (max retries exceeded).", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -5142,7 +5617,18 @@ data: "refId": "A" } ], - "description": "Successful message processing rate by type: node_execution, workflow_execution, billable_usage, serverless_billable_usage. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Successful message processing rate by type: node_execution, workflow_execution, billable_usage, serverless_billable_usage. [Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -5189,7 +5675,18 @@ data: "refId": "C" } ], - "description": "Time to process individual queue messages. Slow processing may indicate backend API issues (Metronome, timestream). 
[Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Time to process individual queue messages. Slow processing may indicate backend API issues (Metronome, timestream). [Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -5243,7 +5740,18 @@ data: "refId": "A" } ], - "description": "CPU usage in cores per container, stacked. Identifies resource-heavy services." + "description": "CPU usage in cores per container, stacked. Identifies resource-heavy services.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -5283,7 +5791,18 @@ data: "refId": "A" } ], - "description": "Working set memory per container, stacked. Watch for approaching limits." + "description": "Working set memory per container, stacked. Watch for approaching limits.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -5320,7 +5839,18 @@ data: "refId": "A" } ], - "description": "Per-container restart events. Spikes indicate crashes or OOM kills." + "description": "Per-container restart events. 
Spikes indicate crashes or OOM kills.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] } @@ -5372,7 +5902,7 @@ data: "timepicker": {}, "timezone": "browser", "title": "Union Controlplane Overview", - "uid": "union-cp-overview", + "uid": "union-controlplane-overview", "version": 2 } --- diff --git a/tests/generated/controlplane.external-authz.yaml b/tests/generated/controlplane.external-authz.yaml index 152c4464..7bf88217 100644 --- a/tests/generated/controlplane.external-authz.yaml +++ b/tests/generated/controlplane.external-authz.yaml @@ -1856,8 +1856,9 @@ data: "options": { "legend": { "calcs": [ - "mean", - "max" + "min", + "max", + "lastNotNull" ], "displayMode": "table", "placement": "bottom" @@ -1907,7 +1908,9 @@ data: "options": { "legend": { "calcs": [ - "mean" + "min", + "max", + "lastNotNull" ], "displayMode": "table", "placement": "bottom" @@ -2273,7 +2276,18 @@ data: "refId": "B" } ], - "description": "Service availability over time with SLO target line." + "description": "Service availability over time with SLO target line.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2317,7 +2331,18 @@ data: "refId": "B" } ], - "description": "Error budget remaining over time. Requires monitoring.slos.enabled." + "description": "Error budget remaining over time. Requires monitoring.slos.enabled.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "collapsed": true, @@ -2369,7 +2394,18 @@ data: "refId": "A" } ], - "description": "Ingress request rate broken down by host and URL path." 
+ "description": "Ingress request rate broken down by host and URL path.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2409,7 +2445,18 @@ data: "refId": "A" } ], - "description": "4xx and 5xx error rates from ingress-nginx by HTTP status code." + "description": "4xx and 5xx error rates from ingress-nginx by HTTP status code.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2456,7 +2503,18 @@ data: "refId": "C" } ], - "description": "Ingress request latency percentiles. Includes TLS + routing + upstream response time." + "description": "Ingress request latency percentiles. Includes TLS + routing + upstream response time.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2493,7 +2551,18 @@ data: "refId": "A" } ], - "description": "Current number of active client connections to ingress-nginx." + "description": "Current number of active client connections to ingress-nginx.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -2547,7 +2616,18 @@ data: "refId": "A" } ], - "description": "Connect protocol request throughput broken down by service (e.g. ExecutionService, ClusterService)." + "description": "Connect protocol request throughput broken down by service (e.g. ExecutionService, ClusterService).", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2584,7 +2664,18 @@ data: "refId": "A" } ], - "description": "Connect errors broken down by service and gRPC code. Identifies which services are erroring." 
+ "description": "Connect errors broken down by service and gRPC code. Identifies which services are erroring.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2621,7 +2712,18 @@ data: "refId": "A" } ], - "description": "CacheService is the only CP service using gRPC (not Connect). Shows Get/Put/Delete/Reservation rates." + "description": "CacheService is the only CP service using gRPC (not Connect). Shows Get/Put/Delete/Reservation rates.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2658,7 +2760,18 @@ data: "refId": "A" } ], - "description": "CacheService gRPC errors by method and code." + "description": "CacheService gRPC errors by method and code.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -2719,7 +2832,18 @@ data: "refId": "C" } ], - "description": "Current count of active workflow, node, and task executions tracked by FlyteAdmin." + "description": "Current count of active workflow, node, and task executions tracked by FlyteAdmin.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2771,7 +2895,18 @@ data: "refId": "D" } ], - "description": "Rate of execution creations and event ingestion (workflow, node, task events from propeller)." 
+ "description": "Rate of execution creations and event ingestion (workflow, node, task events from propeller).", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2823,7 +2958,18 @@ data: "refId": "D" } ], - "description": "FlyteAdmin error rates: propeller communication failures, model transform errors, notification publish failures." + "description": "FlyteAdmin error rates: propeller communication failures, model transform errors, notification publish failures.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2875,7 +3021,18 @@ data: "refId": "D" } ], - "description": "FlyteAdmin gRPC endpoint latency at p95. Key endpoints: CreateExecution, CreateExecutionEvent, GetExecution." + "description": "FlyteAdmin gRPC endpoint latency at p95. Key endpoints: CreateExecution, CreateExecutionEvent, GetExecution.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2917,7 +3074,18 @@ data: "refId": "B" } ], - "description": "Authorization approve/deny rate from the FlyteAdmin auth middleware. High deny rate may indicate auth misconfiguration." + "description": "Authorization approve/deny rate from the FlyteAdmin auth middleware. 
High deny rate may indicate auth misconfiguration.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -2930,186 +3098,129 @@ data: "y": 34 }, "id": 400, - "title": "Executions", + "title": "Executions (V2)", "type": "row", "panels": [ { + "title": "CreateRun Rate", + "type": "timeseries", "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ops" + "unit": "reqps" } }, - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 12 - }, - "id": 401, - "title": "Execution Create / Ack Rate", - "type": "timeseries", "targets": [ { - "expr": "rate(executions:executions:handle_create_op_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Create", + "expr": "sum(rate(connect:server_requests_handled_total{namespace=\"$namespace\", method=\"CreateRun\"}[$__rate_interval]))", + "legendFormat": "CreateRun", "refId": "A" - }, - { - "expr": "rate(executions:executions:handle_ack_op_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Ack", - "refId": "B" } ], - "description": "Rate of execution operation creates and acknowledgements. Create = new execution request, Ack = DP confirmed receipt." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "s" - } - }, "gridPos": { "h": 8, "w": 8, - "x": 8, - "y": 12 + "x": 0, + "y": 0 }, - "id": 402, - "title": "Execution Create / Ack Latency (p95)", - "type": "timeseries", - "targets": [ - { - "expr": "histogram_quantile(0.95, sum by (le) (rate(executions:executions:handle_create_op_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "Create p95", - "refId": "A" - }, - { - "expr": "histogram_quantile(0.95, sum by (le) (rate(executions:executions:handle_ack_op_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "Ack p95", - "refId": "B" + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] } - ], - "description": "Time to prepare create/ack execution requests at p95." 
+ } }, { + "title": "CreateRun Latency (p50 / p95 / p99)", + "type": "timeseries", "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, "unit": "s" } }, - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 12 - }, - "id": 403, - "title": "Assignment Duration (p50 / p90)", - "type": "timeseries", "targets": [ { - "expr": "histogram_quantile(0.50, sum by (le) (rate(executions:workqueue:announce_cluster_assignment_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "expr": "histogram_quantile(0.50, sum by (le) (rate(connect:server_request_duration_seconds_bucket{namespace=\"$namespace\", method=\"CreateRun\"}[$__rate_interval])))", "legendFormat": "p50", "refId": "A" }, { - "expr": "histogram_quantile(0.90, sum by (le) (rate(executions:workqueue:announce_cluster_assignment_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "p90", + "expr": "histogram_quantile(0.95, sum by (le) (rate(connect:server_request_duration_seconds_bucket{namespace=\"$namespace\", method=\"CreateRun\"}[$__rate_interval])))", + "legendFormat": "p95", "refId": "B" + }, + { + "expr": "histogram_quantile(0.99, sum by (le) (rate(connect:server_request_duration_seconds_bucket{namespace=\"$namespace\", method=\"CreateRun\"}[$__rate_interval])))", + "legendFormat": "p99", + "refId": "C" } ], - "description": "Key SLI: end-to-end time from execution create to cluster assignment. Custom buckets from 10ms to 20min." 
+ "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 0 + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { + "title": "V2 Run Methods", + "type": "timeseries", "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ops" + "unit": "reqps" } }, - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 20 - }, - "id": 404, - "title": "Workqueue Operations", - "type": "timeseries", "targets": [ { - "expr": "rate(executions:workqueue:send_operation_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Send ops", + "expr": "sum by (method) (rate(connect:server_requests_handled_total{namespace=\"$namespace\", service=~\".*RunService.*\"}[$__rate_interval]))", + "legendFormat": "{{method}}", "refId": "A" - }, - { - "expr": "rate(executions:workqueue:claim_operations{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Claims", - "refId": "B" - }, - { - "expr": "rate(executions:workqueue:send_operation_failures{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Send failures", - "refId": "C" - }, - { - "expr": "rate(executions:workqueue:claim_operation_failures{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Claim failures", - "refId": "D" } ], - "description": "Execution operation send/claim rates and failures. Send = dispatch to DP, Claim = pick up from DB." + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 0 + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3146,7 +3257,18 @@ data: "refId": "A" } ], - "description": "Execution operations DB latency: create, ack, claim, unclaim, get, update." 
+ "description": "Execution operations DB latency: create, ack, claim, unclaim, get, update.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3193,7 +3315,18 @@ data: "refId": "C" } ], - "description": "Executions service Postgres error rates by type: gorm errors, native postgres errors, not-found." + "description": "Executions service Postgres error rates by type: gorm errors, native postgres errors, not-found.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3245,7 +3378,18 @@ data: "refId": "D" } ], - "description": "Cluster and nodepool list cache effectiveness. High miss rate = excessive DB queries." + "description": "Cluster and nodepool list cache effectiveness. High miss rate = excessive DB queries.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3273,7 +3417,7 @@ data: "y": 28 }, "id": 408, - "title": "Pending Assignments", + "title": "Apps \u2014 Pending Assignments", "type": "timeseries", "targets": [ { @@ -3282,7 +3426,18 @@ data: "refId": "A" } ], - "description": "Number of apps waiting for cluster assignment. Growing backlog = scheduling bottleneck." + "description": "Number of apps waiting for cluster assignment. Growing backlog = scheduling bottleneck.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3310,7 +3465,7 @@ data: "y": 36 }, "id": 409, - "title": "First Ack Latency (V2 SLI)", + "title": "Apps \u2014 First Ack Latency", "type": "timeseries", "targets": [ { @@ -3329,7 +3484,18 @@ data: "refId": "C" } ], - "description": "Key V2 SLI: time to deliver an app to the dataplane. 
Measures end-to-end scheduling latency." + "description": "Key V2 SLI: time to deliver an app to the dataplane. Measures end-to-end scheduling latency.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3376,7 +3542,18 @@ data: "refId": "C" } ], - "description": "V2 run/action dispatch throughput. Enqueue failures indicate queue service issues." + "description": "V2 run/action dispatch throughput. Enqueue failures indicate queue service issues.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3423,7 +3600,18 @@ data: "refId": "C" } ], - "description": "V2 notification pipeline: notifications sent per second, active subscribers, log bytes streamed." + "description": "V2 notification pipeline: notifications sent per second, active subscribers, log bytes streamed.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -3489,7 +3677,18 @@ data: "refId": "D" } ], - "description": "Total and scheduled run/action counts in the queue. Shows system load." + "description": "Total and scheduled run/action counts in the queue. Shows system load.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3536,7 +3735,18 @@ data: "refId": "C" } ], - "description": "Lease lifecycle throughput: enqueued (new), completed (done), aborted (cancelled)." 
+ "description": "Lease lifecycle throughput: enqueued (new), completed (done), aborted (cancelled).", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3592,8 +3802,19 @@ data: "legendFormat": "DB queue", "refId": "E" } - ], - "description": "Internal queue depths across scheduler, runner, aborter, dispatcher, and DB worker pool. Growing = backpressure." + ], + "description": "Internal queue depths across scheduler, runner, aborter, dispatcher, and DB worker pool. Growing = backpressure.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3630,7 +3851,18 @@ data: "refId": "A" } ], - "description": "Dispatcher multi-step operation chain execution time at p99, by operation type." + "description": "Dispatcher multi-step operation chain execution time at p99, by operation type.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3672,7 +3904,18 @@ data: "refId": "B" } ], - "description": "In-memory state store operation latency. Backed by ScyllaDB persistence." + "description": "In-memory state store operation latency. Backed by ScyllaDB persistence.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3719,7 +3962,18 @@ data: "refId": "C" } ], - "description": "Active/terminal state counts and eventer error rate. Eventer reports action status to executions service." + "description": "Active/terminal state counts and eventer error rate. 
Eventer reports action status to executions service.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3756,7 +4010,18 @@ data: "refId": "A" } ], - "description": "Remaining execution capacity per connected DP worker. Zero = worker saturated." + "description": "Remaining execution capacity per connected DP worker. Zero = worker saturated.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3793,7 +4058,18 @@ data: "refId": "A" } ], - "description": "Failed dispatcher operations by Go type. Indicates internal queue service errors." + "description": "Failed dispatcher operations by Go type. Indicates internal queue service errors.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3840,7 +4116,18 @@ data: "refId": "C" } ], - "description": "Idle worker goroutines in DB, queue-client, and state-client pools. Zero = all threads busy." + "description": "Idle worker goroutines in DB, queue-client, and state-client pools. Zero = all threads busy.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -3896,7 +4183,18 @@ data: "refId": "B" } ], - "description": "Rate of DP cluster status updates and heartbeats received by the cluster service." 
+ "description": "Rate of DP cluster status updates and heartbeats received by the cluster service.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3924,7 +4222,7 @@ data: "y": 14 }, "id": 602, - "title": "Cluster API Latency (p95)", + "title": "Cluster API Latency (p99)", "type": "timeseries", "targets": [ { @@ -3933,12 +4231,23 @@ data: "refId": "A" }, { - "expr": "cluster:svc:heartbeat:success_ms{namespace=\"$namespace\", quantile=\"0.95\"} / 1000", - "legendFormat": "Heartbeat p95", + "expr": "cluster:svc:heartbeat:success_ms{namespace=\"$namespace\", quantile=\"0.99\"} / 1000", + "legendFormat": "Heartbeat p99", "refId": "B" } ], - "description": "Cluster service RPC latency for UpdateStatus and Heartbeat calls." + "description": "Cluster service RPC latency for UpdateStatus and Heartbeat calls.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3980,7 +4289,18 @@ data: "refId": "B" } ], - "description": "DP-reported restart counts for operator and propeller pods. Set by DP on each UpdateStatus call." + "description": "DP-reported restart counts for operator and propeller pods. Set by DP on each UpdateStatus call.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -4027,7 +4347,18 @@ data: "refId": "C" } ], - "description": "Cluster service Postgres error rates by type: gorm errors, native postgres errors, not-found."
+ "description": "Cluster service Postgres error rates by type: gorm errors, native postgres errors, not-found.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -4085,7 +4416,18 @@ data: "refId": "A" } ], - "description": "Cluster health collector: 1=unhealthy, 0=healthy. Emitted per cluster on every Prometheus scrape." + "description": "Cluster health collector: 1=unhealthy, 0=healthy. Emitted per cluster on every Prometheus scrape.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -4122,7 +4464,18 @@ data: "refId": "A" } ], - "description": "Seconds since each cluster last sent a heartbeat. High values = stale/disconnected cluster." + "description": "Seconds since each cluster last sent a heartbeat. High values = stale/disconnected cluster.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -4164,7 +4517,18 @@ data: "refId": "B" } ], - "description": "LRU cache hit/miss rate for managed cluster lookups. High miss rate = excessive DB queries." + "description": "LRU cache hit/miss rate for managed cluster lookups. High miss rate = excessive DB queries.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -4225,7 +4589,18 @@ data: "refId": "C" } ], - "description": "CacheService hit/miss rate. Hits = cached task output reused. Misses = task must execute. Get failures = storage errors. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "CacheService hit/miss rate. Hits = cached task output reused. Misses = task must execute. Get failures = storage errors. 
[Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -4272,7 +4647,18 @@ data: "refId": "C" } ], - "description": "Cache reservation contention: how often workers are blocked waiting for another worker's cache computation. High contention = many workers computing the same task. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Cache reservation contention: how often workers are blocked waiting for another worker's cache computation. High contention = many workers computing the same task. [Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -4398,7 +4784,9 @@ data: "displayMode": "table", "placement": "bottom", "calcs": [ - "mean" + "min", + "max", + "lastNotNull" ] }, "tooltip": { @@ -4466,7 +4854,12 @@ data: "options": { "legend": { "displayMode": "table", - "placement": "bottom" + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] }, "tooltip": { "mode": "multi" @@ -4525,8 +4918,13 @@ data: }, "options": { "legend": { - "displayMode": "list", - "placement": "bottom" + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] }, "tooltip": { "mode": "multi" @@ -4595,8 +4993,13 @@ data: }, "options": { "legend": { - "displayMode": "list", - "placement": "bottom" + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] }, "tooltip": { "mode": "multi" @@ -4654,7 +5057,9 @@ data: "displayMode": "table", "placement": "bottom", "calcs": [ - "sum" + "min", + "max", + "lastNotNull" ] }, "tooltip": { @@ -4716,7 +5121,12 @@ data: "options": { "legend": { "displayMode": 
"table", - "placement": "bottom" + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] }, "tooltip": { "mode": "multi" @@ -4772,7 +5182,12 @@ data: "options": { "legend": { "displayMode": "table", - "placement": "bottom" + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] }, "tooltip": { "mode": "multi" @@ -4827,8 +5242,13 @@ data: }, "options": { "legend": { - "displayMode": "list", - "placement": "bottom" + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] }, "tooltip": { "mode": "multi" @@ -4910,7 +5330,18 @@ data: "refId": "D" } ], - "description": "DataProxy internal cache effectiveness for domain resolution, cluster pool routing, and namespace mapping." + "description": "DataProxy internal cache effectiveness for domain resolution, cluster pool routing, and namespace mapping.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -4952,7 +5383,18 @@ data: "refId": "B" } ], - "description": "Time to read image metadata from the dataplane, proxied through DataProxy." + "description": "Time to read image metadata from the dataplane, proxied through DataProxy.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -4989,7 +5431,18 @@ data: "refId": "A" } ], - "description": "Per-cluster secret proxy errors during fan-out operations. Identifies which dataplane cluster is causing failures. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Per-cluster secret proxy errors during fan-out operations. Identifies which dataplane cluster is causing failures. 
[Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -5040,7 +5493,18 @@ data: "refId": "A" } ], - "description": "Rate of ReportBillableUsage calls from DP clusters. Each call reports resource consumption for billing." + "description": "Rate of ReportBillableUsage calls from DP clusters. Each call reports resource consumption for billing.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -5092,7 +5556,18 @@ data: "refId": "D" } ], - "description": "SQS/queue message processing: received, sent (success), failed, dropped (max retries exceeded)." + "description": "SQS/queue message processing: received, sent (success), failed, dropped (max retries exceeded).", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -5132,7 +5607,18 @@ data: "refId": "A" } ], - "description": "Successful message processing rate by type: node_execution, workflow_execution, billable_usage, serverless_billable_usage. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Successful message processing rate by type: node_execution, workflow_execution, billable_usage, serverless_billable_usage. [Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -5179,7 +5665,18 @@ data: "refId": "C" } ], - "description": "Time to process individual queue messages. Slow processing may indicate backend API issues (Metronome, timestream). 
[Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Time to process individual queue messages. Slow processing may indicate backend API issues (Metronome, timestream). [Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -5233,7 +5730,18 @@ data: "refId": "A" } ], - "description": "CPU usage in cores per container, stacked. Identifies resource-heavy services." + "description": "CPU usage in cores per container, stacked. Identifies resource-heavy services.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -5273,7 +5781,18 @@ data: "refId": "A" } ], - "description": "Working set memory per container, stacked. Watch for approaching limits." + "description": "Working set memory per container, stacked. Watch for approaching limits.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -5310,7 +5829,18 @@ data: "refId": "A" } ], - "description": "Per-container restart events. Spikes indicate crashes or OOM kills." + "description": "Per-container restart events. 
Spikes indicate crashes or OOM kills.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] } @@ -5362,7 +5892,7 @@ data: "timepicker": {}, "timezone": "browser", "title": "Union Controlplane Overview", - "uid": "union-cp-overview", + "uid": "union-controlplane-overview", "version": 2 } --- diff --git a/tests/generated/controlplane.userclouds.yaml b/tests/generated/controlplane.userclouds.yaml index 2a58acc4..7b62ece9 100644 --- a/tests/generated/controlplane.userclouds.yaml +++ b/tests/generated/controlplane.userclouds.yaml @@ -1939,8 +1939,9 @@ data: "options": { "legend": { "calcs": [ - "mean", - "max" + "min", + "max", + "lastNotNull" ], "displayMode": "table", "placement": "bottom" @@ -1990,7 +1991,9 @@ data: "options": { "legend": { "calcs": [ - "mean" + "min", + "max", + "lastNotNull" ], "displayMode": "table", "placement": "bottom" @@ -2356,7 +2359,18 @@ data: "refId": "B" } ], - "description": "Service availability over time with SLO target line." + "description": "Service availability over time with SLO target line.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2400,7 +2414,18 @@ data: "refId": "B" } ], - "description": "Error budget remaining over time. Requires monitoring.slos.enabled." + "description": "Error budget remaining over time. Requires monitoring.slos.enabled.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "collapsed": true, @@ -2452,7 +2477,18 @@ data: "refId": "A" } ], - "description": "Ingress request rate broken down by host and URL path." 
+ "description": "Ingress request rate broken down by host and URL path.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2492,7 +2528,18 @@ data: "refId": "A" } ], - "description": "4xx and 5xx error rates from ingress-nginx by HTTP status code." + "description": "4xx and 5xx error rates from ingress-nginx by HTTP status code.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2539,7 +2586,18 @@ data: "refId": "C" } ], - "description": "Ingress request latency percentiles. Includes TLS + routing + upstream response time." + "description": "Ingress request latency percentiles. Includes TLS + routing + upstream response time.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2576,7 +2634,18 @@ data: "refId": "A" } ], - "description": "Current number of active client connections to ingress-nginx." + "description": "Current number of active client connections to ingress-nginx.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -2630,7 +2699,18 @@ data: "refId": "A" } ], - "description": "Connect protocol request throughput broken down by service (e.g. ExecutionService, ClusterService)." + "description": "Connect protocol request throughput broken down by service (e.g. ExecutionService, ClusterService).", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2667,7 +2747,18 @@ data: "refId": "A" } ], - "description": "Connect errors broken down by service and gRPC code. Identifies which services are erroring." 
+ "description": "Connect errors broken down by service and gRPC code. Identifies which services are erroring.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2704,7 +2795,18 @@ data: "refId": "A" } ], - "description": "CacheService is the only CP service using gRPC (not Connect). Shows Get/Put/Delete/Reservation rates." + "description": "CacheService is the only CP service using gRPC (not Connect). Shows Get/Put/Delete/Reservation rates.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2741,7 +2843,18 @@ data: "refId": "A" } ], - "description": "CacheService gRPC errors by method and code." + "description": "CacheService gRPC errors by method and code.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -2802,7 +2915,18 @@ data: "refId": "C" } ], - "description": "Current count of active workflow, node, and task executions tracked by FlyteAdmin." + "description": "Current count of active workflow, node, and task executions tracked by FlyteAdmin.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2854,7 +2978,18 @@ data: "refId": "D" } ], - "description": "Rate of execution creations and event ingestion (workflow, node, task events from propeller)." 
+ "description": "Rate of execution creations and event ingestion (workflow, node, task events from propeller).", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2906,7 +3041,18 @@ data: "refId": "D" } ], - "description": "FlyteAdmin error rates: propeller communication failures, model transform errors, notification publish failures." + "description": "FlyteAdmin error rates: propeller communication failures, model transform errors, notification publish failures.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2958,7 +3104,18 @@ data: "refId": "D" } ], - "description": "FlyteAdmin gRPC endpoint latency at p95. Key endpoints: CreateExecution, CreateExecutionEvent, GetExecution." + "description": "FlyteAdmin gRPC endpoint latency at p95. Key endpoints: CreateExecution, CreateExecutionEvent, GetExecution.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3000,7 +3157,18 @@ data: "refId": "B" } ], - "description": "Authorization approve/deny rate from the FlyteAdmin auth middleware. High deny rate may indicate auth misconfiguration." + "description": "Authorization approve/deny rate from the FlyteAdmin auth middleware. 
High deny rate may indicate auth misconfiguration.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -3013,186 +3181,129 @@ data: "y": 34 }, "id": 400, - "title": "Executions", + "title": "Executions (V2)", "type": "row", "panels": [ { + "title": "CreateRun Rate", + "type": "timeseries", "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ops" + "unit": "reqps" } }, - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 12 - }, - "id": 401, - "title": "Execution Create / Ack Rate", - "type": "timeseries", "targets": [ { - "expr": "rate(executions:executions:handle_create_op_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Create", + "expr": "sum(rate(connect:server_requests_handled_total{namespace=\"$namespace\", method=\"CreateRun\"}[$__rate_interval]))", + "legendFormat": "CreateRun", "refId": "A" - }, - { - "expr": "rate(executions:executions:handle_ack_op_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Ack", - "refId": "B" } ], - "description": "Rate of execution operation creates and acknowledgements. Create = new execution request, Ack = DP confirmed receipt." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "s" - } - }, "gridPos": { "h": 8, "w": 8, - "x": 8, - "y": 12 + "x": 0, + "y": 0 }, - "id": 402, - "title": "Execution Create / Ack Latency (p95)", - "type": "timeseries", - "targets": [ - { - "expr": "histogram_quantile(0.95, sum by (le) (rate(executions:executions:handle_create_op_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "Create p95", - "refId": "A" - }, - { - "expr": "histogram_quantile(0.95, sum by (le) (rate(executions:executions:handle_ack_op_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "Ack p95", - "refId": "B" + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] } - ], - "description": "Time to prepare create/ack execution requests at p95." 
+ } }, { + "title": "CreateRun Latency (p50 / p95 / p99)", + "type": "timeseries", "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, "unit": "s" } }, - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 12 - }, - "id": 403, - "title": "Assignment Duration (p50 / p90)", - "type": "timeseries", "targets": [ { - "expr": "histogram_quantile(0.50, sum by (le) (rate(executions:workqueue:announce_cluster_assignment_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "expr": "histogram_quantile(0.50, sum by (le) (rate(connect:server_request_duration_seconds_bucket{namespace=\"$namespace\", method=\"CreateRun\"}[$__rate_interval])))", "legendFormat": "p50", "refId": "A" }, { - "expr": "histogram_quantile(0.90, sum by (le) (rate(executions:workqueue:announce_cluster_assignment_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "p90", + "expr": "histogram_quantile(0.95, sum by (le) (rate(connect:server_request_duration_seconds_bucket{namespace=\"$namespace\", method=\"CreateRun\"}[$__rate_interval])))", + "legendFormat": "p95", "refId": "B" + }, + { + "expr": "histogram_quantile(0.99, sum by (le) (rate(connect:server_request_duration_seconds_bucket{namespace=\"$namespace\", method=\"CreateRun\"}[$__rate_interval])))", + "legendFormat": "p99", + "refId": "C" } ], - "description": "Key SLI: end-to-end time from execution create to cluster assignment. Custom buckets from 10ms to 20min." 
+ "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 0 + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { + "title": "V2 Run Methods", + "type": "timeseries", "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ops" + "unit": "reqps" } }, - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 20 - }, - "id": 404, - "title": "Workqueue Operations", - "type": "timeseries", "targets": [ { - "expr": "rate(executions:workqueue:send_operation_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Send ops", + "expr": "sum by (method) (rate(connect:server_requests_handled_total{namespace=\"$namespace\", service=~\".*RunService.*\"}[$__rate_interval]))", + "legendFormat": "{{method}}", "refId": "A" - }, - { - "expr": "rate(executions:workqueue:claim_operations{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Claims", - "refId": "B" - }, - { - "expr": "rate(executions:workqueue:send_operation_failures{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Send failures", - "refId": "C" - }, - { - "expr": "rate(executions:workqueue:claim_operation_failures{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Claim failures", - "refId": "D" } ], - "description": "Execution operation send/claim rates and failures. Send = dispatch to DP, Claim = pick up from DB." + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 0 + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3229,7 +3340,18 @@ data: "refId": "A" } ], - "description": "Execution operations DB latency: create, ack, claim, unclaim, get, update." 
+ "description": "Execution operations DB latency: create, ack, claim, unclaim, get, update.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3276,7 +3398,18 @@ data: "refId": "C" } ], - "description": "Executions service Postgres error rates by type: gorm errors, native postgres errors, not-found." + "description": "Executions service Postgres error rates by type: gorm errors, native postgres errors, not-found.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3328,7 +3461,18 @@ data: "refId": "D" } ], - "description": "Cluster and nodepool list cache effectiveness. High miss rate = excessive DB queries." + "description": "Cluster and nodepool list cache effectiveness. High miss rate = excessive DB queries.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3356,7 +3500,7 @@ data: "y": 28 }, "id": 408, - "title": "Pending Assignments", + "title": "Apps \u2014 Pending Assignments", "type": "timeseries", "targets": [ { @@ -3365,7 +3509,18 @@ data: "refId": "A" } ], - "description": "Number of apps waiting for cluster assignment. Growing backlog = scheduling bottleneck." + "description": "Number of apps waiting for cluster assignment. Growing backlog = scheduling bottleneck.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3393,7 +3548,7 @@ data: "y": 36 }, "id": 409, - "title": "First Ack Latency (V2 SLI)", + "title": "Apps \u2014 First Ack Latency", "type": "timeseries", "targets": [ { @@ -3412,7 +3567,18 @@ data: "refId": "C" } ], - "description": "Key V2 SLI: time to deliver an app to the dataplane. 
Measures end-to-end scheduling latency." + "description": "Key V2 SLI: time to deliver an app to the dataplane. Measures end-to-end scheduling latency.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3459,7 +3625,18 @@ data: "refId": "C" } ], - "description": "V2 run/action dispatch throughput. Enqueue failures indicate queue service issues." + "description": "V2 run/action dispatch throughput. Enqueue failures indicate queue service issues.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3506,7 +3683,18 @@ data: "refId": "C" } ], - "description": "V2 notification pipeline: notifications sent per second, active subscribers, log bytes streamed." + "description": "V2 notification pipeline: notifications sent per second, active subscribers, log bytes streamed.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -3572,7 +3760,18 @@ data: "refId": "D" } ], - "description": "Total and scheduled run/action counts in the queue. Shows system load." + "description": "Total and scheduled run/action counts in the queue. Shows system load.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3619,7 +3818,18 @@ data: "refId": "C" } ], - "description": "Lease lifecycle throughput: enqueued (new), completed (done), aborted (cancelled)." 
+ "description": "Lease lifecycle throughput: enqueued (new), completed (done), aborted (cancelled).", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3675,8 +3885,19 @@ data: "legendFormat": "DB queue", "refId": "E" } - ], - "description": "Internal queue depths across scheduler, runner, aborter, dispatcher, and DB worker pool. Growing = backpressure." + ], + "description": "Internal queue depths across scheduler, runner, aborter, dispatcher, and DB worker pool. Growing = backpressure.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3713,7 +3934,18 @@ data: "refId": "A" } ], - "description": "Dispatcher multi-step operation chain execution time at p99, by operation type." + "description": "Dispatcher multi-step operation chain execution time at p99, by operation type.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3755,7 +3987,18 @@ data: "refId": "B" } ], - "description": "In-memory state store operation latency. Backed by ScyllaDB persistence." + "description": "In-memory state store operation latency. Backed by ScyllaDB persistence.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3802,7 +4045,18 @@ data: "refId": "C" } ], - "description": "Active/terminal state counts and eventer error rate. Eventer reports action status to executions service." + "description": "Active/terminal state counts and eventer error rate. 
Eventer reports action status to executions service.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3839,7 +4093,18 @@ data: "refId": "A" } ], - "description": "Remaining execution capacity per connected DP worker. Zero = worker saturated." + "description": "Remaining execution capacity per connected DP worker. Zero = worker saturated.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3876,7 +4141,18 @@ data: "refId": "A" } ], - "description": "Failed dispatcher operations by Go type. Indicates internal queue service errors." + "description": "Failed dispatcher operations by Go type. Indicates internal queue service errors.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3923,7 +4199,18 @@ data: "refId": "C" } ], - "description": "Idle worker goroutines in DB, queue-client, and state-client pools. Zero = all threads busy." + "description": "Idle worker goroutines in DB, queue-client, and state-client pools. Zero = all threads busy.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -3979,7 +4266,18 @@ data: "refId": "B" } ], - "description": "Rate of DP cluster status updates and heartbeats received by the cluster service." 
+ "description": "Rate of DP cluster status updates and heartbeats received by the cluster service.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -4007,7 +4305,7 @@ data: "y": 14 }, "id": 602, - "title": "Cluster API Latency (p95)", + "title": "Cluster API Latency (p99)", "type": "timeseries", "targets": [ { @@ -4016,12 +4314,23 @@ data: "refId": "A" }, { - "expr": "cluster:svc:heartbeat:success_ms{namespace=\"$namespace\", quantile=\"0.95\"} / 1000", + "expr": "cluster:svc:heartbeat:success_ms{namespace=\"$namespace\", quantile=\"0.99\"} / 1000", "legendFormat": "Heartbeat p95", "refId": "B" } ], - "description": "Cluster service RPC latency for UpdateStatus and Heartbeat calls." + "description": "Cluster service RPC latency for UpdateStatus and Heartbeat calls.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -4063,7 +4372,18 @@ data: "refId": "B" } ], - "description": "DP-reported restart counts for operator and propeller pods. Set by DP on each UpdateStatus call." + "description": "DP-reported restart counts for operator and propeller pods. Set by DP on each UpdateStatus call.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -4110,7 +4430,18 @@ data: "refId": "C" } ], - "description": "Cluster service Postgres error rates by type: gorm errors, native postgres errors, not-found." 
+ "description": "Cluster service Postgres error rates by type: gorm errors, native postgres errors, not-found.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -4168,7 +4499,18 @@ data: "refId": "A" } ], - "description": "Cluster health collector: 1=unhealthy, 0=healthy. Emitted per cluster on every Prometheus scrape." + "description": "Cluster health collector: 1=unhealthy, 0=healthy. Emitted per cluster on every Prometheus scrape.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -4205,7 +4547,18 @@ data: "refId": "A" } ], - "description": "Seconds since each cluster last sent a heartbeat. High values = stale/disconnected cluster." + "description": "Seconds since each cluster last sent a heartbeat. High values = stale/disconnected cluster.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -4247,7 +4600,18 @@ data: "refId": "B" } ], - "description": "LRU cache hit/miss rate for managed cluster lookups. High miss rate = excessive DB queries." + "description": "LRU cache hit/miss rate for managed cluster lookups. High miss rate = excessive DB queries.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -4308,7 +4672,18 @@ data: "refId": "C" } ], - "description": "CacheService hit/miss rate. Hits = cached task output reused. Misses = task must execute. Get failures = storage errors. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "CacheService hit/miss rate. Hits = cached task output reused. Misses = task must execute. Get failures = storage errors. 
[Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -4355,7 +4730,18 @@ data: "refId": "C" } ], - "description": "Cache reservation contention: how often workers are blocked waiting for another worker's cache computation. High contention = many workers computing the same task. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Cache reservation contention: how often workers are blocked waiting for another worker's cache computation. High contention = many workers computing the same task. [Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -4481,7 +4867,9 @@ data: "displayMode": "table", "placement": "bottom", "calcs": [ - "mean" + "min", + "max", + "lastNotNull" ] }, "tooltip": { @@ -4549,7 +4937,12 @@ data: "options": { "legend": { "displayMode": "table", - "placement": "bottom" + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] }, "tooltip": { "mode": "multi" @@ -4608,8 +5001,13 @@ data: }, "options": { "legend": { - "displayMode": "list", - "placement": "bottom" + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] }, "tooltip": { "mode": "multi" @@ -4678,8 +5076,13 @@ data: }, "options": { "legend": { - "displayMode": "list", - "placement": "bottom" + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] }, "tooltip": { "mode": "multi" @@ -4737,7 +5140,9 @@ data: "displayMode": "table", "placement": "bottom", "calcs": [ - "sum" + "min", + "max", + "lastNotNull" ] }, "tooltip": { @@ -4799,7 +5204,12 @@ data: "options": { "legend": { "displayMode": 
"table", - "placement": "bottom" + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] }, "tooltip": { "mode": "multi" @@ -4855,7 +5265,12 @@ data: "options": { "legend": { "displayMode": "table", - "placement": "bottom" + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] }, "tooltip": { "mode": "multi" @@ -4910,8 +5325,13 @@ data: }, "options": { "legend": { - "displayMode": "list", - "placement": "bottom" + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] }, "tooltip": { "mode": "multi" @@ -4993,7 +5413,18 @@ data: "refId": "D" } ], - "description": "DataProxy internal cache effectiveness for domain resolution, cluster pool routing, and namespace mapping." + "description": "DataProxy internal cache effectiveness for domain resolution, cluster pool routing, and namespace mapping.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -5035,7 +5466,18 @@ data: "refId": "B" } ], - "description": "Time to read image metadata from the dataplane, proxied through DataProxy." + "description": "Time to read image metadata from the dataplane, proxied through DataProxy.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -5072,7 +5514,18 @@ data: "refId": "A" } ], - "description": "Per-cluster secret proxy errors during fan-out operations. Identifies which dataplane cluster is causing failures. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Per-cluster secret proxy errors during fan-out operations. Identifies which dataplane cluster is causing failures. 
[Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -5123,7 +5576,18 @@ data: "refId": "A" } ], - "description": "Rate of ReportBillableUsage calls from DP clusters. Each call reports resource consumption for billing." + "description": "Rate of ReportBillableUsage calls from DP clusters. Each call reports resource consumption for billing.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -5175,7 +5639,18 @@ data: "refId": "D" } ], - "description": "SQS/queue message processing: received, sent (success), failed, dropped (max retries exceeded)." + "description": "SQS/queue message processing: received, sent (success), failed, dropped (max retries exceeded).", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -5215,7 +5690,18 @@ data: "refId": "A" } ], - "description": "Successful message processing rate by type: node_execution, workflow_execution, billable_usage, serverless_billable_usage. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Successful message processing rate by type: node_execution, workflow_execution, billable_usage, serverless_billable_usage. [Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -5262,7 +5748,18 @@ data: "refId": "C" } ], - "description": "Time to process individual queue messages. Slow processing may indicate backend API issues (Metronome, timestream). 
[Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Time to process individual queue messages. Slow processing may indicate backend API issues (Metronome, timestream). [Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -5316,7 +5813,18 @@ data: "refId": "A" } ], - "description": "CPU usage in cores per container, stacked. Identifies resource-heavy services." + "description": "CPU usage in cores per container, stacked. Identifies resource-heavy services.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -5356,7 +5864,18 @@ data: "refId": "A" } ], - "description": "Working set memory per container, stacked. Watch for approaching limits." + "description": "Working set memory per container, stacked. Watch for approaching limits.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -5393,7 +5912,18 @@ data: "refId": "A" } ], - "description": "Per-container restart events. Spikes indicate crashes or OOM kills." + "description": "Per-container restart events. 
Spikes indicate crashes or OOM kills.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] } @@ -5445,7 +5975,7 @@ data: "timepicker": {}, "timezone": "browser", "title": "Union Controlplane Overview", - "uid": "union-cp-overview", + "uid": "union-controlplane-overview", "version": 2 } --- diff --git a/tests/generated/dataplane.additional-podlabels.yaml b/tests/generated/dataplane.additional-podlabels.yaml index 2b29aebe..29a4fd73 100644 --- a/tests/generated/dataplane.additional-podlabels.yaml +++ b/tests/generated/dataplane.additional-podlabels.yaml @@ -1170,63 +1170,6 @@ data: ], "description": "Total container restarts in the last hour. Non-zero indicates crashlooping or OOM kills." }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "thresholds": { - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 10 - }, - { - "color": "red", - "value": 50 - } - ] - }, - "unit": "short" - } - }, - "gridPos": { - "h": 4, - "w": 6, - "x": 18, - "y": 1 - }, - "id": 8, - "options": { - "colorMode": "background", - "graphMode": "area", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ] - }, - "textMode": "auto" - }, - "title": "Active Workflows", - "type": "stat", - "targets": [ - { - "expr": "sum(flyte:propeller:all:execstats:active_workflow_executions{namespace=\"$namespace\"})", - "legendFormat": "Workflows", - "refId": "A" - } - ], - "description": "Current active FlyteWorkflow CRD count managed by Propeller." 
- }, { "datasource": { "type": "prometheus", @@ -1256,8 +1199,9 @@ data: "options": { "legend": { "calcs": [ - "mean", - "max" + "min", + "max", + "lastNotNull" ], "displayMode": "table", "placement": "bottom" @@ -1288,59 +1232,6 @@ data: ], "description": "Current count of active workflow, node, and task executions tracked by FlyteAdmin." }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 6, - "w": 12, - "x": 12, - "y": 5 - }, - "id": 7, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom" - }, - "tooltip": { - "mode": "multi", - "sort": "desc" - } - }, - "title": "Queue Depth", - "type": "timeseries", - "targets": [ - { - "expr": "sum(flyte:propeller:all:main_depth{namespace=\"$namespace\"})", - "legendFormat": "Main", - "refId": "A" - }, - { - "expr": "sum(flyte:propeller:all:sub_depth{namespace=\"$namespace\"})", - "legendFormat": "Sub", - "refId": "B" - } - ], - "description": "Main and sub workqueue depth over time." - }, { "datasource": { "type": "prometheus", @@ -1626,7 +1517,7 @@ data: "y": 10 }, "id": 1204, - "title": "Propeller Latency p99", + "title": "Executor Evaluate Duration p99", "type": "stat", "options": { "colorMode": "background", @@ -1640,8 +1531,13 @@ data: }, "targets": [ { - "expr": "(union:dp:slo:propeller_round_latency_p99 or flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.99\"} / 1000)", - "refId": "A" + "expr": "executor::v2:evaluator:evaluate_duration_ms{namespace=\"$namespace\", quantile=\"0.99\"}", + "legendFormat": "p99", + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + } } ], "description": "Propeller round p99 latency in seconds. 
Falls back to raw metric if SLO rules not enabled." @@ -1686,7 +1582,18 @@ data: "refId": "B" } ], - "description": "DP service availability over time with SLO target line." + "description": "DP service availability over time with SLO target line.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1730,7 +1637,18 @@ data: "refId": "B" } ], - "description": "DP error budget remaining over time. Requires monitoring.slos.enabled." + "description": "DP error budget remaining over time. Requires monitoring.slos.enabled.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "collapsed": true, @@ -1784,7 +1702,18 @@ data: "refId": "B" } ], - "description": "Operator execution operation processing rate and failure rate." + "description": "Operator execution operation processing rate and failure rate.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1841,7 +1770,18 @@ data: "refId": "E" } ], - "description": "FlyteAdmin error rates: propeller communication failures, model transform errors, notification publish failures." + "description": "FlyteAdmin error rates: propeller communication failures, model transform errors, notification publish failures.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1888,7 +1828,18 @@ data: "refId": "C" } ], - "description": "Breakdown of operator heartbeat computation: capabilities, usages, list workflows." 
+ "description": "Breakdown of operator heartbeat computation: capabilities, usages, list workflows.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1935,7 +1886,18 @@ data: "refId": "C" } ], - "description": "Config sync cycle rate, errors, and propeller ConfigMap update count." + "description": "Config sync cycle rate, errors, and propeller ConfigMap update count.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1977,7 +1939,18 @@ data: "refId": "B" } ], - "description": "Billing collection cycle rate and errors. Failures mean billing data may be delayed." + "description": "Billing collection cycle rate and errors. Failures mean billing data may be delayed.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2076,17 +2049,28 @@ data: "type": "timeseries", "targets": [ { - "expr": "executor:active_actions_count{namespace=\"$namespace\"}", + "expr": "executor::v2:active_actions_count{namespace=\"$namespace\"}", "legendFormat": "Active actions", "refId": "A" }, { - "expr": "executor:available_capacity{namespace=\"$namespace\"}", + "expr": "executor::v2:available_capacity{namespace=\"$namespace\"}", "legendFormat": "Available capacity", "refId": "B" } ], - "description": "V2 executor active actions vs available capacity. Capacity=0 means executor is saturated." + "description": "V2 executor active actions vs available capacity. 
Capacity=0 means executor is saturated.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2118,17 +2102,28 @@ data: "type": "timeseries", "targets": [ { - "expr": "rate(executor:discovery_miss_count{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:discovery_miss_count{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Miss", "refId": "A" }, { - "expr": "rate(executor:discovery_put_success_count{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:discovery_put_success_count{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Put success", "refId": "B" } ], - "description": "V2 executor cache discovery miss/put rates for task output caching." + "description": "V2 executor cache discovery miss/put rates for task output caching.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2163,12 +2158,23 @@ data: "type": "timeseries", "targets": [ { - "expr": "sum by (phase) (rate(executor:actions_terminated{namespace=\"$namespace\"}[$__rate_interval]))", + "expr": "sum by (phase) (rate(executor::v2:actions_terminated{namespace=\"$namespace\"}[$__rate_interval]))", "legendFormat": "{{ phase }}", "refId": "A" } ], - "description": "Task completion rate by outcome: succeeded, failed, aborted. Key V2 SLI for task health. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Task completion rate by outcome: succeeded, failed, aborted. Key V2 SLI for task health. 
[Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2200,22 +2206,33 @@ data: "type": "timeseries", "targets": [ { - "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.5\"}", + "expr": "executor::v2:evaluator:evaluate_duration_ms{namespace=\"$namespace\", quantile=\"0.5\"}", "legendFormat": "Evaluate p50", "refId": "A" }, { - "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.9\"}", + "expr": "executor::v2:evaluator:evaluate_duration_ms{namespace=\"$namespace\", quantile=\"0.9\"}", "legendFormat": "Evaluate p90", "refId": "B" }, { - "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.99\"}", + "expr": "executor::v2:evaluator:evaluate_duration_ms{namespace=\"$namespace\", quantile=\"0.99\"}", "legendFormat": "Evaluate p99", "refId": "C" } ], - "description": "Time spent in RecursiveNodeHandler (pod creation). Dominant component of V2 task latency. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Time spent in RecursiveNodeHandler (pod creation). Dominant component of V2 task latency. 
[Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2247,27 +2264,38 @@ data: "type": "timeseries", "targets": [ { - "expr": "rate(executor:system_failures{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:system_failures{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "System failures", "refId": "A" }, { - "expr": "rate(executor:system_failures_exhausted{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:system_failures_exhausted{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Exhausted retries", "refId": "B" }, { - "expr": "rate(executor:invalid_leases{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:invalid_leases{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Invalid leases", "refId": "C" }, { - "expr": "rate(executor:evaluator:evaluate_errors{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:evaluator:evaluate_errors{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Evaluate errors", "refId": "D" } ], - "description": "Executor error rates. System failures retry; exhausted means task permanently failed. Invalid leases = malformed from queue service. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Executor error rates. System failures retry; exhausted means task permanently failed. Invalid leases = malformed from queue service. 
[Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -2277,10 +2305,10 @@ data: "h": 1, "w": 24, "x": 0, - "y": 33 + "y": 34 }, - "id": 100, - "title": "Flyte Propeller (V1)", + "id": 400, + "title": "gRPC Client (DP \u2192 CP)", "type": "row", "panels": [ { @@ -2299,36 +2327,37 @@ data: "lineWidth": 1, "showPoints": "never" }, - "unit": "ms" + "unit": "reqps" } }, "gridPos": { "h": 8, - "w": 8, + "w": 12, "x": 0, - "y": 12 + "y": 13 }, - "id": 101, - "title": "Round Time (p50 / p90 / p99)", + "id": 401, + "title": "gRPC Client Request Rate", "type": "timeseries", "targets": [ { - "expr": "flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.5\"}", - "legendFormat": "p50", + "expr": "sum by (grpc_service, grpc_method) (rate(grpc_client_handled_total{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "{{ grpc_service }}/{{ grpc_method }}", "refId": "A" - }, - { - "expr": "flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.9\"}", - "legendFormat": "p90", - "refId": "B" - }, - { - "expr": "flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.99\"}", - "legendFormat": "p99", - "refId": "C" } ], - "description": "Propeller reconciliation round time. One round = one FlyteWorkflow CRD processed." + "description": "DP\u2192CP gRPC client request rate by service and method (e.g. 
CreateWorkflowEvent).", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2346,36 +2375,37 @@ data: "lineWidth": 1, "showPoints": "never" }, - "unit": "ops" + "unit": "reqps" } }, "gridPos": { "h": 8, - "w": 8, - "x": 8, - "y": 12 + "w": 12, + "x": 12, + "y": 13 }, - "id": 102, - "title": "Round Success / Error Rate", + "id": 402, + "title": "gRPC Client Error Rate", "type": "timeseries", "targets": [ { - "expr": "rate(flyte:propeller:all:round:success_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Success", + "expr": "sum by (grpc_service, grpc_method, grpc_code) (rate(grpc_client_handled_total{namespace=\"$namespace\", grpc_code!=\"OK\"}[$__rate_interval]))", + "legendFormat": "{{ grpc_method }} {{ grpc_code }}", "refId": "A" - }, - { - "expr": "rate(flyte:propeller:all:round:error_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Errors", - "refId": "B" - }, - { - "expr": "rate(flyte:propeller:all:round:panic_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Panics", - "refId": "C" } ], - "description": "Propeller round outcomes: success, errors, and panics per second." + "description": "DP\u2192CP gRPC errors by method and code. 
Non-OK responses from the control plane.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2393,564 +2423,37 @@ data: "lineWidth": 1, "showPoints": "never" }, - "unit": "short" + "unit": "s" } }, "gridPos": { "h": 8, - "w": 8, - "x": 16, - "y": 12 + "w": 24, + "x": 0, + "y": 21 }, - "id": 103, - "title": "Free Workers", + "id": 403, + "title": "gRPC Client Latency (p95)", "type": "timeseries", "targets": [ { - "expr": "sum(flyte:propeller:all:free_workers_count{namespace=\"$namespace\"})", - "legendFormat": "Free workers", + "expr": "histogram_quantile(0.95, sum by (le, grpc_method) (rate(grpc_client_handling_seconds_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "{{ grpc_method }} p95", "refId": "A" } ], - "description": "Idle propeller worker goroutines. Zero = all workers busy processing workflows." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ops" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 20 - }, - "id": 104, - "title": "Queue Add Rate", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:propeller:all:main_adds{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Main adds", - "refId": "A" - }, - { - "expr": "rate(flyte:propeller:all:sub_adds{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Sub adds", - "refId": "B" - }, - { - "expr": "rate(flyte:propeller:all:main_retries{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Main retries", - "refId": "C" - } - ], - "description": "Items enqueued to propeller's main/sub workqueues, plus retry rate." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 8, - "y": 20 - }, - "id": 105, - "title": "Workflow Updates", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:propeller:all:wf_updated{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Updated", - "refId": "A" - }, - { - "expr": "rate(flyte:propeller:all:wf_update_failed{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Failed", - "refId": "B" - }, - { - "expr": "rate(flyte:propeller:all:wf_too_large{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Too large", - "refId": "C" - }, - { - "expr": "rate(flyte:propeller:all:wf_update_conflict{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Conflict", - "refId": "D" - } - ], - "description": "etcd write outcomes: successful updates, failures, too-large (>1.5MB), optimistic concurrency conflicts." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ms" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 20 - }, - "id": 106, - "title": "Workflow Update Latency", - "type": "timeseries", - "targets": [ - { - "expr": "flyte:propeller:all:wf_update_latency_ms{namespace=\"$namespace\", quantile=\"0.5\"}", - "legendFormat": "p50", - "refId": "A" - }, - { - "expr": "flyte:propeller:all:wf_update_latency_ms{namespace=\"$namespace\", quantile=\"0.9\"}", - "legendFormat": "p90", - "refId": "B" - }, - { - "expr": "flyte:propeller:all:wf_update_latency_ms{namespace=\"$namespace\", quantile=\"0.99\"}", - "legendFormat": "p99", - "refId": "C" - } - ], - "description": "etcd write latency for FlyteWorkflow status updates." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ms" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 28 - }, - "id": 107, - "title": "Node Queueing & Execution Latency", - "type": "timeseries", - "targets": [ - { - "expr": "flyte:propeller:all:node:queueing_latency_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.5\"}", - "legendFormat": "Queue p50", - "refId": "A" - }, - { - "expr": "flyte:propeller:all:node:queueing_latency_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.9\"}", - "legendFormat": "Queue p90", - "refId": "B" - }, - { - "expr": "flyte:propeller:all:node:node_exec_latency_unlabeled_us{namespace=\"$namespace\", quantile=\"0.9\"} / 1000", - "legendFormat": "Exec p90 (ms)", - "refId": "C" - } - ], - "description": "Node lifecycle: queueing latency 
(queued\u2192running) and execution latency (time in handler)." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "percentunit" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 28 - }, - "id": 108, - "title": "Metastore Cache Hit Rate", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:propeller:all:metastore:cache_hit{namespace=\"$namespace\"}[$__rate_interval]) / (rate(flyte:propeller:all:metastore:cache_hit{namespace=\"$namespace\"}[$__rate_interval]) + rate(flyte:propeller:all:metastore:cache_miss{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Hit rate", - "refId": "A" - } - ], - "description": "In-memory cache effectiveness for object store (S3/GCS) reads. Low hit rate = excessive storage calls." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ops" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 36 - }, - "id": 109, - "title": "Event Recording (DP \u2192 CP)", - "type": "timeseries", - "targets": [ - { - "expr": "sum(rate(flyte:propeller:all:task:event_recording:success_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Task success", - "refId": "A" - }, - { - "expr": "sum(rate(flyte:propeller:all:node:event_recording:success_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Node success", - "refId": "B" - }, - { - "expr": "sum(rate(flyte:propeller:all:task:event_recording:failure_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Task failure", - 
"refId": "C" - }, - { - "expr": "sum(rate(flyte:propeller:all:node:event_recording:failure_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Node failure", - "refId": "D" - } - ], - "description": "Task/node event recording rate from propeller to FlyteAdmin. Failures indicate CP connectivity issues." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 36 - }, - "id": 110, - "title": "Cache Discovery (hit/miss/skip)", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:propeller:all:discovery_hit_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Hits", - "refId": "A" - }, - { - "expr": "rate(flyte:propeller:all:discovery_miss_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Misses", - "refId": "B" - }, - { - "expr": "rate(flyte:propeller:all:discovery_skip_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Skips", - "refId": "C" - }, - { - "expr": "rate(flyte:propeller:all:discovery_get_failure_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Get failures", - "refId": "D" - } - ], - "description": "V2 executor cache discovery miss/put rates for task output caching." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "reqps" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 44 - }, - "id": 111, - "title": "K8s API Client Rate", - "type": "timeseries", - "targets": [ - { - "expr": "sum(rate(k8s_client_request_total_unlabeled{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "K8s requests/s", - "refId": "A" - } - ], - "description": "Propeller's K8s API request rate. High rates may indicate excessive pod watches or creates." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "s" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 44 - }, - "id": 112, - "title": "K8s API Client Latency (p90)", - "type": "timeseries", - "targets": [ - { - "expr": "histogram_quantile(0.90, sum by (le) (rate(k8s_client_request_latency_unlabeled_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "Request p90", - "refId": "A" - }, - { - "expr": "histogram_quantile(0.90, sum by (le) (rate(k8s_client_rate_limiter_latency_unlabeled_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "Rate limiter p90", - "refId": "B" - } - ], - "description": "K8s API request latency and client-side rate limiter wait time at p90." 
- } - ] - }, - { - "collapsed": true, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 34 - }, - "id": 400, - "title": "gRPC Client (DP \u2192 CP)", - "type": "row", - "panels": [ - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "reqps" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 13 - }, - "id": 401, - "title": "gRPC Client Request Rate", - "type": "timeseries", - "targets": [ - { - "expr": "sum by (grpc_service, grpc_method) (rate(grpc_client_handled_total{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "{{ grpc_service }}/{{ grpc_method }}", - "refId": "A" - } - ], - "description": "DP\u2192CP gRPC client request rate by service and method (e.g. CreateWorkflowEvent)." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "reqps" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 13 - }, - "id": 402, - "title": "gRPC Client Error Rate", - "type": "timeseries", - "targets": [ - { - "expr": "sum by (grpc_service, grpc_method, grpc_code) (rate(grpc_client_handled_total{namespace=\"$namespace\", grpc_code!=\"OK\"}[$__rate_interval]))", - "legendFormat": "{{ grpc_method }} {{ grpc_code }}", - "refId": "A" - } - ], - "description": "DP\u2192CP gRPC errors by method and code. Non-OK responses from the control plane." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "s" - } - }, - "gridPos": { - "h": 8, - "w": 24, - "x": 0, - "y": 21 - }, - "id": 403, - "title": "gRPC Client Latency (p95)", - "type": "timeseries", - "targets": [ - { - "expr": "histogram_quantile(0.95, sum by (le, grpc_method) (rate(grpc_client_handling_seconds_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "{{ grpc_method }} p95", - "refId": "A" + "description": "DP\u2192CP gRPC call latency at p95. High latency = slow control plane or network issues.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] } - ], - "description": "DP\u2192CP gRPC call latency at p95. High latency = slow control plane or network issues." + } } ] }, @@ -3004,7 +2507,18 @@ data: "refId": "A" } ], - "description": "CPU usage in cores per container, stacked. Identifies resource-heavy services." + "description": "CPU usage in cores per container, stacked. Identifies resource-heavy services.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3044,7 +2558,18 @@ data: "refId": "A" } ], - "description": "Working set memory per container. Watch for approaching limits." + "description": "Working set memory per container. Watch for approaching limits.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3081,7 +2606,18 @@ data: "refId": "A" } ], - "description": "Per-container restart events. Spikes indicate crashes or OOM kills." + "description": "Per-container restart events. 
Spikes indicate crashes or OOM kills.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] } @@ -3133,7 +2669,7 @@ data: "timepicker": {}, "timezone": "browser", "title": "Union Dataplane Overview", - "uid": "union-dp-overview", + "uid": "union-dataplane-overview", "version": 1 } --- diff --git a/tests/generated/dataplane.additional-templates.yaml b/tests/generated/dataplane.additional-templates.yaml index 4bcf822e..6235b0b8 100644 --- a/tests/generated/dataplane.additional-templates.yaml +++ b/tests/generated/dataplane.additional-templates.yaml @@ -1182,63 +1182,6 @@ data: ], "description": "Total container restarts in the last hour. Non-zero indicates crashlooping or OOM kills." }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "thresholds": { - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 10 - }, - { - "color": "red", - "value": 50 - } - ] - }, - "unit": "short" - } - }, - "gridPos": { - "h": 4, - "w": 6, - "x": 18, - "y": 1 - }, - "id": 8, - "options": { - "colorMode": "background", - "graphMode": "area", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ] - }, - "textMode": "auto" - }, - "title": "Active Workflows", - "type": "stat", - "targets": [ - { - "expr": "sum(flyte:propeller:all:execstats:active_workflow_executions{namespace=\"$namespace\"})", - "legendFormat": "Workflows", - "refId": "A" - } - ], - "description": "Current active FlyteWorkflow CRD count managed by Propeller." 
- }, { "datasource": { "type": "prometheus", @@ -1268,8 +1211,9 @@ data: "options": { "legend": { "calcs": [ - "mean", - "max" + "min", + "max", + "lastNotNull" ], "displayMode": "table", "placement": "bottom" @@ -1300,59 +1244,6 @@ data: ], "description": "Current count of active workflow, node, and task executions tracked by FlyteAdmin." }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 6, - "w": 12, - "x": 12, - "y": 5 - }, - "id": 7, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom" - }, - "tooltip": { - "mode": "multi", - "sort": "desc" - } - }, - "title": "Queue Depth", - "type": "timeseries", - "targets": [ - { - "expr": "sum(flyte:propeller:all:main_depth{namespace=\"$namespace\"})", - "legendFormat": "Main", - "refId": "A" - }, - { - "expr": "sum(flyte:propeller:all:sub_depth{namespace=\"$namespace\"})", - "legendFormat": "Sub", - "refId": "B" - } - ], - "description": "Main and sub workqueue depth over time." - }, { "datasource": { "type": "prometheus", @@ -1638,7 +1529,7 @@ data: "y": 10 }, "id": 1204, - "title": "Propeller Latency p99", + "title": "Executor Evaluate Duration p99", "type": "stat", "options": { "colorMode": "background", @@ -1652,8 +1543,13 @@ data: }, "targets": [ { - "expr": "(union:dp:slo:propeller_round_latency_p99 or flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.99\"} / 1000)", - "refId": "A" + "expr": "executor::v2:evaluator:evaluate_duration_ms{namespace=\"$namespace\", quantile=\"0.99\"}", + "legendFormat": "p99", + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + } } ], "description": "Propeller round p99 latency in seconds. 
Falls back to raw metric if SLO rules not enabled." @@ -1698,7 +1594,18 @@ data: "refId": "B" } ], - "description": "DP service availability over time with SLO target line." + "description": "DP service availability over time with SLO target line.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1742,7 +1649,18 @@ data: "refId": "B" } ], - "description": "DP error budget remaining over time. Requires monitoring.slos.enabled." + "description": "DP error budget remaining over time. Requires monitoring.slos.enabled.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "collapsed": true, @@ -1796,7 +1714,18 @@ data: "refId": "B" } ], - "description": "Operator execution operation processing rate and failure rate." + "description": "Operator execution operation processing rate and failure rate.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1853,7 +1782,18 @@ data: "refId": "E" } ], - "description": "FlyteAdmin error rates: propeller communication failures, model transform errors, notification publish failures." + "description": "FlyteAdmin error rates: propeller communication failures, model transform errors, notification publish failures.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1900,7 +1840,18 @@ data: "refId": "C" } ], - "description": "Breakdown of operator heartbeat computation: capabilities, usages, list workflows." 
+ "description": "Breakdown of operator heartbeat computation: capabilities, usages, list workflows.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1947,7 +1898,18 @@ data: "refId": "C" } ], - "description": "Config sync cycle rate, errors, and propeller ConfigMap update count." + "description": "Config sync cycle rate, errors, and propeller ConfigMap update count.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1989,7 +1951,18 @@ data: "refId": "B" } ], - "description": "Billing collection cycle rate and errors. Failures mean billing data may be delayed." + "description": "Billing collection cycle rate and errors. Failures mean billing data may be delayed.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2088,17 +2061,28 @@ data: "type": "timeseries", "targets": [ { - "expr": "executor:active_actions_count{namespace=\"$namespace\"}", + "expr": "executor::v2:active_actions_count{namespace=\"$namespace\"}", "legendFormat": "Active actions", "refId": "A" }, { - "expr": "executor:available_capacity{namespace=\"$namespace\"}", + "expr": "executor::v2:available_capacity{namespace=\"$namespace\"}", "legendFormat": "Available capacity", "refId": "B" } ], - "description": "V2 executor active actions vs available capacity. Capacity=0 means executor is saturated." + "description": "V2 executor active actions vs available capacity. 
Capacity=0 means executor is saturated.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2130,17 +2114,28 @@ data: "type": "timeseries", "targets": [ { - "expr": "rate(executor:discovery_miss_count{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:discovery_miss_count{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Miss", "refId": "A" }, { - "expr": "rate(executor:discovery_put_success_count{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:discovery_put_success_count{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Put success", "refId": "B" } ], - "description": "V2 executor cache discovery miss/put rates for task output caching." + "description": "V2 executor cache discovery miss/put rates for task output caching.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2175,12 +2170,23 @@ data: "type": "timeseries", "targets": [ { - "expr": "sum by (phase) (rate(executor:actions_terminated{namespace=\"$namespace\"}[$__rate_interval]))", + "expr": "sum by (phase) (rate(executor::v2:actions_terminated{namespace=\"$namespace\"}[$__rate_interval]))", "legendFormat": "{{ phase }}", "refId": "A" } ], - "description": "Task completion rate by outcome: succeeded, failed, aborted. Key V2 SLI for task health. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Task completion rate by outcome: succeeded, failed, aborted. Key V2 SLI for task health. 
[Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2212,22 +2218,33 @@ data: "type": "timeseries", "targets": [ { - "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.5\"}", + "expr": "executor::v2:evaluator:evaluate_duration_ms{namespace=\"$namespace\", quantile=\"0.5\"}", "legendFormat": "Evaluate p50", "refId": "A" }, { - "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.9\"}", + "expr": "executor::v2:evaluator:evaluate_duration_ms{namespace=\"$namespace\", quantile=\"0.9\"}", "legendFormat": "Evaluate p90", "refId": "B" }, { - "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.99\"}", + "expr": "executor::v2:evaluator:evaluate_duration_ms{namespace=\"$namespace\", quantile=\"0.99\"}", "legendFormat": "Evaluate p99", "refId": "C" } ], - "description": "Time spent in RecursiveNodeHandler (pod creation). Dominant component of V2 task latency. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Time spent in RecursiveNodeHandler (pod creation). Dominant component of V2 task latency. 
[Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2259,27 +2276,38 @@ data: "type": "timeseries", "targets": [ { - "expr": "rate(executor:system_failures{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:system_failures{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "System failures", "refId": "A" }, { - "expr": "rate(executor:system_failures_exhausted{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:system_failures_exhausted{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Exhausted retries", "refId": "B" }, { - "expr": "rate(executor:invalid_leases{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:invalid_leases{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Invalid leases", "refId": "C" }, { - "expr": "rate(executor:evaluator:evaluate_errors{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:evaluator:evaluate_errors{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Evaluate errors", "refId": "D" } ], - "description": "Executor error rates. System failures retry; exhausted means task permanently failed. Invalid leases = malformed from queue service. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Executor error rates. System failures retry; exhausted means task permanently failed. Invalid leases = malformed from queue service. 
[Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -2289,10 +2317,10 @@ data: "h": 1, "w": 24, "x": 0, - "y": 33 + "y": 34 }, - "id": 100, - "title": "Flyte Propeller (V1)", + "id": 400, + "title": "gRPC Client (DP \u2192 CP)", "type": "row", "panels": [ { @@ -2311,36 +2339,37 @@ data: "lineWidth": 1, "showPoints": "never" }, - "unit": "ms" + "unit": "reqps" } }, "gridPos": { "h": 8, - "w": 8, + "w": 12, "x": 0, - "y": 12 + "y": 13 }, - "id": 101, - "title": "Round Time (p50 / p90 / p99)", + "id": 401, + "title": "gRPC Client Request Rate", "type": "timeseries", "targets": [ { - "expr": "flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.5\"}", - "legendFormat": "p50", + "expr": "sum by (grpc_service, grpc_method) (rate(grpc_client_handled_total{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "{{ grpc_service }}/{{ grpc_method }}", "refId": "A" - }, - { - "expr": "flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.9\"}", - "legendFormat": "p90", - "refId": "B" - }, - { - "expr": "flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.99\"}", - "legendFormat": "p99", - "refId": "C" } ], - "description": "Propeller reconciliation round time. One round = one FlyteWorkflow CRD processed." + "description": "DP\u2192CP gRPC client request rate by service and method (e.g. 
CreateWorkflowEvent).", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2358,36 +2387,37 @@ data: "lineWidth": 1, "showPoints": "never" }, - "unit": "ops" + "unit": "reqps" } }, "gridPos": { "h": 8, - "w": 8, - "x": 8, - "y": 12 + "w": 12, + "x": 12, + "y": 13 }, - "id": 102, - "title": "Round Success / Error Rate", + "id": 402, + "title": "gRPC Client Error Rate", "type": "timeseries", "targets": [ { - "expr": "rate(flyte:propeller:all:round:success_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Success", + "expr": "sum by (grpc_service, grpc_method, grpc_code) (rate(grpc_client_handled_total{namespace=\"$namespace\", grpc_code!=\"OK\"}[$__rate_interval]))", + "legendFormat": "{{ grpc_method }} {{ grpc_code }}", "refId": "A" - }, - { - "expr": "rate(flyte:propeller:all:round:error_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Errors", - "refId": "B" - }, - { - "expr": "rate(flyte:propeller:all:round:panic_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Panics", - "refId": "C" } ], - "description": "Propeller round outcomes: success, errors, and panics per second." + "description": "DP\u2192CP gRPC errors by method and code. 
Non-OK responses from the control plane.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2405,564 +2435,37 @@ data: "lineWidth": 1, "showPoints": "never" }, - "unit": "short" + "unit": "s" } }, "gridPos": { "h": 8, - "w": 8, - "x": 16, - "y": 12 + "w": 24, + "x": 0, + "y": 21 }, - "id": 103, - "title": "Free Workers", + "id": 403, + "title": "gRPC Client Latency (p95)", "type": "timeseries", "targets": [ { - "expr": "sum(flyte:propeller:all:free_workers_count{namespace=\"$namespace\"})", - "legendFormat": "Free workers", + "expr": "histogram_quantile(0.95, sum by (le, grpc_method) (rate(grpc_client_handling_seconds_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "{{ grpc_method }} p95", "refId": "A" } ], - "description": "Idle propeller worker goroutines. Zero = all workers busy processing workflows." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ops" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 20 - }, - "id": 104, - "title": "Queue Add Rate", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:propeller:all:main_adds{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Main adds", - "refId": "A" - }, - { - "expr": "rate(flyte:propeller:all:sub_adds{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Sub adds", - "refId": "B" - }, - { - "expr": "rate(flyte:propeller:all:main_retries{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Main retries", - "refId": "C" - } - ], - "description": "Items enqueued to propeller's main/sub workqueues, plus retry rate." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 8, - "y": 20 - }, - "id": 105, - "title": "Workflow Updates", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:propeller:all:wf_updated{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Updated", - "refId": "A" - }, - { - "expr": "rate(flyte:propeller:all:wf_update_failed{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Failed", - "refId": "B" - }, - { - "expr": "rate(flyte:propeller:all:wf_too_large{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Too large", - "refId": "C" - }, - { - "expr": "rate(flyte:propeller:all:wf_update_conflict{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Conflict", - "refId": "D" - } - ], - "description": "etcd write outcomes: successful updates, failures, too-large (>1.5MB), optimistic concurrency conflicts." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ms" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 20 - }, - "id": 106, - "title": "Workflow Update Latency", - "type": "timeseries", - "targets": [ - { - "expr": "flyte:propeller:all:wf_update_latency_ms{namespace=\"$namespace\", quantile=\"0.5\"}", - "legendFormat": "p50", - "refId": "A" - }, - { - "expr": "flyte:propeller:all:wf_update_latency_ms{namespace=\"$namespace\", quantile=\"0.9\"}", - "legendFormat": "p90", - "refId": "B" - }, - { - "expr": "flyte:propeller:all:wf_update_latency_ms{namespace=\"$namespace\", quantile=\"0.99\"}", - "legendFormat": "p99", - "refId": "C" - } - ], - "description": "etcd write latency for FlyteWorkflow status updates." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ms" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 28 - }, - "id": 107, - "title": "Node Queueing & Execution Latency", - "type": "timeseries", - "targets": [ - { - "expr": "flyte:propeller:all:node:queueing_latency_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.5\"}", - "legendFormat": "Queue p50", - "refId": "A" - }, - { - "expr": "flyte:propeller:all:node:queueing_latency_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.9\"}", - "legendFormat": "Queue p90", - "refId": "B" - }, - { - "expr": "flyte:propeller:all:node:node_exec_latency_unlabeled_us{namespace=\"$namespace\", quantile=\"0.9\"} / 1000", - "legendFormat": "Exec p90 (ms)", - "refId": "C" - } - ], - "description": "Node lifecycle: queueing latency 
(queued\u2192running) and execution latency (time in handler)." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "percentunit" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 28 - }, - "id": 108, - "title": "Metastore Cache Hit Rate", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:propeller:all:metastore:cache_hit{namespace=\"$namespace\"}[$__rate_interval]) / (rate(flyte:propeller:all:metastore:cache_hit{namespace=\"$namespace\"}[$__rate_interval]) + rate(flyte:propeller:all:metastore:cache_miss{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Hit rate", - "refId": "A" - } - ], - "description": "In-memory cache effectiveness for object store (S3/GCS) reads. Low hit rate = excessive storage calls." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ops" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 36 - }, - "id": 109, - "title": "Event Recording (DP \u2192 CP)", - "type": "timeseries", - "targets": [ - { - "expr": "sum(rate(flyte:propeller:all:task:event_recording:success_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Task success", - "refId": "A" - }, - { - "expr": "sum(rate(flyte:propeller:all:node:event_recording:success_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Node success", - "refId": "B" - }, - { - "expr": "sum(rate(flyte:propeller:all:task:event_recording:failure_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Task failure", - 
"refId": "C" - }, - { - "expr": "sum(rate(flyte:propeller:all:node:event_recording:failure_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Node failure", - "refId": "D" - } - ], - "description": "Task/node event recording rate from propeller to FlyteAdmin. Failures indicate CP connectivity issues." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 36 - }, - "id": 110, - "title": "Cache Discovery (hit/miss/skip)", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:propeller:all:discovery_hit_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Hits", - "refId": "A" - }, - { - "expr": "rate(flyte:propeller:all:discovery_miss_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Misses", - "refId": "B" - }, - { - "expr": "rate(flyte:propeller:all:discovery_skip_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Skips", - "refId": "C" - }, - { - "expr": "rate(flyte:propeller:all:discovery_get_failure_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Get failures", - "refId": "D" - } - ], - "description": "V2 executor cache discovery miss/put rates for task output caching." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "reqps" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 44 - }, - "id": 111, - "title": "K8s API Client Rate", - "type": "timeseries", - "targets": [ - { - "expr": "sum(rate(k8s_client_request_total_unlabeled{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "K8s requests/s", - "refId": "A" - } - ], - "description": "Propeller's K8s API request rate. High rates may indicate excessive pod watches or creates." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "s" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 44 - }, - "id": 112, - "title": "K8s API Client Latency (p90)", - "type": "timeseries", - "targets": [ - { - "expr": "histogram_quantile(0.90, sum by (le) (rate(k8s_client_request_latency_unlabeled_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "Request p90", - "refId": "A" - }, - { - "expr": "histogram_quantile(0.90, sum by (le) (rate(k8s_client_rate_limiter_latency_unlabeled_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "Rate limiter p90", - "refId": "B" - } - ], - "description": "K8s API request latency and client-side rate limiter wait time at p90." 
- } - ] - }, - { - "collapsed": true, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 34 - }, - "id": 400, - "title": "gRPC Client (DP \u2192 CP)", - "type": "row", - "panels": [ - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "reqps" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 13 - }, - "id": 401, - "title": "gRPC Client Request Rate", - "type": "timeseries", - "targets": [ - { - "expr": "sum by (grpc_service, grpc_method) (rate(grpc_client_handled_total{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "{{ grpc_service }}/{{ grpc_method }}", - "refId": "A" - } - ], - "description": "DP\u2192CP gRPC client request rate by service and method (e.g. CreateWorkflowEvent)." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "reqps" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 13 - }, - "id": 402, - "title": "gRPC Client Error Rate", - "type": "timeseries", - "targets": [ - { - "expr": "sum by (grpc_service, grpc_method, grpc_code) (rate(grpc_client_handled_total{namespace=\"$namespace\", grpc_code!=\"OK\"}[$__rate_interval]))", - "legendFormat": "{{ grpc_method }} {{ grpc_code }}", - "refId": "A" - } - ], - "description": "DP\u2192CP gRPC errors by method and code. Non-OK responses from the control plane." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "s" - } - }, - "gridPos": { - "h": 8, - "w": 24, - "x": 0, - "y": 21 - }, - "id": 403, - "title": "gRPC Client Latency (p95)", - "type": "timeseries", - "targets": [ - { - "expr": "histogram_quantile(0.95, sum by (le, grpc_method) (rate(grpc_client_handling_seconds_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "{{ grpc_method }} p95", - "refId": "A" + "description": "DP\u2192CP gRPC call latency at p95. High latency = slow control plane or network issues.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] } - ], - "description": "DP\u2192CP gRPC call latency at p95. High latency = slow control plane or network issues." + } } ] }, @@ -3016,7 +2519,18 @@ data: "refId": "A" } ], - "description": "CPU usage in cores per container, stacked. Identifies resource-heavy services." + "description": "CPU usage in cores per container, stacked. Identifies resource-heavy services.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3056,7 +2570,18 @@ data: "refId": "A" } ], - "description": "Working set memory per container. Watch for approaching limits." + "description": "Working set memory per container. Watch for approaching limits.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3093,7 +2618,18 @@ data: "refId": "A" } ], - "description": "Per-container restart events. Spikes indicate crashes or OOM kills." + "description": "Per-container restart events. 
Spikes indicate crashes or OOM kills.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] } @@ -3145,7 +2681,7 @@ data: "timepicker": {}, "timezone": "browser", "title": "Union Dataplane Overview", - "uid": "union-dp-overview", + "uid": "union-dataplane-overview", "version": 1 } --- diff --git a/tests/generated/dataplane.aws.eks-automode.yaml b/tests/generated/dataplane.aws.eks-automode.yaml index b7b0ac8d..9f4fc590 100644 --- a/tests/generated/dataplane.aws.eks-automode.yaml +++ b/tests/generated/dataplane.aws.eks-automode.yaml @@ -1319,63 +1319,6 @@ data: ], "description": "Total container restarts in the last hour. Non-zero indicates crashlooping or OOM kills." }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "thresholds": { - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 10 - }, - { - "color": "red", - "value": 50 - } - ] - }, - "unit": "short" - } - }, - "gridPos": { - "h": 4, - "w": 6, - "x": 18, - "y": 1 - }, - "id": 8, - "options": { - "colorMode": "background", - "graphMode": "area", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ] - }, - "textMode": "auto" - }, - "title": "Active Workflows", - "type": "stat", - "targets": [ - { - "expr": "sum(flyte:propeller:all:execstats:active_workflow_executions{namespace=\"$namespace\"})", - "legendFormat": "Workflows", - "refId": "A" - } - ], - "description": "Current active FlyteWorkflow CRD count managed by Propeller." - }, { "datasource": { "type": "prometheus", @@ -1405,8 +1348,9 @@ data: "options": { "legend": { "calcs": [ - "mean", - "max" + "min", + "max", + "lastNotNull" ], "displayMode": "table", "placement": "bottom" @@ -1437,59 +1381,6 @@ data: ], "description": "Current count of active workflow, node, and task executions tracked by FlyteAdmin." 
}, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 6, - "w": 12, - "x": 12, - "y": 5 - }, - "id": 7, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom" - }, - "tooltip": { - "mode": "multi", - "sort": "desc" - } - }, - "title": "Queue Depth", - "type": "timeseries", - "targets": [ - { - "expr": "sum(flyte:propeller:all:main_depth{namespace=\"$namespace\"})", - "legendFormat": "Main", - "refId": "A" - }, - { - "expr": "sum(flyte:propeller:all:sub_depth{namespace=\"$namespace\"})", - "legendFormat": "Sub", - "refId": "B" - } - ], - "description": "Main and sub workqueue depth over time." - }, { "datasource": { "type": "prometheus", @@ -1775,7 +1666,7 @@ data: "y": 10 }, "id": 1204, - "title": "Propeller Latency p99", + "title": "Executor Evaluate Duration p99", "type": "stat", "options": { "colorMode": "background", @@ -1789,8 +1680,13 @@ data: }, "targets": [ { - "expr": "(union:dp:slo:propeller_round_latency_p99 or flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.99\"} / 1000)", - "refId": "A" + "expr": "executor::v2:evaluator:evaluate_duration_ms{namespace=\"$namespace\", quantile=\"0.99\"}", + "legendFormat": "p99", + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + } } ], "description": "Propeller round p99 latency in seconds. Falls back to raw metric if SLO rules not enabled." @@ -1835,7 +1731,18 @@ data: "refId": "B" } ], - "description": "DP service availability over time with SLO target line." 
+ "description": "DP service availability over time with SLO target line.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1879,7 +1786,18 @@ data: "refId": "B" } ], - "description": "DP error budget remaining over time. Requires monitoring.slos.enabled." + "description": "DP error budget remaining over time. Requires monitoring.slos.enabled.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "collapsed": true, @@ -1933,7 +1851,18 @@ data: "refId": "B" } ], - "description": "Operator execution operation processing rate and failure rate." + "description": "Operator execution operation processing rate and failure rate.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1990,7 +1919,18 @@ data: "refId": "E" } ], - "description": "FlyteAdmin error rates: propeller communication failures, model transform errors, notification publish failures." + "description": "FlyteAdmin error rates: propeller communication failures, model transform errors, notification publish failures.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2037,7 +1977,18 @@ data: "refId": "C" } ], - "description": "Breakdown of operator heartbeat computation: capabilities, usages, list workflows." 
+ "description": "Breakdown of operator heartbeat computation: capabilities, usages, list workflows.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2084,7 +2035,18 @@ data: "refId": "C" } ], - "description": "Config sync cycle rate, errors, and propeller ConfigMap update count." + "description": "Config sync cycle rate, errors, and propeller ConfigMap update count.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2126,7 +2088,18 @@ data: "refId": "B" } ], - "description": "Billing collection cycle rate and errors. Failures mean billing data may be delayed." + "description": "Billing collection cycle rate and errors. Failures mean billing data may be delayed.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2225,17 +2198,28 @@ data: "type": "timeseries", "targets": [ { - "expr": "executor:active_actions_count{namespace=\"$namespace\"}", + "expr": "executor::v2:active_actions_count{namespace=\"$namespace\"}", "legendFormat": "Active actions", "refId": "A" }, { - "expr": "executor:available_capacity{namespace=\"$namespace\"}", + "expr": "executor::v2:available_capacity{namespace=\"$namespace\"}", "legendFormat": "Available capacity", "refId": "B" } ], - "description": "V2 executor active actions vs available capacity. Capacity=0 means executor is saturated." + "description": "V2 executor active actions vs available capacity. 
Capacity=0 means executor is saturated.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2267,17 +2251,28 @@ data: "type": "timeseries", "targets": [ { - "expr": "rate(executor:discovery_miss_count{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:discovery_miss_count{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Miss", "refId": "A" }, { - "expr": "rate(executor:discovery_put_success_count{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:discovery_put_success_count{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Put success", "refId": "B" } ], - "description": "V2 executor cache discovery miss/put rates for task output caching." + "description": "V2 executor cache discovery miss/put rates for task output caching.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2312,12 +2307,23 @@ data: "type": "timeseries", "targets": [ { - "expr": "sum by (phase) (rate(executor:actions_terminated{namespace=\"$namespace\"}[$__rate_interval]))", + "expr": "sum by (phase) (rate(executor::v2:actions_terminated{namespace=\"$namespace\"}[$__rate_interval]))", "legendFormat": "{{ phase }}", "refId": "A" } ], - "description": "Task completion rate by outcome: succeeded, failed, aborted. Key V2 SLI for task health. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Task completion rate by outcome: succeeded, failed, aborted. Key V2 SLI for task health. 
[Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2349,22 +2355,33 @@ data: "type": "timeseries", "targets": [ { - "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.5\"}", + "expr": "executor::v2:evaluator:evaluate_duration_ms{namespace=\"$namespace\", quantile=\"0.5\"}", "legendFormat": "Evaluate p50", "refId": "A" }, { - "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.9\"}", + "expr": "executor::v2:evaluator:evaluate_duration_ms{namespace=\"$namespace\", quantile=\"0.9\"}", "legendFormat": "Evaluate p90", "refId": "B" }, { - "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.99\"}", + "expr": "executor::v2:evaluator:evaluate_duration_ms{namespace=\"$namespace\", quantile=\"0.99\"}", "legendFormat": "Evaluate p99", "refId": "C" } ], - "description": "Time spent in RecursiveNodeHandler (pod creation). Dominant component of V2 task latency. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Time spent in RecursiveNodeHandler (pod creation). Dominant component of V2 task latency. 
[Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2396,27 +2413,38 @@ data: "type": "timeseries", "targets": [ { - "expr": "rate(executor:system_failures{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:system_failures{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "System failures", "refId": "A" }, { - "expr": "rate(executor:system_failures_exhausted{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:system_failures_exhausted{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Exhausted retries", "refId": "B" }, { - "expr": "rate(executor:invalid_leases{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:invalid_leases{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Invalid leases", "refId": "C" }, { - "expr": "rate(executor:evaluator:evaluate_errors{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:evaluator:evaluate_errors{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Evaluate errors", "refId": "D" } ], - "description": "Executor error rates. System failures retry; exhausted means task permanently failed. Invalid leases = malformed from queue service. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Executor error rates. System failures retry; exhausted means task permanently failed. Invalid leases = malformed from queue service. 
[Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -2426,10 +2454,10 @@ data: "h": 1, "w": 24, "x": 0, - "y": 33 + "y": 34 }, - "id": 100, - "title": "Flyte Propeller (V1)", + "id": 400, + "title": "gRPC Client (DP \u2192 CP)", "type": "row", "panels": [ { @@ -2448,36 +2476,37 @@ data: "lineWidth": 1, "showPoints": "never" }, - "unit": "ms" + "unit": "reqps" } }, "gridPos": { "h": 8, - "w": 8, + "w": 12, "x": 0, - "y": 12 + "y": 13 }, - "id": 101, - "title": "Round Time (p50 / p90 / p99)", + "id": 401, + "title": "gRPC Client Request Rate", "type": "timeseries", "targets": [ { - "expr": "flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.5\"}", - "legendFormat": "p50", + "expr": "sum by (grpc_service, grpc_method) (rate(grpc_client_handled_total{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "{{ grpc_service }}/{{ grpc_method }}", "refId": "A" - }, - { - "expr": "flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.9\"}", - "legendFormat": "p90", - "refId": "B" - }, - { - "expr": "flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.99\"}", - "legendFormat": "p99", - "refId": "C" } ], - "description": "Propeller reconciliation round time. One round = one FlyteWorkflow CRD processed." + "description": "DP\u2192CP gRPC client request rate by service and method (e.g. 
CreateWorkflowEvent).", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2495,36 +2524,37 @@ data: "lineWidth": 1, "showPoints": "never" }, - "unit": "ops" + "unit": "reqps" } }, "gridPos": { "h": 8, - "w": 8, - "x": 8, - "y": 12 + "w": 12, + "x": 12, + "y": 13 }, - "id": 102, - "title": "Round Success / Error Rate", + "id": 402, + "title": "gRPC Client Error Rate", "type": "timeseries", "targets": [ { - "expr": "rate(flyte:propeller:all:round:success_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Success", + "expr": "sum by (grpc_service, grpc_method, grpc_code) (rate(grpc_client_handled_total{namespace=\"$namespace\", grpc_code!=\"OK\"}[$__rate_interval]))", + "legendFormat": "{{ grpc_method }} {{ grpc_code }}", "refId": "A" - }, - { - "expr": "rate(flyte:propeller:all:round:error_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Errors", - "refId": "B" - }, - { - "expr": "rate(flyte:propeller:all:round:panic_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Panics", - "refId": "C" } ], - "description": "Propeller round outcomes: success, errors, and panics per second." + "description": "DP\u2192CP gRPC errors by method and code. 
Non-OK responses from the control plane.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2542,564 +2572,37 @@ data: "lineWidth": 1, "showPoints": "never" }, - "unit": "short" + "unit": "s" } }, "gridPos": { "h": 8, - "w": 8, - "x": 16, - "y": 12 + "w": 24, + "x": 0, + "y": 21 }, - "id": 103, - "title": "Free Workers", + "id": 403, + "title": "gRPC Client Latency (p95)", "type": "timeseries", "targets": [ { - "expr": "sum(flyte:propeller:all:free_workers_count{namespace=\"$namespace\"})", - "legendFormat": "Free workers", + "expr": "histogram_quantile(0.95, sum by (le, grpc_method) (rate(grpc_client_handling_seconds_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "{{ grpc_method }} p95", "refId": "A" } ], - "description": "Idle propeller worker goroutines. Zero = all workers busy processing workflows." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ops" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 20 - }, - "id": 104, - "title": "Queue Add Rate", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:propeller:all:main_adds{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Main adds", - "refId": "A" - }, - { - "expr": "rate(flyte:propeller:all:sub_adds{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Sub adds", - "refId": "B" - }, - { - "expr": "rate(flyte:propeller:all:main_retries{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Main retries", - "refId": "C" - } - ], - "description": "Items enqueued to propeller's main/sub workqueues, plus retry rate." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 8, - "y": 20 - }, - "id": 105, - "title": "Workflow Updates", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:propeller:all:wf_updated{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Updated", - "refId": "A" - }, - { - "expr": "rate(flyte:propeller:all:wf_update_failed{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Failed", - "refId": "B" - }, - { - "expr": "rate(flyte:propeller:all:wf_too_large{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Too large", - "refId": "C" - }, - { - "expr": "rate(flyte:propeller:all:wf_update_conflict{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Conflict", - "refId": "D" - } - ], - "description": "etcd write outcomes: successful updates, failures, too-large (>1.5MB), optimistic concurrency conflicts." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ms" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 20 - }, - "id": 106, - "title": "Workflow Update Latency", - "type": "timeseries", - "targets": [ - { - "expr": "flyte:propeller:all:wf_update_latency_ms{namespace=\"$namespace\", quantile=\"0.5\"}", - "legendFormat": "p50", - "refId": "A" - }, - { - "expr": "flyte:propeller:all:wf_update_latency_ms{namespace=\"$namespace\", quantile=\"0.9\"}", - "legendFormat": "p90", - "refId": "B" - }, - { - "expr": "flyte:propeller:all:wf_update_latency_ms{namespace=\"$namespace\", quantile=\"0.99\"}", - "legendFormat": "p99", - "refId": "C" - } - ], - "description": "etcd write latency for FlyteWorkflow status updates." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ms" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 28 - }, - "id": 107, - "title": "Node Queueing & Execution Latency", - "type": "timeseries", - "targets": [ - { - "expr": "flyte:propeller:all:node:queueing_latency_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.5\"}", - "legendFormat": "Queue p50", - "refId": "A" - }, - { - "expr": "flyte:propeller:all:node:queueing_latency_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.9\"}", - "legendFormat": "Queue p90", - "refId": "B" - }, - { - "expr": "flyte:propeller:all:node:node_exec_latency_unlabeled_us{namespace=\"$namespace\", quantile=\"0.9\"} / 1000", - "legendFormat": "Exec p90 (ms)", - "refId": "C" - } - ], - "description": "Node lifecycle: queueing latency 
(queued\u2192running) and execution latency (time in handler)." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "percentunit" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 28 - }, - "id": 108, - "title": "Metastore Cache Hit Rate", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:propeller:all:metastore:cache_hit{namespace=\"$namespace\"}[$__rate_interval]) / (rate(flyte:propeller:all:metastore:cache_hit{namespace=\"$namespace\"}[$__rate_interval]) + rate(flyte:propeller:all:metastore:cache_miss{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Hit rate", - "refId": "A" - } - ], - "description": "In-memory cache effectiveness for object store (S3/GCS) reads. Low hit rate = excessive storage calls." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ops" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 36 - }, - "id": 109, - "title": "Event Recording (DP \u2192 CP)", - "type": "timeseries", - "targets": [ - { - "expr": "sum(rate(flyte:propeller:all:task:event_recording:success_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Task success", - "refId": "A" - }, - { - "expr": "sum(rate(flyte:propeller:all:node:event_recording:success_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Node success", - "refId": "B" - }, - { - "expr": "sum(rate(flyte:propeller:all:task:event_recording:failure_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Task failure", - 
"refId": "C" - }, - { - "expr": "sum(rate(flyte:propeller:all:node:event_recording:failure_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Node failure", - "refId": "D" - } - ], - "description": "Task/node event recording rate from propeller to FlyteAdmin. Failures indicate CP connectivity issues." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 36 - }, - "id": 110, - "title": "Cache Discovery (hit/miss/skip)", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:propeller:all:discovery_hit_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Hits", - "refId": "A" - }, - { - "expr": "rate(flyte:propeller:all:discovery_miss_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Misses", - "refId": "B" - }, - { - "expr": "rate(flyte:propeller:all:discovery_skip_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Skips", - "refId": "C" - }, - { - "expr": "rate(flyte:propeller:all:discovery_get_failure_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Get failures", - "refId": "D" - } - ], - "description": "V2 executor cache discovery miss/put rates for task output caching." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "reqps" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 44 - }, - "id": 111, - "title": "K8s API Client Rate", - "type": "timeseries", - "targets": [ - { - "expr": "sum(rate(k8s_client_request_total_unlabeled{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "K8s requests/s", - "refId": "A" - } - ], - "description": "Propeller's K8s API request rate. High rates may indicate excessive pod watches or creates." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "s" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 44 - }, - "id": 112, - "title": "K8s API Client Latency (p90)", - "type": "timeseries", - "targets": [ - { - "expr": "histogram_quantile(0.90, sum by (le) (rate(k8s_client_request_latency_unlabeled_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "Request p90", - "refId": "A" - }, - { - "expr": "histogram_quantile(0.90, sum by (le) (rate(k8s_client_rate_limiter_latency_unlabeled_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "Rate limiter p90", - "refId": "B" - } - ], - "description": "K8s API request latency and client-side rate limiter wait time at p90." 
- } - ] - }, - { - "collapsed": true, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 34 - }, - "id": 400, - "title": "gRPC Client (DP \u2192 CP)", - "type": "row", - "panels": [ - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "reqps" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 13 - }, - "id": 401, - "title": "gRPC Client Request Rate", - "type": "timeseries", - "targets": [ - { - "expr": "sum by (grpc_service, grpc_method) (rate(grpc_client_handled_total{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "{{ grpc_service }}/{{ grpc_method }}", - "refId": "A" - } - ], - "description": "DP\u2192CP gRPC client request rate by service and method (e.g. CreateWorkflowEvent)." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "reqps" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 13 - }, - "id": 402, - "title": "gRPC Client Error Rate", - "type": "timeseries", - "targets": [ - { - "expr": "sum by (grpc_service, grpc_method, grpc_code) (rate(grpc_client_handled_total{namespace=\"$namespace\", grpc_code!=\"OK\"}[$__rate_interval]))", - "legendFormat": "{{ grpc_method }} {{ grpc_code }}", - "refId": "A" - } - ], - "description": "DP\u2192CP gRPC errors by method and code. Non-OK responses from the control plane." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "s" - } - }, - "gridPos": { - "h": 8, - "w": 24, - "x": 0, - "y": 21 - }, - "id": 403, - "title": "gRPC Client Latency (p95)", - "type": "timeseries", - "targets": [ - { - "expr": "histogram_quantile(0.95, sum by (le, grpc_method) (rate(grpc_client_handling_seconds_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "{{ grpc_method }} p95", - "refId": "A" + "description": "DP\u2192CP gRPC call latency at p95. High latency = slow control plane or network issues.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] } - ], - "description": "DP\u2192CP gRPC call latency at p95. High latency = slow control plane or network issues." + } } ] }, @@ -3153,7 +2656,18 @@ data: "refId": "A" } ], - "description": "CPU usage in cores per container, stacked. Identifies resource-heavy services." + "description": "CPU usage in cores per container, stacked. Identifies resource-heavy services.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3193,7 +2707,18 @@ data: "refId": "A" } ], - "description": "Working set memory per container. Watch for approaching limits." + "description": "Working set memory per container. Watch for approaching limits.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3230,7 +2755,18 @@ data: "refId": "A" } ], - "description": "Per-container restart events. Spikes indicate crashes or OOM kills." + "description": "Per-container restart events. 
Spikes indicate crashes or OOM kills.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] } @@ -3282,7 +2818,7 @@ data: "timepicker": {}, "timezone": "browser", "title": "Union Dataplane Overview", - "uid": "union-dp-overview", + "uid": "union-dataplane-overview", "version": 1 } --- diff --git a/tests/generated/dataplane.aws.with-ingress.yaml b/tests/generated/dataplane.aws.with-ingress.yaml index 77bb8a6b..e35afb81 100644 --- a/tests/generated/dataplane.aws.with-ingress.yaml +++ b/tests/generated/dataplane.aws.with-ingress.yaml @@ -1171,63 +1171,6 @@ data: ], "description": "Total container restarts in the last hour. Non-zero indicates crashlooping or OOM kills." }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "thresholds": { - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 10 - }, - { - "color": "red", - "value": 50 - } - ] - }, - "unit": "short" - } - }, - "gridPos": { - "h": 4, - "w": 6, - "x": 18, - "y": 1 - }, - "id": 8, - "options": { - "colorMode": "background", - "graphMode": "area", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ] - }, - "textMode": "auto" - }, - "title": "Active Workflows", - "type": "stat", - "targets": [ - { - "expr": "sum(flyte:propeller:all:execstats:active_workflow_executions{namespace=\"$namespace\"})", - "legendFormat": "Workflows", - "refId": "A" - } - ], - "description": "Current active FlyteWorkflow CRD count managed by Propeller." - }, { "datasource": { "type": "prometheus", @@ -1257,8 +1200,9 @@ data: "options": { "legend": { "calcs": [ - "mean", - "max" + "min", + "max", + "lastNotNull" ], "displayMode": "table", "placement": "bottom" @@ -1289,59 +1233,6 @@ data: ], "description": "Current count of active workflow, node, and task executions tracked by FlyteAdmin." 
}, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 6, - "w": 12, - "x": 12, - "y": 5 - }, - "id": 7, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom" - }, - "tooltip": { - "mode": "multi", - "sort": "desc" - } - }, - "title": "Queue Depth", - "type": "timeseries", - "targets": [ - { - "expr": "sum(flyte:propeller:all:main_depth{namespace=\"$namespace\"})", - "legendFormat": "Main", - "refId": "A" - }, - { - "expr": "sum(flyte:propeller:all:sub_depth{namespace=\"$namespace\"})", - "legendFormat": "Sub", - "refId": "B" - } - ], - "description": "Main and sub workqueue depth over time." - }, { "datasource": { "type": "prometheus", @@ -1627,7 +1518,7 @@ data: "y": 10 }, "id": 1204, - "title": "Propeller Latency p99", + "title": "Executor Evaluate Duration p99", "type": "stat", "options": { "colorMode": "background", @@ -1641,8 +1532,13 @@ data: }, "targets": [ { - "expr": "(union:dp:slo:propeller_round_latency_p99 or flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.99\"} / 1000)", - "refId": "A" + "expr": "executor::v2:evaluator:evaluate_duration_ms{namespace=\"$namespace\", quantile=\"0.99\"}", + "legendFormat": "p99", + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + } } ], "description": "Propeller round p99 latency in seconds. Falls back to raw metric if SLO rules not enabled." @@ -1687,7 +1583,18 @@ data: "refId": "B" } ], - "description": "DP service availability over time with SLO target line." 
+ "description": "DP service availability over time with SLO target line.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1731,7 +1638,18 @@ data: "refId": "B" } ], - "description": "DP error budget remaining over time. Requires monitoring.slos.enabled." + "description": "DP error budget remaining over time. Requires monitoring.slos.enabled.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "collapsed": true, @@ -1785,7 +1703,18 @@ data: "refId": "B" } ], - "description": "Operator execution operation processing rate and failure rate." + "description": "Operator execution operation processing rate and failure rate.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1842,7 +1771,18 @@ data: "refId": "E" } ], - "description": "FlyteAdmin error rates: propeller communication failures, model transform errors, notification publish failures." + "description": "FlyteAdmin error rates: propeller communication failures, model transform errors, notification publish failures.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1889,7 +1829,18 @@ data: "refId": "C" } ], - "description": "Breakdown of operator heartbeat computation: capabilities, usages, list workflows." 
+ "description": "Breakdown of operator heartbeat computation: capabilities, usages, list workflows.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1936,7 +1887,18 @@ data: "refId": "C" } ], - "description": "Config sync cycle rate, errors, and propeller ConfigMap update count." + "description": "Config sync cycle rate, errors, and propeller ConfigMap update count.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1978,7 +1940,18 @@ data: "refId": "B" } ], - "description": "Billing collection cycle rate and errors. Failures mean billing data may be delayed." + "description": "Billing collection cycle rate and errors. Failures mean billing data may be delayed.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2077,17 +2050,28 @@ data: "type": "timeseries", "targets": [ { - "expr": "executor:active_actions_count{namespace=\"$namespace\"}", + "expr": "executor::v2:active_actions_count{namespace=\"$namespace\"}", "legendFormat": "Active actions", "refId": "A" }, { - "expr": "executor:available_capacity{namespace=\"$namespace\"}", + "expr": "executor::v2:available_capacity{namespace=\"$namespace\"}", "legendFormat": "Available capacity", "refId": "B" } ], - "description": "V2 executor active actions vs available capacity. Capacity=0 means executor is saturated." + "description": "V2 executor active actions vs available capacity. 
Capacity=0 means executor is saturated.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2119,17 +2103,28 @@ data: "type": "timeseries", "targets": [ { - "expr": "rate(executor:discovery_miss_count{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:discovery_miss_count{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Miss", "refId": "A" }, { - "expr": "rate(executor:discovery_put_success_count{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:discovery_put_success_count{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Put success", "refId": "B" } ], - "description": "V2 executor cache discovery miss/put rates for task output caching." + "description": "V2 executor cache discovery miss/put rates for task output caching.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2164,12 +2159,23 @@ data: "type": "timeseries", "targets": [ { - "expr": "sum by (phase) (rate(executor:actions_terminated{namespace=\"$namespace\"}[$__rate_interval]))", + "expr": "sum by (phase) (rate(executor::v2:actions_terminated{namespace=\"$namespace\"}[$__rate_interval]))", "legendFormat": "{{ phase }}", "refId": "A" } ], - "description": "Task completion rate by outcome: succeeded, failed, aborted. Key V2 SLI for task health. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Task completion rate by outcome: succeeded, failed, aborted. Key V2 SLI for task health. 
[Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2201,22 +2207,33 @@ data: "type": "timeseries", "targets": [ { - "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.5\"}", + "expr": "executor::v2:evaluator:evaluate_duration_ms{namespace=\"$namespace\", quantile=\"0.5\"}", "legendFormat": "Evaluate p50", "refId": "A" }, { - "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.9\"}", + "expr": "executor::v2:evaluator:evaluate_duration_ms{namespace=\"$namespace\", quantile=\"0.9\"}", "legendFormat": "Evaluate p90", "refId": "B" }, { - "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.99\"}", + "expr": "executor::v2:evaluator:evaluate_duration_ms{namespace=\"$namespace\", quantile=\"0.99\"}", "legendFormat": "Evaluate p99", "refId": "C" } ], - "description": "Time spent in RecursiveNodeHandler (pod creation). Dominant component of V2 task latency. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Time spent in RecursiveNodeHandler (pod creation). Dominant component of V2 task latency. 
[Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2248,27 +2265,38 @@ data: "type": "timeseries", "targets": [ { - "expr": "rate(executor:system_failures{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:system_failures{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "System failures", "refId": "A" }, { - "expr": "rate(executor:system_failures_exhausted{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:system_failures_exhausted{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Exhausted retries", "refId": "B" }, { - "expr": "rate(executor:invalid_leases{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:invalid_leases{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Invalid leases", "refId": "C" }, { - "expr": "rate(executor:evaluator:evaluate_errors{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:evaluator:evaluate_errors{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Evaluate errors", "refId": "D" } ], - "description": "Executor error rates. System failures retry; exhausted means task permanently failed. Invalid leases = malformed from queue service. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Executor error rates. System failures retry; exhausted means task permanently failed. Invalid leases = malformed from queue service. 
[Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -2278,10 +2306,10 @@ data: "h": 1, "w": 24, "x": 0, - "y": 33 + "y": 34 }, - "id": 100, - "title": "Flyte Propeller (V1)", + "id": 400, + "title": "gRPC Client (DP \u2192 CP)", "type": "row", "panels": [ { @@ -2300,36 +2328,37 @@ data: "lineWidth": 1, "showPoints": "never" }, - "unit": "ms" + "unit": "reqps" } }, "gridPos": { "h": 8, - "w": 8, + "w": 12, "x": 0, - "y": 12 + "y": 13 }, - "id": 101, - "title": "Round Time (p50 / p90 / p99)", + "id": 401, + "title": "gRPC Client Request Rate", "type": "timeseries", "targets": [ { - "expr": "flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.5\"}", - "legendFormat": "p50", + "expr": "sum by (grpc_service, grpc_method) (rate(grpc_client_handled_total{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "{{ grpc_service }}/{{ grpc_method }}", "refId": "A" - }, - { - "expr": "flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.9\"}", - "legendFormat": "p90", - "refId": "B" - }, - { - "expr": "flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.99\"}", - "legendFormat": "p99", - "refId": "C" } ], - "description": "Propeller reconciliation round time. One round = one FlyteWorkflow CRD processed." + "description": "DP\u2192CP gRPC client request rate by service and method (e.g. 
CreateWorkflowEvent).", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2347,36 +2376,37 @@ data: "lineWidth": 1, "showPoints": "never" }, - "unit": "ops" + "unit": "reqps" } }, "gridPos": { "h": 8, - "w": 8, - "x": 8, - "y": 12 + "w": 12, + "x": 12, + "y": 13 }, - "id": 102, - "title": "Round Success / Error Rate", + "id": 402, + "title": "gRPC Client Error Rate", "type": "timeseries", "targets": [ { - "expr": "rate(flyte:propeller:all:round:success_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Success", + "expr": "sum by (grpc_service, grpc_method, grpc_code) (rate(grpc_client_handled_total{namespace=\"$namespace\", grpc_code!=\"OK\"}[$__rate_interval]))", + "legendFormat": "{{ grpc_method }} {{ grpc_code }}", "refId": "A" - }, - { - "expr": "rate(flyte:propeller:all:round:error_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Errors", - "refId": "B" - }, - { - "expr": "rate(flyte:propeller:all:round:panic_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Panics", - "refId": "C" } ], - "description": "Propeller round outcomes: success, errors, and panics per second." + "description": "DP\u2192CP gRPC errors by method and code. 
Non-OK responses from the control plane.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2394,564 +2424,37 @@ data: "lineWidth": 1, "showPoints": "never" }, - "unit": "short" + "unit": "s" } }, "gridPos": { "h": 8, - "w": 8, - "x": 16, - "y": 12 + "w": 24, + "x": 0, + "y": 21 }, - "id": 103, - "title": "Free Workers", + "id": 403, + "title": "gRPC Client Latency (p95)", "type": "timeseries", "targets": [ { - "expr": "sum(flyte:propeller:all:free_workers_count{namespace=\"$namespace\"})", - "legendFormat": "Free workers", + "expr": "histogram_quantile(0.95, sum by (le, grpc_method) (rate(grpc_client_handling_seconds_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "{{ grpc_method }} p95", "refId": "A" } ], - "description": "Idle propeller worker goroutines. Zero = all workers busy processing workflows." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ops" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 20 - }, - "id": 104, - "title": "Queue Add Rate", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:propeller:all:main_adds{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Main adds", - "refId": "A" - }, - { - "expr": "rate(flyte:propeller:all:sub_adds{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Sub adds", - "refId": "B" - }, - { - "expr": "rate(flyte:propeller:all:main_retries{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Main retries", - "refId": "C" - } - ], - "description": "Items enqueued to propeller's main/sub workqueues, plus retry rate." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 8, - "y": 20 - }, - "id": 105, - "title": "Workflow Updates", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:propeller:all:wf_updated{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Updated", - "refId": "A" - }, - { - "expr": "rate(flyte:propeller:all:wf_update_failed{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Failed", - "refId": "B" - }, - { - "expr": "rate(flyte:propeller:all:wf_too_large{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Too large", - "refId": "C" - }, - { - "expr": "rate(flyte:propeller:all:wf_update_conflict{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Conflict", - "refId": "D" - } - ], - "description": "etcd write outcomes: successful updates, failures, too-large (>1.5MB), optimistic concurrency conflicts." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ms" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 20 - }, - "id": 106, - "title": "Workflow Update Latency", - "type": "timeseries", - "targets": [ - { - "expr": "flyte:propeller:all:wf_update_latency_ms{namespace=\"$namespace\", quantile=\"0.5\"}", - "legendFormat": "p50", - "refId": "A" - }, - { - "expr": "flyte:propeller:all:wf_update_latency_ms{namespace=\"$namespace\", quantile=\"0.9\"}", - "legendFormat": "p90", - "refId": "B" - }, - { - "expr": "flyte:propeller:all:wf_update_latency_ms{namespace=\"$namespace\", quantile=\"0.99\"}", - "legendFormat": "p99", - "refId": "C" - } - ], - "description": "etcd write latency for FlyteWorkflow status updates." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ms" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 28 - }, - "id": 107, - "title": "Node Queueing & Execution Latency", - "type": "timeseries", - "targets": [ - { - "expr": "flyte:propeller:all:node:queueing_latency_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.5\"}", - "legendFormat": "Queue p50", - "refId": "A" - }, - { - "expr": "flyte:propeller:all:node:queueing_latency_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.9\"}", - "legendFormat": "Queue p90", - "refId": "B" - }, - { - "expr": "flyte:propeller:all:node:node_exec_latency_unlabeled_us{namespace=\"$namespace\", quantile=\"0.9\"} / 1000", - "legendFormat": "Exec p90 (ms)", - "refId": "C" - } - ], - "description": "Node lifecycle: queueing latency 
(queued\u2192running) and execution latency (time in handler)." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "percentunit" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 28 - }, - "id": 108, - "title": "Metastore Cache Hit Rate", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:propeller:all:metastore:cache_hit{namespace=\"$namespace\"}[$__rate_interval]) / (rate(flyte:propeller:all:metastore:cache_hit{namespace=\"$namespace\"}[$__rate_interval]) + rate(flyte:propeller:all:metastore:cache_miss{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Hit rate", - "refId": "A" - } - ], - "description": "In-memory cache effectiveness for object store (S3/GCS) reads. Low hit rate = excessive storage calls." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ops" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 36 - }, - "id": 109, - "title": "Event Recording (DP \u2192 CP)", - "type": "timeseries", - "targets": [ - { - "expr": "sum(rate(flyte:propeller:all:task:event_recording:success_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Task success", - "refId": "A" - }, - { - "expr": "sum(rate(flyte:propeller:all:node:event_recording:success_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Node success", - "refId": "B" - }, - { - "expr": "sum(rate(flyte:propeller:all:task:event_recording:failure_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Task failure", - 
"refId": "C" - }, - { - "expr": "sum(rate(flyte:propeller:all:node:event_recording:failure_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Node failure", - "refId": "D" - } - ], - "description": "Task/node event recording rate from propeller to FlyteAdmin. Failures indicate CP connectivity issues." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 36 - }, - "id": 110, - "title": "Cache Discovery (hit/miss/skip)", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:propeller:all:discovery_hit_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Hits", - "refId": "A" - }, - { - "expr": "rate(flyte:propeller:all:discovery_miss_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Misses", - "refId": "B" - }, - { - "expr": "rate(flyte:propeller:all:discovery_skip_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Skips", - "refId": "C" - }, - { - "expr": "rate(flyte:propeller:all:discovery_get_failure_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Get failures", - "refId": "D" - } - ], - "description": "V2 executor cache discovery miss/put rates for task output caching." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "reqps" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 44 - }, - "id": 111, - "title": "K8s API Client Rate", - "type": "timeseries", - "targets": [ - { - "expr": "sum(rate(k8s_client_request_total_unlabeled{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "K8s requests/s", - "refId": "A" - } - ], - "description": "Propeller's K8s API request rate. High rates may indicate excessive pod watches or creates." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "s" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 44 - }, - "id": 112, - "title": "K8s API Client Latency (p90)", - "type": "timeseries", - "targets": [ - { - "expr": "histogram_quantile(0.90, sum by (le) (rate(k8s_client_request_latency_unlabeled_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "Request p90", - "refId": "A" - }, - { - "expr": "histogram_quantile(0.90, sum by (le) (rate(k8s_client_rate_limiter_latency_unlabeled_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "Rate limiter p90", - "refId": "B" - } - ], - "description": "K8s API request latency and client-side rate limiter wait time at p90." 
- } - ] - }, - { - "collapsed": true, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 34 - }, - "id": 400, - "title": "gRPC Client (DP \u2192 CP)", - "type": "row", - "panels": [ - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "reqps" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 13 - }, - "id": 401, - "title": "gRPC Client Request Rate", - "type": "timeseries", - "targets": [ - { - "expr": "sum by (grpc_service, grpc_method) (rate(grpc_client_handled_total{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "{{ grpc_service }}/{{ grpc_method }}", - "refId": "A" - } - ], - "description": "DP\u2192CP gRPC client request rate by service and method (e.g. CreateWorkflowEvent)." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "reqps" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 13 - }, - "id": 402, - "title": "gRPC Client Error Rate", - "type": "timeseries", - "targets": [ - { - "expr": "sum by (grpc_service, grpc_method, grpc_code) (rate(grpc_client_handled_total{namespace=\"$namespace\", grpc_code!=\"OK\"}[$__rate_interval]))", - "legendFormat": "{{ grpc_method }} {{ grpc_code }}", - "refId": "A" - } - ], - "description": "DP\u2192CP gRPC errors by method and code. Non-OK responses from the control plane." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "s" - } - }, - "gridPos": { - "h": 8, - "w": 24, - "x": 0, - "y": 21 - }, - "id": 403, - "title": "gRPC Client Latency (p95)", - "type": "timeseries", - "targets": [ - { - "expr": "histogram_quantile(0.95, sum by (le, grpc_method) (rate(grpc_client_handling_seconds_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "{{ grpc_method }} p95", - "refId": "A" + "description": "DP\u2192CP gRPC call latency at p95. High latency = slow control plane or network issues.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] } - ], - "description": "DP\u2192CP gRPC call latency at p95. High latency = slow control plane or network issues." + } } ] }, @@ -3005,7 +2508,18 @@ data: "refId": "A" } ], - "description": "CPU usage in cores per container, stacked. Identifies resource-heavy services." + "description": "CPU usage in cores per container, stacked. Identifies resource-heavy services.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3045,7 +2559,18 @@ data: "refId": "A" } ], - "description": "Working set memory per container. Watch for approaching limits." + "description": "Working set memory per container. Watch for approaching limits.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3082,7 +2607,18 @@ data: "refId": "A" } ], - "description": "Per-container restart events. Spikes indicate crashes or OOM kills." + "description": "Per-container restart events. 
Spikes indicate crashes or OOM kills.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] } @@ -3134,7 +2670,7 @@ data: "timepicker": {}, "timezone": "browser", "title": "Union Dataplane Overview", - "uid": "union-dp-overview", + "uid": "union-dataplane-overview", "version": 1 } --- diff --git a/tests/generated/dataplane.aws.yaml b/tests/generated/dataplane.aws.yaml index 6bc527d9..df9b6b63 100644 --- a/tests/generated/dataplane.aws.yaml +++ b/tests/generated/dataplane.aws.yaml @@ -1294,63 +1294,6 @@ data: ], "description": "Total container restarts in the last hour. Non-zero indicates crashlooping or OOM kills." }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "thresholds": { - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 10 - }, - { - "color": "red", - "value": 50 - } - ] - }, - "unit": "short" - } - }, - "gridPos": { - "h": 4, - "w": 6, - "x": 18, - "y": 1 - }, - "id": 8, - "options": { - "colorMode": "background", - "graphMode": "area", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ] - }, - "textMode": "auto" - }, - "title": "Active Workflows", - "type": "stat", - "targets": [ - { - "expr": "sum(flyte:propeller:all:execstats:active_workflow_executions{namespace=\"$namespace\"})", - "legendFormat": "Workflows", - "refId": "A" - } - ], - "description": "Current active FlyteWorkflow CRD count managed by Propeller." - }, { "datasource": { "type": "prometheus", @@ -1380,8 +1323,9 @@ data: "options": { "legend": { "calcs": [ - "mean", - "max" + "min", + "max", + "lastNotNull" ], "displayMode": "table", "placement": "bottom" @@ -1412,59 +1356,6 @@ data: ], "description": "Current count of active workflow, node, and task executions tracked by FlyteAdmin." 
}, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 6, - "w": 12, - "x": 12, - "y": 5 - }, - "id": 7, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom" - }, - "tooltip": { - "mode": "multi", - "sort": "desc" - } - }, - "title": "Queue Depth", - "type": "timeseries", - "targets": [ - { - "expr": "sum(flyte:propeller:all:main_depth{namespace=\"$namespace\"})", - "legendFormat": "Main", - "refId": "A" - }, - { - "expr": "sum(flyte:propeller:all:sub_depth{namespace=\"$namespace\"})", - "legendFormat": "Sub", - "refId": "B" - } - ], - "description": "Main and sub workqueue depth over time." - }, { "datasource": { "type": "prometheus", @@ -1750,7 +1641,7 @@ data: "y": 10 }, "id": 1204, - "title": "Propeller Latency p99", + "title": "Executor Evaluate Duration p99", "type": "stat", "options": { "colorMode": "background", @@ -1764,8 +1655,13 @@ data: }, "targets": [ { - "expr": "(union:dp:slo:propeller_round_latency_p99 or flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.99\"} / 1000)", - "refId": "A" + "expr": "executor::v2:evaluator:evaluate_duration_ms{namespace=\"$namespace\", quantile=\"0.99\"}", + "legendFormat": "p99", + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + } } ], "description": "Propeller round p99 latency in seconds. Falls back to raw metric if SLO rules not enabled." @@ -1810,7 +1706,18 @@ data: "refId": "B" } ], - "description": "DP service availability over time with SLO target line." 
+ "description": "DP service availability over time with SLO target line.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1854,7 +1761,18 @@ data: "refId": "B" } ], - "description": "DP error budget remaining over time. Requires monitoring.slos.enabled." + "description": "DP error budget remaining over time. Requires monitoring.slos.enabled.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "collapsed": true, @@ -1908,7 +1826,18 @@ data: "refId": "B" } ], - "description": "Operator execution operation processing rate and failure rate." + "description": "Operator execution operation processing rate and failure rate.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1965,7 +1894,18 @@ data: "refId": "E" } ], - "description": "FlyteAdmin error rates: propeller communication failures, model transform errors, notification publish failures." + "description": "FlyteAdmin error rates: propeller communication failures, model transform errors, notification publish failures.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2012,7 +1952,18 @@ data: "refId": "C" } ], - "description": "Breakdown of operator heartbeat computation: capabilities, usages, list workflows." 
+ "description": "Breakdown of operator heartbeat computation: capabilities, usages, list workflows.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2059,7 +2010,18 @@ data: "refId": "C" } ], - "description": "Config sync cycle rate, errors, and propeller ConfigMap update count." + "description": "Config sync cycle rate, errors, and propeller ConfigMap update count.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2101,7 +2063,18 @@ data: "refId": "B" } ], - "description": "Billing collection cycle rate and errors. Failures mean billing data may be delayed." + "description": "Billing collection cycle rate and errors. Failures mean billing data may be delayed.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2200,17 +2173,28 @@ data: "type": "timeseries", "targets": [ { - "expr": "executor:active_actions_count{namespace=\"$namespace\"}", + "expr": "executor::v2:active_actions_count{namespace=\"$namespace\"}", "legendFormat": "Active actions", "refId": "A" }, { - "expr": "executor:available_capacity{namespace=\"$namespace\"}", + "expr": "executor::v2:available_capacity{namespace=\"$namespace\"}", "legendFormat": "Available capacity", "refId": "B" } ], - "description": "V2 executor active actions vs available capacity. Capacity=0 means executor is saturated." + "description": "V2 executor active actions vs available capacity. 
Capacity=0 means executor is saturated.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2242,17 +2226,28 @@ data: "type": "timeseries", "targets": [ { - "expr": "rate(executor:discovery_miss_count{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:discovery_miss_count{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Miss", "refId": "A" }, { - "expr": "rate(executor:discovery_put_success_count{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:discovery_put_success_count{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Put success", "refId": "B" } ], - "description": "V2 executor cache discovery miss/put rates for task output caching." + "description": "V2 executor cache discovery miss/put rates for task output caching.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2287,12 +2282,23 @@ data: "type": "timeseries", "targets": [ { - "expr": "sum by (phase) (rate(executor:actions_terminated{namespace=\"$namespace\"}[$__rate_interval]))", + "expr": "sum by (phase) (rate(executor::v2:actions_terminated{namespace=\"$namespace\"}[$__rate_interval]))", "legendFormat": "{{ phase }}", "refId": "A" } ], - "description": "Task completion rate by outcome: succeeded, failed, aborted. Key V2 SLI for task health. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Task completion rate by outcome: succeeded, failed, aborted. Key V2 SLI for task health. 
[Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2324,22 +2330,33 @@ data: "type": "timeseries", "targets": [ { - "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.5\"}", + "expr": "executor::v2:evaluator:evaluate_duration_ms{namespace=\"$namespace\", quantile=\"0.5\"}", "legendFormat": "Evaluate p50", "refId": "A" }, { - "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.9\"}", + "expr": "executor::v2:evaluator:evaluate_duration_ms{namespace=\"$namespace\", quantile=\"0.9\"}", "legendFormat": "Evaluate p90", "refId": "B" }, { - "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.99\"}", + "expr": "executor::v2:evaluator:evaluate_duration_ms{namespace=\"$namespace\", quantile=\"0.99\"}", "legendFormat": "Evaluate p99", "refId": "C" } ], - "description": "Time spent in RecursiveNodeHandler (pod creation). Dominant component of V2 task latency. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Time spent in RecursiveNodeHandler (pod creation). Dominant component of V2 task latency. 
[Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2371,27 +2388,38 @@ data: "type": "timeseries", "targets": [ { - "expr": "rate(executor:system_failures{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:system_failures{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "System failures", "refId": "A" }, { - "expr": "rate(executor:system_failures_exhausted{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:system_failures_exhausted{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Exhausted retries", "refId": "B" }, { - "expr": "rate(executor:invalid_leases{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:invalid_leases{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Invalid leases", "refId": "C" }, { - "expr": "rate(executor:evaluator:evaluate_errors{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:evaluator:evaluate_errors{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Evaluate errors", "refId": "D" } ], - "description": "Executor error rates. System failures retry; exhausted means task permanently failed. Invalid leases = malformed from queue service. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Executor error rates. System failures retry; exhausted means task permanently failed. Invalid leases = malformed from queue service. 
[Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -2401,10 +2429,10 @@ data: "h": 1, "w": 24, "x": 0, - "y": 33 + "y": 34 }, - "id": 100, - "title": "Flyte Propeller (V1)", + "id": 400, + "title": "gRPC Client (DP \u2192 CP)", "type": "row", "panels": [ { @@ -2423,36 +2451,37 @@ data: "lineWidth": 1, "showPoints": "never" }, - "unit": "ms" + "unit": "reqps" } }, "gridPos": { "h": 8, - "w": 8, + "w": 12, "x": 0, - "y": 12 + "y": 13 }, - "id": 101, - "title": "Round Time (p50 / p90 / p99)", + "id": 401, + "title": "gRPC Client Request Rate", "type": "timeseries", "targets": [ { - "expr": "flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.5\"}", - "legendFormat": "p50", + "expr": "sum by (grpc_service, grpc_method) (rate(grpc_client_handled_total{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "{{ grpc_service }}/{{ grpc_method }}", "refId": "A" - }, - { - "expr": "flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.9\"}", - "legendFormat": "p90", - "refId": "B" - }, - { - "expr": "flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.99\"}", - "legendFormat": "p99", - "refId": "C" } ], - "description": "Propeller reconciliation round time. One round = one FlyteWorkflow CRD processed." + "description": "DP\u2192CP gRPC client request rate by service and method (e.g. 
CreateWorkflowEvent).", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2470,36 +2499,37 @@ data: "lineWidth": 1, "showPoints": "never" }, - "unit": "ops" + "unit": "reqps" } }, "gridPos": { "h": 8, - "w": 8, - "x": 8, - "y": 12 + "w": 12, + "x": 12, + "y": 13 }, - "id": 102, - "title": "Round Success / Error Rate", + "id": 402, + "title": "gRPC Client Error Rate", "type": "timeseries", "targets": [ { - "expr": "rate(flyte:propeller:all:round:success_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Success", + "expr": "sum by (grpc_service, grpc_method, grpc_code) (rate(grpc_client_handled_total{namespace=\"$namespace\", grpc_code!=\"OK\"}[$__rate_interval]))", + "legendFormat": "{{ grpc_method }} {{ grpc_code }}", "refId": "A" - }, - { - "expr": "rate(flyte:propeller:all:round:error_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Errors", - "refId": "B" - }, - { - "expr": "rate(flyte:propeller:all:round:panic_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Panics", - "refId": "C" } ], - "description": "Propeller round outcomes: success, errors, and panics per second." + "description": "DP\u2192CP gRPC errors by method and code. 
Non-OK responses from the control plane.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2517,564 +2547,37 @@ data: "lineWidth": 1, "showPoints": "never" }, - "unit": "short" + "unit": "s" } }, "gridPos": { "h": 8, - "w": 8, - "x": 16, - "y": 12 + "w": 24, + "x": 0, + "y": 21 }, - "id": 103, - "title": "Free Workers", + "id": 403, + "title": "gRPC Client Latency (p95)", "type": "timeseries", "targets": [ { - "expr": "sum(flyte:propeller:all:free_workers_count{namespace=\"$namespace\"})", - "legendFormat": "Free workers", + "expr": "histogram_quantile(0.95, sum by (le, grpc_method) (rate(grpc_client_handling_seconds_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "{{ grpc_method }} p95", "refId": "A" } ], - "description": "Idle propeller worker goroutines. Zero = all workers busy processing workflows." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ops" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 20 - }, - "id": 104, - "title": "Queue Add Rate", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:propeller:all:main_adds{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Main adds", - "refId": "A" - }, - { - "expr": "rate(flyte:propeller:all:sub_adds{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Sub adds", - "refId": "B" - }, - { - "expr": "rate(flyte:propeller:all:main_retries{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Main retries", - "refId": "C" - } - ], - "description": "Items enqueued to propeller's main/sub workqueues, plus retry rate." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 8, - "y": 20 - }, - "id": 105, - "title": "Workflow Updates", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:propeller:all:wf_updated{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Updated", - "refId": "A" - }, - { - "expr": "rate(flyte:propeller:all:wf_update_failed{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Failed", - "refId": "B" - }, - { - "expr": "rate(flyte:propeller:all:wf_too_large{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Too large", - "refId": "C" - }, - { - "expr": "rate(flyte:propeller:all:wf_update_conflict{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Conflict", - "refId": "D" - } - ], - "description": "etcd write outcomes: successful updates, failures, too-large (>1.5MB), optimistic concurrency conflicts." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ms" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 20 - }, - "id": 106, - "title": "Workflow Update Latency", - "type": "timeseries", - "targets": [ - { - "expr": "flyte:propeller:all:wf_update_latency_ms{namespace=\"$namespace\", quantile=\"0.5\"}", - "legendFormat": "p50", - "refId": "A" - }, - { - "expr": "flyte:propeller:all:wf_update_latency_ms{namespace=\"$namespace\", quantile=\"0.9\"}", - "legendFormat": "p90", - "refId": "B" - }, - { - "expr": "flyte:propeller:all:wf_update_latency_ms{namespace=\"$namespace\", quantile=\"0.99\"}", - "legendFormat": "p99", - "refId": "C" - } - ], - "description": "etcd write latency for FlyteWorkflow status updates." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ms" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 28 - }, - "id": 107, - "title": "Node Queueing & Execution Latency", - "type": "timeseries", - "targets": [ - { - "expr": "flyte:propeller:all:node:queueing_latency_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.5\"}", - "legendFormat": "Queue p50", - "refId": "A" - }, - { - "expr": "flyte:propeller:all:node:queueing_latency_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.9\"}", - "legendFormat": "Queue p90", - "refId": "B" - }, - { - "expr": "flyte:propeller:all:node:node_exec_latency_unlabeled_us{namespace=\"$namespace\", quantile=\"0.9\"} / 1000", - "legendFormat": "Exec p90 (ms)", - "refId": "C" - } - ], - "description": "Node lifecycle: queueing latency 
(queued\u2192running) and execution latency (time in handler)." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "percentunit" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 28 - }, - "id": 108, - "title": "Metastore Cache Hit Rate", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:propeller:all:metastore:cache_hit{namespace=\"$namespace\"}[$__rate_interval]) / (rate(flyte:propeller:all:metastore:cache_hit{namespace=\"$namespace\"}[$__rate_interval]) + rate(flyte:propeller:all:metastore:cache_miss{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Hit rate", - "refId": "A" - } - ], - "description": "In-memory cache effectiveness for object store (S3/GCS) reads. Low hit rate = excessive storage calls." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ops" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 36 - }, - "id": 109, - "title": "Event Recording (DP \u2192 CP)", - "type": "timeseries", - "targets": [ - { - "expr": "sum(rate(flyte:propeller:all:task:event_recording:success_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Task success", - "refId": "A" - }, - { - "expr": "sum(rate(flyte:propeller:all:node:event_recording:success_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Node success", - "refId": "B" - }, - { - "expr": "sum(rate(flyte:propeller:all:task:event_recording:failure_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Task failure", - 
"refId": "C" - }, - { - "expr": "sum(rate(flyte:propeller:all:node:event_recording:failure_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Node failure", - "refId": "D" - } - ], - "description": "Task/node event recording rate from propeller to FlyteAdmin. Failures indicate CP connectivity issues." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 36 - }, - "id": 110, - "title": "Cache Discovery (hit/miss/skip)", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:propeller:all:discovery_hit_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Hits", - "refId": "A" - }, - { - "expr": "rate(flyte:propeller:all:discovery_miss_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Misses", - "refId": "B" - }, - { - "expr": "rate(flyte:propeller:all:discovery_skip_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Skips", - "refId": "C" - }, - { - "expr": "rate(flyte:propeller:all:discovery_get_failure_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Get failures", - "refId": "D" - } - ], - "description": "V2 executor cache discovery miss/put rates for task output caching." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "reqps" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 44 - }, - "id": 111, - "title": "K8s API Client Rate", - "type": "timeseries", - "targets": [ - { - "expr": "sum(rate(k8s_client_request_total_unlabeled{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "K8s requests/s", - "refId": "A" - } - ], - "description": "Propeller's K8s API request rate. High rates may indicate excessive pod watches or creates." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "s" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 44 - }, - "id": 112, - "title": "K8s API Client Latency (p90)", - "type": "timeseries", - "targets": [ - { - "expr": "histogram_quantile(0.90, sum by (le) (rate(k8s_client_request_latency_unlabeled_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "Request p90", - "refId": "A" - }, - { - "expr": "histogram_quantile(0.90, sum by (le) (rate(k8s_client_rate_limiter_latency_unlabeled_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "Rate limiter p90", - "refId": "B" - } - ], - "description": "K8s API request latency and client-side rate limiter wait time at p90." 
- } - ] - }, - { - "collapsed": true, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 34 - }, - "id": 400, - "title": "gRPC Client (DP \u2192 CP)", - "type": "row", - "panels": [ - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "reqps" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 13 - }, - "id": 401, - "title": "gRPC Client Request Rate", - "type": "timeseries", - "targets": [ - { - "expr": "sum by (grpc_service, grpc_method) (rate(grpc_client_handled_total{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "{{ grpc_service }}/{{ grpc_method }}", - "refId": "A" - } - ], - "description": "DP\u2192CP gRPC client request rate by service and method (e.g. CreateWorkflowEvent)." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "reqps" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 13 - }, - "id": 402, - "title": "gRPC Client Error Rate", - "type": "timeseries", - "targets": [ - { - "expr": "sum by (grpc_service, grpc_method, grpc_code) (rate(grpc_client_handled_total{namespace=\"$namespace\", grpc_code!=\"OK\"}[$__rate_interval]))", - "legendFormat": "{{ grpc_method }} {{ grpc_code }}", - "refId": "A" - } - ], - "description": "DP\u2192CP gRPC errors by method and code. Non-OK responses from the control plane." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "s" - } - }, - "gridPos": { - "h": 8, - "w": 24, - "x": 0, - "y": 21 - }, - "id": 403, - "title": "gRPC Client Latency (p95)", - "type": "timeseries", - "targets": [ - { - "expr": "histogram_quantile(0.95, sum by (le, grpc_method) (rate(grpc_client_handling_seconds_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "{{ grpc_method }} p95", - "refId": "A" + "description": "DP\u2192CP gRPC call latency at p95. High latency = slow control plane or network issues.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] } - ], - "description": "DP\u2192CP gRPC call latency at p95. High latency = slow control plane or network issues." + } } ] }, @@ -3128,7 +2631,18 @@ data: "refId": "A" } ], - "description": "CPU usage in cores per container, stacked. Identifies resource-heavy services." + "description": "CPU usage in cores per container, stacked. Identifies resource-heavy services.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3168,7 +2682,18 @@ data: "refId": "A" } ], - "description": "Working set memory per container. Watch for approaching limits." + "description": "Working set memory per container. Watch for approaching limits.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3205,7 +2730,18 @@ data: "refId": "A" } ], - "description": "Per-container restart events. Spikes indicate crashes or OOM kills." + "description": "Per-container restart events. 
Spikes indicate crashes or OOM kills.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] } @@ -3257,7 +2793,7 @@ data: "timepicker": {}, "timezone": "browser", "title": "Union Dataplane Overview", - "uid": "union-dp-overview", + "uid": "union-dataplane-overview", "version": 1 } --- diff --git a/tests/generated/dataplane.azure-custom-storage-prefix.yaml b/tests/generated/dataplane.azure-custom-storage-prefix.yaml index de2af4a6..868b6d8b 100644 --- a/tests/generated/dataplane.azure-custom-storage-prefix.yaml +++ b/tests/generated/dataplane.azure-custom-storage-prefix.yaml @@ -1183,63 +1183,6 @@ data: ], "description": "Total container restarts in the last hour. Non-zero indicates crashlooping or OOM kills." }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "thresholds": { - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 10 - }, - { - "color": "red", - "value": 50 - } - ] - }, - "unit": "short" - } - }, - "gridPos": { - "h": 4, - "w": 6, - "x": 18, - "y": 1 - }, - "id": 8, - "options": { - "colorMode": "background", - "graphMode": "area", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ] - }, - "textMode": "auto" - }, - "title": "Active Workflows", - "type": "stat", - "targets": [ - { - "expr": "sum(flyte:propeller:all:execstats:active_workflow_executions{namespace=\"$namespace\"})", - "legendFormat": "Workflows", - "refId": "A" - } - ], - "description": "Current active FlyteWorkflow CRD count managed by Propeller." 
- }, { "datasource": { "type": "prometheus", @@ -1269,8 +1212,9 @@ data: "options": { "legend": { "calcs": [ - "mean", - "max" + "min", + "max", + "lastNotNull" ], "displayMode": "table", "placement": "bottom" @@ -1301,59 +1245,6 @@ data: ], "description": "Current count of active workflow, node, and task executions tracked by FlyteAdmin." }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 6, - "w": 12, - "x": 12, - "y": 5 - }, - "id": 7, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom" - }, - "tooltip": { - "mode": "multi", - "sort": "desc" - } - }, - "title": "Queue Depth", - "type": "timeseries", - "targets": [ - { - "expr": "sum(flyte:propeller:all:main_depth{namespace=\"$namespace\"})", - "legendFormat": "Main", - "refId": "A" - }, - { - "expr": "sum(flyte:propeller:all:sub_depth{namespace=\"$namespace\"})", - "legendFormat": "Sub", - "refId": "B" - } - ], - "description": "Main and sub workqueue depth over time." - }, { "datasource": { "type": "prometheus", @@ -1639,7 +1530,7 @@ data: "y": 10 }, "id": 1204, - "title": "Propeller Latency p99", + "title": "Executor Evaluate Duration p99", "type": "stat", "options": { "colorMode": "background", @@ -1653,8 +1544,13 @@ data: }, "targets": [ { - "expr": "(union:dp:slo:propeller_round_latency_p99 or flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.99\"} / 1000)", - "refId": "A" + "expr": "executor::v2:evaluator:evaluate_duration_ms{namespace=\"$namespace\", quantile=\"0.99\"}", + "legendFormat": "p99", + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + } } ], "description": "Propeller round p99 latency in seconds. 
Falls back to raw metric if SLO rules not enabled." @@ -1699,7 +1595,18 @@ data: "refId": "B" } ], - "description": "DP service availability over time with SLO target line." + "description": "DP service availability over time with SLO target line.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1743,7 +1650,18 @@ data: "refId": "B" } ], - "description": "DP error budget remaining over time. Requires monitoring.slos.enabled." + "description": "DP error budget remaining over time. Requires monitoring.slos.enabled.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "collapsed": true, @@ -1797,7 +1715,18 @@ data: "refId": "B" } ], - "description": "Operator execution operation processing rate and failure rate." + "description": "Operator execution operation processing rate and failure rate.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1854,7 +1783,18 @@ data: "refId": "E" } ], - "description": "FlyteAdmin error rates: propeller communication failures, model transform errors, notification publish failures." + "description": "FlyteAdmin error rates: propeller communication failures, model transform errors, notification publish failures.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1901,7 +1841,18 @@ data: "refId": "C" } ], - "description": "Breakdown of operator heartbeat computation: capabilities, usages, list workflows." 
+ "description": "Breakdown of operator heartbeat computation: capabilities, usages, list workflows.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1948,7 +1899,18 @@ data: "refId": "C" } ], - "description": "Config sync cycle rate, errors, and propeller ConfigMap update count." + "description": "Config sync cycle rate, errors, and propeller ConfigMap update count.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1990,7 +1952,18 @@ data: "refId": "B" } ], - "description": "Billing collection cycle rate and errors. Failures mean billing data may be delayed." + "description": "Billing collection cycle rate and errors. Failures mean billing data may be delayed.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2089,17 +2062,28 @@ data: "type": "timeseries", "targets": [ { - "expr": "executor:active_actions_count{namespace=\"$namespace\"}", + "expr": "executor::v2:active_actions_count{namespace=\"$namespace\"}", "legendFormat": "Active actions", "refId": "A" }, { - "expr": "executor:available_capacity{namespace=\"$namespace\"}", + "expr": "executor::v2:available_capacity{namespace=\"$namespace\"}", "legendFormat": "Available capacity", "refId": "B" } ], - "description": "V2 executor active actions vs available capacity. Capacity=0 means executor is saturated." + "description": "V2 executor active actions vs available capacity. 
Capacity=0 means executor is saturated.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2131,17 +2115,28 @@ data: "type": "timeseries", "targets": [ { - "expr": "rate(executor:discovery_miss_count{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:discovery_miss_count{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Miss", "refId": "A" }, { - "expr": "rate(executor:discovery_put_success_count{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:discovery_put_success_count{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Put success", "refId": "B" } ], - "description": "V2 executor cache discovery miss/put rates for task output caching." + "description": "V2 executor cache discovery miss/put rates for task output caching.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2176,12 +2171,23 @@ data: "type": "timeseries", "targets": [ { - "expr": "sum by (phase) (rate(executor:actions_terminated{namespace=\"$namespace\"}[$__rate_interval]))", + "expr": "sum by (phase) (rate(executor::v2:actions_terminated{namespace=\"$namespace\"}[$__rate_interval]))", "legendFormat": "{{ phase }}", "refId": "A" } ], - "description": "Task completion rate by outcome: succeeded, failed, aborted. Key V2 SLI for task health. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Task completion rate by outcome: succeeded, failed, aborted. Key V2 SLI for task health. 
[Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2213,22 +2219,33 @@ data: "type": "timeseries", "targets": [ { - "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.5\"}", + "expr": "executor::v2:evaluator:evaluate_duration_ms{namespace=\"$namespace\", quantile=\"0.5\"}", "legendFormat": "Evaluate p50", "refId": "A" }, { - "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.9\"}", + "expr": "executor::v2:evaluator:evaluate_duration_ms{namespace=\"$namespace\", quantile=\"0.9\"}", "legendFormat": "Evaluate p90", "refId": "B" }, { - "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.99\"}", + "expr": "executor::v2:evaluator:evaluate_duration_ms{namespace=\"$namespace\", quantile=\"0.99\"}", "legendFormat": "Evaluate p99", "refId": "C" } ], - "description": "Time spent in RecursiveNodeHandler (pod creation). Dominant component of V2 task latency. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Time spent in RecursiveNodeHandler (pod creation). Dominant component of V2 task latency. 
[Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2260,27 +2277,38 @@ data: "type": "timeseries", "targets": [ { - "expr": "rate(executor:system_failures{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:system_failures{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "System failures", "refId": "A" }, { - "expr": "rate(executor:system_failures_exhausted{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:system_failures_exhausted{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Exhausted retries", "refId": "B" }, { - "expr": "rate(executor:invalid_leases{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:invalid_leases{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Invalid leases", "refId": "C" }, { - "expr": "rate(executor:evaluator:evaluate_errors{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:evaluator:evaluate_errors{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Evaluate errors", "refId": "D" } ], - "description": "Executor error rates. System failures retry; exhausted means task permanently failed. Invalid leases = malformed from queue service. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Executor error rates. System failures retry; exhausted means task permanently failed. Invalid leases = malformed from queue service. 
[Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -2290,10 +2318,10 @@ data: "h": 1, "w": 24, "x": 0, - "y": 33 + "y": 34 }, - "id": 100, - "title": "Flyte Propeller (V1)", + "id": 400, + "title": "gRPC Client (DP \u2192 CP)", "type": "row", "panels": [ { @@ -2312,36 +2340,37 @@ data: "lineWidth": 1, "showPoints": "never" }, - "unit": "ms" + "unit": "reqps" } }, "gridPos": { "h": 8, - "w": 8, + "w": 12, "x": 0, - "y": 12 + "y": 13 }, - "id": 101, - "title": "Round Time (p50 / p90 / p99)", + "id": 401, + "title": "gRPC Client Request Rate", "type": "timeseries", "targets": [ { - "expr": "flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.5\"}", - "legendFormat": "p50", + "expr": "sum by (grpc_service, grpc_method) (rate(grpc_client_handled_total{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "{{ grpc_service }}/{{ grpc_method }}", "refId": "A" - }, - { - "expr": "flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.9\"}", - "legendFormat": "p90", - "refId": "B" - }, - { - "expr": "flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.99\"}", - "legendFormat": "p99", - "refId": "C" } ], - "description": "Propeller reconciliation round time. One round = one FlyteWorkflow CRD processed." + "description": "DP\u2192CP gRPC client request rate by service and method (e.g. 
CreateWorkflowEvent).", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2359,36 +2388,37 @@ data: "lineWidth": 1, "showPoints": "never" }, - "unit": "ops" + "unit": "reqps" } }, "gridPos": { "h": 8, - "w": 8, - "x": 8, - "y": 12 + "w": 12, + "x": 12, + "y": 13 }, - "id": 102, - "title": "Round Success / Error Rate", + "id": 402, + "title": "gRPC Client Error Rate", "type": "timeseries", "targets": [ { - "expr": "rate(flyte:propeller:all:round:success_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Success", + "expr": "sum by (grpc_service, grpc_method, grpc_code) (rate(grpc_client_handled_total{namespace=\"$namespace\", grpc_code!=\"OK\"}[$__rate_interval]))", + "legendFormat": "{{ grpc_method }} {{ grpc_code }}", "refId": "A" - }, - { - "expr": "rate(flyte:propeller:all:round:error_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Errors", - "refId": "B" - }, - { - "expr": "rate(flyte:propeller:all:round:panic_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Panics", - "refId": "C" } ], - "description": "Propeller round outcomes: success, errors, and panics per second." + "description": "DP\u2192CP gRPC errors by method and code. 
Non-OK responses from the control plane.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2406,564 +2436,37 @@ data: "lineWidth": 1, "showPoints": "never" }, - "unit": "short" + "unit": "s" } }, "gridPos": { "h": 8, - "w": 8, - "x": 16, - "y": 12 + "w": 24, + "x": 0, + "y": 21 }, - "id": 103, - "title": "Free Workers", + "id": 403, + "title": "gRPC Client Latency (p95)", "type": "timeseries", "targets": [ { - "expr": "sum(flyte:propeller:all:free_workers_count{namespace=\"$namespace\"})", - "legendFormat": "Free workers", + "expr": "histogram_quantile(0.95, sum by (le, grpc_method) (rate(grpc_client_handling_seconds_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "{{ grpc_method }} p95", "refId": "A" } ], - "description": "Idle propeller worker goroutines. Zero = all workers busy processing workflows." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ops" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 20 - }, - "id": 104, - "title": "Queue Add Rate", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:propeller:all:main_adds{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Main adds", - "refId": "A" - }, - { - "expr": "rate(flyte:propeller:all:sub_adds{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Sub adds", - "refId": "B" - }, - { - "expr": "rate(flyte:propeller:all:main_retries{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Main retries", - "refId": "C" - } - ], - "description": "Items enqueued to propeller's main/sub workqueues, plus retry rate." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 8, - "y": 20 - }, - "id": 105, - "title": "Workflow Updates", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:propeller:all:wf_updated{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Updated", - "refId": "A" - }, - { - "expr": "rate(flyte:propeller:all:wf_update_failed{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Failed", - "refId": "B" - }, - { - "expr": "rate(flyte:propeller:all:wf_too_large{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Too large", - "refId": "C" - }, - { - "expr": "rate(flyte:propeller:all:wf_update_conflict{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Conflict", - "refId": "D" - } - ], - "description": "etcd write outcomes: successful updates, failures, too-large (>1.5MB), optimistic concurrency conflicts." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ms" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 20 - }, - "id": 106, - "title": "Workflow Update Latency", - "type": "timeseries", - "targets": [ - { - "expr": "flyte:propeller:all:wf_update_latency_ms{namespace=\"$namespace\", quantile=\"0.5\"}", - "legendFormat": "p50", - "refId": "A" - }, - { - "expr": "flyte:propeller:all:wf_update_latency_ms{namespace=\"$namespace\", quantile=\"0.9\"}", - "legendFormat": "p90", - "refId": "B" - }, - { - "expr": "flyte:propeller:all:wf_update_latency_ms{namespace=\"$namespace\", quantile=\"0.99\"}", - "legendFormat": "p99", - "refId": "C" - } - ], - "description": "etcd write latency for FlyteWorkflow status updates." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ms" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 28 - }, - "id": 107, - "title": "Node Queueing & Execution Latency", - "type": "timeseries", - "targets": [ - { - "expr": "flyte:propeller:all:node:queueing_latency_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.5\"}", - "legendFormat": "Queue p50", - "refId": "A" - }, - { - "expr": "flyte:propeller:all:node:queueing_latency_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.9\"}", - "legendFormat": "Queue p90", - "refId": "B" - }, - { - "expr": "flyte:propeller:all:node:node_exec_latency_unlabeled_us{namespace=\"$namespace\", quantile=\"0.9\"} / 1000", - "legendFormat": "Exec p90 (ms)", - "refId": "C" - } - ], - "description": "Node lifecycle: queueing latency 
(queued\u2192running) and execution latency (time in handler)." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "percentunit" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 28 - }, - "id": 108, - "title": "Metastore Cache Hit Rate", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:propeller:all:metastore:cache_hit{namespace=\"$namespace\"}[$__rate_interval]) / (rate(flyte:propeller:all:metastore:cache_hit{namespace=\"$namespace\"}[$__rate_interval]) + rate(flyte:propeller:all:metastore:cache_miss{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Hit rate", - "refId": "A" - } - ], - "description": "In-memory cache effectiveness for object store (S3/GCS) reads. Low hit rate = excessive storage calls." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ops" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 36 - }, - "id": 109, - "title": "Event Recording (DP \u2192 CP)", - "type": "timeseries", - "targets": [ - { - "expr": "sum(rate(flyte:propeller:all:task:event_recording:success_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Task success", - "refId": "A" - }, - { - "expr": "sum(rate(flyte:propeller:all:node:event_recording:success_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Node success", - "refId": "B" - }, - { - "expr": "sum(rate(flyte:propeller:all:task:event_recording:failure_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Task failure", - 
"refId": "C" - }, - { - "expr": "sum(rate(flyte:propeller:all:node:event_recording:failure_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Node failure", - "refId": "D" - } - ], - "description": "Task/node event recording rate from propeller to FlyteAdmin. Failures indicate CP connectivity issues." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 36 - }, - "id": 110, - "title": "Cache Discovery (hit/miss/skip)", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:propeller:all:discovery_hit_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Hits", - "refId": "A" - }, - { - "expr": "rate(flyte:propeller:all:discovery_miss_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Misses", - "refId": "B" - }, - { - "expr": "rate(flyte:propeller:all:discovery_skip_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Skips", - "refId": "C" - }, - { - "expr": "rate(flyte:propeller:all:discovery_get_failure_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Get failures", - "refId": "D" - } - ], - "description": "V2 executor cache discovery miss/put rates for task output caching." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "reqps" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 44 - }, - "id": 111, - "title": "K8s API Client Rate", - "type": "timeseries", - "targets": [ - { - "expr": "sum(rate(k8s_client_request_total_unlabeled{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "K8s requests/s", - "refId": "A" - } - ], - "description": "Propeller's K8s API request rate. High rates may indicate excessive pod watches or creates." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "s" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 44 - }, - "id": 112, - "title": "K8s API Client Latency (p90)", - "type": "timeseries", - "targets": [ - { - "expr": "histogram_quantile(0.90, sum by (le) (rate(k8s_client_request_latency_unlabeled_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "Request p90", - "refId": "A" - }, - { - "expr": "histogram_quantile(0.90, sum by (le) (rate(k8s_client_rate_limiter_latency_unlabeled_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "Rate limiter p90", - "refId": "B" - } - ], - "description": "K8s API request latency and client-side rate limiter wait time at p90." 
- } - ] - }, - { - "collapsed": true, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 34 - }, - "id": 400, - "title": "gRPC Client (DP \u2192 CP)", - "type": "row", - "panels": [ - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "reqps" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 13 - }, - "id": 401, - "title": "gRPC Client Request Rate", - "type": "timeseries", - "targets": [ - { - "expr": "sum by (grpc_service, grpc_method) (rate(grpc_client_handled_total{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "{{ grpc_service }}/{{ grpc_method }}", - "refId": "A" - } - ], - "description": "DP\u2192CP gRPC client request rate by service and method (e.g. CreateWorkflowEvent)." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "reqps" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 13 - }, - "id": 402, - "title": "gRPC Client Error Rate", - "type": "timeseries", - "targets": [ - { - "expr": "sum by (grpc_service, grpc_method, grpc_code) (rate(grpc_client_handled_total{namespace=\"$namespace\", grpc_code!=\"OK\"}[$__rate_interval]))", - "legendFormat": "{{ grpc_method }} {{ grpc_code }}", - "refId": "A" - } - ], - "description": "DP\u2192CP gRPC errors by method and code. Non-OK responses from the control plane." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "s" - } - }, - "gridPos": { - "h": 8, - "w": 24, - "x": 0, - "y": 21 - }, - "id": 403, - "title": "gRPC Client Latency (p95)", - "type": "timeseries", - "targets": [ - { - "expr": "histogram_quantile(0.95, sum by (le, grpc_method) (rate(grpc_client_handling_seconds_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "{{ grpc_method }} p95", - "refId": "A" + "description": "DP\u2192CP gRPC call latency at p95. High latency = slow control plane or network issues.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] } - ], - "description": "DP\u2192CP gRPC call latency at p95. High latency = slow control plane or network issues." + } } ] }, @@ -3017,7 +2520,18 @@ data: "refId": "A" } ], - "description": "CPU usage in cores per container, stacked. Identifies resource-heavy services." + "description": "CPU usage in cores per container, stacked. Identifies resource-heavy services.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3057,7 +2571,18 @@ data: "refId": "A" } ], - "description": "Working set memory per container. Watch for approaching limits." + "description": "Working set memory per container. Watch for approaching limits.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3094,7 +2619,18 @@ data: "refId": "A" } ], - "description": "Per-container restart events. Spikes indicate crashes or OOM kills." + "description": "Per-container restart events. 
Spikes indicate crashes or OOM kills.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] } @@ -3146,7 +2682,7 @@ data: "timepicker": {}, "timezone": "browser", "title": "Union Dataplane Overview", - "uid": "union-dp-overview", + "uid": "union-dataplane-overview", "version": 1 } --- diff --git a/tests/generated/dataplane.azure.yaml b/tests/generated/dataplane.azure.yaml index 5b5264be..43f885e6 100644 --- a/tests/generated/dataplane.azure.yaml +++ b/tests/generated/dataplane.azure.yaml @@ -1183,63 +1183,6 @@ data: ], "description": "Total container restarts in the last hour. Non-zero indicates crashlooping or OOM kills." }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "thresholds": { - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 10 - }, - { - "color": "red", - "value": 50 - } - ] - }, - "unit": "short" - } - }, - "gridPos": { - "h": 4, - "w": 6, - "x": 18, - "y": 1 - }, - "id": 8, - "options": { - "colorMode": "background", - "graphMode": "area", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ] - }, - "textMode": "auto" - }, - "title": "Active Workflows", - "type": "stat", - "targets": [ - { - "expr": "sum(flyte:propeller:all:execstats:active_workflow_executions{namespace=\"$namespace\"})", - "legendFormat": "Workflows", - "refId": "A" - } - ], - "description": "Current active FlyteWorkflow CRD count managed by Propeller." - }, { "datasource": { "type": "prometheus", @@ -1269,8 +1212,9 @@ data: "options": { "legend": { "calcs": [ - "mean", - "max" + "min", + "max", + "lastNotNull" ], "displayMode": "table", "placement": "bottom" @@ -1301,59 +1245,6 @@ data: ], "description": "Current count of active workflow, node, and task executions tracked by FlyteAdmin." 
}, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 6, - "w": 12, - "x": 12, - "y": 5 - }, - "id": 7, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom" - }, - "tooltip": { - "mode": "multi", - "sort": "desc" - } - }, - "title": "Queue Depth", - "type": "timeseries", - "targets": [ - { - "expr": "sum(flyte:propeller:all:main_depth{namespace=\"$namespace\"})", - "legendFormat": "Main", - "refId": "A" - }, - { - "expr": "sum(flyte:propeller:all:sub_depth{namespace=\"$namespace\"})", - "legendFormat": "Sub", - "refId": "B" - } - ], - "description": "Main and sub workqueue depth over time." - }, { "datasource": { "type": "prometheus", @@ -1639,7 +1530,7 @@ data: "y": 10 }, "id": 1204, - "title": "Propeller Latency p99", + "title": "Executor Evaluate Duration p99", "type": "stat", "options": { "colorMode": "background", @@ -1653,8 +1544,13 @@ data: }, "targets": [ { - "expr": "(union:dp:slo:propeller_round_latency_p99 or flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.99\"} / 1000)", - "refId": "A" + "expr": "executor::v2:evaluator:evaluate_duration_ms{namespace=\"$namespace\", quantile=\"0.99\"}", + "legendFormat": "p99", + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + } } ], "description": "Propeller round p99 latency in seconds. Falls back to raw metric if SLO rules not enabled." @@ -1699,7 +1595,18 @@ data: "refId": "B" } ], - "description": "DP service availability over time with SLO target line." 
+ "description": "DP service availability over time with SLO target line.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1743,7 +1650,18 @@ data: "refId": "B" } ], - "description": "DP error budget remaining over time. Requires monitoring.slos.enabled." + "description": "DP error budget remaining over time. Requires monitoring.slos.enabled.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "collapsed": true, @@ -1797,7 +1715,18 @@ data: "refId": "B" } ], - "description": "Operator execution operation processing rate and failure rate." + "description": "Operator execution operation processing rate and failure rate.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1854,7 +1783,18 @@ data: "refId": "E" } ], - "description": "FlyteAdmin error rates: propeller communication failures, model transform errors, notification publish failures." + "description": "FlyteAdmin error rates: propeller communication failures, model transform errors, notification publish failures.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1901,7 +1841,18 @@ data: "refId": "C" } ], - "description": "Breakdown of operator heartbeat computation: capabilities, usages, list workflows." 
+ "description": "Breakdown of operator heartbeat computation: capabilities, usages, list workflows.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1948,7 +1899,18 @@ data: "refId": "C" } ], - "description": "Config sync cycle rate, errors, and propeller ConfigMap update count." + "description": "Config sync cycle rate, errors, and propeller ConfigMap update count.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1990,7 +1952,18 @@ data: "refId": "B" } ], - "description": "Billing collection cycle rate and errors. Failures mean billing data may be delayed." + "description": "Billing collection cycle rate and errors. Failures mean billing data may be delayed.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2089,17 +2062,28 @@ data: "type": "timeseries", "targets": [ { - "expr": "executor:active_actions_count{namespace=\"$namespace\"}", + "expr": "executor::v2:active_actions_count{namespace=\"$namespace\"}", "legendFormat": "Active actions", "refId": "A" }, { - "expr": "executor:available_capacity{namespace=\"$namespace\"}", + "expr": "executor::v2:available_capacity{namespace=\"$namespace\"}", "legendFormat": "Available capacity", "refId": "B" } ], - "description": "V2 executor active actions vs available capacity. Capacity=0 means executor is saturated." + "description": "V2 executor active actions vs available capacity. 
Capacity=0 means executor is saturated.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2131,17 +2115,28 @@ data: "type": "timeseries", "targets": [ { - "expr": "rate(executor:discovery_miss_count{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:discovery_miss_count{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Miss", "refId": "A" }, { - "expr": "rate(executor:discovery_put_success_count{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:discovery_put_success_count{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Put success", "refId": "B" } ], - "description": "V2 executor cache discovery miss/put rates for task output caching." + "description": "V2 executor cache discovery miss/put rates for task output caching.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2176,12 +2171,23 @@ data: "type": "timeseries", "targets": [ { - "expr": "sum by (phase) (rate(executor:actions_terminated{namespace=\"$namespace\"}[$__rate_interval]))", + "expr": "sum by (phase) (rate(executor::v2:actions_terminated{namespace=\"$namespace\"}[$__rate_interval]))", "legendFormat": "{{ phase }}", "refId": "A" } ], - "description": "Task completion rate by outcome: succeeded, failed, aborted. Key V2 SLI for task health. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Task completion rate by outcome: succeeded, failed, aborted. Key V2 SLI for task health. 
[Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2213,22 +2219,33 @@ data: "type": "timeseries", "targets": [ { - "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.5\"}", + "expr": "executor::v2:evaluator:evaluate_duration_ms{namespace=\"$namespace\", quantile=\"0.5\"}", "legendFormat": "Evaluate p50", "refId": "A" }, { - "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.9\"}", + "expr": "executor::v2:evaluator:evaluate_duration_ms{namespace=\"$namespace\", quantile=\"0.9\"}", "legendFormat": "Evaluate p90", "refId": "B" }, { - "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.99\"}", + "expr": "executor::v2:evaluator:evaluate_duration_ms{namespace=\"$namespace\", quantile=\"0.99\"}", "legendFormat": "Evaluate p99", "refId": "C" } ], - "description": "Time spent in RecursiveNodeHandler (pod creation). Dominant component of V2 task latency. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Time spent in RecursiveNodeHandler (pod creation). Dominant component of V2 task latency. 
[Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2260,27 +2277,38 @@ data: "type": "timeseries", "targets": [ { - "expr": "rate(executor:system_failures{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:system_failures{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "System failures", "refId": "A" }, { - "expr": "rate(executor:system_failures_exhausted{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:system_failures_exhausted{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Exhausted retries", "refId": "B" }, { - "expr": "rate(executor:invalid_leases{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:invalid_leases{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Invalid leases", "refId": "C" }, { - "expr": "rate(executor:evaluator:evaluate_errors{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:evaluator:evaluate_errors{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Evaluate errors", "refId": "D" } ], - "description": "Executor error rates. System failures retry; exhausted means task permanently failed. Invalid leases = malformed from queue service. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Executor error rates. System failures retry; exhausted means task permanently failed. Invalid leases = malformed from queue service. 
[Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -2290,10 +2318,10 @@ data: "h": 1, "w": 24, "x": 0, - "y": 33 + "y": 34 }, - "id": 100, - "title": "Flyte Propeller (V1)", + "id": 400, + "title": "gRPC Client (DP \u2192 CP)", "type": "row", "panels": [ { @@ -2312,36 +2340,37 @@ data: "lineWidth": 1, "showPoints": "never" }, - "unit": "ms" + "unit": "reqps" } }, "gridPos": { "h": 8, - "w": 8, + "w": 12, "x": 0, - "y": 12 + "y": 13 }, - "id": 101, - "title": "Round Time (p50 / p90 / p99)", + "id": 401, + "title": "gRPC Client Request Rate", "type": "timeseries", "targets": [ { - "expr": "flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.5\"}", - "legendFormat": "p50", + "expr": "sum by (grpc_service, grpc_method) (rate(grpc_client_handled_total{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "{{ grpc_service }}/{{ grpc_method }}", "refId": "A" - }, - { - "expr": "flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.9\"}", - "legendFormat": "p90", - "refId": "B" - }, - { - "expr": "flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.99\"}", - "legendFormat": "p99", - "refId": "C" } ], - "description": "Propeller reconciliation round time. One round = one FlyteWorkflow CRD processed." + "description": "DP\u2192CP gRPC client request rate by service and method (e.g. 
CreateWorkflowEvent).", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2359,36 +2388,37 @@ data: "lineWidth": 1, "showPoints": "never" }, - "unit": "ops" + "unit": "reqps" } }, "gridPos": { "h": 8, - "w": 8, - "x": 8, - "y": 12 + "w": 12, + "x": 12, + "y": 13 }, - "id": 102, - "title": "Round Success / Error Rate", + "id": 402, + "title": "gRPC Client Error Rate", "type": "timeseries", "targets": [ { - "expr": "rate(flyte:propeller:all:round:success_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Success", + "expr": "sum by (grpc_service, grpc_method, grpc_code) (rate(grpc_client_handled_total{namespace=\"$namespace\", grpc_code!=\"OK\"}[$__rate_interval]))", + "legendFormat": "{{ grpc_method }} {{ grpc_code }}", "refId": "A" - }, - { - "expr": "rate(flyte:propeller:all:round:error_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Errors", - "refId": "B" - }, - { - "expr": "rate(flyte:propeller:all:round:panic_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Panics", - "refId": "C" } ], - "description": "Propeller round outcomes: success, errors, and panics per second." + "description": "DP\u2192CP gRPC errors by method and code. 
Non-OK responses from the control plane.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2406,564 +2436,37 @@ data: "lineWidth": 1, "showPoints": "never" }, - "unit": "short" + "unit": "s" } }, "gridPos": { "h": 8, - "w": 8, - "x": 16, - "y": 12 + "w": 24, + "x": 0, + "y": 21 }, - "id": 103, - "title": "Free Workers", + "id": 403, + "title": "gRPC Client Latency (p95)", "type": "timeseries", "targets": [ { - "expr": "sum(flyte:propeller:all:free_workers_count{namespace=\"$namespace\"})", - "legendFormat": "Free workers", + "expr": "histogram_quantile(0.95, sum by (le, grpc_method) (rate(grpc_client_handling_seconds_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "{{ grpc_method }} p95", "refId": "A" } ], - "description": "Idle propeller worker goroutines. Zero = all workers busy processing workflows." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ops" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 20 - }, - "id": 104, - "title": "Queue Add Rate", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:propeller:all:main_adds{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Main adds", - "refId": "A" - }, - { - "expr": "rate(flyte:propeller:all:sub_adds{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Sub adds", - "refId": "B" - }, - { - "expr": "rate(flyte:propeller:all:main_retries{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Main retries", - "refId": "C" - } - ], - "description": "Items enqueued to propeller's main/sub workqueues, plus retry rate." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 8, - "y": 20 - }, - "id": 105, - "title": "Workflow Updates", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:propeller:all:wf_updated{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Updated", - "refId": "A" - }, - { - "expr": "rate(flyte:propeller:all:wf_update_failed{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Failed", - "refId": "B" - }, - { - "expr": "rate(flyte:propeller:all:wf_too_large{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Too large", - "refId": "C" - }, - { - "expr": "rate(flyte:propeller:all:wf_update_conflict{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Conflict", - "refId": "D" - } - ], - "description": "etcd write outcomes: successful updates, failures, too-large (>1.5MB), optimistic concurrency conflicts." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ms" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 20 - }, - "id": 106, - "title": "Workflow Update Latency", - "type": "timeseries", - "targets": [ - { - "expr": "flyte:propeller:all:wf_update_latency_ms{namespace=\"$namespace\", quantile=\"0.5\"}", - "legendFormat": "p50", - "refId": "A" - }, - { - "expr": "flyte:propeller:all:wf_update_latency_ms{namespace=\"$namespace\", quantile=\"0.9\"}", - "legendFormat": "p90", - "refId": "B" - }, - { - "expr": "flyte:propeller:all:wf_update_latency_ms{namespace=\"$namespace\", quantile=\"0.99\"}", - "legendFormat": "p99", - "refId": "C" - } - ], - "description": "etcd write latency for FlyteWorkflow status updates." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ms" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 28 - }, - "id": 107, - "title": "Node Queueing & Execution Latency", - "type": "timeseries", - "targets": [ - { - "expr": "flyte:propeller:all:node:queueing_latency_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.5\"}", - "legendFormat": "Queue p50", - "refId": "A" - }, - { - "expr": "flyte:propeller:all:node:queueing_latency_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.9\"}", - "legendFormat": "Queue p90", - "refId": "B" - }, - { - "expr": "flyte:propeller:all:node:node_exec_latency_unlabeled_us{namespace=\"$namespace\", quantile=\"0.9\"} / 1000", - "legendFormat": "Exec p90 (ms)", - "refId": "C" - } - ], - "description": "Node lifecycle: queueing latency 
(queued\u2192running) and execution latency (time in handler)." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "percentunit" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 28 - }, - "id": 108, - "title": "Metastore Cache Hit Rate", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:propeller:all:metastore:cache_hit{namespace=\"$namespace\"}[$__rate_interval]) / (rate(flyte:propeller:all:metastore:cache_hit{namespace=\"$namespace\"}[$__rate_interval]) + rate(flyte:propeller:all:metastore:cache_miss{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Hit rate", - "refId": "A" - } - ], - "description": "In-memory cache effectiveness for object store (S3/GCS) reads. Low hit rate = excessive storage calls." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ops" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 36 - }, - "id": 109, - "title": "Event Recording (DP \u2192 CP)", - "type": "timeseries", - "targets": [ - { - "expr": "sum(rate(flyte:propeller:all:task:event_recording:success_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Task success", - "refId": "A" - }, - { - "expr": "sum(rate(flyte:propeller:all:node:event_recording:success_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Node success", - "refId": "B" - }, - { - "expr": "sum(rate(flyte:propeller:all:task:event_recording:failure_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Task failure", - 
"refId": "C" - }, - { - "expr": "sum(rate(flyte:propeller:all:node:event_recording:failure_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Node failure", - "refId": "D" - } - ], - "description": "Task/node event recording rate from propeller to FlyteAdmin. Failures indicate CP connectivity issues." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 36 - }, - "id": 110, - "title": "Cache Discovery (hit/miss/skip)", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:propeller:all:discovery_hit_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Hits", - "refId": "A" - }, - { - "expr": "rate(flyte:propeller:all:discovery_miss_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Misses", - "refId": "B" - }, - { - "expr": "rate(flyte:propeller:all:discovery_skip_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Skips", - "refId": "C" - }, - { - "expr": "rate(flyte:propeller:all:discovery_get_failure_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Get failures", - "refId": "D" - } - ], - "description": "V2 executor cache discovery miss/put rates for task output caching." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "reqps" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 44 - }, - "id": 111, - "title": "K8s API Client Rate", - "type": "timeseries", - "targets": [ - { - "expr": "sum(rate(k8s_client_request_total_unlabeled{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "K8s requests/s", - "refId": "A" - } - ], - "description": "Propeller's K8s API request rate. High rates may indicate excessive pod watches or creates." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "s" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 44 - }, - "id": 112, - "title": "K8s API Client Latency (p90)", - "type": "timeseries", - "targets": [ - { - "expr": "histogram_quantile(0.90, sum by (le) (rate(k8s_client_request_latency_unlabeled_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "Request p90", - "refId": "A" - }, - { - "expr": "histogram_quantile(0.90, sum by (le) (rate(k8s_client_rate_limiter_latency_unlabeled_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "Rate limiter p90", - "refId": "B" - } - ], - "description": "K8s API request latency and client-side rate limiter wait time at p90." 
- } - ] - }, - { - "collapsed": true, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 34 - }, - "id": 400, - "title": "gRPC Client (DP \u2192 CP)", - "type": "row", - "panels": [ - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "reqps" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 13 - }, - "id": 401, - "title": "gRPC Client Request Rate", - "type": "timeseries", - "targets": [ - { - "expr": "sum by (grpc_service, grpc_method) (rate(grpc_client_handled_total{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "{{ grpc_service }}/{{ grpc_method }}", - "refId": "A" - } - ], - "description": "DP\u2192CP gRPC client request rate by service and method (e.g. CreateWorkflowEvent)." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "reqps" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 13 - }, - "id": 402, - "title": "gRPC Client Error Rate", - "type": "timeseries", - "targets": [ - { - "expr": "sum by (grpc_service, grpc_method, grpc_code) (rate(grpc_client_handled_total{namespace=\"$namespace\", grpc_code!=\"OK\"}[$__rate_interval]))", - "legendFormat": "{{ grpc_method }} {{ grpc_code }}", - "refId": "A" - } - ], - "description": "DP\u2192CP gRPC errors by method and code. Non-OK responses from the control plane." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "s" - } - }, - "gridPos": { - "h": 8, - "w": 24, - "x": 0, - "y": 21 - }, - "id": 403, - "title": "gRPC Client Latency (p95)", - "type": "timeseries", - "targets": [ - { - "expr": "histogram_quantile(0.95, sum by (le, grpc_method) (rate(grpc_client_handling_seconds_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "{{ grpc_method }} p95", - "refId": "A" + "description": "DP\u2192CP gRPC call latency at p95. High latency = slow control plane or network issues.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] } - ], - "description": "DP\u2192CP gRPC call latency at p95. High latency = slow control plane or network issues." + } } ] }, @@ -3017,7 +2520,18 @@ data: "refId": "A" } ], - "description": "CPU usage in cores per container, stacked. Identifies resource-heavy services." + "description": "CPU usage in cores per container, stacked. Identifies resource-heavy services.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3057,7 +2571,18 @@ data: "refId": "A" } ], - "description": "Working set memory per container. Watch for approaching limits." + "description": "Working set memory per container. Watch for approaching limits.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3094,7 +2619,18 @@ data: "refId": "A" } ], - "description": "Per-container restart events. Spikes indicate crashes or OOM kills." + "description": "Per-container restart events. 
Spikes indicate crashes or OOM kills.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] } @@ -3146,7 +2682,7 @@ data: "timepicker": {}, "timezone": "browser", "title": "Union Dataplane Overview", - "uid": "union-dp-overview", + "uid": "union-dataplane-overview", "version": 1 } --- diff --git a/tests/generated/dataplane.cost.yaml b/tests/generated/dataplane.cost.yaml index 28b76bf6..8e1913e9 100644 --- a/tests/generated/dataplane.cost.yaml +++ b/tests/generated/dataplane.cost.yaml @@ -1170,63 +1170,6 @@ data: ], "description": "Total container restarts in the last hour. Non-zero indicates crashlooping or OOM kills." }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "thresholds": { - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 10 - }, - { - "color": "red", - "value": 50 - } - ] - }, - "unit": "short" - } - }, - "gridPos": { - "h": 4, - "w": 6, - "x": 18, - "y": 1 - }, - "id": 8, - "options": { - "colorMode": "background", - "graphMode": "area", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ] - }, - "textMode": "auto" - }, - "title": "Active Workflows", - "type": "stat", - "targets": [ - { - "expr": "sum(flyte:propeller:all:execstats:active_workflow_executions{namespace=\"$namespace\"})", - "legendFormat": "Workflows", - "refId": "A" - } - ], - "description": "Current active FlyteWorkflow CRD count managed by Propeller." - }, { "datasource": { "type": "prometheus", @@ -1256,8 +1199,9 @@ data: "options": { "legend": { "calcs": [ - "mean", - "max" + "min", + "max", + "lastNotNull" ], "displayMode": "table", "placement": "bottom" @@ -1288,59 +1232,6 @@ data: ], "description": "Current count of active workflow, node, and task executions tracked by FlyteAdmin." 
}, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 6, - "w": 12, - "x": 12, - "y": 5 - }, - "id": 7, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom" - }, - "tooltip": { - "mode": "multi", - "sort": "desc" - } - }, - "title": "Queue Depth", - "type": "timeseries", - "targets": [ - { - "expr": "sum(flyte:propeller:all:main_depth{namespace=\"$namespace\"})", - "legendFormat": "Main", - "refId": "A" - }, - { - "expr": "sum(flyte:propeller:all:sub_depth{namespace=\"$namespace\"})", - "legendFormat": "Sub", - "refId": "B" - } - ], - "description": "Main and sub workqueue depth over time." - }, { "datasource": { "type": "prometheus", @@ -1626,7 +1517,7 @@ data: "y": 10 }, "id": 1204, - "title": "Propeller Latency p99", + "title": "Executor Evaluate Duration p99", "type": "stat", "options": { "colorMode": "background", @@ -1640,8 +1531,13 @@ data: }, "targets": [ { - "expr": "(union:dp:slo:propeller_round_latency_p99 or flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.99\"} / 1000)", - "refId": "A" + "expr": "executor::v2:evaluator:evaluate_duration_ms{namespace=\"$namespace\", quantile=\"0.99\"}", + "legendFormat": "p99", + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + } } ], "description": "Propeller round p99 latency in seconds. Falls back to raw metric if SLO rules not enabled." @@ -1686,7 +1582,18 @@ data: "refId": "B" } ], - "description": "DP service availability over time with SLO target line." 
+ "description": "DP service availability over time with SLO target line.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1730,7 +1637,18 @@ data: "refId": "B" } ], - "description": "DP error budget remaining over time. Requires monitoring.slos.enabled." + "description": "DP error budget remaining over time. Requires monitoring.slos.enabled.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "collapsed": true, @@ -1784,7 +1702,18 @@ data: "refId": "B" } ], - "description": "Operator execution operation processing rate and failure rate." + "description": "Operator execution operation processing rate and failure rate.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1841,7 +1770,18 @@ data: "refId": "E" } ], - "description": "FlyteAdmin error rates: propeller communication failures, model transform errors, notification publish failures." + "description": "FlyteAdmin error rates: propeller communication failures, model transform errors, notification publish failures.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1888,7 +1828,18 @@ data: "refId": "C" } ], - "description": "Breakdown of operator heartbeat computation: capabilities, usages, list workflows." 
+ "description": "Breakdown of operator heartbeat computation: capabilities, usages, list workflows.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1935,7 +1886,18 @@ data: "refId": "C" } ], - "description": "Config sync cycle rate, errors, and propeller ConfigMap update count." + "description": "Config sync cycle rate, errors, and propeller ConfigMap update count.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1977,7 +1939,18 @@ data: "refId": "B" } ], - "description": "Billing collection cycle rate and errors. Failures mean billing data may be delayed." + "description": "Billing collection cycle rate and errors. Failures mean billing data may be delayed.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2076,17 +2049,28 @@ data: "type": "timeseries", "targets": [ { - "expr": "executor:active_actions_count{namespace=\"$namespace\"}", + "expr": "executor::v2:active_actions_count{namespace=\"$namespace\"}", "legendFormat": "Active actions", "refId": "A" }, { - "expr": "executor:available_capacity{namespace=\"$namespace\"}", + "expr": "executor::v2:available_capacity{namespace=\"$namespace\"}", "legendFormat": "Available capacity", "refId": "B" } ], - "description": "V2 executor active actions vs available capacity. Capacity=0 means executor is saturated." + "description": "V2 executor active actions vs available capacity. 
Capacity=0 means executor is saturated.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2118,17 +2102,28 @@ data: "type": "timeseries", "targets": [ { - "expr": "rate(executor:discovery_miss_count{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:discovery_miss_count{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Miss", "refId": "A" }, { - "expr": "rate(executor:discovery_put_success_count{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:discovery_put_success_count{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Put success", "refId": "B" } ], - "description": "V2 executor cache discovery miss/put rates for task output caching." + "description": "V2 executor cache discovery miss/put rates for task output caching.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2163,12 +2158,23 @@ data: "type": "timeseries", "targets": [ { - "expr": "sum by (phase) (rate(executor:actions_terminated{namespace=\"$namespace\"}[$__rate_interval]))", + "expr": "sum by (phase) (rate(executor::v2:actions_terminated{namespace=\"$namespace\"}[$__rate_interval]))", "legendFormat": "{{ phase }}", "refId": "A" } ], - "description": "Task completion rate by outcome: succeeded, failed, aborted. Key V2 SLI for task health. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Task completion rate by outcome: succeeded, failed, aborted. Key V2 SLI for task health. 
[Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2200,22 +2206,33 @@ data: "type": "timeseries", "targets": [ { - "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.5\"}", + "expr": "executor::v2:evaluator:evaluate_duration_ms{namespace=\"$namespace\", quantile=\"0.5\"}", "legendFormat": "Evaluate p50", "refId": "A" }, { - "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.9\"}", + "expr": "executor::v2:evaluator:evaluate_duration_ms{namespace=\"$namespace\", quantile=\"0.9\"}", "legendFormat": "Evaluate p90", "refId": "B" }, { - "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.99\"}", + "expr": "executor::v2:evaluator:evaluate_duration_ms{namespace=\"$namespace\", quantile=\"0.99\"}", "legendFormat": "Evaluate p99", "refId": "C" } ], - "description": "Time spent in RecursiveNodeHandler (pod creation). Dominant component of V2 task latency. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Time spent in RecursiveNodeHandler (pod creation). Dominant component of V2 task latency. 
[Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2247,27 +2264,38 @@ data: "type": "timeseries", "targets": [ { - "expr": "rate(executor:system_failures{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:system_failures{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "System failures", "refId": "A" }, { - "expr": "rate(executor:system_failures_exhausted{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:system_failures_exhausted{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Exhausted retries", "refId": "B" }, { - "expr": "rate(executor:invalid_leases{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:invalid_leases{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Invalid leases", "refId": "C" }, { - "expr": "rate(executor:evaluator:evaluate_errors{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:evaluator:evaluate_errors{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Evaluate errors", "refId": "D" } ], - "description": "Executor error rates. System failures retry; exhausted means task permanently failed. Invalid leases = malformed from queue service. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Executor error rates. System failures retry; exhausted means task permanently failed. Invalid leases = malformed from queue service. 
[Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -2277,10 +2305,10 @@ data: "h": 1, "w": 24, "x": 0, - "y": 33 + "y": 34 }, - "id": 100, - "title": "Flyte Propeller (V1)", + "id": 400, + "title": "gRPC Client (DP \u2192 CP)", "type": "row", "panels": [ { @@ -2299,36 +2327,37 @@ data: "lineWidth": 1, "showPoints": "never" }, - "unit": "ms" + "unit": "reqps" } }, "gridPos": { "h": 8, - "w": 8, + "w": 12, "x": 0, - "y": 12 + "y": 13 }, - "id": 101, - "title": "Round Time (p50 / p90 / p99)", + "id": 401, + "title": "gRPC Client Request Rate", "type": "timeseries", "targets": [ { - "expr": "flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.5\"}", - "legendFormat": "p50", + "expr": "sum by (grpc_service, grpc_method) (rate(grpc_client_handled_total{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "{{ grpc_service }}/{{ grpc_method }}", "refId": "A" - }, - { - "expr": "flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.9\"}", - "legendFormat": "p90", - "refId": "B" - }, - { - "expr": "flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.99\"}", - "legendFormat": "p99", - "refId": "C" } ], - "description": "Propeller reconciliation round time. One round = one FlyteWorkflow CRD processed." + "description": "DP\u2192CP gRPC client request rate by service and method (e.g. 
CreateWorkflowEvent).", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2346,36 +2375,37 @@ data: "lineWidth": 1, "showPoints": "never" }, - "unit": "ops" + "unit": "reqps" } }, "gridPos": { "h": 8, - "w": 8, - "x": 8, - "y": 12 + "w": 12, + "x": 12, + "y": 13 }, - "id": 102, - "title": "Round Success / Error Rate", + "id": 402, + "title": "gRPC Client Error Rate", "type": "timeseries", "targets": [ { - "expr": "rate(flyte:propeller:all:round:success_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Success", + "expr": "sum by (grpc_service, grpc_method, grpc_code) (rate(grpc_client_handled_total{namespace=\"$namespace\", grpc_code!=\"OK\"}[$__rate_interval]))", + "legendFormat": "{{ grpc_method }} {{ grpc_code }}", "refId": "A" - }, - { - "expr": "rate(flyte:propeller:all:round:error_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Errors", - "refId": "B" - }, - { - "expr": "rate(flyte:propeller:all:round:panic_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Panics", - "refId": "C" } ], - "description": "Propeller round outcomes: success, errors, and panics per second." + "description": "DP\u2192CP gRPC errors by method and code. 
Non-OK responses from the control plane.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2393,564 +2423,37 @@ data: "lineWidth": 1, "showPoints": "never" }, - "unit": "short" + "unit": "s" } }, "gridPos": { "h": 8, - "w": 8, - "x": 16, - "y": 12 + "w": 24, + "x": 0, + "y": 21 }, - "id": 103, - "title": "Free Workers", + "id": 403, + "title": "gRPC Client Latency (p95)", "type": "timeseries", "targets": [ { - "expr": "sum(flyte:propeller:all:free_workers_count{namespace=\"$namespace\"})", - "legendFormat": "Free workers", + "expr": "histogram_quantile(0.95, sum by (le, grpc_method) (rate(grpc_client_handling_seconds_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "{{ grpc_method }} p95", "refId": "A" } ], - "description": "Idle propeller worker goroutines. Zero = all workers busy processing workflows." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ops" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 20 - }, - "id": 104, - "title": "Queue Add Rate", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:propeller:all:main_adds{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Main adds", - "refId": "A" - }, - { - "expr": "rate(flyte:propeller:all:sub_adds{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Sub adds", - "refId": "B" - }, - { - "expr": "rate(flyte:propeller:all:main_retries{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Main retries", - "refId": "C" - } - ], - "description": "Items enqueued to propeller's main/sub workqueues, plus retry rate." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 8, - "y": 20 - }, - "id": 105, - "title": "Workflow Updates", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:propeller:all:wf_updated{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Updated", - "refId": "A" - }, - { - "expr": "rate(flyte:propeller:all:wf_update_failed{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Failed", - "refId": "B" - }, - { - "expr": "rate(flyte:propeller:all:wf_too_large{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Too large", - "refId": "C" - }, - { - "expr": "rate(flyte:propeller:all:wf_update_conflict{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Conflict", - "refId": "D" - } - ], - "description": "etcd write outcomes: successful updates, failures, too-large (>1.5MB), optimistic concurrency conflicts." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ms" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 20 - }, - "id": 106, - "title": "Workflow Update Latency", - "type": "timeseries", - "targets": [ - { - "expr": "flyte:propeller:all:wf_update_latency_ms{namespace=\"$namespace\", quantile=\"0.5\"}", - "legendFormat": "p50", - "refId": "A" - }, - { - "expr": "flyte:propeller:all:wf_update_latency_ms{namespace=\"$namespace\", quantile=\"0.9\"}", - "legendFormat": "p90", - "refId": "B" - }, - { - "expr": "flyte:propeller:all:wf_update_latency_ms{namespace=\"$namespace\", quantile=\"0.99\"}", - "legendFormat": "p99", - "refId": "C" - } - ], - "description": "etcd write latency for FlyteWorkflow status updates." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ms" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 28 - }, - "id": 107, - "title": "Node Queueing & Execution Latency", - "type": "timeseries", - "targets": [ - { - "expr": "flyte:propeller:all:node:queueing_latency_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.5\"}", - "legendFormat": "Queue p50", - "refId": "A" - }, - { - "expr": "flyte:propeller:all:node:queueing_latency_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.9\"}", - "legendFormat": "Queue p90", - "refId": "B" - }, - { - "expr": "flyte:propeller:all:node:node_exec_latency_unlabeled_us{namespace=\"$namespace\", quantile=\"0.9\"} / 1000", - "legendFormat": "Exec p90 (ms)", - "refId": "C" - } - ], - "description": "Node lifecycle: queueing latency 
(queued\u2192running) and execution latency (time in handler)." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "percentunit" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 28 - }, - "id": 108, - "title": "Metastore Cache Hit Rate", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:propeller:all:metastore:cache_hit{namespace=\"$namespace\"}[$__rate_interval]) / (rate(flyte:propeller:all:metastore:cache_hit{namespace=\"$namespace\"}[$__rate_interval]) + rate(flyte:propeller:all:metastore:cache_miss{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Hit rate", - "refId": "A" - } - ], - "description": "In-memory cache effectiveness for object store (S3/GCS) reads. Low hit rate = excessive storage calls." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ops" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 36 - }, - "id": 109, - "title": "Event Recording (DP \u2192 CP)", - "type": "timeseries", - "targets": [ - { - "expr": "sum(rate(flyte:propeller:all:task:event_recording:success_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Task success", - "refId": "A" - }, - { - "expr": "sum(rate(flyte:propeller:all:node:event_recording:success_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Node success", - "refId": "B" - }, - { - "expr": "sum(rate(flyte:propeller:all:task:event_recording:failure_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Task failure", - 
"refId": "C" - }, - { - "expr": "sum(rate(flyte:propeller:all:node:event_recording:failure_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Node failure", - "refId": "D" - } - ], - "description": "Task/node event recording rate from propeller to FlyteAdmin. Failures indicate CP connectivity issues." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 36 - }, - "id": 110, - "title": "Cache Discovery (hit/miss/skip)", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:propeller:all:discovery_hit_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Hits", - "refId": "A" - }, - { - "expr": "rate(flyte:propeller:all:discovery_miss_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Misses", - "refId": "B" - }, - { - "expr": "rate(flyte:propeller:all:discovery_skip_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Skips", - "refId": "C" - }, - { - "expr": "rate(flyte:propeller:all:discovery_get_failure_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Get failures", - "refId": "D" - } - ], - "description": "V2 executor cache discovery miss/put rates for task output caching." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "reqps" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 44 - }, - "id": 111, - "title": "K8s API Client Rate", - "type": "timeseries", - "targets": [ - { - "expr": "sum(rate(k8s_client_request_total_unlabeled{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "K8s requests/s", - "refId": "A" - } - ], - "description": "Propeller's K8s API request rate. High rates may indicate excessive pod watches or creates." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "s" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 44 - }, - "id": 112, - "title": "K8s API Client Latency (p90)", - "type": "timeseries", - "targets": [ - { - "expr": "histogram_quantile(0.90, sum by (le) (rate(k8s_client_request_latency_unlabeled_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "Request p90", - "refId": "A" - }, - { - "expr": "histogram_quantile(0.90, sum by (le) (rate(k8s_client_rate_limiter_latency_unlabeled_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "Rate limiter p90", - "refId": "B" - } - ], - "description": "K8s API request latency and client-side rate limiter wait time at p90." 
- } - ] - }, - { - "collapsed": true, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 34 - }, - "id": 400, - "title": "gRPC Client (DP \u2192 CP)", - "type": "row", - "panels": [ - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "reqps" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 13 - }, - "id": 401, - "title": "gRPC Client Request Rate", - "type": "timeseries", - "targets": [ - { - "expr": "sum by (grpc_service, grpc_method) (rate(grpc_client_handled_total{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "{{ grpc_service }}/{{ grpc_method }}", - "refId": "A" - } - ], - "description": "DP\u2192CP gRPC client request rate by service and method (e.g. CreateWorkflowEvent)." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "reqps" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 13 - }, - "id": 402, - "title": "gRPC Client Error Rate", - "type": "timeseries", - "targets": [ - { - "expr": "sum by (grpc_service, grpc_method, grpc_code) (rate(grpc_client_handled_total{namespace=\"$namespace\", grpc_code!=\"OK\"}[$__rate_interval]))", - "legendFormat": "{{ grpc_method }} {{ grpc_code }}", - "refId": "A" - } - ], - "description": "DP\u2192CP gRPC errors by method and code. Non-OK responses from the control plane." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "s" - } - }, - "gridPos": { - "h": 8, - "w": 24, - "x": 0, - "y": 21 - }, - "id": 403, - "title": "gRPC Client Latency (p95)", - "type": "timeseries", - "targets": [ - { - "expr": "histogram_quantile(0.95, sum by (le, grpc_method) (rate(grpc_client_handling_seconds_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "{{ grpc_method }} p95", - "refId": "A" + "description": "DP\u2192CP gRPC call latency at p95. High latency = slow control plane or network issues.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] } - ], - "description": "DP\u2192CP gRPC call latency at p95. High latency = slow control plane or network issues." + } } ] }, @@ -3004,7 +2507,18 @@ data: "refId": "A" } ], - "description": "CPU usage in cores per container, stacked. Identifies resource-heavy services." + "description": "CPU usage in cores per container, stacked. Identifies resource-heavy services.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3044,7 +2558,18 @@ data: "refId": "A" } ], - "description": "Working set memory per container. Watch for approaching limits." + "description": "Working set memory per container. Watch for approaching limits.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3081,7 +2606,18 @@ data: "refId": "A" } ], - "description": "Per-container restart events. Spikes indicate crashes or OOM kills." + "description": "Per-container restart events. 
Spikes indicate crashes or OOM kills.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] } @@ -3133,7 +2669,7 @@ data: "timepicker": {}, "timezone": "browser", "title": "Union Dataplane Overview", - "uid": "union-dp-overview", + "uid": "union-dataplane-overview", "version": 1 } --- diff --git a/tests/generated/dataplane.dcgm-exporter.yaml b/tests/generated/dataplane.dcgm-exporter.yaml index 9a5f8436..503986c9 100644 --- a/tests/generated/dataplane.dcgm-exporter.yaml +++ b/tests/generated/dataplane.dcgm-exporter.yaml @@ -1293,63 +1293,6 @@ data: ], "description": "Total container restarts in the last hour. Non-zero indicates crashlooping or OOM kills." }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "thresholds": { - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 10 - }, - { - "color": "red", - "value": 50 - } - ] - }, - "unit": "short" - } - }, - "gridPos": { - "h": 4, - "w": 6, - "x": 18, - "y": 1 - }, - "id": 8, - "options": { - "colorMode": "background", - "graphMode": "area", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ] - }, - "textMode": "auto" - }, - "title": "Active Workflows", - "type": "stat", - "targets": [ - { - "expr": "sum(flyte:propeller:all:execstats:active_workflow_executions{namespace=\"$namespace\"})", - "legendFormat": "Workflows", - "refId": "A" - } - ], - "description": "Current active FlyteWorkflow CRD count managed by Propeller." - }, { "datasource": { "type": "prometheus", @@ -1379,8 +1322,9 @@ data: "options": { "legend": { "calcs": [ - "mean", - "max" + "min", + "max", + "lastNotNull" ], "displayMode": "table", "placement": "bottom" @@ -1411,59 +1355,6 @@ data: ], "description": "Current count of active workflow, node, and task executions tracked by FlyteAdmin." 
}, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 6, - "w": 12, - "x": 12, - "y": 5 - }, - "id": 7, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom" - }, - "tooltip": { - "mode": "multi", - "sort": "desc" - } - }, - "title": "Queue Depth", - "type": "timeseries", - "targets": [ - { - "expr": "sum(flyte:propeller:all:main_depth{namespace=\"$namespace\"})", - "legendFormat": "Main", - "refId": "A" - }, - { - "expr": "sum(flyte:propeller:all:sub_depth{namespace=\"$namespace\"})", - "legendFormat": "Sub", - "refId": "B" - } - ], - "description": "Main and sub workqueue depth over time." - }, { "datasource": { "type": "prometheus", @@ -1749,7 +1640,7 @@ data: "y": 10 }, "id": 1204, - "title": "Propeller Latency p99", + "title": "Executor Evaluate Duration p99", "type": "stat", "options": { "colorMode": "background", @@ -1763,8 +1654,13 @@ data: }, "targets": [ { - "expr": "(union:dp:slo:propeller_round_latency_p99 or flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.99\"} / 1000)", - "refId": "A" + "expr": "executor::v2:evaluator:evaluate_duration_ms{namespace=\"$namespace\", quantile=\"0.99\"}", + "legendFormat": "p99", + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + } } ], "description": "Propeller round p99 latency in seconds. Falls back to raw metric if SLO rules not enabled." @@ -1809,7 +1705,18 @@ data: "refId": "B" } ], - "description": "DP service availability over time with SLO target line." 
+ "description": "DP service availability over time with SLO target line.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1853,7 +1760,18 @@ data: "refId": "B" } ], - "description": "DP error budget remaining over time. Requires monitoring.slos.enabled." + "description": "DP error budget remaining over time. Requires monitoring.slos.enabled.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "collapsed": true, @@ -1907,7 +1825,18 @@ data: "refId": "B" } ], - "description": "Operator execution operation processing rate and failure rate." + "description": "Operator execution operation processing rate and failure rate.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1964,7 +1893,18 @@ data: "refId": "E" } ], - "description": "FlyteAdmin error rates: propeller communication failures, model transform errors, notification publish failures." + "description": "FlyteAdmin error rates: propeller communication failures, model transform errors, notification publish failures.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2011,7 +1951,18 @@ data: "refId": "C" } ], - "description": "Breakdown of operator heartbeat computation: capabilities, usages, list workflows." 
+ "description": "Breakdown of operator heartbeat computation: capabilities, usages, list workflows.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2058,7 +2009,18 @@ data: "refId": "C" } ], - "description": "Config sync cycle rate, errors, and propeller ConfigMap update count." + "description": "Config sync cycle rate, errors, and propeller ConfigMap update count.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2100,7 +2062,18 @@ data: "refId": "B" } ], - "description": "Billing collection cycle rate and errors. Failures mean billing data may be delayed." + "description": "Billing collection cycle rate and errors. Failures mean billing data may be delayed.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2199,17 +2172,28 @@ data: "type": "timeseries", "targets": [ { - "expr": "executor:active_actions_count{namespace=\"$namespace\"}", + "expr": "executor::v2:active_actions_count{namespace=\"$namespace\"}", "legendFormat": "Active actions", "refId": "A" }, { - "expr": "executor:available_capacity{namespace=\"$namespace\"}", + "expr": "executor::v2:available_capacity{namespace=\"$namespace\"}", "legendFormat": "Available capacity", "refId": "B" } ], - "description": "V2 executor active actions vs available capacity. Capacity=0 means executor is saturated." + "description": "V2 executor active actions vs available capacity. 
Capacity=0 means executor is saturated.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2241,17 +2225,28 @@ data: "type": "timeseries", "targets": [ { - "expr": "rate(executor:discovery_miss_count{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:discovery_miss_count{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Miss", "refId": "A" }, { - "expr": "rate(executor:discovery_put_success_count{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:discovery_put_success_count{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Put success", "refId": "B" } ], - "description": "V2 executor cache discovery miss/put rates for task output caching." + "description": "V2 executor cache discovery miss/put rates for task output caching.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2286,12 +2281,23 @@ data: "type": "timeseries", "targets": [ { - "expr": "sum by (phase) (rate(executor:actions_terminated{namespace=\"$namespace\"}[$__rate_interval]))", + "expr": "sum by (phase) (rate(executor::v2:actions_terminated{namespace=\"$namespace\"}[$__rate_interval]))", "legendFormat": "{{ phase }}", "refId": "A" } ], - "description": "Task completion rate by outcome: succeeded, failed, aborted. Key V2 SLI for task health. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Task completion rate by outcome: succeeded, failed, aborted. Key V2 SLI for task health. 
[Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2323,22 +2329,33 @@ data: "type": "timeseries", "targets": [ { - "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.5\"}", + "expr": "executor::v2:evaluator:evaluate_duration_ms{namespace=\"$namespace\", quantile=\"0.5\"}", "legendFormat": "Evaluate p50", "refId": "A" }, { - "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.9\"}", + "expr": "executor::v2:evaluator:evaluate_duration_ms{namespace=\"$namespace\", quantile=\"0.9\"}", "legendFormat": "Evaluate p90", "refId": "B" }, { - "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.99\"}", + "expr": "executor::v2:evaluator:evaluate_duration_ms{namespace=\"$namespace\", quantile=\"0.99\"}", "legendFormat": "Evaluate p99", "refId": "C" } ], - "description": "Time spent in RecursiveNodeHandler (pod creation). Dominant component of V2 task latency. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Time spent in RecursiveNodeHandler (pod creation). Dominant component of V2 task latency. 
[Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2370,27 +2387,38 @@ data: "type": "timeseries", "targets": [ { - "expr": "rate(executor:system_failures{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:system_failures{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "System failures", "refId": "A" }, { - "expr": "rate(executor:system_failures_exhausted{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:system_failures_exhausted{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Exhausted retries", "refId": "B" }, { - "expr": "rate(executor:invalid_leases{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:invalid_leases{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Invalid leases", "refId": "C" }, { - "expr": "rate(executor:evaluator:evaluate_errors{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:evaluator:evaluate_errors{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Evaluate errors", "refId": "D" } ], - "description": "Executor error rates. System failures retry; exhausted means task permanently failed. Invalid leases = malformed from queue service. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Executor error rates. System failures retry; exhausted means task permanently failed. Invalid leases = malformed from queue service. 
[Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -2400,10 +2428,10 @@ data: "h": 1, "w": 24, "x": 0, - "y": 33 + "y": 34 }, - "id": 100, - "title": "Flyte Propeller (V1)", + "id": 400, + "title": "gRPC Client (DP \u2192 CP)", "type": "row", "panels": [ { @@ -2422,36 +2450,37 @@ data: "lineWidth": 1, "showPoints": "never" }, - "unit": "ms" + "unit": "reqps" } }, "gridPos": { "h": 8, - "w": 8, + "w": 12, "x": 0, - "y": 12 + "y": 13 }, - "id": 101, - "title": "Round Time (p50 / p90 / p99)", + "id": 401, + "title": "gRPC Client Request Rate", "type": "timeseries", "targets": [ { - "expr": "flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.5\"}", - "legendFormat": "p50", + "expr": "sum by (grpc_service, grpc_method) (rate(grpc_client_handled_total{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "{{ grpc_service }}/{{ grpc_method }}", "refId": "A" - }, - { - "expr": "flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.9\"}", - "legendFormat": "p90", - "refId": "B" - }, - { - "expr": "flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.99\"}", - "legendFormat": "p99", - "refId": "C" } ], - "description": "Propeller reconciliation round time. One round = one FlyteWorkflow CRD processed." + "description": "DP\u2192CP gRPC client request rate by service and method (e.g. 
CreateWorkflowEvent).", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2469,36 +2498,37 @@ data: "lineWidth": 1, "showPoints": "never" }, - "unit": "ops" + "unit": "reqps" } }, "gridPos": { "h": 8, - "w": 8, - "x": 8, - "y": 12 + "w": 12, + "x": 12, + "y": 13 }, - "id": 102, - "title": "Round Success / Error Rate", + "id": 402, + "title": "gRPC Client Error Rate", "type": "timeseries", "targets": [ { - "expr": "rate(flyte:propeller:all:round:success_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Success", + "expr": "sum by (grpc_service, grpc_method, grpc_code) (rate(grpc_client_handled_total{namespace=\"$namespace\", grpc_code!=\"OK\"}[$__rate_interval]))", + "legendFormat": "{{ grpc_method }} {{ grpc_code }}", "refId": "A" - }, - { - "expr": "rate(flyte:propeller:all:round:error_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Errors", - "refId": "B" - }, - { - "expr": "rate(flyte:propeller:all:round:panic_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Panics", - "refId": "C" } ], - "description": "Propeller round outcomes: success, errors, and panics per second." + "description": "DP\u2192CP gRPC errors by method and code. 
Non-OK responses from the control plane.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2516,564 +2546,37 @@ data: "lineWidth": 1, "showPoints": "never" }, - "unit": "short" + "unit": "s" } }, "gridPos": { "h": 8, - "w": 8, - "x": 16, - "y": 12 + "w": 24, + "x": 0, + "y": 21 }, - "id": 103, - "title": "Free Workers", + "id": 403, + "title": "gRPC Client Latency (p95)", "type": "timeseries", "targets": [ { - "expr": "sum(flyte:propeller:all:free_workers_count{namespace=\"$namespace\"})", - "legendFormat": "Free workers", + "expr": "histogram_quantile(0.95, sum by (le, grpc_method) (rate(grpc_client_handling_seconds_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "{{ grpc_method }} p95", "refId": "A" } ], - "description": "Idle propeller worker goroutines. Zero = all workers busy processing workflows." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ops" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 20 - }, - "id": 104, - "title": "Queue Add Rate", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:propeller:all:main_adds{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Main adds", - "refId": "A" - }, - { - "expr": "rate(flyte:propeller:all:sub_adds{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Sub adds", - "refId": "B" - }, - { - "expr": "rate(flyte:propeller:all:main_retries{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Main retries", - "refId": "C" - } - ], - "description": "Items enqueued to propeller's main/sub workqueues, plus retry rate." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 8, - "y": 20 - }, - "id": 105, - "title": "Workflow Updates", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:propeller:all:wf_updated{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Updated", - "refId": "A" - }, - { - "expr": "rate(flyte:propeller:all:wf_update_failed{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Failed", - "refId": "B" - }, - { - "expr": "rate(flyte:propeller:all:wf_too_large{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Too large", - "refId": "C" - }, - { - "expr": "rate(flyte:propeller:all:wf_update_conflict{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Conflict", - "refId": "D" - } - ], - "description": "etcd write outcomes: successful updates, failures, too-large (>1.5MB), optimistic concurrency conflicts." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ms" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 20 - }, - "id": 106, - "title": "Workflow Update Latency", - "type": "timeseries", - "targets": [ - { - "expr": "flyte:propeller:all:wf_update_latency_ms{namespace=\"$namespace\", quantile=\"0.5\"}", - "legendFormat": "p50", - "refId": "A" - }, - { - "expr": "flyte:propeller:all:wf_update_latency_ms{namespace=\"$namespace\", quantile=\"0.9\"}", - "legendFormat": "p90", - "refId": "B" - }, - { - "expr": "flyte:propeller:all:wf_update_latency_ms{namespace=\"$namespace\", quantile=\"0.99\"}", - "legendFormat": "p99", - "refId": "C" - } - ], - "description": "etcd write latency for FlyteWorkflow status updates." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ms" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 28 - }, - "id": 107, - "title": "Node Queueing & Execution Latency", - "type": "timeseries", - "targets": [ - { - "expr": "flyte:propeller:all:node:queueing_latency_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.5\"}", - "legendFormat": "Queue p50", - "refId": "A" - }, - { - "expr": "flyte:propeller:all:node:queueing_latency_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.9\"}", - "legendFormat": "Queue p90", - "refId": "B" - }, - { - "expr": "flyte:propeller:all:node:node_exec_latency_unlabeled_us{namespace=\"$namespace\", quantile=\"0.9\"} / 1000", - "legendFormat": "Exec p90 (ms)", - "refId": "C" - } - ], - "description": "Node lifecycle: queueing latency 
(queued\u2192running) and execution latency (time in handler)." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "percentunit" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 28 - }, - "id": 108, - "title": "Metastore Cache Hit Rate", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:propeller:all:metastore:cache_hit{namespace=\"$namespace\"}[$__rate_interval]) / (rate(flyte:propeller:all:metastore:cache_hit{namespace=\"$namespace\"}[$__rate_interval]) + rate(flyte:propeller:all:metastore:cache_miss{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Hit rate", - "refId": "A" - } - ], - "description": "In-memory cache effectiveness for object store (S3/GCS) reads. Low hit rate = excessive storage calls." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ops" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 36 - }, - "id": 109, - "title": "Event Recording (DP \u2192 CP)", - "type": "timeseries", - "targets": [ - { - "expr": "sum(rate(flyte:propeller:all:task:event_recording:success_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Task success", - "refId": "A" - }, - { - "expr": "sum(rate(flyte:propeller:all:node:event_recording:success_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Node success", - "refId": "B" - }, - { - "expr": "sum(rate(flyte:propeller:all:task:event_recording:failure_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Task failure", - 
"refId": "C" - }, - { - "expr": "sum(rate(flyte:propeller:all:node:event_recording:failure_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Node failure", - "refId": "D" - } - ], - "description": "Task/node event recording rate from propeller to FlyteAdmin. Failures indicate CP connectivity issues." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 36 - }, - "id": 110, - "title": "Cache Discovery (hit/miss/skip)", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:propeller:all:discovery_hit_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Hits", - "refId": "A" - }, - { - "expr": "rate(flyte:propeller:all:discovery_miss_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Misses", - "refId": "B" - }, - { - "expr": "rate(flyte:propeller:all:discovery_skip_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Skips", - "refId": "C" - }, - { - "expr": "rate(flyte:propeller:all:discovery_get_failure_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Get failures", - "refId": "D" - } - ], - "description": "V2 executor cache discovery miss/put rates for task output caching." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "reqps" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 44 - }, - "id": 111, - "title": "K8s API Client Rate", - "type": "timeseries", - "targets": [ - { - "expr": "sum(rate(k8s_client_request_total_unlabeled{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "K8s requests/s", - "refId": "A" - } - ], - "description": "Propeller's K8s API request rate. High rates may indicate excessive pod watches or creates." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "s" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 44 - }, - "id": 112, - "title": "K8s API Client Latency (p90)", - "type": "timeseries", - "targets": [ - { - "expr": "histogram_quantile(0.90, sum by (le) (rate(k8s_client_request_latency_unlabeled_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "Request p90", - "refId": "A" - }, - { - "expr": "histogram_quantile(0.90, sum by (le) (rate(k8s_client_rate_limiter_latency_unlabeled_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "Rate limiter p90", - "refId": "B" - } - ], - "description": "K8s API request latency and client-side rate limiter wait time at p90." 
- } - ] - }, - { - "collapsed": true, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 34 - }, - "id": 400, - "title": "gRPC Client (DP \u2192 CP)", - "type": "row", - "panels": [ - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "reqps" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 13 - }, - "id": 401, - "title": "gRPC Client Request Rate", - "type": "timeseries", - "targets": [ - { - "expr": "sum by (grpc_service, grpc_method) (rate(grpc_client_handled_total{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "{{ grpc_service }}/{{ grpc_method }}", - "refId": "A" - } - ], - "description": "DP\u2192CP gRPC client request rate by service and method (e.g. CreateWorkflowEvent)." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "reqps" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 13 - }, - "id": 402, - "title": "gRPC Client Error Rate", - "type": "timeseries", - "targets": [ - { - "expr": "sum by (grpc_service, grpc_method, grpc_code) (rate(grpc_client_handled_total{namespace=\"$namespace\", grpc_code!=\"OK\"}[$__rate_interval]))", - "legendFormat": "{{ grpc_method }} {{ grpc_code }}", - "refId": "A" - } - ], - "description": "DP\u2192CP gRPC errors by method and code. Non-OK responses from the control plane." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "s" - } - }, - "gridPos": { - "h": 8, - "w": 24, - "x": 0, - "y": 21 - }, - "id": 403, - "title": "gRPC Client Latency (p95)", - "type": "timeseries", - "targets": [ - { - "expr": "histogram_quantile(0.95, sum by (le, grpc_method) (rate(grpc_client_handling_seconds_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "{{ grpc_method }} p95", - "refId": "A" + "description": "DP\u2192CP gRPC call latency at p95. High latency = slow control plane or network issues.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] } - ], - "description": "DP\u2192CP gRPC call latency at p95. High latency = slow control plane or network issues." + } } ] }, @@ -3127,7 +2630,18 @@ data: "refId": "A" } ], - "description": "CPU usage in cores per container, stacked. Identifies resource-heavy services." + "description": "CPU usage in cores per container, stacked. Identifies resource-heavy services.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3167,7 +2681,18 @@ data: "refId": "A" } ], - "description": "Working set memory per container. Watch for approaching limits." + "description": "Working set memory per container. Watch for approaching limits.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3204,7 +2729,18 @@ data: "refId": "A" } ], - "description": "Per-container restart events. Spikes indicate crashes or OOM kills." + "description": "Per-container restart events. 
Spikes indicate crashes or OOM kills.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] } @@ -3256,7 +2792,7 @@ data: "timepicker": {}, "timezone": "browser", "title": "Union Dataplane Overview", - "uid": "union-dp-overview", + "uid": "union-dataplane-overview", "version": 1 } --- diff --git a/tests/generated/dataplane.fully-selfhosted.yaml b/tests/generated/dataplane.fully-selfhosted.yaml index 0a2e32ef..15e45b9c 100644 --- a/tests/generated/dataplane.fully-selfhosted.yaml +++ b/tests/generated/dataplane.fully-selfhosted.yaml @@ -1171,63 +1171,6 @@ data: ], "description": "Total container restarts in the last hour. Non-zero indicates crashlooping or OOM kills." }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "thresholds": { - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 10 - }, - { - "color": "red", - "value": 50 - } - ] - }, - "unit": "short" - } - }, - "gridPos": { - "h": 4, - "w": 6, - "x": 18, - "y": 1 - }, - "id": 8, - "options": { - "colorMode": "background", - "graphMode": "area", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ] - }, - "textMode": "auto" - }, - "title": "Active Workflows", - "type": "stat", - "targets": [ - { - "expr": "sum(flyte:propeller:all:execstats:active_workflow_executions{namespace=\"$namespace\"})", - "legendFormat": "Workflows", - "refId": "A" - } - ], - "description": "Current active FlyteWorkflow CRD count managed by Propeller." - }, { "datasource": { "type": "prometheus", @@ -1257,8 +1200,9 @@ data: "options": { "legend": { "calcs": [ - "mean", - "max" + "min", + "max", + "lastNotNull" ], "displayMode": "table", "placement": "bottom" @@ -1289,59 +1233,6 @@ data: ], "description": "Current count of active workflow, node, and task executions tracked by FlyteAdmin." 
}, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 6, - "w": 12, - "x": 12, - "y": 5 - }, - "id": 7, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom" - }, - "tooltip": { - "mode": "multi", - "sort": "desc" - } - }, - "title": "Queue Depth", - "type": "timeseries", - "targets": [ - { - "expr": "sum(flyte:propeller:all:main_depth{namespace=\"$namespace\"})", - "legendFormat": "Main", - "refId": "A" - }, - { - "expr": "sum(flyte:propeller:all:sub_depth{namespace=\"$namespace\"})", - "legendFormat": "Sub", - "refId": "B" - } - ], - "description": "Main and sub workqueue depth over time." - }, { "datasource": { "type": "prometheus", @@ -1627,7 +1518,7 @@ data: "y": 10 }, "id": 1204, - "title": "Propeller Latency p99", + "title": "Executor Evaluate Duration p99", "type": "stat", "options": { "colorMode": "background", @@ -1641,8 +1532,13 @@ data: }, "targets": [ { - "expr": "(union:dp:slo:propeller_round_latency_p99 or flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.99\"} / 1000)", - "refId": "A" + "expr": "executor::v2:evaluator:evaluate_duration_ms{namespace=\"$namespace\", quantile=\"0.99\"}", + "legendFormat": "p99", + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + } } ], "description": "Propeller round p99 latency in seconds. Falls back to raw metric if SLO rules not enabled." @@ -1687,7 +1583,18 @@ data: "refId": "B" } ], - "description": "DP service availability over time with SLO target line." 
+ "description": "DP service availability over time with SLO target line.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1731,7 +1638,18 @@ data: "refId": "B" } ], - "description": "DP error budget remaining over time. Requires monitoring.slos.enabled." + "description": "DP error budget remaining over time. Requires monitoring.slos.enabled.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "collapsed": true, @@ -1785,7 +1703,18 @@ data: "refId": "B" } ], - "description": "Operator execution operation processing rate and failure rate." + "description": "Operator execution operation processing rate and failure rate.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1842,7 +1771,18 @@ data: "refId": "E" } ], - "description": "FlyteAdmin error rates: propeller communication failures, model transform errors, notification publish failures." + "description": "FlyteAdmin error rates: propeller communication failures, model transform errors, notification publish failures.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1889,7 +1829,18 @@ data: "refId": "C" } ], - "description": "Breakdown of operator heartbeat computation: capabilities, usages, list workflows." 
+ "description": "Breakdown of operator heartbeat computation: capabilities, usages, list workflows.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1936,7 +1887,18 @@ data: "refId": "C" } ], - "description": "Config sync cycle rate, errors, and propeller ConfigMap update count." + "description": "Config sync cycle rate, errors, and propeller ConfigMap update count.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1978,7 +1940,18 @@ data: "refId": "B" } ], - "description": "Billing collection cycle rate and errors. Failures mean billing data may be delayed." + "description": "Billing collection cycle rate and errors. Failures mean billing data may be delayed.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2077,17 +2050,28 @@ data: "type": "timeseries", "targets": [ { - "expr": "executor:active_actions_count{namespace=\"$namespace\"}", + "expr": "executor::v2:active_actions_count{namespace=\"$namespace\"}", "legendFormat": "Active actions", "refId": "A" }, { - "expr": "executor:available_capacity{namespace=\"$namespace\"}", + "expr": "executor::v2:available_capacity{namespace=\"$namespace\"}", "legendFormat": "Available capacity", "refId": "B" } ], - "description": "V2 executor active actions vs available capacity. Capacity=0 means executor is saturated." + "description": "V2 executor active actions vs available capacity. 
Capacity=0 means executor is saturated.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2119,17 +2103,28 @@ data: "type": "timeseries", "targets": [ { - "expr": "rate(executor:discovery_miss_count{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:discovery_miss_count{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Miss", "refId": "A" }, { - "expr": "rate(executor:discovery_put_success_count{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:discovery_put_success_count{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Put success", "refId": "B" } ], - "description": "V2 executor cache discovery miss/put rates for task output caching." + "description": "V2 executor cache discovery miss/put rates for task output caching.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2164,12 +2159,23 @@ data: "type": "timeseries", "targets": [ { - "expr": "sum by (phase) (rate(executor:actions_terminated{namespace=\"$namespace\"}[$__rate_interval]))", + "expr": "sum by (phase) (rate(executor::v2:actions_terminated{namespace=\"$namespace\"}[$__rate_interval]))", "legendFormat": "{{ phase }}", "refId": "A" } ], - "description": "Task completion rate by outcome: succeeded, failed, aborted. Key V2 SLI for task health. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Task completion rate by outcome: succeeded, failed, aborted. Key V2 SLI for task health. 
[Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2201,22 +2207,33 @@ data: "type": "timeseries", "targets": [ { - "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.5\"}", + "expr": "executor::v2:evaluator:evaluate_duration_ms{namespace=\"$namespace\", quantile=\"0.5\"}", "legendFormat": "Evaluate p50", "refId": "A" }, { - "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.9\"}", + "expr": "executor::v2:evaluator:evaluate_duration_ms{namespace=\"$namespace\", quantile=\"0.9\"}", "legendFormat": "Evaluate p90", "refId": "B" }, { - "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.99\"}", + "expr": "executor::v2:evaluator:evaluate_duration_ms{namespace=\"$namespace\", quantile=\"0.99\"}", "legendFormat": "Evaluate p99", "refId": "C" } ], - "description": "Time spent in RecursiveNodeHandler (pod creation). Dominant component of V2 task latency. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Time spent in RecursiveNodeHandler (pod creation). Dominant component of V2 task latency. 
[Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2248,27 +2265,38 @@ data: "type": "timeseries", "targets": [ { - "expr": "rate(executor:system_failures{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:system_failures{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "System failures", "refId": "A" }, { - "expr": "rate(executor:system_failures_exhausted{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:system_failures_exhausted{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Exhausted retries", "refId": "B" }, { - "expr": "rate(executor:invalid_leases{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:invalid_leases{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Invalid leases", "refId": "C" }, { - "expr": "rate(executor:evaluator:evaluate_errors{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:evaluator:evaluate_errors{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Evaluate errors", "refId": "D" } ], - "description": "Executor error rates. System failures retry; exhausted means task permanently failed. Invalid leases = malformed from queue service. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Executor error rates. System failures retry; exhausted means task permanently failed. Invalid leases = malformed from queue service. 
[Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -2278,10 +2306,10 @@ data: "h": 1, "w": 24, "x": 0, - "y": 33 + "y": 34 }, - "id": 100, - "title": "Flyte Propeller (V1)", + "id": 400, + "title": "gRPC Client (DP \u2192 CP)", "type": "row", "panels": [ { @@ -2300,36 +2328,37 @@ data: "lineWidth": 1, "showPoints": "never" }, - "unit": "ms" + "unit": "reqps" } }, "gridPos": { "h": 8, - "w": 8, + "w": 12, "x": 0, - "y": 12 + "y": 13 }, - "id": 101, - "title": "Round Time (p50 / p90 / p99)", + "id": 401, + "title": "gRPC Client Request Rate", "type": "timeseries", "targets": [ { - "expr": "flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.5\"}", - "legendFormat": "p50", + "expr": "sum by (grpc_service, grpc_method) (rate(grpc_client_handled_total{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "{{ grpc_service }}/{{ grpc_method }}", "refId": "A" - }, - { - "expr": "flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.9\"}", - "legendFormat": "p90", - "refId": "B" - }, - { - "expr": "flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.99\"}", - "legendFormat": "p99", - "refId": "C" } ], - "description": "Propeller reconciliation round time. One round = one FlyteWorkflow CRD processed." + "description": "DP\u2192CP gRPC client request rate by service and method (e.g. 
CreateWorkflowEvent).", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2347,36 +2376,37 @@ data: "lineWidth": 1, "showPoints": "never" }, - "unit": "ops" + "unit": "reqps" } }, "gridPos": { "h": 8, - "w": 8, - "x": 8, - "y": 12 + "w": 12, + "x": 12, + "y": 13 }, - "id": 102, - "title": "Round Success / Error Rate", + "id": 402, + "title": "gRPC Client Error Rate", "type": "timeseries", "targets": [ { - "expr": "rate(flyte:propeller:all:round:success_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Success", + "expr": "sum by (grpc_service, grpc_method, grpc_code) (rate(grpc_client_handled_total{namespace=\"$namespace\", grpc_code!=\"OK\"}[$__rate_interval]))", + "legendFormat": "{{ grpc_method }} {{ grpc_code }}", "refId": "A" - }, - { - "expr": "rate(flyte:propeller:all:round:error_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Errors", - "refId": "B" - }, - { - "expr": "rate(flyte:propeller:all:round:panic_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Panics", - "refId": "C" } ], - "description": "Propeller round outcomes: success, errors, and panics per second." + "description": "DP\u2192CP gRPC errors by method and code. 
Non-OK responses from the control plane.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2394,564 +2424,37 @@ data: "lineWidth": 1, "showPoints": "never" }, - "unit": "short" + "unit": "s" } }, "gridPos": { "h": 8, - "w": 8, - "x": 16, - "y": 12 + "w": 24, + "x": 0, + "y": 21 }, - "id": 103, - "title": "Free Workers", + "id": 403, + "title": "gRPC Client Latency (p95)", "type": "timeseries", "targets": [ { - "expr": "sum(flyte:propeller:all:free_workers_count{namespace=\"$namespace\"})", - "legendFormat": "Free workers", + "expr": "histogram_quantile(0.95, sum by (le, grpc_method) (rate(grpc_client_handling_seconds_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "{{ grpc_method }} p95", "refId": "A" } ], - "description": "Idle propeller worker goroutines. Zero = all workers busy processing workflows." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ops" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 20 - }, - "id": 104, - "title": "Queue Add Rate", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:propeller:all:main_adds{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Main adds", - "refId": "A" - }, - { - "expr": "rate(flyte:propeller:all:sub_adds{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Sub adds", - "refId": "B" - }, - { - "expr": "rate(flyte:propeller:all:main_retries{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Main retries", - "refId": "C" - } - ], - "description": "Items enqueued to propeller's main/sub workqueues, plus retry rate." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 8, - "y": 20 - }, - "id": 105, - "title": "Workflow Updates", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:propeller:all:wf_updated{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Updated", - "refId": "A" - }, - { - "expr": "rate(flyte:propeller:all:wf_update_failed{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Failed", - "refId": "B" - }, - { - "expr": "rate(flyte:propeller:all:wf_too_large{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Too large", - "refId": "C" - }, - { - "expr": "rate(flyte:propeller:all:wf_update_conflict{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Conflict", - "refId": "D" - } - ], - "description": "etcd write outcomes: successful updates, failures, too-large (>1.5MB), optimistic concurrency conflicts." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ms" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 20 - }, - "id": 106, - "title": "Workflow Update Latency", - "type": "timeseries", - "targets": [ - { - "expr": "flyte:propeller:all:wf_update_latency_ms{namespace=\"$namespace\", quantile=\"0.5\"}", - "legendFormat": "p50", - "refId": "A" - }, - { - "expr": "flyte:propeller:all:wf_update_latency_ms{namespace=\"$namespace\", quantile=\"0.9\"}", - "legendFormat": "p90", - "refId": "B" - }, - { - "expr": "flyte:propeller:all:wf_update_latency_ms{namespace=\"$namespace\", quantile=\"0.99\"}", - "legendFormat": "p99", - "refId": "C" - } - ], - "description": "etcd write latency for FlyteWorkflow status updates." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ms" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 28 - }, - "id": 107, - "title": "Node Queueing & Execution Latency", - "type": "timeseries", - "targets": [ - { - "expr": "flyte:propeller:all:node:queueing_latency_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.5\"}", - "legendFormat": "Queue p50", - "refId": "A" - }, - { - "expr": "flyte:propeller:all:node:queueing_latency_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.9\"}", - "legendFormat": "Queue p90", - "refId": "B" - }, - { - "expr": "flyte:propeller:all:node:node_exec_latency_unlabeled_us{namespace=\"$namespace\", quantile=\"0.9\"} / 1000", - "legendFormat": "Exec p90 (ms)", - "refId": "C" - } - ], - "description": "Node lifecycle: queueing latency 
(queued\u2192running) and execution latency (time in handler)." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "percentunit" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 28 - }, - "id": 108, - "title": "Metastore Cache Hit Rate", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:propeller:all:metastore:cache_hit{namespace=\"$namespace\"}[$__rate_interval]) / (rate(flyte:propeller:all:metastore:cache_hit{namespace=\"$namespace\"}[$__rate_interval]) + rate(flyte:propeller:all:metastore:cache_miss{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Hit rate", - "refId": "A" - } - ], - "description": "In-memory cache effectiveness for object store (S3/GCS) reads. Low hit rate = excessive storage calls." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ops" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 36 - }, - "id": 109, - "title": "Event Recording (DP \u2192 CP)", - "type": "timeseries", - "targets": [ - { - "expr": "sum(rate(flyte:propeller:all:task:event_recording:success_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Task success", - "refId": "A" - }, - { - "expr": "sum(rate(flyte:propeller:all:node:event_recording:success_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Node success", - "refId": "B" - }, - { - "expr": "sum(rate(flyte:propeller:all:task:event_recording:failure_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Task failure", - 
"refId": "C" - }, - { - "expr": "sum(rate(flyte:propeller:all:node:event_recording:failure_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Node failure", - "refId": "D" - } - ], - "description": "Task/node event recording rate from propeller to FlyteAdmin. Failures indicate CP connectivity issues." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 36 - }, - "id": 110, - "title": "Cache Discovery (hit/miss/skip)", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:propeller:all:discovery_hit_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Hits", - "refId": "A" - }, - { - "expr": "rate(flyte:propeller:all:discovery_miss_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Misses", - "refId": "B" - }, - { - "expr": "rate(flyte:propeller:all:discovery_skip_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Skips", - "refId": "C" - }, - { - "expr": "rate(flyte:propeller:all:discovery_get_failure_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Get failures", - "refId": "D" - } - ], - "description": "V2 executor cache discovery miss/put rates for task output caching." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "reqps" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 44 - }, - "id": 111, - "title": "K8s API Client Rate", - "type": "timeseries", - "targets": [ - { - "expr": "sum(rate(k8s_client_request_total_unlabeled{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "K8s requests/s", - "refId": "A" - } - ], - "description": "Propeller's K8s API request rate. High rates may indicate excessive pod watches or creates." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "s" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 44 - }, - "id": 112, - "title": "K8s API Client Latency (p90)", - "type": "timeseries", - "targets": [ - { - "expr": "histogram_quantile(0.90, sum by (le) (rate(k8s_client_request_latency_unlabeled_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "Request p90", - "refId": "A" - }, - { - "expr": "histogram_quantile(0.90, sum by (le) (rate(k8s_client_rate_limiter_latency_unlabeled_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "Rate limiter p90", - "refId": "B" - } - ], - "description": "K8s API request latency and client-side rate limiter wait time at p90." 
- } - ] - }, - { - "collapsed": true, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 34 - }, - "id": 400, - "title": "gRPC Client (DP \u2192 CP)", - "type": "row", - "panels": [ - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "reqps" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 13 - }, - "id": 401, - "title": "gRPC Client Request Rate", - "type": "timeseries", - "targets": [ - { - "expr": "sum by (grpc_service, grpc_method) (rate(grpc_client_handled_total{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "{{ grpc_service }}/{{ grpc_method }}", - "refId": "A" - } - ], - "description": "DP\u2192CP gRPC client request rate by service and method (e.g. CreateWorkflowEvent)." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "reqps" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 13 - }, - "id": 402, - "title": "gRPC Client Error Rate", - "type": "timeseries", - "targets": [ - { - "expr": "sum by (grpc_service, grpc_method, grpc_code) (rate(grpc_client_handled_total{namespace=\"$namespace\", grpc_code!=\"OK\"}[$__rate_interval]))", - "legendFormat": "{{ grpc_method }} {{ grpc_code }}", - "refId": "A" - } - ], - "description": "DP\u2192CP gRPC errors by method and code. Non-OK responses from the control plane." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "s" - } - }, - "gridPos": { - "h": 8, - "w": 24, - "x": 0, - "y": 21 - }, - "id": 403, - "title": "gRPC Client Latency (p95)", - "type": "timeseries", - "targets": [ - { - "expr": "histogram_quantile(0.95, sum by (le, grpc_method) (rate(grpc_client_handling_seconds_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "{{ grpc_method }} p95", - "refId": "A" + "description": "DP\u2192CP gRPC call latency at p95. High latency = slow control plane or network issues.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] } - ], - "description": "DP\u2192CP gRPC call latency at p95. High latency = slow control plane or network issues." + } } ] }, @@ -3005,7 +2508,18 @@ data: "refId": "A" } ], - "description": "CPU usage in cores per container, stacked. Identifies resource-heavy services." + "description": "CPU usage in cores per container, stacked. Identifies resource-heavy services.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3045,7 +2559,18 @@ data: "refId": "A" } ], - "description": "Working set memory per container. Watch for approaching limits." + "description": "Working set memory per container. Watch for approaching limits.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3082,7 +2607,18 @@ data: "refId": "A" } ], - "description": "Per-container restart events. Spikes indicate crashes or OOM kills." + "description": "Per-container restart events. 
Spikes indicate crashes or OOM kills.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] } @@ -3134,7 +2670,7 @@ data: "timepicker": {}, "timezone": "browser", "title": "Union Dataplane Overview", - "uid": "union-dp-overview", + "uid": "union-dataplane-overview", "version": 1 } --- diff --git a/tests/generated/dataplane.gcp.yaml b/tests/generated/dataplane.gcp.yaml index 4cb31e3f..7c13ee1a 100644 --- a/tests/generated/dataplane.gcp.yaml +++ b/tests/generated/dataplane.gcp.yaml @@ -1177,63 +1177,6 @@ data: ], "description": "Total container restarts in the last hour. Non-zero indicates crashlooping or OOM kills." }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "thresholds": { - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 10 - }, - { - "color": "red", - "value": 50 - } - ] - }, - "unit": "short" - } - }, - "gridPos": { - "h": 4, - "w": 6, - "x": 18, - "y": 1 - }, - "id": 8, - "options": { - "colorMode": "background", - "graphMode": "area", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ] - }, - "textMode": "auto" - }, - "title": "Active Workflows", - "type": "stat", - "targets": [ - { - "expr": "sum(flyte:propeller:all:execstats:active_workflow_executions{namespace=\"$namespace\"})", - "legendFormat": "Workflows", - "refId": "A" - } - ], - "description": "Current active FlyteWorkflow CRD count managed by Propeller." - }, { "datasource": { "type": "prometheus", @@ -1263,8 +1206,9 @@ data: "options": { "legend": { "calcs": [ - "mean", - "max" + "min", + "max", + "lastNotNull" ], "displayMode": "table", "placement": "bottom" @@ -1295,59 +1239,6 @@ data: ], "description": "Current count of active workflow, node, and task executions tracked by FlyteAdmin." 
}, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 6, - "w": 12, - "x": 12, - "y": 5 - }, - "id": 7, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom" - }, - "tooltip": { - "mode": "multi", - "sort": "desc" - } - }, - "title": "Queue Depth", - "type": "timeseries", - "targets": [ - { - "expr": "sum(flyte:propeller:all:main_depth{namespace=\"$namespace\"})", - "legendFormat": "Main", - "refId": "A" - }, - { - "expr": "sum(flyte:propeller:all:sub_depth{namespace=\"$namespace\"})", - "legendFormat": "Sub", - "refId": "B" - } - ], - "description": "Main and sub workqueue depth over time." - }, { "datasource": { "type": "prometheus", @@ -1633,7 +1524,7 @@ data: "y": 10 }, "id": 1204, - "title": "Propeller Latency p99", + "title": "Executor Evaluate Duration p99", "type": "stat", "options": { "colorMode": "background", @@ -1647,8 +1538,13 @@ data: }, "targets": [ { - "expr": "(union:dp:slo:propeller_round_latency_p99 or flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.99\"} / 1000)", - "refId": "A" + "expr": "executor::v2:evaluator:evaluate_duration_ms{namespace=\"$namespace\", quantile=\"0.99\"}", + "legendFormat": "p99", + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + } } ], "description": "Propeller round p99 latency in seconds. Falls back to raw metric if SLO rules not enabled." @@ -1693,7 +1589,18 @@ data: "refId": "B" } ], - "description": "DP service availability over time with SLO target line." 
+ "description": "DP service availability over time with SLO target line.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1737,7 +1644,18 @@ data: "refId": "B" } ], - "description": "DP error budget remaining over time. Requires monitoring.slos.enabled." + "description": "DP error budget remaining over time. Requires monitoring.slos.enabled.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "collapsed": true, @@ -1791,7 +1709,18 @@ data: "refId": "B" } ], - "description": "Operator execution operation processing rate and failure rate." + "description": "Operator execution operation processing rate and failure rate.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1848,7 +1777,18 @@ data: "refId": "E" } ], - "description": "FlyteAdmin error rates: propeller communication failures, model transform errors, notification publish failures." + "description": "FlyteAdmin error rates: propeller communication failures, model transform errors, notification publish failures.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1895,7 +1835,18 @@ data: "refId": "C" } ], - "description": "Breakdown of operator heartbeat computation: capabilities, usages, list workflows." 
+ "description": "Breakdown of operator heartbeat computation: capabilities, usages, list workflows.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1942,7 +1893,18 @@ data: "refId": "C" } ], - "description": "Config sync cycle rate, errors, and propeller ConfigMap update count." + "description": "Config sync cycle rate, errors, and propeller ConfigMap update count.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1984,7 +1946,18 @@ data: "refId": "B" } ], - "description": "Billing collection cycle rate and errors. Failures mean billing data may be delayed." + "description": "Billing collection cycle rate and errors. Failures mean billing data may be delayed.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2083,17 +2056,28 @@ data: "type": "timeseries", "targets": [ { - "expr": "executor:active_actions_count{namespace=\"$namespace\"}", + "expr": "executor::v2:active_actions_count{namespace=\"$namespace\"}", "legendFormat": "Active actions", "refId": "A" }, { - "expr": "executor:available_capacity{namespace=\"$namespace\"}", + "expr": "executor::v2:available_capacity{namespace=\"$namespace\"}", "legendFormat": "Available capacity", "refId": "B" } ], - "description": "V2 executor active actions vs available capacity. Capacity=0 means executor is saturated." + "description": "V2 executor active actions vs available capacity. 
Capacity=0 means executor is saturated.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2125,17 +2109,28 @@ data: "type": "timeseries", "targets": [ { - "expr": "rate(executor:discovery_miss_count{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:discovery_miss_count{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Miss", "refId": "A" }, { - "expr": "rate(executor:discovery_put_success_count{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:discovery_put_success_count{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Put success", "refId": "B" } ], - "description": "V2 executor cache discovery miss/put rates for task output caching." + "description": "V2 executor cache discovery miss/put rates for task output caching.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2170,12 +2165,23 @@ data: "type": "timeseries", "targets": [ { - "expr": "sum by (phase) (rate(executor:actions_terminated{namespace=\"$namespace\"}[$__rate_interval]))", + "expr": "sum by (phase) (rate(executor::v2:actions_terminated{namespace=\"$namespace\"}[$__rate_interval]))", "legendFormat": "{{ phase }}", "refId": "A" } ], - "description": "Task completion rate by outcome: succeeded, failed, aborted. Key V2 SLI for task health. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Task completion rate by outcome: succeeded, failed, aborted. Key V2 SLI for task health. 
[Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2207,22 +2213,33 @@ data: "type": "timeseries", "targets": [ { - "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.5\"}", + "expr": "executor::v2:evaluator:evaluate_duration_ms{namespace=\"$namespace\", quantile=\"0.5\"}", "legendFormat": "Evaluate p50", "refId": "A" }, { - "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.9\"}", + "expr": "executor::v2:evaluator:evaluate_duration_ms{namespace=\"$namespace\", quantile=\"0.9\"}", "legendFormat": "Evaluate p90", "refId": "B" }, { - "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.99\"}", + "expr": "executor::v2:evaluator:evaluate_duration_ms{namespace=\"$namespace\", quantile=\"0.99\"}", "legendFormat": "Evaluate p99", "refId": "C" } ], - "description": "Time spent in RecursiveNodeHandler (pod creation). Dominant component of V2 task latency. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Time spent in RecursiveNodeHandler (pod creation). Dominant component of V2 task latency. 
[Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2254,27 +2271,38 @@ data: "type": "timeseries", "targets": [ { - "expr": "rate(executor:system_failures{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:system_failures{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "System failures", "refId": "A" }, { - "expr": "rate(executor:system_failures_exhausted{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:system_failures_exhausted{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Exhausted retries", "refId": "B" }, { - "expr": "rate(executor:invalid_leases{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:invalid_leases{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Invalid leases", "refId": "C" }, { - "expr": "rate(executor:evaluator:evaluate_errors{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:evaluator:evaluate_errors{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Evaluate errors", "refId": "D" } ], - "description": "Executor error rates. System failures retry; exhausted means task permanently failed. Invalid leases = malformed from queue service. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Executor error rates. System failures retry; exhausted means task permanently failed. Invalid leases = malformed from queue service. 
[Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -2284,10 +2312,10 @@ data: "h": 1, "w": 24, "x": 0, - "y": 33 + "y": 34 }, - "id": 100, - "title": "Flyte Propeller (V1)", + "id": 400, + "title": "gRPC Client (DP \u2192 CP)", "type": "row", "panels": [ { @@ -2306,36 +2334,37 @@ data: "lineWidth": 1, "showPoints": "never" }, - "unit": "ms" + "unit": "reqps" } }, "gridPos": { "h": 8, - "w": 8, + "w": 12, "x": 0, - "y": 12 + "y": 13 }, - "id": 101, - "title": "Round Time (p50 / p90 / p99)", + "id": 401, + "title": "gRPC Client Request Rate", "type": "timeseries", "targets": [ { - "expr": "flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.5\"}", - "legendFormat": "p50", + "expr": "sum by (grpc_service, grpc_method) (rate(grpc_client_handled_total{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "{{ grpc_service }}/{{ grpc_method }}", "refId": "A" - }, - { - "expr": "flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.9\"}", - "legendFormat": "p90", - "refId": "B" - }, - { - "expr": "flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.99\"}", - "legendFormat": "p99", - "refId": "C" } ], - "description": "Propeller reconciliation round time. One round = one FlyteWorkflow CRD processed." + "description": "DP\u2192CP gRPC client request rate by service and method (e.g. 
CreateWorkflowEvent).", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2353,36 +2382,37 @@ data: "lineWidth": 1, "showPoints": "never" }, - "unit": "ops" + "unit": "reqps" } }, "gridPos": { "h": 8, - "w": 8, - "x": 8, - "y": 12 + "w": 12, + "x": 12, + "y": 13 }, - "id": 102, - "title": "Round Success / Error Rate", + "id": 402, + "title": "gRPC Client Error Rate", "type": "timeseries", "targets": [ { - "expr": "rate(flyte:propeller:all:round:success_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Success", + "expr": "sum by (grpc_service, grpc_method, grpc_code) (rate(grpc_client_handled_total{namespace=\"$namespace\", grpc_code!=\"OK\"}[$__rate_interval]))", + "legendFormat": "{{ grpc_method }} {{ grpc_code }}", "refId": "A" - }, - { - "expr": "rate(flyte:propeller:all:round:error_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Errors", - "refId": "B" - }, - { - "expr": "rate(flyte:propeller:all:round:panic_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Panics", - "refId": "C" } ], - "description": "Propeller round outcomes: success, errors, and panics per second." + "description": "DP\u2192CP gRPC errors by method and code. 
Non-OK responses from the control plane.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2400,564 +2430,37 @@ data: "lineWidth": 1, "showPoints": "never" }, - "unit": "short" + "unit": "s" } }, "gridPos": { "h": 8, - "w": 8, - "x": 16, - "y": 12 + "w": 24, + "x": 0, + "y": 21 }, - "id": 103, - "title": "Free Workers", + "id": 403, + "title": "gRPC Client Latency (p95)", "type": "timeseries", "targets": [ { - "expr": "sum(flyte:propeller:all:free_workers_count{namespace=\"$namespace\"})", - "legendFormat": "Free workers", + "expr": "histogram_quantile(0.95, sum by (le, grpc_method) (rate(grpc_client_handling_seconds_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "{{ grpc_method }} p95", "refId": "A" } ], - "description": "Idle propeller worker goroutines. Zero = all workers busy processing workflows." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ops" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 20 - }, - "id": 104, - "title": "Queue Add Rate", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:propeller:all:main_adds{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Main adds", - "refId": "A" - }, - { - "expr": "rate(flyte:propeller:all:sub_adds{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Sub adds", - "refId": "B" - }, - { - "expr": "rate(flyte:propeller:all:main_retries{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Main retries", - "refId": "C" - } - ], - "description": "Items enqueued to propeller's main/sub workqueues, plus retry rate." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 8, - "y": 20 - }, - "id": 105, - "title": "Workflow Updates", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:propeller:all:wf_updated{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Updated", - "refId": "A" - }, - { - "expr": "rate(flyte:propeller:all:wf_update_failed{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Failed", - "refId": "B" - }, - { - "expr": "rate(flyte:propeller:all:wf_too_large{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Too large", - "refId": "C" - }, - { - "expr": "rate(flyte:propeller:all:wf_update_conflict{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Conflict", - "refId": "D" - } - ], - "description": "etcd write outcomes: successful updates, failures, too-large (>1.5MB), optimistic concurrency conflicts." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ms" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 20 - }, - "id": 106, - "title": "Workflow Update Latency", - "type": "timeseries", - "targets": [ - { - "expr": "flyte:propeller:all:wf_update_latency_ms{namespace=\"$namespace\", quantile=\"0.5\"}", - "legendFormat": "p50", - "refId": "A" - }, - { - "expr": "flyte:propeller:all:wf_update_latency_ms{namespace=\"$namespace\", quantile=\"0.9\"}", - "legendFormat": "p90", - "refId": "B" - }, - { - "expr": "flyte:propeller:all:wf_update_latency_ms{namespace=\"$namespace\", quantile=\"0.99\"}", - "legendFormat": "p99", - "refId": "C" - } - ], - "description": "etcd write latency for FlyteWorkflow status updates." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ms" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 28 - }, - "id": 107, - "title": "Node Queueing & Execution Latency", - "type": "timeseries", - "targets": [ - { - "expr": "flyte:propeller:all:node:queueing_latency_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.5\"}", - "legendFormat": "Queue p50", - "refId": "A" - }, - { - "expr": "flyte:propeller:all:node:queueing_latency_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.9\"}", - "legendFormat": "Queue p90", - "refId": "B" - }, - { - "expr": "flyte:propeller:all:node:node_exec_latency_unlabeled_us{namespace=\"$namespace\", quantile=\"0.9\"} / 1000", - "legendFormat": "Exec p90 (ms)", - "refId": "C" - } - ], - "description": "Node lifecycle: queueing latency 
(queued\u2192running) and execution latency (time in handler)." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "percentunit" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 28 - }, - "id": 108, - "title": "Metastore Cache Hit Rate", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:propeller:all:metastore:cache_hit{namespace=\"$namespace\"}[$__rate_interval]) / (rate(flyte:propeller:all:metastore:cache_hit{namespace=\"$namespace\"}[$__rate_interval]) + rate(flyte:propeller:all:metastore:cache_miss{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Hit rate", - "refId": "A" - } - ], - "description": "In-memory cache effectiveness for object store (S3/GCS) reads. Low hit rate = excessive storage calls." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ops" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 36 - }, - "id": 109, - "title": "Event Recording (DP \u2192 CP)", - "type": "timeseries", - "targets": [ - { - "expr": "sum(rate(flyte:propeller:all:task:event_recording:success_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Task success", - "refId": "A" - }, - { - "expr": "sum(rate(flyte:propeller:all:node:event_recording:success_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Node success", - "refId": "B" - }, - { - "expr": "sum(rate(flyte:propeller:all:task:event_recording:failure_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Task failure", - 
"refId": "C" - }, - { - "expr": "sum(rate(flyte:propeller:all:node:event_recording:failure_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Node failure", - "refId": "D" - } - ], - "description": "Task/node event recording rate from propeller to FlyteAdmin. Failures indicate CP connectivity issues." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 36 - }, - "id": 110, - "title": "Cache Discovery (hit/miss/skip)", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:propeller:all:discovery_hit_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Hits", - "refId": "A" - }, - { - "expr": "rate(flyte:propeller:all:discovery_miss_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Misses", - "refId": "B" - }, - { - "expr": "rate(flyte:propeller:all:discovery_skip_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Skips", - "refId": "C" - }, - { - "expr": "rate(flyte:propeller:all:discovery_get_failure_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Get failures", - "refId": "D" - } - ], - "description": "V2 executor cache discovery miss/put rates for task output caching." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "reqps" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 44 - }, - "id": 111, - "title": "K8s API Client Rate", - "type": "timeseries", - "targets": [ - { - "expr": "sum(rate(k8s_client_request_total_unlabeled{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "K8s requests/s", - "refId": "A" - } - ], - "description": "Propeller's K8s API request rate. High rates may indicate excessive pod watches or creates." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "s" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 44 - }, - "id": 112, - "title": "K8s API Client Latency (p90)", - "type": "timeseries", - "targets": [ - { - "expr": "histogram_quantile(0.90, sum by (le) (rate(k8s_client_request_latency_unlabeled_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "Request p90", - "refId": "A" - }, - { - "expr": "histogram_quantile(0.90, sum by (le) (rate(k8s_client_rate_limiter_latency_unlabeled_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "Rate limiter p90", - "refId": "B" - } - ], - "description": "K8s API request latency and client-side rate limiter wait time at p90." 
- } - ] - }, - { - "collapsed": true, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 34 - }, - "id": 400, - "title": "gRPC Client (DP \u2192 CP)", - "type": "row", - "panels": [ - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "reqps" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 13 - }, - "id": 401, - "title": "gRPC Client Request Rate", - "type": "timeseries", - "targets": [ - { - "expr": "sum by (grpc_service, grpc_method) (rate(grpc_client_handled_total{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "{{ grpc_service }}/{{ grpc_method }}", - "refId": "A" - } - ], - "description": "DP\u2192CP gRPC client request rate by service and method (e.g. CreateWorkflowEvent)." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "reqps" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 13 - }, - "id": 402, - "title": "gRPC Client Error Rate", - "type": "timeseries", - "targets": [ - { - "expr": "sum by (grpc_service, grpc_method, grpc_code) (rate(grpc_client_handled_total{namespace=\"$namespace\", grpc_code!=\"OK\"}[$__rate_interval]))", - "legendFormat": "{{ grpc_method }} {{ grpc_code }}", - "refId": "A" - } - ], - "description": "DP\u2192CP gRPC errors by method and code. Non-OK responses from the control plane." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "s" - } - }, - "gridPos": { - "h": 8, - "w": 24, - "x": 0, - "y": 21 - }, - "id": 403, - "title": "gRPC Client Latency (p95)", - "type": "timeseries", - "targets": [ - { - "expr": "histogram_quantile(0.95, sum by (le, grpc_method) (rate(grpc_client_handling_seconds_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "{{ grpc_method }} p95", - "refId": "A" + "description": "DP\u2192CP gRPC call latency at p95. High latency = slow control plane or network issues.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] } - ], - "description": "DP\u2192CP gRPC call latency at p95. High latency = slow control plane or network issues." + } } ] }, @@ -3011,7 +2514,18 @@ data: "refId": "A" } ], - "description": "CPU usage in cores per container, stacked. Identifies resource-heavy services." + "description": "CPU usage in cores per container, stacked. Identifies resource-heavy services.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3051,7 +2565,18 @@ data: "refId": "A" } ], - "description": "Working set memory per container. Watch for approaching limits." + "description": "Working set memory per container. Watch for approaching limits.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3088,7 +2613,18 @@ data: "refId": "A" } ], - "description": "Per-container restart events. Spikes indicate crashes or OOM kills." + "description": "Per-container restart events. 
Spikes indicate crashes or OOM kills.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] } @@ -3140,7 +2676,7 @@ data: "timepicker": {}, "timezone": "browser", "title": "Union Dataplane Overview", - "uid": "union-dp-overview", + "uid": "union-dataplane-overview", "version": 1 } --- diff --git a/tests/generated/dataplane.low-priv.yaml b/tests/generated/dataplane.low-priv.yaml index 27ee6c9d..81a719d1 100644 --- a/tests/generated/dataplane.low-priv.yaml +++ b/tests/generated/dataplane.low-priv.yaml @@ -1186,63 +1186,6 @@ data: ], "description": "Total container restarts in the last hour. Non-zero indicates crashlooping or OOM kills." }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "thresholds": { - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 10 - }, - { - "color": "red", - "value": 50 - } - ] - }, - "unit": "short" - } - }, - "gridPos": { - "h": 4, - "w": 6, - "x": 18, - "y": 1 - }, - "id": 8, - "options": { - "colorMode": "background", - "graphMode": "area", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ] - }, - "textMode": "auto" - }, - "title": "Active Workflows", - "type": "stat", - "targets": [ - { - "expr": "sum(flyte:propeller:all:execstats:active_workflow_executions{namespace=\"$namespace\"})", - "legendFormat": "Workflows", - "refId": "A" - } - ], - "description": "Current active FlyteWorkflow CRD count managed by Propeller." - }, { "datasource": { "type": "prometheus", @@ -1272,8 +1215,9 @@ data: "options": { "legend": { "calcs": [ - "mean", - "max" + "min", + "max", + "lastNotNull" ], "displayMode": "table", "placement": "bottom" @@ -1304,59 +1248,6 @@ data: ], "description": "Current count of active workflow, node, and task executions tracked by FlyteAdmin." 
}, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 6, - "w": 12, - "x": 12, - "y": 5 - }, - "id": 7, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom" - }, - "tooltip": { - "mode": "multi", - "sort": "desc" - } - }, - "title": "Queue Depth", - "type": "timeseries", - "targets": [ - { - "expr": "sum(flyte:propeller:all:main_depth{namespace=\"$namespace\"})", - "legendFormat": "Main", - "refId": "A" - }, - { - "expr": "sum(flyte:propeller:all:sub_depth{namespace=\"$namespace\"})", - "legendFormat": "Sub", - "refId": "B" - } - ], - "description": "Main and sub workqueue depth over time." - }, { "datasource": { "type": "prometheus", @@ -1642,7 +1533,7 @@ data: "y": 10 }, "id": 1204, - "title": "Propeller Latency p99", + "title": "Executor Evaluate Duration p99", "type": "stat", "options": { "colorMode": "background", @@ -1656,8 +1547,13 @@ data: }, "targets": [ { - "expr": "(union:dp:slo:propeller_round_latency_p99 or flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.99\"} / 1000)", - "refId": "A" + "expr": "executor::v2:evaluator:evaluate_duration_ms{namespace=\"$namespace\", quantile=\"0.99\"}", + "legendFormat": "p99", + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + } } ], "description": "Propeller round p99 latency in seconds. Falls back to raw metric if SLO rules not enabled." @@ -1702,7 +1598,18 @@ data: "refId": "B" } ], - "description": "DP service availability over time with SLO target line." 
+ "description": "DP service availability over time with SLO target line.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1746,7 +1653,18 @@ data: "refId": "B" } ], - "description": "DP error budget remaining over time. Requires monitoring.slos.enabled." + "description": "DP error budget remaining over time. Requires monitoring.slos.enabled.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "collapsed": true, @@ -1800,7 +1718,18 @@ data: "refId": "B" } ], - "description": "Operator execution operation processing rate and failure rate." + "description": "Operator execution operation processing rate and failure rate.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1857,7 +1786,18 @@ data: "refId": "E" } ], - "description": "FlyteAdmin error rates: propeller communication failures, model transform errors, notification publish failures." + "description": "FlyteAdmin error rates: propeller communication failures, model transform errors, notification publish failures.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1904,7 +1844,18 @@ data: "refId": "C" } ], - "description": "Breakdown of operator heartbeat computation: capabilities, usages, list workflows." 
+ "description": "Breakdown of operator heartbeat computation: capabilities, usages, list workflows.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1951,7 +1902,18 @@ data: "refId": "C" } ], - "description": "Config sync cycle rate, errors, and propeller ConfigMap update count." + "description": "Config sync cycle rate, errors, and propeller ConfigMap update count.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1993,7 +1955,18 @@ data: "refId": "B" } ], - "description": "Billing collection cycle rate and errors. Failures mean billing data may be delayed." + "description": "Billing collection cycle rate and errors. Failures mean billing data may be delayed.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2092,17 +2065,28 @@ data: "type": "timeseries", "targets": [ { - "expr": "executor:active_actions_count{namespace=\"$namespace\"}", + "expr": "executor::v2:active_actions_count{namespace=\"$namespace\"}", "legendFormat": "Active actions", "refId": "A" }, { - "expr": "executor:available_capacity{namespace=\"$namespace\"}", + "expr": "executor::v2:available_capacity{namespace=\"$namespace\"}", "legendFormat": "Available capacity", "refId": "B" } ], - "description": "V2 executor active actions vs available capacity. Capacity=0 means executor is saturated." + "description": "V2 executor active actions vs available capacity. 
Capacity=0 means executor is saturated.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2134,17 +2118,28 @@ data: "type": "timeseries", "targets": [ { - "expr": "rate(executor:discovery_miss_count{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:discovery_miss_count{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Miss", "refId": "A" }, { - "expr": "rate(executor:discovery_put_success_count{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:discovery_put_success_count{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Put success", "refId": "B" } ], - "description": "V2 executor cache discovery miss/put rates for task output caching." + "description": "V2 executor cache discovery miss/put rates for task output caching.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2179,12 +2174,23 @@ data: "type": "timeseries", "targets": [ { - "expr": "sum by (phase) (rate(executor:actions_terminated{namespace=\"$namespace\"}[$__rate_interval]))", + "expr": "sum by (phase) (rate(executor::v2:actions_terminated{namespace=\"$namespace\"}[$__rate_interval]))", "legendFormat": "{{ phase }}", "refId": "A" } ], - "description": "Task completion rate by outcome: succeeded, failed, aborted. Key V2 SLI for task health. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Task completion rate by outcome: succeeded, failed, aborted. Key V2 SLI for task health. 
[Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2216,22 +2222,33 @@ data: "type": "timeseries", "targets": [ { - "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.5\"}", + "expr": "executor::v2:evaluator:evaluate_duration_ms{namespace=\"$namespace\", quantile=\"0.5\"}", "legendFormat": "Evaluate p50", "refId": "A" }, { - "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.9\"}", + "expr": "executor::v2:evaluator:evaluate_duration_ms{namespace=\"$namespace\", quantile=\"0.9\"}", "legendFormat": "Evaluate p90", "refId": "B" }, { - "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.99\"}", + "expr": "executor::v2:evaluator:evaluate_duration_ms{namespace=\"$namespace\", quantile=\"0.99\"}", "legendFormat": "Evaluate p99", "refId": "C" } ], - "description": "Time spent in RecursiveNodeHandler (pod creation). Dominant component of V2 task latency. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Time spent in RecursiveNodeHandler (pod creation). Dominant component of V2 task latency. 
[Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2263,27 +2280,38 @@ data: "type": "timeseries", "targets": [ { - "expr": "rate(executor:system_failures{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:system_failures{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "System failures", "refId": "A" }, { - "expr": "rate(executor:system_failures_exhausted{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:system_failures_exhausted{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Exhausted retries", "refId": "B" }, { - "expr": "rate(executor:invalid_leases{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:invalid_leases{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Invalid leases", "refId": "C" }, { - "expr": "rate(executor:evaluator:evaluate_errors{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:evaluator:evaluate_errors{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Evaluate errors", "refId": "D" } ], - "description": "Executor error rates. System failures retry; exhausted means task permanently failed. Invalid leases = malformed from queue service. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Executor error rates. System failures retry; exhausted means task permanently failed. Invalid leases = malformed from queue service. 
[Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -2293,10 +2321,10 @@ data: "h": 1, "w": 24, "x": 0, - "y": 33 + "y": 34 }, - "id": 100, - "title": "Flyte Propeller (V1)", + "id": 400, + "title": "gRPC Client (DP \u2192 CP)", "type": "row", "panels": [ { @@ -2315,36 +2343,37 @@ data: "lineWidth": 1, "showPoints": "never" }, - "unit": "ms" + "unit": "reqps" } }, "gridPos": { "h": 8, - "w": 8, + "w": 12, "x": 0, - "y": 12 + "y": 13 }, - "id": 101, - "title": "Round Time (p50 / p90 / p99)", + "id": 401, + "title": "gRPC Client Request Rate", "type": "timeseries", "targets": [ { - "expr": "flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.5\"}", - "legendFormat": "p50", + "expr": "sum by (grpc_service, grpc_method) (rate(grpc_client_handled_total{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "{{ grpc_service }}/{{ grpc_method }}", "refId": "A" - }, - { - "expr": "flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.9\"}", - "legendFormat": "p90", - "refId": "B" - }, - { - "expr": "flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.99\"}", - "legendFormat": "p99", - "refId": "C" } ], - "description": "Propeller reconciliation round time. One round = one FlyteWorkflow CRD processed." + "description": "DP\u2192CP gRPC client request rate by service and method (e.g. 
CreateWorkflowEvent).", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2362,36 +2391,37 @@ data: "lineWidth": 1, "showPoints": "never" }, - "unit": "ops" + "unit": "reqps" } }, "gridPos": { "h": 8, - "w": 8, - "x": 8, - "y": 12 + "w": 12, + "x": 12, + "y": 13 }, - "id": 102, - "title": "Round Success / Error Rate", + "id": 402, + "title": "gRPC Client Error Rate", "type": "timeseries", "targets": [ { - "expr": "rate(flyte:propeller:all:round:success_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Success", + "expr": "sum by (grpc_service, grpc_method, grpc_code) (rate(grpc_client_handled_total{namespace=\"$namespace\", grpc_code!=\"OK\"}[$__rate_interval]))", + "legendFormat": "{{ grpc_method }} {{ grpc_code }}", "refId": "A" - }, - { - "expr": "rate(flyte:propeller:all:round:error_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Errors", - "refId": "B" - }, - { - "expr": "rate(flyte:propeller:all:round:panic_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Panics", - "refId": "C" } ], - "description": "Propeller round outcomes: success, errors, and panics per second." + "description": "DP\u2192CP gRPC errors by method and code. 
Non-OK responses from the control plane.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2409,564 +2439,37 @@ data: "lineWidth": 1, "showPoints": "never" }, - "unit": "short" + "unit": "s" } }, "gridPos": { "h": 8, - "w": 8, - "x": 16, - "y": 12 + "w": 24, + "x": 0, + "y": 21 }, - "id": 103, - "title": "Free Workers", + "id": 403, + "title": "gRPC Client Latency (p95)", "type": "timeseries", "targets": [ { - "expr": "sum(flyte:propeller:all:free_workers_count{namespace=\"$namespace\"})", - "legendFormat": "Free workers", + "expr": "histogram_quantile(0.95, sum by (le, grpc_method) (rate(grpc_client_handling_seconds_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "{{ grpc_method }} p95", "refId": "A" } ], - "description": "Idle propeller worker goroutines. Zero = all workers busy processing workflows." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ops" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 20 - }, - "id": 104, - "title": "Queue Add Rate", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:propeller:all:main_adds{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Main adds", - "refId": "A" - }, - { - "expr": "rate(flyte:propeller:all:sub_adds{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Sub adds", - "refId": "B" - }, - { - "expr": "rate(flyte:propeller:all:main_retries{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Main retries", - "refId": "C" - } - ], - "description": "Items enqueued to propeller's main/sub workqueues, plus retry rate." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 8, - "y": 20 - }, - "id": 105, - "title": "Workflow Updates", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:propeller:all:wf_updated{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Updated", - "refId": "A" - }, - { - "expr": "rate(flyte:propeller:all:wf_update_failed{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Failed", - "refId": "B" - }, - { - "expr": "rate(flyte:propeller:all:wf_too_large{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Too large", - "refId": "C" - }, - { - "expr": "rate(flyte:propeller:all:wf_update_conflict{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Conflict", - "refId": "D" - } - ], - "description": "etcd write outcomes: successful updates, failures, too-large (>1.5MB), optimistic concurrency conflicts." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ms" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 20 - }, - "id": 106, - "title": "Workflow Update Latency", - "type": "timeseries", - "targets": [ - { - "expr": "flyte:propeller:all:wf_update_latency_ms{namespace=\"$namespace\", quantile=\"0.5\"}", - "legendFormat": "p50", - "refId": "A" - }, - { - "expr": "flyte:propeller:all:wf_update_latency_ms{namespace=\"$namespace\", quantile=\"0.9\"}", - "legendFormat": "p90", - "refId": "B" - }, - { - "expr": "flyte:propeller:all:wf_update_latency_ms{namespace=\"$namespace\", quantile=\"0.99\"}", - "legendFormat": "p99", - "refId": "C" - } - ], - "description": "etcd write latency for FlyteWorkflow status updates." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ms" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 28 - }, - "id": 107, - "title": "Node Queueing & Execution Latency", - "type": "timeseries", - "targets": [ - { - "expr": "flyte:propeller:all:node:queueing_latency_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.5\"}", - "legendFormat": "Queue p50", - "refId": "A" - }, - { - "expr": "flyte:propeller:all:node:queueing_latency_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.9\"}", - "legendFormat": "Queue p90", - "refId": "B" - }, - { - "expr": "flyte:propeller:all:node:node_exec_latency_unlabeled_us{namespace=\"$namespace\", quantile=\"0.9\"} / 1000", - "legendFormat": "Exec p90 (ms)", - "refId": "C" - } - ], - "description": "Node lifecycle: queueing latency 
(queued\u2192running) and execution latency (time in handler)." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "percentunit" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 28 - }, - "id": 108, - "title": "Metastore Cache Hit Rate", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:propeller:all:metastore:cache_hit{namespace=\"$namespace\"}[$__rate_interval]) / (rate(flyte:propeller:all:metastore:cache_hit{namespace=\"$namespace\"}[$__rate_interval]) + rate(flyte:propeller:all:metastore:cache_miss{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Hit rate", - "refId": "A" - } - ], - "description": "In-memory cache effectiveness for object store (S3/GCS) reads. Low hit rate = excessive storage calls." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ops" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 36 - }, - "id": 109, - "title": "Event Recording (DP \u2192 CP)", - "type": "timeseries", - "targets": [ - { - "expr": "sum(rate(flyte:propeller:all:task:event_recording:success_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Task success", - "refId": "A" - }, - { - "expr": "sum(rate(flyte:propeller:all:node:event_recording:success_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Node success", - "refId": "B" - }, - { - "expr": "sum(rate(flyte:propeller:all:task:event_recording:failure_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Task failure", - 
"refId": "C" - }, - { - "expr": "sum(rate(flyte:propeller:all:node:event_recording:failure_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Node failure", - "refId": "D" - } - ], - "description": "Task/node event recording rate from propeller to FlyteAdmin. Failures indicate CP connectivity issues." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 36 - }, - "id": 110, - "title": "Cache Discovery (hit/miss/skip)", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:propeller:all:discovery_hit_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Hits", - "refId": "A" - }, - { - "expr": "rate(flyte:propeller:all:discovery_miss_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Misses", - "refId": "B" - }, - { - "expr": "rate(flyte:propeller:all:discovery_skip_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Skips", - "refId": "C" - }, - { - "expr": "rate(flyte:propeller:all:discovery_get_failure_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Get failures", - "refId": "D" - } - ], - "description": "V2 executor cache discovery miss/put rates for task output caching." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "reqps" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 44 - }, - "id": 111, - "title": "K8s API Client Rate", - "type": "timeseries", - "targets": [ - { - "expr": "sum(rate(k8s_client_request_total_unlabeled{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "K8s requests/s", - "refId": "A" - } - ], - "description": "Propeller's K8s API request rate. High rates may indicate excessive pod watches or creates." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "s" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 44 - }, - "id": 112, - "title": "K8s API Client Latency (p90)", - "type": "timeseries", - "targets": [ - { - "expr": "histogram_quantile(0.90, sum by (le) (rate(k8s_client_request_latency_unlabeled_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "Request p90", - "refId": "A" - }, - { - "expr": "histogram_quantile(0.90, sum by (le) (rate(k8s_client_rate_limiter_latency_unlabeled_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "Rate limiter p90", - "refId": "B" - } - ], - "description": "K8s API request latency and client-side rate limiter wait time at p90." 
- } - ] - }, - { - "collapsed": true, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 34 - }, - "id": 400, - "title": "gRPC Client (DP \u2192 CP)", - "type": "row", - "panels": [ - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "reqps" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 13 - }, - "id": 401, - "title": "gRPC Client Request Rate", - "type": "timeseries", - "targets": [ - { - "expr": "sum by (grpc_service, grpc_method) (rate(grpc_client_handled_total{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "{{ grpc_service }}/{{ grpc_method }}", - "refId": "A" - } - ], - "description": "DP\u2192CP gRPC client request rate by service and method (e.g. CreateWorkflowEvent)." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "reqps" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 13 - }, - "id": 402, - "title": "gRPC Client Error Rate", - "type": "timeseries", - "targets": [ - { - "expr": "sum by (grpc_service, grpc_method, grpc_code) (rate(grpc_client_handled_total{namespace=\"$namespace\", grpc_code!=\"OK\"}[$__rate_interval]))", - "legendFormat": "{{ grpc_method }} {{ grpc_code }}", - "refId": "A" - } - ], - "description": "DP\u2192CP gRPC errors by method and code. Non-OK responses from the control plane." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "s" - } - }, - "gridPos": { - "h": 8, - "w": 24, - "x": 0, - "y": 21 - }, - "id": 403, - "title": "gRPC Client Latency (p95)", - "type": "timeseries", - "targets": [ - { - "expr": "histogram_quantile(0.95, sum by (le, grpc_method) (rate(grpc_client_handling_seconds_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "{{ grpc_method }} p95", - "refId": "A" + "description": "DP\u2192CP gRPC call latency at p95. High latency = slow control plane or network issues.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] } - ], - "description": "DP\u2192CP gRPC call latency at p95. High latency = slow control plane or network issues." + } } ] }, @@ -3020,7 +2523,18 @@ data: "refId": "A" } ], - "description": "CPU usage in cores per container, stacked. Identifies resource-heavy services." + "description": "CPU usage in cores per container, stacked. Identifies resource-heavy services.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3060,7 +2574,18 @@ data: "refId": "A" } ], - "description": "Working set memory per container. Watch for approaching limits." + "description": "Working set memory per container. Watch for approaching limits.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3097,7 +2622,18 @@ data: "refId": "A" } ], - "description": "Per-container restart events. Spikes indicate crashes or OOM kills." + "description": "Per-container restart events. 
Spikes indicate crashes or OOM kills.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] } @@ -3149,7 +2685,7 @@ data: "timepicker": {}, "timezone": "browser", "title": "Union Dataplane Overview", - "uid": "union-dp-overview", + "uid": "union-dataplane-overview", "version": 1 } --- diff --git a/tests/generated/dataplane.monitoring.yaml b/tests/generated/dataplane.monitoring.yaml index c914cd2c..397cf4ba 100644 --- a/tests/generated/dataplane.monitoring.yaml +++ b/tests/generated/dataplane.monitoring.yaml @@ -1991,63 +1991,6 @@ data: ], "description": "Total container restarts in the last hour. Non-zero indicates crashlooping or OOM kills." }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "thresholds": { - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 10 - }, - { - "color": "red", - "value": 50 - } - ] - }, - "unit": "short" - } - }, - "gridPos": { - "h": 4, - "w": 6, - "x": 18, - "y": 1 - }, - "id": 8, - "options": { - "colorMode": "background", - "graphMode": "area", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ] - }, - "textMode": "auto" - }, - "title": "Active Workflows", - "type": "stat", - "targets": [ - { - "expr": "sum(flyte:propeller:all:execstats:active_workflow_executions{namespace=\"$namespace\"})", - "legendFormat": "Workflows", - "refId": "A" - } - ], - "description": "Current active FlyteWorkflow CRD count managed by Propeller." - }, { "datasource": { "type": "prometheus", @@ -2077,8 +2020,9 @@ data: "options": { "legend": { "calcs": [ - "mean", - "max" + "min", + "max", + "lastNotNull" ], "displayMode": "table", "placement": "bottom" @@ -2109,59 +2053,6 @@ data: ], "description": "Current count of active workflow, node, and task executions tracked by FlyteAdmin." 
}, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 6, - "w": 12, - "x": 12, - "y": 5 - }, - "id": 7, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom" - }, - "tooltip": { - "mode": "multi", - "sort": "desc" - } - }, - "title": "Queue Depth", - "type": "timeseries", - "targets": [ - { - "expr": "sum(flyte:propeller:all:main_depth{namespace=\"$namespace\"})", - "legendFormat": "Main", - "refId": "A" - }, - { - "expr": "sum(flyte:propeller:all:sub_depth{namespace=\"$namespace\"})", - "legendFormat": "Sub", - "refId": "B" - } - ], - "description": "Main and sub workqueue depth over time." - }, { "datasource": { "type": "prometheus", @@ -2447,7 +2338,7 @@ data: "y": 10 }, "id": 1204, - "title": "Propeller Latency p99", + "title": "Executor Evaluate Duration p99", "type": "stat", "options": { "colorMode": "background", @@ -2461,8 +2352,13 @@ data: }, "targets": [ { - "expr": "(union:dp:slo:propeller_round_latency_p99 or flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.99\"} / 1000)", - "refId": "A" + "expr": "executor::v2:evaluator:evaluate_duration_ms{namespace=\"$namespace\", quantile=\"0.99\"}", + "legendFormat": "p99", + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + } } ], "description": "Propeller round p99 latency in seconds. Falls back to raw metric if SLO rules not enabled." @@ -2507,7 +2403,18 @@ data: "refId": "B" } ], - "description": "DP service availability over time with SLO target line." 
+ "description": "DP service availability over time with SLO target line.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2551,7 +2458,18 @@ data: "refId": "B" } ], - "description": "DP error budget remaining over time. Requires monitoring.slos.enabled." + "description": "DP error budget remaining over time. Requires monitoring.slos.enabled.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "collapsed": true, @@ -2605,7 +2523,18 @@ data: "refId": "B" } ], - "description": "Operator execution operation processing rate and failure rate." + "description": "Operator execution operation processing rate and failure rate.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2662,7 +2591,18 @@ data: "refId": "E" } ], - "description": "FlyteAdmin error rates: propeller communication failures, model transform errors, notification publish failures." + "description": "FlyteAdmin error rates: propeller communication failures, model transform errors, notification publish failures.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2709,7 +2649,18 @@ data: "refId": "C" } ], - "description": "Breakdown of operator heartbeat computation: capabilities, usages, list workflows." 
+ "description": "Breakdown of operator heartbeat computation: capabilities, usages, list workflows.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2756,7 +2707,18 @@ data: "refId": "C" } ], - "description": "Config sync cycle rate, errors, and propeller ConfigMap update count." + "description": "Config sync cycle rate, errors, and propeller ConfigMap update count.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2798,7 +2760,18 @@ data: "refId": "B" } ], - "description": "Billing collection cycle rate and errors. Failures mean billing data may be delayed." + "description": "Billing collection cycle rate and errors. Failures mean billing data may be delayed.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2897,17 +2870,28 @@ data: "type": "timeseries", "targets": [ { - "expr": "executor:active_actions_count{namespace=\"$namespace\"}", + "expr": "executor::v2:active_actions_count{namespace=\"$namespace\"}", "legendFormat": "Active actions", "refId": "A" }, { - "expr": "executor:available_capacity{namespace=\"$namespace\"}", + "expr": "executor::v2:available_capacity{namespace=\"$namespace\"}", "legendFormat": "Available capacity", "refId": "B" } ], - "description": "V2 executor active actions vs available capacity. Capacity=0 means executor is saturated." + "description": "V2 executor active actions vs available capacity. 
Capacity=0 means executor is saturated.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2939,17 +2923,28 @@ data: "type": "timeseries", "targets": [ { - "expr": "rate(executor:discovery_miss_count{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:discovery_miss_count{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Miss", "refId": "A" }, { - "expr": "rate(executor:discovery_put_success_count{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:discovery_put_success_count{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Put success", "refId": "B" } ], - "description": "V2 executor cache discovery miss/put rates for task output caching." + "description": "V2 executor cache discovery miss/put rates for task output caching.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2984,12 +2979,23 @@ data: "type": "timeseries", "targets": [ { - "expr": "sum by (phase) (rate(executor:actions_terminated{namespace=\"$namespace\"}[$__rate_interval]))", + "expr": "sum by (phase) (rate(executor::v2:actions_terminated{namespace=\"$namespace\"}[$__rate_interval]))", "legendFormat": "{{ phase }}", "refId": "A" } ], - "description": "Task completion rate by outcome: succeeded, failed, aborted. Key V2 SLI for task health. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Task completion rate by outcome: succeeded, failed, aborted. Key V2 SLI for task health. 
[Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3021,22 +3027,33 @@ data: "type": "timeseries", "targets": [ { - "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.5\"}", + "expr": "executor::v2:evaluator:evaluate_duration_ms{namespace=\"$namespace\", quantile=\"0.5\"}", "legendFormat": "Evaluate p50", "refId": "A" }, { - "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.9\"}", + "expr": "executor::v2:evaluator:evaluate_duration_ms{namespace=\"$namespace\", quantile=\"0.9\"}", "legendFormat": "Evaluate p90", "refId": "B" }, { - "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.99\"}", + "expr": "executor::v2:evaluator:evaluate_duration_ms{namespace=\"$namespace\", quantile=\"0.99\"}", "legendFormat": "Evaluate p99", "refId": "C" } ], - "description": "Time spent in RecursiveNodeHandler (pod creation). Dominant component of V2 task latency. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Time spent in RecursiveNodeHandler (pod creation). Dominant component of V2 task latency. 
[Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3068,27 +3085,38 @@ data: "type": "timeseries", "targets": [ { - "expr": "rate(executor:system_failures{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:system_failures{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "System failures", "refId": "A" }, { - "expr": "rate(executor:system_failures_exhausted{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:system_failures_exhausted{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Exhausted retries", "refId": "B" }, { - "expr": "rate(executor:invalid_leases{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:invalid_leases{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Invalid leases", "refId": "C" }, { - "expr": "rate(executor:evaluator:evaluate_errors{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:evaluator:evaluate_errors{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Evaluate errors", "refId": "D" } ], - "description": "Executor error rates. System failures retry; exhausted means task permanently failed. Invalid leases = malformed from queue service. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Executor error rates. System failures retry; exhausted means task permanently failed. Invalid leases = malformed from queue service. 
[Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -3098,10 +3126,10 @@ data: "h": 1, "w": 24, "x": 0, - "y": 33 + "y": 34 }, - "id": 100, - "title": "Flyte Propeller (V1)", + "id": 400, + "title": "gRPC Client (DP \u2192 CP)", "type": "row", "panels": [ { @@ -3120,36 +3148,37 @@ data: "lineWidth": 1, "showPoints": "never" }, - "unit": "ms" + "unit": "reqps" } }, "gridPos": { "h": 8, - "w": 8, + "w": 12, "x": 0, - "y": 12 + "y": 13 }, - "id": 101, - "title": "Round Time (p50 / p90 / p99)", + "id": 401, + "title": "gRPC Client Request Rate", "type": "timeseries", "targets": [ { - "expr": "flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.5\"}", - "legendFormat": "p50", + "expr": "sum by (grpc_service, grpc_method) (rate(grpc_client_handled_total{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "{{ grpc_service }}/{{ grpc_method }}", "refId": "A" - }, - { - "expr": "flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.9\"}", - "legendFormat": "p90", - "refId": "B" - }, - { - "expr": "flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.99\"}", - "legendFormat": "p99", - "refId": "C" } ], - "description": "Propeller reconciliation round time. One round = one FlyteWorkflow CRD processed." + "description": "DP\u2192CP gRPC client request rate by service and method (e.g. 
CreateWorkflowEvent).", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3167,36 +3196,37 @@ data: "lineWidth": 1, "showPoints": "never" }, - "unit": "ops" + "unit": "reqps" } }, "gridPos": { "h": 8, - "w": 8, - "x": 8, - "y": 12 + "w": 12, + "x": 12, + "y": 13 }, - "id": 102, - "title": "Round Success / Error Rate", + "id": 402, + "title": "gRPC Client Error Rate", "type": "timeseries", "targets": [ { - "expr": "rate(flyte:propeller:all:round:success_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Success", + "expr": "sum by (grpc_service, grpc_method, grpc_code) (rate(grpc_client_handled_total{namespace=\"$namespace\", grpc_code!=\"OK\"}[$__rate_interval]))", + "legendFormat": "{{ grpc_method }} {{ grpc_code }}", "refId": "A" - }, - { - "expr": "rate(flyte:propeller:all:round:error_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Errors", - "refId": "B" - }, - { - "expr": "rate(flyte:propeller:all:round:panic_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Panics", - "refId": "C" } ], - "description": "Propeller round outcomes: success, errors, and panics per second." + "description": "DP\u2192CP gRPC errors by method and code. 
Non-OK responses from the control plane.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3214,564 +3244,37 @@ data: "lineWidth": 1, "showPoints": "never" }, - "unit": "short" + "unit": "s" } }, "gridPos": { "h": 8, - "w": 8, - "x": 16, - "y": 12 + "w": 24, + "x": 0, + "y": 21 }, - "id": 103, - "title": "Free Workers", + "id": 403, + "title": "gRPC Client Latency (p95)", "type": "timeseries", "targets": [ { - "expr": "sum(flyte:propeller:all:free_workers_count{namespace=\"$namespace\"})", - "legendFormat": "Free workers", + "expr": "histogram_quantile(0.95, sum by (le, grpc_method) (rate(grpc_client_handling_seconds_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "{{ grpc_method }} p95", "refId": "A" } ], - "description": "Idle propeller worker goroutines. Zero = all workers busy processing workflows." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ops" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 20 - }, - "id": 104, - "title": "Queue Add Rate", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:propeller:all:main_adds{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Main adds", - "refId": "A" - }, - { - "expr": "rate(flyte:propeller:all:sub_adds{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Sub adds", - "refId": "B" - }, - { - "expr": "rate(flyte:propeller:all:main_retries{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Main retries", - "refId": "C" - } - ], - "description": "Items enqueued to propeller's main/sub workqueues, plus retry rate." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 8, - "y": 20 - }, - "id": 105, - "title": "Workflow Updates", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:propeller:all:wf_updated{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Updated", - "refId": "A" - }, - { - "expr": "rate(flyte:propeller:all:wf_update_failed{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Failed", - "refId": "B" - }, - { - "expr": "rate(flyte:propeller:all:wf_too_large{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Too large", - "refId": "C" - }, - { - "expr": "rate(flyte:propeller:all:wf_update_conflict{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Conflict", - "refId": "D" - } - ], - "description": "etcd write outcomes: successful updates, failures, too-large (>1.5MB), optimistic concurrency conflicts." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ms" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 20 - }, - "id": 106, - "title": "Workflow Update Latency", - "type": "timeseries", - "targets": [ - { - "expr": "flyte:propeller:all:wf_update_latency_ms{namespace=\"$namespace\", quantile=\"0.5\"}", - "legendFormat": "p50", - "refId": "A" - }, - { - "expr": "flyte:propeller:all:wf_update_latency_ms{namespace=\"$namespace\", quantile=\"0.9\"}", - "legendFormat": "p90", - "refId": "B" - }, - { - "expr": "flyte:propeller:all:wf_update_latency_ms{namespace=\"$namespace\", quantile=\"0.99\"}", - "legendFormat": "p99", - "refId": "C" - } - ], - "description": "etcd write latency for FlyteWorkflow status updates." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ms" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 28 - }, - "id": 107, - "title": "Node Queueing & Execution Latency", - "type": "timeseries", - "targets": [ - { - "expr": "flyte:propeller:all:node:queueing_latency_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.5\"}", - "legendFormat": "Queue p50", - "refId": "A" - }, - { - "expr": "flyte:propeller:all:node:queueing_latency_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.9\"}", - "legendFormat": "Queue p90", - "refId": "B" - }, - { - "expr": "flyte:propeller:all:node:node_exec_latency_unlabeled_us{namespace=\"$namespace\", quantile=\"0.9\"} / 1000", - "legendFormat": "Exec p90 (ms)", - "refId": "C" - } - ], - "description": "Node lifecycle: queueing latency 
(queued\u2192running) and execution latency (time in handler)." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "percentunit" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 28 - }, - "id": 108, - "title": "Metastore Cache Hit Rate", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:propeller:all:metastore:cache_hit{namespace=\"$namespace\"}[$__rate_interval]) / (rate(flyte:propeller:all:metastore:cache_hit{namespace=\"$namespace\"}[$__rate_interval]) + rate(flyte:propeller:all:metastore:cache_miss{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Hit rate", - "refId": "A" - } - ], - "description": "In-memory cache effectiveness for object store (S3/GCS) reads. Low hit rate = excessive storage calls." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ops" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 36 - }, - "id": 109, - "title": "Event Recording (DP \u2192 CP)", - "type": "timeseries", - "targets": [ - { - "expr": "sum(rate(flyte:propeller:all:task:event_recording:success_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Task success", - "refId": "A" - }, - { - "expr": "sum(rate(flyte:propeller:all:node:event_recording:success_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Node success", - "refId": "B" - }, - { - "expr": "sum(rate(flyte:propeller:all:task:event_recording:failure_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Task failure", - 
"refId": "C" - }, - { - "expr": "sum(rate(flyte:propeller:all:node:event_recording:failure_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Node failure", - "refId": "D" - } - ], - "description": "Task/node event recording rate from propeller to FlyteAdmin. Failures indicate CP connectivity issues." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 36 - }, - "id": 110, - "title": "Cache Discovery (hit/miss/skip)", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:propeller:all:discovery_hit_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Hits", - "refId": "A" - }, - { - "expr": "rate(flyte:propeller:all:discovery_miss_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Misses", - "refId": "B" - }, - { - "expr": "rate(flyte:propeller:all:discovery_skip_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Skips", - "refId": "C" - }, - { - "expr": "rate(flyte:propeller:all:discovery_get_failure_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Get failures", - "refId": "D" - } - ], - "description": "V2 executor cache discovery miss/put rates for task output caching." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "reqps" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 44 - }, - "id": 111, - "title": "K8s API Client Rate", - "type": "timeseries", - "targets": [ - { - "expr": "sum(rate(k8s_client_request_total_unlabeled{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "K8s requests/s", - "refId": "A" - } - ], - "description": "Propeller's K8s API request rate. High rates may indicate excessive pod watches or creates." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "s" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 44 - }, - "id": 112, - "title": "K8s API Client Latency (p90)", - "type": "timeseries", - "targets": [ - { - "expr": "histogram_quantile(0.90, sum by (le) (rate(k8s_client_request_latency_unlabeled_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "Request p90", - "refId": "A" - }, - { - "expr": "histogram_quantile(0.90, sum by (le) (rate(k8s_client_rate_limiter_latency_unlabeled_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "Rate limiter p90", - "refId": "B" - } - ], - "description": "K8s API request latency and client-side rate limiter wait time at p90." 
- } - ] - }, - { - "collapsed": true, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 34 - }, - "id": 400, - "title": "gRPC Client (DP \u2192 CP)", - "type": "row", - "panels": [ - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "reqps" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 13 - }, - "id": 401, - "title": "gRPC Client Request Rate", - "type": "timeseries", - "targets": [ - { - "expr": "sum by (grpc_service, grpc_method) (rate(grpc_client_handled_total{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "{{ grpc_service }}/{{ grpc_method }}", - "refId": "A" - } - ], - "description": "DP\u2192CP gRPC client request rate by service and method (e.g. CreateWorkflowEvent)." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "reqps" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 13 - }, - "id": 402, - "title": "gRPC Client Error Rate", - "type": "timeseries", - "targets": [ - { - "expr": "sum by (grpc_service, grpc_method, grpc_code) (rate(grpc_client_handled_total{namespace=\"$namespace\", grpc_code!=\"OK\"}[$__rate_interval]))", - "legendFormat": "{{ grpc_method }} {{ grpc_code }}", - "refId": "A" - } - ], - "description": "DP\u2192CP gRPC errors by method and code. Non-OK responses from the control plane." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "s" - } - }, - "gridPos": { - "h": 8, - "w": 24, - "x": 0, - "y": 21 - }, - "id": 403, - "title": "gRPC Client Latency (p95)", - "type": "timeseries", - "targets": [ - { - "expr": "histogram_quantile(0.95, sum by (le, grpc_method) (rate(grpc_client_handling_seconds_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "{{ grpc_method }} p95", - "refId": "A" + "description": "DP\u2192CP gRPC call latency at p95. High latency = slow control plane or network issues.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] } - ], - "description": "DP\u2192CP gRPC call latency at p95. High latency = slow control plane or network issues." + } } ] }, @@ -3825,7 +3328,18 @@ data: "refId": "A" } ], - "description": "CPU usage in cores per container, stacked. Identifies resource-heavy services." + "description": "CPU usage in cores per container, stacked. Identifies resource-heavy services.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3865,7 +3379,18 @@ data: "refId": "A" } ], - "description": "Working set memory per container. Watch for approaching limits." + "description": "Working set memory per container. Watch for approaching limits.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3902,7 +3427,18 @@ data: "refId": "A" } ], - "description": "Per-container restart events. Spikes indicate crashes or OOM kills." + "description": "Per-container restart events. 
Spikes indicate crashes or OOM kills.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] } @@ -3954,7 +3490,7 @@ data: "timepicker": {}, "timezone": "browser", "title": "Union Dataplane Overview", - "uid": "union-dp-overview", + "uid": "union-dataplane-overview", "version": 1 } --- diff --git a/tests/generated/dataplane.nodeobserver.yaml b/tests/generated/dataplane.nodeobserver.yaml index f2cfd498..00997c7f 100644 --- a/tests/generated/dataplane.nodeobserver.yaml +++ b/tests/generated/dataplane.nodeobserver.yaml @@ -1177,63 +1177,6 @@ data: ], "description": "Total container restarts in the last hour. Non-zero indicates crashlooping or OOM kills." }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "thresholds": { - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 10 - }, - { - "color": "red", - "value": 50 - } - ] - }, - "unit": "short" - } - }, - "gridPos": { - "h": 4, - "w": 6, - "x": 18, - "y": 1 - }, - "id": 8, - "options": { - "colorMode": "background", - "graphMode": "area", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ] - }, - "textMode": "auto" - }, - "title": "Active Workflows", - "type": "stat", - "targets": [ - { - "expr": "sum(flyte:propeller:all:execstats:active_workflow_executions{namespace=\"$namespace\"})", - "legendFormat": "Workflows", - "refId": "A" - } - ], - "description": "Current active FlyteWorkflow CRD count managed by Propeller." - }, { "datasource": { "type": "prometheus", @@ -1263,8 +1206,9 @@ data: "options": { "legend": { "calcs": [ - "mean", - "max" + "min", + "max", + "lastNotNull" ], "displayMode": "table", "placement": "bottom" @@ -1295,59 +1239,6 @@ data: ], "description": "Current count of active workflow, node, and task executions tracked by FlyteAdmin." 
}, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 6, - "w": 12, - "x": 12, - "y": 5 - }, - "id": 7, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom" - }, - "tooltip": { - "mode": "multi", - "sort": "desc" - } - }, - "title": "Queue Depth", - "type": "timeseries", - "targets": [ - { - "expr": "sum(flyte:propeller:all:main_depth{namespace=\"$namespace\"})", - "legendFormat": "Main", - "refId": "A" - }, - { - "expr": "sum(flyte:propeller:all:sub_depth{namespace=\"$namespace\"})", - "legendFormat": "Sub", - "refId": "B" - } - ], - "description": "Main and sub workqueue depth over time." - }, { "datasource": { "type": "prometheus", @@ -1633,7 +1524,7 @@ data: "y": 10 }, "id": 1204, - "title": "Propeller Latency p99", + "title": "Executor Evaluate Duration p99", "type": "stat", "options": { "colorMode": "background", @@ -1647,8 +1538,13 @@ data: }, "targets": [ { - "expr": "(union:dp:slo:propeller_round_latency_p99 or flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.99\"} / 1000)", - "refId": "A" + "expr": "executor::v2:evaluator:evaluate_duration_ms{namespace=\"$namespace\", quantile=\"0.99\"}", + "legendFormat": "p99", + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + } } ], "description": "Propeller round p99 latency in seconds. Falls back to raw metric if SLO rules not enabled." @@ -1693,7 +1589,18 @@ data: "refId": "B" } ], - "description": "DP service availability over time with SLO target line." 
+ "description": "DP service availability over time with SLO target line.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1737,7 +1644,18 @@ data: "refId": "B" } ], - "description": "DP error budget remaining over time. Requires monitoring.slos.enabled." + "description": "DP error budget remaining over time. Requires monitoring.slos.enabled.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "collapsed": true, @@ -1791,7 +1709,18 @@ data: "refId": "B" } ], - "description": "Operator execution operation processing rate and failure rate." + "description": "Operator execution operation processing rate and failure rate.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1848,7 +1777,18 @@ data: "refId": "E" } ], - "description": "FlyteAdmin error rates: propeller communication failures, model transform errors, notification publish failures." + "description": "FlyteAdmin error rates: propeller communication failures, model transform errors, notification publish failures.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1895,7 +1835,18 @@ data: "refId": "C" } ], - "description": "Breakdown of operator heartbeat computation: capabilities, usages, list workflows." 
+ "description": "Breakdown of operator heartbeat computation: capabilities, usages, list workflows.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1942,7 +1893,18 @@ data: "refId": "C" } ], - "description": "Config sync cycle rate, errors, and propeller ConfigMap update count." + "description": "Config sync cycle rate, errors, and propeller ConfigMap update count.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1984,7 +1946,18 @@ data: "refId": "B" } ], - "description": "Billing collection cycle rate and errors. Failures mean billing data may be delayed." + "description": "Billing collection cycle rate and errors. Failures mean billing data may be delayed.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2083,17 +2056,28 @@ data: "type": "timeseries", "targets": [ { - "expr": "executor:active_actions_count{namespace=\"$namespace\"}", + "expr": "executor::v2:active_actions_count{namespace=\"$namespace\"}", "legendFormat": "Active actions", "refId": "A" }, { - "expr": "executor:available_capacity{namespace=\"$namespace\"}", + "expr": "executor::v2:available_capacity{namespace=\"$namespace\"}", "legendFormat": "Available capacity", "refId": "B" } ], - "description": "V2 executor active actions vs available capacity. Capacity=0 means executor is saturated." + "description": "V2 executor active actions vs available capacity. 
Capacity=0 means executor is saturated.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2125,17 +2109,28 @@ data: "type": "timeseries", "targets": [ { - "expr": "rate(executor:discovery_miss_count{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:discovery_miss_count{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Miss", "refId": "A" }, { - "expr": "rate(executor:discovery_put_success_count{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:discovery_put_success_count{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Put success", "refId": "B" } ], - "description": "V2 executor cache discovery miss/put rates for task output caching." + "description": "V2 executor cache discovery miss/put rates for task output caching.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2170,12 +2165,23 @@ data: "type": "timeseries", "targets": [ { - "expr": "sum by (phase) (rate(executor:actions_terminated{namespace=\"$namespace\"}[$__rate_interval]))", + "expr": "sum by (phase) (rate(executor::v2:actions_terminated{namespace=\"$namespace\"}[$__rate_interval]))", "legendFormat": "{{ phase }}", "refId": "A" } ], - "description": "Task completion rate by outcome: succeeded, failed, aborted. Key V2 SLI for task health. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Task completion rate by outcome: succeeded, failed, aborted. Key V2 SLI for task health. 
[Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2207,22 +2213,33 @@ data: "type": "timeseries", "targets": [ { - "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.5\"}", + "expr": "executor::v2:evaluator:evaluate_duration_ms{namespace=\"$namespace\", quantile=\"0.5\"}", "legendFormat": "Evaluate p50", "refId": "A" }, { - "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.9\"}", + "expr": "executor::v2:evaluator:evaluate_duration_ms{namespace=\"$namespace\", quantile=\"0.9\"}", "legendFormat": "Evaluate p90", "refId": "B" }, { - "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.99\"}", + "expr": "executor::v2:evaluator:evaluate_duration_ms{namespace=\"$namespace\", quantile=\"0.99\"}", "legendFormat": "Evaluate p99", "refId": "C" } ], - "description": "Time spent in RecursiveNodeHandler (pod creation). Dominant component of V2 task latency. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Time spent in RecursiveNodeHandler (pod creation). Dominant component of V2 task latency. 
[Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2254,27 +2271,38 @@ data: "type": "timeseries", "targets": [ { - "expr": "rate(executor:system_failures{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:system_failures{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "System failures", "refId": "A" }, { - "expr": "rate(executor:system_failures_exhausted{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:system_failures_exhausted{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Exhausted retries", "refId": "B" }, { - "expr": "rate(executor:invalid_leases{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:invalid_leases{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Invalid leases", "refId": "C" }, { - "expr": "rate(executor:evaluator:evaluate_errors{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:evaluator:evaluate_errors{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Evaluate errors", "refId": "D" } ], - "description": "Executor error rates. System failures retry; exhausted means task permanently failed. Invalid leases = malformed from queue service. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Executor error rates. System failures retry; exhausted means task permanently failed. Invalid leases = malformed from queue service. 
[Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -2284,10 +2312,10 @@ data: "h": 1, "w": 24, "x": 0, - "y": 33 + "y": 34 }, - "id": 100, - "title": "Flyte Propeller (V1)", + "id": 400, + "title": "gRPC Client (DP \u2192 CP)", "type": "row", "panels": [ { @@ -2306,36 +2334,37 @@ data: "lineWidth": 1, "showPoints": "never" }, - "unit": "ms" + "unit": "reqps" } }, "gridPos": { "h": 8, - "w": 8, + "w": 12, "x": 0, - "y": 12 + "y": 13 }, - "id": 101, - "title": "Round Time (p50 / p90 / p99)", + "id": 401, + "title": "gRPC Client Request Rate", "type": "timeseries", "targets": [ { - "expr": "flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.5\"}", - "legendFormat": "p50", + "expr": "sum by (grpc_service, grpc_method) (rate(grpc_client_handled_total{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "{{ grpc_service }}/{{ grpc_method }}", "refId": "A" - }, - { - "expr": "flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.9\"}", - "legendFormat": "p90", - "refId": "B" - }, - { - "expr": "flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.99\"}", - "legendFormat": "p99", - "refId": "C" } ], - "description": "Propeller reconciliation round time. One round = one FlyteWorkflow CRD processed." + "description": "DP\u2192CP gRPC client request rate by service and method (e.g. 
CreateWorkflowEvent).", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2353,36 +2382,37 @@ data: "lineWidth": 1, "showPoints": "never" }, - "unit": "ops" + "unit": "reqps" } }, "gridPos": { "h": 8, - "w": 8, - "x": 8, - "y": 12 + "w": 12, + "x": 12, + "y": 13 }, - "id": 102, - "title": "Round Success / Error Rate", + "id": 402, + "title": "gRPC Client Error Rate", "type": "timeseries", "targets": [ { - "expr": "rate(flyte:propeller:all:round:success_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Success", + "expr": "sum by (grpc_service, grpc_method, grpc_code) (rate(grpc_client_handled_total{namespace=\"$namespace\", grpc_code!=\"OK\"}[$__rate_interval]))", + "legendFormat": "{{ grpc_method }} {{ grpc_code }}", "refId": "A" - }, - { - "expr": "rate(flyte:propeller:all:round:error_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Errors", - "refId": "B" - }, - { - "expr": "rate(flyte:propeller:all:round:panic_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Panics", - "refId": "C" } ], - "description": "Propeller round outcomes: success, errors, and panics per second." + "description": "DP\u2192CP gRPC errors by method and code. 
Non-OK responses from the control plane.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2400,564 +2430,37 @@ data: "lineWidth": 1, "showPoints": "never" }, - "unit": "short" + "unit": "s" } }, "gridPos": { "h": 8, - "w": 8, - "x": 16, - "y": 12 + "w": 24, + "x": 0, + "y": 21 }, - "id": 103, - "title": "Free Workers", + "id": 403, + "title": "gRPC Client Latency (p95)", "type": "timeseries", "targets": [ { - "expr": "sum(flyte:propeller:all:free_workers_count{namespace=\"$namespace\"})", - "legendFormat": "Free workers", + "expr": "histogram_quantile(0.95, sum by (le, grpc_method) (rate(grpc_client_handling_seconds_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "{{ grpc_method }} p95", "refId": "A" } ], - "description": "Idle propeller worker goroutines. Zero = all workers busy processing workflows." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ops" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 20 - }, - "id": 104, - "title": "Queue Add Rate", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:propeller:all:main_adds{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Main adds", - "refId": "A" - }, - { - "expr": "rate(flyte:propeller:all:sub_adds{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Sub adds", - "refId": "B" - }, - { - "expr": "rate(flyte:propeller:all:main_retries{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Main retries", - "refId": "C" - } - ], - "description": "Items enqueued to propeller's main/sub workqueues, plus retry rate." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 8, - "y": 20 - }, - "id": 105, - "title": "Workflow Updates", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:propeller:all:wf_updated{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Updated", - "refId": "A" - }, - { - "expr": "rate(flyte:propeller:all:wf_update_failed{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Failed", - "refId": "B" - }, - { - "expr": "rate(flyte:propeller:all:wf_too_large{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Too large", - "refId": "C" - }, - { - "expr": "rate(flyte:propeller:all:wf_update_conflict{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Conflict", - "refId": "D" - } - ], - "description": "etcd write outcomes: successful updates, failures, too-large (>1.5MB), optimistic concurrency conflicts." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ms" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 20 - }, - "id": 106, - "title": "Workflow Update Latency", - "type": "timeseries", - "targets": [ - { - "expr": "flyte:propeller:all:wf_update_latency_ms{namespace=\"$namespace\", quantile=\"0.5\"}", - "legendFormat": "p50", - "refId": "A" - }, - { - "expr": "flyte:propeller:all:wf_update_latency_ms{namespace=\"$namespace\", quantile=\"0.9\"}", - "legendFormat": "p90", - "refId": "B" - }, - { - "expr": "flyte:propeller:all:wf_update_latency_ms{namespace=\"$namespace\", quantile=\"0.99\"}", - "legendFormat": "p99", - "refId": "C" - } - ], - "description": "etcd write latency for FlyteWorkflow status updates." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ms" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 28 - }, - "id": 107, - "title": "Node Queueing & Execution Latency", - "type": "timeseries", - "targets": [ - { - "expr": "flyte:propeller:all:node:queueing_latency_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.5\"}", - "legendFormat": "Queue p50", - "refId": "A" - }, - { - "expr": "flyte:propeller:all:node:queueing_latency_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.9\"}", - "legendFormat": "Queue p90", - "refId": "B" - }, - { - "expr": "flyte:propeller:all:node:node_exec_latency_unlabeled_us{namespace=\"$namespace\", quantile=\"0.9\"} / 1000", - "legendFormat": "Exec p90 (ms)", - "refId": "C" - } - ], - "description": "Node lifecycle: queueing latency 
(queued\u2192running) and execution latency (time in handler)." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "percentunit" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 28 - }, - "id": 108, - "title": "Metastore Cache Hit Rate", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:propeller:all:metastore:cache_hit{namespace=\"$namespace\"}[$__rate_interval]) / (rate(flyte:propeller:all:metastore:cache_hit{namespace=\"$namespace\"}[$__rate_interval]) + rate(flyte:propeller:all:metastore:cache_miss{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Hit rate", - "refId": "A" - } - ], - "description": "In-memory cache effectiveness for object store (S3/GCS) reads. Low hit rate = excessive storage calls." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ops" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 36 - }, - "id": 109, - "title": "Event Recording (DP \u2192 CP)", - "type": "timeseries", - "targets": [ - { - "expr": "sum(rate(flyte:propeller:all:task:event_recording:success_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Task success", - "refId": "A" - }, - { - "expr": "sum(rate(flyte:propeller:all:node:event_recording:success_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Node success", - "refId": "B" - }, - { - "expr": "sum(rate(flyte:propeller:all:task:event_recording:failure_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Task failure", - 
"refId": "C" - }, - { - "expr": "sum(rate(flyte:propeller:all:node:event_recording:failure_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Node failure", - "refId": "D" - } - ], - "description": "Task/node event recording rate from propeller to FlyteAdmin. Failures indicate CP connectivity issues." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 36 - }, - "id": 110, - "title": "Cache Discovery (hit/miss/skip)", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:propeller:all:discovery_hit_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Hits", - "refId": "A" - }, - { - "expr": "rate(flyte:propeller:all:discovery_miss_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Misses", - "refId": "B" - }, - { - "expr": "rate(flyte:propeller:all:discovery_skip_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Skips", - "refId": "C" - }, - { - "expr": "rate(flyte:propeller:all:discovery_get_failure_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Get failures", - "refId": "D" - } - ], - "description": "V2 executor cache discovery miss/put rates for task output caching." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "reqps" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 44 - }, - "id": 111, - "title": "K8s API Client Rate", - "type": "timeseries", - "targets": [ - { - "expr": "sum(rate(k8s_client_request_total_unlabeled{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "K8s requests/s", - "refId": "A" - } - ], - "description": "Propeller's K8s API request rate. High rates may indicate excessive pod watches or creates." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "s" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 44 - }, - "id": 112, - "title": "K8s API Client Latency (p90)", - "type": "timeseries", - "targets": [ - { - "expr": "histogram_quantile(0.90, sum by (le) (rate(k8s_client_request_latency_unlabeled_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "Request p90", - "refId": "A" - }, - { - "expr": "histogram_quantile(0.90, sum by (le) (rate(k8s_client_rate_limiter_latency_unlabeled_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "Rate limiter p90", - "refId": "B" - } - ], - "description": "K8s API request latency and client-side rate limiter wait time at p90." 
- } - ] - }, - { - "collapsed": true, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 34 - }, - "id": 400, - "title": "gRPC Client (DP \u2192 CP)", - "type": "row", - "panels": [ - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "reqps" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 13 - }, - "id": 401, - "title": "gRPC Client Request Rate", - "type": "timeseries", - "targets": [ - { - "expr": "sum by (grpc_service, grpc_method) (rate(grpc_client_handled_total{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "{{ grpc_service }}/{{ grpc_method }}", - "refId": "A" - } - ], - "description": "DP\u2192CP gRPC client request rate by service and method (e.g. CreateWorkflowEvent)." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "reqps" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 13 - }, - "id": 402, - "title": "gRPC Client Error Rate", - "type": "timeseries", - "targets": [ - { - "expr": "sum by (grpc_service, grpc_method, grpc_code) (rate(grpc_client_handled_total{namespace=\"$namespace\", grpc_code!=\"OK\"}[$__rate_interval]))", - "legendFormat": "{{ grpc_method }} {{ grpc_code }}", - "refId": "A" - } - ], - "description": "DP\u2192CP gRPC errors by method and code. Non-OK responses from the control plane." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "s" - } - }, - "gridPos": { - "h": 8, - "w": 24, - "x": 0, - "y": 21 - }, - "id": 403, - "title": "gRPC Client Latency (p95)", - "type": "timeseries", - "targets": [ - { - "expr": "histogram_quantile(0.95, sum by (le, grpc_method) (rate(grpc_client_handling_seconds_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "{{ grpc_method }} p95", - "refId": "A" + "description": "DP\u2192CP gRPC call latency at p95. High latency = slow control plane or network issues.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] } - ], - "description": "DP\u2192CP gRPC call latency at p95. High latency = slow control plane or network issues." + } } ] }, @@ -3011,7 +2514,18 @@ data: "refId": "A" } ], - "description": "CPU usage in cores per container, stacked. Identifies resource-heavy services." + "description": "CPU usage in cores per container, stacked. Identifies resource-heavy services.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3051,7 +2565,18 @@ data: "refId": "A" } ], - "description": "Working set memory per container. Watch for approaching limits." + "description": "Working set memory per container. Watch for approaching limits.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3088,7 +2613,18 @@ data: "refId": "A" } ], - "description": "Per-container restart events. Spikes indicate crashes or OOM kills." + "description": "Per-container restart events. 
Spikes indicate crashes or OOM kills.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] } @@ -3140,7 +2676,7 @@ data: "timepicker": {}, "timezone": "browser", "title": "Union Dataplane Overview", - "uid": "union-dp-overview", + "uid": "union-dataplane-overview", "version": 1 } --- diff --git a/tests/generated/dataplane.oci.yaml b/tests/generated/dataplane.oci.yaml index 6e24bebc..8b3a9ad2 100644 --- a/tests/generated/dataplane.oci.yaml +++ b/tests/generated/dataplane.oci.yaml @@ -1191,63 +1191,6 @@ data: ], "description": "Total container restarts in the last hour. Non-zero indicates crashlooping or OOM kills." }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "thresholds": { - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 10 - }, - { - "color": "red", - "value": 50 - } - ] - }, - "unit": "short" - } - }, - "gridPos": { - "h": 4, - "w": 6, - "x": 18, - "y": 1 - }, - "id": 8, - "options": { - "colorMode": "background", - "graphMode": "area", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ] - }, - "textMode": "auto" - }, - "title": "Active Workflows", - "type": "stat", - "targets": [ - { - "expr": "sum(flyte:propeller:all:execstats:active_workflow_executions{namespace=\"$namespace\"})", - "legendFormat": "Workflows", - "refId": "A" - } - ], - "description": "Current active FlyteWorkflow CRD count managed by Propeller." - }, { "datasource": { "type": "prometheus", @@ -1277,8 +1220,9 @@ data: "options": { "legend": { "calcs": [ - "mean", - "max" + "min", + "max", + "lastNotNull" ], "displayMode": "table", "placement": "bottom" @@ -1309,59 +1253,6 @@ data: ], "description": "Current count of active workflow, node, and task executions tracked by FlyteAdmin." 
}, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 6, - "w": 12, - "x": 12, - "y": 5 - }, - "id": 7, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom" - }, - "tooltip": { - "mode": "multi", - "sort": "desc" - } - }, - "title": "Queue Depth", - "type": "timeseries", - "targets": [ - { - "expr": "sum(flyte:propeller:all:main_depth{namespace=\"$namespace\"})", - "legendFormat": "Main", - "refId": "A" - }, - { - "expr": "sum(flyte:propeller:all:sub_depth{namespace=\"$namespace\"})", - "legendFormat": "Sub", - "refId": "B" - } - ], - "description": "Main and sub workqueue depth over time." - }, { "datasource": { "type": "prometheus", @@ -1647,7 +1538,7 @@ data: "y": 10 }, "id": 1204, - "title": "Propeller Latency p99", + "title": "Executor Evaluate Duration p99", "type": "stat", "options": { "colorMode": "background", @@ -1661,8 +1552,13 @@ data: }, "targets": [ { - "expr": "(union:dp:slo:propeller_round_latency_p99 or flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.99\"} / 1000)", - "refId": "A" + "expr": "executor::v2:evaluator:evaluate_duration_ms{namespace=\"$namespace\", quantile=\"0.99\"}", + "legendFormat": "p99", + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + } } ], "description": "Propeller round p99 latency in seconds. Falls back to raw metric if SLO rules not enabled." @@ -1707,7 +1603,18 @@ data: "refId": "B" } ], - "description": "DP service availability over time with SLO target line." 
+ "description": "DP service availability over time with SLO target line.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1751,7 +1658,18 @@ data: "refId": "B" } ], - "description": "DP error budget remaining over time. Requires monitoring.slos.enabled." + "description": "DP error budget remaining over time. Requires monitoring.slos.enabled.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "collapsed": true, @@ -1805,7 +1723,18 @@ data: "refId": "B" } ], - "description": "Operator execution operation processing rate and failure rate." + "description": "Operator execution operation processing rate and failure rate.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1862,7 +1791,18 @@ data: "refId": "E" } ], - "description": "FlyteAdmin error rates: propeller communication failures, model transform errors, notification publish failures." + "description": "FlyteAdmin error rates: propeller communication failures, model transform errors, notification publish failures.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1909,7 +1849,18 @@ data: "refId": "C" } ], - "description": "Breakdown of operator heartbeat computation: capabilities, usages, list workflows." 
+ "description": "Breakdown of operator heartbeat computation: capabilities, usages, list workflows.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1956,7 +1907,18 @@ data: "refId": "C" } ], - "description": "Config sync cycle rate, errors, and propeller ConfigMap update count." + "description": "Config sync cycle rate, errors, and propeller ConfigMap update count.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -1998,7 +1960,18 @@ data: "refId": "B" } ], - "description": "Billing collection cycle rate and errors. Failures mean billing data may be delayed." + "description": "Billing collection cycle rate and errors. Failures mean billing data may be delayed.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2097,17 +2070,28 @@ data: "type": "timeseries", "targets": [ { - "expr": "executor:active_actions_count{namespace=\"$namespace\"}", + "expr": "executor::v2:active_actions_count{namespace=\"$namespace\"}", "legendFormat": "Active actions", "refId": "A" }, { - "expr": "executor:available_capacity{namespace=\"$namespace\"}", + "expr": "executor::v2:available_capacity{namespace=\"$namespace\"}", "legendFormat": "Available capacity", "refId": "B" } ], - "description": "V2 executor active actions vs available capacity. Capacity=0 means executor is saturated." + "description": "V2 executor active actions vs available capacity. 
Capacity=0 means executor is saturated.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2139,17 +2123,28 @@ data: "type": "timeseries", "targets": [ { - "expr": "rate(executor:discovery_miss_count{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:discovery_miss_count{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Miss", "refId": "A" }, { - "expr": "rate(executor:discovery_put_success_count{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:discovery_put_success_count{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Put success", "refId": "B" } ], - "description": "V2 executor cache discovery miss/put rates for task output caching." + "description": "V2 executor cache discovery miss/put rates for task output caching.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2184,12 +2179,23 @@ data: "type": "timeseries", "targets": [ { - "expr": "sum by (phase) (rate(executor:actions_terminated{namespace=\"$namespace\"}[$__rate_interval]))", + "expr": "sum by (phase) (rate(executor::v2:actions_terminated{namespace=\"$namespace\"}[$__rate_interval]))", "legendFormat": "{{ phase }}", "refId": "A" } ], - "description": "Task completion rate by outcome: succeeded, failed, aborted. Key V2 SLI for task health. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Task completion rate by outcome: succeeded, failed, aborted. Key V2 SLI for task health. 
[Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2221,22 +2227,33 @@ data: "type": "timeseries", "targets": [ { - "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.5\"}", + "expr": "executor::v2:evaluator:evaluate_duration_ms{namespace=\"$namespace\", quantile=\"0.5\"}", "legendFormat": "Evaluate p50", "refId": "A" }, { - "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.9\"}", + "expr": "executor::v2:evaluator:evaluate_duration_ms{namespace=\"$namespace\", quantile=\"0.9\"}", "legendFormat": "Evaluate p90", "refId": "B" }, { - "expr": "executor:evaluator:evaluate_duration{namespace=\"$namespace\", quantile=\"0.99\"}", + "expr": "executor::v2:evaluator:evaluate_duration_ms{namespace=\"$namespace\", quantile=\"0.99\"}", "legendFormat": "Evaluate p99", "refId": "C" } ], - "description": "Time spent in RecursiveNodeHandler (pod creation). Dominant component of V2 task latency. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Time spent in RecursiveNodeHandler (pod creation). Dominant component of V2 task latency. 
[Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2268,27 +2285,38 @@ data: "type": "timeseries", "targets": [ { - "expr": "rate(executor:system_failures{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:system_failures{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "System failures", "refId": "A" }, { - "expr": "rate(executor:system_failures_exhausted{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:system_failures_exhausted{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Exhausted retries", "refId": "B" }, { - "expr": "rate(executor:invalid_leases{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:invalid_leases{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Invalid leases", "refId": "C" }, { - "expr": "rate(executor:evaluator:evaluate_errors{namespace=\"$namespace\"}[$__rate_interval])", + "expr": "rate(executor::v2:evaluator:evaluate_errors{namespace=\"$namespace\"}[$__rate_interval])", "legendFormat": "Evaluate errors", "refId": "D" } ], - "description": "Executor error rates. System failures retry; exhausted means task permanently failed. Invalid leases = malformed from queue service. [Metrics pending: requires cloud service instrumentation to be deployed]" + "description": "Executor error rates. System failures retry; exhausted means task permanently failed. Invalid leases = malformed from queue service. 
[Metrics pending: requires cloud service instrumentation to be deployed]", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] }, @@ -2298,10 +2326,10 @@ data: "h": 1, "w": 24, "x": 0, - "y": 33 + "y": 34 }, - "id": 100, - "title": "Flyte Propeller (V1)", + "id": 400, + "title": "gRPC Client (DP \u2192 CP)", "type": "row", "panels": [ { @@ -2320,36 +2348,37 @@ data: "lineWidth": 1, "showPoints": "never" }, - "unit": "ms" + "unit": "reqps" } }, "gridPos": { "h": 8, - "w": 8, + "w": 12, "x": 0, - "y": 12 + "y": 13 }, - "id": 101, - "title": "Round Time (p50 / p90 / p99)", + "id": 401, + "title": "gRPC Client Request Rate", "type": "timeseries", "targets": [ { - "expr": "flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.5\"}", - "legendFormat": "p50", + "expr": "sum by (grpc_service, grpc_method) (rate(grpc_client_handled_total{namespace=\"$namespace\"}[$__rate_interval]))", + "legendFormat": "{{ grpc_service }}/{{ grpc_method }}", "refId": "A" - }, - { - "expr": "flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.9\"}", - "legendFormat": "p90", - "refId": "B" - }, - { - "expr": "flyte:propeller:all:round:round_time_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.99\"}", - "legendFormat": "p99", - "refId": "C" } ], - "description": "Propeller reconciliation round time. One round = one FlyteWorkflow CRD processed." + "description": "DP\u2192CP gRPC client request rate by service and method (e.g. 
CreateWorkflowEvent).", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2367,36 +2396,37 @@ data: "lineWidth": 1, "showPoints": "never" }, - "unit": "ops" + "unit": "reqps" } }, "gridPos": { "h": 8, - "w": 8, - "x": 8, - "y": 12 + "w": 12, + "x": 12, + "y": 13 }, - "id": 102, - "title": "Round Success / Error Rate", + "id": 402, + "title": "gRPC Client Error Rate", "type": "timeseries", "targets": [ { - "expr": "rate(flyte:propeller:all:round:success_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Success", + "expr": "sum by (grpc_service, grpc_method, grpc_code) (rate(grpc_client_handled_total{namespace=\"$namespace\", grpc_code!=\"OK\"}[$__rate_interval]))", + "legendFormat": "{{ grpc_method }} {{ grpc_code }}", "refId": "A" - }, - { - "expr": "rate(flyte:propeller:all:round:error_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Errors", - "refId": "B" - }, - { - "expr": "rate(flyte:propeller:all:round:panic_unlabeled{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Panics", - "refId": "C" } ], - "description": "Propeller round outcomes: success, errors, and panics per second." + "description": "DP\u2192CP gRPC errors by method and code. 
Non-OK responses from the control plane.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -2414,564 +2444,37 @@ data: "lineWidth": 1, "showPoints": "never" }, - "unit": "short" + "unit": "s" } }, "gridPos": { "h": 8, - "w": 8, - "x": 16, - "y": 12 + "w": 24, + "x": 0, + "y": 21 }, - "id": 103, - "title": "Free Workers", + "id": 403, + "title": "gRPC Client Latency (p95)", "type": "timeseries", "targets": [ { - "expr": "sum(flyte:propeller:all:free_workers_count{namespace=\"$namespace\"})", - "legendFormat": "Free workers", + "expr": "histogram_quantile(0.95, sum by (le, grpc_method) (rate(grpc_client_handling_seconds_bucket{namespace=\"$namespace\"}[$__rate_interval])))", + "legendFormat": "{{ grpc_method }} p95", "refId": "A" } ], - "description": "Idle propeller worker goroutines. Zero = all workers busy processing workflows." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ops" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 20 - }, - "id": 104, - "title": "Queue Add Rate", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:propeller:all:main_adds{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Main adds", - "refId": "A" - }, - { - "expr": "rate(flyte:propeller:all:sub_adds{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Sub adds", - "refId": "B" - }, - { - "expr": "rate(flyte:propeller:all:main_retries{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Main retries", - "refId": "C" - } - ], - "description": "Items enqueued to propeller's main/sub workqueues, plus retry rate." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 8, - "y": 20 - }, - "id": 105, - "title": "Workflow Updates", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:propeller:all:wf_updated{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Updated", - "refId": "A" - }, - { - "expr": "rate(flyte:propeller:all:wf_update_failed{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Failed", - "refId": "B" - }, - { - "expr": "rate(flyte:propeller:all:wf_too_large{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Too large", - "refId": "C" - }, - { - "expr": "rate(flyte:propeller:all:wf_update_conflict{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Conflict", - "refId": "D" - } - ], - "description": "etcd write outcomes: successful updates, failures, too-large (>1.5MB), optimistic concurrency conflicts." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ms" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 20 - }, - "id": 106, - "title": "Workflow Update Latency", - "type": "timeseries", - "targets": [ - { - "expr": "flyte:propeller:all:wf_update_latency_ms{namespace=\"$namespace\", quantile=\"0.5\"}", - "legendFormat": "p50", - "refId": "A" - }, - { - "expr": "flyte:propeller:all:wf_update_latency_ms{namespace=\"$namespace\", quantile=\"0.9\"}", - "legendFormat": "p90", - "refId": "B" - }, - { - "expr": "flyte:propeller:all:wf_update_latency_ms{namespace=\"$namespace\", quantile=\"0.99\"}", - "legendFormat": "p99", - "refId": "C" - } - ], - "description": "etcd write latency for FlyteWorkflow status updates." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ms" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 28 - }, - "id": 107, - "title": "Node Queueing & Execution Latency", - "type": "timeseries", - "targets": [ - { - "expr": "flyte:propeller:all:node:queueing_latency_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.5\"}", - "legendFormat": "Queue p50", - "refId": "A" - }, - { - "expr": "flyte:propeller:all:node:queueing_latency_unlabeled_ms{namespace=\"$namespace\", quantile=\"0.9\"}", - "legendFormat": "Queue p90", - "refId": "B" - }, - { - "expr": "flyte:propeller:all:node:node_exec_latency_unlabeled_us{namespace=\"$namespace\", quantile=\"0.9\"} / 1000", - "legendFormat": "Exec p90 (ms)", - "refId": "C" - } - ], - "description": "Node lifecycle: queueing latency 
(queued\u2192running) and execution latency (time in handler)." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "percentunit" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 28 - }, - "id": 108, - "title": "Metastore Cache Hit Rate", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:propeller:all:metastore:cache_hit{namespace=\"$namespace\"}[$__rate_interval]) / (rate(flyte:propeller:all:metastore:cache_hit{namespace=\"$namespace\"}[$__rate_interval]) + rate(flyte:propeller:all:metastore:cache_miss{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Hit rate", - "refId": "A" - } - ], - "description": "In-memory cache effectiveness for object store (S3/GCS) reads. Low hit rate = excessive storage calls." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "ops" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 36 - }, - "id": 109, - "title": "Event Recording (DP \u2192 CP)", - "type": "timeseries", - "targets": [ - { - "expr": "sum(rate(flyte:propeller:all:task:event_recording:success_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Task success", - "refId": "A" - }, - { - "expr": "sum(rate(flyte:propeller:all:node:event_recording:success_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Node success", - "refId": "B" - }, - { - "expr": "sum(rate(flyte:propeller:all:task:event_recording:failure_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Task failure", - 
"refId": "C" - }, - { - "expr": "sum(rate(flyte:propeller:all:node:event_recording:failure_duration_ms_count{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "Node failure", - "refId": "D" - } - ], - "description": "Task/node event recording rate from propeller to FlyteAdmin. Failures indicate CP connectivity issues." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 36 - }, - "id": 110, - "title": "Cache Discovery (hit/miss/skip)", - "type": "timeseries", - "targets": [ - { - "expr": "rate(flyte:propeller:all:discovery_hit_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Hits", - "refId": "A" - }, - { - "expr": "rate(flyte:propeller:all:discovery_miss_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Misses", - "refId": "B" - }, - { - "expr": "rate(flyte:propeller:all:discovery_skip_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Skips", - "refId": "C" - }, - { - "expr": "rate(flyte:propeller:all:discovery_get_failure_count{namespace=\"$namespace\"}[$__rate_interval])", - "legendFormat": "Get failures", - "refId": "D" - } - ], - "description": "V2 executor cache discovery miss/put rates for task output caching." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "reqps" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 44 - }, - "id": 111, - "title": "K8s API Client Rate", - "type": "timeseries", - "targets": [ - { - "expr": "sum(rate(k8s_client_request_total_unlabeled{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "K8s requests/s", - "refId": "A" - } - ], - "description": "Propeller's K8s API request rate. High rates may indicate excessive pod watches or creates." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "s" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 44 - }, - "id": 112, - "title": "K8s API Client Latency (p90)", - "type": "timeseries", - "targets": [ - { - "expr": "histogram_quantile(0.90, sum by (le) (rate(k8s_client_request_latency_unlabeled_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "Request p90", - "refId": "A" - }, - { - "expr": "histogram_quantile(0.90, sum by (le) (rate(k8s_client_rate_limiter_latency_unlabeled_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "Rate limiter p90", - "refId": "B" - } - ], - "description": "K8s API request latency and client-side rate limiter wait time at p90." 
- } - ] - }, - { - "collapsed": true, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 34 - }, - "id": 400, - "title": "gRPC Client (DP \u2192 CP)", - "type": "row", - "panels": [ - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "reqps" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 13 - }, - "id": 401, - "title": "gRPC Client Request Rate", - "type": "timeseries", - "targets": [ - { - "expr": "sum by (grpc_service, grpc_method) (rate(grpc_client_handled_total{namespace=\"$namespace\"}[$__rate_interval]))", - "legendFormat": "{{ grpc_service }}/{{ grpc_method }}", - "refId": "A" - } - ], - "description": "DP\u2192CP gRPC client request rate by service and method (e.g. CreateWorkflowEvent)." - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "reqps" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 13 - }, - "id": 402, - "title": "gRPC Client Error Rate", - "type": "timeseries", - "targets": [ - { - "expr": "sum by (grpc_service, grpc_method, grpc_code) (rate(grpc_client_handled_total{namespace=\"$namespace\", grpc_code!=\"OK\"}[$__rate_interval]))", - "legendFormat": "{{ grpc_method }} {{ grpc_code }}", - "refId": "A" - } - ], - "description": "DP\u2192CP gRPC errors by method and code. Non-OK responses from the control plane." 
- }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "drawStyle": "line", - "fillOpacity": 10, - "lineWidth": 1, - "showPoints": "never" - }, - "unit": "s" - } - }, - "gridPos": { - "h": 8, - "w": 24, - "x": 0, - "y": 21 - }, - "id": 403, - "title": "gRPC Client Latency (p95)", - "type": "timeseries", - "targets": [ - { - "expr": "histogram_quantile(0.95, sum by (le, grpc_method) (rate(grpc_client_handling_seconds_bucket{namespace=\"$namespace\"}[$__rate_interval])))", - "legendFormat": "{{ grpc_method }} p95", - "refId": "A" + "description": "DP\u2192CP gRPC call latency at p95. High latency = slow control plane or network issues.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] } - ], - "description": "DP\u2192CP gRPC call latency at p95. High latency = slow control plane or network issues." + } } ] }, @@ -3025,7 +2528,18 @@ data: "refId": "A" } ], - "description": "CPU usage in cores per container, stacked. Identifies resource-heavy services." + "description": "CPU usage in cores per container, stacked. Identifies resource-heavy services.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3065,7 +2579,18 @@ data: "refId": "A" } ], - "description": "Working set memory per container. Watch for approaching limits." + "description": "Working set memory per container. Watch for approaching limits.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } }, { "datasource": { @@ -3102,7 +2627,18 @@ data: "refId": "A" } ], - "description": "Per-container restart events. Spikes indicate crashes or OOM kills." + "description": "Per-container restart events. 
Spikes indicate crashes or OOM kills.", + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "min", + "max", + "lastNotNull" + ] + } + } } ] } @@ -3154,7 +2690,7 @@ data: "timepicker": {}, "timezone": "browser", "title": "Union Dataplane Overview", - "uid": "union-dp-overview", + "uid": "union-dataplane-overview", "version": 1 } ---