diff --git a/CLAUDE.md b/CLAUDE.md index 6060be0f..b454a6fd 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -67,6 +67,27 @@ helm template ... # Manually inspect rendered output All tests must pass before merging. Generated test snapshots (`tests/generated/*.yaml`) should be committed alongside template changes. +## Metrics Glossary Maintenance + +When modifying any of these files, the metrics glossary may need updating: + +- `charts/controlplane/dashboards/*.json` — Grafana dashboard panels +- `charts/dataplane/dashboards/*.json` — Grafana dashboard panels +- `charts/*/templates/monitoring/prometheusrule.yaml` — Recording rules and alerts + +After changes, regenerate the metrics manifest and commit it: + +```bash +make generate-metrics-manifest +``` + +If the manifest diff shows added, removed, or renamed metrics, the customer-facing +metrics glossary in `unionai-docs` must also be updated: + +- `unionai-docs/content/deployment/selfhosted/monitoring/metrics-glossary.md` + +The manifest (`metrics-manifest.yaml`) is auto-generated — do not edit it manually. + ## Selfhosted Documentation Deployment guides are colocated with the charts: diff --git a/Makefile b/Makefile index ee1b431b..8a06eb80 100644 --- a/Makefile +++ b/Makefile @@ -69,3 +69,7 @@ lint-dataplane-crds: .PHONY: lint-sandbox lint-sandbox: helm lint charts/sandbox + +.PHONY: generate-metrics-manifest +generate-metrics-manifest: + python3 scripts/extract-metrics.py > metrics-manifest.yaml diff --git a/metrics-manifest.yaml b/metrics-manifest.yaml new file mode 100644 index 00000000..a7c6790f --- /dev/null +++ b/metrics-manifest.yaml @@ -0,0 +1,257 @@ +# Auto-generated metrics manifest — do not edit manually. +# Regenerate with: make generate-metrics-manifest +# +# This file tracks all metrics referenced in shipped Grafana dashboards +# and PrometheusRule CRDs. Changes here signal that the metrics glossary +# in unionai-docs may need updating. 
+# +# Docs: unionai-docs/content/deployment/selfhosted/monitoring/metrics-glossary.md + +controlplane: + dashboard_metrics: + - authorizer:authorizer:cloudauthorizer:connect:authorize_duration_ms + - authorizer:authorizer:cloudauthorizer:connect:authorize_errors_total + - authorizer:authorizer:cloudauthorizer:connect:authz_allowed + - authorizer:authorizer:cloudauthorizer:connect:authz_denied + - authorizer:authorizer:cloudauthorizer:connect:authz_type_info + - authorizer:authorizer:cloudauthorizer:connect:backend_authorize_duration_ms_bucket + - authorizer:authorizer:cloudauthorizer:connect:backend_authorize_errors + - authorizer:authorizer:cloudauthorizer:connect:external:authorize_duration_count + - authorizer:authorizer:cloudauthorizer:connect:external:errors + - authorizer:authorizer:cloudauthorizer:connect:external:fail_open_activated + - authorizer:handler_panic + - cluster:cluster_sync:health:last_update_age + - cluster:cluster_sync:health:unhealthy + - cluster:database:postgres:errors:gorm_error + - cluster:database:postgres:errors:not_found + - cluster:database:postgres:errors:postgres_error + - cluster:handler_panic + - cluster:managed_cluster_client_cache:get:hits + - cluster:managed_cluster_client_cache:get:miss + - cluster:svc:heartbeat:success_ms + - cluster:svc:heartbeat:success_ms_count + - cluster:svc:update_status:operator_restarts + - cluster:svc:update_status:propeller_restarts + - cluster:svc:update_status:success_ms + - cluster:svc:update_status:updates_total + - connect:server_requests_handled_total + - container_cpu_usage_seconds_total + - container_memory_working_set_bytes + - dataproxy:clusterpoolcache:hits + - dataproxy:clusterpoolcache:miss + - dataproxy:domains:hits + - dataproxy:domains:miss + - dataproxy:handler_panic + - dataproxy:images:read:failure_ms_count + - dataproxy:images:read:success_ms_count + - dataproxy:secrets_service:cluster_errors + - executions:app:leaser:pending_assignment_unlabeled + - 
executions:app:service:first_ack_latency_unlabeled_bucket + - executions:database:postgres:errors:gorm_error + - executions:database:postgres:errors:not_found + - executions:database:postgres:errors:postgres_error + - executions:executions:handle_ack_op_bucket + - executions:executions:handle_ack_op_count + - executions:executions:handle_create_op_bucket + - executions:executions:handle_create_op_count + - executions:executions:list_clusters:hits + - executions:executions:list_clusters:miss + - executions:executions:list_nodepools:hits + - executions:executions:list_nodepools:miss + - executions:handler_panic + - executions:run:actions_sent + - executions:run:enqueue_action_failures + - executions:run:logs:tail_logs_bytes_read + - executions:run:runs_sent + - executions:run_notifier:notifications_sent + - executions:run_notifier:subscribers + - executions:workqueue:announce_cluster_assignment_bucket + - executions:workqueue:claim_operation_failures + - executions:workqueue:claim_operations + - executions:workqueue:send_operation_count + - executions:workqueue:send_operation_failures + - flyte:admin:create_execution:duration_ms + - flyte:admin:create_execution_event:duration_ms + - flyte:admin:execution_manager:active_executions + - flyte:admin:execution_manager:execution_events_created + - flyte:admin:execution_manager:execution_termination_failure + - flyte:admin:execution_manager:executions_created + - flyte:admin:execution_manager:propeller_failures + - flyte:admin:execution_manager:publish_error + - flyte:admin:execution_manager:transformer_error + - flyte:admin:get_execution:duration_ms + - flyte:admin:list_execution:duration_ms + - flyte:admin:node_execution_manager:active_node_executions + - flyte:admin:node_execution_manager:node_execution_events_created + - flyte:admin:task_execution_manager:active_executions + - flyte:admin:task_execution_manager:task_execution_events_created + - flyte:cacheservice:cache:cache_hit_unlabeled + - 
flyte:cacheservice:cache:get_failure_unlabeled + - flyte:cacheservice:cache:get_reservation_success_unlabeled + - flyte:cacheservice:cache:not_found_unlabeled + - flyte:cacheservice:cache:release_reservation_success_unlabeled + - flyte:cacheservice:cache:reservation_contention_unlabeled + - flyte:middleware:authorization:authz_approved + - flyte:middleware:authorization:authz_denied + - grpc_server_handled_total + - kube_deployment_spec_replicas + - kube_deployment_status_replicas_available + - kube_pod_container_status_restarts_total + - message_type + - nginx_ingress_controller_nginx_process_connections + - nginx_ingress_controller_request_duration_seconds_bucket + - nginx_ingress_controller_request_duration_seconds_count + - queue:aborter:aborted_leases + - queue:aborter:input_queue_length + - queue:db:free_threads + - queue:db:queue_length + - queue:dispatcher:chain_queue_length + - queue:dispatcher:operation_duration_bucket + - queue:dispatcher:operation_failures + - queue:eventer:record_action_errors + - queue:handler_panic + - queue:metadata_store:scheduled_action_count + - queue:metadata_store:scheduled_run_count + - queue:metadata_store:total_action_count + - queue:metadata_store:total_run_count + - queue:queue_client:free_threads + - queue:runner:completed_leases + - queue:runner:input_queue_length + - queue:scheduler:enqueued_leases + - queue:scheduler:input_queue_length + - queue:scheduler:worker_capacity + - queue:state:active_states + - queue:state:get_duration_bucket + - queue:state:put_duration_bucket + - queue:state:terminal_states + - queue:state_client:free_threads + - union:cp:slo:error_budget_remaining + - union:cp:slo:ingress_latency_p99 + - union:cp:slo:ingress_success_rate + - usage:handler_panic + - usage:messages:messages_dropped + - usage:messages:messages_failed + - usage:messages:messages_processed + - usage:messages:messages_received + - usage:messages:messages_sent + - usage:messages:processing_time_ms + - 
usage:svc:report_billable_usage + recording_rules: + - union:cp:authz:external_error_rate + - union:cp:connect_error_rate:by_service + - union:cp:connect_request_rate:by_service + - union:cp:ingress_error_rate_5xx + - union:cp:ingress_request_rate + - union:cp:pod_restart_rate_1h + - union:cp:service_availability + - union:cp:slo:availability:ratio + - union:cp:slo:connect_success_rate + - union:cp:slo:error_budget_remaining + - union:cp:slo:ingress_latency_p99 + - union:cp:slo:ingress_success_rate + alerts: + - UnionCPAuthorizerExternalErrors + - UnionCPAuthorizerFailOpenActive + - UnionCPAuthorizerHighDenyRate + - UnionCPErrorBudgetExhausted + - UnionCPHandlerPanic + - UnionCPHighErrorBudgetBurn + - UnionCPHighRestartRate + - UnionCPIngressLatencySLOBreach + - UnionCPServiceDown + +dataplane: + dashboard_metrics: + - container_cpu_usage_seconds_total + - container_memory_working_set_bytes + - executor:actions_terminated + - executor:active_actions_count + - executor:available_capacity + - executor:discovery_miss_count + - executor:discovery_put_success_count + - executor:evaluator:evaluate_duration + - executor:evaluator:evaluate_errors + - executor:handler_panic + - executor:invalid_leases + - executor:system_failures + - executor:system_failures_exhausted + - flyte:propeller:all:discovery_get_failure_count + - flyte:propeller:all:discovery_hit_count + - flyte:propeller:all:discovery_miss_count + - flyte:propeller:all:discovery_skip_count + - flyte:propeller:all:execstats:active_node_executions + - flyte:propeller:all:execstats:active_task_executions + - flyte:propeller:all:execstats:active_workflow_executions + - flyte:propeller:all:free_workers_count + - flyte:propeller:all:main_adds + - flyte:propeller:all:main_depth + - flyte:propeller:all:main_retries + - flyte:propeller:all:metastore:cache_hit + - flyte:propeller:all:metastore:cache_miss + - flyte:propeller:all:node:event_recording:failure_duration_ms_count + - 
flyte:propeller:all:node:event_recording:success_duration_ms_count + - flyte:propeller:all:node:node_exec_latency_unlabeled_us + - flyte:propeller:all:node:queueing_latency_unlabeled_ms + - flyte:propeller:all:round:error_count + - flyte:propeller:all:round:panic_unlabeled + - flyte:propeller:all:round:round_time_unlabeled_ms + - flyte:propeller:all:round:success_count + - flyte:propeller:all:sub_adds + - flyte:propeller:all:sub_depth + - flyte:propeller:all:task:event_recording:failure_duration_ms_count + - flyte:propeller:all:task:event_recording:success_duration_ms_count + - flyte:propeller:all:wf_too_large + - flyte:propeller:all:wf_update_conflict + - flyte:propeller:all:wf_update_failed + - flyte:propeller:all:wf_update_latency_ms + - flyte:propeller:all:wf_updated + - grpc_client_handled_total + - grpc_client_handling_seconds_bucket + - k8s_client_rate_limiter_latency_unlabeled_bucket + - k8s_client_request_latency_unlabeled_bucket + - k8s_client_request_total_unlabeled + - kube_deployment_spec_replicas + - kube_deployment_status_replicas_available + - kube_pod_container_status_restarts_total + - union:dp:slo:error_budget_remaining + - union:dp:slo:execution_success_rate + - union:dp:slo:executor_success_rate + - union:dp:slo:propeller_round_latency_p99 + - union:dp:slo:propeller_success_rate + - union_operator:billable_usage_collector:run_errors + - union_operator:billable_usage_collector:runs + - union_operator:config_syncer:propeller_configmap_updated + - union_operator:config_syncer:run_errors + - union_operator:config_syncer:runs + - union_operator:heartbeat:compute_capabilities_ms + - union_operator:heartbeat:compute_usages_ms + - union_operator:heartbeat:list_workflows_ms + - union_operator:heartbeat_updater:run_errors + - union_operator:heartbeat_updater:runs + - union_operator:prometheus_health_checker:run_errors + - union_operator:status_updater:run_errors + - union_operator:status_updater:runs + - union_operator:work_queue:operations_failed + - 
union_operator:work_queue:operations_processed + - union_operator:work_queue:paused + recording_rules: + - union:dp:executor:active_actions + - union:dp:operator:work_queue_failed_rate + - union:dp:propeller:active_workflows + - union:dp:propeller:queue_depth + - union:dp:propeller:round_latency_p99 + - union:dp:service_availability + - union:dp:slo:availability:ratio + - union:dp:slo:error_budget_remaining + - union:dp:slo:execution_success_rate + - union:dp:slo:executor_success_rate + - union:dp:slo:propeller_round_latency_p99 + - union:dp:slo:propeller_success_rate + alerts: + - UnionDPErrorBudgetExhausted + - UnionDPHandlerPanic + - UnionDPHighErrorBudgetBurn + - UnionDPHighRestartRate + - UnionDPPropellerLatencySLOBreach + - UnionDPServiceDown + diff --git a/scripts/extract-metrics.py b/scripts/extract-metrics.py new file mode 100755 index 00000000..5791e353 --- /dev/null +++ b/scripts/extract-metrics.py @@ -0,0 +1,239 @@ +#!/usr/bin/env python3 +"""Extract metric names from Grafana dashboards and PrometheusRule templates. + +Produces a sorted, deduplicated YAML manifest of all metrics referenced in +shipped dashboards and PrometheusRule CRDs. This manifest makes metric +additions, removals, and renames visible in PR diffs. + +Usage: + python scripts/extract-metrics.py > metrics-manifest.yaml +""" + +import json +import re +import sys +from pathlib import Path + +REPO_ROOT = Path(__file__).resolve().parent.parent + +DASHBOARD_FILES = [ + REPO_ROOT / "charts/controlplane/dashboards/union-controlplane-overview.json", + REPO_ROOT / "charts/dataplane/dashboards/union-dataplane-overview.json", +] + +PROMETHEUSRULE_FILES = [ + REPO_ROOT / "charts/controlplane/templates/monitoring/prometheusrule.yaml", + REPO_ROOT / "charts/dataplane/templates/monitoring/prometheusrule.yaml", +] + +# Metric name pattern: prometheus metric names are [a-zA-Z_:][a-zA-Z0-9_:]* +# We look for names used in PromQL expressions. 
# Prometheus metric names match [a-zA-Z_:][a-zA-Z0-9_:]*. Every such token in
# a PromQL expression is a candidate; _is_metric_name() decides which to keep.
METRIC_NAME_RE = re.compile(r'[a-zA-Z_:][a-zA-Z0-9_:]*')

# Helm template actions ({{ ... }}) embedded in PrometheusRule expressions.
# They are removed before tokenizing so template field names such as
# `.Values.some_field` are not mistaken for metric names.
_HELM_ACTION_RE = re.compile(r'\{\{.*?\}\}')

# `record:` / `alert:` keys at the start of a line (optionally behind a YAML
# list dash). Anchoring keeps text inside annotations or comments from being
# picked up as rule names.
_RULE_NAME_RE = re.compile(
    r'^[ \t]*(?:-[ \t]+)?(record|alert):[ \t]*(\S+)', re.MULTILINE)

# Rule keys that start or end an `expr:` block while scanning line by line.
_RULE_KEY_RE = re.compile(
    r'(?:-\s+)?(annotations|summary|description|labels|expr|record|alert|for):(.*)')

# Known non-metric tokens to exclude (PromQL functions, keywords, labels, etc.)
EXCLUDE = {
    # PromQL functions
    'abs', 'absent', 'avg', 'avg_over_time', 'bottomk', 'ceil', 'changes',
    'clamp', 'clamp_max', 'clamp_min', 'count', 'count_over_time', 'count_values',
    'day_of_month', 'day_of_week', 'days_in_month', 'delta', 'deriv', 'exp',
    'floor', 'group', 'histogram_quantile', 'holt_winters', 'hour', 'idelta',
    'increase', 'irate', 'label_join', 'label_replace', 'last_over_time', 'ln',
    'log2', 'log10', 'max', 'max_over_time', 'min', 'min_over_time', 'minute',
    'month', 'predict_linear', 'quantile', 'quantile_over_time', 'rate', 'resets',
    'round', 'scalar', 'sgn', 'sort', 'sort_desc', 'sqrt', 'stddev',
    'stddev_over_time', 'stdvar', 'stdvar_over_time', 'sum', 'sum_over_time',
    'time', 'timestamp', 'topk', 'vector', 'year',
    # PromQL functions whose names contain an underscore and would otherwise
    # pass the "looks like a metric" heuristics
    'absent_over_time', 'present_over_time', 'day_of_year',
    'histogram_count', 'histogram_sum', 'histogram_fraction',
    # PromQL aggregation modifiers / keywords
    'by', 'without', 'on', 'ignoring', 'group_left', 'group_right', 'bool',
    'offset', 'and', 'or', 'unless',
    # Common label names (not metrics)
    'namespace', 'pod', 'container', 'deployment', 'service', 'code', 'status',
    'host', 'path', 'le', 'job', 'instance', 'grpc_service',
    'grpc_method', 'grpc_code', 'type', 'op', 'worker_name', 'org',
    'cluster_name', 'cluster', 'operation', 'phase', 'error_type',
    'error_source', 'identity_type', 'action', 'subsystem', 'name',
    # Grafana template variables / constants
    '__rate_interval', '__name__', '__NAMESPACE__',
    # Short tokens that are label values not metrics
    'OK', 'Canceled', 'NotFound', 'Succeeded',
}

# Minimum length to be considered a metric (avoids label values like "5m")
MIN_METRIC_LEN = 4


def extract_from_dashboard(filepath: Path) -> set[str]:
    """Extract metric names from a Grafana dashboard JSON file.

    Walks ``panels`` recursively (collapsed rows nest their panels under a
    ``panels`` key) and tokenizes the ``expr`` of every entry in ``targets``.

    Args:
        filepath: Path to the dashboard JSON file.

    Returns:
        The set of tokens that ``_is_metric_name`` accepts.
    """
    metrics: set[str] = set()
    with open(filepath, encoding='utf-8') as f:
        dashboard = json.load(f)

    def walk_panels(panels):
        for panel in panels:
            # Nested panels (collapsed rows); `or []` tolerates a null value.
            walk_panels(panel.get('panels') or [])
            for target in panel.get('targets') or []:
                expr = target.get('expr', '')
                if expr:
                    _extract_promql_metrics(expr, metrics)

    walk_panels(dashboard.get('panels') or [])
    return metrics


def _is_metric_name(token: str) -> bool:
    """Return True if a token looks like a Prometheus metric name.

    A token is rejected when it is in ``EXCLUDE``, is shorter than
    ``MIN_METRIC_LEN``, starts with ``$``, ``.``, ``:`` or ``_``, ends with
    ``:``, is only digits and underscores, starts with an uppercase letter,
    or contains neither ``:`` nor ``_``.
    """
    # The length check runs first so the token[0] access below is safe.
    if token in EXCLUDE or len(token) < MIN_METRIC_LEN:
        return False
    # Variable references and fragments, not full metric names.
    if token.startswith(('$', '.', ':', '_')) or token.endswith(':'):
        return False
    if token.replace('_', '').isdigit():
        return False
    if token[0].isupper():  # Capitalized words are English text, not metrics
        return False
    # Real metrics have : or _
    return ':' in token or '_' in token


def _extract_promql_metrics(expr: str, metrics: set) -> None:
    """Add every metric-looking token found in ``expr`` to ``metrics`` in place."""
    metrics.update(
        token for token in METRIC_NAME_RE.findall(expr) if _is_metric_name(token)
    )


def extract_from_prometheusrule(filepath: Path) -> dict:
    """Extract rule names, alert names, and source metrics from a PrometheusRule template.

    The file is scanned as text rather than parsed as YAML, so Helm template
    syntax in the file does not matter.

    Args:
        filepath: Path to the PrometheusRule template.

    Returns:
        A dict with three sets: ``recording_rules`` (values of ``record:``
        keys), ``alerts`` (values of ``alert:`` keys) and ``source_metrics``
        (metric-looking tokens found in ``expr:`` blocks, minus the recording
        rule and alert names).
    """
    text = filepath.read_text(encoding='utf-8')
    recording_rules = set()
    alerts = set()
    source_metrics = set()

    # Rule and alert names: only keys at the start of a line count, and
    # surrounding YAML quotes are not part of the name.
    for found in _RULE_NAME_RE.finditer(text):
        name = found.group(2).strip('"\'')
        if not name:
            continue
        if found.group(1) == 'record':
            recording_rules.add(name)
        else:
            alerts.add(name)

    # Extract metrics from expr: blocks only (skip annotations, labels, etc.)
    in_expr = False
    for line in text.splitlines():
        stripped = line.strip()
        key = _RULE_KEY_RE.match(stripped)
        if key:
            # Any recognised rule key ends the previous expression; only
            # `expr:` opens a new one. The optional list dash lets rules
            # written with `- expr:` as their first key be picked up too.
            in_expr = key.group(1) == 'expr'
            if in_expr:
                inline = _HELM_ACTION_RE.sub(' ', key.group(2))
                _extract_promql_metrics(inline, source_metrics)
        elif not stripped:
            # A blank line terminates a multi-line expression.
            in_expr = False
        elif in_expr and not stripped.startswith('#'):
            _extract_promql_metrics(
                _HELM_ACTION_RE.sub(' ', stripped), source_metrics)

    # Remove recording rule names and alert names from source metrics
    source_metrics -= recording_rules
    source_metrics -= alerts
    # Remove Helm template artifacts and non-metric tokens. The parentheses
    # matter: without them `and` binds tighter than `or`, and any name
    # containing `_` would bypass the Values/Release guard.
    source_metrics = {
        m for m in source_metrics
        if not m.startswith(('Values', 'Release'))
        and (':' in m or '_' in m)
    }

    return {
        'recording_rules': recording_rules,
        'alerts': alerts,
        'source_metrics': source_metrics,
    }


def plane_name(filepath: Path) -> str:
    """Return 'controlplane' or 'dataplane' if the path contains it, else the file stem."""
    path_text = str(filepath)
    for plane in ('controlplane', 'dataplane'):
        if plane in path_text:
            return plane
    return filepath.stem


def emit_yaml(data: dict) -> str:
    """Emit clean YAML without external dependencies.

    Args:
        data: Mapping of plane name to a dict with optional
            ``dashboard_metrics``, ``recording_rules`` and ``alerts``
            collections. A missing plane or section is emitted empty.

    Returns:
        The manifest text: a comment header, then one sorted section per
        plane for 'controlplane' and 'dataplane', in that order.
    """
    lines = [
        "# Auto-generated metrics manifest — do not edit manually.",
        "# Regenerate with: make generate-metrics-manifest",
        "#",
        "# This file tracks all metrics referenced in shipped Grafana dashboards",
        "# and PrometheusRule CRDs. Changes here signal that the metrics glossary",
        "# in unionai-docs may need updating.",
        "#",
        "# Docs: unionai-docs/content/deployment/selfhosted/monitoring/metrics-glossary.md",
        "",
    ]

    # NOTE(review): the indentation inside the literals below is copied from
    # the reviewed text, whose whitespace runs may have been collapsed.
    # Confirm the widths against the committed manifest before regenerating.
    for plane in ('controlplane', 'dataplane'):
        lines.append(f"{plane}:")
        plane_data = data.get(plane, {})

        for section in ('dashboard_metrics', 'recording_rules', 'alerts'):
            lines.append(f" {section}:")
            # Sorting keeps the output stable from run to run.
            lines.extend(f" - {m}" for m in sorted(plane_data.get(section, [])))

        lines.append("")

    return '\n'.join(lines)


def main():
    """Build the manifest from the configured files and print it to stdout."""
    data = {}

    for df in DASHBOARD_FILES:
        plane = data.setdefault(plane_name(df), {})
        # Merge rather than assign so several dashboards can feed one plane.
        plane.setdefault('dashboard_metrics', set()).update(
            extract_from_dashboard(df))

    for pf in PROMETHEUSRULE_FILES:
        plane = data.setdefault(plane_name(pf), {})
        result = extract_from_prometheusrule(pf)
        plane.setdefault('recording_rules', set()).update(result['recording_rules'])
        plane.setdefault('alerts', set()).update(result['alerts'])
        # Merge source metrics into dashboard metrics
        plane.setdefault('dashboard_metrics', set()).update(result['source_metrics'])

    print(emit_yaml(data))


if __name__ == '__main__':
    main()