diff --git a/CLAUDE.md b/CLAUDE.md
index 6060be0f..b454a6fd 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -67,6 +67,27 @@ helm template ...       # Manually inspect rendered output
 All tests must pass before merging. Generated test snapshots (`tests/generated/*.yaml`) should
 be committed alongside template changes.
 
+## Metrics Glossary Maintenance
+
+When modifying any of these files, the metrics glossary may need updating:
+
+- `charts/controlplane/dashboards/*.json` — Grafana dashboard panels
+- `charts/dataplane/dashboards/*.json` — Grafana dashboard panels
+- `charts/*/templates/monitoring/prometheusrule.yaml` — Recording rules and alerts
+
+After changes, regenerate the metrics manifest (`metrics-manifest.yaml`) and commit it:
+
+```bash
+make generate-metrics-manifest
+```
+
+If the manifest diff shows added, removed, or renamed metrics, the customer-facing
+metrics glossary in `unionai-docs` must also be updated:
+
+- `unionai-docs/content/deployment/selfhosted/monitoring/metrics-glossary.md`
+
+The manifest (`metrics-manifest.yaml`) is auto-generated — do not edit it manually.
+
 ## Selfhosted Documentation
 
 Deployment guides are colocated with the charts:
diff --git a/Makefile b/Makefile
index ee1b431b..8a06eb80 100644
--- a/Makefile
+++ b/Makefile
@@ -69,3 +69,7 @@ lint-dataplane-crds:
 .PHONY: lint-sandbox
 lint-sandbox:
 	helm lint charts/sandbox
+
+.PHONY: generate-metrics-manifest
+generate-metrics-manifest: # write to a temp file first so a failed run never truncates the manifest
+	python3 scripts/extract-metrics.py > metrics-manifest.yaml.tmp && mv metrics-manifest.yaml.tmp metrics-manifest.yaml || { rm -f metrics-manifest.yaml.tmp; exit 1; }
diff --git a/metrics-manifest.yaml b/metrics-manifest.yaml
new file mode 100644
index 00000000..a7c6790f
--- /dev/null
+++ b/metrics-manifest.yaml
@@ -0,0 +1,257 @@
+# Auto-generated metrics manifest — do not edit manually.
+# Regenerate with: make generate-metrics-manifest
+#
+# This file tracks all metrics referenced in shipped Grafana dashboards
+# and PrometheusRule CRDs. Changes here signal that the metrics glossary
+# in unionai-docs may need updating.
+#
+# Docs: unionai-docs/content/deployment/selfhosted/monitoring/metrics-glossary.md
+
+controlplane:
+  dashboard_metrics:
+    - authorizer:authorizer:cloudauthorizer:connect:authorize_duration_ms
+    - authorizer:authorizer:cloudauthorizer:connect:authorize_errors_total
+    - authorizer:authorizer:cloudauthorizer:connect:authz_allowed
+    - authorizer:authorizer:cloudauthorizer:connect:authz_denied
+    - authorizer:authorizer:cloudauthorizer:connect:authz_type_info
+    - authorizer:authorizer:cloudauthorizer:connect:backend_authorize_duration_ms_bucket
+    - authorizer:authorizer:cloudauthorizer:connect:backend_authorize_errors
+    - authorizer:authorizer:cloudauthorizer:connect:external:authorize_duration_count
+    - authorizer:authorizer:cloudauthorizer:connect:external:errors
+    - authorizer:authorizer:cloudauthorizer:connect:external:fail_open_activated
+    - authorizer:handler_panic
+    - cluster:cluster_sync:health:last_update_age
+    - cluster:cluster_sync:health:unhealthy
+    - cluster:database:postgres:errors:gorm_error
+    - cluster:database:postgres:errors:not_found
+    - cluster:database:postgres:errors:postgres_error
+    - cluster:handler_panic
+    - cluster:managed_cluster_client_cache:get:hits
+    - cluster:managed_cluster_client_cache:get:miss
+    - cluster:svc:heartbeat:success_ms
+    - cluster:svc:heartbeat:success_ms_count
+    - cluster:svc:update_status:operator_restarts
+    - cluster:svc:update_status:propeller_restarts
+    - cluster:svc:update_status:success_ms
+    - cluster:svc:update_status:updates_total
+    - connect:server_requests_handled_total
+    - container_cpu_usage_seconds_total
+    - container_memory_working_set_bytes
+    - dataproxy:clusterpoolcache:hits
+    - dataproxy:clusterpoolcache:miss
+    - dataproxy:domains:hits
+    - dataproxy:domains:miss
+    - dataproxy:handler_panic
+    - dataproxy:images:read:failure_ms_count
+    - dataproxy:images:read:success_ms_count
+    - dataproxy:secrets_service:cluster_errors
+    - executions:app:leaser:pending_assignment_unlabeled
+    - executions:app:service:first_ack_latency_unlabeled_bucket
+    - executions:database:postgres:errors:gorm_error
+    - executions:database:postgres:errors:not_found
+    - executions:database:postgres:errors:postgres_error
+    - executions:executions:handle_ack_op_bucket
+    - executions:executions:handle_ack_op_count
+    - executions:executions:handle_create_op_bucket
+    - executions:executions:handle_create_op_count
+    - executions:executions:list_clusters:hits
+    - executions:executions:list_clusters:miss
+    - executions:executions:list_nodepools:hits
+    - executions:executions:list_nodepools:miss
+    - executions:handler_panic
+    - executions:run:actions_sent
+    - executions:run:enqueue_action_failures
+    - executions:run:logs:tail_logs_bytes_read
+    - executions:run:runs_sent
+    - executions:run_notifier:notifications_sent
+    - executions:run_notifier:subscribers
+    - executions:workqueue:announce_cluster_assignment_bucket
+    - executions:workqueue:claim_operation_failures
+    - executions:workqueue:claim_operations
+    - executions:workqueue:send_operation_count
+    - executions:workqueue:send_operation_failures
+    - flyte:admin:create_execution:duration_ms
+    - flyte:admin:create_execution_event:duration_ms
+    - flyte:admin:execution_manager:active_executions
+    - flyte:admin:execution_manager:execution_events_created
+    - flyte:admin:execution_manager:execution_termination_failure
+    - flyte:admin:execution_manager:executions_created
+    - flyte:admin:execution_manager:propeller_failures
+    - flyte:admin:execution_manager:publish_error
+    - flyte:admin:execution_manager:transformer_error
+    - flyte:admin:get_execution:duration_ms
+    - flyte:admin:list_execution:duration_ms
+    - flyte:admin:node_execution_manager:active_node_executions
+    - flyte:admin:node_execution_manager:node_execution_events_created
+    - flyte:admin:task_execution_manager:active_executions
+    - flyte:admin:task_execution_manager:task_execution_events_created
+    - flyte:cacheservice:cache:cache_hit_unlabeled
+    - flyte:cacheservice:cache:get_failure_unlabeled
+    - flyte:cacheservice:cache:get_reservation_success_unlabeled
+    - flyte:cacheservice:cache:not_found_unlabeled
+    - flyte:cacheservice:cache:release_reservation_success_unlabeled
+    - flyte:cacheservice:cache:reservation_contention_unlabeled
+    - flyte:middleware:authorization:authz_approved
+    - flyte:middleware:authorization:authz_denied
+    - grpc_server_handled_total
+    - kube_deployment_spec_replicas
+    - kube_deployment_status_replicas_available
+    - kube_pod_container_status_restarts_total
+    - message_type
+    - nginx_ingress_controller_nginx_process_connections
+    - nginx_ingress_controller_request_duration_seconds_bucket
+    - nginx_ingress_controller_request_duration_seconds_count
+    - queue:aborter:aborted_leases
+    - queue:aborter:input_queue_length
+    - queue:db:free_threads
+    - queue:db:queue_length
+    - queue:dispatcher:chain_queue_length
+    - queue:dispatcher:operation_duration_bucket
+    - queue:dispatcher:operation_failures
+    - queue:eventer:record_action_errors
+    - queue:handler_panic
+    - queue:metadata_store:scheduled_action_count
+    - queue:metadata_store:scheduled_run_count
+    - queue:metadata_store:total_action_count
+    - queue:metadata_store:total_run_count
+    - queue:queue_client:free_threads
+    - queue:runner:completed_leases
+    - queue:runner:input_queue_length
+    - queue:scheduler:enqueued_leases
+    - queue:scheduler:input_queue_length
+    - queue:scheduler:worker_capacity
+    - queue:state:active_states
+    - queue:state:get_duration_bucket
+    - queue:state:put_duration_bucket
+    - queue:state:terminal_states
+    - queue:state_client:free_threads
+    - union:cp:slo:error_budget_remaining
+    - union:cp:slo:ingress_latency_p99
+    - union:cp:slo:ingress_success_rate
+    - usage:handler_panic
+    - usage:messages:messages_dropped
+    - usage:messages:messages_failed
+    - usage:messages:messages_processed
+    - usage:messages:messages_received
+    - usage:messages:messages_sent
+    - usage:messages:processing_time_ms
+    - usage:svc:report_billable_usage
+  recording_rules:
+    - union:cp:authz:external_error_rate
+    - union:cp:connect_error_rate:by_service
+    - union:cp:connect_request_rate:by_service
+    - union:cp:ingress_error_rate_5xx
+    - union:cp:ingress_request_rate
+    - union:cp:pod_restart_rate_1h
+    - union:cp:service_availability
+    - union:cp:slo:availability:ratio
+    - union:cp:slo:connect_success_rate
+    - union:cp:slo:error_budget_remaining
+    - union:cp:slo:ingress_latency_p99
+    - union:cp:slo:ingress_success_rate
+  alerts:
+    - UnionCPAuthorizerExternalErrors
+    - UnionCPAuthorizerFailOpenActive
+    - UnionCPAuthorizerHighDenyRate
+    - UnionCPErrorBudgetExhausted
+    - UnionCPHandlerPanic
+    - UnionCPHighErrorBudgetBurn
+    - UnionCPHighRestartRate
+    - UnionCPIngressLatencySLOBreach
+    - UnionCPServiceDown
+
+dataplane:
+  dashboard_metrics:
+    - container_cpu_usage_seconds_total
+    - container_memory_working_set_bytes
+    - executor:actions_terminated
+    - executor:active_actions_count
+    - executor:available_capacity
+    - executor:discovery_miss_count
+    - executor:discovery_put_success_count
+    - executor:evaluator:evaluate_duration
+    - executor:evaluator:evaluate_errors
+    - executor:handler_panic
+    - executor:invalid_leases
+    - executor:system_failures
+    - executor:system_failures_exhausted
+    - flyte:propeller:all:discovery_get_failure_count
+    - flyte:propeller:all:discovery_hit_count
+    - flyte:propeller:all:discovery_miss_count
+    - flyte:propeller:all:discovery_skip_count
+    - flyte:propeller:all:execstats:active_node_executions
+    - flyte:propeller:all:execstats:active_task_executions
+    - flyte:propeller:all:execstats:active_workflow_executions
+    - flyte:propeller:all:free_workers_count
+    - flyte:propeller:all:main_adds
+    - flyte:propeller:all:main_depth
+    - flyte:propeller:all:main_retries
+    - flyte:propeller:all:metastore:cache_hit
+    - flyte:propeller:all:metastore:cache_miss
+    - flyte:propeller:all:node:event_recording:failure_duration_ms_count
+    - flyte:propeller:all:node:event_recording:success_duration_ms_count
+    - flyte:propeller:all:node:node_exec_latency_unlabeled_us
+    - flyte:propeller:all:node:queueing_latency_unlabeled_ms
+    - flyte:propeller:all:round:error_count
+    - flyte:propeller:all:round:panic_unlabeled
+    - flyte:propeller:all:round:round_time_unlabeled_ms
+    - flyte:propeller:all:round:success_count
+    - flyte:propeller:all:sub_adds
+    - flyte:propeller:all:sub_depth
+    - flyte:propeller:all:task:event_recording:failure_duration_ms_count
+    - flyte:propeller:all:task:event_recording:success_duration_ms_count
+    - flyte:propeller:all:wf_too_large
+    - flyte:propeller:all:wf_update_conflict
+    - flyte:propeller:all:wf_update_failed
+    - flyte:propeller:all:wf_update_latency_ms
+    - flyte:propeller:all:wf_updated
+    - grpc_client_handled_total
+    - grpc_client_handling_seconds_bucket
+    - k8s_client_rate_limiter_latency_unlabeled_bucket
+    - k8s_client_request_latency_unlabeled_bucket
+    - k8s_client_request_total_unlabeled
+    - kube_deployment_spec_replicas
+    - kube_deployment_status_replicas_available
+    - kube_pod_container_status_restarts_total
+    - union:dp:slo:error_budget_remaining
+    - union:dp:slo:execution_success_rate
+    - union:dp:slo:executor_success_rate
+    - union:dp:slo:propeller_round_latency_p99
+    - union:dp:slo:propeller_success_rate
+    - union_operator:billable_usage_collector:run_errors
+    - union_operator:billable_usage_collector:runs
+    - union_operator:config_syncer:propeller_configmap_updated
+    - union_operator:config_syncer:run_errors
+    - union_operator:config_syncer:runs
+    - union_operator:heartbeat:compute_capabilities_ms
+    - union_operator:heartbeat:compute_usages_ms
+    - union_operator:heartbeat:list_workflows_ms
+    - union_operator:heartbeat_updater:run_errors
+    - union_operator:heartbeat_updater:runs
+    - union_operator:prometheus_health_checker:run_errors
+    - union_operator:status_updater:run_errors
+    - union_operator:status_updater:runs
+    - union_operator:work_queue:operations_failed
+    - union_operator:work_queue:operations_processed
+    - union_operator:work_queue:paused
+  recording_rules:
+    - union:dp:executor:active_actions
+    - union:dp:operator:work_queue_failed_rate
+    - union:dp:propeller:active_workflows
+    - union:dp:propeller:queue_depth
+    - union:dp:propeller:round_latency_p99
+    - union:dp:service_availability
+    - union:dp:slo:availability:ratio
+    - union:dp:slo:error_budget_remaining
+    - union:dp:slo:execution_success_rate
+    - union:dp:slo:executor_success_rate
+    - union:dp:slo:propeller_round_latency_p99
+    - union:dp:slo:propeller_success_rate
+  alerts:
+    - UnionDPErrorBudgetExhausted
+    - UnionDPHandlerPanic
+    - UnionDPHighErrorBudgetBurn
+    - UnionDPHighRestartRate
+    - UnionDPPropellerLatencySLOBreach
+    - UnionDPServiceDown
+
diff --git a/scripts/extract-metrics.py b/scripts/extract-metrics.py
new file mode 100755
index 00000000..5791e353
--- /dev/null
+++ b/scripts/extract-metrics.py
@@ -0,0 +1,239 @@
+#!/usr/bin/env python3
+"""Extract metric names from Grafana dashboards and PrometheusRule templates.
+
+Produces a sorted, deduplicated YAML manifest of all metrics referenced in
+shipped dashboards and PrometheusRule CRDs. This manifest makes metric
+additions, removals, and renames visible in PR diffs.
+
+Usage:
+    python scripts/extract-metrics.py > metrics-manifest.yaml
+"""
+
+import json
+import re
+import sys
+from pathlib import Path
+
+REPO_ROOT = Path(__file__).resolve().parent.parent  # parent of the directory holding this script
+
+DASHBOARD_FILES = [  # Grafana dashboard JSON files whose panel queries are scanned
+    REPO_ROOT / "charts/controlplane/dashboards/union-controlplane-overview.json",
+    REPO_ROOT / "charts/dataplane/dashboards/union-dataplane-overview.json",
+]
+
+PROMETHEUSRULE_FILES = [  # PrometheusRule templates scanned for record:/alert:/expr: entries
+    REPO_ROOT / "charts/controlplane/templates/monitoring/prometheusrule.yaml",
+    REPO_ROOT / "charts/dataplane/templates/monitoring/prometheusrule.yaml",
+]
+
+# Prometheus metric names match [a-zA-Z_:][a-zA-Z0-9_:]*. This pattern matches every
+# identifier-shaped token in an expression; _is_metric_name() filters out the non-metrics.
+METRIC_NAME_RE = re.compile(r'[a-zA-Z_:][a-zA-Z0-9_:]*')
+
+# Known non-metric tokens to exclude (PromQL functions, keywords, labels, etc.)
+EXCLUDE = {
+    # PromQL functions
+    'abs', 'absent', 'avg', 'avg_over_time', 'bottomk', 'ceil', 'changes',
+    'clamp', 'clamp_max', 'clamp_min', 'count', 'count_over_time', 'count_values',
+    'day_of_month', 'day_of_week', 'days_in_month', 'delta', 'deriv', 'exp',
+    'floor', 'group', 'histogram_quantile', 'holt_winters', 'hour', 'idelta',
+    'increase', 'irate', 'label_join', 'label_replace', 'last_over_time', 'ln',
+    'log2', 'log10', 'max', 'max_over_time', 'min', 'min_over_time', 'minute',
+    'month', 'predict_linear', 'quantile', 'quantile_over_time', 'rate', 'resets',
+    'round', 'scalar', 'sgn', 'sort', 'sort_desc', 'sqrt', 'stddev',
+    'stddev_over_time', 'stdvar', 'stdvar_over_time', 'sum', 'sum_over_time',
+    'time', 'timestamp', 'topk', 'vector', 'year',
+    # PromQL aggregation modifiers / keywords
+    'by', 'without', 'on', 'ignoring', 'group_left', 'group_right', 'bool',
+    'offset', 'and', 'or', 'unless',
+    # Common label names (not metrics); 'quantile' is repeated from the functions above
+    'namespace', 'pod', 'container', 'deployment', 'service', 'code', 'status',
+    'host', 'path', 'le', 'quantile', 'job', 'instance', 'grpc_service',
+    'grpc_method', 'grpc_code', 'type', 'op', 'worker_name', 'org',
+    'cluster_name', 'cluster', 'operation', 'phase', 'error_type',
+    'error_source', 'identity_type', 'action', 'subsystem', 'name',
+    # Grafana template variables / constants
+    '__rate_interval', '__name__', '__NAMESPACE__',
+    # Label values that are not metrics
+    'OK', 'Canceled', 'NotFound', 'Succeeded',
+}
+
+# Minimum token length to be considered a metric; _is_metric_name() discards shorter tokens.
+MIN_METRIC_LEN = 4
+
+
+def extract_from_dashboard(filepath: Path) -> set[str]:
+    """Return the metric names referenced by the panel queries of a Grafana dashboard JSON file.
+
+    Every panel target's `expr` is scanned; nested panels (collapsed rows) are walked
+    recursively. Raises OSError if the file is unreadable and json.JSONDecodeError if it
+    is not valid JSON.
+    """
+    metrics: set[str] = set()
+    with open(filepath, encoding="utf-8") as f:  # explicit: do not rely on the locale default
+        dashboard = json.load(f)
+
+    def walk_panels(panels):
+        for panel in panels:
+            walk_panels(panel.get('panels', []))  # nested panels (collapsed rows)
+            for target in panel.get('targets', []):
+                expr = target.get('expr', '')
+                if expr:  # same tokenizer as the PrometheusRule path
+                    _extract_promql_metrics(expr, metrics)
+
+    walk_panels(dashboard.get('panels', []))
+    return metrics
+
+
+def _is_metric_name(token: str) -> bool:
+    """Decide whether *token* is plausibly a Prometheus metric name.
+
+    Rejects excluded or too-short tokens, `$`/`.`/`:`/`_` prefixes, a trailing `:`,
+    digits-only tokens, capitalized words, and tokens with neither `:` nor `_`.
+    """
+    if len(token) < MIN_METRIC_LEN or token in EXCLUDE:
+        return False
+    rejected = (
+        token.startswith(('$', '.', ':', '_'))
+        or token.endswith(':')
+        or token.replace('_', '').isdigit()
+        or token[0].isupper()
+    )
+    return not rejected and (':' in token or '_' in token)
+
+
+def _extract_promql_metrics(expr: str, metrics: set) -> None:
+    """Add every metric-like token found in the PromQL string *expr* to *metrics* (mutated in place)."""
+    candidates = METRIC_NAME_RE.findall(expr)
+    # set.update keeps the caller's set object, matching the in-place contract.
+    metrics.update(name for name in candidates if _is_metric_name(name))
+
+
+def extract_from_prometheusrule(filepath: Path) -> dict:
+    """Extract recording rule names, alert names, and source metrics from a PrometheusRule template.
+
+    The file is a Helm template, so it is scanned line by line as text instead of being
+    parsed as YAML.
+
+    Args:
+        filepath: Path to the PrometheusRule template.
+
+    Returns:
+        A dict with three sets: 'recording_rules' (values of `record:` keys), 'alerts'
+        (values of `alert:` keys) and 'source_metrics' (metric-like tokens found in
+        `expr:` blocks, minus the recording rule and alert names).
+
+    Raises:
+        OSError: If the file cannot be read.
+    """
+    text = filepath.read_text(encoding="utf-8")
+    # Rule and alert names; surrounding quotes are stripped so quoted YAML values still
+    # match the bare names that appear inside expressions.
+    recording_rules = {m.group(1).strip('"\'') for m in re.finditer(r'record:\s*(\S+)', text)}
+    alerts = {m.group(1).strip('"\'') for m in re.finditer(r'alert:\s*(\S+)', text)}
+    source_metrics = set()
+    # Helm actions such as `{{ .Values.some_key }}` are not PromQL; blank them out so
+    # their identifiers are never mistaken for metric names.
+    helm_action = re.compile(r'\{\{.*?\}\}')
+    # Keys that end an `expr:` block: annotation/label sections and the next rule's fields.
+    block_end = ('annotations:', 'summary:', 'description:', 'labels:',
+                 'record:', 'alert:', 'for:')
+
+    in_expr = False
+    for line in text.splitlines():
+        stripped = line.strip()
+        if stripped.startswith(block_end):
+            in_expr = False
+        elif stripped.startswith('expr:'):
+            in_expr = True
+            # Drop the key and any block-scalar indicator (`expr: |`); the rest may be PromQL.
+            inline = stripped[len('expr:'):].strip().lstrip('|').strip()
+            _extract_promql_metrics(helm_action.sub(' ', inline), source_metrics)
+        elif not stripped:
+            in_expr = False  # a blank line terminates a multi-line expression
+        elif in_expr and not stripped.startswith('#'):
+            _extract_promql_metrics(helm_action.sub(' ', stripped), source_metrics)
+
+    # Recording rules and alerts are reported in their own sections, not as source metrics.
+    source_metrics -= recording_rules
+    source_metrics -= alerts
+    # Defensive filter for leftover Helm artifacts and non-metric tokens. The parentheses
+    # matter: `and` binds tighter than `or`, so without them any token containing `_`
+    # would bypass the Values/Release checks.
+    source_metrics = {m for m in source_metrics
+                      if not m.startswith(('Values', 'Release'))
+                      and (':' in m or '_' in m)}
+
+    return {
+        'recording_rules': recording_rules,
+        'alerts': alerts,
+        'source_metrics': source_metrics,
+    }
+
+
+def plane_name(filepath: Path) -> str:
+    """Return 'controlplane' or 'dataplane' for a chart file, falling back to the file stem."""
+    # Match whole path components, nearest to the file first, so a checkout directory
+    # such as `dataplane-charts/` or `my-controlplane/` cannot be mistaken for the chart.
+    matches = [part for part in reversed(filepath.parts) if part in ('controlplane', 'dataplane')]
+    return matches[0] if matches else filepath.stem
+
+
+def emit_yaml(data: dict) -> str:
+    """Render the manifest as YAML text using plain string formatting (no YAML library).
+
+    `data` maps 'controlplane' and 'dataplane' to a dict that may hold the keys
+    'dashboard_metrics', 'recording_rules' and 'alerts'; a missing key renders as an
+    empty section, while a missing plane raises KeyError. Entries are sorted. The
+    returned text ends with a single newline.
+    """
+    # The header is emitted verbatim at the top of the generated file.
+    header = (
+        "# Auto-generated metrics manifest — do not edit manually.",
+        "# Regenerate with: make generate-metrics-manifest",
+        "#",
+        "# This file tracks all metrics referenced in shipped Grafana dashboards",
+        "# and PrometheusRule CRDs. Changes here signal that the metrics glossary",
+        "# in unionai-docs may need updating.",
+        "#",
+        "# Docs: unionai-docs/content/deployment/selfhosted/monitoring/metrics-glossary.md",
+        "",
+    )
+    # Sections appear in this fixed order under each plane.
+    sections = ('dashboard_metrics', 'recording_rules', 'alerts')
+
+    out = list(header)
+    for plane in ('controlplane', 'dataplane'):
+        plane_data = data[plane]
+        out.append(f"{plane}:")
+        for section in sections:
+            out.append(f"  {section}:")
+            out.extend(f"    - {entry}" for entry in sorted(plane_data.get(section, [])))
+        out.append("")  # blank line after each plane
+
+    return '\n'.join(out)
+
+
+def main():
+    """Build the manifest from the configured files and print it to stdout."""
+    manifest = {}
+
+    # One dashboard per plane: the assignment replaces any earlier value for that plane.
+    for dashboard_path in DASHBOARD_FILES:
+        entry = manifest.setdefault(plane_name(dashboard_path), {})
+        entry['dashboard_metrics'] = extract_from_dashboard(dashboard_path)
+
+    for rule_path in PROMETHEUSRULE_FILES:
+        extracted = extract_from_prometheusrule(rule_path)
+        entry = manifest.setdefault(plane_name(rule_path), {})
+        entry['recording_rules'] = extracted['recording_rules']
+        entry['alerts'] = extracted['alerts']
+        # Metrics referenced by rule expressions are listed together with the
+        # dashboard metrics of the same plane.
+        entry.setdefault('dashboard_metrics', set()).update(extracted['source_metrics'])
+
+    print(emit_yaml(manifest))
+
+
+if __name__ == '__main__':
+    main()