Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,27 @@ helm template ... # Manually inspect rendered output
All tests must pass before merging. Generated test snapshots (`tests/generated/*.yaml`) should
be committed alongside template changes.

## Metrics Glossary Maintenance

When modifying any of these files, the metrics glossary may need updating:

- `charts/controlplane/dashboards/*.json` — Grafana dashboard panels
- `charts/dataplane/dashboards/*.json` — Grafana dashboard panels
- `charts/*/templates/monitoring/prometheusrule.yaml` — Recording rules and alerts

After changing any of these files, regenerate the metrics manifest and commit the result alongside your change:

```bash
make generate-metrics-manifest
```

If the manifest diff shows added, removed, or renamed metrics, the customer-facing
metrics glossary in `unionai-docs` must also be updated:

- `unionai-docs/content/deployment/selfhosted/monitoring/metrics-glossary.md`

The manifest (`metrics-manifest.yaml`) is auto-generated — do not edit it manually.

## Selfhosted Documentation

Deployment guides are colocated with the charts:
Expand Down
4 changes: 4 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -69,3 +69,7 @@ lint-dataplane-crds:
# Run `helm lint` against the sandbox chart in charts/sandbox.
.PHONY: lint-sandbox
lint-sandbox:
helm lint charts/sandbox

# Regenerate metrics-manifest.yaml from the output of scripts/extract-metrics.py.
#
# The output is written to a temporary file and moved into place only when the
# script exits successfully. Redirecting straight into metrics-manifest.yaml
# would truncate the committed manifest before the script even starts, so a
# failing or interrupted run would leave an empty or partial file behind.
.PHONY: generate-metrics-manifest
generate-metrics-manifest:
	python3 scripts/extract-metrics.py > metrics-manifest.yaml.tmp \
		|| { rm -f metrics-manifest.yaml.tmp; exit 1; }
	mv metrics-manifest.yaml.tmp metrics-manifest.yaml
257 changes: 257 additions & 0 deletions metrics-manifest.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,257 @@
# Auto-generated metrics manifest — do not edit manually.
# Regenerate with: make generate-metrics-manifest
#
# This file tracks all metrics referenced in shipped Grafana dashboards
# and PrometheusRule CRDs. Changes here signal that the metrics glossary
# in unionai-docs may need updating.
#
# Docs: unionai-docs/content/deployment/selfhosted/monitoring/metrics-glossary.md

controlplane:
dashboard_metrics:
- authorizer:authorizer:cloudauthorizer:connect:authorize_duration_ms
- authorizer:authorizer:cloudauthorizer:connect:authorize_errors_total
- authorizer:authorizer:cloudauthorizer:connect:authz_allowed
- authorizer:authorizer:cloudauthorizer:connect:authz_denied
- authorizer:authorizer:cloudauthorizer:connect:authz_type_info
- authorizer:authorizer:cloudauthorizer:connect:backend_authorize_duration_ms_bucket
- authorizer:authorizer:cloudauthorizer:connect:backend_authorize_errors
- authorizer:authorizer:cloudauthorizer:connect:external:authorize_duration_count
- authorizer:authorizer:cloudauthorizer:connect:external:errors
- authorizer:authorizer:cloudauthorizer:connect:external:fail_open_activated
- authorizer:handler_panic
- cluster:cluster_sync:health:last_update_age
- cluster:cluster_sync:health:unhealthy
- cluster:database:postgres:errors:gorm_error
- cluster:database:postgres:errors:not_found
- cluster:database:postgres:errors:postgres_error
- cluster:handler_panic
- cluster:managed_cluster_client_cache:get:hits
- cluster:managed_cluster_client_cache:get:miss
- cluster:svc:heartbeat:success_ms
- cluster:svc:heartbeat:success_ms_count
- cluster:svc:update_status:operator_restarts
- cluster:svc:update_status:propeller_restarts
- cluster:svc:update_status:success_ms
- cluster:svc:update_status:updates_total
- connect:server_requests_handled_total
- container_cpu_usage_seconds_total
- container_memory_working_set_bytes
- dataproxy:clusterpoolcache:hits
- dataproxy:clusterpoolcache:miss
- dataproxy:domains:hits
- dataproxy:domains:miss
- dataproxy:handler_panic
- dataproxy:images:read:failure_ms_count
- dataproxy:images:read:success_ms_count
- dataproxy:secrets_service:cluster_errors
- executions:app:leaser:pending_assignment_unlabeled
- executions:app:service:first_ack_latency_unlabeled_bucket
- executions:database:postgres:errors:gorm_error
- executions:database:postgres:errors:not_found
- executions:database:postgres:errors:postgres_error
- executions:executions:handle_ack_op_bucket
- executions:executions:handle_ack_op_count
- executions:executions:handle_create_op_bucket
- executions:executions:handle_create_op_count
- executions:executions:list_clusters:hits
- executions:executions:list_clusters:miss
- executions:executions:list_nodepools:hits
- executions:executions:list_nodepools:miss
- executions:handler_panic
- executions:run:actions_sent
- executions:run:enqueue_action_failures
- executions:run:logs:tail_logs_bytes_read
- executions:run:runs_sent
- executions:run_notifier:notifications_sent
- executions:run_notifier:subscribers
- executions:workqueue:announce_cluster_assignment_bucket
- executions:workqueue:claim_operation_failures
- executions:workqueue:claim_operations
- executions:workqueue:send_operation_count
- executions:workqueue:send_operation_failures
- flyte:admin:create_execution:duration_ms
- flyte:admin:create_execution_event:duration_ms
- flyte:admin:execution_manager:active_executions
- flyte:admin:execution_manager:execution_events_created
- flyte:admin:execution_manager:execution_termination_failure
- flyte:admin:execution_manager:executions_created
- flyte:admin:execution_manager:propeller_failures
- flyte:admin:execution_manager:publish_error
- flyte:admin:execution_manager:transformer_error
- flyte:admin:get_execution:duration_ms
- flyte:admin:list_execution:duration_ms
- flyte:admin:node_execution_manager:active_node_executions
- flyte:admin:node_execution_manager:node_execution_events_created
- flyte:admin:task_execution_manager:active_executions
- flyte:admin:task_execution_manager:task_execution_events_created
- flyte:cacheservice:cache:cache_hit_unlabeled
- flyte:cacheservice:cache:get_failure_unlabeled
- flyte:cacheservice:cache:get_reservation_success_unlabeled
- flyte:cacheservice:cache:not_found_unlabeled
- flyte:cacheservice:cache:release_reservation_success_unlabeled
- flyte:cacheservice:cache:reservation_contention_unlabeled
- flyte:middleware:authorization:authz_approved
- flyte:middleware:authorization:authz_denied
- grpc_server_handled_total
- kube_deployment_spec_replicas
- kube_deployment_status_replicas_available
- kube_pod_container_status_restarts_total
- message_type
- nginx_ingress_controller_nginx_process_connections
- nginx_ingress_controller_request_duration_seconds_bucket
- nginx_ingress_controller_request_duration_seconds_count
- queue:aborter:aborted_leases
- queue:aborter:input_queue_length
- queue:db:free_threads
- queue:db:queue_length
- queue:dispatcher:chain_queue_length
- queue:dispatcher:operation_duration_bucket
- queue:dispatcher:operation_failures
- queue:eventer:record_action_errors
- queue:handler_panic
- queue:metadata_store:scheduled_action_count
- queue:metadata_store:scheduled_run_count
- queue:metadata_store:total_action_count
- queue:metadata_store:total_run_count
- queue:queue_client:free_threads
- queue:runner:completed_leases
- queue:runner:input_queue_length
- queue:scheduler:enqueued_leases
- queue:scheduler:input_queue_length
- queue:scheduler:worker_capacity
- queue:state:active_states
- queue:state:get_duration_bucket
- queue:state:put_duration_bucket
- queue:state:terminal_states
- queue:state_client:free_threads
- union:cp:slo:error_budget_remaining
- union:cp:slo:ingress_latency_p99
- union:cp:slo:ingress_success_rate
- usage:handler_panic
- usage:messages:messages_dropped
- usage:messages:messages_failed
- usage:messages:messages_processed
- usage:messages:messages_received
- usage:messages:messages_sent
- usage:messages:processing_time_ms
- usage:svc:report_billable_usage
recording_rules:
- union:cp:authz:external_error_rate
- union:cp:connect_error_rate:by_service
- union:cp:connect_request_rate:by_service
- union:cp:ingress_error_rate_5xx
- union:cp:ingress_request_rate
- union:cp:pod_restart_rate_1h
- union:cp:service_availability
- union:cp:slo:availability:ratio
- union:cp:slo:connect_success_rate
- union:cp:slo:error_budget_remaining
- union:cp:slo:ingress_latency_p99
- union:cp:slo:ingress_success_rate
alerts:
- UnionCPAuthorizerExternalErrors
- UnionCPAuthorizerFailOpenActive
- UnionCPAuthorizerHighDenyRate
- UnionCPErrorBudgetExhausted
- UnionCPHandlerPanic
- UnionCPHighErrorBudgetBurn
- UnionCPHighRestartRate
- UnionCPIngressLatencySLOBreach
- UnionCPServiceDown

dataplane:
dashboard_metrics:
- container_cpu_usage_seconds_total
- container_memory_working_set_bytes
- executor:actions_terminated
- executor:active_actions_count
- executor:available_capacity
- executor:discovery_miss_count
- executor:discovery_put_success_count
- executor:evaluator:evaluate_duration
- executor:evaluator:evaluate_errors
- executor:handler_panic
- executor:invalid_leases
- executor:system_failures
- executor:system_failures_exhausted
- flyte:propeller:all:discovery_get_failure_count
- flyte:propeller:all:discovery_hit_count
- flyte:propeller:all:discovery_miss_count
- flyte:propeller:all:discovery_skip_count
- flyte:propeller:all:execstats:active_node_executions
- flyte:propeller:all:execstats:active_task_executions
- flyte:propeller:all:execstats:active_workflow_executions
- flyte:propeller:all:free_workers_count
- flyte:propeller:all:main_adds
- flyte:propeller:all:main_depth
- flyte:propeller:all:main_retries
- flyte:propeller:all:metastore:cache_hit
- flyte:propeller:all:metastore:cache_miss
- flyte:propeller:all:node:event_recording:failure_duration_ms_count
- flyte:propeller:all:node:event_recording:success_duration_ms_count
- flyte:propeller:all:node:node_exec_latency_unlabeled_us
- flyte:propeller:all:node:queueing_latency_unlabeled_ms
- flyte:propeller:all:round:error_count
- flyte:propeller:all:round:panic_unlabeled
- flyte:propeller:all:round:round_time_unlabeled_ms
- flyte:propeller:all:round:success_count
- flyte:propeller:all:sub_adds
- flyte:propeller:all:sub_depth
- flyte:propeller:all:task:event_recording:failure_duration_ms_count
- flyte:propeller:all:task:event_recording:success_duration_ms_count
- flyte:propeller:all:wf_too_large
- flyte:propeller:all:wf_update_conflict
- flyte:propeller:all:wf_update_failed
- flyte:propeller:all:wf_update_latency_ms
- flyte:propeller:all:wf_updated
- grpc_client_handled_total
- grpc_client_handling_seconds_bucket
- k8s_client_rate_limiter_latency_unlabeled_bucket
- k8s_client_request_latency_unlabeled_bucket
- k8s_client_request_total_unlabeled
- kube_deployment_spec_replicas
- kube_deployment_status_replicas_available
- kube_pod_container_status_restarts_total
- union:dp:slo:error_budget_remaining
- union:dp:slo:execution_success_rate
- union:dp:slo:executor_success_rate
- union:dp:slo:propeller_round_latency_p99
- union:dp:slo:propeller_success_rate
- union_operator:billable_usage_collector:run_errors
- union_operator:billable_usage_collector:runs
- union_operator:config_syncer:propeller_configmap_updated
- union_operator:config_syncer:run_errors
- union_operator:config_syncer:runs
- union_operator:heartbeat:compute_capabilities_ms
- union_operator:heartbeat:compute_usages_ms
- union_operator:heartbeat:list_workflows_ms
- union_operator:heartbeat_updater:run_errors
- union_operator:heartbeat_updater:runs
- union_operator:prometheus_health_checker:run_errors
- union_operator:status_updater:run_errors
- union_operator:status_updater:runs
- union_operator:work_queue:operations_failed
- union_operator:work_queue:operations_processed
- union_operator:work_queue:paused
recording_rules:
- union:dp:executor:active_actions
- union:dp:operator:work_queue_failed_rate
- union:dp:propeller:active_workflows
- union:dp:propeller:queue_depth
- union:dp:propeller:round_latency_p99
- union:dp:service_availability
- union:dp:slo:availability:ratio
- union:dp:slo:error_budget_remaining
- union:dp:slo:execution_success_rate
- union:dp:slo:executor_success_rate
- union:dp:slo:propeller_round_latency_p99
- union:dp:slo:propeller_success_rate
alerts:
- UnionDPErrorBudgetExhausted
- UnionDPHandlerPanic
- UnionDPHighErrorBudgetBurn
- UnionDPHighRestartRate
- UnionDPPropellerLatencySLOBreach
- UnionDPServiceDown

Loading
Loading