Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,8 @@ collect_mf(_Registry, Callback) ->
%% INTERNAL

collect_aggregate_metrics(Prefix, Callback) ->
collect_max_values(Prefix, Callback),
collect_key_component_metrics(Prefix, Callback).
collect_key_component_metrics(Prefix, Callback),
collect_max_values(Prefix, Callback).

collect_per_object_metrics(Prefix, Callback) ->
collect_key_component_metrics(Prefix, Callback),
Expand All @@ -71,8 +71,6 @@ collect_detailed_metrics(Prefix, Callback) ->
false
end
end,

collect_key_component_metrics(Prefix, Callback),
collect_all_matching_metrics(Prefix, Callback, VHostFilterFun).

collect_key_per_object_metrics(Prefix, Callback) ->
Expand All @@ -91,7 +89,10 @@ collect_key_per_object_metrics(Prefix, Callback) ->
Type,
Values))
end,
seshat:format(ra, #{labels => as_binary, metrics => QQMetrics})).
seshat:format(ra,
#{labels => as_binary,
metrics => QQMetrics,
filter_fun => fun onlyQueues/1})).

collect_all_matching_metrics(Prefix, Callback, VHostFilterFun) ->
maps:foreach(
Expand All @@ -106,7 +107,10 @@ collect_all_matching_metrics(Prefix, Callback, VHostFilterFun) ->
Type,
Values))
end,
seshat:format(ra, #{labels => as_binary, metrics => all, filter_fun => VHostFilterFun})).
seshat:format(ra,
#{labels => as_binary,
metrics => all,
filter_fun => VHostFilterFun})).

collect_max_values(Prefix, Callback) ->
%% max values for QQ metrics
Expand All @@ -115,20 +119,26 @@ collect_max_values(Prefix, Callback) ->
%% rabbitmq_raft_num_segments{queue="q2",vhost="/"} 10.0
%% becomes
%% rabbitmq_raft_max_num_segments 10.0
QQMetrics = [num_segments],
QQMetrics = [num_segments, commit_latency],
maps:foreach(
fun(Name, #{type := Type, help := Help, values := Values}) ->
Max = lists:max(maps:values(Values)),
Callback(
create_mf(<<Prefix/binary, "max_", (prometheus_model_helpers:metric_name(Name))/binary>>,
Help,
Type,
#{#{} => Max}))
%% TODO: this should not be hardcoded; we should do
%% something more like 'max() GROUP BY ra_system'
#{#{ra_system => quorum_queues} => Max}))

end,
seshat:format(ra, #{labels => as_binary, metrics => QQMetrics})).
seshat:format(ra,
#{labels => as_binary,
metrics => QQMetrics,
filter_fun => fun onlyQueues/1})).

collect_key_component_metrics(Prefix, Callback) ->
%% quorum queue metrics
WALMetrics = [wal_files, bytes_written, mem_tables],
SegmentWriterMetrics = [entries, segments],
maps:foreach(
Expand All @@ -139,4 +149,25 @@ collect_key_component_metrics(Prefix, Callback) ->
Type,
Values))
end,
seshat:format(ra, #{labels => as_binary, metrics => WALMetrics ++ SegmentWriterMetrics})).
seshat:format(ra,
#{labels => as_binary,
metrics => WALMetrics ++ SegmentWriterMetrics,
filter_fun => fun onlyQueues/1})),
%% Khepri and other coordination metrics
maps:foreach(
fun(Name, #{type := Type, help := Help, values := Values}) ->
Callback(
create_mf(<<Prefix/binary, (prometheus_model_helpers:metric_name(Name))/binary>>,
Help,
Type,
Values))
end,
seshat:format(ra,
#{labels => as_binary,
filter_fun => fun onlyCoordinationSystem/1})).

%% Filter predicate used as the filter_fun option of seshat:format/2 in
%% this module. Returns true when the argument is a map whose ra_system
%% key is exactly the atom coordination, and false for anything else.
%% NOTE(review): presumably the argument is a metric's label map; confirm
%% against the seshat filter_fun contract.
onlyCoordinationSystem(Labels) ->
    case Labels of
        #{ra_system := coordination} -> true;
        _Other -> false
    end.

%% Filter predicate used as the filter_fun option of seshat:format/2 in
%% this module. Returns true when the argument is a map containing a
%% queue key (with any value), and false for anything else.
%% NOTE(review): presumably the argument is a metric's label map; confirm
%% against the seshat filter_fun contract.
onlyQueues(Labels) ->
    %% is_map/1 is checked first because is_map_key/2 raises badmap on non-maps.
    is_map(Labels) andalso is_map_key(queue, Labels).
Original file line number Diff line number Diff line change
Expand Up @@ -411,7 +411,8 @@ aggregated_metrics_test(Config) ->
?assertEqual(match, re:run(Body, "^rabbitmq_io_read_time_seconds_total ", [{capture, none}, multiline])),
%% Check the first TOTALS metric value
?assertEqual(match, re:run(Body, "^rabbitmq_connections ", [{capture, none}, multiline])),
?assertEqual(nomatch, re:run(Body, "^rabbitmq_raft_commit_latency_seconds", [{capture, none}, multiline])),
?assertEqual(match, re:run(Body, "^rabbitmq_raft_commit_latency_seconds", [{capture, none}, multiline])),
?assertEqual(match, re:run(Body, "^rabbitmq_raft_max_commit_latency_seconds", [{capture, none}, multiline])),
?assertEqual(match, re:run(Body, "^rabbitmq_raft_bytes_written.*ra_log_segment_writer", [{capture, none}, multiline])),
?assertEqual(match, re:run(Body, "^rabbitmq_raft_bytes_written.*ra_log_wal", [{capture, none}, multiline])),
?assertEqual(match, re:run(Body, "^rabbitmq_raft_entries{", [{capture, none}, multiline])),
Expand Down Expand Up @@ -870,16 +871,10 @@ detailed_raft_metrics_test(Config) ->
QQMetrics = #{#{queue => "a_quorum_queue", vhost => "/"} => ["1.0"]},

{_, Body1} = http_get_with_pal(Config, "/metrics/detailed?family=ra_metrics&vhost=foo", [], 200),
%% no queues in vhost foo, so no QQ metrics
?assertEqual(ComponentMetrics,
map_get(rabbitmq_detailed_raft_wal_files, parse_response(Body1))),
?assertEqual(undefined,
maps:get(rabbitmq_detailed_raft_term, parse_response(Body1), undefined)),

{_, Body2} = http_get_with_pal(Config, "/metrics/detailed?family=ra_metrics&vhost=/", [], 200),
%% there's a queue in vhost /
?assertEqual(ComponentMetrics,
map_get(rabbitmq_detailed_raft_wal_files, parse_response(Body2))),
?assertEqual(QQMetrics,
map_get(rabbitmq_detailed_raft_term, parse_response(Body2))),

Expand Down
90 changes: 90 additions & 0 deletions release-notes/4.2.0.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,14 @@ In other words, if the responder publishes to only this queue name, then the mes
`*.cacerts` (not to be confused with `cacertfile`) settings in `rabbitmq.conf` did not have the expected effect and were removed
to eliminate confusion.

### Quorum Queue Metric Changes

Metrics emitted for Ra-based components (quorum queues, Khepri, Stream Coordinator)
have changed. Some metrics were removed, many were added, some changed their names.
Users relying on Prometheus metrics starting with `rabbitmq_raft` or `rabbitmq_detailed_raft`
will need to update their dashboards and/or alerts. If you are using the
[RabbitMQ-Quorum-Queues-Raft dashboard](https://grafana.com/grafana/dashboards/11340-rabbitmq-quorum-queues-raft/),
please update it to the latest version for RabbitMQ 4.2 compatibility.

## Release Highlights

Expand Down Expand Up @@ -407,6 +415,88 @@ compared to other versions.
* `cuttlefish` was upgraded to [`3.5.0`](https://github.com/kyorai/cuttlefish/releases)


## Ra Metric Changes

Metrics emitted for Ra-based components (quorum queues, Khepri, Stream Coordinator)
have changed. Some metrics were removed, many were added, some changed their names.
For most users this should not require any action. However, users relying on Prometheus
metrics starting with `rabbitmq_raft` or `rabbitmq_detailed_raft` will need to update
their dashboards and/or alerts. If you are using the
[RabbitMQ-Quorum-Queues-Raft dashboard](https://grafana.com/grafana/dashboards/11340-rabbitmq-quorum-queues-raft/),
please update it to the latest version for RabbitMQ 4.2 compatibility.

#### More Accurate and Detailed Ra Metrics

Ra is an internal component implementing the Raft protocol. It's the basis
for quorum queues, as well as some internal components (currently Khepri
and Stream Coordinator). For quite some time, Ra metrics were tracked in two places
but RabbitMQ relied on the old metric subsystem. In RabbitMQ 4.2, the old
Ra metrics subsystem has been removed and RabbitMQ now reports Ra metrics
from the new subsystem (implemented using the [Seshat](https://github.com/rabbitmq/seshat) library).
This migration has the following benefits:

* lower overhead, since only one subsystem is used
* more up-to-date information - the old subsystem was only refreshed every 5 seconds,
the new subsystem always returns the latest values
* additional metrics are exposed, making it easier to debug the system if necessary

### Aggregated metrics (/metrics endpoint)

* `rabbitmq_raft_num_segments` was added; it reports the number of segment files of the internal components

* `rabbitmq_raft_max_num_segments` was added; it reports the highest number of segment
files of any of the quorum queues; per-object metrics can be used to find which queue
has a high number of segment files

* `rabbitmq_raft_term_total` has been removed;
this metric was emitted accidentally as a side effect of metric aggregation;
the sum of Raft terms across all Raft clusters is a meaningless number

* some metrics contained the `_log_` substring in their name, even though they are not related to the Raft log;
hence, they were renamed to avoid the misleading part:
* `rabbitmq_raft_log_snapshot_index` -> `rabbitmq_raft_snapshot_index`
* `rabbitmq_raft_log_last_applied_index` -> `rabbitmq_raft_last_applied`
* `rabbitmq_raft_log_commit_index` -> `rabbitmq_raft_commit_index`
* `rabbitmq_raft_log_last_written_index` -> `rabbitmq_raft_last_written_index`

* `rabbitmq_raft_entry_commit_latency_seconds` has been removed; it was an average latency across all Ra clusters
in all Ra systems (RabbitMQ currently uses two separate Ra systems: one for quorum queues and one for internal
components, currently Khepri and Stream Coordinator); it was therefore not very useful, since different
components can have very different latencies

* `rabbitmq_raft_commit_latency_seconds` was added; in case of aggregated metrics, it is only reported for
internal components (currently Khepri and Stream Coordinator)

* `rabbitmq_raft_max_commit_latency_seconds` has been added; it's the highest commit latency reported by any
of the quorum queues. When it's high, per-object metrics can be used to find which specific queue reports high commit latency

### Per-object metrics (/metrics/per-object endpoint)

More metrics are reported for each queue than in older versions.

Incorrect metric names were corrected as described above.

Additionally:
* `rabbitmq_raft_term_total` has been renamed to `rabbitmq_raft_term` (the "total" suffix
was incorrect and misleading, since the metric is reported for each specific Ra cluster)

* `rabbitmq_raft_num_segments` was added; it reports the number of segment files of the internal components
and for each quorum queue

### Detailed metrics (/metrics/detailed endpoint)

When the detailed endpoint is scraped with the `family=ra_metrics` parameter,
more metrics are reported for each queue than in older versions.

Incorrect metric names were corrected as described above.

Additionally:
* `rabbitmq_raft_term_total` has been renamed to `rabbitmq_raft_term` (the "total" suffix
was incorrect and misleading, since the metric is reported for each specific Ra cluster)

* `rabbitmq_raft_num_segments` was added; it reports the number of segment files of the internal components
and for each quorum queue

## Source Code Archives

To obtain source code of the entire distribution, please download the archive named `rabbitmq-server-4.2.0.tar.xz`
Expand Down
Loading