diff --git a/CHANGELOG.md b/CHANGELOG.md index 82053613..631b3ae2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ All notable changes to this project will be documented in this file. - Use `--file-log-max-files` (or `FILE_LOG_MAX_FILES`) to limit the number of log files kept. - Use `--file-log-rotation-period` (or `FILE_LOG_ROTATION_PERIOD`) to configure the frequency of rotation. - Use `--console-log-format` (or `CONSOLE_LOG_FORMAT`) to set the format to `plain` (default) or `json`. +- Add built-in Prometheus support and expose metrics on the `/metrics` path of the `native-metrics` port ([#955]). ### Changed @@ -45,6 +46,7 @@ All notable changes to this project will be documented in this file. [#942]: https://github.com/stackabletech/zookeeper-operator/pull/942 [#946]: https://github.com/stackabletech/zookeeper-operator/pull/946 [#950]: https://github.com/stackabletech/zookeeper-operator/pull/950 +[#955]: https://github.com/stackabletech/zookeeper-operator/pull/955 ## [25.3.0] - 2025-03-21 diff --git a/docs/modules/zookeeper/pages/usage_guide/monitoring.adoc b/docs/modules/zookeeper/pages/usage_guide/monitoring.adoc index f50ad9f0..b53f18c6 100644 --- a/docs/modules/zookeeper/pages/usage_guide/monitoring.adoc +++ b/docs/modules/zookeeper/pages/usage_guide/monitoring.adoc @@ -2,4 +2,23 @@ :description: The managed ZooKeeper instances are automatically configured to export Prometheus metrics. The managed ZooKeeper instances are automatically configured to export Prometheus metrics. -See xref:operators:monitoring.adoc[] for more details. +See xref:operators:monitoring.adoc[window=_blank] for more details. + +Depending on the SDP version, different ZooKeeper monitoring systems are used to produce metrics. Currently, JMX in combination with the JMX Exporter +is used, but it will be removed in a later release. Starting with SDP 25.7, the built-in Prometheus support of ZooKeeper is available as well. +The naming of the metrics differs between the two systems. 
+ +== Metrics + +Starting with SDP 25.7 ZooKeeper is configured to export metrics using the built-in Prometheus provider. More on the Prometheus provider in +the https://zookeeper.apache.org/doc/current/zookeeperMonitor.html[ZooKeeper Monitor Guide,window=_blank]. + +The configuration is located in the `zoo.cfg`: + +[source,properties] +---- +metricsProvider.className=org.apache.zookeeper.metrics.prometheus.PrometheusMetricsProvider +metricsProvider.httpPort=7000 +---- + +The metrics can be accessed by calling the `/metrics` endpoint on the specified port. diff --git a/rust/operator-binary/src/crd/mod.rs b/rust/operator-binary/src/crd/mod.rs index 532a95d2..8fd6291a 100644 --- a/rust/operator-binary/src/crd/mod.rs +++ b/rust/operator-binary/src/crd/mod.rs @@ -48,6 +48,8 @@ pub const ZOOKEEPER_PROPERTIES_FILE: &str = "zoo.cfg"; pub const JVM_SECURITY_PROPERTIES_FILE: &str = "security.properties"; pub const METRICS_PORT: u16 = 9505; +pub const METRICS_PROVIDER_HTTP_PORT_KEY: &str = "metricsProvider.httpPort"; +pub const METRICS_PROVIDER_HTTP_PORT: u16 = 7000; pub const STACKABLE_DATA_DIR: &str = "/stackable/data"; pub const STACKABLE_CONFIG_DIR: &str = "/stackable/config"; @@ -468,6 +470,16 @@ impl Configuration for v1alpha1::ZookeeperConfigFragment { v1alpha1::ZookeeperConfig::DATA_DIR.to_string(), Some(STACKABLE_DATA_DIR.to_string()), ); + result.insert( + "metricsProvider.className".to_string(), + Some( + "org.apache.zookeeper.metrics.prometheus.PrometheusMetricsProvider".to_string(), + ), + ); + result.insert( + METRICS_PROVIDER_HTTP_PORT_KEY.to_string(), + Some(METRICS_PROVIDER_HTTP_PORT.to_string()), + ); } Ok(result) diff --git a/rust/operator-binary/src/zk_controller.rs b/rust/operator-binary/src/zk_controller.rs index 1553be86..706677d3 100644 --- a/rust/operator-binary/src/zk_controller.rs +++ b/rust/operator-binary/src/zk_controller.rs @@ -72,8 +72,9 @@ use crate::{ config::jvm::{construct_non_heap_jvm_args, construct_zk_server_heap_env}, crd::{ 
DOCKER_IMAGE_BASE_NAME, JVM_SECURITY_PROPERTIES_FILE, MAX_PREPARE_LOG_FILE_SIZE, - MAX_ZK_LOG_FILES_SIZE, STACKABLE_CONFIG_DIR, STACKABLE_DATA_DIR, STACKABLE_LOG_CONFIG_DIR, - STACKABLE_LOG_DIR, STACKABLE_RW_CONFIG_DIR, ZOOKEEPER_PROPERTIES_FILE, ZookeeperRole, + MAX_ZK_LOG_FILES_SIZE, METRICS_PROVIDER_HTTP_PORT, METRICS_PROVIDER_HTTP_PORT_KEY, + STACKABLE_CONFIG_DIR, STACKABLE_DATA_DIR, STACKABLE_LOG_CONFIG_DIR, STACKABLE_LOG_DIR, + STACKABLE_RW_CONFIG_DIR, ZOOKEEPER_PROPERTIES_FILE, ZookeeperRole, security::{self, ZookeeperSecurity}, v1alpha1, }, @@ -415,6 +416,7 @@ pub async fn reconcile_zk( &rolegroup, &resolved_product_image, &zookeeper_security, + rolegroup_config, )?; let rg_configmap = build_server_rolegroup_config_map( zk, @@ -675,6 +677,7 @@ fn build_server_rolegroup_service( rolegroup: &RoleGroupRef, resolved_product_image: &ResolvedProductImage, zookeeper_security: &ZookeeperSecurity, + rolegroup_config: &HashMap>, ) -> Result { let prometheus_label = Label::try_from(("prometheus.io/scrape", "true")).context(BuildLabelSnafu)?; @@ -716,6 +719,12 @@ fn build_server_rolegroup_service( protocol: Some("TCP".to_string()), ..ServicePort::default() }, + ServicePort { + name: Some("native-metrics".to_string()), + port: metrics_port_from_rolegroup_config(rolegroup_config).into(), + protocol: Some("TCP".to_string()), + ..ServicePort::default() + }, ]), selector: Some(service_selector_labels.into()), publish_not_ready_addresses: Some(true), @@ -898,6 +907,10 @@ fn build_server_rolegroup_statefulset( .add_container_port("zk-leader", 2888) .add_container_port("zk-election", 3888) .add_container_port("metrics", 9505) + .add_container_port( + "native-metrics", + metrics_port_from_rolegroup_config(server_config).into(), + ) .add_volume_mount("data", STACKABLE_DATA_DIR) .context(AddVolumeMountSnafu)? 
.add_volume_mount("config", STACKABLE_CONFIG_DIR) @@ -1063,6 +1076,42 @@ fn build_server_rolegroup_statefulset( }) }
+/// Returns the port of ZooKeeper's built-in Prometheus metrics endpoint for a rolegroup.
+///
+/// The port is read from the [`METRICS_PROVIDER_HTTP_PORT_KEY`] entry of the
+/// [`ZOOKEEPER_PROPERTIES_FILE`] section of `rolegroup_config`. If that value cannot be
+/// parsed as a `u16`, the failure is logged and [`METRICS_PROVIDER_HTTP_PORT`] is returned.
+///
+/// # Panics
+///
+/// Panics if `rolegroup_config` has no [`ZOOKEEPER_PROPERTIES_FILE`] section or if that
+/// section does not contain [`METRICS_PROVIDER_HTTP_PORT_KEY`].
+fn metrics_port_from_rolegroup_config(
+    rolegroup_config: &HashMap<PropertyNameKind, BTreeMap<String, String>>,
+) -> u16 {
+    let metrics_port = rolegroup_config
+        .get(&PropertyNameKind::File(
+            ZOOKEEPER_PROPERTIES_FILE.to_string(),
+        ))
+        // `expect` takes a plain `&str` and never interpolates `{...}`, so the message is
+        // built with `panic!` to make the file name / key actually appear in it.
+        .unwrap_or_else(|| panic!("{ZOOKEEPER_PROPERTIES_FILE} should be present"))
+        .get(METRICS_PROVIDER_HTTP_PORT_KEY)
+        .unwrap_or_else(|| panic!("{METRICS_PROVIDER_HTTP_PORT_KEY} should be set"));
+
+    match u16::from_str(metrics_port) {
+        Ok(port) => port,
+        Err(err) => {
+            // Include the offending value and key; the bare parse error is not actionable.
+            tracing::error!(
+                "Invalid {METRICS_PROVIDER_HTTP_PORT_KEY} value {metrics_port:?}: {err}"
+            );
+            tracing::info!("Defaulting to using {METRICS_PROVIDER_HTTP_PORT} as metrics port.");
+            METRICS_PROVIDER_HTTP_PORT
+        }
+    }
+}
+
 pub fn error_policy( _obj: Arc>, error: &Error, diff --git a/tests/templates/kuttl/smoke/test_zookeeper.py b/tests/templates/kuttl/smoke/test_zookeeper.py index dfef9589..f3582f9c 100755 --- a/tests/templates/kuttl/smoke/test_zookeeper.py +++ b/tests/templates/kuttl/smoke/test_zookeeper.py @@ -3,6 +3,7 @@ import requests import time import sys + sys.tracebacklimit = 0 @@ -37,17 +38,29 @@ def check_ruok(hosts): url = host + ":8080/commands/" + cmd_ruok response = try_get(url).json() - if "command" in response and response["command"] == cmd_ruok \ - and "error" in response and response["error"] is None: + if ( + "command" in response + and response["command"] == cmd_ruok + and "error" in response + and response["error"] is None + ): continue else: - print("Error[" + cmd_ruok + "] for [" + url + "]: received " + str( - response) + " - expected {'command': 'ruok', 'error': None} ") + print( + "Error[" + + cmd_ruok + + "] for [" + + url + + "]: received " + + str(response) + + " - expected {'command': 'ruok', 'error': None} " + ) exit(-1) def check_monitoring(hosts): for host in hosts: + # test for the jmx exporter metrics url = host + ":9505" response = try_get(url) @@ -57,16 +70,46 @@ def check_monitoring(hosts): print("Error for [" + url + "]: could not access monitoring") exit(-1) + # 
test for the native metrics + url = host + ":7000/metrics" + response = try_get(url) + + if response.ok: + # arbitrary metric was chosen to test if metrics are present in the response + if "quorum_size" in response.text: + continue + else: + print("Error for [" + url + "]: missing metrics") + exit(-1) + continue + else: + print("Error for [" + url + "]: could not access monitoring") + exit(-1) + -if __name__ == '__main__': +if __name__ == "__main__": all_args = argparse.ArgumentParser(description="Test ZooKeeper.") - all_args.add_argument("-n", "--namespace", help="The namespace to run in", required=True) + all_args.add_argument( + "-n", "--namespace", help="The namespace to run in", required=True + ) args = vars(all_args.parse_args()) namespace = args["namespace"] - host_primary_0 = "http://test-zk-server-primary-0.test-zk-server-primary." + namespace + ".svc.cluster.local" - host_primary_1 = "http://test-zk-server-primary-1.test-zk-server-primary." + namespace + ".svc.cluster.local" - host_secondary = "http://test-zk-server-secondary-0.test-zk-server-secondary." + namespace + ".svc.cluster.local" + host_primary_0 = ( + "http://test-zk-server-primary-0.test-zk-server-primary." + + namespace + + ".svc.cluster.local" + ) + host_primary_1 = ( + "http://test-zk-server-primary-1.test-zk-server-primary." + + namespace + + ".svc.cluster.local" + ) + host_secondary = ( + "http://test-zk-server-secondary-0.test-zk-server-secondary." + + namespace + + ".svc.cluster.local" + ) hosts = [host_primary_0, host_primary_1, host_secondary]