From d40bb50fdf58560f3bb0a16206c02d7eb20dd1f0 Mon Sep 17 00:00:00 2001 From: Mason Legere Date: Thu, 5 May 2022 17:02:45 -0700 Subject: [PATCH 1/2] Move config file format to YAML - Updates the config to use YAML rather than a weird properties format that is actually JSON but allows comments through the removal of lines starting with `#` --- .gitignore | 3 - README.md | 20 +- bin/windows/kafka-monitor-start.bat | 4 +- bin/windows/kmf-run-class.bat | 6 +- build.gradle | 2 + config/multi-cluster-monitor.properties | 92 -------- config/multi-cluster-monitor.yaml | 80 +++++++ config/xinfra-monitor.properties | 197 ------------------ config/xinfra-monitor.yaml | 172 +++++++++++++++ docker/kafka-monitor-docker-entry.sh | 2 +- gradle/wrapper/gradle-wrapper.properties | 3 +- .../xinfra/monitor/XinfraMonitor.java | 17 +- .../StatsdMetricsReporterServiceConfig.java | 2 +- 13 files changed, 277 insertions(+), 323 deletions(-) delete mode 100644 config/multi-cluster-monitor.properties create mode 100644 config/multi-cluster-monitor.yaml delete mode 100644 config/xinfra-monitor.properties create mode 100644 config/xinfra-monitor.yaml diff --git a/.gitignore b/.gitignore index 5b4afd4a..177dcbe4 100644 --- a/.gitignore +++ b/.gitignore @@ -8,9 +8,6 @@ logs/ .settings/ src/test/java/com/linkedin/xinfra/monitor/RandomTests.java -config/andrew-choi.properties -config/andrew-multi-cluster-monitor.properties - kafka-monitor.iml kafka-monitor.ipr kafka-monitor.iws diff --git a/README.md b/README.md index 8313ba42..86916624 100644 --- a/README.md +++ b/README.md @@ -99,8 +99,8 @@ Xinfra Monitor supports Apache Kafka 0.8 to 2.0:
  1. We advise advanced users to run Xinfra Monitor with -./bin/xinfra-monitor-start.sh config/xinfra-monitor.properties. The default -xinfra-monitor.properties in the repo provides an simple example of how to +./bin/xinfra-monitor-start.sh config/xinfra-monitor.yaml. The default +xinfra-monitor.yaml in the repo provides a simple example of how to monitor a single cluster. You probably need to change the value of zookeeper.connect and bootstrap.servers to point to your cluster.
  2. @@ -109,7 +109,7 @@ monitor a single cluster. You probably need to change the value of Config class for respective service, e.g. ProduceServiceConfig.java and ConsumeServiceConfig.java.
    -
  3. You can specify multiple SingleClusterMonitor in the xinfra-monitor.properties to +
  4. You can specify multiple SingleClusterMonitor in the xinfra-monitor.yaml to monitor multiple Kafka clusters in one Xinfra Monitor process. As another advanced use-case, you can point ProduceService and ConsumeService to two different Kafka clusters that are connected by MirrorMaker to monitor their end-to-end latency.

  5. @@ -146,16 +146,16 @@ $ ./gradlew jar ### Start XinfraMonitor to run tests/services specified in the config file ``` -$ ./bin/xinfra-monitor-start.sh config/xinfra-monitor.properties +$ ./bin/xinfra-monitor-start.sh config/xinfra-monitor.yaml ``` ### Run Xinfra Monitor with arbitrary producer/consumer configuration (e.g. SASL enabled client) -Edit `config/xinfra-monitor.properties` to specify custom configurations for producer in the key/value map `produce.producer.props` in -`config/xinfra-monitor.properties`. Similarly specify configurations for +Edit `config/xinfra-monitor.yaml` to specify custom configurations for producer in the key/value map `produce.producer.props` in +`config/xinfra-monitor.yaml`. Similarly specify configurations for consumer as well. The documentation for producer and consumer in the key/value maps can be found in the Apache Kafka wiki. ``` -$ ./bin/xinfra-monitor-start.sh config/xinfra-monitor.properties +$ ./bin/xinfra-monitor-start.sh config/xinfra-monitor.yaml ``` ### Run SingleClusterMonitor app to monitor kafka cluster @@ -169,16 +169,16 @@ $ ./bin/single-cluster-monitor.sh --topic test --broker-list localhost:9092 --zo ``` ### Run MultiClusterMonitor app to monitor a pipeline of Kafka clusters connected by MirrorMaker -Edit `config/multi-cluster-monitor.properties` to specify the right broker and +Edit `config/multi-cluster-monitor.yaml` to specify the right broker and zookeeper url as suggested by the comment in the properties file Metrics `produce-availability-avg` and `consume-availability-avg` demonstrate whether messages can be properly produced to the source cluster and consumed -from the destination cluster. See config/multi-cluster-monitor.properties for +from the destination cluster. See config/multi-cluster-monitor.yaml for the full jmx path for these metrics. 
``` -$ ./bin/xinfra-monitor-start.sh config/multi-cluster-monitor.properties +$ ./bin/xinfra-monitor-start.sh config/multi-cluster-monitor.yaml ``` ### Run checkstyle on the java code diff --git a/bin/windows/kafka-monitor-start.bat b/bin/windows/kafka-monitor-start.bat index 45eedad7..1220c773 100644 --- a/bin/windows/kafka-monitor-start.bat +++ b/bin/windows/kafka-monitor-start.bat @@ -1,5 +1,5 @@ @echo off -REM Copyright 2016 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this +REM Copyright 2022 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this REM file except in compliance with the License. You may obtain a copy of the License at REM REM http://www.apache.org/licenses/LICENSE-2.0 @@ -15,7 +15,7 @@ popd IF [%1] EQU [] ( - echo USAGE: %0 config/xinfra-monitor.properties + echo USAGE: %0 config/xinfra-monitor.yaml EXIT /B 1 ) diff --git a/bin/windows/kmf-run-class.bat b/bin/windows/kmf-run-class.bat index caddf261..f113fe8d 100644 --- a/bin/windows/kmf-run-class.bat +++ b/bin/windows/kmf-run-class.bat @@ -1,5 +1,5 @@ @echo off -REM Copyright 2016 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this +REM Copyright 2022 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this REM file except in compliance with the License. 
You may obtain a copy of the License at REM REM http://www.apache.org/licenses/LICENSE-2.0 @@ -10,12 +10,12 @@ REM an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expre setlocal enabledelayedexpansion IF [%1] EQU [] ( - echo USAGE: %0 com.linkedin.xinfra.monitor.XinfraMonitor config/xinfra-monitor.properties + echo USAGE: %0 com.linkedin.xinfra.monitor.XinfraMonitor config/xinfra-monitor.yaml EXIT /B 1 ) IF [%2] EQU [] ( - echo USAGE: %0 %1 config/xinfra-monitor.properties + echo USAGE: %0 %1 config/xinfra-monitor.yaml EXIT /B 1 ) diff --git a/build.gradle b/build.gradle index b370bb09..74ebaa77 100644 --- a/build.gradle +++ b/build.gradle @@ -45,6 +45,8 @@ allprojects { compile group: 'org.apache.kafka', name: 'kafka-clients', version: '2.3.1' compile 'org.apache.commons:commons-lang3:3.12.0' compile 'com.linkedin.avroutil1:helper-all:0.2.81' + compile group: 'com.fasterxml.jackson.dataformat', name: 'jackson-dataformat-yaml', version: '2.10.3' + compile group: 'com.fasterxml.jackson.core', name: 'jackson-databind', version: '2.10.3' testCompile 'org.mockito:mockito-core:2.24.0' testCompile 'org.testng:testng:6.8.8' } diff --git a/config/multi-cluster-monitor.properties b/config/multi-cluster-monitor.properties deleted file mode 100644 index dd40b035..00000000 --- a/config/multi-cluster-monitor.properties +++ /dev/null @@ -1,92 +0,0 @@ -# Copyright 2016 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this -# file except in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on -# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - -# This properties file specifies an example configure to monitor a pipeline of Kafka clusters. 
-# User probably needs to change zookeeper.connect and bootstrap.servers to point to respective clusters. -# More clusters can be added in the map for "topic.management.config.per.cluster" to reference -# each cluster in the pipeline. The "produce.service.props" should use the first cluster and -# the "consume.service.props" should use the last cluster in the pipeline. - -# Produce service: Configure Produce Service to produce to the first cluster of the pipeline -# Consume service: Configure Consume Service to consume from the last cluster of the pipeline -# Last cluster: If there are more than two clusters in the pipeline, add one property map for each one of them. -{ - "multi-cluster-monitor": { - "class.name": "com.linkedin.kmf.apps.MultiClusterMonitor", - "topic": "kafka-monitor-topic", - "produce.service.props": { - "zookeeper.connect": "localhost:2181/first_cluster", - "bootstrap.servers": "localhost:9092", - "produce.record.delay.ms": 100, - "produce.producer.props": { - "client.id": "kafka-monitor-client-id" - } - }, - "consume.service.props": { - "zookeeper.connect": "localhost:2181/last_cluster", - "bootstrap.servers": "localhost:9095", - "consume.latency.sla.ms": "20000", - "consume.consumer.props": { - "group.id": "kafka-monitor-group-id" - } - }, - - "topic.management.props.per.cluster" : { - "first-cluster" : { - "bootstrap.servers": "localhost:9092", - "zookeeper.connect": "localhost:2181/first_cluster", - "topic-management.topicCreationEnabled": true, - "topic-management.replicationFactor" : 1, - "topic-management.partitionsToBrokersRatio" : 2.0, - "topic-management.rebalance.interval.ms" : 600000, - "topic-management.topicFactory.props": { - } - }, - - "last-cluster" : { - "bootstrap.servers": "localhost:9095", - "zookeeper.connect": "localhost:2181/last_cluster", - "topic-management.topicCreationEnabled": true, - "topic-management.replicationFactor" : 1, - "topic-management.partitionsToBrokersRatio" : 2.0, - 
"topic-management.rebalance.interval.ms" : 600000, - "topic-management.topicFactory.props": { - } - } - } - - }, - - "reporter-service": { - "class.name": "com.linkedin.kmf.services.DefaultMetricsReporterService", - "report.interval.sec": 1, - "report.metrics.list": [ - "kmf.services:type=produce-service,name=*:produce-availability-avg", - "kmf.services:type=consume-service,name=*:consume-availability-avg", - "kmf.services:type=produce-service,name=*:records-produced-total", - "kmf.services:type=consume-service,name=*:records-consumed-total", - "kmf.services:type=consume-service,name=*:records-lost-total", - "kmf.services:type=consume-service,name=*:records-lost-rate", - "kmf.services:type=consume-service,name=*:records-duplicated-total", - "kmf.services:type=consume-service,name=*:records-delay-ms-avg", - "kmf.services:type=produce-service,name=*:records-produced-rate", - "kmf.services:type=produce-service,name=*:produce-error-rate", - "kmf.services:type=consume-service,name=*:consume-error-rate", - "kmf.services:type=commit-availability-service,name=*:offsets-committed-avg", - "kmf.services:type=commit-availability-service,name=*:commit-latency-avg", - "kmf.services:type=commit-availability-service,name=*:commit-availability-avg", - "kmf.services:type=commit-availability-service,name=*:failed-commit-offsets-avg", - "kmf.services:type=commit-availability-service,name=*:offsets-committed-total", - "kmf.services:type=commit-availability-service,name=*:failed-commit-offsets-total" - ] - }, - - "jolokia-service": { - "class.name": "com.linkedin.kmf.services.JolokiaService" - } -} diff --git a/config/multi-cluster-monitor.yaml b/config/multi-cluster-monitor.yaml new file mode 100644 index 00000000..95bb9f69 --- /dev/null +++ b/config/multi-cluster-monitor.yaml @@ -0,0 +1,80 @@ +--- +# Copyright 2022 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this +# file except in compliance with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on +# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +# This YAML file specifies an example configuration to monitor a pipeline of Kafka clusters. +# Users will need to update `zookeeper.connect` and `bootstrap.servers` to point to respective clusters. +# More clusters can be added in the map for "topic.management.props.per.cluster" to reference +# each cluster in the pipeline. The "produce.service.props" should use the first cluster and +# the "consume.service.props" should use the last cluster in the pipeline. + +# Produce service: Configure Produce Service to produce to the first cluster of the pipeline +# Consume service: Configure Consume Service to consume from the last cluster of the pipeline +# Last cluster: If there are more than two clusters in the pipeline, add one property map for each one of them. 
+# +# For additional service configuration examples see `config/xinfra-monitor.yaml` + +multi-cluster-monitor: + class.name: com.linkedin.kmf.apps.MultiClusterMonitor + topic: kafka-monitor-topic + + produce.service.props: + zookeeper.connect: localhost:2181/first_cluster + bootstrap.servers: localhost:9092 + produce.record.delay.ms: 100 + produce.producer.props: + client.id: kafka-monitor-client-id + + consume.service.props: + zookeeper.connect: localhost:2181/last_cluster + bootstrap.servers: localhost:9095 + consume.latency.sla.ms: '20000' + consume.consumer.props: + group.id: kafka-monitor-group-id + topic.management.props.per.cluster: + first-cluster: + bootstrap.servers: localhost:9092 + zookeeper.connect: localhost:2181/first_cluster + topic-management.topicCreationEnabled: true + topic-management.replicationFactor: 1 + topic-management.partitionsToBrokersRatio: 2 + topic-management.rebalance.interval.ms: 600000 + topic-management.topicFactory.props: {} + last-cluster: + bootstrap.servers: localhost:9095 + zookeeper.connect: localhost:2181/last_cluster + topic-management.topicCreationEnabled: true + topic-management.replicationFactor: 1 + topic-management.partitionsToBrokersRatio: 2 + topic-management.rebalance.interval.ms: 600000 + topic-management.topicFactory.props: {} + +reporter-service: + class.name: com.linkedin.kmf.services.DefaultMetricsReporterService + report.interval.sec: 1 + report.metrics.list: + - kmf.services:type=produce-service,name=*:produce-availability-avg + - kmf.services:type=consume-service,name=*:consume-availability-avg + - kmf.services:type=produce-service,name=*:records-produced-total + - kmf.services:type=consume-service,name=*:records-consumed-total + - kmf.services:type=consume-service,name=*:records-lost-total + - kmf.services:type=consume-service,name=*:records-lost-rate + - kmf.services:type=consume-service,name=*:records-duplicated-total + - kmf.services:type=consume-service,name=*:records-delay-ms-avg + - 
kmf.services:type=produce-service,name=*:records-produced-rate + - kmf.services:type=produce-service,name=*:produce-error-rate + - kmf.services:type=consume-service,name=*:consume-error-rate + - kmf.services:type=commit-availability-service,name=*:offsets-committed-avg + - kmf.services:type=commit-availability-service,name=*:commit-latency-avg + - kmf.services:type=commit-availability-service,name=*:commit-availability-avg + - kmf.services:type=commit-availability-service,name=*:failed-commit-offsets-avg + - kmf.services:type=commit-availability-service,name=*:offsets-committed-total + - kmf.services:type=commit-availability-service,name=*:failed-commit-offsets-total + +jolokia-service: + class.name: com.linkedin.kmf.services.JolokiaService diff --git a/config/xinfra-monitor.properties b/config/xinfra-monitor.properties deleted file mode 100644 index 6993bf47..00000000 --- a/config/xinfra-monitor.properties +++ /dev/null @@ -1,197 +0,0 @@ -# Copyright 2016 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this -# file except in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on -# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - -# This properties file specifies the tests/services that XinfraMonitor -# should instantiate and run, together with the key/value pairs used to -# configure these tests/services. It should have the following format: -# -# { -# "name1" : { -# "type": TestClassName -# "key1": value1, -# "key2": value2, -# ... -# }, -# "name2" : { -# "type": ServiceClassName -# "key1": value1, -# "key2": value2, -# ... -# }, -# ... -# } -# -# TestClassName can be canonical name or simple name of any class that implements -# interface com.linkedin.kmf.services.Test. 
These classes should be under -# package com.linkedin.kmf.tests. -# -# ServiceClassName can be canonical name or simple name of any class that implements -# interface com.linkedin.kmf.services.Service. These classes should be under -# package com.linkedin.kmf.services. -# -# Each test/service should be configured with class.name which can be either TestClassName -# or ServiceClassName. The key for the test/service in the json map is used as name to -# identify the test/service in the log or JMX metrics, which is useful if multiple -# test/service with the same class.name are run in the same Kafka Monitor process. -# -# If using Secure Socket Layer for security protocol, SSL properties must be defined under -# produce.producer.props, consume.consumer.props, as well as single-cluster-monitor props - -{ - "single-cluster-monitor": { - "class.name": "com.linkedin.xinfra.monitor.apps.SingleClusterMonitor", - "topic": "xinfra-monitor-topic", - "zookeeper.connect": "localhost:2181", - "bootstrap.servers": "localhost:9092,localhost:9093", - "request.timeout.ms": 9000, - "produce.record.delay.ms": 100, - "topic-management.topicManagementEnabled": true, - "topic-management.topicCreationEnabled": true, - "topic-management.replicationFactor" : 1, - "topic-management.partitionsToBrokersRatio" : 2.0, - "topic-management.rebalance.interval.ms" : 600000, - "topic-management.preferred.leader.election.check.interval.ms" : 300000, - "topic-management.topicFactory.props": { - }, - "topic-management.topic.props": { - "retention.ms": "3600000" - }, - "produce.producer.props": { - "client.id": "kmf-client-id" - }, - - "consume.latency.sla.ms": "20000", - "consume.consumer.props": { - } - }, - - "offset-commit-service": { - "class.name": "com.linkedin.xinfra.monitor.services.OffsetCommitService", - "zookeeper.connect": "localhost:2181", - "bootstrap.servers": "localhost:9092,localhost:9093", - "consumer.props": { - "group.id": "target-consumer-group" - } - }, - - "jolokia-service": { - 
"class.name": "com.linkedin.xinfra.monitor.services.JolokiaService" - }, - - "reporter-service": { - "class.name": "com.linkedin.xinfra.monitor.services.DefaultMetricsReporterService", - "report.interval.sec": 1, - "report.metrics.list": [ - "kmf:type=kafka-monitor:offline-runnable-count", - "kmf.services:type=produce-service,name=*:produce-availability-avg", - "kmf.services:type=consume-service,name=*:consume-availability-avg", - "kmf.services:type=produce-service,name=*:records-produced-total", - "kmf.services:type=consume-service,name=*:records-consumed-total", - "kmf.services:type=produce-service,name=*:records-produced-rate", - "kmf.services:type=produce-service,name=*:produce-error-rate", - "kmf.services:type=consume-service,name=*:consume-error-rate", - "kmf.services:type=consume-service,name=*:records-lost-total", - "kmf.services:type=consume-service,name=*:records-lost-rate", - "kmf.services:type=consume-service,name=*:records-duplicated-total", - "kmf.services:type=consume-service,name=*:records-delay-ms-avg", - "kmf.services:type=commit-availability-service,name=*:offsets-committed-avg", - "kmf.services:type=commit-availability-service,name=*:offsets-committed-total", - "kmf.services:type=commit-availability-service,name=*:failed-commit-offsets-avg", - "kmf.services:type=commit-availability-service,name=*:failed-commit-offsets-total", - "kmf.services:type=commit-latency-service,name=*:commit-offset-latency-ms-avg", - "kmf.services:type=commit-latency-service,name=*:commit-offset-latency-ms-max", - "kmf.services:type=commit-latency-service,name=*:commit-offset-latency-ms-99th", - "kmf.services:type=commit-latency-service,name=*:commit-offset-latency-ms-999th", - "kmf.services:type=commit-latency-service,name=*:commit-offset-latency-ms-9999th", - "kmf.services:type=cluster-topic-manipulation-service,name=*:topic-creation-metadata-propagation-ms-avg", - "kmf.services:type=cluster-topic-manipulation-service,name=*:topic-creation-metadata-propagation-ms-max", 
- "kmf.services:type=cluster-topic-manipulation-service,name=*:topic-deletion-metadata-propagation-ms-avg", - "kmf.services:type=cluster-topic-manipulation-service,name=*:topic-deletion-metadata-propagation-ms-max", - "kmf.services:type=offset-commit-service,name=*:offset-commit-availability-avg", - "kmf.services:type=offset-commit-service,name=*:offset-commit-service-success-rate", - "kmf.services:type=offset-commit-service,name=*:offset-commit-service-success-total", - "kmf.services:type=offset-commit-service,name=*:offset-commit-service-failure-rate", - "kmf.services:type=offset-commit-service,name=*:offset-commit-service-failure-total" - ] - }, - - "cluster-topic-manipulation-service":{ - "class.name":"com.linkedin.xinfra.monitor.services.ClusterTopicManipulationService", - "zookeeper.connect": "localhost:2181", - "bootstrap.servers":"localhost:9092,localhost:9093", - "topic": "xinfra-monitor-topic" - }, - -# Example produce-service to produce messages to cluster -# "produce-service": { -# "class.name": "com.linkedin.kmf.services.ProduceService", -# "topic": "xinfra-monitor-topic", -# "zookeeper.connect": "localhost:2181", -# "bootstrap.servers": "localhost:9092", -# "consume.latency.sla.ms": "20000", -# "consume.consumer.props": { -# } -# }, - -# Example consume-service to consume messages -# "consume-service": { -# "class.name": "com.linkedin.kmf.services.ConsumeService", -# "topic": "xinfra-monitor-topic", -# "zookeeper.connect": "localhost:2181", -# "bootstrap.servers": "localhost:9092", -# "consume.latency.sla.ms": "20000", -# "consume.consumer.props": { -# } -# }, - -# Example statsd-service to report metrics -# "statsd-service": { -# "class.name": "com.linkedin.xinfra.monitor.services.StatsdMetricsReporterService", -# "report.statsd.host": "localhost", -# "report.statsd.port": "8125", -# "report.statsd.prefix": "xinfra-monitor", -# "report.interval.sec": 1, -# "report.metrics.list": [ -# 
"kmf.services:type=produce-service,name=*:produce-availability-avg", -# "kmf.services:type=consume-service,name=*:consume-availability-avg" -# ] -# }, - -# Example kafka-service to report metrics - "reporter-kafka-service": { - "class.name": "com.linkedin.xinfra.monitor.services.KafkaMetricsReporterService", - "report.interval.sec": 3, - "zookeeper.connect": "localhost:2181", - "bootstrap.servers": "localhost:9092", - "topic": "xinfra-monitor-topic-metrics", - "report.kafka.topic.replication.factor": 1, - "report.metrics.list": [ - "kmf.services:type=produce-service,name=*:produce-availability-avg", - "kmf.services:type=consume-service,name=*:consume-availability-avg", - "kmf.services:type=produce-service,name=*:records-produced-total", - "kmf.services:type=consume-service,name=*:records-consumed-total", - "kmf.services:type=consume-service,name=*:records-lost-total", - "kmf.services:type=consume-service,name=*:records-duplicated-total", - "kmf.services:type=consume-service,name=*:records-delay-ms-avg", - "kmf.services:type=produce-service,name=*:records-produced-rate", - "kmf.services:type=produce-service,name=*:produce-error-rate", - "kmf.services:type=consume-service,name=*:consume-error-rate" - ] - } - -# Example signalfx-service to report metrics -# "signalfx-service": { -# "class.name": "com.linkedin.kmf.services.SignalFxMetricsReporterService", -# "report.interval.sec": 1, -# "report.metric.dimensions": { -# }, -# "report.signalfx.url": "", -# "report.signalfx.token" : "" -# } - -} diff --git a/config/xinfra-monitor.yaml b/config/xinfra-monitor.yaml new file mode 100644 index 00000000..e6c32aad --- /dev/null +++ b/config/xinfra-monitor.yaml @@ -0,0 +1,172 @@ +--- +# Copyright 2022 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this +# file except in compliance with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on +# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +# This YAML file specifies the tests/services that XinfraMonitor +# should instantiate and run, together with the key/value pairs used to +# configure the tests and services. It should have the following format: +# +# +# name1: +#   class.name: TestClassName +#   key1: value1 +#   key2: value2 +#   ... +# name2: +#   class.name: ServiceClassName +#   key1: value1 +#   key2: value2 +#   ... +# ... +# +# +# TestClassName can be the canonical name or simple name of any class that implements +# the interface com.linkedin.kmf.services.Test. These classes should be under the +# package com.linkedin.kmf.tests +# +# ServiceClassName can be the canonical name or the simple name of any class that implements +# interface com.linkedin.kmf.services.Service. These classes should be under +# package com.linkedin.kmf.services +# +# Each test and service should be configured with class.name which can be either TestClassName +# or ServiceClassName. The key for the test/service in the configuration map is used as the name to +# identify the test/service in the log or JMX metrics. This is useful if multiple +# tests or services with the same class.name are run under the same Kafka Monitor process. +# +# If using Secure Socket Layer for security protocol, SSL properties must be defined under +# produce.producer.props, consume.consumer.props, as well as single-cluster-monitor properties. 
+ +single-cluster-monitor: + class.name: com.linkedin.xinfra.monitor.apps.SingleClusterMonitor + topic: xinfra-monitor-topic + zookeeper.connect: localhost:2181 + bootstrap.servers: localhost:9092,localhost:9093 + request.timeout.ms: 9000 + produce.record.delay.ms: 100 + topic-management.topicManagementEnabled: true + topic-management.topicCreationEnabled: true + topic-management.replicationFactor: 1 + topic-management.partitionsToBrokersRatio: 2 + topic-management.rebalance.interval.ms: 600000 + topic-management.preferred.leader.election.check.interval.ms: 300000 + topic-management.topicFactory.props: {} + topic-management.topic.props: + retention.ms: '3600000' + produce.producer.props: + client.id: kmf-client-id + consume.latency.sla.ms: '20000' + consume.consumer.props: {} + +offset-commit-service: + class.name: com.linkedin.xinfra.monitor.services.OffsetCommitService + zookeeper.connect: localhost:2181 + bootstrap.servers: localhost:9092,localhost:9093 + consumer.props: + group.id: target-consumer-group + +jolokia-service: + class.name: com.linkedin.xinfra.monitor.services.JolokiaService + +reporter-service: + class.name: com.linkedin.xinfra.monitor.services.DefaultMetricsReporterService + report.interval.sec: 1 + report.metrics.list: + - kmf:type=kafka-monitor:offline-runnable-count + - kmf.services:type=produce-service,name=*:produce-availability-avg + - kmf.services:type=consume-service,name=*:consume-availability-avg + - kmf.services:type=produce-service,name=*:records-produced-total + - kmf.services:type=consume-service,name=*:records-consumed-total + - kmf.services:type=produce-service,name=*:records-produced-rate + - kmf.services:type=produce-service,name=*:produce-error-rate + - kmf.services:type=consume-service,name=*:consume-error-rate + - kmf.services:type=consume-service,name=*:records-lost-total + - kmf.services:type=consume-service,name=*:records-lost-rate + - kmf.services:type=consume-service,name=*:records-duplicated-total + - 
kmf.services:type=consume-service,name=*:records-delay-ms-avg + - kmf.services:type=commit-availability-service,name=*:offsets-committed-avg + - kmf.services:type=commit-availability-service,name=*:offsets-committed-total + - kmf.services:type=commit-availability-service,name=*:failed-commit-offsets-avg + - kmf.services:type=commit-availability-service,name=*:failed-commit-offsets-total + - kmf.services:type=commit-latency-service,name=*:commit-offset-latency-ms-avg + - kmf.services:type=commit-latency-service,name=*:commit-offset-latency-ms-max + - kmf.services:type=commit-latency-service,name=*:commit-offset-latency-ms-99th + - kmf.services:type=commit-latency-service,name=*:commit-offset-latency-ms-999th + - kmf.services:type=commit-latency-service,name=*:commit-offset-latency-ms-9999th + - kmf.services:type=cluster-topic-manipulation-service,name=*:topic-creation-metadata-propagation-ms-avg + - kmf.services:type=cluster-topic-manipulation-service,name=*:topic-creation-metadata-propagation-ms-max + - kmf.services:type=cluster-topic-manipulation-service,name=*:topic-deletion-metadata-propagation-ms-avg + - kmf.services:type=cluster-topic-manipulation-service,name=*:topic-deletion-metadata-propagation-ms-max + - kmf.services:type=offset-commit-service,name=*:offset-commit-availability-avg + - kmf.services:type=offset-commit-service,name=*:offset-commit-service-success-rate + - kmf.services:type=offset-commit-service,name=*:offset-commit-service-success-total + - kmf.services:type=offset-commit-service,name=*:offset-commit-service-failure-rate + - kmf.services:type=offset-commit-service,name=*:offset-commit-service-failure-total + +cluster-topic-manipulation-service: + class.name: com.linkedin.xinfra.monitor.services.ClusterTopicManipulationService + zookeeper.connect: localhost:2181 + bootstrap.servers: localhost:9092,localhost:9093 + topic: xinfra-monitor-topic + +# Example produce-service to produce messages to cluster (uncomment to enable) +# produce-service: +#   class.name: 
com.linkedin.kmf.services.ProduceService +#   topic: xinfra-monitor-topic +#   zookeeper.connect: localhost:2181 +#   bootstrap.servers: localhost:9092 +#   consume.latency.sla.ms: '20000' +#   consume.consumer.props: {} + +# Example consume-service to consume messages (uncomment to enable) +# consume-service: +#   class.name: com.linkedin.kmf.services.ConsumeService +#   topic: xinfra-monitor-topic +#   zookeeper.connect: localhost:2181 +#   bootstrap.servers: localhost:9092 +#   consume.latency.sla.ms: '20000' +#   consume.consumer.props: {} + +# Example statsd-service to report metrics (uncomment to enable) +# statsd-service: +#   class.name: com.linkedin.xinfra.monitor.services.StatsdMetricsReporterService +#   report.statsd.host: localhost +#   report.statsd.port: '8125' +#   report.statsd.prefix: xinfra-monitor +#   report.interval.sec: 1 +#   report.metrics.list: +#     - kmf.services:type=produce-service,name=*:produce-availability-avg +#     - kmf.services:type=consume-service,name=*:consume-availability-avg + +# Example kafka-service to report metrics +reporter-kafka-service: + class.name: com.linkedin.xinfra.monitor.services.KafkaMetricsReporterService + report.interval.sec: 3 + zookeeper.connect: localhost:2181 + bootstrap.servers: localhost:9092 + topic: xinfra-monitor-topic-metrics + report.kafka.topic.replication.factor: 1 + report.metrics.list: + - kmf.services:type=produce-service,name=*:produce-availability-avg + - kmf.services:type=consume-service,name=*:consume-availability-avg + - kmf.services:type=produce-service,name=*:records-produced-total + - kmf.services:type=consume-service,name=*:records-consumed-total + - kmf.services:type=consume-service,name=*:records-lost-total + - kmf.services:type=consume-service,name=*:records-duplicated-total + - kmf.services:type=consume-service,name=*:records-delay-ms-avg + - kmf.services:type=produce-service,name=*:records-produced-rate + - kmf.services:type=produce-service,name=*:produce-error-rate + - kmf.services:type=consume-service,name=*:consume-error-rate + + +# Example signalfx-service to report metrics (uncomment to enable) 
+# signalfx-service: +#   class.name: com.linkedin.kmf.services.SignalFxMetricsReporterService +#   report.interval.sec: 1 +#   report.metric.dimensions: {} +#   report.signalfx.url: '' +#   report.signalfx.token: '' diff --git a/docker/kafka-monitor-docker-entry.sh b/docker/kafka-monitor-docker-entry.sh index 97554bb0..7a23c65b 100755 --- a/docker/kafka-monitor-docker-entry.sh +++ b/docker/kafka-monitor-docker-entry.sh @@ -22,6 +22,6 @@ trap 'pkill java; exit 143' SIGTERM # wait for DNS services to be available sleep 10 -bin/xinfra-monitor-start.sh config/xinfra-monitor.properties & +bin/xinfra-monitor-start.sh config/xinfra-monitor.yaml & wait $! \ No newline at end of file diff --git a/gradle/wrapper/gradle-wrapper.properties b/gradle/wrapper/gradle-wrapper.properties index 842c8c5a..a4b44297 100644 --- a/gradle/wrapper/gradle-wrapper.properties +++ b/gradle/wrapper/gradle-wrapper.properties @@ -1,6 +1,5 @@ -#Mon Apr 01 18:19:43 PDT 2019 distributionBase=GRADLE_USER_HOME distributionPath=wrapper/dists +distributionUrl=https\://services.gradle.org/distributions/gradle-6.3-bin.zip zipStoreBase=GRADLE_USER_HOME zipStorePath=wrapper/dists -distributionUrl=https\://services.gradle.org/distributions/gradle-5.2.1-all.zip diff --git a/src/main/java/com/linkedin/xinfra/monitor/XinfraMonitor.java b/src/main/java/com/linkedin/xinfra/monitor/XinfraMonitor.java index d516b076..bfea356d 100644 --- a/src/main/java/com/linkedin/xinfra/monitor/XinfraMonitor.java +++ b/src/main/java/com/linkedin/xinfra/monitor/XinfraMonitor.java @@ -11,11 +11,11 @@ package com.linkedin.xinfra.monitor; import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.dataformat.yaml.YAMLFactory; import com.linkedin.xinfra.monitor.apps.App; import com.linkedin.xinfra.monitor.services.Service; import com.linkedin.xinfra.monitor.services.ServiceFactory; -import java.io.BufferedReader; -import java.io.FileReader; +import java.io.File; import java.lang.reflect.Constructor; import java.util.ArrayList; 
import java.util.List; @@ -168,21 +168,14 @@ public void awaitShutdown() { @SuppressWarnings("rawtypes") public static void main(String[] args) throws Exception { if (args.length <= 0) { - LOG.info("USAGE: java [options] " + XinfraMonitor.class.getName() + " config/xinfra-monitor.properties"); + LOG.info("USAGE: java [options] " + XinfraMonitor.class.getName() + " config/xinfra-monitor.yaml"); return; } - StringBuilder buffer = new StringBuilder(); - try (BufferedReader br = new BufferedReader(new FileReader(args[0].trim()))) { - String line; - while ((line = br.readLine()) != null) { - if (!line.startsWith("#")) - buffer.append(line); - } - } + File configurationFile = new File(args[0].trim()); @SuppressWarnings("unchecked") - Map props = new ObjectMapper().readValue(buffer.toString(), Map.class); + Map props = new ObjectMapper(new YAMLFactory()).readValue(configurationFile, Map.class); XinfraMonitor xinfraMonitor = new XinfraMonitor(props); xinfraMonitor.start(); LOG.info("Xinfra Monitor has started."); diff --git a/src/main/java/com/linkedin/xinfra/monitor/services/configs/StatsdMetricsReporterServiceConfig.java b/src/main/java/com/linkedin/xinfra/monitor/services/configs/StatsdMetricsReporterServiceConfig.java index 8438fd78..f46ac13d 100644 --- a/src/main/java/com/linkedin/xinfra/monitor/services/configs/StatsdMetricsReporterServiceConfig.java +++ b/src/main/java/com/linkedin/xinfra/monitor/services/configs/StatsdMetricsReporterServiceConfig.java @@ -8,7 +8,7 @@ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* * - * In order to enable the StatsD metrics export, add the following section to xinfra-monitor.properties file + * In order to enable the StatsD metrics export, add the following section to xinfra-monitor.yaml file * */ From 4b289dc33691657a2077a2dfb1f878383793af42 Mon Sep 17 00:00:00 2001 From: Mason Legere Date: Thu, 5 May 2022 17:13:34 -0700 Subject: [PATCH 2/2] Remove unneeded dependency --- build.gradle | 1 - 1 file changed, 1 deletion(-) diff --git a/build.gradle b/build.gradle index 74ebaa77..17f171cb 100644 --- a/build.gradle +++ b/build.gradle @@ -46,7 +46,6 @@ allprojects { compile 'org.apache.commons:commons-lang3:3.12.0' compile 'com.linkedin.avroutil1:helper-all:0.2.81' compile group: 'com.fasterxml.jackson.dataformat', name: 'jackson-dataformat-yaml', version: '2.10.3' - compile group: 'com.fasterxml.jackson.core', name: 'jackson-databind', version: '2.10.3' testCompile 'org.mockito:mockito-core:2.24.0' testCompile 'org.testng:testng:6.8.8' }