diff --git a/.gitignore b/.gitignore
index 5b4afd4a..177dcbe4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,9 +8,6 @@ logs/
.settings/
src/test/java/com/linkedin/xinfra/monitor/RandomTests.java
-config/andrew-choi.properties
-config/andrew-multi-cluster-monitor.properties
-
kafka-monitor.iml
kafka-monitor.ipr
kafka-monitor.iws
diff --git a/README.md b/README.md
index 8313ba42..86916624 100644
--- a/README.md
+++ b/README.md
@@ -99,8 +99,8 @@ Xinfra Monitor supports Apache Kafka 0.8 to 2.0:
- We advise advanced users to run Xinfra Monitor with
-
./bin/xinfra-monitor-start.sh config/xinfra-monitor.properties
. The default
-xinfra-monitor.properties in the repo provides an simple example of how to
+./bin/xinfra-monitor-start.sh config/xinfra-monitor.yaml
. The default
+xinfra-monitor.yaml in the repo provides a simple example of how to
monitor a single cluster. You probably need to change the value of
zookeeper.connect
and bootstrap.servers
to point to your cluster.
@@ -109,7 +109,7 @@ monitor a single cluster. You probably need to change the value of
Config class for respective service, e.g. ProduceServiceConfig.java and
ConsumeServiceConfig.java.
-- You can specify multiple SingleClusterMonitor in the xinfra-monitor.properties to
+
- You can specify multiple SingleClusterMonitor in the xinfra-monitor.yaml to
monitor multiple Kafka clusters in one Xinfra Monitor process. As another
advanced use-case, you can point ProduceService and ConsumeService to two different Kafka clusters that are connected by MirrorMaker to monitor their end-to-end latency.
@@ -146,16 +146,16 @@ $ ./gradlew jar
### Start XinfraMonitor to run tests/services specified in the config file
```
-$ ./bin/xinfra-monitor-start.sh config/xinfra-monitor.properties
+$ ./bin/xinfra-monitor-start.sh config/xinfra-monitor.yaml
```
### Run Xinfra Monitor with arbitrary producer/consumer configuration (e.g. SASL enabled client)
-Edit `config/xinfra-monitor.properties` to specify custom configurations for producer in the key/value map `produce.producer.props` in
-`config/xinfra-monitor.properties`. Similarly specify configurations for
+Edit `config/xinfra-monitor.yaml` to specify custom configurations for producer in the key/value map `produce.producer.props` in
+`config/xinfra-monitor.yaml`. Similarly specify configurations for
consumer as well. The documentation for producer and consumer in the key/value maps can be found in the Apache Kafka wiki.
```
-$ ./bin/xinfra-monitor-start.sh config/xinfra-monitor.properties
+$ ./bin/xinfra-monitor-start.sh config/xinfra-monitor.yaml
```
### Run SingleClusterMonitor app to monitor kafka cluster
@@ -169,16 +169,16 @@ $ ./bin/single-cluster-monitor.sh --topic test --broker-list localhost:9092 --zo
```
### Run MultiClusterMonitor app to monitor a pipeline of Kafka clusters connected by MirrorMaker
-Edit `config/multi-cluster-monitor.properties` to specify the right broker and
+Edit `config/multi-cluster-monitor.yaml` to specify the right broker and
zookeeper url as suggested by the comment in the properties file
Metrics `produce-availability-avg` and `consume-availability-avg` demonstrate
whether messages can be properly produced to the source cluster and consumed
-from the destination cluster. See config/multi-cluster-monitor.properties for
+from the destination cluster. See config/multi-cluster-monitor.yaml for
the full jmx path for these metrics.
```
-$ ./bin/xinfra-monitor-start.sh config/multi-cluster-monitor.properties
+$ ./bin/xinfra-monitor-start.sh config/multi-cluster-monitor.yaml
```
### Run checkstyle on the java code
diff --git a/bin/windows/kafka-monitor-start.bat b/bin/windows/kafka-monitor-start.bat
index 45eedad7..1220c773 100644
--- a/bin/windows/kafka-monitor-start.bat
+++ b/bin/windows/kafka-monitor-start.bat
@@ -1,5 +1,5 @@
@echo off
-REM Copyright 2016 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this
+REM Copyright 2022 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this
REM file except in compliance with the License. You may obtain a copy of the License at
REM
REM http://www.apache.org/licenses/LICENSE-2.0
@@ -15,7 +15,7 @@ popd
IF [%1] EQU [] (
- echo USAGE: %0 config/xinfra-monitor.properties
+ echo USAGE: %0 config/xinfra-monitor.yaml
EXIT /B 1
)
diff --git a/bin/windows/kmf-run-class.bat b/bin/windows/kmf-run-class.bat
index caddf261..f113fe8d 100644
--- a/bin/windows/kmf-run-class.bat
+++ b/bin/windows/kmf-run-class.bat
@@ -1,5 +1,5 @@
@echo off
-REM Copyright 2016 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this
+REM Copyright 2022 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this
REM file except in compliance with the License. You may obtain a copy of the License at
REM
REM http://www.apache.org/licenses/LICENSE-2.0
@@ -10,12 +10,12 @@ REM an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expre
setlocal enabledelayedexpansion
IF [%1] EQU [] (
- echo USAGE: %0 com.linkedin.xinfra.monitor.XinfraMonitor config/xinfra-monitor.properties
+ echo USAGE: %0 com.linkedin.xinfra.monitor.XinfraMonitor config/xinfra-monitor.yaml
EXIT /B 1
)
IF [%2] EQU [] (
- echo USAGE: %0 %1 config/xinfra-monitor.properties
+ echo USAGE: %0 %1 config/xinfra-monitor.yaml
EXIT /B 1
)
diff --git a/build.gradle b/build.gradle
index b370bb09..17f171cb 100644
--- a/build.gradle
+++ b/build.gradle
@@ -45,6 +45,7 @@ allprojects {
compile group: 'org.apache.kafka', name: 'kafka-clients', version: '2.3.1'
compile 'org.apache.commons:commons-lang3:3.12.0'
compile 'com.linkedin.avroutil1:helper-all:0.2.81'
+ compile group: 'com.fasterxml.jackson.dataformat', name: 'jackson-dataformat-yaml', version: '2.10.3'
testCompile 'org.mockito:mockito-core:2.24.0'
testCompile 'org.testng:testng:6.8.8'
}
diff --git a/config/multi-cluster-monitor.properties b/config/multi-cluster-monitor.properties
deleted file mode 100644
index dd40b035..00000000
--- a/config/multi-cluster-monitor.properties
+++ /dev/null
@@ -1,92 +0,0 @@
-# Copyright 2016 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this
-# file except in compliance with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-
-# This properties file specifies an example configure to monitor a pipeline of Kafka clusters.
-# User probably needs to change zookeeper.connect and bootstrap.servers to point to respective clusters.
-# More clusters can be added in the map for "topic.management.config.per.cluster" to reference
-# each cluster in the pipeline. The "produce.service.props" should use the first cluster and
-# the "consume.service.props" should use the last cluster in the pipeline.
-
-# Produce service: Configure Produce Service to produce to the first cluster of the pipeline
-# Consume service: Configure Consume Service to consume from the last cluster of the pipeline
-# Last cluster: If there are more than two clusters in the pipeline, add one property map for each one of them.
-{
- "multi-cluster-monitor": {
- "class.name": "com.linkedin.kmf.apps.MultiClusterMonitor",
- "topic": "kafka-monitor-topic",
- "produce.service.props": {
- "zookeeper.connect": "localhost:2181/first_cluster",
- "bootstrap.servers": "localhost:9092",
- "produce.record.delay.ms": 100,
- "produce.producer.props": {
- "client.id": "kafka-monitor-client-id"
- }
- },
- "consume.service.props": {
- "zookeeper.connect": "localhost:2181/last_cluster",
- "bootstrap.servers": "localhost:9095",
- "consume.latency.sla.ms": "20000",
- "consume.consumer.props": {
- "group.id": "kafka-monitor-group-id"
- }
- },
-
- "topic.management.props.per.cluster" : {
- "first-cluster" : {
- "bootstrap.servers": "localhost:9092",
- "zookeeper.connect": "localhost:2181/first_cluster",
- "topic-management.topicCreationEnabled": true,
- "topic-management.replicationFactor" : 1,
- "topic-management.partitionsToBrokersRatio" : 2.0,
- "topic-management.rebalance.interval.ms" : 600000,
- "topic-management.topicFactory.props": {
- }
- },
-
- "last-cluster" : {
- "bootstrap.servers": "localhost:9095",
- "zookeeper.connect": "localhost:2181/last_cluster",
- "topic-management.topicCreationEnabled": true,
- "topic-management.replicationFactor" : 1,
- "topic-management.partitionsToBrokersRatio" : 2.0,
- "topic-management.rebalance.interval.ms" : 600000,
- "topic-management.topicFactory.props": {
- }
- }
- }
-
- },
-
- "reporter-service": {
- "class.name": "com.linkedin.kmf.services.DefaultMetricsReporterService",
- "report.interval.sec": 1,
- "report.metrics.list": [
- "kmf.services:type=produce-service,name=*:produce-availability-avg",
- "kmf.services:type=consume-service,name=*:consume-availability-avg",
- "kmf.services:type=produce-service,name=*:records-produced-total",
- "kmf.services:type=consume-service,name=*:records-consumed-total",
- "kmf.services:type=consume-service,name=*:records-lost-total",
- "kmf.services:type=consume-service,name=*:records-lost-rate",
- "kmf.services:type=consume-service,name=*:records-duplicated-total",
- "kmf.services:type=consume-service,name=*:records-delay-ms-avg",
- "kmf.services:type=produce-service,name=*:records-produced-rate",
- "kmf.services:type=produce-service,name=*:produce-error-rate",
- "kmf.services:type=consume-service,name=*:consume-error-rate",
- "kmf.services:type=commit-availability-service,name=*:offsets-committed-avg",
- "kmf.services:type=commit-availability-service,name=*:commit-latency-avg",
- "kmf.services:type=commit-availability-service,name=*:commit-availability-avg",
- "kmf.services:type=commit-availability-service,name=*:failed-commit-offsets-avg",
- "kmf.services:type=commit-availability-service,name=*:offsets-committed-total",
- "kmf.services:type=commit-availability-service,name=*:failed-commit-offsets-total"
- ]
- },
-
- "jolokia-service": {
- "class.name": "com.linkedin.kmf.services.JolokiaService"
- }
-}
diff --git a/config/multi-cluster-monitor.yaml b/config/multi-cluster-monitor.yaml
new file mode 100644
index 00000000..95bb9f69
--- /dev/null
+++ b/config/multi-cluster-monitor.yaml
@@ -0,0 +1,80 @@
+---
+# Copyright 2022 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this
+# file except in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+
+# This YAML file specifies an example configuration to monitor a pipeline of Kafka clusters.
+# Users will need to update `zookeeper.connect` and `bootstrap.servers` to point to their respective clusters.
+# More clusters can be added in the map for "topic.management.props.per.cluster" to reference
+# each cluster in the pipeline. The "produce.service.props" should use the first cluster and
+# the "consume.service.props" should use the last cluster in the pipeline.
+
+# Produce service: Configure Produce Service to produce to the first cluster of the pipeline
+# Consume service: Configure Consume Service to consume from the last cluster of the pipeline
+# Last cluster: If there are more than two clusters in the pipeline, add one property map for each one of them.
+#
+# For additional service configuration examples see `config/xinfra-monitor.yaml`
+
+multi-cluster-monitor:
+ class.name: com.linkedin.xinfra.monitor.apps.MultiClusterMonitor
+ topic: kafka-monitor-topic
+
+ produce.service.props:
+ zookeeper.connect: localhost:2181/first_cluster
+ bootstrap.servers: localhost:9092
+ produce.record.delay.ms: 100
+ produce.producer.props:
+ client.id: kafka-monitor-client-id
+
+ consume.service.props:
+ zookeeper.connect: localhost:2181/last_cluster
+ bootstrap.servers: localhost:9095
+ consume.latency.sla.ms: '20000'
+ consume.consumer.props:
+ group.id: kafka-monitor-group-id
+ topic.management.props.per.cluster:
+ first-cluster:
+ bootstrap.servers: localhost:9092
+ zookeeper.connect: localhost:2181/first_cluster
+ topic-management.topicCreationEnabled: true
+ topic-management.replicationFactor: 1
+ topic-management.partitionsToBrokersRatio: 2
+ topic-management.rebalance.interval.ms: 600000
+ topic-management.topicFactory.props: {}
+ last-cluster:
+ bootstrap.servers: localhost:9095
+ zookeeper.connect: localhost:2181/last_cluster
+ topic-management.topicCreationEnabled: true
+ topic-management.replicationFactor: 1
+ topic-management.partitionsToBrokersRatio: 2
+ topic-management.rebalance.interval.ms: 600000
+ topic-management.topicFactory.props: {}
+
+reporter-service:
+ class.name: com.linkedin.xinfra.monitor.services.DefaultMetricsReporterService
+ report.interval.sec: 1
+ report.metrics.list:
+ - kmf.services:type=produce-service,name=*:produce-availability-avg
+ - kmf.services:type=consume-service,name=*:consume-availability-avg
+ - kmf.services:type=produce-service,name=*:records-produced-total
+ - kmf.services:type=consume-service,name=*:records-consumed-total
+ - kmf.services:type=consume-service,name=*:records-lost-total
+ - kmf.services:type=consume-service,name=*:records-lost-rate
+ - kmf.services:type=consume-service,name=*:records-duplicated-total
+ - kmf.services:type=consume-service,name=*:records-delay-ms-avg
+ - kmf.services:type=produce-service,name=*:records-produced-rate
+ - kmf.services:type=produce-service,name=*:produce-error-rate
+ - kmf.services:type=consume-service,name=*:consume-error-rate
+ - kmf.services:type=commit-availability-service,name=*:offsets-committed-avg
+ - kmf.services:type=commit-availability-service,name=*:commit-latency-avg
+ - kmf.services:type=commit-availability-service,name=*:commit-availability-avg
+ - kmf.services:type=commit-availability-service,name=*:failed-commit-offsets-avg
+ - kmf.services:type=commit-availability-service,name=*:offsets-committed-total
+ - kmf.services:type=commit-availability-service,name=*:failed-commit-offsets-total
+
+jolokia-service:
+ class.name: com.linkedin.xinfra.monitor.services.JolokiaService
diff --git a/config/xinfra-monitor.properties b/config/xinfra-monitor.properties
deleted file mode 100644
index 6993bf47..00000000
--- a/config/xinfra-monitor.properties
+++ /dev/null
@@ -1,197 +0,0 @@
-# Copyright 2016 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this
-# file except in compliance with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-
-# This properties file specifies the tests/services that XinfraMonitor
-# should instantiate and run, together with the key/value pairs used to
-# configure these tests/services. It should have the following format:
-#
-# {
-# "name1" : {
-# "type": TestClassName
-# "key1": value1,
-# "key2": value2,
-# ...
-# },
-# "name2" : {
-# "type": ServiceClassName
-# "key1": value1,
-# "key2": value2,
-# ...
-# },
-# ...
-# }
-#
-# TestClassName can be canonical name or simple name of any class that implements
-# interface com.linkedin.kmf.services.Test. These classes should be under
-# package com.linkedin.kmf.tests.
-#
-# ServiceClassName can be canonical name or simple name of any class that implements
-# interface com.linkedin.kmf.services.Service. These classes should be under
-# package com.linkedin.kmf.services.
-#
-# Each test/service should be configured with class.name which can be either TestClassName
-# or ServiceClassName. The key for the test/service in the json map is used as name to
-# identify the test/service in the log or JMX metrics, which is useful if multiple
-# test/service with the same class.name are run in the same Kafka Monitor process.
-#
-# If using Secure Socket Layer for security protocol, SSL properties must be defined under
-# produce.producer.props, consume.consumer.props, as well as single-cluster-monitor props
-
-{
- "single-cluster-monitor": {
- "class.name": "com.linkedin.xinfra.monitor.apps.SingleClusterMonitor",
- "topic": "xinfra-monitor-topic",
- "zookeeper.connect": "localhost:2181",
- "bootstrap.servers": "localhost:9092,localhost:9093",
- "request.timeout.ms": 9000,
- "produce.record.delay.ms": 100,
- "topic-management.topicManagementEnabled": true,
- "topic-management.topicCreationEnabled": true,
- "topic-management.replicationFactor" : 1,
- "topic-management.partitionsToBrokersRatio" : 2.0,
- "topic-management.rebalance.interval.ms" : 600000,
- "topic-management.preferred.leader.election.check.interval.ms" : 300000,
- "topic-management.topicFactory.props": {
- },
- "topic-management.topic.props": {
- "retention.ms": "3600000"
- },
- "produce.producer.props": {
- "client.id": "kmf-client-id"
- },
-
- "consume.latency.sla.ms": "20000",
- "consume.consumer.props": {
- }
- },
-
- "offset-commit-service": {
- "class.name": "com.linkedin.xinfra.monitor.services.OffsetCommitService",
- "zookeeper.connect": "localhost:2181",
- "bootstrap.servers": "localhost:9092,localhost:9093",
- "consumer.props": {
- "group.id": "target-consumer-group"
- }
- },
-
- "jolokia-service": {
- "class.name": "com.linkedin.xinfra.monitor.services.JolokiaService"
- },
-
- "reporter-service": {
- "class.name": "com.linkedin.xinfra.monitor.services.DefaultMetricsReporterService",
- "report.interval.sec": 1,
- "report.metrics.list": [
- "kmf:type=kafka-monitor:offline-runnable-count",
- "kmf.services:type=produce-service,name=*:produce-availability-avg",
- "kmf.services:type=consume-service,name=*:consume-availability-avg",
- "kmf.services:type=produce-service,name=*:records-produced-total",
- "kmf.services:type=consume-service,name=*:records-consumed-total",
- "kmf.services:type=produce-service,name=*:records-produced-rate",
- "kmf.services:type=produce-service,name=*:produce-error-rate",
- "kmf.services:type=consume-service,name=*:consume-error-rate",
- "kmf.services:type=consume-service,name=*:records-lost-total",
- "kmf.services:type=consume-service,name=*:records-lost-rate",
- "kmf.services:type=consume-service,name=*:records-duplicated-total",
- "kmf.services:type=consume-service,name=*:records-delay-ms-avg",
- "kmf.services:type=commit-availability-service,name=*:offsets-committed-avg",
- "kmf.services:type=commit-availability-service,name=*:offsets-committed-total",
- "kmf.services:type=commit-availability-service,name=*:failed-commit-offsets-avg",
- "kmf.services:type=commit-availability-service,name=*:failed-commit-offsets-total",
- "kmf.services:type=commit-latency-service,name=*:commit-offset-latency-ms-avg",
- "kmf.services:type=commit-latency-service,name=*:commit-offset-latency-ms-max",
- "kmf.services:type=commit-latency-service,name=*:commit-offset-latency-ms-99th",
- "kmf.services:type=commit-latency-service,name=*:commit-offset-latency-ms-999th",
- "kmf.services:type=commit-latency-service,name=*:commit-offset-latency-ms-9999th",
- "kmf.services:type=cluster-topic-manipulation-service,name=*:topic-creation-metadata-propagation-ms-avg",
- "kmf.services:type=cluster-topic-manipulation-service,name=*:topic-creation-metadata-propagation-ms-max",
- "kmf.services:type=cluster-topic-manipulation-service,name=*:topic-deletion-metadata-propagation-ms-avg",
- "kmf.services:type=cluster-topic-manipulation-service,name=*:topic-deletion-metadata-propagation-ms-max",
- "kmf.services:type=offset-commit-service,name=*:offset-commit-availability-avg",
- "kmf.services:type=offset-commit-service,name=*:offset-commit-service-success-rate",
- "kmf.services:type=offset-commit-service,name=*:offset-commit-service-success-total",
- "kmf.services:type=offset-commit-service,name=*:offset-commit-service-failure-rate",
- "kmf.services:type=offset-commit-service,name=*:offset-commit-service-failure-total"
- ]
- },
-
- "cluster-topic-manipulation-service":{
- "class.name":"com.linkedin.xinfra.monitor.services.ClusterTopicManipulationService",
- "zookeeper.connect": "localhost:2181",
- "bootstrap.servers":"localhost:9092,localhost:9093",
- "topic": "xinfra-monitor-topic"
- },
-
-# Example produce-service to produce messages to cluster
-# "produce-service": {
-# "class.name": "com.linkedin.kmf.services.ProduceService",
-# "topic": "xinfra-monitor-topic",
-# "zookeeper.connect": "localhost:2181",
-# "bootstrap.servers": "localhost:9092",
-# "consume.latency.sla.ms": "20000",
-# "consume.consumer.props": {
-# }
-# },
-
-# Example consume-service to consume messages
-# "consume-service": {
-# "class.name": "com.linkedin.kmf.services.ConsumeService",
-# "topic": "xinfra-monitor-topic",
-# "zookeeper.connect": "localhost:2181",
-# "bootstrap.servers": "localhost:9092",
-# "consume.latency.sla.ms": "20000",
-# "consume.consumer.props": {
-# }
-# },
-
-# Example statsd-service to report metrics
-# "statsd-service": {
-# "class.name": "com.linkedin.xinfra.monitor.services.StatsdMetricsReporterService",
-# "report.statsd.host": "localhost",
-# "report.statsd.port": "8125",
-# "report.statsd.prefix": "xinfra-monitor",
-# "report.interval.sec": 1,
-# "report.metrics.list": [
-# "kmf.services:type=produce-service,name=*:produce-availability-avg",
-# "kmf.services:type=consume-service,name=*:consume-availability-avg"
-# ]
-# },
-
-# Example kafka-service to report metrics
- "reporter-kafka-service": {
- "class.name": "com.linkedin.xinfra.monitor.services.KafkaMetricsReporterService",
- "report.interval.sec": 3,
- "zookeeper.connect": "localhost:2181",
- "bootstrap.servers": "localhost:9092",
- "topic": "xinfra-monitor-topic-metrics",
- "report.kafka.topic.replication.factor": 1,
- "report.metrics.list": [
- "kmf.services:type=produce-service,name=*:produce-availability-avg",
- "kmf.services:type=consume-service,name=*:consume-availability-avg",
- "kmf.services:type=produce-service,name=*:records-produced-total",
- "kmf.services:type=consume-service,name=*:records-consumed-total",
- "kmf.services:type=consume-service,name=*:records-lost-total",
- "kmf.services:type=consume-service,name=*:records-duplicated-total",
- "kmf.services:type=consume-service,name=*:records-delay-ms-avg",
- "kmf.services:type=produce-service,name=*:records-produced-rate",
- "kmf.services:type=produce-service,name=*:produce-error-rate",
- "kmf.services:type=consume-service,name=*:consume-error-rate"
- ]
- }
-
-# Example signalfx-service to report metrics
-# "signalfx-service": {
-# "class.name": "com.linkedin.kmf.services.SignalFxMetricsReporterService",
-# "report.interval.sec": 1,
-# "report.metric.dimensions": {
-# },
-# "report.signalfx.url": "",
-# "report.signalfx.token" : ""
-# }
-
-}
diff --git a/config/xinfra-monitor.yaml b/config/xinfra-monitor.yaml
new file mode 100644
index 00000000..e6c32aad
--- /dev/null
+++ b/config/xinfra-monitor.yaml
@@ -0,0 +1,172 @@
+---
+# Copyright 2022 LinkedIn Corp. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this
+# file except in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+
+# This YAML file specifies the tests/services that XinfraMonitor
+# should instantiate and run, together with the key/value pairs used to
+# configure the tests and services. It should have the following format:
+#
+#
+#   name1:
+#     class.name: TestClassName
+#     key1: value1
+#     key2: value2
+#     ...
+#   name2:
+#     class.name: ServiceClassName
+#     key1: value1
+#     key2: value2
+#     ...
+#   ...
+#
+#
+# TestClassName can be the canonical name or simple name of any class that implements
+# the interface com.linkedin.kmf.services.Test. These classes should be under the
+# package com.linkedin.kmf.tests
+#
+# ServiceClassName can be the canonical name or the simple name of any class that implements
+# interface com.linkedin.xinfra.monitor.services.Service. These classes should be under
+# package com.linkedin.xinfra.monitor.services
+#
+# Each test and service should be configured with class.name which can be either TestClassName
+# or ServiceClassName. The key for the test/service in the configuration map is used as the name to
+# identify the test/service in the log or JMX metrics. This is useful if multiple
+# tests or services with the same class.name are run under the same Kafka Monitor process.
+#
+# If using Secure Socket Layer for security protocol, SSL properties must be defined under
+# produce.producer.props, consume.consumer.props, as well as single-cluster-monitor properties.
+
+single-cluster-monitor:
+ class.name: com.linkedin.xinfra.monitor.apps.SingleClusterMonitor
+ topic: xinfra-monitor-topic
+ zookeeper.connect: localhost:2181
+ bootstrap.servers: localhost:9092,localhost:9093
+ request.timeout.ms: 9000
+ produce.record.delay.ms: 100
+ topic-management.topicManagementEnabled: true
+ topic-management.topicCreationEnabled: true
+ topic-management.replicationFactor: 1
+ topic-management.partitionsToBrokersRatio: 2
+ topic-management.rebalance.interval.ms: 600000
+ topic-management.preferred.leader.election.check.interval.ms: 300000
+ topic-management.topicFactory.props: {}
+ topic-management.topic.props:
+ retention.ms: '3600000'
+ produce.producer.props:
+ client.id: kmf-client-id
+ consume.latency.sla.ms: '20000'
+ consume.consumer.props: {}
+
+offset-commit-service:
+ class.name: com.linkedin.xinfra.monitor.services.OffsetCommitService
+ zookeeper.connect: localhost:2181
+ bootstrap.servers: localhost:9092,localhost:9093
+ consumer.props:
+ group.id: target-consumer-group
+
+jolokia-service:
+ class.name: com.linkedin.xinfra.monitor.services.JolokiaService
+
+reporter-service:
+ class.name: com.linkedin.xinfra.monitor.services.DefaultMetricsReporterService
+ report.interval.sec: 1
+ report.metrics.list:
+ - kmf:type=kafka-monitor:offline-runnable-count
+ - kmf.services:type=produce-service,name=*:produce-availability-avg
+ - kmf.services:type=consume-service,name=*:consume-availability-avg
+ - kmf.services:type=produce-service,name=*:records-produced-total
+ - kmf.services:type=consume-service,name=*:records-consumed-total
+ - kmf.services:type=produce-service,name=*:records-produced-rate
+ - kmf.services:type=produce-service,name=*:produce-error-rate
+ - kmf.services:type=consume-service,name=*:consume-error-rate
+ - kmf.services:type=consume-service,name=*:records-lost-total
+ - kmf.services:type=consume-service,name=*:records-lost-rate
+ - kmf.services:type=consume-service,name=*:records-duplicated-total
+ - kmf.services:type=consume-service,name=*:records-delay-ms-avg
+ - kmf.services:type=commit-availability-service,name=*:offsets-committed-avg
+ - kmf.services:type=commit-availability-service,name=*:offsets-committed-total
+ - kmf.services:type=commit-availability-service,name=*:failed-commit-offsets-avg
+ - kmf.services:type=commit-availability-service,name=*:failed-commit-offsets-total
+ - kmf.services:type=commit-latency-service,name=*:commit-offset-latency-ms-avg
+ - kmf.services:type=commit-latency-service,name=*:commit-offset-latency-ms-max
+ - kmf.services:type=commit-latency-service,name=*:commit-offset-latency-ms-99th
+ - kmf.services:type=commit-latency-service,name=*:commit-offset-latency-ms-999th
+ - kmf.services:type=commit-latency-service,name=*:commit-offset-latency-ms-9999th
+ - kmf.services:type=cluster-topic-manipulation-service,name=*:topic-creation-metadata-propagation-ms-avg
+ - kmf.services:type=cluster-topic-manipulation-service,name=*:topic-creation-metadata-propagation-ms-max
+ - kmf.services:type=cluster-topic-manipulation-service,name=*:topic-deletion-metadata-propagation-ms-avg
+ - kmf.services:type=cluster-topic-manipulation-service,name=*:topic-deletion-metadata-propagation-ms-max
+ - kmf.services:type=offset-commit-service,name=*:offset-commit-availability-avg
+ - kmf.services:type=offset-commit-service,name=*:offset-commit-service-success-rate
+ - kmf.services:type=offset-commit-service,name=*:offset-commit-service-success-total
+ - kmf.services:type=offset-commit-service,name=*:offset-commit-service-failure-rate
+ - kmf.services:type=offset-commit-service,name=*:offset-commit-service-failure-total
+
+cluster-topic-manipulation-service:
+ class.name: com.linkedin.xinfra.monitor.services.ClusterTopicManipulationService
+ zookeeper.connect: localhost:2181
+ bootstrap.servers: localhost:9092,localhost:9093
+ topic: xinfra-monitor-topic
+
+# Example produce-service to produce messages to cluster (disabled by default; uncomment to enable)
+# produce-service:
+#   class.name: com.linkedin.kmf.services.ProduceService
+#   topic: xinfra-monitor-topic
+#   zookeeper.connect: localhost:2181
+#   bootstrap.servers: localhost:9092
+#   consume.latency.sla.ms: '20000'
+#   consume.consumer.props: {}
+
+# Example consume-service to consume messages (disabled by default; uncomment to enable)
+# consume-service:
+#   class.name: com.linkedin.kmf.services.ConsumeService
+#   topic: xinfra-monitor-topic
+#   zookeeper.connect: localhost:2181
+#   bootstrap.servers: localhost:9092
+#   consume.latency.sla.ms: '20000'
+#   consume.consumer.props: {}
+
+# Example statsd-service to report metrics (disabled by default; uncomment to enable)
+# statsd-service:
+#   class.name: com.linkedin.xinfra.monitor.services.StatsdMetricsReporterService
+#   report.statsd.host: localhost
+#   report.statsd.port: '8125'
+#   report.statsd.prefix: xinfra-monitor
+#   report.interval.sec: 1
+#   report.metrics.list:
+#     - kmf.services:type=produce-service,name=*:produce-availability-avg
+#     - kmf.services:type=consume-service,name=*:consume-availability-avg
+
+# Example kafka-service to report metrics
+reporter-kafka-service:
+ class.name: com.linkedin.xinfra.monitor.services.KafkaMetricsReporterService
+ report.interval.sec: 3
+ zookeeper.connect: localhost:2181
+ bootstrap.servers: localhost:9092
+ topic: xinfra-monitor-topic-metrics
+ report.kafka.topic.replication.factor: 1
+ report.metrics.list:
+ - kmf.services:type=produce-service,name=*:produce-availability-avg
+ - kmf.services:type=consume-service,name=*:consume-availability-avg
+ - kmf.services:type=produce-service,name=*:records-produced-total
+ - kmf.services:type=consume-service,name=*:records-consumed-total
+ - kmf.services:type=consume-service,name=*:records-lost-total
+ - kmf.services:type=consume-service,name=*:records-duplicated-total
+ - kmf.services:type=consume-service,name=*:records-delay-ms-avg
+ - kmf.services:type=produce-service,name=*:records-produced-rate
+ - kmf.services:type=produce-service,name=*:produce-error-rate
+ - kmf.services:type=consume-service,name=*:consume-error-rate
+
+
+# Example signalfx-service to report metrics (disabled by default; uncomment to enable)
+# signalfx-service:
+#   class.name: com.linkedin.kmf.services.SignalFxMetricsReporterService
+#   report.interval.sec: 1
+#   report.metric.dimensions: {}
+#   report.signalfx.url: ''
+#   report.signalfx.token: ''
diff --git a/docker/kafka-monitor-docker-entry.sh b/docker/kafka-monitor-docker-entry.sh
index 97554bb0..7a23c65b 100755
--- a/docker/kafka-monitor-docker-entry.sh
+++ b/docker/kafka-monitor-docker-entry.sh
@@ -22,6 +22,6 @@ trap 'pkill java; exit 143' SIGTERM
# wait for DNS services to be available
sleep 10
-bin/xinfra-monitor-start.sh config/xinfra-monitor.properties &
+bin/xinfra-monitor-start.sh config/xinfra-monitor.yaml &
wait $!
\ No newline at end of file
diff --git a/gradle/wrapper/gradle-wrapper.properties b/gradle/wrapper/gradle-wrapper.properties
index 842c8c5a..a4b44297 100644
--- a/gradle/wrapper/gradle-wrapper.properties
+++ b/gradle/wrapper/gradle-wrapper.properties
@@ -1,6 +1,5 @@
-#Mon Apr 01 18:19:43 PDT 2019
distributionBase=GRADLE_USER_HOME
distributionPath=wrapper/dists
+distributionUrl=https\://services.gradle.org/distributions/gradle-6.3-bin.zip
zipStoreBase=GRADLE_USER_HOME
zipStorePath=wrapper/dists
-distributionUrl=https\://services.gradle.org/distributions/gradle-5.2.1-all.zip
diff --git a/src/main/java/com/linkedin/xinfra/monitor/XinfraMonitor.java b/src/main/java/com/linkedin/xinfra/monitor/XinfraMonitor.java
index d516b076..bfea356d 100644
--- a/src/main/java/com/linkedin/xinfra/monitor/XinfraMonitor.java
+++ b/src/main/java/com/linkedin/xinfra/monitor/XinfraMonitor.java
@@ -11,11 +11,11 @@
package com.linkedin.xinfra.monitor;
import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.dataformat.yaml.YAMLFactory;
import com.linkedin.xinfra.monitor.apps.App;
import com.linkedin.xinfra.monitor.services.Service;
import com.linkedin.xinfra.monitor.services.ServiceFactory;
-import java.io.BufferedReader;
-import java.io.FileReader;
+import java.io.File;
import java.lang.reflect.Constructor;
import java.util.ArrayList;
import java.util.List;
@@ -168,21 +168,14 @@ public void awaitShutdown() {
@SuppressWarnings("rawtypes")
public static void main(String[] args) throws Exception {
if (args.length <= 0) {
- LOG.info("USAGE: java [options] " + XinfraMonitor.class.getName() + " config/xinfra-monitor.properties");
+ LOG.info("USAGE: java [options] " + XinfraMonitor.class.getName() + " config/xinfra-monitor.yaml");
return;
}
- StringBuilder buffer = new StringBuilder();
- try (BufferedReader br = new BufferedReader(new FileReader(args[0].trim()))) {
- String line;
- while ((line = br.readLine()) != null) {
- if (!line.startsWith("#"))
- buffer.append(line);
- }
- }
+ File configurationFile = new File(args[0].trim());
@SuppressWarnings("unchecked")
- Map props = new ObjectMapper().readValue(buffer.toString(), Map.class);
+ Map props = new ObjectMapper(new YAMLFactory()).readValue(configurationFile, Map.class);
XinfraMonitor xinfraMonitor = new XinfraMonitor(props);
xinfraMonitor.start();
LOG.info("Xinfra Monitor has started.");
diff --git a/src/main/java/com/linkedin/xinfra/monitor/services/configs/StatsdMetricsReporterServiceConfig.java b/src/main/java/com/linkedin/xinfra/monitor/services/configs/StatsdMetricsReporterServiceConfig.java
index 8438fd78..f46ac13d 100644
--- a/src/main/java/com/linkedin/xinfra/monitor/services/configs/StatsdMetricsReporterServiceConfig.java
+++ b/src/main/java/com/linkedin/xinfra/monitor/services/configs/StatsdMetricsReporterServiceConfig.java
@@ -8,7 +8,7 @@
* an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*
*
- * In order to enable the StatsD metrics export, add the following section to xinfra-monitor.properties file
+ * In order to enable the StatsD metrics export, add the following section to xinfra-monitor.yaml file
*
*/