From 5af72f0c045c797ac4b7de281e66c2bf8fad1a58 Mon Sep 17 00:00:00 2001 From: Rodrigo Queiro Date: Mon, 14 Jul 2025 14:56:22 +0000 Subject: [PATCH 1/2] Tune node exporter This reduces the metric cardinality 3x (from 4.5k to <1.5k on one system I tried), keeps the same scrape duration (~70ms), and adds some helpful metrics for CPU models and network device drivers. --- .../prometheus/prometheus-robot.values.yaml | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/app_charts/prometheus/prometheus-robot.values.yaml b/src/app_charts/prometheus/prometheus-robot.values.yaml index 5847f076f..dda7c2eaa 100644 --- a/src/app_charts/prometheus/prometheus-robot.values.yaml +++ b/src/app_charts/prometheus/prometheus-robot.values.yaml @@ -183,12 +183,25 @@ prometheus-node-exporter: # This collector produces log-spam on newer kernels # https://github.com/prometheus/node_exporter/issues/1892 - --no-collector.rapl + # Since we have hardware network IRQs, this generates 7 zero-value metrics for each CPU core. 
+ - --no-collector.softnet # This is disabled by default, since it might leak memory # (https://github.com/prometheus/node_exporter/blob/master/CHANGELOG.md#0160-rc1--2018-04-04) - --collector.wifi + # Export CPU model (one metric per core) + - --collector.cpu.info + # Export current network devices (bus address, driver name and firmware version, one metric per physical device) + - --collector.ethtool + - --collector.ethtool.metrics-include=^node_ethtool_info$ + - --collector.ethtool.device-exclude=^(bond|cilium|ip6tnl0|lo|lxc|tunl) # Ignore more fuse filesystems # https://github.com/prometheus/node_exporter/blob/master/collector/filesystem_linux.go#L33 - --collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|fusectl|fuse\.\w*|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|sysfs|tracefs)$ + # Ignore filesystems with UIDs in the mount points (high cardinality) + - --collector.filesystem.mount-points-exclude=^(/run/containerd/|/var/lib/kubelet) + # Ignore virtual network devices + - --collector.netdev.device-exclude=^(bond|cilium|ip6tnl0|lo|lxc|tunl) + - --collector.netclass.ignored-devices=^(bond|cilium|ip6tnl0|lo|lxc|tunl) grafana: enabled: false From a9f8194db24591f41ab1b5812127c1a5350cd736 Mon Sep 17 00:00:00 2001 From: Rodrigo Queiro Date: Wed, 16 Jul 2025 13:14:00 +0000 Subject: [PATCH 2/2] Remove ethtool metrics The hw-exporter is a better way as it also handles other PCI devices. 
--- src/app_charts/prometheus/prometheus-robot.values.yaml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/app_charts/prometheus/prometheus-robot.values.yaml b/src/app_charts/prometheus/prometheus-robot.values.yaml index dda7c2eaa..fe5f59423 100644 --- a/src/app_charts/prometheus/prometheus-robot.values.yaml +++ b/src/app_charts/prometheus/prometheus-robot.values.yaml @@ -190,10 +190,6 @@ prometheus-node-exporter: - --collector.wifi # Export CPU model (one metric per core) - --collector.cpu.info - # Export current network devices (bus address, driver name and firmware version, one metric per physical device) - - --collector.ethtool - - --collector.ethtool.metrics-include=^node_ethtool_info$ - - --collector.ethtool.device-exclude=^(bond|cilium|ip6tnl0|lo|lxc|tunl) # Ignore more fuse filesystems # https://github.com/prometheus/node_exporter/blob/master/collector/filesystem_linux.go#L33 - --collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|fusectl|fuse\.\w*|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|sysfs|tracefs)$