From 6d8c7c89a4dc4e2ddda59886d0cbd5c6c0cf4c38 Mon Sep 17 00:00:00 2001 From: ilciko Date: Fri, 3 Oct 2025 22:42:53 +0200 Subject: [PATCH 1/3] Refactor using AI agent, new features: * The exporter remains running and log to scrape failure metric in case of: not root, nvme-cli not found, or unsupported version * OCP metrics are automatically scraped if available or a message logged to stdout if not * Simplified the `nvme-cli` compatibility accepting anything above v2.8 * Added a simple html page if not hitting the `/metrics` endpoint * CLI flags now align to Prometheus Node exporter format * There are now three collectors enabled by default: smart, info and ocp; it is possible to enable/disable them at CLI * Added all metric descriptions * Improved Logging --- README.md | 418 ++++++++++++++++++++++++++++++++----------- cmd/collector.go | 154 ++++++++-------- cmd/main.go | 312 ++++++++++++++++++++++++++++---- pkg/collector.go | 112 +++++++++++- pkg/provider.go | 18 +- pkg/utils/command.go | 10 ++ 6 files changed, 805 insertions(+), 219 deletions(-) diff --git a/README.md b/README.md index 1905e54..22d3433 100644 --- a/README.md +++ b/README.md @@ -5,148 +5,350 @@ [![GitHub license](https://img.shields.io/github/license/E4-Computer-Engineering/nvme-exporter)](https://github.com/E4-Computer-Engineering/nvme-exporter/blob/master/LICENSE) ![GitHub all releases](https://img.shields.io/github/downloads/E4-Computer-Engineering/nvme-exporter/total) -Prometheus exporter for nvme smart-log and OCP smart-log metrics inspired by [fritchie nvme exporter](https://github.com/fritchie/nvme_exporter). +Prometheus exporter for NVMe SMART log and OCP SMART log metrics, inspired by [fritchie nvme exporter](https://github.com/fritchie/nvme_exporter) and following [Prometheus node_exporter](https://github.com/prometheus/node_exporter) design patterns. 
+ +## Features + +- **Resilient Operation**: Continues running even if nvme-cli is not installed, user is not root, or version is unsupported (similar to Prometheus node_exporter behavior) +- **Graceful Degradation**: Logs errors and increments failure metrics instead of crashing +- **Flexible Collector Management**: Enable/disable collectors individually using `--collector.*` and `--no-collector.*` flags +- **OCP Metrics by Default**: OCP SMART log metrics are enabled by default and gracefully disabled if not supported by the drive +- **Prometheus-Compatible**: Follows Prometheus naming conventions and best practices + +## Specifications Specification versions of reference: -* nvme smart-log field descriptions can be found on page 209 of [NVMe specifications](https://nvmexpress.org/wp-content/uploads/NVM-Express-Base-Specification-Revision-2.1-2024.08.05-Ratified.pdf) +* **NVMe SMART log** field descriptions can be found on page 209 of [NVMe Base Specification Revision 2.1](https://nvmexpress.org/wp-content/uploads/NVM-Express-Base-Specification-Revision-2.1-2024.08.05-Ratified.pdf) + +* **NVMe OCP SMART log** field descriptions can be found on page 24 of [Datacenter NVMe SSD Specification v2.5](https://www.opencompute.org/documents/datacenter-nvme-ssd-specification-v2-5-pdf) + +### Supported NVMe CLI Versions + +Supports [NVMe CLI](https://github.com/linux-nvme/nvme-cli) versions **2.8 and above**. -* nvme ocp-smart-log field descriptions can be found on page 24 of [Opencompute NVMe SSD specifications](https://www.opencompute.org/documents/datacenter-nvme-ssd-specification-v2-5-pdf) +> **Note**: The exporter will continue to run with unsupported versions, but may produce incorrect data or fail scrapes. 
-Supported [NVMe CLI](https://github.com/linux-nvme/nvme-cli) versions: +## Repository Contents -| Version | Supported | -|----|----| -|2.9 | OK | -|2.10 | OK | -|2.11 | TBD | +* **Docker**: Sample `Dockerfile` for containerized deployment +* **Kubernetes**: Deployment manifests in [resources/k8s/](resources/k8s/) +* **Grafana**: Dashboard templates in [resources/grafana/](resources/grafana/) + * [SMART log and OCP dashboard](https://github.com/E4-Computer-Engineering/nvme-exporter/blob/main/resources/grafana/dashboard_SMART_OCP.json) +* **Prometheus**: Recording and alert rules in [resources/prom/](resources/prom/) +* **Systemd**: Service unit files in [resources/systemd/](resources/systemd/) +* **Scripts**: Package installation hooks in [resources/scripts/](resources/scripts/) -## Repo Content +## Installation & Running -* Docker: A sample `Dockerfile` is provided. -* Kubernetes: In [resources](resources/k8s/). -* Grafana: In [resources](resources/grafana/) for dashboards. - * [smart-log and OCP dashboard](https://github.com/E4-Computer-Engineering/nvme-exporter/blob/main/resources/grafana/dashboard_SMART_OCP.json) -* Prometheus: In [resources](resources/prom/) for recording and alert rules. -* Systemd: In [resources](resources/systemd/) for executing the exporter as unit. -* Scripts: In [resources](resources/scripts/) for package installation hooks. +### Prerequisites -## Running +- **nvme-cli** package installed on the host +- **root** privileges (required for metric collection) -Running the exporter requires the nvme-cli package to be installed on the host and be `root` account. +> **Note**: The exporter will start without root or nvme-cli installed, but scrapes will fail and increment the `nvme_exporter_scrape_failures_total` metric. 
-``` bash -nvme_exporter -h +### Quick Start + +```bash +# Run with all default collectors enabled +nvme_exporter + +# View help and available options +nvme_exporter --help ``` -### Flags +### Command-Line Flags + +#### Web Server Options + +| Flag | Description | Default | +|------|-------------|---------| +| `--web.listen-address` | Address on which to expose metrics and web interface | `:9998` | +| `--web.telemetry-path` | Path under which to expose metrics | `/metrics` | + +#### Collector Options -| Name | Description | Default | -|----|----|----| -|port | Listen port number. Type: String. | `9998` | -|ocp | Enable OCP smart log metrics. Type: Bool. | `false` | -|endpoint | The endpoint to query for metrics. Type: String. | `/metrics` | +| Flag | Description | Default | +|------|-------------|---------| +| `--collector.` | Enable the specified collector | See table below | +| `--no-collector.` | Disable the specified collector | - | +| `--collector.disable-defaults` | Disable all default collectors | `false` | -### Systemd +#### Available Collectors -By installing the packaged version: RPM or DEB, the systemd unit will be automatically deployed and started as `nvme_exporter.service`. -If you are installing from `tar.gz` the [systemd unit file](resources/systemd/nvme_exporter.service) is provided in this repo. 
+| Collector | Description | Enabled by Default | +|-----------|-------------|--------------------| +| `info` | NVMe device info metrics | ✅ Yes | +| `smart` | NVMe SMART log metrics | ✅ Yes | +| `ocp` | NVMe OCP (Open Compute Project) SMART log metrics | ✅ Yes | -> NOTE: if you want to execute with custom flags you will need to modify the unit file +### Usage Examples -### Container +```bash +# Start with all default collectors on default port +nvme_exporter -To run the exporter as a container with, for example, OCP metrics enabled: +# Listen on a specific address and port +nvme_exporter --web.listen-address=":9998" -``` bash -podman run --rm -d --network=host --privileged nvme_exporter -ocp +# Change metrics endpoint path +nvme_exporter --web.telemetry-path="/nvme-metrics" + +# Disable OCP metrics collection +nvme_exporter --no-collector.ocp + +# Only collect SMART metrics (disable info and OCP) +nvme_exporter --collector.disable-defaults --collector.smart + +# Enable only info and smart collectors +nvme_exporter --no-collector.ocp +``` + +### Systemd Service + +By installing the packaged version (RPM or DEB), the systemd unit will be automatically deployed and started as `nvme_exporter.service`. + +If you are installing from `tar.gz`, the [systemd unit file](resources/systemd/nvme_exporter.service) is provided in this repository. 
+ +**To customize flags**, override `ExecStart` with a systemd drop-in (this leaves the packaged unit file untouched): + +```bash +sudo systemctl edit nvme_exporter.service ``` +Add your custom flags: + +```ini +[Service] +ExecStart= +ExecStart=/usr/bin/nvme_exporter --web.listen-address=":9998" --no-collector.ocp +``` + +### Container Deployment + +#### With Podman + +```bash +# Run with all default collectors +podman run --rm -d --network=host --privileged nvme_exporter + +# Run with custom flags +podman run --rm -d --network=host --privileged nvme_exporter \ + --web.listen-address=":9998" \ + --no-collector.ocp +``` + +#### With Docker + +```bash +# Run with all default collectors +docker run --rm -d --network=host --privileged nvme_exporter + +# Run with custom flags +docker run --rm -d --network=host --privileged nvme_exporter \ + --web.listen-address=":9998" \ + --no-collector.ocp +``` + +## Metrics + +### Exporter Metrics + +| Metric Name | Type | Description | +|-------------|------|-------------| +| `nvme_exporter_scrape_failures_total` | Counter | Total number of scrape failures due to validation errors (not root, nvme-cli not found, or unsupported version) | + +### NVMe Device Metrics + +This collector exports metrics from the following `nvme-cli` commands: + +```bash +nvme list -o json +nvme smart-log <device> -o json +nvme ocp smart-add-log <device> -o json # If OCP collector is enabled +``` + +All metrics include the `device` label with the device path (e.g., `/dev/nvme0n1`). 
+ +#### Info Metrics (collector: `info`) + +| Metric Name | Description | Labels | +|-------------|-------------|--------| +| `nvme_namespace` | NVMe namespace identifier | `device`, `generic_path`, `firmware`, `model_number`, `serial_number` | +| `nvme_used_bytes` | Used storage capacity in bytes | `device`, `generic_path`, `firmware`, `model_number`, `serial_number` | +| `nvme_maximum_lba` | Maximum Logical Block Address | `device`, `generic_path`, `firmware`, `model_number`, `serial_number` | +| `nvme_physical_size` | Physical size in bytes | `device`, `generic_path`, `firmware`, `model_number`, `serial_number` | +| `nvme_sector_size` | Sector size in bytes | `device`, `generic_path`, `firmware`, `model_number`, `serial_number` | + +#### SMART Log Metrics (collector: `smart`) + +**Gauge Metrics** + +| Metric Name | Description | +|-------------|-------------| +| `nvme_critical_warning` | Critical warnings for the controller state. Bits indicate spare capacity, temperature, degraded reliability, or read-only mode | +| `nvme_temperature` | Current composite temperature in Kelvin | +| `nvme_avail_spare` | Available spare capacity as a normalized percentage (0-100) | +| `nvme_spare_thresh` | Available spare capacity threshold below which an asynchronous event is generated | +| `nvme_percent_used` | Vendor-specific estimate of the percentage of device life used (0-255) | +| `nvme_endurance_grp_critical_warning_summary` | Critical warnings for endurance groups. 
Contains the OR of all critical warnings for all endurance groups | + +**Counter Metrics** + +| Metric Name | Description | +|-------------|-------------| +| `nvme_data_units_read` | Total number of 512-byte data units read from the NVMe device by the host | +| `nvme_data_units_written` | Total number of 512-byte data units written to the NVMe device by the host | +| `nvme_host_read_commands` | Total number of read commands completed by the controller | +| `nvme_host_write_commands` | Total number of write commands completed by the controller | +| `nvme_controller_busy_time` | Total time in minutes the controller was busy processing I/O commands | +| `nvme_power_cycles` | Total number of power cycles | +| `nvme_power_on_hours` | Total number of power-on hours. May not include time when the controller was powered but in a low power state | +| `nvme_unsafe_shutdowns` | Total number of unsafe shutdowns where the controller was not properly notified before power loss | +| `nvme_media_errors` | Total number of unrecovered data integrity errors detected by the controller | +| `nvme_num_err_log_entries` | Lifetime number of error log entries available in the Error Information Log | +| `nvme_warning_temp_time` | Total time in minutes the controller temperature exceeded the warning threshold | +| `nvme_critical_comp_time` | Total time in minutes the controller temperature exceeded the critical composite temperature threshold | +| `nvme_thm_temp1_trans_count` | Total number of times the controller transitioned to a lower power state due to thermal management (threshold 1) | +| `nvme_thm_temp2_trans_count` | Total number of times the controller transitioned to a lower power state due to thermal management (threshold 2) | +| `nvme_thm_temp1_trans_time` | Total time in seconds the controller was in a lower power state due to thermal management (threshold 1) | +| `nvme_thm_temp2_trans_time` | Total time in seconds the controller was in a lower power state due to thermal 
management (threshold 2) | + +#### OCP SMART Log Metrics (collector: `ocp`) + +> **Note**: These metrics are only available if the NVMe drive supports OCP vendor-specific commands. If not supported, the collector will log a warning and continue with other metrics. + +**Counter Metrics** + +| Metric Name | Description | +|-------------|-------------| +| `nvme_physical_media_units_written_hi` | Physical media units written to the device (high 64 bits). Unit size is 1000h sector size | +| `nvme_physical_media_units_written_lo` | Physical media units written to the device (low 64 bits). Unit size is 1000h sector size | +| `nvme_physical_media_units_read_hi` | Physical media units read from the device (high 64 bits). Unit size is 1000h sector size | +| `nvme_physical_media_units_read_lo` | Physical media units read from the device (low 64 bits). Unit size is 1000h sector size | +| `nvme_bad_user_nand_blocks_raw` | Raw count of user NAND blocks that have been retired due to errors | +| `nvme_bad_user_nand_blocks_normalized` | Normalized value (0-100) of bad user NAND blocks relative to the maximum allowed | +| `nvme_bad_system_nand_blocks_raw` | Raw count of system area NAND blocks that have been retired due to errors | +| `nvme_bad_system_nand_blocks_normalized` | Normalized value (0-100) of bad system NAND blocks relative to the maximum allowed | +| `nvme_xor_recovery_count` | Total number of times data was recovered using XOR parity | +| `nvme_uncorrectable_read_error_count` | Total number of uncorrectable read errors that could not be recovered | +| `nvme_soft_ecc_error_count` | Total number of soft ECC errors that were corrected | +| `nvme_end_to_end_detected_errors` | Total number of end-to-end data protection errors detected | +| `nvme_end_to_end_corrected_errors` | Total number of end-to-end data protection errors that were corrected | +| `nvme_refresh_counts` | Total number of NAND page refresh operations performed | +| `nvme_max_user_data_erase_counts` | 
Maximum number of erase cycles performed on any user data block | +| `nvme_min_user_data_erase_counts` | Minimum number of erase cycles performed on any user data block | +| `nvme_number_of_thermal_throttling_events` | Total number of times thermal throttling was activated | +| `nvme_pcie_correctable_error_count` | Total number of PCIe correctable errors detected | +| `nvme_incomplete_shutdowns` | Total number of incomplete or unsafe shutdown events | +| `nvme_unaligned_io` | Total number of unaligned I/O operations performed | +| `nvme_plp_start_count` | Total number of times the Power Loss Protection (PLP) mechanism was activated | +| `nvme_pcie_link_retraining_count` | Total number of PCIe link retraining events | +| `nvme_power_state_change_count` | Total number of power state transitions | + +**Gauge Metrics** + +| Metric Name | Description | +|-------------|-------------| +| `nvme_system_data_percent_used` | Percentage of system data area used (0-100) | +| `nvme_current_throttling_status` | Current thermal throttling status (0=not throttled, 1=throttled) | +| `nvme_percent_free_blocks` | Percentage of free NAND blocks available (0-100) | +| `nvme_capacitor_health` | Health indicator of the power loss protection capacitor (vendor-specific scale) | +| `nvme_security_version_number` | Security version number of the device firmware | +| `nvme_nuse_namespace_utilization` | Namespace utilization as reported by the device | +| `nvme_endurance_estimate` | Estimated remaining endurance of the device as a percentage (0-100) | +| `nvme_log_page_version` | Version number of the OCP SMART log page specification | +| `nvme_log_page_guid` | GUID (Globally Unique Identifier) of the OCP SMART log page | +| `nvme_errata_version_field` | Errata version field from the OCP specification version | +| `nvme_point_version_field` | Point version field from the OCP specification version | +| `nvme_minor_version_field` | Minor version field from the OCP specification version | +| 
`nvme_major_version_field` | Major version field from the OCP specification version | +| `nvme_nvme_errata_version` | NVMe base specification errata version supported by the device | + ## Visualization -This is how the dashboard visualizes: +Grafana dashboards are available in the [resources/grafana/](resources/grafana/) directory. +### Dashboard Screenshots + +**OCP Metrics** ![OCP metrics](https://raw.githubusercontent.com/E4-Computer-Engineering/nvme-exporter/refs/heads/main/resources/grafana/nvme_ocp.png) +**Endurance Metrics** ![Endurance metrics](https://raw.githubusercontent.com/E4-Computer-Engineering/nvme-exporter/refs/heads/main/resources/grafana/nvme_endurance.png) +**Statistics Metrics** ![Stats metrics](https://raw.githubusercontent.com/E4-Computer-Engineering/nvme-exporter/refs/heads/main/resources/grafana/nvme_stats.png) +**Error Metrics** ![Errors metrics](https://raw.githubusercontent.com/E4-Computer-Engineering/nvme-exporter/refs/heads/main/resources/grafana/nvme_errors.png) -## Metrics +## Prometheus Configuration + +### Scrape Configuration + +Add this to your `prometheus.yml`: + +```yaml +scrape_configs: + - job_name: 'nvme' + static_configs: + - targets: ['localhost:9998'] +``` + +### Recording and Alert Rules -This collector exports the output of the following `nvme` cli commands: +Example recording and alert rules are available in [resources/prom/](resources/prom/). -``` bash -nvme list -nvme smart-log -nvme ocp-smart-add-log +## Troubleshooting + +### Exporter starts but no metrics are collected + +**Symptom**: The exporter starts successfully, but scrapes return no NVMe metrics, only the `nvme_exporter_scrape_failures_total` metric is incrementing. + +**Possible causes**: +1. Not running as root +2. nvme-cli not installed +3. Unsupported nvme-cli version + +**Solution**: Check the exporter logs for WARNING messages indicating the specific issue. + +### OCP metrics not available + +**Symptom**: OCP metrics are not being exported. 
+ +**Possible causes**: +1. NVMe drive doesn't support OCP vendor-specific commands +2. OCP collector disabled with `--no-collector.ocp` + +**Solution**: Check exporter logs for "OCP metrics not supported" messages. This is normal for non-OCP drives. + +### Permission denied errors + +**Symptom**: Logs show permission denied when accessing `/dev/nvme*` devices. + +**Solution**: Run the exporter as root or with appropriate capabilities: + +```bash +sudo nvme_exporter +``` + +Or with Docker/Podman: + +```bash +podman run --privileged nvme_exporter ``` -|metric_name|description| -|---|---| -|nvme_avail_spare|---| -|nvme_bad_system_nand_blocks_normalized|---| -|nvme_bad_system_nand_blocks_raw|---| -|nvme_bad_user_nand_blocks_normalized|---| -|nvme_bad_user_nand_blocks_raw|---| -|nvme_capacitor_health|---| -|nvme_controller_busy_time|---| -|nvme_critical_comp_time|---| -|nvme_critical_warning|---| -|nvme_current_throttling_status|---| -|nvme_data_units_read|---| -|nvme_data_units_written|---| -|nvme_end_to_end_corrected_errors|---| -|nvme_end_to_end_detected_errors|---| -|nvme_endurance_estimate|---| -|nvme_endurance_grp_critical_warning_summary|---| -|nvme_errata_version_field|---| -|nvme_host_read_commands|---| -|nvme_host_write_commands|---| -|nvme_incomplete_shutdowns|---| -|nvme_log_page_guid|---| -|nvme_log_page_version|---| -|nvme_major_version_field|---| -|nvme_maximum_lba|---| -|nvme_max_user_data_erase_counts|---| -|nvme_media_errors|---| -|nvme_minor_version_field|---| -|nvme_min_user_data_erase_counts|---| -|nvme_namespace|---| -|nvme_number_of_thermal_throttling_events|---| -|nvme_num_err_log_entries|---| -|nvme_nuse_namespace_utilization|---| -|nvme_nvme_errata_version|---| -|nvme_pcie_correctable_error_count|---| -|nvme_pcie_link_retraining_count|---| -|nvme_percent_free_blocks|---| -|nvme_percent_used|---| -|nvme_physical_media_units_read_hi|---| -|nvme_physical_media_units_read_lo|---| -|nvme_physical_media_units_written_hi|---| 
-|nvme_physical_media_units_written_lo|---| -|nvme_physical_size|---| -|nvme_plp_start_count|---| -|nvme_point_version_field|---| -|nvme_power_cycles|---| -|nvme_power_on_hours|---| -|nvme_power_state_change_count|---| -|nvme_refresh_counts|---| -|nvme_sector_size|---| -|nvme_security_version_number|---| -|nvme_soft_ecc_error_count|---| -|nvme_spare_thresh|---| -|nvme_system_data_percent_used|---| -|nvme_temperature|---| -|nvme_thm_temp1_trans_count|---| -|nvme_thm_temp1_trans_time|---| -|nvme_thm_temp2_trans_count|---| -|nvme_thm_temp2_trans_time|---| -|nvme_unaligned_io|---| -|nvme_uncorrectable_uead_error_count|---| -|nvme_unsafe_shutdowns|---| -|nvme_used_bytes|---| -|nvme_warning_temp_time|---| -|nvme_xor_recovery_count|---| +## License + +See [LICENSE](LICENSE) file for details. + +## Contributing + +Contributions are welcome! Please open an issue or submit a pull request. + +## Acknowledgments + +- Inspired by [fritchie/nvme_exporter](https://github.com/fritchie/nvme_exporter) +- Follows design patterns from [prometheus/node_exporter](https://github.com/prometheus/node_exporter) diff --git a/cmd/collector.go b/cmd/collector.go index 4a68d0b..f349fc8 100644 --- a/cmd/collector.go +++ b/cmd/collector.go @@ -22,7 +22,9 @@ func getSmartLogData(devicePath string) gjson.Result { func getOcpSmartLogData(devicePath string) gjson.Result { ocpSmartLog, err := utils.ExecuteJSONCommand("nvme", "ocp", "smart-add-log", devicePath, "-o", "json") if err != nil { - log.Printf("Error running smart-add-log %s -o json: %s\n", devicePath, err) + log.Printf("OCP metrics not supported or error running smart-add-log %s -o json: %s (continuing with standard metrics)\n", devicePath, err) + // Return empty result instead of crashing + return gjson.Result{} } return ocpSmartLog @@ -68,7 +70,7 @@ func (f *ProviderFactory) CreateInfoMetricProvider( ) } -func newNvmeCollector(ocpEnabled bool) prometheus.Collector { +func newNvmeCollector(collectorStates map[string]bool) 
prometheus.Collector { labels := []string{"device"} infoLabels := []string{"device", "generic_path", "firmware", "model_number", "serial_number"} @@ -86,31 +88,31 @@ func newNvmeCollector(ocpEnabled bool) prometheus.Collector { infoMetricProviders := []pkg.MetricProvider{ gaugeValueFactory.CreateInfoMetricProvider( "nvme_namespace", - "", + "NVMe namespace identifier", "NameSpace", infoLabels, ), gaugeValueFactory.CreateInfoMetricProvider( "nvme_used_bytes", - "", + "Used storage capacity in bytes", "UsedBytes", infoLabels, ), gaugeValueFactory.CreateInfoMetricProvider( "nvme_maximum_lba", - "", + "Maximum Logical Block Address", "MaximumLBA", infoLabels, ), gaugeValueFactory.CreateInfoMetricProvider( "nvme_physical_size", - "", + "Physical size in bytes", "PhysicalSize", infoLabels, ), gaugeValueFactory.CreateInfoMetricProvider( "nvme_sector_size", - "", + "Sector size in bytes", "SectorSize", infoLabels, ), @@ -120,128 +122,128 @@ func newNvmeCollector(ocpEnabled bool) prometheus.Collector { logMetricProviders := []pkg.MetricProvider{ gaugeValueFactory.CreateLogMetricProvider( "nvme_critical_warning", - "Critical warnings for the state of the controller", + "Critical warnings for the controller state. 
Bits indicate spare capacity, temperature, degraded reliability, or read-only mode", "critical_warning", ), gaugeValueFactory.CreateLogMetricProvider( "nvme_temperature", - "Temperature in degrees fahrenheit", + "Current composite temperature in Kelvin", "temperature", ), gaugeValueFactory.CreateLogMetricProvider( "nvme_avail_spare", - "Normalized percentage of remaining spare capacity available", + "Available spare capacity as a normalized percentage (0-100)", "avail_spare", ), gaugeValueFactory.CreateLogMetricProvider( "nvme_spare_thresh", - "Async event completion may occur when avail spare < threshold", + "Available spare capacity threshold below which an asynchronous event is generated", "spare_thresh", ), gaugeValueFactory.CreateLogMetricProvider( "nvme_percent_used", - "Vendor specific estimate of the percentage of life used", + "Vendor-specific estimate of the percentage of device life used (0-255)", "percent_used", ), gaugeValueFactory.CreateLogMetricProvider( "nvme_endurance_grp_critical_warning_summary", - "Critical warnings for the state of endurance groups", + "Critical warnings for endurance groups. 
Contains the OR of all critical warnings for all endurance groups", "endurance_grp_critical_warning_summary", ), counterValueFactory.CreateLogMetricProvider( "nvme_data_units_read", - "Number of 512 byte data units host has read", + "Total number of 512-byte data units read from the NVMe device by the host", "data_units_read", ), counterValueFactory.CreateLogMetricProvider( "nvme_data_units_written", - "Number of 512 byte data units the host has written", + "Total number of 512-byte data units written to the NVMe device by the host", "data_units_written", ), counterValueFactory.CreateLogMetricProvider( "nvme_host_read_commands", - "Number of read commands completed", + "Total number of read commands completed by the controller", "host_read_commands", ), counterValueFactory.CreateLogMetricProvider( "nvme_host_write_commands", - "Number of write commands completed", + "Total number of write commands completed by the controller", "host_write_commands", ), counterValueFactory.CreateLogMetricProvider( "nvme_controller_busy_time", - "Amount of time in minutes controller busy with IO commands", + "Total time in minutes the controller was busy processing I/O commands", "controller_busy_time", ), counterValueFactory.CreateLogMetricProvider( "nvme_power_cycles", - "Number of power cycles", + "Total number of power cycles", "power_cycles", ), counterValueFactory.CreateLogMetricProvider( "nvme_power_on_hours", - "Number of power on hours", + "Total number of power-on hours. 
May not include time when the controller was powered but in a low power state", "power_on_hours", ), counterValueFactory.CreateLogMetricProvider( "nvme_unsafe_shutdowns", - "Number of unsafe shutdowns", + "Total number of unsafe shutdowns where the controller was not properly notified before power loss", "unsafe_shutdowns", ), counterValueFactory.CreateLogMetricProvider( "nvme_media_errors", - "Number of unrecovered data integrity errors", + "Total number of unrecovered data integrity errors detected by the controller", "media_errors", ), counterValueFactory.CreateLogMetricProvider( "nvme_num_err_log_entries", - "Lifetime number of error log entries", + "Lifetime number of error log entries available in the Error Information Log", "num_err_log_entries", ), counterValueFactory.CreateLogMetricProvider( "nvme_warning_temp_time", - "Amount of time in minutes temperature > warning threshold", + "Total time in minutes the controller temperature exceeded the warning threshold", "warning_temp_time", ), counterValueFactory.CreateLogMetricProvider( "nvme_critical_comp_time", - "Amount of time in minutes temperature > critical threshold", + "Total time in minutes the controller temperature exceeded the critical composite temperature threshold", "critical_comp_time", ), counterValueFactory.CreateLogMetricProvider( "nvme_thm_temp1_trans_count", - "Number of times controller transitioned to lower power", + "Total number of times the controller transitioned to a lower power state due to thermal management (threshold 1)", "thm_temp1_trans_count", ), counterValueFactory.CreateLogMetricProvider( "nvme_thm_temp2_trans_count", - "Number of times controller transitioned to lower power", + "Total number of times the controller transitioned to a lower power state due to thermal management (threshold 2)", "thm_temp2_trans_count", ), counterValueFactory.CreateLogMetricProvider( "nvme_thm_temp1_trans_time", - "Total number of seconds controller transitioned to lower power", + "Total time in 
seconds the controller was in a lower power state due to thermal management (threshold 1)", "thm_temp1_total_time", ), counterValueFactory.CreateLogMetricProvider( "nvme_thm_temp2_trans_time", - "Total number of seconds controller transitioned to lower power", + "Total time in seconds the controller was in a lower power state due to thermal management (threshold 2)", "thm_temp2_total_time", ), } @@ -250,198 +252,206 @@ func newNvmeCollector(ocpEnabled bool) prometheus.Collector { ocpLogMetricProviders := []pkg.MetricProvider{ counterValueFactory.CreateLogMetricProvider( "nvme_physical_media_units_written_hi", - "Physical meda units written high", + "Physical media units written to the device (high 64 bits). Unit size is 1000h sector size", "Physical media units written.hi", ), counterValueFactory.CreateLogMetricProvider( "nvme_physical_media_units_written_lo", - "Physical meda units written low", + "Physical media units written to the device (low 64 bits). Unit size is 1000h sector size", "Physical media units written.lo", ), counterValueFactory.CreateLogMetricProvider( "nvme_physical_media_units_read_hi", - "Physical meda units read high", + "Physical media units read from the device (high 64 bits). Unit size is 1000h sector size", "Physical media units read.hi", ), counterValueFactory.CreateLogMetricProvider( "nvme_physical_media_units_read_lo", - "Physical meda units read low", + "Physical media units read from the device (low 64 bits). 
Unit size is 1000h sector size", "Physical media units read.lo", ), counterValueFactory.CreateLogMetricProvider( "nvme_bad_user_nand_blocks_raw", - "", + "Raw count of user NAND blocks that have been retired due to errors", "Bad user nand blocks - Raw", ), counterValueFactory.CreateLogMetricProvider( "nvme_bad_user_nand_blocks_normalized", - "", + "Normalized value (0-100) of bad user NAND blocks relative to the maximum allowed", "Bad user nand blocks - Normalized", ), counterValueFactory.CreateLogMetricProvider( "nvme_bad_system_nand_blocks_raw", - "", + "Raw count of system area NAND blocks that have been retired due to errors", "Bad system nand blocks - Raw", ), counterValueFactory.CreateLogMetricProvider( "nvme_bad_system_nand_blocks_normalized", - "", + "Normalized value (0-100) of bad system NAND blocks relative to the maximum allowed", "Bad system nand blocks - Normalized", ), counterValueFactory.CreateLogMetricProvider( "nvme_xor_recovery_count", - "", + "Total number of times data was recovered using XOR parity", "XOR recovery count", ), counterValueFactory.CreateLogMetricProvider( - "nvme_uncorrectable_uead_error_count", - "", + "nvme_uncorrectable_read_error_count", + "Total number of uncorrectable read errors that could not be recovered", "Uncorrectable read error count", ), counterValueFactory.CreateLogMetricProvider( "nvme_soft_ecc_error_count", - "", + "Total number of soft ECC errors that were corrected", "Soft ecc error count", ), counterValueFactory.CreateLogMetricProvider( "nvme_end_to_end_detected_errors", - "", + "Total number of end-to-end data protection errors detected", "End to end detected errors", ), counterValueFactory.CreateLogMetricProvider( "nvme_end_to_end_corrected_errors", - "", + "Total number of end-to-end data protection errors that were corrected", "End to end corrected errors", ), gaugeValueFactory.CreateLogMetricProvider( "nvme_system_data_percent_used", - "", + "Percentage of system data area used (0-100)", "System data 
percent used", ), counterValueFactory.CreateLogMetricProvider( "nvme_refresh_counts", - "", + "Total number of NAND page refresh operations performed", "Refresh counts", ), counterValueFactory.CreateLogMetricProvider( "nvme_max_user_data_erase_counts", - "", + "Maximum number of erase cycles performed on any user data block", "Max User data erase counts", ), counterValueFactory.CreateLogMetricProvider( "nvme_min_user_data_erase_counts", - "", + "Minimum number of erase cycles performed on any user data block", "Min User data erase counts", ), counterValueFactory.CreateLogMetricProvider( "nvme_number_of_thermal_throttling_events", - "", + "Total number of times thermal throttling was activated", "Number of Thermal throttling events", ), gaugeValueFactory.CreateLogMetricProvider( "nvme_current_throttling_status", - "", + "Current thermal throttling status (0=not throttled, 1=throttled)", "Current throttling status", ), counterValueFactory.CreateLogMetricProvider( "nvme_pcie_correctable_error_count", - "", + "Total number of PCIe correctable errors detected", "PCIe correctable error count", ), counterValueFactory.CreateLogMetricProvider( "nvme_incomplete_shutdowns", - "", + "Total number of incomplete or unsafe shutdown events", "Incomplete shutdowns", ), gaugeValueFactory.CreateLogMetricProvider( "nvme_percent_free_blocks", - "", + "Percentage of free NAND blocks available (0-100)", "Percent free blocks", ), gaugeValueFactory.CreateLogMetricProvider( "nvme_capacitor_health", - "", + "Health indicator of the power loss protection capacitor (vendor-specific scale)", "Capacitor health", ), counterValueFactory.CreateLogMetricProvider( "nvme_unaligned_io", - "", + "Total number of unaligned I/O operations performed", "Unaligned I/O", ), gaugeValueFactory.CreateLogMetricProvider( "nvme_security_version_number", - "", + "Security version number of the device firmware", "Security Version Number", ), gaugeValueFactory.CreateLogMetricProvider( 
"nvme_nuse_namespace_utilization", - "", + "Namespace utilization as reported by the device", "NUSE - Namespace utilization", ), counterValueFactory.CreateLogMetricProvider( "nvme_plp_start_count", - "", + "Total number of times the Power Loss Protection (PLP) mechanism was activated", "PLP start count", ), gaugeValueFactory.CreateLogMetricProvider( "nvme_endurance_estimate", - "", + "Estimated remaining endurance of the device as a percentage (0-100)", "Endurance estimate", ), gaugeValueFactory.CreateLogMetricProvider( "nvme_log_page_version", - "", + "Version number of the OCP SMART log page specification", "Log page version", ), gaugeValueFactory.CreateLogMetricProvider( "nvme_log_page_guid", - "", + "GUID (Globally Unique Identifier) of the OCP SMART log page", "Log page GUID", ), gaugeValueFactory.CreateLogMetricProvider( "nvme_errata_version_field", - "", + "Errata version field from the OCP specification version", "Errata Version Field", ), gaugeValueFactory.CreateLogMetricProvider( "nvme_point_version_field", - "", + "Point version field from the OCP specification version", "Point Version Field", ), gaugeValueFactory.CreateLogMetricProvider( "nvme_minor_version_field", - "", + "Minor version field from the OCP specification version", "Minor Version Field", ), gaugeValueFactory.CreateLogMetricProvider( "nvme_major_version_field", - "", + "Major version field from the OCP specification version", "Major Version Field", ), gaugeValueFactory.CreateLogMetricProvider( "nvme_nvme_errata_version", - "", + "NVMe base specification errata version supported by the device", "NVMe Errata Version", ), counterValueFactory.CreateLogMetricProvider( "nvme_pcie_link_retraining_count", - "", + "Total number of PCIe link retraining events", "PCIe Link Retraining Count", ), counterValueFactory.CreateLogMetricProvider( "nvme_power_state_change_count", - "", + "Total number of power state transitions", "Power State Change Count", ), } - // the info and smart-log collectors are 
always present - collectors := []pkg.MetricCollector{ - pkg.NewInfoMetricCollector(infoMetricProviders), - pkg.NewLogMetricCollector(logMetricProviders, getSmartLogData), + // Build collectors based on enabled states + collectors := []pkg.MetricCollector{} + + // Add info collector if enabled + if collectorStates["info"] { + collectors = append(collectors, pkg.NewInfoMetricCollector(infoMetricProviders)) + } + + // Add smart-log collector if enabled + if collectorStates["smart"] { + collectors = append(collectors, pkg.NewLogMetricCollector(logMetricProviders, getSmartLogData)) } - if ocpEnabled { + // Add OCP collector if enabled (now enabled by default) + if collectorStates["ocp"] { collectors = append(collectors, pkg.NewLogMetricCollector(ocpLogMetricProviders, getOcpSmartLogData)) } diff --git a/cmd/main.go b/cmd/main.go index de5d047..4660f44 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -6,78 +6,320 @@ import ( "log" "net/http" "regexp" + "strconv" "strings" + "sync" "time" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promhttp" + "github.com/E4-Computer-Engineering/nvme_exporter/pkg" "github.com/E4-Computer-Engineering/nvme_exporter/pkg/utils" ) -var _supportedVersions = map[string]bool{ - "2.9": true, - "2.10": true, - "2.11": true, +const _minimumSupportedVersion = "2.8" + +var ( + validationState = struct { + sync.RWMutex + isValid bool + errorMessage string + }{ + isValid: true, + } + + scrapeFailuresTotal = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: "nvme_exporter_scrape_failures_total", + Help: "Total number of scrape failures due to validation errors (not root, nvme-cli not found, or unsupported version)", + }, + ) +) + +// Collector represents a metric collector with enable/disable capability +type Collector struct { + name string + defaultState bool + enabled *bool + description string } +var ( + collectors = map[string]*Collector{ + "smart": { + name: "smart", + defaultState: true, + 
description: "NVMe SMART log metrics", + }, + "info": { + name: "info", + defaultState: true, + description: "NVMe device info metrics", + }, + "ocp": { + name: "ocp", + defaultState: true, + description: "NVMe OCP (Open Compute Project) SMART log metrics", + }, + } + + disableDefaultCollectors = flag.Bool( + "collector.disable-defaults", + false, + "Disable all default collectors", + ) +) + func isSupportedVersion(version string) bool { - _, ok := _supportedVersions[version] + versionParts := strings.Split(version, ".") + minVersionParts := strings.Split(_minimumSupportedVersion, ".") + + if len(versionParts) < 2 || len(minVersionParts) < 2 { + return false + } + + vMajor, err1 := strconv.Atoi(versionParts[0]) + vMinor, err2 := strconv.Atoi(versionParts[1]) + minMajor, err3 := strconv.Atoi(minVersionParts[0]) + minMinor, err4 := strconv.Atoi(minVersionParts[1]) + + if err1 != nil || err2 != nil || err3 != nil || err4 != nil { + return false + } - return ok + if vMajor > minMajor { + return true + } + if vMajor < minMajor { + return false + } + + return vMinor >= minMinor +} + +func setValidationError(msg string) { + validationState.Lock() + defer validationState.Unlock() + validationState.isValid = false + validationState.errorMessage = msg +} + +func isValidationValid() bool { + validationState.RLock() + defer validationState.RUnlock() + return validationState.isValid +} + +func initCollectorFlags() { + // Register flags for each collector + for name, collector := range collectors { + flagName := fmt.Sprintf("collector.%s", name) + noFlagName := fmt.Sprintf("no-collector.%s", name) + + defaultValue := collector.defaultState + + // --collector.X flag to enable + enableFlag := flag.Bool( + flagName, + defaultValue, + fmt.Sprintf("Enable the %s collector (default: %t)", collector.description, defaultValue), + ) + + // --no-collector.X flag to disable + disableFlag := flag.Bool( + noFlagName, + false, + fmt.Sprintf("Disable the %s collector", collector.description), 
+ ) + + collector.enabled = enableFlag + + // Store both flags so we can resolve them later + collectors[name] = collector + + // Store the disable flag separately for later processing + if disableFlag != nil { + // We'll handle this in resolveCollectorStates + } + } +} + +func resolveCollectorStates() map[string]bool { + states := make(map[string]bool) + + for name, collector := range collectors { + // Start with default state + enabled := collector.defaultState + + // If disable-defaults is set, start with false + if *disableDefaultCollectors { + enabled = false + } + + // Check if explicit enable flag was set + enableFlagName := fmt.Sprintf("collector.%s", name) + disableFlagName := fmt.Sprintf("no-collector.%s", name) + + // Check if the flag was explicitly set + explicitlyEnabled := false + explicitlyDisabled := false + + flag.Visit(func(f *flag.Flag) { + if f.Name == enableFlagName { + explicitlyEnabled = true + enabled = *collector.enabled + } + if f.Name == disableFlagName { + explicitlyDisabled = true + } + }) + + // Disable flag takes precedence + if explicitlyDisabled { + enabled = false + } else if explicitlyEnabled { + enabled = true + } + + states[name] = enabled + } + + return states } func main() { + // Initialize collector flags before parsing + initCollectorFlags() + flag.Usage = func() { - fmt.Println("nvme_exporter - Exports NVMe smart-log and smart-ocp-log metrics in Prometheus format") - fmt.Println("Validated with nvme smart-log field descriptions can be found on page 209 of:") - fmt.Println( - "https://nvmexpress.org/wp-content/uploads/NVM-Express-Base-Specification-Revision-2.1-2024.08.05-Ratified.pdf") - fmt.Println("Validated with nvme ocp-smart-log field descriptions can be found on page 24 of:") - fmt.Println("https://www.opencompute.org/documents/datacenter-nvme-ssd-specification-v2-5-pdf */") - fmt.Printf("It has been tested with nvme-cli versions:%v\n", _supportedVersions) - fmt.Println("Usage: nvme_exporter [options]") - 
flag.PrintDefaults() + fmt.Println("nvme_exporter - Prometheus exporter for NVMe device metrics") + fmt.Println("\nExports NVMe SMART log and OCP SMART log metrics in Prometheus format.") + fmt.Println("\nDocumentation:") + fmt.Println(" NVMe SMART log specification (page 209):") + fmt.Println(" https://nvmexpress.org/wp-content/uploads/NVM-Express-Base-Specification-Revision-2.1-2024.08.05-Ratified.pdf") + fmt.Println(" OCP SMART log specification (page 24):") + fmt.Println(" https://www.opencompute.org/documents/datacenter-nvme-ssd-specification-v2-5-pdf") + fmt.Printf("\nMinimum supported nvme-cli version: %s\n", _minimumSupportedVersion) + fmt.Println("\nUsage: nvme_exporter [options]") + fmt.Println("\nWeb server options:") + fmt.Println(" --web.listen-address string") + fmt.Println(" Address on which to expose metrics and web interface (default \":9998\")") + fmt.Println(" --web.telemetry-path string") + fmt.Println(" Path under which to expose metrics (default \"/metrics\")") + fmt.Println("\nCollector options:") + fmt.Println(" --collector.") + fmt.Println(" Enable the specified collector (enabled by default)") + fmt.Println(" --no-collector.") + fmt.Println(" Disable the specified collector") + fmt.Println(" --collector.disable-defaults") + fmt.Println(" Disable all default collectors") + fmt.Println("\nAvailable collectors:") + for name, collector := range collectors { + defaultStr := "" + if collector.defaultState { + defaultStr = " (enabled by default)" + } + fmt.Printf(" %-10s %s%s\n", name, collector.description, defaultStr) + } + fmt.Println("\nExamples:") + fmt.Println(" # Start with all default collectors on default port") + fmt.Println(" nvme_exporter") + fmt.Println("\n # Listen on a specific address and port") + fmt.Println(" nvme_exporter --web.listen-address=\":9100\"") + fmt.Println("\n # Disable OCP metrics collection") + fmt.Println(" nvme_exporter --no-collector.ocp") + fmt.Println("\n # Only collect SMART metrics (disable info and OCP)") 
+ fmt.Println(" nvme_exporter --collector.disable-defaults --collector.smart") } - port := flag.String("port", "9998", "port to listen on") - ocp := flag.Bool("ocp", false, "Enable OCP smart log metrics") - endpoint := flag.String("endpoint", "/metrics", "Specify the endpoint to expose metrics") + // Define flags following Prometheus node_exporter conventions + listenAddress := flag.String("web.listen-address", ":9998", "Address on which to expose metrics and web interface") + metricsPath := flag.String("web.telemetry-path", "/metrics", "Path under which to expose metrics") flag.Parse() - if !strings.HasPrefix(*endpoint, "/") { - *endpoint = "/" + *endpoint + // Ensure metrics path starts with / + if !strings.HasPrefix(*metricsPath, "/") { + *metricsPath = "/" + *metricsPath } + // Register the scrape failures metric + prometheus.MustRegister(scrapeFailuresTotal) + + // Validate prerequisites - log errors but don't exit err := utils.CheckCurrentUser("root") if err != nil { - log.Printf("current user is not root: %s", err.Error()) + log.Printf("WARNING: current user is not root: %s", err.Error()) + log.Printf("WARNING: exporter will continue running but scrapes will fail") + setValidationError(fmt.Sprintf("not running as root: %s", err.Error())) } // check for nvme-cli version out, err := utils.ExecuteCommand("nvme", "--version") if err != nil { - log.Fatal(err.Error()) + log.Printf("WARNING: nvme binary not found or error executing: %s", err.Error()) + log.Printf("WARNING: exporter will continue running but scrapes will fail") + setValidationError(fmt.Sprintf("nvme binary not available: %s", err.Error())) + } else { + re := regexp.MustCompile(`nvme version (\d+\.\d+)`) + match := re.FindStringSubmatch(out) + + if match != nil { + version := match[1] + if !isSupportedVersion(version) { + log.Printf("WARNING: NVMe cli version %s not supported, minimum required version is %s", version, _minimumSupportedVersion) + log.Printf("WARNING: exporter will continue running but 
scrapes may fail or produce incorrect data") + setValidationError(fmt.Sprintf("unsupported nvme-cli version %s (minimum: %s)", version, _minimumSupportedVersion)) + } else { + log.Printf("NVMe cli version %s detected and supported", version) + } + } else { + log.Printf("WARNING: Unable to find NVMe CLI version in output: %s", out) + log.Printf("WARNING: exporter will continue running but scrapes may fail") + setValidationError(fmt.Sprintf("unable to parse nvme-cli version from output: %s", out)) + } } - re := regexp.MustCompile(`nvme version (\d+\.\d+)\.\d+`) - match := re.FindStringSubmatch(out) + // Resolve collector states based on flags + collectorStates := resolveCollectorStates() - if match != nil { - version := match[1] - if !isSupportedVersion(version) { - log.Printf("NVMe cli version %s not supported, supported versions are: %v", version, _supportedVersions) + // Log enabled collectors + log.Printf("Enabled collectors:") + for name, enabled := range collectorStates { + if enabled { + log.Printf(" - %s", name) } - } else { - log.Fatalf("Unable to find NVMe CLI version in output: %s", out) } - prometheus.MustRegister(newNvmeCollector(*ocp)) - http.Handle(*endpoint, promhttp.Handler()) - log.Printf("Starting newNvmeCollector on port: %s, metrics endpoint: %s\n", *port, *endpoint) - log.Printf("newNvmeCollector is collecting OCP smart-log metrics: %t\n", *ocp) + // Set up validation checker and scrape failure incrementer + pkg.SetValidationChecker(isValidationValid) + pkg.SetScrapeFailureIncrementer(func() { + scrapeFailuresTotal.Inc() + }) + + prometheus.MustRegister(newNvmeCollector(collectorStates)) + http.Handle(*metricsPath, promhttp.Handler()) + + // Add a landing page like node_exporter + http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/" { + http.NotFound(w, r) + return + } + w.Header().Set("Content-Type", "text/html; charset=utf-8") + fmt.Fprintf(w, ` +NVMe Exporter + +

<h1>NVMe Exporter</h1>

+

<p><a href="%s">Metrics</a></p>

+ +`, *metricsPath) + }) + + log.Printf("Starting nvme_exporter on %s", *listenAddress) + log.Printf("Metrics path: %s", *metricsPath) server := &http.Server{ - Addr: ":" + *port, + Addr: *listenAddress, ReadHeaderTimeout: 3 * time.Second, } log.Fatal(server.ListenAndServe()) diff --git a/pkg/collector.go b/pkg/collector.go index ed2854d..df16880 100644 --- a/pkg/collector.go +++ b/pkg/collector.go @@ -11,13 +11,86 @@ import ( // GetDevices queries the devices list through the shell // and returns an array of JSON results with the devices data. +// This function handles both old flat structure and new nested structure +// of nvme-cli JSON output. func GetDevices() []gjson.Result { + // Check validation state before attempting to query devices + if validationChecker != nil && !validationChecker() { + if scrapeFailureIncrementer != nil { + scrapeFailureIncrementer() + } + log.Printf("Skipping device query due to validation failure") + return []gjson.Result{} + } + devicesJSON, err := utils.ExecuteJSONCommand("nvme", "list", "-o", "json") if err != nil { log.Printf("Error running nvme list -o json: %s\n", err) + if scrapeFailureIncrementer != nil { + scrapeFailureIncrementer() + } + return []gjson.Result{} + } + + devices := devicesJSON.Get("Devices").Array() + if len(devices) == 0 { + return []gjson.Result{} + } + + // Check if we have the new nested structure (with Subsystems) + // or the old flat structure (with DevicePath) + firstDevice := devices[0] + if firstDevice.Get("Subsystems").Exists() { + // New nested structure - flatten it + return flattenNewStructure(devices) + } + + // Old flat structure - return as is + return devices +} + +// flattenNewStructure converts the new nested nvme-cli JSON structure +// to a flat structure compatible with the rest of the code. 
+func flattenNewStructure(devices []gjson.Result) []gjson.Result { + var flattened []gjson.Result + + for _, device := range devices { + subsystems := device.Get("Subsystems").Array() + for _, subsystem := range subsystems { + controllers := subsystem.Get("Controllers").Array() + for _, controller := range controllers { + serialNumber := controller.Get("SerialNumber").String() + modelNumber := controller.Get("ModelNumber").String() + firmware := controller.Get("Firmware").String() + + namespaces := controller.Get("Namespaces").Array() + for _, namespace := range namespaces { + namespaceName := namespace.Get("NameSpace").String() + generic := namespace.Get("Generic").String() + + // Construct a flat JSON object compatible with old structure + flatJSON := map[string]interface{}{ + "DevicePath": "/dev/" + namespaceName, + "GenericPath": generic, + "Firmware": firmware, + "ModelNumber": modelNumber, + "SerialNumber": serialNumber, + "NameSpace": namespace.Get("NSID").Int(), + "UsedBytes": namespace.Get("UsedBytes").Int(), + "MaximumLBA": namespace.Get("MaximumLBA").Int(), + "PhysicalSize": namespace.Get("PhysicalSize").Int(), + "SectorSize": namespace.Get("SectorSize").Int(), + } + + // Convert map to JSON string and parse it as gjson.Result + jsonStr := utils.MapToJSONString(flatJSON) + flattened = append(flattened, gjson.Parse(jsonStr)) + } + } + } } - return devicesJSON.Get("Devices").Array() + return flattened } // MetricCollector is the interface implemented by the objects contained @@ -75,7 +148,10 @@ func (ic *InfoMetricCollector) CollectMetrics(ch chan<- prometheus.Metric, devic modelNumber, serialNumber, ) - ch <- metric + // Only send metric if it's not nil (handles cases where data is unavailable) + if metric != nil { + ch <- metric + } } } @@ -108,10 +184,19 @@ func (lc *LogMetricCollector) CollectMetrics(ch chan<- prometheus.Metric, device devicePath := device.Get("DevicePath").String() jsonData := lc.getData(devicePath) + + // If getData returns invalid 
data (e.g., OCP not supported), skip this collector + if !jsonData.Exists() { + return + } + for _, logProvider := range lc.LogMetricProviders { // Fetching the metric object is delegated to the provider metric := logProvider.GetMetric(jsonData, devicePath) - ch <- metric + // Only send metric if it's not nil (handles cases where data is unavailable) + if metric != nil { + ch <- metric + } } } @@ -144,3 +229,24 @@ func (cc *CompositeCollector) Collect(ch chan<- prometheus.Metric) { } } } + +// SetValidationChecker allows injecting a validation check function +// to determine if scrapes should proceed +type ValidationChecker func() bool + +var validationChecker ValidationChecker + +// SetValidationChecker sets the global validation checker +func SetValidationChecker(checker ValidationChecker) { + validationChecker = checker +} + +// IncrementScrapeFailure is called when a scrape fails due to validation errors +type ScrapeFailureIncrementer func() + +var scrapeFailureIncrementer ScrapeFailureIncrementer + +// SetScrapeFailureIncrementer sets the global scrape failure incrementer +func SetScrapeFailureIncrementer(incrementer ScrapeFailureIncrementer) { + scrapeFailureIncrementer = incrementer +} diff --git a/pkg/provider.go b/pkg/provider.go index 1ecc40c..441cc8a 100644 --- a/pkg/provider.go +++ b/pkg/provider.go @@ -39,7 +39,23 @@ func (ip MetricProvider) GetMetric( data gjson.Result, labels ...string, ) prometheus.Metric { - value := data.Get(ip.jsonKey).Float() + // If data is invalid/empty (e.g., OCP not supported), skip metric creation + if !data.Exists() { + return nil + } + + result := data.Get(ip.jsonKey) + + // Handle both scalar values (v2.8) and object values (v2.11+) + // In v2.11+, some fields like critical_warning are objects with a "value" field + var value float64 + if result.IsObject() { + // Try to get the "value" field from the object + value = result.Get("value").Float() + } else { + // Direct numeric value + value = result.Float() + } metric := 
prometheus.MustNewConstMetric( ip.Desc, diff --git a/pkg/utils/command.go b/pkg/utils/command.go index 5e28b0c..f9571d6 100644 --- a/pkg/utils/command.go +++ b/pkg/utils/command.go @@ -1,6 +1,7 @@ package utils import ( + "encoding/json" "fmt" "os/exec" "os/user" @@ -67,3 +68,12 @@ func CheckCurrentUser(wantedUser string) error { return nil } + +// MapToJSONString converts a map to a JSON string. +func MapToJSONString(data map[string]interface{}) string { + jsonBytes, err := json.Marshal(data) + if err != nil { + return "{}" + } + return string(jsonBytes) +} From fcee3720c1877fa586335af400e718d6899ab8c3 Mon Sep 17 00:00:00 2001 From: dobbi84 Date: Thu, 9 Oct 2025 10:56:34 +0200 Subject: [PATCH 2/3] upgrade version of golangci in GH action --- .github/workflows/build.yml | 2 +- .golangci.bck.yml | 38 ++++++++++++++++++ .golangci.yml | 78 ++++++++++++++++++++++++------------- 3 files changed, 89 insertions(+), 29 deletions(-) create mode 100644 .golangci.bck.yml diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 90379a8..35c2b19 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -37,7 +37,7 @@ jobs: uses: golangci/golangci-lint-action@v6.3.2 with: # renovate: depName=golangci/golangci-lint datasource=github-releases - version: v1.63.4 + version: v2.5.0 args: --timeout=3m0s - name: Build with Goreleaser uses: goreleaser/goreleaser-action@v6.2.1 diff --git a/.golangci.bck.yml b/.golangci.bck.yml new file mode 100644 index 0000000..0611173 --- /dev/null +++ b/.golangci.bck.yml @@ -0,0 +1,38 @@ +# https://golangci-lint.run/usage/linters/ +linters: + enable-all: true + disable: + - exhaustruct + - exportloopref + - err113 + - funlen + - gochecknoglobals + - mnd + # reconsider + - ireturn + # reconsider + - maintidx + # reconsider + - forbidigo + +linters-settings: + varnamelen: + min-name-length: 2 + gci: + sections: + - standard + - default + - localmodule + goimports: + local-prefixes: 
github.com/E4-Computer-Engineering/nvme_exporter + goconst: + ignore-tests: true + depguard: + rules: + main: + list-mode: lax + files: + - $all + deny: + - pkg: io/ioutil + desc: "replaced by io and os packages since Go 1.16: https://tip.golang.org/doc/go1.16#ioutil" diff --git a/.golangci.yml b/.golangci.yml index 0611173..7f281dd 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -1,38 +1,60 @@ -# https://golangci-lint.run/usage/linters/ +version: "2" linters: - enable-all: true + default: all disable: - - exhaustruct - - exportloopref - err113 + - exhaustruct + - forbidigo - funlen - gochecknoglobals - - mnd - # reconsider - ireturn - # reconsider - maintidx - # reconsider - - forbidigo - -linters-settings: - varnamelen: - min-name-length: 2 - gci: - sections: + - mnd + settings: + depguard: + rules: + main: + list-mode: lax + files: + - $all + deny: + - pkg: io/ioutil + desc: 'replaced by io and os packages since Go 1.16: https://tip.golang.org/doc/go1.16#ioutil' + varnamelen: + min-name-length: 2 + exclusions: + generated: lax + presets: + - comments + - common-false-positives + - legacy + - std-error-handling + rules: + - linters: + - goconst + path: (.+)_test\.go + paths: + - third_party$ + - builtin$ + - examples$ +formatters: + enable: + - gci + - gofmt + - gofumpt + - goimports + settings: + gci: + sections: - standard - default - localmodule - goimports: - local-prefixes: github.com/E4-Computer-Engineering/nvme_exporter - goconst: - ignore-tests: true - depguard: - rules: - main: - list-mode: lax - files: - - $all - deny: - - pkg: io/ioutil - desc: "replaced by io and os packages since Go 1.16: https://tip.golang.org/doc/go1.16#ioutil" + goimports: + local-prefixes: + - github.com/E4-Computer-Engineering/nvme_exporter + exclusions: + generated: lax + paths: + - third_party$ + - builtin$ + - examples$ From 94a553d22ae300e30344dfa5cb1f46eeedd5fc03 Mon Sep 17 00:00:00 2001 From: ilciko Date: Thu, 9 Oct 2025 11:21:34 +0200 Subject: [PATCH 3/3] golangco lint 
fixes --- .github/workflows/build.yml | 2 +- .golangci.bck.yml | 38 ------- .golangci.yml | 11 ++ cmd/collector.go | 6 +- cmd/main.go | 221 +++++++++++++++++++++--------------- pkg/collector.go | 18 +-- pkg/utils/command.go | 8 +- 7 files changed, 161 insertions(+), 143 deletions(-) delete mode 100644 .golangci.bck.yml diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 35c2b19..2e78ece 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -34,7 +34,7 @@ jobs: - name: Ensure go.mod is already tidied run: go mod tidy && git diff --no-patch --exit-code - name: Run linters - uses: golangci/golangci-lint-action@v6.3.2 + uses: golangci/golangci-lint-action@v8 with: # renovate: depName=golangci/golangci-lint datasource=github-releases version: v2.5.0 diff --git a/.golangci.bck.yml b/.golangci.bck.yml deleted file mode 100644 index 0611173..0000000 --- a/.golangci.bck.yml +++ /dev/null @@ -1,38 +0,0 @@ -# https://golangci-lint.run/usage/linters/ -linters: - enable-all: true - disable: - - exhaustruct - - exportloopref - - err113 - - funlen - - gochecknoglobals - - mnd - # reconsider - - ireturn - # reconsider - - maintidx - # reconsider - - forbidigo - -linters-settings: - varnamelen: - min-name-length: 2 - gci: - sections: - - standard - - default - - localmodule - goimports: - local-prefixes: github.com/E4-Computer-Engineering/nvme_exporter - goconst: - ignore-tests: true - depguard: - rules: - main: - list-mode: lax - files: - - $all - deny: - - pkg: io/ioutil - desc: "replaced by io and os packages since Go 1.16: https://tip.golang.org/doc/go1.16#ioutil" diff --git a/.golangci.yml b/.golangci.yml index 7f281dd..41f3803 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -10,6 +10,9 @@ linters: - ireturn - maintidx - mnd + - wsl + enable: + - wsl_v5 settings: depguard: rules: @@ -20,8 +23,16 @@ linters: deny: - pkg: io/ioutil desc: 'replaced by io and os packages since Go 1.16: https://tip.golang.org/doc/go1.16#ioutil' + 
revive: + rules: + - name: var-naming + disabled: true varnamelen: min-name-length: 2 + wsl_v5: + allow-first-in-block: true + allow-whole-block: false + branch-max-lines: 2 exclusions: generated: lax presets: diff --git a/cmd/collector.go b/cmd/collector.go index f349fc8..85ca44a 100644 --- a/cmd/collector.go +++ b/cmd/collector.go @@ -22,7 +22,8 @@ func getSmartLogData(devicePath string) gjson.Result { func getOcpSmartLogData(devicePath string) gjson.Result { ocpSmartLog, err := utils.ExecuteJSONCommand("nvme", "ocp", "smart-add-log", devicePath, "-o", "json") if err != nil { - log.Printf("OCP metrics not supported or error running smart-add-log %s -o json: %s (continuing with standard metrics)\n", devicePath, err) + log.Printf("OCP metrics not supported or error running smart-add-log %s -o json: %s "+ + "(continuing with standard metrics)\n", devicePath, err) // Return empty result instead of crashing return gjson.Result{} } @@ -122,7 +123,8 @@ func newNvmeCollector(collectorStates map[string]bool) prometheus.Collector { logMetricProviders := []pkg.MetricProvider{ gaugeValueFactory.CreateLogMetricProvider( "nvme_critical_warning", - "Critical warnings for the controller state. Bits indicate spare capacity, temperature, degraded reliability, or read-only mode", + "Critical warnings for the controller state. 
Bits indicate spare capacity, temperature, "+ + "degraded reliability, or read-only mode", "critical_warning", ), gaugeValueFactory.CreateLogMetricProvider( diff --git a/cmd/main.go b/cmd/main.go index 4660f44..bf70bde 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -23,6 +23,7 @@ const _minimumSupportedVersion = "2.8" var ( validationState = struct { sync.RWMutex + isValid bool errorMessage string }{ @@ -32,17 +33,18 @@ var ( scrapeFailuresTotal = prometheus.NewCounter( prometheus.CounterOpts{ Name: "nvme_exporter_scrape_failures_total", - Help: "Total number of scrape failures due to validation errors (not root, nvme-cli not found, or unsupported version)", + Help: "Total number of scrape failures due to validation errors " + + "(not root, nvme-cli not found, or unsupported version)", }, ) ) -// Collector represents a metric collector with enable/disable capability +// Collector represents a metric collector with enable/disable capability. type Collector struct { - name string - defaultState bool - enabled *bool - description string + name string + defaultState bool + enabled *bool + description string } var ( @@ -91,6 +93,7 @@ func isSupportedVersion(version string) bool { if vMajor > minMajor { return true } + if vMajor < minMajor { return false } @@ -101,6 +104,7 @@ func isSupportedVersion(version string) bool { func setValidationError(msg string) { validationState.Lock() defer validationState.Unlock() + validationState.isValid = false validationState.errorMessage = msg } @@ -108,14 +112,15 @@ func setValidationError(msg string) { func isValidationValid() bool { validationState.RLock() defer validationState.RUnlock() + return validationState.isValid } func initCollectorFlags() { // Register flags for each collector for name, collector := range collectors { - flagName := fmt.Sprintf("collector.%s", name) - noFlagName := fmt.Sprintf("no-collector.%s", name) + flagName := "collector." + name + noFlagName := "no-collector." 
+ name defaultValue := collector.defaultState @@ -138,10 +143,8 @@ func initCollectorFlags() { // Store both flags so we can resolve them later collectors[name] = collector - // Store the disable flag separately for later processing - if disableFlag != nil { - // We'll handle this in resolveCollectorStates - } + // The disable flag is handled in resolveCollectorStates + _ = disableFlag } } @@ -158,19 +161,20 @@ func resolveCollectorStates() map[string]bool { } // Check if explicit enable flag was set - enableFlagName := fmt.Sprintf("collector.%s", name) - disableFlagName := fmt.Sprintf("no-collector.%s", name) + enableFlagName := "collector." + name + disableFlagName := "no-collector." + name // Check if the flag was explicitly set explicitlyEnabled := false explicitlyDisabled := false - flag.Visit(func(f *flag.Flag) { - if f.Name == enableFlagName { + flag.Visit(func(flagItem *flag.Flag) { + if flagItem.Name == enableFlagName { explicitlyEnabled = true enabled = *collector.enabled } - if f.Name == disableFlagName { + + if flagItem.Name == disableFlagName { explicitlyDisabled = true } }) @@ -188,50 +192,107 @@ func resolveCollectorStates() map[string]bool { return states } +func printUsage() { + fmt.Println("nvme_exporter - Prometheus exporter for NVMe device metrics") + fmt.Println("\nExports NVMe SMART log and OCP SMART log metrics in Prometheus format.") + fmt.Println("\nDocumentation:") + fmt.Println(" NVMe SMART log specification (page 209):") + fmt.Println(" https://nvmexpress.org/wp-content/uploads/" + + "NVM-Express-Base-Specification-Revision-2.1-2024.08.05-Ratified.pdf") + fmt.Println(" OCP SMART log specification (page 24):") + fmt.Println(" https://www.opencompute.org/documents/datacenter-nvme-ssd-specification-v2-5-pdf") + fmt.Printf("\nMinimum supported nvme-cli version: %s\n", _minimumSupportedVersion) + fmt.Println("\nUsage: nvme_exporter [options]") + fmt.Println("\nWeb server options:") + fmt.Println(" --web.listen-address string") + fmt.Println(" 
Address on which to expose metrics and web interface (default \":9998\")") + fmt.Println(" --web.telemetry-path string") + fmt.Println(" Path under which to expose metrics (default \"/metrics\")") + fmt.Println("\nCollector options:") + fmt.Println(" --collector.") + fmt.Println(" Enable the specified collector (enabled by default)") + fmt.Println(" --no-collector.") + fmt.Println(" Disable the specified collector") + fmt.Println(" --collector.disable-defaults") + fmt.Println(" Disable all default collectors") + fmt.Println("\nAvailable collectors:") + + for name, collector := range collectors { + defaultStr := "" + if collector.defaultState { + defaultStr = " (enabled by default)" + } + + fmt.Printf(" %-10s %s%s\n", name, collector.description, defaultStr) + } + + fmt.Println("\nExamples:") + fmt.Println(" # Start with all default collectors on default port") + fmt.Println(" nvme_exporter") + fmt.Println("\n # Listen on a specific address and port") + fmt.Println(" nvme_exporter --web.listen-address=\":9100\"") + fmt.Println("\n # Disable OCP metrics collection") + fmt.Println(" nvme_exporter --no-collector.ocp") + fmt.Println("\n # Only collect SMART metrics (disable info and OCP)") + fmt.Println(" nvme_exporter --collector.disable-defaults --collector.smart") +} + +func validatePrerequisites() { + // Validate current user + err := utils.CheckCurrentUser("root") + if err != nil { + log.Printf("WARNING: current user is not root: %s", err.Error()) + log.Printf("WARNING: exporter will continue running but scrapes will fail") + setValidationError("not running as root: " + err.Error()) + + return + } + + // Check for nvme-cli version + validateNVMeCLI() +} + +func validateNVMeCLI() { + out, err := utils.ExecuteCommand("nvme", "--version") + if err != nil { + log.Printf("WARNING: nvme binary not found or error executing: %s", err.Error()) + log.Printf("WARNING: exporter will continue running but scrapes will fail") + setValidationError("nvme binary not available: " + 
err.Error()) + + return + } + + re := regexp.MustCompile(`nvme version (\d+\.\d+)`) + match := re.FindStringSubmatch(out) + + if match == nil { + log.Printf("WARNING: Unable to find NVMe CLI version in output: %s", out) + log.Printf("WARNING: exporter will continue running but scrapes may fail") + setValidationError("unable to parse nvme-cli version from output: " + out) + + return + } + + version := match[1] + if !isSupportedVersion(version) { + log.Printf("WARNING: NVMe cli version %s not supported, minimum required version is %s", + version, _minimumSupportedVersion) + log.Printf("WARNING: exporter will continue running but scrapes may fail or produce incorrect data") + setValidationError(fmt.Sprintf("unsupported nvme-cli version %s (minimum: %s)", + version, _minimumSupportedVersion)) + + return + } + + log.Printf("NVMe cli version %s detected and supported", version) +} + func main() { // Initialize collector flags before parsing initCollectorFlags() - flag.Usage = func() { - fmt.Println("nvme_exporter - Prometheus exporter for NVMe device metrics") - fmt.Println("\nExports NVMe SMART log and OCP SMART log metrics in Prometheus format.") - fmt.Println("\nDocumentation:") - fmt.Println(" NVMe SMART log specification (page 209):") - fmt.Println(" https://nvmexpress.org/wp-content/uploads/NVM-Express-Base-Specification-Revision-2.1-2024.08.05-Ratified.pdf") - fmt.Println(" OCP SMART log specification (page 24):") - fmt.Println(" https://www.opencompute.org/documents/datacenter-nvme-ssd-specification-v2-5-pdf") - fmt.Printf("\nMinimum supported nvme-cli version: %s\n", _minimumSupportedVersion) - fmt.Println("\nUsage: nvme_exporter [options]") - fmt.Println("\nWeb server options:") - fmt.Println(" --web.listen-address string") - fmt.Println(" Address on which to expose metrics and web interface (default \":9998\")") - fmt.Println(" --web.telemetry-path string") - fmt.Println(" Path under which to expose metrics (default \"/metrics\")") - fmt.Println("\nCollector 
options:") - fmt.Println(" --collector.") - fmt.Println(" Enable the specified collector (enabled by default)") - fmt.Println(" --no-collector.") - fmt.Println(" Disable the specified collector") - fmt.Println(" --collector.disable-defaults") - fmt.Println(" Disable all default collectors") - fmt.Println("\nAvailable collectors:") - for name, collector := range collectors { - defaultStr := "" - if collector.defaultState { - defaultStr = " (enabled by default)" - } - fmt.Printf(" %-10s %s%s\n", name, collector.description, defaultStr) - } - fmt.Println("\nExamples:") - fmt.Println(" # Start with all default collectors on default port") - fmt.Println(" nvme_exporter") - fmt.Println("\n # Listen on a specific address and port") - fmt.Println(" nvme_exporter --web.listen-address=\":9100\"") - fmt.Println("\n # Disable OCP metrics collection") - fmt.Println(" nvme_exporter --no-collector.ocp") - fmt.Println("\n # Only collect SMART metrics (disable info and OCP)") - fmt.Println(" nvme_exporter --collector.disable-defaults --collector.smart") - } + flag.Usage = printUsage + // Define flags following Prometheus node_exporter conventions listenAddress := flag.String("web.listen-address", ":9998", "Address on which to expose metrics and web interface") metricsPath := flag.String("web.telemetry-path", "/metrics", "Path under which to expose metrics") @@ -246,44 +307,14 @@ func main() { prometheus.MustRegister(scrapeFailuresTotal) // Validate prerequisites - log errors but don't exit - err := utils.CheckCurrentUser("root") - if err != nil { - log.Printf("WARNING: current user is not root: %s", err.Error()) - log.Printf("WARNING: exporter will continue running but scrapes will fail") - setValidationError(fmt.Sprintf("not running as root: %s", err.Error())) - } - - // check for nvme-cli version - out, err := utils.ExecuteCommand("nvme", "--version") - if err != nil { - log.Printf("WARNING: nvme binary not found or error executing: %s", err.Error()) - log.Printf("WARNING: 
exporter will continue running but scrapes will fail") - setValidationError(fmt.Sprintf("nvme binary not available: %s", err.Error())) - } else { - re := regexp.MustCompile(`nvme version (\d+\.\d+)`) - match := re.FindStringSubmatch(out) - - if match != nil { - version := match[1] - if !isSupportedVersion(version) { - log.Printf("WARNING: NVMe cli version %s not supported, minimum required version is %s", version, _minimumSupportedVersion) - log.Printf("WARNING: exporter will continue running but scrapes may fail or produce incorrect data") - setValidationError(fmt.Sprintf("unsupported nvme-cli version %s (minimum: %s)", version, _minimumSupportedVersion)) - } else { - log.Printf("NVMe cli version %s detected and supported", version) - } - } else { - log.Printf("WARNING: Unable to find NVMe CLI version in output: %s", out) - log.Printf("WARNING: exporter will continue running but scrapes may fail") - setValidationError(fmt.Sprintf("unable to parse nvme-cli version from output: %s", out)) - } - } + validatePrerequisites() // Resolve collector states based on flags collectorStates := resolveCollectorStates() // Log enabled collectors log.Printf("Enabled collectors:") + for name, enabled := range collectorStates { if enabled { log.Printf(" - %s", name) @@ -300,13 +331,15 @@ func main() { http.Handle(*metricsPath, promhttp.Handler()) // Add a landing page like node_exporter - http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) { - if r.URL.Path != "/" { - http.NotFound(w, r) + http.HandleFunc("/", func(writer http.ResponseWriter, request *http.Request) { + if request.URL.Path != "/" { + http.NotFound(writer, request) + return } - w.Header().Set("Content-Type", "text/html; charset=utf-8") - fmt.Fprintf(w, ` + + writer.Header().Set("Content-Type", "text/html; charset=utf-8") + fmt.Fprintf(writer, ` NVMe Exporter

NVMe Exporter

diff --git a/pkg/collector.go b/pkg/collector.go index df16880..24cc1d2 100644 --- a/pkg/collector.go +++ b/pkg/collector.go @@ -19,16 +19,20 @@ func GetDevices() []gjson.Result { if scrapeFailureIncrementer != nil { scrapeFailureIncrementer() } + log.Printf("Skipping device query due to validation failure") + return []gjson.Result{} } devicesJSON, err := utils.ExecuteJSONCommand("nvme", "list", "-o", "json") if err != nil { log.Printf("Error running nvme list -o json: %s\n", err) + if scrapeFailureIncrementer != nil { scrapeFailureIncrementer() } + return []gjson.Result{} } @@ -155,7 +159,7 @@ func (ic *InfoMetricCollector) CollectMetrics(ch chan<- prometheus.Metric, devic } } -// InfoMetricCollector implements MetricCollector and sends smart log metrics. +// LogMetricCollector implements MetricCollector and sends smart log metrics. type LogMetricCollector struct { // LogMetricProviders is the list of providers for the log metric collector LogMetricProviders []MetricProvider @@ -179,7 +183,7 @@ func (lc *LogMetricCollector) Describe(ch chan<- *prometheus.Desc) { } } -// Collect gets the smart log data and sends all log metrics through the channel. +// CollectMetrics gets the smart log data and sends all log metrics through the channel. func (lc *LogMetricCollector) CollectMetrics(ch chan<- prometheus.Metric, device gjson.Result) { devicePath := device.Get("DevicePath").String() @@ -230,23 +234,23 @@ func (cc *CompositeCollector) Collect(ch chan<- prometheus.Metric) { } } -// SetValidationChecker allows injecting a validation check function -// to determine if scrapes should proceed +// ValidationChecker allows injecting a validation check function +// to determine if scrapes should proceed. type ValidationChecker func() bool var validationChecker ValidationChecker -// SetValidationChecker sets the global validation checker +// SetValidationChecker sets the global validation checker. 
func SetValidationChecker(checker ValidationChecker) { validationChecker = checker } -// IncrementScrapeFailure is called when a scrape fails due to validation errors +// ScrapeFailureIncrementer is called when a scrape fails due to validation errors. type ScrapeFailureIncrementer func() var scrapeFailureIncrementer ScrapeFailureIncrementer -// SetScrapeFailureIncrementer sets the global scrape failure incrementer +// SetScrapeFailureIncrementer sets the global scrape failure incrementer. func SetScrapeFailureIncrementer(incrementer ScrapeFailureIncrementer) { scrapeFailureIncrementer = incrementer } diff --git a/pkg/utils/command.go b/pkg/utils/command.go index f9571d6..de8dfa8 100644 --- a/pkg/utils/command.go +++ b/pkg/utils/command.go @@ -1,11 +1,13 @@ package utils import ( + "context" "encoding/json" "fmt" "os/exec" "os/user" "strings" + "time" "github.com/tidwall/gjson" ) @@ -25,7 +27,10 @@ func ExecuteCommand(cmd string, args ...string) (string, error) { return "", fmt.Errorf("error looking for %s cli command in path: %w", cmd, err) } - command := exec.Command(cmd, args...) + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + + command := exec.CommandContext(ctx, cmd, args...) out, err := command.CombinedOutput() if err != nil { @@ -75,5 +80,6 @@ func MapToJSONString(data map[string]interface{}) string { if err != nil { return "{}" } + return string(jsonBytes) }