diff --git a/.goreleaser.yml b/.goreleaser.yml index 0d374b7..e1e4bd0 100644 --- a/.goreleaser.yml +++ b/.goreleaser.yml @@ -6,7 +6,7 @@ before: builds: - id: nvme_exporter - main: ./cmd/nvme_exporter + main: ./cmd binary: nvme_exporter ldflags: - -s -w diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..dedde0a --- /dev/null +++ b/Makefile @@ -0,0 +1,2 @@ +build: + go build -o ./nvme_exporter ./nvme_exporter \ No newline at end of file diff --git a/cmd/collector.go b/cmd/collector.go new file mode 100644 index 0000000..4a68d0b --- /dev/null +++ b/cmd/collector.go @@ -0,0 +1,449 @@ +package main + +import ( + "log" + + "github.com/prometheus/client_golang/prometheus" + "github.com/tidwall/gjson" + + "github.com/E4-Computer-Engineering/nvme_exporter/pkg" + "github.com/E4-Computer-Engineering/nvme_exporter/pkg/utils" +) + +func getSmartLogData(devicePath string) gjson.Result { + smartLog, err := utils.ExecuteJSONCommand("nvme", "smart-log", devicePath, "-o", "json") + if err != nil { + log.Printf("Error running smart-log %s -o json: %s\n", devicePath, err) + } + + return smartLog +} + +func getOcpSmartLogData(devicePath string) gjson.Result { + ocpSmartLog, err := utils.ExecuteJSONCommand("nvme", "ocp", "smart-add-log", devicePath, "-o", "json") + if err != nil { + log.Printf("Error running smart-add-log %s -o json: %s\n", devicePath, err) + } + + return ocpSmartLog +} + +type ProviderFactory struct { + valueType prometheus.ValueType + defaultLabels []string +} + +func (f *ProviderFactory) CreateLogMetricProvider( + fqName string, + help string, + jsonKey string, +) pkg.MetricProvider { + return pkg.NewMetricProvider( + prometheus.NewDesc( + fqName, + help, + f.defaultLabels, + nil, + ), + f.valueType, + jsonKey, + ) +} + +func (f *ProviderFactory) CreateInfoMetricProvider( + fqName string, + help string, + jsonKey string, + infoLabels []string, +) pkg.MetricProvider { + return pkg.NewMetricProvider( + prometheus.NewDesc( + fqName, + help, + 
infoLabels, + nil, + ), + f.valueType, + jsonKey, + ) +} + +func newNvmeCollector(ocpEnabled bool) prometheus.Collector { + labels := []string{"device"} + infoLabels := []string{"device", "generic_path", "firmware", "model_number", "serial_number"} + + gaugeValueFactory := ProviderFactory{ + valueType: prometheus.GaugeValue, + defaultLabels: labels, + } + + counterValueFactory := ProviderFactory{ + valueType: prometheus.CounterValue, + defaultLabels: labels, + } + + // Info metrics + infoMetricProviders := []pkg.MetricProvider{ + gaugeValueFactory.CreateInfoMetricProvider( + "nvme_namespace", + "", + "NameSpace", + infoLabels, + ), + gaugeValueFactory.CreateInfoMetricProvider( + "nvme_used_bytes", + "", + "UsedBytes", + infoLabels, + ), + gaugeValueFactory.CreateInfoMetricProvider( + "nvme_maximum_lba", + "", + "MaximumLBA", + infoLabels, + ), + gaugeValueFactory.CreateInfoMetricProvider( + "nvme_physical_size", + "", + "PhysicalSize", + infoLabels, + ), + gaugeValueFactory.CreateInfoMetricProvider( + "nvme_sector_size", + "", + "SectorSize", + infoLabels, + ), + } + + // Smart-log metrics + logMetricProviders := []pkg.MetricProvider{ + gaugeValueFactory.CreateLogMetricProvider( + "nvme_critical_warning", + "Critical warnings for the state of the controller", + "critical_warning", + ), + gaugeValueFactory.CreateLogMetricProvider( + "nvme_temperature", + "Temperature in degrees fahrenheit", + "temperature", + ), + gaugeValueFactory.CreateLogMetricProvider( + "nvme_avail_spare", + "Normalized percentage of remaining spare capacity available", + "avail_spare", + ), + gaugeValueFactory.CreateLogMetricProvider( + "nvme_spare_thresh", + "Async event completion may occur when avail spare < threshold", + "spare_thresh", + ), + gaugeValueFactory.CreateLogMetricProvider( + "nvme_percent_used", + "Vendor specific estimate of the percentage of life used", + "percent_used", + ), + gaugeValueFactory.CreateLogMetricProvider( + "nvme_endurance_grp_critical_warning_summary", + 
"Critical warnings for the state of endurance groups", + "endurance_grp_critical_warning_summary", + ), + + counterValueFactory.CreateLogMetricProvider( + "nvme_data_units_read", + "Number of 512 byte data units host has read", + "data_units_read", + ), + + counterValueFactory.CreateLogMetricProvider( + "nvme_data_units_written", + "Number of 512 byte data units the host has written", + "data_units_written", + ), + + counterValueFactory.CreateLogMetricProvider( + "nvme_host_read_commands", + "Number of read commands completed", + "host_read_commands", + ), + + counterValueFactory.CreateLogMetricProvider( + "nvme_host_write_commands", + "Number of write commands completed", + "host_write_commands", + ), + + counterValueFactory.CreateLogMetricProvider( + "nvme_controller_busy_time", + "Amount of time in minutes controller busy with IO commands", + "controller_busy_time", + ), + + counterValueFactory.CreateLogMetricProvider( + "nvme_power_cycles", + "Number of power cycles", + "power_cycles", + ), + + counterValueFactory.CreateLogMetricProvider( + "nvme_power_on_hours", + "Number of power on hours", + "power_on_hours", + ), + + counterValueFactory.CreateLogMetricProvider( + "nvme_unsafe_shutdowns", + "Number of unsafe shutdowns", + "unsafe_shutdowns", + ), + + counterValueFactory.CreateLogMetricProvider( + "nvme_media_errors", + "Number of unrecovered data integrity errors", + "media_errors", + ), + + counterValueFactory.CreateLogMetricProvider( + "nvme_num_err_log_entries", + "Lifetime number of error log entries", + "num_err_log_entries", + ), + + counterValueFactory.CreateLogMetricProvider( + "nvme_warning_temp_time", + "Amount of time in minutes temperature > warning threshold", + "warning_temp_time", + ), + + counterValueFactory.CreateLogMetricProvider( + "nvme_critical_comp_time", + "Amount of time in minutes temperature > critical threshold", + "critical_comp_time", + ), + + counterValueFactory.CreateLogMetricProvider( + "nvme_thm_temp1_trans_count", + "Number 
of times controller transitioned to lower power", + "thm_temp1_trans_count", + ), + + counterValueFactory.CreateLogMetricProvider( + "nvme_thm_temp2_trans_count", + "Number of times controller transitioned to lower power", + "thm_temp2_trans_count", + ), + + counterValueFactory.CreateLogMetricProvider( + "nvme_thm_temp1_trans_time", + "Total number of seconds controller transitioned to lower power", + "thm_temp1_total_time", + ), + + counterValueFactory.CreateLogMetricProvider( + "nvme_thm_temp2_trans_time", + "Total number of seconds controller transitioned to lower power", + "thm_temp2_total_time", + ), + } + + // OCP smart-log metrics + ocpLogMetricProviders := []pkg.MetricProvider{ + counterValueFactory.CreateLogMetricProvider( + "nvme_physical_media_units_written_hi", + "Physical meda units written high", + "Physical media units written.hi", + ), + counterValueFactory.CreateLogMetricProvider( + "nvme_physical_media_units_written_lo", + "Physical meda units written low", + "Physical media units written.lo", + ), + counterValueFactory.CreateLogMetricProvider( + "nvme_physical_media_units_read_hi", + "Physical meda units read high", + "Physical media units read.hi", + ), + counterValueFactory.CreateLogMetricProvider( + "nvme_physical_media_units_read_lo", + "Physical meda units read low", + "Physical media units read.lo", + ), + counterValueFactory.CreateLogMetricProvider( + "nvme_bad_user_nand_blocks_raw", + "", + "Bad user nand blocks - Raw", + ), + counterValueFactory.CreateLogMetricProvider( + "nvme_bad_user_nand_blocks_normalized", + "", + "Bad user nand blocks - Normalized", + ), + counterValueFactory.CreateLogMetricProvider( + "nvme_bad_system_nand_blocks_raw", + "", + "Bad system nand blocks - Raw", + ), + counterValueFactory.CreateLogMetricProvider( + "nvme_bad_system_nand_blocks_normalized", + "", + "Bad system nand blocks - Normalized", + ), + counterValueFactory.CreateLogMetricProvider( + "nvme_xor_recovery_count", + "", + "XOR recovery count", + ), + 
counterValueFactory.CreateLogMetricProvider( + "nvme_uncorrectable_uead_error_count", + "", + "Uncorrectable read error count", + ), + counterValueFactory.CreateLogMetricProvider( + "nvme_soft_ecc_error_count", + "", + "Soft ecc error count", + ), + counterValueFactory.CreateLogMetricProvider( + "nvme_end_to_end_detected_errors", + "", + "End to end detected errors", + ), + counterValueFactory.CreateLogMetricProvider( + "nvme_end_to_end_corrected_errors", + "", + "End to end corrected errors", + ), + gaugeValueFactory.CreateLogMetricProvider( + "nvme_system_data_percent_used", + "", + "System data percent used", + ), + counterValueFactory.CreateLogMetricProvider( + "nvme_refresh_counts", + "", + "Refresh counts", + ), + counterValueFactory.CreateLogMetricProvider( + "nvme_max_user_data_erase_counts", + "", + "Max User data erase counts", + ), + counterValueFactory.CreateLogMetricProvider( + "nvme_min_user_data_erase_counts", + "", + "Min User data erase counts", + ), + counterValueFactory.CreateLogMetricProvider( + "nvme_number_of_thermal_throttling_events", + "", + "Number of Thermal throttling events", + ), + gaugeValueFactory.CreateLogMetricProvider( + "nvme_current_throttling_status", + "", + "Current throttling status", + ), + counterValueFactory.CreateLogMetricProvider( + "nvme_pcie_correctable_error_count", + "", + "PCIe correctable error count", + ), + counterValueFactory.CreateLogMetricProvider( + "nvme_incomplete_shutdowns", + "", + "Incomplete shutdowns", + ), + gaugeValueFactory.CreateLogMetricProvider( + "nvme_percent_free_blocks", + "", + "Percent free blocks", + ), + gaugeValueFactory.CreateLogMetricProvider( + "nvme_capacitor_health", + "", + "Capacitor health", + ), + counterValueFactory.CreateLogMetricProvider( + "nvme_unaligned_io", + "", + "Unaligned I/O", + ), + gaugeValueFactory.CreateLogMetricProvider( + "nvme_security_version_number", + "", + "Security Version Number", + ), + gaugeValueFactory.CreateLogMetricProvider( + 
"nvme_nuse_namespace_utilization", + "", + "NUSE - Namespace utilization", + ), + counterValueFactory.CreateLogMetricProvider( + "nvme_plp_start_count", + "", + "PLP start count", + ), + gaugeValueFactory.CreateLogMetricProvider( + "nvme_endurance_estimate", + "", + "Endurance estimate", + ), + gaugeValueFactory.CreateLogMetricProvider( + "nvme_log_page_version", + "", + "Log page version", + ), + gaugeValueFactory.CreateLogMetricProvider( + "nvme_log_page_guid", + "", + "Log page GUID", + ), + gaugeValueFactory.CreateLogMetricProvider( + "nvme_errata_version_field", + "", + "Errata Version Field", + ), + gaugeValueFactory.CreateLogMetricProvider( + "nvme_point_version_field", + "", + "Point Version Field", + ), + gaugeValueFactory.CreateLogMetricProvider( + "nvme_minor_version_field", + "", + "Minor Version Field", + ), + gaugeValueFactory.CreateLogMetricProvider( + "nvme_major_version_field", + "", + "Major Version Field", + ), + gaugeValueFactory.CreateLogMetricProvider( + "nvme_nvme_errata_version", + "", + "NVMe Errata Version", + ), + counterValueFactory.CreateLogMetricProvider( + "nvme_pcie_link_retraining_count", + "", + "PCIe Link Retraining Count", + ), + counterValueFactory.CreateLogMetricProvider( + "nvme_power_state_change_count", + "", + "Power State Change Count", + ), + } + + // the info and smart-log collectors are always present + collectors := []pkg.MetricCollector{ + pkg.NewInfoMetricCollector(infoMetricProviders), + pkg.NewLogMetricCollector(logMetricProviders, getSmartLogData), + } + + if ocpEnabled { + collectors = append(collectors, pkg.NewLogMetricCollector(ocpLogMetricProviders, getOcpSmartLogData)) + } + + return pkg.NewCompositeCollector(collectors) +} diff --git a/cmd/main.go b/cmd/main.go new file mode 100644 index 0000000..de5d047 --- /dev/null +++ b/cmd/main.go @@ -0,0 +1,84 @@ +package main + +import ( + "flag" + "fmt" + "log" + "net/http" + "regexp" + "strings" + "time" + + "github.com/prometheus/client_golang/prometheus" + 
"github.com/prometheus/client_golang/prometheus/promhttp" + + "github.com/E4-Computer-Engineering/nvme_exporter/pkg/utils" +) + +var _supportedVersions = map[string]bool{ + "2.9": true, + "2.10": true, + "2.11": true, +} + +func isSupportedVersion(version string) bool { + _, ok := _supportedVersions[version] + + return ok +} + +func main() { + flag.Usage = func() { + fmt.Println("nvme_exporter - Exports NVMe smart-log and smart-ocp-log metrics in Prometheus format") + fmt.Println("Validated with nvme smart-log field descriptions can be found on page 209 of:") + fmt.Println( + "https://nvmexpress.org/wp-content/uploads/NVM-Express-Base-Specification-Revision-2.1-2024.08.05-Ratified.pdf") + fmt.Println("Validated with nvme ocp-smart-log field descriptions can be found on page 24 of:") + fmt.Println("https://www.opencompute.org/documents/datacenter-nvme-ssd-specification-v2-5-pdf */") + fmt.Printf("It has been tested with nvme-cli versions:%v\n", _supportedVersions) + fmt.Println("Usage: nvme_exporter [options]") + flag.PrintDefaults() + } + port := flag.String("port", "9998", "port to listen on") + ocp := flag.Bool("ocp", false, "Enable OCP smart log metrics") + endpoint := flag.String("endpoint", "/metrics", "Specify the endpoint to expose metrics") + flag.Parse() + + if !strings.HasPrefix(*endpoint, "/") { + *endpoint = "/" + *endpoint + } + + err := utils.CheckCurrentUser("root") + if err != nil { + log.Printf("current user is not root: %s", err.Error()) + } + + // check for nvme-cli version + out, err := utils.ExecuteCommand("nvme", "--version") + if err != nil { + log.Fatal(err.Error()) + } + + re := regexp.MustCompile(`nvme version (\d+\.\d+)\.\d+`) + match := re.FindStringSubmatch(out) + + if match != nil { + version := match[1] + if !isSupportedVersion(version) { + log.Printf("NVMe cli version %s not supported, supported versions are: %v", version, _supportedVersions) + } + } else { + log.Fatalf("Unable to find NVMe CLI version in output: %s", out) + } + + 
prometheus.MustRegister(newNvmeCollector(*ocp)) + http.Handle(*endpoint, promhttp.Handler()) + log.Printf("Starting newNvmeCollector on port: %s, metrics endpoint: %s\n", *port, *endpoint) + log.Printf("newNvmeCollector is collecting OCP smart-log metrics: %t\n", *ocp) + + server := &http.Server{ + Addr: ":" + *port, + ReadHeaderTimeout: 3 * time.Second, + } + log.Fatal(server.ListenAndServe()) +} diff --git a/cmd/nvme_exporter/main.go b/cmd/nvme_exporter/main.go deleted file mode 100644 index 55481a4..0000000 --- a/cmd/nvme_exporter/main.go +++ /dev/null @@ -1,889 +0,0 @@ -package main - -import ( - "flag" - "fmt" - "log" - "net/http" - "os/exec" - "os/user" - "regexp" - "strings" - "time" - - "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/client_golang/prometheus/promhttp" - "github.com/tidwall/gjson" -) - -var _supportedVersions = map[string]bool{ - "2.9": true, - "2.10": true, - "2.11": true, -} - -func isSupportedVersion(version string) bool { - _, ok := _supportedVersions[version] - - return ok -} - -type nvmeCollector struct { - ocp bool - nvmeCriticalWarning *prometheus.Desc - nvmeTemperature *prometheus.Desc - nvmeAvailSpare *prometheus.Desc - nvmeSpareThresh *prometheus.Desc - nvmePercentUsed *prometheus.Desc - nvmeEnduranceGrpCriticalWarningSummary *prometheus.Desc - nvmeDataUnitsRead *prometheus.Desc - nvmeDataUnitsWritten *prometheus.Desc - nvmeHostReadCommands *prometheus.Desc - nvmeHostWriteCommands *prometheus.Desc - nvmeControllerBusyTime *prometheus.Desc - nvmePowerCycles *prometheus.Desc - nvmePowerOnHours *prometheus.Desc - nvmeUnsafeShutdowns *prometheus.Desc - nvmeMediaErrors *prometheus.Desc - nvmeNumErrLogEntries *prometheus.Desc - nvmeWarningTempTime *prometheus.Desc - nvmeCriticalCompTime *prometheus.Desc - nvmeThmTemp1TransCount *prometheus.Desc - nvmeThmTemp2TransCount *prometheus.Desc - nvmeThmTemp1TotalTime *prometheus.Desc - nvmeThmTemp2TotalTime *prometheus.Desc - nvmePhysicalMediaUnitsWrittenHi 
*prometheus.Desc - nvmePhysicalMediaUnitsWrittenLo *prometheus.Desc - nvmePhysicalMediaUnitsReadHi *prometheus.Desc - nvmePhysicalMediaUnitsReadLo *prometheus.Desc - nvmeBadUserNandBlocksRaw *prometheus.Desc - nvmeBadUserNandBlocksNormalized *prometheus.Desc - nvmeBadSystemNandBlocksRaw *prometheus.Desc - nvmeBadSystemNandBlocksNormalized *prometheus.Desc - nvmeXorRecoveryCount *prometheus.Desc - nvmeUncorrectableReadErrorCount *prometheus.Desc - nvmeSoftEccErrorCount *prometheus.Desc - nvmeEndToEndDetectedErrors *prometheus.Desc - nvmeEndToEndCorrectedErrors *prometheus.Desc - nvmeSystemDataPercentUsed *prometheus.Desc - nvmeRefreshCounts *prometheus.Desc - nvmeMaxUserDataEraseCounts *prometheus.Desc - nvmeMinUserDataEraseCounts *prometheus.Desc - nvmeNumberOfThermalThrottlingEvents *prometheus.Desc - nvmeCurrentThrottlingStatus *prometheus.Desc - nvmePcieCorrectableErrorCount *prometheus.Desc - nvmeIncompleteShutdowns *prometheus.Desc - nvmePercentFreeBlocks *prometheus.Desc - nvmeCapacitorHealth *prometheus.Desc - nvmeUnalignedIo *prometheus.Desc - nvmeSecurityVersionNumber *prometheus.Desc - nvmeNuseNamespaceUtilization *prometheus.Desc - nvmePlpStartCount *prometheus.Desc - nvmeEnduranceEstimate *prometheus.Desc - nvmeLogPageVersion *prometheus.Desc - nvmeLogPageGUID *prometheus.Desc - nvmeErrataVersionField *prometheus.Desc - nvmePointVersionField *prometheus.Desc - nvmeMinorVersionField *prometheus.Desc - nvmeMajorVersionField *prometheus.Desc - nvmeNvmeErrataVersion *prometheus.Desc - nvmePcieLinkRetrainingCount *prometheus.Desc - nvmePowerStateChangeCount *prometheus.Desc - nvmeNameSpace *prometheus.Desc - nvmeUsedBytes *prometheus.Desc - nvmeMaximumLba *prometheus.Desc - nvmePhysicalSize *prometheus.Desc - nvmeSectorSize *prometheus.Desc -} - -func newNvmeCollector(ocp bool) prometheus.Collector { - labels := []string{"device"} - infoLabels := []string{"device", "generic_path", "firmware", "model_number", "serial_number"} - - return &nvmeCollector{ - ocp: 
ocp, - nvmeCriticalWarning: prometheus.NewDesc( - "nvme_critical_warning", - "Critical warnings for the state of the controller", - labels, - nil, - ), - nvmeTemperature: prometheus.NewDesc( - "nvme_temperature", - "Temperature in degrees fahrenheit", - labels, - nil, - ), - nvmeAvailSpare: prometheus.NewDesc( - "nvme_avail_spare", - "Normalized percentage of remaining spare capacity available", - labels, - nil, - ), - nvmeSpareThresh: prometheus.NewDesc( - "nvme_spare_thresh", - "Async event completion may occur when avail spare < threshold", - labels, - nil, - ), - nvmePercentUsed: prometheus.NewDesc( - "nvme_percent_used", - "Vendor specific estimate of the percentage of life used", - labels, - nil, - ), - nvmeEnduranceGrpCriticalWarningSummary: prometheus.NewDesc( - "nvme_endurance_grp_critical_warning_summary", - "Critical warnings for the state of endurance groups", - labels, - nil, - ), - nvmeDataUnitsRead: prometheus.NewDesc( - "nvme_data_units_read", - "Number of 512 byte data units host has read", - labels, - nil, - ), - nvmeDataUnitsWritten: prometheus.NewDesc( - "nvme_data_units_written", - "Number of 512 byte data units the host has written", - labels, - nil, - ), - nvmeHostReadCommands: prometheus.NewDesc( - "nvme_host_read_commands", - "Number of read commands completed", - labels, - nil, - ), - nvmeHostWriteCommands: prometheus.NewDesc( - "nvme_host_write_commands", - "Number of write commands completed", - labels, - nil, - ), - nvmeControllerBusyTime: prometheus.NewDesc( - "nvme_controller_busy_time", - "Amount of time in minutes controller busy with IO commands", - labels, - nil, - ), - nvmePowerCycles: prometheus.NewDesc( - "nvme_power_cycles", - "Number of power cycles", - labels, - nil, - ), - nvmePowerOnHours: prometheus.NewDesc( - "nvme_power_on_hours", - "Number of power on hours", - labels, - nil, - ), - nvmeUnsafeShutdowns: prometheus.NewDesc( - "nvme_unsafe_shutdowns", - "Number of unsafe shutdowns", - labels, - nil, - ), - 
nvmeMediaErrors: prometheus.NewDesc( - "nvme_media_errors", - "Number of unrecovered data integrity errors", - labels, - nil, - ), - nvmeNumErrLogEntries: prometheus.NewDesc( - "nvme_num_err_log_entries", - "Lifetime number of error log entries", - labels, - nil, - ), - nvmeWarningTempTime: prometheus.NewDesc( - "nvme_warning_temp_time", - "Amount of time in minutes temperature > warning threshold", - labels, - nil, - ), - nvmeCriticalCompTime: prometheus.NewDesc( - "nvme_critical_comp_time", - "Amount of time in minutes temperature > critical threshold", - labels, - nil, - ), - nvmeThmTemp1TransCount: prometheus.NewDesc( - "nvme_thm_temp1_trans_count", - "Number of times controller transitioned to lower power", - labels, - nil, - ), - nvmeThmTemp2TransCount: prometheus.NewDesc( - "nvme_thm_temp2_trans_count", - "Number of times controller transitioned to lower power", - labels, - nil, - ), - nvmeThmTemp1TotalTime: prometheus.NewDesc( - "nvme_thm_temp1_trans_time", - "Total number of seconds controller transitioned to lower power", - labels, - nil, - ), - nvmeThmTemp2TotalTime: prometheus.NewDesc( - "nvme_thm_temp2_trans_time", - "Total number of seconds controller transitioned to lower power", - labels, - nil, - ), - nvmePhysicalMediaUnitsWrittenHi: prometheus.NewDesc( - "nvme_physical_media_units_written_hi", - "Physical meda units written high", - labels, - nil, - ), - nvmePhysicalMediaUnitsWrittenLo: prometheus.NewDesc( - "nvme_physical_media_units_written_lo", - "Physical meda units written low", - labels, - nil, - ), - nvmePhysicalMediaUnitsReadHi: prometheus.NewDesc( - "nvme_physical_media_units_read_hi", - "Physical meda units read high", - labels, - nil, - ), - nvmePhysicalMediaUnitsReadLo: prometheus.NewDesc( - "nvme_physical_media_units_read_lo", - "Physical meda units read low", - labels, - nil, - ), - nvmeBadUserNandBlocksRaw: prometheus.NewDesc( - "nvme_bad_user_nand_blocks_raw", - "", - labels, - nil, - ), - nvmeBadUserNandBlocksNormalized: 
prometheus.NewDesc( - "nvme_bad_user_nand_blocks_normalized", - "", - labels, - nil, - ), - nvmeBadSystemNandBlocksRaw: prometheus.NewDesc( - "nvme_bad_system_nand_blocks_raw", - "", - labels, - nil, - ), - nvmeBadSystemNandBlocksNormalized: prometheus.NewDesc( - "nvme_bad_system_nand_blocks_normalized", - "", - labels, - nil, - ), - nvmeXorRecoveryCount: prometheus.NewDesc( - "nvme_xor_recovery_count", - "", - labels, - nil, - ), - nvmeUncorrectableReadErrorCount: prometheus.NewDesc( - "nvme_uncorrectable_uead_error_count", - "", - labels, - nil, - ), - nvmeSoftEccErrorCount: prometheus.NewDesc( - "nvme_soft_ecc_error_count", - "", - labels, - nil, - ), - nvmeEndToEndDetectedErrors: prometheus.NewDesc( - "nvme_end_to_end_detected_errors", - "", - labels, - nil, - ), - nvmeEndToEndCorrectedErrors: prometheus.NewDesc( - "nvme_end_to_end_corrected_errors", - "", - labels, - nil, - ), - nvmeSystemDataPercentUsed: prometheus.NewDesc( - "nvme_system_data_percent_used", - "", - labels, - nil, - ), - nvmeRefreshCounts: prometheus.NewDesc( - "nvme_refresh_counts", - "", - labels, - nil, - ), - nvmeMaxUserDataEraseCounts: prometheus.NewDesc( - "nvme_max_user_data_erase_counts", - "", - labels, - nil, - ), - nvmeMinUserDataEraseCounts: prometheus.NewDesc( - "nvme_min_user_data_erase_counts", - "", - labels, - nil, - ), - nvmeNumberOfThermalThrottlingEvents: prometheus.NewDesc( - "nvme_number_of_thermal_throttling_events", - "", - labels, - nil, - ), - nvmeCurrentThrottlingStatus: prometheus.NewDesc( - "nvme_current_throttling_status", - "", - labels, - nil, - ), - nvmePcieCorrectableErrorCount: prometheus.NewDesc( - "nvme_pcie_correctable_error_count", - "", - labels, - nil, - ), - nvmeIncompleteShutdowns: prometheus.NewDesc( - "nvme_incomplete_shutdowns", - "", - labels, - nil, - ), - nvmePercentFreeBlocks: prometheus.NewDesc( - "nvme_percent_free_blocks", - "", - labels, - nil, - ), - nvmeCapacitorHealth: prometheus.NewDesc( - "nvme_capacitor_health", - "", - labels, - 
nil, - ), - nvmeUnalignedIo: prometheus.NewDesc( - "nvme_unaligned_io", - "", - labels, - nil, - ), - nvmeSecurityVersionNumber: prometheus.NewDesc( - "nvme_security_version_number", - "", - labels, - nil, - ), - nvmeNuseNamespaceUtilization: prometheus.NewDesc( - "nvme_nuse_namespace_utilization", - "", - labels, - nil, - ), - nvmePlpStartCount: prometheus.NewDesc( - "nvme_plp_start_count", - "", - labels, - nil, - ), - nvmeEnduranceEstimate: prometheus.NewDesc( - "nvme_endurance_estimate", - "", - labels, - nil, - ), - nvmeLogPageVersion: prometheus.NewDesc( - "nvme_log_page_version", - "", - labels, - nil, - ), - nvmeLogPageGUID: prometheus.NewDesc( - "nvme_log_page_guid", - "", - labels, - nil, - ), - nvmeErrataVersionField: prometheus.NewDesc( - "nvme_errata_version_field", - "", - labels, - nil, - ), - nvmePointVersionField: prometheus.NewDesc( - "nvme_point_version_field", - "", - labels, - nil, - ), - nvmeMinorVersionField: prometheus.NewDesc( - "nvme_minor_version_field", - "", - labels, - nil, - ), - nvmeMajorVersionField: prometheus.NewDesc( - "nvme_major_version_field", - "", - labels, - nil, - ), - nvmeNvmeErrataVersion: prometheus.NewDesc( - "nvme_nvme_errata_version", - "", - labels, - nil, - ), - nvmePcieLinkRetrainingCount: prometheus.NewDesc( - "nvme_pcie_link_retraining_count", - "", - labels, - nil, - ), - nvmePowerStateChangeCount: prometheus.NewDesc( - "nvme_power_state_change_count", - "", - labels, - nil, - ), - nvmeNameSpace: prometheus.NewDesc( - "nvme_namespace", - "", - infoLabels, - nil, - ), - nvmeUsedBytes: prometheus.NewDesc( - "nvme_used_bytes", - "", - infoLabels, - nil, - ), - nvmeMaximumLba: prometheus.NewDesc( - "nvme_maximum_lba", - "", - infoLabels, - nil, - ), - nvmePhysicalSize: prometheus.NewDesc( - "nvme_physical_size", - "", - infoLabels, - nil, - ), - nvmeSectorSize: prometheus.NewDesc( - "nvme_sector_size", - "", - infoLabels, - nil, - ), - } -} - -func (c *nvmeCollector) Describe(ch chan<- *prometheus.Desc) { - ch <- 
c.nvmeCriticalWarning - ch <- c.nvmeTemperature - ch <- c.nvmeAvailSpare - ch <- c.nvmeSpareThresh - ch <- c.nvmePercentUsed - ch <- c.nvmeEnduranceGrpCriticalWarningSummary - ch <- c.nvmeDataUnitsRead - ch <- c.nvmeDataUnitsWritten - ch <- c.nvmeHostReadCommands - ch <- c.nvmeHostWriteCommands - ch <- c.nvmeControllerBusyTime - ch <- c.nvmePowerCycles - ch <- c.nvmePowerOnHours - ch <- c.nvmeUnsafeShutdowns - ch <- c.nvmeMediaErrors - ch <- c.nvmeNumErrLogEntries - ch <- c.nvmeWarningTempTime - ch <- c.nvmeCriticalCompTime - ch <- c.nvmeThmTemp1TransCount - ch <- c.nvmeThmTemp2TransCount - ch <- c.nvmeThmTemp1TotalTime - ch <- c.nvmeThmTemp2TotalTime - ch <- c.nvmePhysicalMediaUnitsWrittenHi - ch <- c.nvmePhysicalMediaUnitsWrittenLo - ch <- c.nvmePhysicalMediaUnitsReadHi - ch <- c.nvmePhysicalMediaUnitsReadLo - ch <- c.nvmeBadUserNandBlocksRaw - ch <- c.nvmeBadUserNandBlocksNormalized - ch <- c.nvmeBadSystemNandBlocksRaw - ch <- c.nvmeBadSystemNandBlocksNormalized - ch <- c.nvmeXorRecoveryCount - ch <- c.nvmeUncorrectableReadErrorCount - ch <- c.nvmeSoftEccErrorCount - ch <- c.nvmeEndToEndDetectedErrors - ch <- c.nvmeEndToEndCorrectedErrors - ch <- c.nvmeSystemDataPercentUsed - ch <- c.nvmeRefreshCounts - ch <- c.nvmeMaxUserDataEraseCounts - ch <- c.nvmeMinUserDataEraseCounts - ch <- c.nvmeNumberOfThermalThrottlingEvents - ch <- c.nvmeCurrentThrottlingStatus - ch <- c.nvmePcieCorrectableErrorCount - ch <- c.nvmeIncompleteShutdowns - ch <- c.nvmePercentFreeBlocks - ch <- c.nvmeCapacitorHealth - ch <- c.nvmeUnalignedIo - ch <- c.nvmeSecurityVersionNumber - ch <- c.nvmeNuseNamespaceUtilization - ch <- c.nvmePlpStartCount - ch <- c.nvmeEnduranceEstimate - ch <- c.nvmeLogPageVersion - ch <- c.nvmeLogPageGUID - ch <- c.nvmeErrataVersionField - ch <- c.nvmePointVersionField - ch <- c.nvmeMinorVersionField - ch <- c.nvmeMajorVersionField - ch <- c.nvmeNvmeErrataVersion - ch <- c.nvmePcieLinkRetrainingCount - ch <- c.nvmePowerStateChangeCount - ch <- c.nvmeNameSpace - ch 
<- c.nvmeUsedBytes - ch <- c.nvmeMaximumLba - ch <- c.nvmePhysicalSize - ch <- c.nvmeSectorSize -} - -func executeCommand(cmd string, args ...string) ([]byte, error) { - command := exec.Command(cmd, args...) - - output, err := command.CombinedOutput() - if err != nil { - return nil, fmt.Errorf("error running %s command: %w, output: %s", cmd, err, string(output)) - } - - if !gjson.Valid(string(output)) { - return nil, fmt.Errorf("invalid JSON output from %s command: %s", cmd, string(output)) - } - - return output, nil -} - -func (c *nvmeCollector) Collect(ch chan<- prometheus.Metric) { - nvmeDeviceList := c.getDeviceList() - for _, nvmeDevice := range nvmeDeviceList { - c.sendInfoMetrics(ch, nvmeDevice) - nvmeDevicePath := nvmeDevice.Get("DevicePath") - c.collectSmartLogMetrics(ch, nvmeDevicePath) - - if c.ocp { - c.collectOcpSmartLogMetrics(ch, nvmeDevicePath) - } - } -} - -func (c *nvmeCollector) getDeviceList() []gjson.Result { - nvmeDeviceCmd, err := executeCommand("nvme", "list", "-o", "json") - if err != nil { - log.Printf("Error running nvme list -o json: %s\n", err) - } - - // return gjson.Get(string(nvmeDeviceCmd), "Devices.#.DevicePath").Array() - return gjson.Get(string(nvmeDeviceCmd), "Devices").Array() -} - -func (c *nvmeCollector) collectSmartLogMetrics(ch chan<- prometheus.Metric, device gjson.Result) { - nvmeSmartLog, err := executeCommand("nvme", "smart-log", device.String(), "-o", "json") - if err != nil { - log.Printf("Error running smart-log %s -o json: %s\n", device.String(), err) - } - - nvmeSmartLogMetrics := gjson.GetMany(string(nvmeSmartLog), - "critical_warning", - "temperature", - "avail_spare", - "spare_thresh", - "percent_used", - "endurance_grp_critical_warning_summary", - "data_units_read", - "data_units_written", - "host_read_commands", - "host_write_commands", - "controller_busy_time", - "power_cycles", - "power_on_hours", - "unsafe_shutdowns", - "media_errors", - "num_err_log_entries", - "warning_temp_time", - "critical_comp_time", 
- "thm_temp1_trans_count", - "thm_temp2_trans_count", - "thm_temp1_total_time", - "thm_temp2_total_time") - c.sendSmartLogMetrics(ch, nvmeSmartLogMetrics, device.String()) -} - -func (c *nvmeCollector) collectOcpSmartLogMetrics(ch chan<- prometheus.Metric, device gjson.Result) { - nvmeOcpSmartLog, err := executeCommand("nvme", "ocp", "smart-add-log", device.String(), "-o", "json") - if err != nil { - log.Printf("Error running smart-add-log %s -o json: %s\n", device.String(), err) - } - - nvmeOcpSmartLogMetrics := gjson.GetMany(string(nvmeOcpSmartLog), - "Physical media units written.hi", - "Physical media units written.lo", - "Physical media units read.hi", - "Physical media units read.lo", - "Bad user nand blocks - Raw", - "Bad user nand blocks - Normalized", - "Bad system nand blocks - Raw", - "Bad system nand blocks - Normalized", - "XOR recovery count", - "Uncorrectable read error count", - "Soft ecc error count", - "End to end detected errors", - "End to end corrected errors", - "System data percent used", - "Refresh counts", - "Max User data erase counts", - "Min User data erase counts", - "Number of Thermal throttling events", - "Current throttling status", - "PCIe correctable error count", - "Incomplete shutdowns", - "Percent free blocks", - "Capacitor health", - "Unaligned I/O", - "Security Version Number", - "NUSE - Namespace utilization", - "PLP start count", - "Endurance estimate", - "Log page version", - "Log page GUID", - "Errata Version Field", - "Point Version Field", - "Minor Version Field", - "Major Version Field", - "NVMe Errata Version", - "PCIe Link Retraining Count", - "Power State Change Count") - c.sendOcpSmartLogMetrics(ch, nvmeOcpSmartLogMetrics, device.String()) -} - -func (c *nvmeCollector) sendInfoMetrics(ch chan<- prometheus.Metric, device gjson.Result) { - nameSpace := device.Get("NameSpace").Float() - devicePath := device.Get("DevicePath").String() - genericPath := device.Get("GenericPath").String() - firmware := 
device.Get("Firmware").String() - modelNumber := device.Get("ModelNumber").String() - serialNumber := device.Get("SerialNumber").String() - usedBytes := device.Get("UsedBytes").Float() - maximumLba := device.Get("MaximumLBA").Float() - physicalSize := device.Get("PhysicalSize").Float() - sectorSize := device.Get("SectorSize").Float() - ch <- prometheus.MustNewConstMetric( - c.nvmeNameSpace, prometheus.GaugeValue, nameSpace, devicePath, genericPath, firmware, modelNumber, serialNumber) - ch <- prometheus.MustNewConstMetric( - c.nvmeUsedBytes, prometheus.GaugeValue, usedBytes, devicePath, genericPath, firmware, modelNumber, serialNumber) - ch <- prometheus.MustNewConstMetric( - c.nvmeMaximumLba, prometheus.GaugeValue, maximumLba, devicePath, genericPath, firmware, modelNumber, serialNumber) - ch <- prometheus.MustNewConstMetric( - c.nvmePhysicalSize, prometheus.GaugeValue, physicalSize, devicePath, genericPath, firmware, modelNumber, serialNumber) - ch <- prometheus.MustNewConstMetric( - c.nvmeSectorSize, prometheus.GaugeValue, sectorSize, devicePath, genericPath, firmware, modelNumber, serialNumber) -} - -func (c *nvmeCollector) sendSmartLogMetrics(ch chan<- prometheus.Metric, metrics []gjson.Result, device string) { - ch <- prometheus.MustNewConstMetric( - c.nvmeCriticalWarning, prometheus.GaugeValue, metrics[0].Float(), device) - ch <- prometheus.MustNewConstMetric( - c.nvmeTemperature, prometheus.GaugeValue, metrics[1].Float(), device) - ch <- prometheus.MustNewConstMetric( - c.nvmeAvailSpare, prometheus.GaugeValue, metrics[2].Float(), device) - ch <- prometheus.MustNewConstMetric( - c.nvmeSpareThresh, prometheus.GaugeValue, metrics[3].Float(), device) - ch <- prometheus.MustNewConstMetric( - c.nvmePercentUsed, prometheus.GaugeValue, metrics[4].Float(), device) - ch <- prometheus.MustNewConstMetric( - c.nvmeEnduranceGrpCriticalWarningSummary, prometheus.GaugeValue, metrics[5].Float(), device) - ch <- prometheus.MustNewConstMetric( - c.nvmeDataUnitsRead, 
prometheus.CounterValue, metrics[6].Float(), device) - ch <- prometheus.MustNewConstMetric( - c.nvmeDataUnitsWritten, prometheus.CounterValue, metrics[7].Float(), device) - ch <- prometheus.MustNewConstMetric( - c.nvmeHostReadCommands, prometheus.CounterValue, metrics[8].Float(), device) - ch <- prometheus.MustNewConstMetric( - c.nvmeHostWriteCommands, prometheus.CounterValue, metrics[9].Float(), device) - ch <- prometheus.MustNewConstMetric( - c.nvmeControllerBusyTime, prometheus.CounterValue, metrics[10].Float(), device) - ch <- prometheus.MustNewConstMetric( - c.nvmePowerCycles, prometheus.CounterValue, metrics[11].Float(), device) - ch <- prometheus.MustNewConstMetric( - c.nvmePowerOnHours, prometheus.CounterValue, metrics[12].Float(), device) - ch <- prometheus.MustNewConstMetric( - c.nvmeUnsafeShutdowns, prometheus.CounterValue, metrics[13].Float(), device) - ch <- prometheus.MustNewConstMetric( - c.nvmeMediaErrors, prometheus.CounterValue, metrics[14].Float(), device) - ch <- prometheus.MustNewConstMetric( - c.nvmeNumErrLogEntries, prometheus.CounterValue, metrics[15].Float(), device) - ch <- prometheus.MustNewConstMetric( - c.nvmeWarningTempTime, prometheus.CounterValue, metrics[16].Float(), device) - ch <- prometheus.MustNewConstMetric( - c.nvmeCriticalCompTime, prometheus.CounterValue, metrics[17].Float(), device) - ch <- prometheus.MustNewConstMetric( - c.nvmeThmTemp1TransCount, prometheus.CounterValue, metrics[18].Float(), device) - ch <- prometheus.MustNewConstMetric( - c.nvmeThmTemp2TransCount, prometheus.CounterValue, metrics[19].Float(), device) - ch <- prometheus.MustNewConstMetric( - c.nvmeThmTemp1TotalTime, prometheus.CounterValue, metrics[20].Float(), device) - ch <- prometheus.MustNewConstMetric( - c.nvmeThmTemp2TotalTime, prometheus.CounterValue, metrics[21].Float(), device) -} - -func (c *nvmeCollector) sendOcpSmartLogMetrics(ch chan<- prometheus.Metric, metrics []gjson.Result, device string) { - ch <- prometheus.MustNewConstMetric( - 
c.nvmePhysicalMediaUnitsWrittenHi, prometheus.CounterValue, metrics[0].Float(), device) - ch <- prometheus.MustNewConstMetric( - c.nvmePhysicalMediaUnitsWrittenLo, prometheus.CounterValue, metrics[1].Float(), device) - ch <- prometheus.MustNewConstMetric( - c.nvmePhysicalMediaUnitsReadHi, prometheus.CounterValue, metrics[2].Float(), device) - ch <- prometheus.MustNewConstMetric( - c.nvmePhysicalMediaUnitsReadLo, prometheus.CounterValue, metrics[3].Float(), device) - ch <- prometheus.MustNewConstMetric( - c.nvmeBadUserNandBlocksRaw, prometheus.CounterValue, metrics[4].Float(), device) - ch <- prometheus.MustNewConstMetric( - c.nvmeBadUserNandBlocksNormalized, prometheus.CounterValue, metrics[5].Float(), device) - ch <- prometheus.MustNewConstMetric( - c.nvmeBadSystemNandBlocksRaw, prometheus.CounterValue, metrics[6].Float(), device) - ch <- prometheus.MustNewConstMetric( - c.nvmeBadSystemNandBlocksNormalized, prometheus.CounterValue, metrics[7].Float(), device) - ch <- prometheus.MustNewConstMetric( - c.nvmeXorRecoveryCount, prometheus.CounterValue, metrics[8].Float(), device) - ch <- prometheus.MustNewConstMetric( - c.nvmeUncorrectableReadErrorCount, prometheus.CounterValue, metrics[9].Float(), device) - ch <- prometheus.MustNewConstMetric( - c.nvmeSoftEccErrorCount, prometheus.CounterValue, metrics[10].Float(), device) - ch <- prometheus.MustNewConstMetric( - c.nvmeEndToEndDetectedErrors, prometheus.CounterValue, metrics[11].Float(), device) - ch <- prometheus.MustNewConstMetric( - c.nvmeEndToEndCorrectedErrors, prometheus.CounterValue, metrics[12].Float(), device) - ch <- prometheus.MustNewConstMetric( - c.nvmeSystemDataPercentUsed, prometheus.GaugeValue, metrics[13].Float(), device) - ch <- prometheus.MustNewConstMetric( - c.nvmeRefreshCounts, prometheus.CounterValue, metrics[14].Float(), device) - ch <- prometheus.MustNewConstMetric( - c.nvmeMaxUserDataEraseCounts, prometheus.CounterValue, metrics[15].Float(), device) - ch <- prometheus.MustNewConstMetric( - 
c.nvmeMinUserDataEraseCounts, prometheus.CounterValue, metrics[16].Float(), device) - ch <- prometheus.MustNewConstMetric( - c.nvmeNumberOfThermalThrottlingEvents, prometheus.CounterValue, metrics[17].Float(), device) - ch <- prometheus.MustNewConstMetric( - c.nvmeCurrentThrottlingStatus, prometheus.GaugeValue, metrics[18].Float(), device) - ch <- prometheus.MustNewConstMetric( - c.nvmePcieCorrectableErrorCount, prometheus.CounterValue, metrics[19].Float(), device) - ch <- prometheus.MustNewConstMetric( - c.nvmeIncompleteShutdowns, prometheus.CounterValue, metrics[20].Float(), device) - ch <- prometheus.MustNewConstMetric( - c.nvmePercentFreeBlocks, prometheus.GaugeValue, metrics[21].Float(), device) - ch <- prometheus.MustNewConstMetric( - c.nvmeCapacitorHealth, prometheus.GaugeValue, metrics[22].Float(), device) - ch <- prometheus.MustNewConstMetric( - c.nvmeUnalignedIo, prometheus.CounterValue, metrics[23].Float(), device) - ch <- prometheus.MustNewConstMetric( - c.nvmeSecurityVersionNumber, prometheus.GaugeValue, metrics[24].Float(), device) - ch <- prometheus.MustNewConstMetric( - c.nvmeNuseNamespaceUtilization, prometheus.GaugeValue, metrics[25].Float(), device) - ch <- prometheus.MustNewConstMetric( - c.nvmePlpStartCount, prometheus.CounterValue, metrics[26].Float(), device) - ch <- prometheus.MustNewConstMetric( - c.nvmeEnduranceEstimate, prometheus.GaugeValue, metrics[27].Float(), device) - ch <- prometheus.MustNewConstMetric( - c.nvmeLogPageVersion, prometheus.GaugeValue, metrics[28].Float(), device) - ch <- prometheus.MustNewConstMetric( - c.nvmeLogPageGUID, prometheus.GaugeValue, metrics[29].Float(), device) - ch <- prometheus.MustNewConstMetric( - c.nvmeErrataVersionField, prometheus.GaugeValue, metrics[30].Float(), device) - ch <- prometheus.MustNewConstMetric( - c.nvmePointVersionField, prometheus.GaugeValue, metrics[31].Float(), device) - ch <- prometheus.MustNewConstMetric( - c.nvmeMinorVersionField, prometheus.GaugeValue, metrics[32].Float(), 
device) - ch <- prometheus.MustNewConstMetric( - c.nvmeMajorVersionField, prometheus.GaugeValue, metrics[33].Float(), device) - ch <- prometheus.MustNewConstMetric( - c.nvmeNvmeErrataVersion, prometheus.GaugeValue, metrics[34].Float(), device) - ch <- prometheus.MustNewConstMetric( - c.nvmePcieLinkRetrainingCount, prometheus.CounterValue, metrics[35].Float(), device) - ch <- prometheus.MustNewConstMetric( - c.nvmePowerStateChangeCount, prometheus.CounterValue, metrics[36].Float(), device) -} - -func main() { - flag.Usage = func() { - fmt.Println("nvme_exporter - Exports NVMe smart-log and smart-ocp-log metrics in Prometheus format") - fmt.Println("Validated with nvme smart-log field descriptions can be found on page 209 of:") - fmt.Println( - "https://nvmexpress.org/wp-content/uploads/NVM-Express-Base-Specification-Revision-2.1-2024.08.05-Ratified.pdf") - fmt.Println("Validated with nvme ocp-smart-log field descriptions can be found on page 24 of:") - fmt.Println("https://www.opencompute.org/documents/datacenter-nvme-ssd-specification-v2-5-pdf */") - fmt.Printf("It has been tested with nvme-cli versions:%v\n", _supportedVersions) - fmt.Println("Usage: nvme_exporter [options]") - flag.PrintDefaults() - } - port := flag.String("port", "9998", "port to listen on") - ocp := flag.Bool("ocp", false, "Enable OCP smart log metrics") - endpoint := flag.String("endpoint", "/metrics", "Specify the endpoint to expose metrics") - flag.Parse() - - if !strings.HasPrefix(*endpoint, "/") { - *endpoint = "/" + *endpoint - } - - // check user - currentUser, err := user.Current() - if err != nil { - log.Fatalf("Error getting current user %s\n", err) - } - - if currentUser.Username != "root" { - log.Fatalln("Error: you must be root to use nvme-cli") - } - - // check for nvme-cli executable - _, err = exec.LookPath("nvme") - if err != nil { - log.Fatalf("Cannot find NVMe cli command in path: %s\n", err) - } - // check for nvme-cli version - command := exec.Command("nvme", "--version") - 
- out, err := command.CombinedOutput() - if err != nil { - log.Fatalf("error running nvme --version command: %s", err) - } - - re := regexp.MustCompile(`nvme version (\d+\.\d+)\.\d+`) - match := re.FindStringSubmatch(string(out)) - - if match != nil { - version := match[1] - if !isSupportedVersion(version) { - log.Printf("NVMe cli version %s not supported, supported versions are: %v", version, _supportedVersions) - } - } else { - log.Fatalf("Unable to find NVMe CLI version in output: %s", string(out)) - } - - prometheus.MustRegister(newNvmeCollector(*ocp)) - http.Handle(*endpoint, promhttp.Handler()) - log.Printf("Starting newNvmeCollector on port: %s, metrics endpoint: %s\n", *port, *endpoint) - log.Printf("newNvmeCollector is collecting OCP smart-log metrics: %t\n", *ocp) - - server := &http.Server{ - Addr: ":" + *port, - ReadHeaderTimeout: 3 * time.Second, - } - log.Fatal(server.ListenAndServe()) -} diff --git a/pkg/collector.go b/pkg/collector.go new file mode 100644 index 0000000..ed2854d --- /dev/null +++ b/pkg/collector.go @@ -0,0 +1,146 @@ +package pkg + +import ( + "log" + + "github.com/prometheus/client_golang/prometheus" + "github.com/tidwall/gjson" + + "github.com/E4-Computer-Engineering/nvme_exporter/pkg/utils" +) + +// GetDevices queries the devices list through the shell +// and returns an array of JSON results with the devices data. +func GetDevices() []gjson.Result { + devicesJSON, err := utils.ExecuteJSONCommand("nvme", "list", "-o", "json") + if err != nil { + log.Printf("Error running nvme list -o json: %s\n", err) + } + + return devicesJSON.Get("Devices").Array() +} + +// MetricCollector is the interface implemented by the objects contained +// in the collectors field of CompositeCollector. +// +// We could have the collectors implement prometheus.Collector directly, but then +// we would unnecessarily have to call GetDevices more than once.
+// +// Here the device data is injected in CollectMetrics, so that we can call +// GetDevices once in CompositeCollector.Collect +// (it is a shell function, so every call can potentially be "expensive"). +type MetricCollector interface { + // Describe is the same as prometheus.Collector.Describe + Describe(descChan chan<- *prometheus.Desc) + + // CollectMetrics does what prometheus.Collector.Collect does, + // but needs the device JSON data to prevent calling GetDevices + // multiple times + CollectMetrics(metricChan chan<- prometheus.Metric, device gjson.Result) +} + +// InfoMetricCollector implements MetricCollector and sends info metrics. +type InfoMetricCollector struct { + // InfoMetricProviders is the list of providers for the info metric collector + InfoMetricProviders []MetricProvider +} + +// NewInfoMetricCollector initializes and returns a new InfoMetricCollector object. +func NewInfoMetricCollector(providers []MetricProvider) *InfoMetricCollector { + return &InfoMetricCollector{InfoMetricProviders: providers} +} + +// Describe sends all prometheus.Desc pointers through the channel. +func (ic *InfoMetricCollector) Describe(ch chan<- *prometheus.Desc) { + for _, infoProvider := range ic.InfoMetricProviders { + ch <- infoProvider.Desc + } +} + +// CollectMetrics reads the given device data and sends all info metrics through the channel.
+func (ic *InfoMetricCollector) CollectMetrics(ch chan<- prometheus.Metric, device gjson.Result) { + devicePath := device.Get("DevicePath").String() + genericPath := device.Get("GenericPath").String() + firmware := device.Get("Firmware").String() + modelNumber := device.Get("ModelNumber").String() + serialNumber := device.Get("SerialNumber").String() + + for _, infoProvider := range ic.InfoMetricProviders { + // Fetching the metric object is delegated to the provider + metric := infoProvider.GetMetric( + device, + devicePath, + genericPath, + firmware, + modelNumber, + serialNumber, + ) + ch <- metric + } +} + +// LogMetricCollector implements MetricCollector and sends smart log metrics. +type LogMetricCollector struct { + // LogMetricProviders is the list of providers for the log metric collector + LogMetricProviders []MetricProvider + + // getData receives the devicePath and gets the log JSON data + getData func(string) gjson.Result +} + +// NewLogMetricCollector initializes and returns a new LogMetricCollector object. +func NewLogMetricCollector(providers []MetricProvider, getData func(string) gjson.Result) *LogMetricCollector { + return &LogMetricCollector{ + LogMetricProviders: providers, + getData: getData, + } +} + +// Describe sends all prometheus.Desc pointers through the channel. +func (lc *LogMetricCollector) Describe(ch chan<- *prometheus.Desc) { + for _, logProvider := range lc.LogMetricProviders { + ch <- logProvider.Desc + } +} + +// CollectMetrics gets the smart log data and sends all log metrics through the channel.
+func (lc *LogMetricCollector) CollectMetrics(ch chan<- prometheus.Metric, device gjson.Result) { + devicePath := device.Get("DevicePath").String() + + jsonData := lc.getData(devicePath) + for _, logProvider := range lc.LogMetricProviders { + // Fetching the metric object is delegated to the provider + metric := logProvider.GetMetric(jsonData, devicePath) + ch <- metric + } +} + +// CompositeCollector implements the prometheus.Collector interface, +// wrapping a slice of other MetricCollector objects. +type CompositeCollector struct { + // collectors holds a simple list of MetricCollector objects + collectors []MetricCollector +} + +// NewCompositeCollector initializes and returns a new CompositeCollector object. +func NewCompositeCollector(collectors []MetricCollector) *CompositeCollector { + return &CompositeCollector{collectors: collectors} +} + +// Describe calls Describe on every collector in cc.collectors. +func (cc *CompositeCollector) Describe(ch chan<- *prometheus.Desc) { + for _, collector := range cc.collectors { + collector.Describe(ch) + } +} + +// Collect calls CollectMetrics on every collector in cc.collectors, once per device. +func (cc *CompositeCollector) Collect(ch chan<- prometheus.Metric) { + devices := GetDevices() + + for _, device := range devices { + for _, collector := range cc.collectors { + collector.CollectMetrics(ch, device) + } + } +} diff --git a/pkg/provider.go b/pkg/provider.go new file mode 100644 index 0000000..1ecc40c --- /dev/null +++ b/pkg/provider.go @@ -0,0 +1,52 @@ +package pkg + +import ( + "github.com/prometheus/client_golang/prometheus" + "github.com/tidwall/gjson" +) + +// MetricProvider is an object that computes a metric +// from device or log data in JSON format.
+type MetricProvider struct { + // Desc holds the pointer to the prometheus.Desc object + Desc *prometheus.Desc + + // ValueType holds the prometheus.ValueType + ValueType prometheus.ValueType + + // jsonKey is the string key that the object needs to access + // in the device JSON to fetch the metric float64 value + jsonKey string +} + +// NewMetricProvider is the constructor for MetricProvider objects. +// No need to return a pointer, since the struct is static data. +func NewMetricProvider( + desc *prometheus.Desc, + valueType prometheus.ValueType, + jsonKey string, +) MetricProvider { + return MetricProvider{ + Desc: desc, + ValueType: valueType, + jsonKey: jsonKey, + } +} + +// GetMetric computes the metric from the +// data in JSON form. +func (ip MetricProvider) GetMetric( + data gjson.Result, + labels ...string, +) prometheus.Metric { + value := data.Get(ip.jsonKey).Float() + + metric := prometheus.MustNewConstMetric( + ip.Desc, + ip.ValueType, + value, + labels..., + ) + + return metric +} diff --git a/pkg/utils/command.go b/pkg/utils/command.go new file mode 100644 index 0000000..5e28b0c --- /dev/null +++ b/pkg/utils/command.go @@ -0,0 +1,69 @@ +package utils + +import ( + "fmt" + "os/exec" + "os/user" + "strings" + + "github.com/tidwall/gjson" +) + +// getStringCmd produces a single string from the cmd, args... format. +func getStringCmd(cmd string, args ...string) string { + cmdSlice := []string{cmd} + cmdSlice = append(cmdSlice, args...) + + return strings.Join(cmdSlice, " ") +} + +// ExecuteCommand executes a command and returns its combined output, plus a nicely-formatted error if it fails. +func ExecuteCommand(cmd string, args ...string) (string, error) { + _, err := exec.LookPath(cmd) + if err != nil { + return "", fmt.Errorf("error looking for %s cli command in path: %w", cmd, err) + } + + command := exec.Command(cmd, args...) + + out, err := command.CombinedOutput() + if err != nil { + cmdString := getStringCmd(cmd, args...)
+ + return string(out), fmt.Errorf("error running command %s: %w", cmdString, err) + } + + return string(out), nil +} + +// ExecuteJSONCommand executes a command, validates the JSON output, and returns +// the parsed gjson.Result object. +func ExecuteJSONCommand(cmd string, args ...string) (gjson.Result, error) { + output, err := ExecuteCommand(cmd, args...) + if err != nil { + return gjson.Result{}, err + } + + if !gjson.Valid(output) { + cmdString := getStringCmd(cmd, args...) + + return gjson.Result{}, fmt.Errorf("invalid JSON output from %s command: %s", cmdString, output) + } + + ret := gjson.Parse(output) + + return ret, nil +} + +func CheckCurrentUser(wantedUser string) error { + currentUser, err := user.Current() + if err != nil { + return fmt.Errorf("error checking current user: %w", err) + } + + if currentUser.Username != wantedUser { + return fmt.Errorf("current user %s is not wanted user %s", currentUser.Username, wantedUser) + } + + return nil +}