Skip to content

INFRA-388 Converting smartmon into python and adding mock tests #1327

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 16 commits into from
May 8, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
118 changes: 118 additions & 0 deletions etc/kayobe/ansible/scripts/generate_fixtures.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
#!/usr/bin/env python3
import json
import re
from pySMART import DeviceList

# snake_case attribute names that are allowed into the generated fixtures.
# An attribute found on `device.if_attributes` is kept only when its
# snake_case name appears in this set.
SMARTMON_ATTRS = {
    # Alphabetical group (presumably ATA/SATA vendor attributes -- confirm).
    "airflow_temperature_cel",
    "command_timeout",
    "current_pending_sector",
    "end_to_end_error",
    "erase_fail_count",
    "g_sense_error_rate",
    "hardware_ecc_recovered",
    "host_reads_32mib",
    "host_reads_mib",
    "host_writes_32mib",
    "host_writes_mib",
    "load_cycle_count",
    "media_wearout_indicator",
    "nand_writes_1gib",
    "offline_uncorrectable",
    "power_cycle_count",
    "power_on_hours",
    "program_fail_cnt_total",
    "program_fail_count",
    "raw_read_error_rate",
    "reallocated_event_count",
    "reallocated_sector_ct",
    "reported_uncorrect",
    "runtime_bad_block",
    "sata_downshift_count",
    "seek_error_rate",
    "spin_retry_count",
    "spin_up_time",
    "start_stop_count",
    "temperature_case",
    "temperature_celsius",
    "temperature_internal",
    "total_lbas_read",
    "total_lbas_written",
    "udma_crc_error_count",
    "unsafe_shutdown_count",
    "unused_rsvd_blk_cnt_tot",
    "wear_leveling_count",
    "workld_host_reads_perc",
    "workld_media_wear_indic",
    "workload_minutes",
    # Second group (presumably NVMe health-log fields -- confirm).
    "critical_warning",
    "temperature",
    "available_spare",
    "available_spare_threshold",
    "percentage_used",
    "data_units_read",
    "data_units_written",
    "host_reads",
    "host_writes",
    "controller_busy_time",
    "power_cycles",
    "unsafe_shutdowns",
    "media_errors",
    "num_err_log_entries",
    "warning_temp_time",
    "critical_comp_time",
}

# Attribute names read directly from the device object itself to build the
# "device_info" section of each fixture.
DISK_INFO = {
    "name",
    "interface",
    "vendor",
    "family",
    "model",
    "serial",
    "firmware",
    "smart_capable",
    "smart_enabled",
    "assessment",
}

def camel_to_snake(name):
    """
    Return `name` converted from CamelCase / camelCase to snake_case.

    An underscore is inserted before every ASCII uppercase letter that is
    not the first character, then the whole string is lower-cased. Runs of
    capitals are therefore split letter by letter ("ABC" -> "a_b_c").

    Reference: https://stackoverflow.com/questions/1175208/elegant-python-function-to-convert-camelcase-to-snake-case
    """
    with_underscores = re.sub(r'(?<!^)(?=[A-Z])', '_', name)
    return with_underscores.lower()

def attrs_to_dict(obj, allowed_keys):
    """
    Collect selected attributes of `obj` into a plain dict.

    An attribute is included when it is public (no leading underscore),
    can be read without raising, is neither None nor callable, and its
    snake_case name is contained in `allowed_keys`.

    The dict is keyed by the attribute's ORIGINAL name (not the snake_case
    form), mapping to the attribute's value.
    """
    selected = {}
    public_names = (n for n in dir(obj) if not n.startswith('_'))
    for attr in public_names:
        try:
            attr_value = getattr(obj, attr)
        except Exception:
            # Best effort: an attribute that fails on access is left out.
            continue
        if attr_value is None or callable(attr_value):
            continue
        if camel_to_snake(attr) not in allowed_keys:
            continue
        selected[attr] = attr_value
    return selected

# Print one JSON fixture per device found by pySMART. Each fixture has a
# "device_info" section (identity / SMART status fields) and an
# "if_attributes" section (the allow-listed interface attributes).
for disk in DeviceList().devices:
    disk_info = attrs_to_dict(disk, DISK_INFO)
    if_stats = attrs_to_dict(disk.if_attributes, SMARTMON_ATTRS)
    fixtures = {
        "device_info": disk_info,
        "if_attributes": if_stats,
    }

    print(f'Disk: {disk.name}: \n')
    # default=str stringifies any value json cannot serialise natively.
    print(json.dumps(fixtures, indent=2, default=str))
261 changes: 261 additions & 0 deletions etc/kayobe/ansible/scripts/smartmon.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,261 @@
#!/usr/bin/env python3

import subprocess
import json
import re
import datetime
import os

from prometheus_client import CollectorRegistry, Gauge, write_to_textfile
from pySMART import DeviceList

# Absolute path of the smartctl binary invoked by this script.
SMARTCTL_PATH = "/usr/sbin/smartctl"

# snake_case attribute names that are exported as `smartmon_<name>` gauges.
# An attribute on `device.if_attributes` is exported only when its
# snake_case name appears in this set.
SMARTMON_ATTRS = {
    # Alphabetical group (presumably ATA/SATA vendor attributes -- confirm).
    "airflow_temperature_cel",
    "command_timeout",
    "current_pending_sector",
    "end_to_end_error",
    "erase_fail_count",
    "g_sense_error_rate",
    "hardware_ecc_recovered",
    "host_reads_32mib",
    "host_reads_mib",
    "host_writes_32mib",
    "host_writes_mib",
    "load_cycle_count",
    "media_wearout_indicator",
    "nand_writes_1gib",
    "offline_uncorrectable",
    "power_cycle_count",
    "power_on_hours",
    "program_fail_cnt_total",
    "program_fail_count",
    "raw_read_error_rate",
    "reallocated_event_count",
    "reallocated_sector_ct",
    "reported_uncorrect",
    "runtime_bad_block",
    "sata_downshift_count",
    "seek_error_rate",
    "spin_retry_count",
    "spin_up_time",
    "start_stop_count",
    "temperature_case",
    "temperature_celsius",
    "temperature_internal",
    "total_lbas_read",
    "total_lbas_written",
    "udma_crc_error_count",
    "unsafe_shutdown_count",
    "unused_rsvd_blk_cnt_tot",
    "wear_leveling_count",
    "workld_host_reads_perc",
    "workld_media_wear_indic",
    "workload_minutes",
    # Second group (presumably NVMe health-log fields -- confirm).
    "critical_warning",
    "temperature",
    "available_spare",
    "available_spare_threshold",
    "percentage_used",
    "data_units_read",
    "data_units_written",
    "host_reads",
    "host_writes",
    "controller_busy_time",
    "power_cycles",
    "unsafe_shutdowns",
    "media_errors",
    "num_err_log_entries",
    "warning_temp_time",
    "critical_comp_time",
}

def run_command(command, parse_json=False):
    """
    Run `command` as a subprocess and return what it wrote to stdout.

    Args:
        command (List[str]): Program and arguments, passed to subprocess.run.
        parse_json (bool): When True, decode stdout as JSON and return the
            resulting object; otherwise return stdout as a stripped string.

    Returns:
        The decoded JSON value when `parse_json` is set, else str.

    The exit status is not inspected and stderr is captured but discarded.
    json.JSONDecodeError propagates when `parse_json` is set and stdout is
    not valid JSON.
    """
    completed = subprocess.run(command, capture_output=True, text=True)
    output = completed.stdout
    return json.loads(output) if parse_json else output.strip()

def camel_to_snake(name):
    """
    Translate a CamelCase (or camelCase) identifier into snake_case.

    Every ASCII capital that is not the first character gets an underscore
    placed in front of it; the result is then lower-cased. Consecutive
    capitals are separated individually ("ABC" becomes "a_b_c").

    Reference: https://stackoverflow.com/questions/1175208/elegant-python-function-to-convert-camelcase-to-snake-case
    """
    separated = re.sub(r'(?<!^)(?=[A-Z])', '_', name)
    return separated.lower()

def parse_device_info(device):
    """
    Produce Prometheus lines describing the device's identity and SMART status:
      - smartmon_device_info
      - smartmon_device_smart_available
      - smartmon_device_smart_enabled   (only when the device is SMART capable)
      - smartmon_device_smart_healthy   (only when SMART capable and an
                                         assessment is present)

    Args:
        device (Device): A pySMART Device object; the attributes read are
            name, interface, vendor, family, model, serial, firmware,
            smart_capable, smart_enabled and assessment.

    Returns:
        List[str]: Metric strings in 'name{labels} value' format.
    """
    serial_number = (device.serial or "").lower()
    # Normalise a missing interface once so that every metric carries the same
    # `type` label. Interpolating device.interface directly would render the
    # literal text "None" and disagree with the device_info labels.
    disk_type = device.interface or ""

    info_labels = {
        "disk": device.name,
        "type": disk_type,
        "vendor": device.vendor or "",
        "model_family": device.family or "",
        "device_model": device.model or "",
        "serial_number": serial_number,
        "firmware_version": device.firmware or "",
    }
    # Labels are emitted in sorted key order so the output is deterministic.
    label_str = ",".join(f'{k}="{v}"' for k, v in sorted(info_labels.items()))

    metric_labels = f'disk="{device.name}",serial_number="{serial_number}",type="{disk_type}"'

    metrics = [
        f'smartmon_device_info{{{label_str}}} 1.0',
        f'smartmon_device_smart_available{{{metric_labels}}} {float(bool(device.smart_capable))}',
    ]

    if device.smart_capable:
        metrics.append(
            f'smartmon_device_smart_enabled{{{metric_labels}}} {float(bool(device.smart_enabled))}'
        )
        # Health is reported only for SMART-capable devices that returned an
        # assessment; anything other than "PASS" (case-insensitive) is unhealthy.
        if device.assessment:
            is_healthy = 1 if device.assessment.upper() == "PASS" else 0
            metrics.append(
                f'smartmon_device_smart_healthy{{{metric_labels}}} {float(is_healthy)}'
            )

    return metrics

def parse_if_attributes(device):
    """
    Turn the numeric, allow-listed fields of device.if_attributes into metrics.

    Regardless of device type, the public non-callable attributes of
    device.if_attributes are inspected. Each name is converted to snake_case
    and, when that name is in SMARTMON_ATTRS and the value is an int or
    float, a `smartmon_<snake_name>` line is produced.

    Args:
        device (Device): A pySMART Device object; name, interface, serial and
            if_attributes are read.
    Returns:
        List[str]: Metric strings in 'name{labels} value' format; empty when
        the device has no (or falsy) if_attributes.
    """
    if_attrs = device.if_attributes
    if not if_attrs:
        return []

    serial_number = (device.serial or "").lower()
    disk_type = device.interface or ""
    labels = f'disk="{device.name}",serial_number="{serial_number}",type="{disk_type}"'

    metrics = []
    for attr_name in dir(if_attrs):
        # Private / dunder members are never exported.
        if attr_name.startswith("_"):
            continue
        val = getattr(if_attrs, attr_name, None)
        # Methods are not data.
        if callable(val):
            continue

        snake_name = camel_to_snake(attr_name)
        if snake_name not in SMARTMON_ATTRS:
            continue
        if not isinstance(val, (int, float)):
            continue
        metrics.append(f"smartmon_{snake_name}{{{labels}}} {float(val)}")

    return metrics

def _split_label_pairs(label_str):
    """Split 'k="v",k2="v2"' on the commas that lie outside double quotes."""
    pairs = []
    current = []
    in_quotes = False
    for char in label_str:
        if char == '"':
            in_quotes = not in_quotes
        if char == ',' and not in_quotes:
            pairs.append("".join(current))
            current = []
        else:
            current.append(char)
    pairs.append("".join(current))
    return pairs


def write_metrics_to_textfile(metrics, output_path=None):
    """
    Write metrics to a Prometheus textfile using prometheus_client.

    Each metric string is parsed back into a name, a label dict and a value;
    one Gauge per metric name is registered in a fresh CollectorRegistry and
    the registry is written out with write_to_textfile.

    Args:
        metrics (List[str]): List of metric strings in 'name{labels} value' format.
        output_path (str): Path to write the metrics file. Defaults to
            '/var/lib/node_exporter/textfile_collector/smartmon.prom'.

    Raises:
        ValueError: If a metric string lacks '{' or '}', if its value is not
            a valid float, or if prometheus_client rejects the metric or
            label names (for example, a later line for the same metric name
            carrying a different set of labels).
    """
    registry = CollectorRegistry()
    metric_gauges = {}
    for metric in metrics:
        # The metric name never contains '{', so split on the first one. The
        # value never contains '}', so split on the LAST one: a label value
        # containing '}' then stays inside the label section.
        metric_name, rest = metric.split('{', 1)
        label_str, value = rest.rsplit('}', 1)
        value = value.strip()

        # Split only on commas outside quotes so that values such as
        # model_family="Foo (AF, SATA 6Gb/s)" are kept whole, not truncated.
        labels = {}
        for pair in _split_label_pairs(label_str):
            if '=' not in pair:
                continue
            key, raw_value = pair.split('=', 1)
            labels[key.strip()] = raw_value.strip('"')

        # Create Gauge if not already present
        if metric_name not in metric_gauges:
            help_str = f"SMART metric {metric_name}"
            metric_gauges[metric_name] = Gauge(metric_name, help_str, list(labels), registry=registry)

        gauge = metric_gauges[metric_name]
        if labels:
            # Bind by label NAME so a differing label order between lines of
            # the same metric cannot silently attach values to wrong labels.
            gauge.labels(**labels).set(float(value))
        else:
            # A gauge declared without labels must be set directly;
            # calling .labels() on it raises.
            gauge.set(float(value))

    if output_path is None:
        output_path = '/var/lib/node_exporter/textfile_collector/smartmon.prom'
    write_to_textfile(output_path, registry)  # Write all metrics to file

def main(output_path=None):
    """
    Collect SMART metrics for every device pySMART finds and write them out.

    Emits, in order: one smartmon_smartctl_version line, then per device a
    smartmon_smartctl_run timestamp and a smartmon_device_active flag,
    followed (for active devices only) by the device info and interface
    attribute metrics. The collected lines are passed to
    write_metrics_to_textfile together with `output_path`.

    Args:
        output_path (str): Destination file; None selects the default used by
            write_metrics_to_textfile.
    """
    # smartctl version: second whitespace-separated token of the first output
    # line, or "unknown" when smartctl cannot be run or prints something else.
    try:
        version_output = run_command([SMARTCTL_PATH, "--version"])
        if version_output.startswith("smartctl"):
            version_num = version_output.splitlines()[0].split()[1]
        else:
            version_num = "unknown"
    except Exception:
        version_num = "unknown"

    all_metrics = [f'smartmon_smartctl_version{{version="{version_num}"}} 1']

    for dev in DeviceList().devices:
        disk_name = dev.name
        disk_type = dev.interface or ""
        serial_number = (dev.serial or "").lower()

        # Unix timestamp (UTC, whole seconds) of this device's collection run.
        now_utc = datetime.datetime.now(datetime.timezone.utc)
        run_timestamp = int(now_utc.timestamp())
        all_metrics.append(f'smartmon_smartctl_run{{disk="{disk_name}",type="{disk_type}"}} {run_timestamp}')

        # Ask smartctl (with "-n standby") for the power mode. A reported
        # "standby", unparsable output, or any other failure all mark the
        # device as inactive.
        try:
            standby_cmd = [SMARTCTL_PATH, "-n", "standby", "-d", disk_type, "-j", disk_name]
            standby_json = run_command(standby_cmd, parse_json=True)
            active = 0 if standby_json.get("power_mode", "") == "standby" else 1
        except Exception:
            active = 0

        all_metrics.append(
            f'smartmon_device_active{{disk="{disk_name}",type="{disk_type}",serial_number="{serial_number}"}} {active}'
        )
        if active == 0:
            # No further metrics are collected for inactive devices.
            continue

        all_metrics.extend(parse_device_info(dev))
        all_metrics.extend(parse_if_attributes(dev))

    write_metrics_to_textfile(all_metrics, output_path)

if __name__ == "__main__":
    import argparse

    # Command-line entry point: the single optional flag selects the output
    # file; when omitted, main() receives None.
    parser = argparse.ArgumentParser(
        description="Export SMART metrics to Prometheus textfile format."
    )
    parser.add_argument(
        '--output',
        type=str,
        default=None,
        help='Output path for Prometheus textfile (default: /var/lib/node_exporter/textfile_collector/smartmon.prom)',
    )
    args = parser.parse_args()
    main(args.output)
Loading
Loading