-
Notifications
You must be signed in to change notification settings - Fork 23
INFRA-388 Converting smartmon into python and adding mock tests #1327
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
16 commits
Select commit
Hold shift + click to select a range
d0063e7
Convert smartmon script to python
technowhizz 46216b5
Create tests for smartmon
technowhizz 23fc747
Use pySMART
technowhizz 8714025
Add tests for pysmart
technowhizz b694537
Update docstring to state expected Device
technowhizz 2d25e92
Add a function for converting to camelcase
technowhizz a87abe0
Split device and attribute tests into individual test cases per fixture
technowhizz 172e034
Use function for snake case conversion
technowhizz aad34c1
Move fixtures to tests folder
technowhizz 44601ca
Use prometheus_client for writing out metrics
technowhizz 783a68c
Add args and returns to doc string for parse_ifattributes
technowhizz 4b1fc1f
Ensure metric names start with smartmon
technowhizz 1bc5733
Update smartmon playbook for smartmon.py
technowhizz df8c944
Add python script to generate test fixtures from real drives
technowhizz 6447df0
Remove smartmon.sh
technowhizz 5142d79
Add test fixture for Dell ENT NVMe
technowhizz File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,118 @@ | ||
#!/usr/bin/env python3 | ||
import json | ||
import re | ||
from pySMART import DeviceList | ||
|
||
# Allow-list of snake_case attribute names. Only interface attributes whose
# snake_case name appears here are copied into a fixture's "if_attributes".
SMARTMON_ATTRS = {
    # Vendor-attribute style names (presumably reported by ATA/SATA drives -- confirm).
    "airflow_temperature_cel", "command_timeout", "current_pending_sector",
    "end_to_end_error", "erase_fail_count", "g_sense_error_rate",
    "hardware_ecc_recovered", "host_reads_32mib", "host_reads_mib",
    "host_writes_32mib", "host_writes_mib", "load_cycle_count",
    "media_wearout_indicator", "nand_writes_1gib", "offline_uncorrectable",
    "power_cycle_count", "power_on_hours", "program_fail_cnt_total",
    "program_fail_count", "raw_read_error_rate", "reallocated_event_count",
    "reallocated_sector_ct", "reported_uncorrect", "runtime_bad_block",
    "sata_downshift_count", "seek_error_rate", "spin_retry_count",
    "spin_up_time", "start_stop_count", "temperature_case",
    "temperature_celsius", "temperature_internal", "total_lbas_read",
    "total_lbas_written", "udma_crc_error_count", "unsafe_shutdown_count",
    "unused_rsvd_blk_cnt_tot", "wear_leveling_count", "workld_host_reads_perc",
    "workld_media_wear_indic", "workload_minutes",
    # Health-log style names (presumably reported by NVMe drives -- confirm).
    "critical_warning", "temperature", "available_spare",
    "available_spare_threshold", "percentage_used", "data_units_read",
    "data_units_written", "host_reads", "host_writes",
    "controller_busy_time", "power_cycles", "unsafe_shutdowns",
    "media_errors", "num_err_log_entries", "warning_temp_time",
    "critical_comp_time",
}
|
||
# Allow-list of device-level attribute names copied into a fixture's
# "device_info" section.
DISK_INFO = {
    "name", "interface", "vendor", "family", "model",
    "serial", "firmware", "smart_capable", "smart_enabled", "assessment",
}
|
||
def camel_to_snake(name):
    """
    Convert a CamelCase (or camelCase) string to snake_case.

    An underscore is inserted in front of every ASCII capital letter that is
    not the first character, then the whole string is lower-cased. Runs of
    capitals are therefore split letter by letter ("ABc" -> "a_bc").

    Reference: https://stackoverflow.com/questions/1175208/elegant-python-function-to-convert-camelcase-to-snake-case
    """
    pieces = []
    for position, char in enumerate(name):
        # Never prefix the very first character, even when it is a capital.
        if position and "A" <= char <= "Z":
            pieces.append("_")
        pieces.append(char)
    return "".join(pieces).lower()
|
||
def attrs_to_dict(obj, allowed_keys):
    """
    Collect selected public attributes of `obj` into a dictionary.

    An attribute is kept when all of the following hold:
      - its name does not start with an underscore,
      - the snake_case form of its name is in `allowed_keys`,
      - reading it does not raise,
      - its value is neither None nor callable.

    Args:
        obj: Any object; its attribute names are discovered with dir().
        allowed_keys: Container of snake_case names that may be exported.

    Returns:
        dict: Maps the original attribute name (not the snake_case form)
        to the attribute's value.
    """
    attributes = {}
    for name in dir(obj):
        # Decide on the name alone first, so attributes that are not wanted
        # are never read (properties may be costly or may raise).
        if name.startswith('_') or camel_to_snake(name) not in allowed_keys:
            continue
        try:
            value = getattr(obj, name)
        except Exception:
            # Best effort: an unreadable attribute is simply left out.
            continue
        if value is None or callable(value):
            continue
        attributes[name] = value
    return attributes
|
||
# Print one JSON document per drive found by pySMART's DeviceList, holding the
# allow-listed device fields and interface attributes.
# NOTE(review): the loop body was reconstructed from a flattened source; every
# statement below is assumed to run once per disk -- confirm.
for disk in DeviceList().devices:
    fixtures = {
        "device_info": attrs_to_dict(disk, DISK_INFO),
        "if_attributes": attrs_to_dict(disk.if_attributes, SMARTMON_ATTRS),
    }

    print(f'Disk: {disk.name}: \n')
    # default=str stringifies any value json cannot serialise natively.
    print(json.dumps(fixtures, indent=2, default=str))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,261 @@ | ||
#!/usr/bin/env python3 | ||
|
||
import subprocess | ||
import json | ||
import re | ||
import datetime | ||
import os | ||
|
||
from prometheus_client import CollectorRegistry, Gauge, write_to_textfile | ||
from pySMART import DeviceList | ||
|
||
# Absolute path of the smartctl binary invoked by this script.
SMARTCTL_PATH = "/usr/sbin/smartctl"

# Allow-list of snake_case attribute names. An interface attribute is exported
# as a "smartmon_<name>" metric only when its snake_case name appears here.
SMARTMON_ATTRS = {
    # Vendor-attribute style names (presumably reported by ATA/SATA drives -- confirm).
    "airflow_temperature_cel", "command_timeout", "current_pending_sector",
    "end_to_end_error", "erase_fail_count", "g_sense_error_rate",
    "hardware_ecc_recovered", "host_reads_32mib", "host_reads_mib",
    "host_writes_32mib", "host_writes_mib", "load_cycle_count",
    "media_wearout_indicator", "nand_writes_1gib", "offline_uncorrectable",
    "power_cycle_count", "power_on_hours", "program_fail_cnt_total",
    "program_fail_count", "raw_read_error_rate", "reallocated_event_count",
    "reallocated_sector_ct", "reported_uncorrect", "runtime_bad_block",
    "sata_downshift_count", "seek_error_rate", "spin_retry_count",
    "spin_up_time", "start_stop_count", "temperature_case",
    "temperature_celsius", "temperature_internal", "total_lbas_read",
    "total_lbas_written", "udma_crc_error_count", "unsafe_shutdown_count",
    "unused_rsvd_blk_cnt_tot", "wear_leveling_count", "workld_host_reads_perc",
    "workld_media_wear_indic", "workload_minutes",
    # Health-log style names (presumably reported by NVMe drives -- confirm).
    "critical_warning", "temperature", "available_spare",
    "available_spare_threshold", "percentage_used", "data_units_read",
    "data_units_written", "host_reads", "host_writes",
    "controller_busy_time", "power_cycles", "unsafe_shutdowns",
    "media_errors", "num_err_log_entries", "warning_temp_time",
    "critical_comp_time",
}
|
||
def run_command(command, parse_json=False):
    """
    Run a command and return what it wrote to standard output.

    The exit status is not checked, and standard error is captured but not
    returned.

    Args:
        command (List[str]): Program and arguments, passed to subprocess.run.
        parse_json (bool): When true, decode stdout as JSON instead of
            returning text.

    Returns:
        The decoded JSON value when `parse_json` is true, otherwise stdout
        with surrounding whitespace stripped.

    Raises:
        json.JSONDecodeError: If `parse_json` is true and stdout is not valid JSON.
    """
    completed = subprocess.run(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )
    output = completed.stdout
    if not parse_json:
        return output.strip()
    return json.loads(output)
|
||
def camel_to_snake(name):
    """
    Convert a CamelCase (or camelCase) string to snake_case.

    Every ASCII capital letter except one in first position gets an underscore
    in front of it; the result is then lower-cased. Consecutive capitals are
    split individually ("ABc" -> "a_bc").

    Reference: https://stackoverflow.com/questions/1175208/elegant-python-function-to-convert-camelcase-to-snake-case
    """
    separated = "".join(
        "_" + char if index and "A" <= char <= "Z" else char
        for index, char in enumerate(name)
    )
    return separated.lower()
|
||
def parse_device_info(device):
    """
    Produce Prometheus lines describing the device's identity and SMART status:
      - smartmon_device_info
      - smartmon_device_smart_available
      - smartmon_device_smart_enabled   (only when the device is SMART capable)
      - smartmon_device_smart_healthy   (only when capable and an assessment exists)

    Missing string fields are rendered as empty label values. The interface
    type is normalised once, so every metric emitted here carries the same
    `type` value (an unset interface is "" rather than the text "None").

    Args:
        device (Device): A pySMART Device object with attributes such as name, interface, etc.

    Returns:
        List[str]: A list of Prometheus formatted metric strings.
    """
    disk_type = device.interface or ""
    serial_number = (device.serial or "").lower()

    info_labels = {
        "disk": device.name,
        "type": disk_type,
        "vendor": device.vendor or "",
        "model_family": device.family or "",
        "device_model": device.model or "",
        "serial_number": serial_number,
        "firmware_version": device.firmware or "",
    }
    # Sorted by label name so the rendered line is deterministic.
    label_str = ",".join(f'{k}="{v}"' for k, v in sorted(info_labels.items()))

    # Short label set shared by the status metrics.
    metric_labels = f'disk="{device.name}",serial_number="{serial_number}",type="{disk_type}"'

    metrics = [
        f'smartmon_device_info{{{label_str}}} 1.0',
        f'smartmon_device_smart_available{{{metric_labels}}} {1.0 if device.smart_capable else 0.0}',
    ]

    # NOTE(review): nesting reconstructed from a flattened source; the health
    # metric is assumed to be emitted only for SMART-capable devices -- confirm.
    if device.smart_capable:
        metrics.append(
            f'smartmon_device_smart_enabled{{{metric_labels}}} {1.0 if device.smart_enabled else 0.0}'
        )
        if device.assessment:
            # Only an assessment of "PASS" (any letter case) counts as healthy.
            is_healthy = 1.0 if device.assessment.upper() == "PASS" else 0.0
            metrics.append(
                f'smartmon_device_smart_healthy{{{metric_labels}}} {is_healthy}'
            )

    return metrics
|
||
def parse_if_attributes(device):
    """
    Turn the numeric fields of device.if_attributes into Prometheus lines.

    For any device type (ATA, NVMe, SCSI, etc.) the public attribute names of
    device.if_attributes are converted to snake_case; each one that is listed
    in SMARTMON_ATTRS and holds an int or float becomes a
    "smartmon_<snake_name>" metric labelled with disk, serial_number and type.

    Args:
        device (Device): A pySMART Device object with attributes such as name, interface, etc.

    Returns:
        List[str]: A list of Prometheus formatted metric strings; empty when
        the device has no interface attributes.
    """
    metrics = []

    if_attributes = device.if_attributes
    if not if_attributes:
        return metrics

    disk = device.name
    disk_type = device.interface or ""
    serial_number = (device.serial or "").lower()
    labels = f'disk="{disk}",serial_number="{serial_number}",type="{disk_type}"'

    for attr_name in dir(if_attributes):
        if attr_name.startswith("_"):
            continue  # skip private / special members

        snake_name = camel_to_snake(attr_name)
        if snake_name not in SMARTMON_ATTRS:
            # Filter on the name first so attributes that are never exported
            # are not read at all.
            continue

        val = getattr(if_attributes, attr_name, None)
        if callable(val):
            continue  # skip methods

        if isinstance(val, (int, float)):
            metrics.append(f"smartmon_{snake_name}{{{labels}}} {float(val)}")

    return metrics
|
||
def _split_label_pairs(label_str):
    """Split 'k1="v1",k2="v2"' on the commas that lie outside double quotes."""
    pairs = []
    current = []
    in_quotes = False
    for char in label_str:
        if char == '"':
            in_quotes = not in_quotes
        if char == ',' and not in_quotes:
            pairs.append(''.join(current))
            current = []
        else:
            current.append(char)
    pairs.append(''.join(current))
    return pairs


def write_metrics_to_textfile(metrics, output_path=None):
    """
    Write metrics to a Prometheus textfile using prometheus_client.

    Each line is parsed back into a name, a label set and a value. One Gauge
    is registered per metric name, using the label names of the first line
    seen for that name. Label values are bound by name, so lines for the same
    metric may list their labels in any order. Commas and closing braces
    inside quoted label values are kept as part of the value.

    Args:
        metrics (List[str]): List of metric strings in 'name{labels} value' format
            (a bare 'name value' line without labels is also accepted).
        output_path (str): Path to write the metrics file. Defaults to node_exporter textfile collector path.

    Raises:
        ValueError: If a line cannot be split into name/labels/value, if the
            value is not a number, or if a line's label names do not match
            those the metric was first registered with.
    """
    registry = CollectorRegistry()
    metric_gauges = {}
    for metric in metrics:
        # Split metric into name, labels, and value
        metric_name, brace, rest = metric.partition('{')
        if brace:
            # Use the LAST '}' so a '}' inside a label value does not cut the line short.
            label_str, closing, value = rest.rpartition('}')
            if not closing:
                raise ValueError(f"Malformed metric line (no closing brace): {metric!r}")
        else:
            parts = metric.split(None, 1)
            if len(parts) != 2:
                raise ValueError(f"Malformed metric line (no value): {metric!r}")
            metric_name, value = parts
            label_str = ''
        metric_name = metric_name.strip()
        value = value.strip()

        # Parse labels into a dictionary (insertion order = order in the line)
        labels = {}
        for label in _split_label_pairs(label_str):
            if '=' in label:
                k, v = label.split('=', 1)
                labels[k.strip()] = v.strip('"')

        # Create Gauge if not already present
        if metric_name not in metric_gauges:
            help_str = f"SMART metric {metric_name}"
            metric_gauges[metric_name] = Gauge(metric_name, help_str, list(labels), registry=registry)

        # Set metric value
        gauge = metric_gauges[metric_name]
        if labels:
            gauge.labels(**labels).set(float(value))
        else:
            # A gauge registered without label names is set directly.
            gauge.set(float(value))

    if output_path is None:
        output_path = '/var/lib/node_exporter/textfile_collector/smartmon.prom'
    write_to_textfile(output_path, registry)  # Write all metrics to file
|
||
def main(output_path=None):
    """
    Collect SMART metrics for every device pySMART finds and write them out.

    Emits the smartctl version, then for each device a run timestamp and an
    "active" flag. Identity and attribute metrics are added only for devices
    considered active. A device is treated as inactive when smartctl reports
    power_mode "standby" or when the standby probe fails for any reason.

    Args:
        output_path (str): Destination passed to write_metrics_to_textfile;
            None selects that function's default path.
    """
    all_metrics = []

    # Best effort: any failure while probing the version reports "unknown".
    version_num = "unknown"
    try:
        version_output = run_command([SMARTCTL_PATH, "--version"])
        if version_output.startswith("smartctl"):
            version_num = version_output.splitlines()[0].split()[1]
    except Exception:
        version_num = "unknown"
    all_metrics.append(f'smartmon_smartctl_version{{version="{version_num}"}} 1')

    for dev in DeviceList().devices:
        disk_name = dev.name
        disk_type = dev.interface or ""
        serial_number = (dev.serial or "").lower()

        now_utc = datetime.datetime.now(datetime.timezone.utc)
        run_timestamp = int(now_utc.timestamp())
        all_metrics.append(f'smartmon_smartctl_run{{disk="{disk_name}",type="{disk_type}"}} {run_timestamp}')

        try:
            standby_json = run_command(
                [SMARTCTL_PATH, "-n", "standby", "-d", disk_type, "-j", disk_name],
                parse_json=True,
            )
            active = 0 if standby_json.get("power_mode", "") == "standby" else 1
        except Exception:
            # Undecodable JSON or any other failure: treat the disk as inactive.
            active = 0

        all_metrics.append(
            f'smartmon_device_active{{disk="{disk_name}",type="{disk_type}",serial_number="{serial_number}"}} {active}'
        )

        if active:
            all_metrics.extend(parse_device_info(dev))
            all_metrics.extend(parse_if_attributes(dev))

    write_metrics_to_textfile(all_metrics, output_path)
|
||
# Command-line entry point: an optional --output overrides the textfile path.
if __name__ == "__main__":
    import argparse

    cli = argparse.ArgumentParser(
        description="Export SMART metrics to Prometheus textfile format."
    )
    cli.add_argument(
        '--output',
        type=str,
        default=None,
        help='Output path for Prometheus textfile (default: /var/lib/node_exporter/textfile_collector/smartmon.prom)',
    )
    main(cli.parse_args().output)
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.