forked from niclan/check_smart
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcheck_ssdwear
executable file
·567 lines (452 loc) · 17.8 KB
/
check_ssdwear
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
#!/usr/bin/perl
# check_ssd: Check the remaining lifetime/wear of SSD disks.
# - We want warnings well in advance of the disks expiring so
# thresholds set at 6 and 3 months from nominal expiry.
#
# Please see documentation below.
use v5.10;
use strict;
use warnings;
use Pod::Usage; # Bundled
use Getopt::Std;
use Scalar::Util; # Bundeled
use Data::Dumper;
# Plugin states in exit-code order: the position of a name in @states is
# the numeric value used both as exit status and as index for printing.
my @states = ('OK', 'WARNING', 'CRITICAL', 'UNKNOWN');

# Reverse lookup: state name => numeric value.
my %stateval;
@stateval{@states} = (0 .. $#states);

# Convenience scalars for the four states.
my ($OK, $WARNING, $CRITICAL, $UNKNOWN) = @stateval{@states};
=pod
=head1 NAME
check_ssdwear - Check all SSD (including nvme) disks in the system for wear left
=head1 SYNOPSIS
check_ssdwear [options]
-h Help - this text.
-w <months> Warn if there are less than months left of wear.
Default is 6 months. If the disk contains the
root file system the plugin will report CRITICAL
instead.
-c <months> Critical if there are less than months left of wear.
Default is 3 months.
-p <percent> Warning if there is percent% or less left of wear. No
default but if a non-zero threshold is supplied by
smartctl this will be used unless this option is specified.
-q <percent> Critical if there is percent% or less left of wear
No default.
-n OK|WARNING|CRITICAL|UNKNOWN
State to report if there are no SSDs in the system
-u OK|WARNING|CRITICAL|UNKNOWN
State to report if there is an unsupported SSD in the
system. This MUST be fixed by updating the plugin as
the plugin will report this state regardless of status
for any other devices.
-h This message
NOTE! The program uses sudo to run smartctl and nvme, so your
nagios/nrpe user needs sudo for this.
NOTE! Do not use this in an alarm that wakes people up at night. Disk
weardown is NOT that acute! Or is it?
NOTE! Use this plugin in combination with SMART plugin that monitors
other health parameters of the disks. In fact this plugin even
ignores the "SMART overall-health self-assessment test result"
focusing only on the wear and lifetime.
Please refer to C<perldoc check_ssdwear> for full docs on hardware,
caveats in the lifetime calculation, hardware support and usage
restrictions.
=head1 REQUIRED PROGRAMS
External programs are required: sudo, lsblk, smartctl, nvme
(nvme-cli, only if host has /dev/nvme* devices).
The program uses sudo to run smartctl and nvme, so your nagios/nrpe
user needs sudo for this.
=head1 LIFETIME CALCULATION
This plugin aims to estimate the remaining lifespan of SSDs in terms
of months based on the wear percentage each time it is run. However,
due to the high number of hours it takes for each percentage point of
wear, the estimates will be inaccurate and vary over time.
Because: 1% of wear can be up to many tens of months. In this case
the wear might not matter because the disk will probably live longer
than the host it serves. For the disks this was developed with it was
about 8 months pr. percentage point of wear.
As long as the wear left percentage is high the calculation will be
very inaccurate so the plugin will only report "many" months of life left for
wear percentages at or above 90%.
Also, to avoid underwarn this plugin subtracts 1 from the percentage
in the calculation.
If this all feels too unreliable, use the -p and -q options to set
percentage-based thresholds. Although you cannot disable
the -w and -c checks, you can set them to 0.
=head1 DEVICE SUPPORT
Different SSD disks use different SMART attributes to report wear,
therefore this plugin does not support every SSD.
=head2 Supported devices
Hardware reporting the S.M.A.R.T. attributes shown below is supported.
The attribute number is supplied in parentheses, but I've seen that
different manufacturers use the same attribute name but different
numbers. This plugin only matches the attribute name.
For ssd disks the attributes are collected with smartctl(1). In case
of nvme disks the attributes are collected with nvme(1).
=over 4
=item Media_Wearout_Indicator (233)
This is supported by (some?) Intel SSDs. It reports the nominal wear
left on the device in percent. The drives I've looked at reports a
threshold value of 0.
The raw value of this attribute is always 0.
=item Wear_Leveling_Count (177)
This is supported by (some?) Samsung SSDs. It reports the nominal
wear left on the device in percent. The drives I've looked at reports
a threshold value of 5% but superuser.com shows screen dumps where the
threshold is 0.
The raw value for this attribute increases with wear but is useless
for this plugin unless we can determine what the maximum value is, or
how it relates to the percentage - which will vary from device to
device and with disk capacity.
=item "Percentage used"
This is for nvme devices from nvme smart-log. In nvme(1)'s JSON
output(1), it is called "percent_used", while smartctl(1) output is
"percentage used". The value goes from 0% (unused) to 100% (nominal
life) and is capped at 255%.
=item Other parameters
Other parameters observed on StackExchange and similar, they work the
same as the above attributes
169 Remaining_Lifetime_Perc (Adata, Innodisk)
201 Lifetime_Remaining%
202 Percent_Lifetime_Remain (Crucial)
231 SSD_Life_Left (Sandforce/Kingston)
=back
=head2 Unsupported devices
The plugin ignores devices that `lsblk -o NAME,MODEL` shows as
"LOGICAL VOLUME". These might be hardware RAID drives etc. Hopefully
you have some hardware specific plugin to monitor the health of these
drives.
If your server has unsupported SSDs the plugin will report a "UNKNOWN"
error and identify the first unsupported device. If so you will need
to update the plugin to either ignore or support the specific
hardware. Please provide PR if you decide to support the device.
If you would like to support more devices PRs are welcome. Please
take the trouble of looking up your device's data sheet to see what the
manufacturer says about how the attribute works and update the
documentation above.
This plugin does not support disks hidden behind RAID
controllers. Some RAID controllers report that the RAID device is
non-rotational and so some effort is spent to ignore these. Patches
welcome, both for ignoring them and supporting them if there are SSDs
behind there.
=head1 BUGS
Probably.
=head1 COPYRIGHT AND LICENSE
(C) Copyright 2023 Schibsted Products and Technology. Written by
Nicolai Langfeldt ([email protected]).
License: Apache 2.0
=head2 GPT, AI assistance
I have experimented with ChatGPT(3.5) and GPT4 while writing this
plugin. After all it was tweeted that it could produce perfectly
working programs in one try.
The good news: GPT is quite good at structuring code. It also taught
me about the ROTA column for lsblk which made things a lot easier.
Given error messages from python it immediately understood that I was
using python 2 not python3 (old server running CentOS 7) and provided
corrected code that worked in python 2.
https://github.com/yunal/notes/issues/4
Also see https://github.com/yunal/notes/issues/3 for some quite
impressive keeping track of details and raising the abstraction
stakes.
The bad news: It of course can't test the programs and has trouble
with what regular expressions really do, and which parts of strings
they pull out.
Overall GPT3.5 and GPT4 appears to be a trainee or junior programmer
that writes programs with perfect syntax in both shell, perl and
python. It makes some rather obvious mistakes with regards to which
columns of output to use and finds it hard to write (more) correct
code when the mistakes are pointed out.
Some of the documentation text was suggested or influenced by
GPT4. GPT4 also provided proofreading. I was unable to get GPT4 to
write this plugin for me. I also asked GPT4 to suggest improvements
to the code but the suggestions would either not work or make the code
harder to read.
=cut
# ---- Command line handling -------------------------------------------
# -h          print usage
# -w / -c     warning / critical threshold in months (defaults 6 and 3)
# -p / -q     warning / critical threshold in percent (no defaults)
# -n / -u     state NAME to report when no SSD / an unsupported SSD is seen
my %opts;
getopts('hw:c:n:u:p:q:', \%opts);
usage() if exists $opts{'h'};

my $warn_mo = $opts{'w'} // 6;
my $crit_mo = $opts{'c'} // 3;

# NOTE(review): $warn_pct and $crit_pct are parsed here but are not
# referenced anywhere else in the visible source, so -p and -q currently
# have no effect on the reported state.
my $warn_pct = $opts{'p'};
my $crit_pct = $opts{'q'};

# The thresholds are compared numerically later on; reject anything that
# is not a number instead of producing "isn't numeric" warnings and
# comparisons against 0.
for my $threshold ($warn_mo, $crit_mo, $warn_pct, $crit_pct) {
    next unless defined $threshold;
    usage() unless Scalar::Util::looks_like_number($threshold);
}

my $nossd   = $OK;        # What to report if no SSDs are seen
my $unknown = $UNKNOWN;   # What to report if unknown SSD is seen

# State names are looked up case-insensitively.  An unrecognised name
# gives undef, which previously slipped past the "< 0" test and was later
# used as exit status 0 (OK); treat it as a usage error instead.
$nossd   = $stateval{ uc $opts{'n'} } if exists $opts{'n'};
$unknown = $stateval{ uc $opts{'u'} } if exists $opts{'u'};
usage() unless defined $nossd && defined $unknown;
# usage: print the SYNOPSIS section of this file's POD and terminate.
#
# pod2usage() terminates the program itself; called without arguments it
# exits with status 2 (which a monitoring system reads as CRITICAL) and
# writes to STDERR, so the "exit 0" that used to follow it was never
# reached.  Exit with UNKNOWN instead, and print to STDOUT so the text is
# visible to whatever runs the plugin.
sub usage {
    pod2usage(
        -exitval => $UNKNOWN,
        -verbose => 0,
        -output  => \*STDOUT,
    );
}
# _is_disk_or_partition: true if $dev is the whole disk $disk itself or
# one of its partitions, judged by name only.
#
# Partition names are the disk name plus a number ("sda" -> "sda1"), with
# a "p" in between when the disk name itself ends in a digit
# ("nvme0n1" -> "nvme0n1p1").  The disk name is quoted so that it is
# matched literally, and the match is anchored at both ends so that
# /dev/sda does not claim /dev/sdaa or /dev/sdaa1.
sub _is_disk_or_partition {
    my ($dev, $disk) = @_;
    return 1 if $dev eq $disk;
    my $sep = $disk =~ /\d$/ ? 'p' : '';
    return $dev =~ /^\Q$disk\E$sep\d+$/ ? 1 : 0;
}

# get_disks: find the SSDs in the system.
#
# Returns three hash references, all keyed by full device path
# (e.g. /dev/sda):
#   1. devices considered to be SSDs (non-rotational physical disks),
#   2. devices that (directly or through a child device) hold the root
#      file system,
#   3. devices that are in use (have holders in /sys or have partitions).
#
# Prints an UNKNOWN message and exits if lsblk fails.
sub get_disks {
    my %ssd;
    my %rootdisk;
    my %in_use;
    my @physical;    # top-level devices we are willing to look at
    my %toplevel;    # every top-level device lsblk knows about

    my $lsblk = `lsblk -pdo NAME,MODEL 2>/dev/null`;
    if ($?) {
        printf("UNKNOWN: lsblk exited with error code %d\n", $? >> 8);
        exit $UNKNOWN;
    }
    unless ($lsblk) {
        # No disks at all
        return \%ssd, \%rootdisk, \%in_use;
    }

    # "Physical" disks with "MODEL" == 'LOGICAL VOLUME' or '' (empty
    # string) are ignored.  If you know how to support SSDs behind RAID
    # controllers etc, please send patches.
    foreach my $line (split(/\n/, $lsblk)) {
        $line =~ s/\s+$//;    # No trailing space
        my ($dev, $model) = split(/\s+/, $line, 2);

        # With -p every device is a full path; this skips the headline.
        next unless defined $dev && $dev =~ m{^/};

        # Remember all top-level devices, also the ones ignored below, so
        # that the second pass can tell where a new device tree starts.
        $toplevel{$dev} = 1;

        next unless $model;
        next if $model =~ /^MODEL$/;             # Headline
        next if $model =~ /^MR\d\d\d\d/;         # Megaraid
        next if $model =~ /^LOGICAL VOLUME$/;    # HP Raid

        push(@physical, $dev);

        (my $sysname = $dev) =~ s{^/dev/}{};
        my @holders = glob("/sys/block/${sysname}/holders/*");
        $in_use{$dev} = 1 if @holders;
    }

    $lsblk = `lsblk -rpo NAME,ROTA,MOUNTPOINT 2>/dev/null`;
    unless ($lsblk) {
        say "UNKNOWN: lsblk command did not work";
        exit $UNKNOWN;
    }

    # In raw mode lsblk lists each top-level device followed by its
    # children (partitions, LVM volumes, ...).  $basedev tracks the
    # top-level device the current line belongs to.  It must only change
    # on top-level devices: every name starts with "/" because of -p, so
    # testing for a leading "/" would make each partition its own base
    # device and the root file system would never be attributed to the
    # physical disk.
    my $basedev = '';
    foreach my $line (split(/\n/, $lsblk)) {
        my ($dev, $rot, $mount) = split(/\s/, $line);

        # ignore header and empty lines
        next if !defined $dev || $dev eq 'NAME';

        $basedev = $dev if $toplevel{$dev};
        $rootdisk{$basedev} = 1 if defined $mount && $mount eq '/';

        # ignore rotating disks
        next unless defined $rot && $rot eq '0';

        for my $phys (@physical) {
            next unless _is_disk_or_partition($dev, $phys);

            # We only store the main device, not each partition
            $ssd{$phys} = 1;

            # A partition. Mark the physical device as in use
            $in_use{$phys} = 1 if $dev ne $phys;
        }
    }

    return \%ssd, \%rootdisk, \%in_use;
}
# get_param: look up one column of a parsed SMART parameter.
#
#   $smart  hash reference, parameter name => array reference of columns
#           (or a plain scalar value)
#   $param  parameter name
#   $index  column number; negative numbers count from the end
#
# Returns the requested value.  Returns undef when the parameter is not
# present, or when it holds a plain scalar and a column other than
# 0 / -1 was asked for.
sub get_param {
    my ($smart, $param, $index) = @_;

    return undef unless exists $smart->{$param};

    my $entry = $smart->{$param};
    if (ref $entry eq 'ARRAY') {
        my $pos = $index < 0 ? $index + scalar @{$entry} : $index;
        return $entry->[$pos];
    }

    # A plain scalar only has a "first" and a "last" column.
    return $entry if $index == 0 || $index == -1;
    return undef;
}
# months_from_percentage: estimate the months of life left for one disk.
#
#   $hours     power-on hours so far
#   $wear      percentage of wear left
#   $disk      device name, used to look the disk up in $rootdisk
#   $rootdisk  hash reference of devices holding the root file system
#
# Returns a list: the state for this disk and a hash reference with the
# keys wear, monthsleft and isroot.
sub months_from_percentage {
    my ($hours, $wear, $disk, $rootdisk) = @_;

    my $on_root = $rootdisk->{$disk};

    # With more than 90% left there is no estimate:
    #   - at 100% the calculation would divide by zero
    #   - with so little wear the result would be very inaccurate
    if ($wear > 90) {
        return ( $OK, { wear       => $wear,
                        monthsleft => 'many',
                        isroot     => $on_root // 0 } );
    }

    # 1% is still many months (8 months for some of our disks) and the
    # drive might not ever report any number lower than 1, so round down.
    my $left = $wear - 1;
    my $used = 100 - $left;
    my $months = int( ( ( $hours / $used ) ) * $left / ( 24 * 30 ) );

    # Below the warning threshold a root disk counts as critical.
    my $verdict = $months < $crit_mo ? $CRITICAL
                : $months < $warn_mo ? ( $on_root ? $CRITICAL : $WARNING )
                :                      $OK;

    return ( $verdict, { wear       => $left,
                         monthsleft => $months,
                         isroot     => $on_root // 0 } );
}
# _parse_smart_output: turn the text printed by "smartctl -A" or
# "nvme smart-log" into a hash reference.
#
# Keys are lower-cased parameter names, values are array references:
#   - attribute table rows (ID# ATTRIBUTE_NAME FLAG VALUE ...): the
#     columns following the name, so index 0 is FLAG, index 1 is VALUE
#     and the last element is the last word of RAW_VALUE;
#   - "name: value" lines: a single-element array with the value.
#
# Returns an empty hash reference when there is nothing to parse.
sub _parse_smart_output {
    my ($smart) = @_;
    my $data = {};

    return $data unless defined $smart && length $smart;

    $smart =~ s/^\s+//gm;    # Remove leading space on all lines

    # The attribute table is recognised by its heading line.  Everything
    # else is treated as "name: value" lines.  (The old test for a "==="
    # section marker could only match if the whole output ended in "===",
    # so smartctl's name/value output was fed to the table parser.)
    if ($smart =~ s/\A.*RAW_VALUE\n//s) {
        # Format like:
        # ID# ATTRIBUTE_NAME FLAG VALUE WORST THRESH TYPE UPDATED WHEN_FAILED RAW_VALUE
        # 5 Reallocated_Sector_Ct 0x0032 100 100 000 Old_age Always - 0
        for my $line (split(/\n/, $smart)) {
            my ($id, $param, @values) = split(/\s+/, $line);

            # Skip anything that is not an attribute row
            next unless defined $param && $id =~ /^\d+$/;

            $param = lc $param;
            $data->{$param} = [];
            for my $value (@values) {
                if ($value =~ /^0x[0-9a-f]{1,8}$/) {
                    # hex() will only decode 32 bit values, so leave 64-bit in place
                    $value = hex($value);
                } elsif ($value =~ /^0\d*$/) {
                    # This is *not* octal
                    $value = $value + 0;
                } elsif ($value =~ /^\d+([.,]\d{3})+$/) {
                    # you have to love decimal separators...
                    $value =~ s/[.,]//g;
                }
                push(@{$data->{$param}}, $value);
            }
        }
    } else {
        # This handles format like (smartctl -A):
        # === START OF SMART DATA SECTION ===
        # SMART/Health Information (NVMe Log 0x02)
        # Critical Warning: 0x00
        # Temperature: 33 Celsius
        #
        # Or (nvme smart-log):
        # Smart Log for NVME device:nvme0n1 namespace-id:ffffffff
        # critical_warning : 0x4
        $smart =~ s/\A.*===\n//s;    # Drop everything up to the section marker

        for my $line (split(/\n/, $smart)) {
            my ($param, $value) = split(/\s*:\s*/, $line, 2);
            next unless defined $value;

            $param =~ tr/A-Z /a-z_/;

            # Remove spaces used as thousands separators, e.g. "1 250".
            # This is done per value and for every group; the former
            # single substitution on the whole text only fixed the first
            # digit-space-digit pair it came across.
            $value =~ s/(?<=\d) (?=\d{3}(?!\d))//g;

            if ($value =~ /^0x[0-9a-f]{1,8}$/) {
                # hex() will only decode 32 bit values, so leave 64-bit in place
                $value = hex($value);
            } elsif ($value =~ /^(\d+)%$/) {
                # turn percentage into pure integer
                $value = $1;
            } elsif ($value =~ /^(\d+(?:[.,]\d{3})*)(?: \[.*\])?$/) {
                # throw away " [314 TB]"
                $value = $1;
                # you have to love decimal separators...
                $value =~ s/[.,]//g;
            }
            $data->{$param} = [$value];
        }
    }

    return $data;
}

# smart_data: collect the SMART attributes of one device.
#
# Runs "sudo nvme smart-log" for devices with "/nvme" in their name and
# "sudo smartctl -A" for everything else, and returns the parsed output
# as a hash reference (see _parse_smart_output).
#
# Prints an UNKNOWN message and exits if smartctl reports that it cannot
# detect the device type.
sub smart_data {
    my ($dev) = @_;
    my $smart;

    if ($dev =~ m{/nvme}) {
        # Munge the "human friendly" output into a hash
        $smart = `sudo nvme smart-log $dev`;
    } else {
        $smart = `sudo smartctl -A $dev`;
        if (defined $smart && $smart =~ /Unable to detect device type/) {
            say "UNKNOWN: smartctl too old to recognize $dev";
            exit $UNKNOWN;
        }
    }

    return _parse_smart_output($smart);
}
# get_lifetime: for the found ssd disks calculate the lifetime left on
# the disk and work out the overall state.
#
#   $disks     hash reference, keys are the devices to examine
#   $rootdisk  hash reference of devices holding the root file system
#   $in_use    hash reference of devices that are in use
#
# Returns a list:
#   - the worst state seen among the disks that are in use,
#   - the disk that caused that state ('none' if the state is OK and no
#     disk had a numeric estimate); among disks with the same state the
#     one with the fewest months left is chosen,
#   - a hash reference, device => finding (keys wear, monthsleft, in_use
#     and, for supported disks, isroot).
sub get_lifetime {
    my ($disks, $rootdisk, $in_use) = @_;

    my $no_estimate = 99999999;    # stands in for "no numeric months value"

    my %findings;
    my $worst_disk = 'none';
    my $worst_mo   = $no_estimate;
    my $state      = $OK;

    foreach my $disk (keys %{$disks}) {
        my $smart = smart_data($disk);
        my $hours = get_param($smart, 'power_on_hours', -1);

        # You could add other wear related attributes here, and have a
        # different calculation below to fit it.

        # Attributes giving the percentage of wear LEFT, first hit wins:
        #   media_wearout_indicator   Intel
        #   ssd_life_left             Sandforce/Kingston
        #   remaining_lifetime_perc   Adata, innodisk
        #   wear_leveling_count       Samsung
        my $wear;
        for my $attr (qw(media_wearout_indicator ssd_life_left
                         remaining_lifetime_perc wear_leveling_count)) {
            $wear = get_param($smart, $attr, 1);
            last if defined $wear;
        }

        # Only fall back when no attribute was found at all.  A reported
        # value of 0 means "no wear left" and must not be mistaken for a
        # missing attribute.
        if (!defined $wear) {
            # Check for some attributes that count from 0 to 100 instead of 100 to 0.
            # NVMe reports usage which starts at 0% and can run as high as 254%
            my $percent_used = get_param($smart, "percent_used", 0);
            $percent_used = get_param($smart, "percentage_used", 0) unless defined $percent_used;
            # NOTE(review): going by their names, and by the "Other
            # parameters" section of the POD, the next two attributes
            # look like "remaining" percentages, yet they are inverted
            # here like "used" percentages - confirm against a real device.
            $percent_used = get_param($smart, "lifetime_remaining%", 1) unless defined $percent_used;
            $percent_used = get_param($smart, "percent_lifetime_remain", 1) unless defined $percent_used;
            $wear = 100 - $percent_used if defined $percent_used;
        }

        # When adding For other kinds of attributes or calculations
        # probably move this into different procedures for each kind of
        # calculation.
        my $diskstate;
        my $finding;
        if ($hours && defined $wear) {
            ($diskstate, $finding) =
                months_from_percentage($hours, $wear, $disk, $rootdisk);
        } else {
            $diskstate = $UNKNOWN;
            $finding = {
                monthsleft => 'unknown',
                wear       => -1,
            };
        }
        $finding->{in_use} = $in_use->{$disk};
        $findings{$disk} = $finding;

        # Disks that are not in use, or that have 'many' months left,
        # never influence the overall state.
        my $months_left = $finding->{monthsleft};
        next unless $finding->{in_use} && $months_left ne 'many';

        # Keep the non-numeric 'unknown' out of the numeric comparisons.
        my $months = Scalar::Util::looks_like_number($months_left)
            ? $months_left
            : $no_estimate;

        if ($diskstate > $state) {
            # A worse state always wins, so that the disk named in the
            # message is the one that caused the reported state.
            $state      = $diskstate;
            $worst_disk = $disk;
            $worst_mo   = $months;
        } elsif ($diskstate == $state && $months < $worst_mo) {
            # Same state: prefer the disk that is expected to die first.
            $worst_disk = $disk;
            $worst_mo   = $months;
        }
    }

    return $state, $worst_disk, \%findings;
}
# ---- Main program -----------------------------------------------------

# Force the C locale so the output of the external tools is predictable.
$ENV{'LC_ALL'} = 'C';

my ($ssd, $rootdisk, $in_use) = get_disks();

# Nothing to check: report the state configured for that case.
unless (scalar keys %{$ssd}) {
    say "$states[$nossd]: I can see no SSD disks here";
    exit $nossd;
}

my ($state, $worst, $lifetimes) = get_lifetime($ssd, $rootdisk, $in_use);

# Details of the worst disk are only needed when something is wrong.
my ($isroot, $expectancy) = (0, 0);
if ($state > 0) {
    $isroot     = $lifetimes->{$worst}->{isroot};
    $expectancy = $lifetimes->{$worst}->{monthsleft};
}

# One entry per disk, sorted by device name.
my $statesum = "Wear left: ";
for my $disk (sort keys %{$lifetimes}) {
    my $info = $lifetimes->{$disk};

    my $tag;
    if ($info->{isroot}) {
        $tag = " (root)";
    } elsif ($info->{in_use}) {
        $tag = "";
    } else {
        $tag = " (not in use)";
    }

    $statesum .= sprintf("%s%s: %s months (%d%%); ",
                         $disk, $tag, $info->{monthsleft}, $info->{wear});
}

if ($state == $OK) {
    say "OK: No disks at risk. $statesum";
    exit $OK;
}

# Describe what is wrong with the worst disk.
my $expect = "is unsupported";
if ($state != $UNKNOWN) {
    my $month = abs(int($expectancy)) == 1 ? "month" : "months";
    if ($expectancy < 0) {
        $expect = sprintf("was expected to die %d %s ago", abs($expectancy), $month);
    } else {
        $expect = sprintf("is expected to die in %d %s", $expectancy, $month);
    }
} else {
    # An unsupported disk is reported with the state chosen with -u.
    $state = $unknown;
}

my $subject = $worst . ($isroot ? " (root disk)" : "");
printf("%s: %s %s! %s\n", $states[$state], $subject, $expect, $statesum);
exit $state;