From f27eb2b9d05114206c96c4f438c42242146d988d Mon Sep 17 00:00:00 2001 From: sndl Date: Sun, 25 Jun 2017 02:09:57 +0700 Subject: [PATCH] Add check-smart-tests plugin (#71) * Add check-smart-tests plugin * Cleanup style to pass rubocop checks * Cleanup style to pass rubocop checks * Update README * Fix conditional typo * Fix typo, add additional conditional check * Fix conditional check * Add requested changes to increase readability of the code --- CHANGELOG.md | 3 + README.md | 5 ++ bin/check-smart-tests.rb | 179 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 187 insertions(+) create mode 100755 bin/check-smart-tests.rb diff --git a/CHANGELOG.md b/CHANGELOG.md index a0fa429..72f5f78 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,9 @@ This project adheres to [Semantic Versioning](http://semver.org/). This CHANGELOG follows the format listed at [Keep A Changelog](http://keepachangelog.com/) ## [Unreleased] +### Added +- check-smart-tests.rb: a plugin to check S.M.A.R.T. self tests status + ## [2.1.0] - 2017-05-04 ### Changed - check-disk-usage.rb: show the decimals for disk usage figures diff --git a/README.md b/README.md index 8f8a7ff..63f90db 100644 --- a/README.md +++ b/README.md @@ -52,11 +52,16 @@ Check the SMART status of hardrives and alert based upon a given set of threshol Check the health of a disk using `smartctl` +**check-smart-tests** + +Check the status of SMART offline tests and optionally check if tests were executed in a specified interval + ## Files * bin/check-disk-usage.rb * bin/check-fstab-mounts.rb * bin/check-smart-status.rb * bin/check-smart.rb + * bin/check-smart-tests.rb * bin/metrics-disk.rb * bin/metrics-disk-capacity.rb * bin/metrics-disk-usage.rb diff --git a/bin/check-smart-tests.rb b/bin/check-smart-tests.rb new file mode 100755 index 0000000..0ff209a --- /dev/null +++ b/bin/check-smart-tests.rb @@ -0,0 +1,179 @@ +#! /usr/bin/env ruby +# +# check-smart-tests.rb +# +# DESCRIPTION: +# This script checks S.M.A.R.T. 
#   self-tests status and optionally the time of the last test run
#
# OUTPUT:
#   plain text
#
# PLATFORMS:
#   Linux
#
# DEPENDENCIES:
#   gem: sensu-plugin
#
# USAGE:
#   check-smart-tests.rb # Use default options
#   check-smart-tests.rb -d /dev/sda,/dev/sdb -s 24 -l 336 # Check smart tests status for
#   /dev/sda and /dev/sdb devices, also check if short tests were run in last 24 hours and
#   extended tests were run in last 14 days (336 hours)
#
# NOTES:
#   The plugin requires smartmontools to be installed and smartctl utility in particular.
#
#   smartctl requires root rights to run, so you should allow sensu to execute
#   this command as root without password by adding following line to /etc/sudoers:
#
#   sensu ALL=(ALL) NOPASSWD: /usr/sbin/smartctl
#
#   Tested only on Debian.
#
# LICENSE:
#   Stanislav Sandalnikov
#   Released under the same terms as Sensu (the MIT license); see LICENSE
#   for details.

require 'sensu-plugin/check/cli'

# A single block device together with the smartctl data the check needs.
# Both smartctl queries are executed once, at construction time.
class Device
  # Keys assigned, in order, to the columns of one self-test log row.
  SELFTEST_HEADERS = %w(num test_description status remaining lifetime lba_of_first_error).freeze

  # name - device path handed to smartctl
  # pwh  - power-on hours as reported by smartctl (String), or nil when unknown
  # str  - self-test results: Array of Hashes keyed by SELFTEST_HEADERS
  attr_accessor :name, :pwh, :str

  # name                - device path, e.g. "/dev/sda"
  # smartctl_executable - path to the smartctl binary
  def initialize(name, smartctl_executable)
    @name = name
    @exec = smartctl_executable
    @pwh = poweron_hours
    @str = selftest_results
  end

  # Reads the Power_On_Hours row of `smartctl -A`.
  #
  # Returns the tenth whitespace-separated column of that row (the raw value
  # column of smartctl's attribute table) as a String, or nil when the device
  # reports no such attribute.
  def poweron_hours
    smartctl_output('-A').split("\n").each do |line|
      columns = line.split
      return columns[9] if columns[1] == 'Power_On_Hours'
    end

    # Without this, the method would return the Array of lines that `each`
    # evaluates to, and callers doing `pwh.to_i` would raise NoMethodError.
    nil
  end

  # Parses the rows of `smartctl -l selftest` that start with '#'.
  #
  # Returns an Array with one Hash per row, keyed by SELFTEST_HEADERS; columns
  # that are missing from a row map to nil.
  def selftest_results
    smartctl_output('-l selftest').split("\n").grep(/^#/).map do |row|
      # Columns are separated by runs of two or more whitespace characters.
      # Non-mutating gsub is required: gsub! returns nil when it changes nothing.
      fields = row.gsub(/\s\s+/m, "\t").split("\t")
      SELFTEST_HEADERS.zip(fields).to_h
    end
  end

  private

  # Runs smartctl through sudo with the given argument string against this
  # device and returns whatever it printed on stdout.
  def smartctl_output(args)
    `sudo #{@exec} #{args} #{@name}`
  end
end

# Sensu check that inspects the self-test log of every selected device.
#
# The check is critical when the first log entry of any device neither completed
# without error nor is still in progress. It is a warning when a device has no
# logged tests, or when the optional short/extended test intervals are exceeded.
class CheckSMARTTests < Sensu::Plugin::Check::CLI
  option :executable,
         long: '--executable EXECUTABLE',
         short: '-e EXECUTABLE',
         default: '/usr/sbin/smartctl',
         description: 'Path to smartctl executable'
  option :devices,
         long: '--devices *DEVICES',
         short: '-d *DEVICES',
         default: 'all',
         description: 'Comma-separated list of devices to check, i.e. "/dev/sda,/dev/sdb"'
  option :short_test_interval,
         long: '--short_test_interval INTERVAL',
         short: '-s INTERVAL',
         description: 'If more time than this value passed since last short test run, then warning will be raised'
  option :long_test_interval,
         long: '--long_test_interval INTERVAL',
         short: '-l INTERVAL',
         description: 'If more time than this value passed since last extended test run, then warning will be raised'

  def initialize
    super
    @devices = []
    @warnings = []
    @criticals = []
    set_devices
  end

  # Fills @devices either from the --devices list or, for the default 'all',
  # from every device lsblk reports except loop devices.
  def set_devices
    names =
      if config[:devices] == 'all'
        `lsblk -plnd -o NAME`.split.reject { |name| name =~ %r{/dev/loop.*} }
      else
        config[:devices].split(',')
      end

    names.each { |name| @devices.push(Device.new(name, config[:executable])) }
  end

  # Evaluates one device and appends any findings to @warnings / @criticals.
  #
  # dev - a Device
  def check_tests(dev)
    if dev.str.empty?
      @warnings << "#{dev.name}: No self-tests have been logged."
      return
    end

    # The first log entry is treated as the most recent test.
    last_status = dev.str.first['status']
    passed = last_status == 'Completed without error'
    running = last_status =~ /Self-test routine in progress/
    @criticals << "#{dev.name}: Last test failed - #{last_status}" unless passed || running

    intervals = [config[:short_test_interval], config[:long_test_interval]]
    if dev.pwh.nil? && intervals.any?
      # Report once per device instead of silently treating the age as zero.
      @warnings << "#{dev.name}: Power_On_Hours attribute not found, age of the last tests cannot be verified"
    end

    check_test_age(dev, 'Short offline', 'short', config[:short_test_interval])
    check_test_age(dev, 'Extended offline', 'extended', config[:long_test_interval])
  end

  # Runs the check over all devices and exits with the most severe status found.
  def run
    @devices.each do |device|
      check_tests(device)
    end

    if @criticals.any?
      critical @criticals.join(' ')
    elsif @warnings.any?
      warning @warnings.join(' ')
    else
      ok 'All devices are OK'
    end
  end

  private

  # Warns when the first logged test of the given kind is older than `interval`
  # power-on hours, or when the log holds no test of that kind at all.
  #
  # dev         - a Device
  # description - value of the 'test_description' column to look for
  # label       - word used for this kind of test in the warning text
  # interval    - maximum allowed age in hours (anything responding to to_i);
  #               nil disables the check
  def check_test_age(dev, description, label, interval)
    return if interval.nil?

    latest = dev.str.find { |test| test['test_description'] == description }

    if latest.nil?
      @warnings << "#{dev.name}: No #{label} tests were run for this device in last #{dev.str.length} executions"
    elsif dev.pwh && dev.pwh.to_i - latest['lifetime'].to_i > interval.to_i
      @warnings << "#{dev.name}: More than #{interval} hours passed since the last #{label} test"
    end
  end
end