scrutiny: handle dell/megaraid

This commit is contained in:
Cyberes 2023-05-28 18:25:18 -06:00
parent 60f4329009
commit b98b0d5d96
1 changed file with 52 additions and 22 deletions

View File

@@ -2,6 +2,7 @@
import argparse import argparse
import subprocess import subprocess
import sys import sys
from datetime import datetime, timedelta
from typing import List from typing import List
import requests import requests
@@ -9,18 +10,29 @@ import requests
from checker import nagios from checker import nagios
def get_disk_wwn_ids() -> List[str]: def get_disk_wwn_ids(ignore_non_smart: bool = False) -> List[str] or bool:
wwn_ids = [] wwn_ids = []
try: try:
output = subprocess.check_output(["lsblk", "-o", "NAME,WWN,TYPE", "-d", "-n", "-p"]) output = subprocess.check_output(["sudo", "smartctl", "--scan"])
for line in output.decode("utf-8").strip().split("\n"): for line in output.decode("utf-8").strip().split("\n"):
parts = line.split() parts = line.split()
if len(parts) == 3: if len(parts) >= 3:
name, wwn, disk_type = parts device = parts[0]
if wwn != "0" and disk_type == "disk": device_type = parts[2].replace('scsi', 'sat,auto')
smart_supported = subprocess.check_output(["sudo", "smartctl", "-i", name]).decode("utf-8") try:
smart_supported = subprocess.check_output(["sudo", "smartctl", "-i", device, "-d", device_type]).decode("utf-8")
if "SMART support is: Enabled" in smart_supported: if "SMART support is: Enabled" in smart_supported:
wwn_ids.append(wwn) wwn_line = [line for line in smart_supported.split("\n") if "LU WWN Device Id" in line]
wwn_id = '0x' + wwn_line[0].replace('LU WWN Device Id: ', '').replace(' ', '')
wwn_ids.append(wwn_id)
# else:
# # TODO: warn if a drive doesn't support SMART
except subprocess.CalledProcessError as e:
if ignore_non_smart:
continue
else:
print(f"UNKNOWN: subprocess Error - {e}")
sys.exit(nagios.UNKNOWN)
except subprocess.CalledProcessError as e: except subprocess.CalledProcessError as e:
print(f"UNKNOWN: subprocess Error - {e}") print(f"UNKNOWN: subprocess Error - {e}")
sys.exit(nagios.UNKNOWN) sys.exit(nagios.UNKNOWN)
@@ -42,21 +54,29 @@ def get_smart_health(wwn_id: str, scrutiny_endpoint: str) -> dict:
def main(args): def main(args):
results = {} results = {}
wwn_ids = get_disk_wwn_ids() wwn_ids = get_disk_wwn_ids(args.ignore_non_smart)
for wwn_id in wwn_ids: for wwn_id in wwn_ids:
smart_health = get_smart_health(wwn_id, args.scrutiny_endpoint) smart_health = get_smart_health(wwn_id, args.scrutiny_endpoint)
name = f'/dev/{smart_health["data"]["device"]["device_name"]} {wwn_id}' # differentiate disks in RAID arrays
disk_results = { disk_results = {
'wwn_id': wwn_id, 'wwn_id': wwn_id,
# 'name': f'/dev/{smart_health["data"]["device"]["device_name"]}',
'failed_attributes': [], 'failed_attributes': [],
} }
metadata = smart_health['metadata'] metadata = smart_health['metadata']
if smart_health: # For testing
# print(f"Disk {wwn_id} SMART health:") # smart_health['data']['device']['UpdatedAt'] = '2023-04-28T23:00:03.071184465Z'
# print(json.dumps(smart_health, indent=2))
last_updated = datetime.strptime(smart_health['data']['device']['UpdatedAt'][:-4] + 'Z', '%Y-%m-%dT%H:%M:%S.%fZ')
if datetime.utcnow() - timedelta(hours=args.time_delta_limit) > last_updated:
metics_out_of_date = True
else:
metics_out_of_date = False
if smart_health:
for attribute_id, values in smart_health['data']['smart_results'][0]['attrs'].items(): for attribute_id, values in smart_health['data']['smart_results'][0]['attrs'].items():
if values['status'] == 0: if values['status'] == 0:
continue continue
@@ -64,11 +84,12 @@ def main(args):
# continue # continue
values['attribute_name'] = metadata[attribute_id]['display_name'] values['attribute_name'] = metadata[attribute_id]['display_name']
values['metadata'] = metadata[attribute_id] values['metadata'] = metadata[attribute_id]
if 'observed_thresholds' in values['metadata'].keys(): if 'observed_thresholds' in values['metadata'].keys():
del values['metadata']['observed_thresholds'] del values['metadata']['observed_thresholds']
disk_results['failed_attributes'].append(values) disk_results['failed_attributes'].append(values)
results[smart_health['data']['device']['device_name']] = disk_results results[name] = disk_results
crit_disks = {} crit_disks = {}
warn_disks = {} warn_disks = {}
@@ -94,55 +115,64 @@ def main(args):
dd = '<dd>' if args.html else '\t- ' dd = '<dd>' if args.html else '\t- '
dds = '</dd>' if args.html else '' dds = '</dd>' if args.html else ''
out_of_date_str = f'metrics are >{args.time_delta_limit} hrs out of date ' if metics_out_of_date else ''
return_code = nagios.OK return_code = nagios.OK
if len(crit_disks): if len(crit_disks):
return_code = nagios.CRITICAL return_code = nagios.CRITICAL
if len(warn_disks): if len(warn_disks):
x = f' and {len(warn_disks)} {"warning" if len(results) > 1 else "warnings"}' x = f' and {len(warn_disks)} {"warnings" if len(results) > 1 else "warning"}'
else: else:
x = '' x = ''
print(f'CRITICAL: {len(crit_disks)} {"error" if len(results) > 1 else "errors"}{x}') print(f'CRITICAL: {out_of_date_str + "and " if len(out_of_date_str) else ""}{len(crit_disks)} {"errors" if len(crit_disks) > 1 else "error"}{x}')
print('<dl>') print('<dl>')
print(f'{dt}Disks with Errors:{dts}') print(f'{dt}Disks with Errors:{dts}')
for disk, warns in crit_disks.items(): for disk, warns in crit_disks.items():
if args.html: if args.html:
disk_name = f'- <a href="{args.pretty_url}/web/device/{results[disk]["wwn_id"]}">/dev/{disk}</a>' disk_name = f'- <a href="{args.pretty_url}/web/device/{results[disk]["wwn_id"]}" target="_blank">{disk}</a>'
else: else:
disk_name = f'\t- /dev/{disk}' disk_name = f'\t- {disk}'
print(f'{dd}{disk_name}: {", ".join([x["display_name"] for x in warns])}{dds}') print(f'{dd}{disk_name}: {", ".join([x["display_name"] for x in warns])}{dds}')
print('</dl>', end='') print('</dl>', end='')
if len(out_of_date_str) and not len(crit_disks):
return_code = nagios.CRITICAL
w = "warnings" if len(warn_disks) > 1 else "warning"
print(f'CRITICAL: {out_of_date_str}{"and " + str(len(warn_disks)) + " " + w if len(warn_disks) else ""}')
if len(warn_disks): if len(warn_disks):
if return_code < nagios.CRITICAL: if return_code < nagios.CRITICAL:
return_code = nagios.WARNING return_code = nagios.WARNING
print(f'WARNING: {len(crit_disks)} {"warning" if len(results) > 1 else "warnings"}') print(f'WARNING: {len(warn_disks)} {"warnings" if len(warn_disks) > 1 else "warning"}')
print('<dl>') print('<dl>')
print(f'{dt}Disks with issues:{dts}') print(f'{dt}Disks with issues:{dts}')
for disk, warns in warn_disks.items(): for disk, warns in warn_disks.items():
if args.html: if args.html:
disk_name = f'- <a href="{args.pretty_url}/web/device/{results[disk]["wwn_id"]}">/dev/{disk}</a>' disk_name = f'- <a href="{args.pretty_url}/web/device/{results[disk]["wwn_id"]}" target="_blank">{disk}</a>'
else: else:
disk_name = f'\t- /dev/{disk}' disk_name = f'\t- {disk}'
print(f'{dd}{disk_name}: {", ".join([x["display_name"] for x in warns])}{dds}') print(f'{dd}{disk_name}: {", ".join([x["display_name"] for x in warns])}{dds}')
print('</dl>', end='') print('</dl>', end='')
if not len(crit_disks) and not len(warn_disks): if not len(crit_disks) and not len(warn_disks):
print(f'OK: all {len(results)} {"disk" if len(results) > 1 else "disks"} are healthy!', end='') print(f'OK: all {len(results.keys())} {"disks" if len(results.keys()) > 1 else "disk"} are healthy!', end='')
print(f"|'warnings'={len(warn_disks)};;; 'errors'={len(crit_disks)};;; 'num_disks'={len(results)};;;") print(f"|'warnings'={len(warn_disks)};;; 'errors'={len(crit_disks)};;; 'num_disks'={len(results.keys())};;;")
sys.exit(return_code) sys.exit(return_code)
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser(description='') parser = argparse.ArgumentParser(description='')
parser.add_argument('--scrutiny-endpoint', required=True, help='Base URL for scrutiny.') parser.add_argument('--scrutiny-endpoint', required=True, help='Base URL for scrutiny.')
parser.add_argument('--time-delta-limit', default=24, type=int, help='The Scrutiny data must not be older than this many hours. Default: 24.')
parser.add_argument('--warn-non-critical', action='store_true', help='Warn when a non-critical metric is marked as failed.') parser.add_argument('--warn-non-critical', action='store_true', help='Warn when a non-critical metric is marked as failed.')
parser.add_argument('--html', action='store_true', help='Print HTML.') parser.add_argument('--html', action='store_true', help='Print HTML.')
parser.add_argument('--pretty-url', help='The pretty URL to link to when printing HTML.') parser.add_argument('--pretty-url', help='The pretty URL to link to when printing HTML.')
parser.add_argument('--ignore-non-smart', action='store_true', help="Ignore any non-SMART devices and any devices that error when reading SMART.")
args = parser.parse_args() args = parser.parse_args()
if args.html and not args.pretty_url: if args.html and not args.pretty_url: