diff --git a/check_scrutiny_disks.py b/check_scrutiny_disks.py index 1c8756b..fe3d015 100755 --- a/check_scrutiny_disks.py +++ b/check_scrutiny_disks.py @@ -2,6 +2,7 @@ import argparse import subprocess import sys +from datetime import datetime, timedelta from typing import List import requests @@ -9,18 +10,29 @@ import requests from checker import nagios -def get_disk_wwn_ids() -> List[str]: +def get_disk_wwn_ids(ignore_non_smart: bool = False) -> List[str] or bool: wwn_ids = [] try: - output = subprocess.check_output(["lsblk", "-o", "NAME,WWN,TYPE", "-d", "-n", "-p"]) + output = subprocess.check_output(["sudo", "smartctl", "--scan"]) for line in output.decode("utf-8").strip().split("\n"): parts = line.split() - if len(parts) == 3: - name, wwn, disk_type = parts - if wwn != "0" and disk_type == "disk": - smart_supported = subprocess.check_output(["sudo", "smartctl", "-i", name]).decode("utf-8") + if len(parts) >= 3: + device = parts[0] + device_type = parts[2].replace('scsi', 'sat,auto') + try: + smart_supported = subprocess.check_output(["sudo", "smartctl", "-i", device, "-d", device_type]).decode("utf-8") if "SMART support is: Enabled" in smart_supported: - wwn_ids.append(wwn) + wwn_line = [line for line in smart_supported.split("\n") if "LU WWN Device Id" in line] + wwn_id = '0x' + wwn_line[0].replace('LU WWN Device Id: ', '').replace(' ', '') + wwn_ids.append(wwn_id) + # else: + # # TODO: warn if a drive doesn't support SMART + except subprocess.CalledProcessError as e: + if ignore_non_smart: + continue + else: + print(f"UNKNOWN: subprocess Error - {e}") + sys.exit(nagios.UNKNOWN) except subprocess.CalledProcessError as e: print(f"UNKNOWN: subprocess Error - {e}") sys.exit(nagios.UNKNOWN) @@ -42,21 +54,29 @@ def get_smart_health(wwn_id: str, scrutiny_endpoint: str) -> dict: def main(args): results = {} - wwn_ids = get_disk_wwn_ids() + wwn_ids = get_disk_wwn_ids(args.ignore_non_smart) for wwn_id in wwn_ids: smart_health = get_smart_health(wwn_id, args.scrutiny_endpoint) + + name = f'/dev/{smart_health["data"]["device"]["device_name"]} {wwn_id}' # differentiate disks in RAID arrays + disk_results = { 'wwn_id': wwn_id, - # 'name': f'/dev/{smart_health["data"]["device"]["device_name"]}', 'failed_attributes': [], } metadata = smart_health['metadata'] - if smart_health: - # print(f"Disk {wwn_id} SMART health:") - # print(json.dumps(smart_health, indent=2)) + # For testing + # smart_health['data']['device']['UpdatedAt'] = '2023-04-28T23:00:03.071184465Z' + last_updated = datetime.strptime(smart_health['data']['device']['UpdatedAt'][:-4] + 'Z', '%Y-%m-%dT%H:%M:%S.%fZ') + if datetime.utcnow() - timedelta(hours=args.time_delta_limit) > last_updated: + metics_out_of_date = True + else: + metics_out_of_date = False + + if smart_health: for attribute_id, values in smart_health['data']['smart_results'][0]['attrs'].items(): if values['status'] == 0: continue @@ -64,11 +84,12 @@ def main(args): # continue values['attribute_name'] = metadata[attribute_id]['display_name'] values['metadata'] = metadata[attribute_id] + if 'observed_thresholds' in values['metadata'].keys(): del values['metadata']['observed_thresholds'] disk_results['failed_attributes'].append(values) - results[smart_health['data']['device']['device_name']] = disk_results + results[name] = disk_results crit_disks = {} warn_disks = {} @@ -94,55 +115,64 @@ def main(args): dd = '
' if args.html else '\t- ' dds = '
' if args.html else '' + out_of_date_str = f'metrics are >{args.time_delta_limit} hrs out of date ' if metics_out_of_date else '' + return_code = nagios.OK if len(crit_disks): return_code = nagios.CRITICAL if len(warn_disks): - x = f' and {len(warn_disks)} {"warning" if len(results) > 1 else "warnings"}' + x = f' and {len(warn_disks)} {"warnings" if len(results) > 1 else "warning"}' else: x = '' - print(f'CRITICAL: {len(crit_disks)} {"error" if len(results) > 1 else "errors"}{x}') + print(f'CRITICAL: {out_of_date_str + "and " if len(out_of_date_str) else ""}{len(crit_disks)} {"errors" if len(crit_disks) > 1 else "error"}{x}') print('
') print(f'{dt}Disks with Errors:{dts}') for disk, warns in crit_disks.items(): if args.html: - disk_name = f'- /dev/{disk}' + disk_name = f'- {disk}' else: - disk_name = f'\t- /dev/{disk}' + disk_name = f'\t- {disk}' print(f'{dd}{disk_name}: {", ".join([x["display_name"] for x in warns])}{dds}') print('
', end='') + if len(out_of_date_str) and not len(crit_disks): + return_code = nagios.CRITICAL + w = "warnings" if len(warn_disks) > 1 else "warning" + print(f'CRITICAL: {out_of_date_str}{"and " + str(len(warn_disks)) + " " + w if len(warn_disks) else ""}') + if len(warn_disks): if return_code < nagios.CRITICAL: return_code = nagios.WARNING - print(f'WARNING: {len(crit_disks)} {"warning" if len(results) > 1 else "warnings"}') + print(f'WARNING: {len(warn_disks)} {"warnings" if len(warn_disks) > 1 else "warning"}') print('
') print(f'{dt}Disks with issues:{dts}') for disk, warns in warn_disks.items(): if args.html: - disk_name = f'- /dev/{disk}' + disk_name = f'- {disk}' else: - disk_name = f'\t- /dev/{disk}' + disk_name = f'\t- {disk}' print(f'{dd}{disk_name}: {", ".join([x["display_name"] for x in warns])}{dds}') print('
', end='') if not len(crit_disks) and not len(warn_disks): - print(f'OK: all {len(results)} {"disk" if len(results) > 1 else "disks"} are healthy!', end='') + print(f'OK: all {len(results.keys())} {"disks" if len(results.keys()) > 1 else "disk"} are healthy!', end='') - print(f"|'warnings'={len(warn_disks)};;; 'errors'={len(crit_disks)};;; 'num_disks'={len(results)};;;") + print(f"|'warnings'={len(warn_disks)};;; 'errors'={len(crit_disks)};;; 'num_disks'={len(results.keys())};;;") sys.exit(return_code) if __name__ == "__main__": parser = argparse.ArgumentParser(description='') parser.add_argument('--scrutiny-endpoint', required=True, help='Base URL for scrutiny.') + parser.add_argument('--time-delta-limit', default=24, type=int, help='The Scrutiny data must not be older than this many hours. Default: 24.') parser.add_argument('--warn-non-critical', action='store_true', help='Warn when a non-critical metric is marked as failed.') parser.add_argument('--html', action='store_true', help='Print HTML.') parser.add_argument('--pretty-url', help='The pretty URL to link to when printing HTML.') + parser.add_argument('--ignore-non-smart', action='store_true', help="Ignore any non-SMART devices and any devices that error when reading SMART.") args = parser.parse_args() if args.html and not args.pretty_url: