check_scrutiny_disks: fix not no data

2023-06-12 10:21:05 -06:00 · 2023-06-12 10:21:05 -06:00 · 847999f43b
parent 3e86cc3614
commit 847999f43b
1 changed files with 41 additions and 14 deletions
--- a/check_scrutiny_disks.py
+++ b/check_scrutiny_disks.py
@ -20,7 +20,8 @@ def get_disk_wwn_ids(ignore_non_smart: bool = False) -> List[str] or bool:
                device = parts[0]
                device_type = parts[2].replace('scsi', 'sat,auto')
                try:
-                    smart_supported = subprocess.check_output(["sudo", "smartctl", "-i", device, "-d", device_type]).decode("utf-8")
+                    smart_supported = subprocess.check_output(
+                        ["sudo", "smartctl", "-i", device, "-d", device_type]).decode("utf-8")
                    if "SMART support is: Enabled" in smart_supported:
                        wwn_line = [line for line in smart_supported.split("\n") if "LU WWN Device Id" in line]
                        wwn_id = '0x' + wwn_line[0].replace('LU WWN Device Id: ', '').replace(' ', '')
@ -45,20 +46,26 @@ def get_smart_health(wwn_id: str, scrutiny_endpoint: str) -> dict:
    if response.status_code == 200:
        return response.json()
    elif response.status_code == 404:
-        print(f"Disk {wwn_id} not found on Scrutiny")
-        return {}
+        return {
+            'fetch_error': f"Disk {wwn_id} not found on Scrutiny"
+        }
    else:
-        print(f"Scrutiny Error {response.status_code} for disk {wwn_id}: {response.text}")
-        return {}
+        return {
+            'fetch_error': f"Error {response.status_code} for disk {wwn_id}: {response.text}"
+        }


 def main(args):
    results = {}
    wwn_ids = get_disk_wwn_ids(args.ignore_non_smart)
-    metics_out_of_date = False
+    metrics_out_of_date = False
    for wwn_id in wwn_ids:
        smart_health = get_smart_health(wwn_id, args.scrutiny_endpoint)

+        if smart_health.get('fetch_error'):
+            print('UNKNOWN -', smart_health.get('fetch_error'))
+            sys.exit(nagios.UNKNOWN)
+
        name = f'/dev/{smart_health["data"]["device"]["device_name"]} {wwn_id}'  # differentiate disks in RAID arrays

        results[name] = {
@ -73,7 +80,7 @@ def main(args):

        last_updated = datetime.strptime(smart_health['data']['device']['UpdatedAt'][:-4] + 'Z', '%Y-%m-%dT%H:%M:%S.%fZ')
        if datetime.utcnow() - timedelta(hours=args.time_delta_limit) > last_updated:
-            metics_out_of_date = True
+            metrics_out_of_date = True

        if smart_health and len(smart_health['data']['smart_results']):
            try:
@ -91,10 +98,10 @@ def main(args):
                #     continue
                values['attribute_name'] = metadata[attribute_id]['display_name']
                values['metadata'] = metadata[attribute_id]
-
                if 'observed_thresholds' in values['metadata'].keys():
                    del values['metadata']['observed_thresholds']
                results[name]['failed_attributes'].append(values)
+            results[name]['status'] = 'good'
        else:
            results[name]['status'] = 'no data'

@ -130,7 +137,7 @@ def main(args):
    dd = '<dd>' if args.html else '\t- '
    dds = '</dd>' if args.html else ''

-    out_of_date_str = f'metrics are >{args.time_delta_limit} hrs out of date ' if metics_out_of_date else ''
+    out_of_date_str = f'metrics are >{args.time_delta_limit} hrs out of date ' if metrics_out_of_date else ''

    return_code = nagios.OK
    if len(crit_disks):
@ -139,7 +146,8 @@ def main(args):
            x = f' and {len(warn_disks)} {"warnings" if len(results) > 1 else "warning"}'
        else:
            x = ''
-        print(f'CRITICAL: {out_of_date_str + "and " if len(out_of_date_str) else ""}{len(crit_disks)} {"errors" if len(crit_disks) > 1 else "error"}{x}')
+        print(
+            f'CRITICAL: {out_of_date_str + "and " if len(out_of_date_str) else ""}{len(crit_disks)} {"errors" if len(crit_disks) > 1 else "error"}{x}')

        print('<dl>')
        print(f'{dt}Disks with Errors:{dts}')
@ -180,19 +188,38 @@ def main(args):
    sys.exit(return_code)


+def is_smartmontools_installed():
+    """Return True only if `dpkg -s smartmontools` reports 'Status: install ok installed'; otherwise False."""
+    try:
+        result = subprocess.run(['dpkg', '-s', 'smartmontools'], capture_output=True, check=True, text=True)
+    except (subprocess.CalledProcessError, OSError):
+        # Non-zero dpkg exit, or dpkg could not be started at all (OSError, e.g. FileNotFoundError):
+        # report "not installed" so the caller exits UNKNOWN instead of dying with a traceback.
+        return False
+    # A zero exit status alone is not treated as installed; the status line must match.
+    return 'Status: install ok installed' in result.stdout
+
+
 if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('--scrutiny-endpoint', required=True, help='Base URL for scrutiny.')
-    parser.add_argument('--time-delta-limit', default=24, type=int, help='The Scrutiny data must not be older than this many hours. Default: 24.')
-    parser.add_argument('--warn-non-critical', action='store_true', help='Warn when a non-critical metric is marked as failed.')
+    parser.add_argument('--time-delta-limit', default=24, type=int,
+                        help='The Scrutiny data must not be older than this many hours. Default: 24.')
+    parser.add_argument('--warn-non-critical', action='store_true',
+                        help='Warn when a non-critical metric is marked as failed.')
    parser.add_argument('--html', action='store_true', help='Print HTML.')
    parser.add_argument('--pretty-url', help='The pretty URL to link to when printing HTML.')
-    parser.add_argument('--ignore-non-smart', action='store_true', help="Ignore any non-SMART devices and any devices that error when reading SMART.")
+    parser.add_argument('--ignore-non-smart', action='store_true',
+                        help="Ignore any non-SMART devices and any devices that error when reading SMART.")
    parser.add_argument('--dont-warn-no-data', action='store_true', help="Don't warn if there is no data for a disk.")
    args = parser.parse_args()

    if args.html and not args.pretty_url:
-        print('UKNOWN: when using --html you must also set --pretty-url')
+        print('UKNOWN - when using --html you must also set --pretty-url')
+        sys.exit(nagios.UNKNOWN)
+
+    if not is_smartmontools_installed():
+        print('UNKNOWN - smartmontools is not installed.')
        sys.exit(nagios.UNKNOWN)

    args.scrutiny_endpoint = args.scrutiny_endpoint.strip('/')