#!/usr/bin/env python3
import argparse
import subprocess
import sys
from datetime import datetime, timedelta
from typing import List

import pandas as pd
import pytz

from checker import nagios
from checker.http import fetch_with_retry


def get_disk_wwn_ids(ignore_non_smart: bool = False) -> List[str]:
    """Collect the WWN of every SMART-enabled disk reported by ``smartctl --scan``.

    Each scan line with at least three whitespace-separated fields is read as
    ``<device> <flag> <type> ...``. The device is then queried with
    ``smartctl -i`` and, when the output contains "SMART support is: Enabled",
    its "LU WWN Device Id" value is stored as ``'0x'`` followed by the value
    with all spaces removed.

    Args:
        ignore_non_smart: When True, a device whose ``smartctl -i`` call exits
            with a non-zero status is skipped instead of aborting the check.

    Returns:
        The WWN strings, in scan order.

    Exits:
        Prints an ``UNKNOWN`` line and calls ``sys.exit(nagios.UNKNOWN)`` when
        ``smartctl``/``sudo`` cannot be run, when a per-device query fails and
        ``ignore_non_smart`` is False, or when a SMART-enabled device reports
        no WWN line.
    """
    try:
        scan_output = subprocess.check_output(["sudo", "smartctl", "--scan"])
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a missing sudo/smartctl binary (FileNotFoundError),
        # which would otherwise surface as a traceback rather than UNKNOWN.
        print(f"UNKNOWN: subprocess Error - {e}")
        sys.exit(nagios.UNKNOWN)

    wwn_ids = []
    for scan_line in scan_output.decode("utf-8").strip().split("\n"):
        parts = scan_line.split()
        if len(parts) < 3:
            continue
        device = parts[0]
        # NOTE(review): 'scsi' is swapped for 'sat,auto', presumably so SATA
        # disks exposed through the SCSI layer are probed correctly - confirm.
        device_type = parts[2].replace('scsi', 'sat,auto')

        try:
            smart_info = subprocess.check_output(
                ["sudo", "smartctl", "-i", device, "-d", device_type]).decode("utf-8")
        except subprocess.CalledProcessError as e:
            if ignore_non_smart:
                continue
            print(f"UNKNOWN: subprocess Error - {e}")
            sys.exit(nagios.UNKNOWN)
        except OSError as e:
            print(f"UNKNOWN: subprocess Error - {e}")
            sys.exit(nagios.UNKNOWN)

        if "SMART support is: Enabled" not in smart_info:
            # TODO: warn if a drive doesn't support SMART
            continue

        wwn_line = next(
            (info_line for info_line in smart_info.split("\n")
             if "LU WWN Device Id" in info_line),
            None,
        )
        if wwn_line is None:
            # Without a WWN line the original indexing raised IndexError;
            # report a proper plugin status instead.
            print(f"UNKNOWN: no LU WWN Device Id reported for {device}")
            sys.exit(nagios.UNKNOWN)

        wwn_ids.append(
            '0x' + wwn_line.replace('LU WWN Device Id: ', '').replace(' ', ''))

    return wwn_ids


def get_smart_health(wwn_id: str, scrutiny_endpoint: str) -> dict:
    """Request the details document for one disk from a Scrutiny endpoint.

    Args:
        wwn_id: Disk identifier placed in the request path.
        scrutiny_endpoint: Base URL; ``/api/device/<wwn_id>/details`` is
            appended to it verbatim.

    Returns:
        The decoded JSON body on HTTP 200. For any other status, a dict with a
        single ``'fetch_error'`` key holding a human-readable message.
    """
    response = fetch_with_retry(f"{scrutiny_endpoint}/api/device/{wwn_id}/details")
    status = response.status_code
    if status == 200:
        return response.json()
    if status == 404:
        return {
            'fetch_error': f"Disk {wwn_id} not found on Scrutiny"
        }
    return {
        'fetch_error': f"Error {response.status_code} for disk {wwn_id}: {response.text}"
    }
f'/dev/{smart_health["data"]["device"]["device_name"]} {wwn_id}' # differentiate disks in RAID arrays results[name] = { 'wwn_id': wwn_id, 'failed_attributes': [], } metadata = smart_health['metadata'] # For testing # smart_health['data']['device']['UpdatedAt'] = '2023-04-28T23:00:03.071184465Z' last_updated = pd.to_datetime(smart_health['data']['device']['UpdatedAt']) # , '%Y-%m-%dT%H:%M:%S.%fZ') if datetime.now(pytz.utc) - timedelta(hours=args.time_delta_limit) > last_updated.replace(tzinfo=pytz.utc): metrics_out_of_date = True if smart_health and len(smart_health['data']['smart_results']): try: disk_data = smart_health['data']['smart_results'][0]['attrs'] except Exception as e: print('UNKNOWN - failed to parse data:', e) print('Key "data":', smart_health['data'].keys()) print('Key "smart_results":', len(smart_health['data']['smart_results'])) print('Key "attrs":', smart_health['data']['smart_results'][0].keys()) sys.exit(nagios.UNKNOWN) for attribute_id, values in disk_data.items(): if values['status'] == 0: continue # elif values['status'] == 2 and not args.warn_non_critical: # continue values['attribute_name'] = metadata[attribute_id]['display_name'] values['metadata'] = metadata[attribute_id] if 'observed_thresholds' in values['metadata'].keys(): del values['metadata']['observed_thresholds'] results[name]['failed_attributes'].append(values) results[name]['status'] = 'good' else: results[name]['status'] = 'no data' crit_disks = {} warn_disks = {} for disk, values in results.items(): if values['status'] != 'no data': for item in values['failed_attributes']: if item['status'] == 2 and args.warn_non_critical: if disk not in warn_disks.keys(): warn_disks[disk] = [] warn_disks[disk].append({ 'raw_value': item['raw_value'], 'display_name': item['metadata']['display_name'] }) if item['status'] == 4: if disk not in crit_disks.keys(): crit_disks[disk] = [] crit_disks[disk].append({ 'raw_value': item['raw_value'], 'display_name': item['metadata']['display_name'] }) else: 
if disk not in warn_disks.keys(): warn_disks[disk] = [] warn_disks[disk].append({ 'raw_value': 'no data', 'display_name': 'no data' }) dt = '