finish check_scrutiny_disks

This commit is contained in:
Cyberes 2023-05-28 14:12:07 -06:00
parent 357f1f2d9e
commit 0ae80b939f
1 changed files with 71 additions and 12 deletions

View File

@ -1,13 +1,13 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import argparse import argparse
import json
import subprocess import subprocess
import sys import sys
from typing import List from typing import List
from checker import nagios
import requests import requests
from checker import nagios
def get_disk_wwn_ids() -> List[str]: def get_disk_wwn_ids() -> List[str]:
wwn_ids = [] wwn_ids = []
@ -18,11 +18,12 @@ def get_disk_wwn_ids() -> List[str]:
if len(parts) == 3: if len(parts) == 3:
name, wwn, disk_type = parts name, wwn, disk_type = parts
if wwn != "0" and disk_type == "disk": if wwn != "0" and disk_type == "disk":
smart_supported = subprocess.check_output(["smartctl", "-i", name]).decode("utf-8") smart_supported = subprocess.check_output(["sudo", "smartctl", "-i", name]).decode("utf-8")
if "SMART support is: Enabled" in smart_supported: if "SMART support is: Enabled" in smart_supported:
wwn_ids.append(wwn) wwn_ids.append(wwn)
except subprocess.CalledProcessError as e: except subprocess.CalledProcessError as e:
print(f"Subprocess Error: {e}") print(f"UNKNOWN: subprocess Error - {e}")
sys.exit(nagios.UNKNOWN)
return wwn_ids return wwn_ids
@ -39,30 +40,88 @@ def get_smart_health(wwn_id: str, scrutiny_endpoint: str) -> dict:
return {} return {}
def _pluralize(count: int, singular: str) -> str:
    """Return *singular* for a count of exactly one, otherwise *singular* + "s"."""
    return singular if count == 1 else f'{singular}s'


def main(args):
    """Evaluate every local SMART-capable disk via Scrutiny and exit with a Nagios status.

    For each WWN returned by ``get_disk_wwn_ids()`` the Scrutiny response is
    fetched with ``get_smart_health()``. Attributes whose ``status`` is not 0
    are collected per device; status 4 is reported as critical and status 2
    as a warning (the latter only when ``args.warn_non_critical`` is set).

    Args:
        args: Parsed command-line namespace. Reads ``scrutiny_endpoint``
            and ``warn_non_critical``.

    Raises:
        SystemExit: Always; the exit code is ``nagios.OK``,
            ``nagios.WARNING`` or ``nagios.CRITICAL``.
    """
    results = {}
    for wwn_id in get_disk_wwn_ids():
        smart_health = get_smart_health(wwn_id, args.scrutiny_endpoint)
        if not smart_health:
            # Empty response: there is nothing to evaluate for this disk.
            # This guard must run before any key of the response is read.
            continue

        metadata = smart_health['metadata']
        failed_attributes = []
        for attribute_id, values in smart_health['data']['smart_results'][0]['attrs'].items():
            if values['status'] == 0:
                continue
            # Build a filtered copy so the metadata in the response itself is
            # not mutated while dropping the bulky 'observed_thresholds' entry.
            attribute_metadata = {
                key: value
                for key, value in metadata[attribute_id].items()
                if key != 'observed_thresholds'
            }
            values['attribute_name'] = attribute_metadata['display_name']
            values['metadata'] = attribute_metadata
            failed_attributes.append(values)

        results[smart_health['data']['device']['device_name']] = {
            'wwn_id': wwn_id,
            'failed_attributes': failed_attributes,
        }

    # Sort the failed attributes of each disk into critical and warning buckets.
    crit_disks = {}
    warn_disks = {}
    for disk, values in results.items():
        for item in values['failed_attributes']:
            if item['status'] == 2 and args.warn_non_critical:
                bucket = warn_disks
            elif item['status'] == 4:
                bucket = crit_disks
            else:
                continue
            bucket.setdefault(disk, []).append({
                'raw_value': item['raw_value'],
                'display_name': item['metadata']['display_name'],
            })

    return_code = nagios.OK
    if crit_disks:
        return_code = nagios.CRITICAL
        print(f'CRITICAL: {len(crit_disks)} {_pluralize(len(crit_disks), "error")} - {args.scrutiny_endpoint}')
        print('Disks with Errors:')
        for disk, errors in crit_disks.items():
            print(f'\t- /dev/{disk}: {", ".join(x["display_name"] for x in errors)}')
    if warn_disks:
        # Never downgrade an already-critical result to a warning.
        if return_code < nagios.CRITICAL:
            return_code = nagios.WARNING
        print(f'WARNING: {len(warn_disks)} {_pluralize(len(warn_disks), "warning")} - {args.scrutiny_endpoint}')
        print('Disks with issues:')
        for disk, warns in warn_disks.items():
            print(f'\t- /dev/{disk}: {", ".join(x["display_name"] for x in warns)}')
    if not crit_disks and not warn_disks:
        # No trailing newline: the perfdata below continues this status line.
        print(f'OK: all {len(results)} {_pluralize(len(results), "disk")} are healthy!', end='')
    print(f"|'warnings'={len(warn_disks)};;; 'errors'={len(crit_disks)};;; 'num_disks'={len(results)};;;")
    sys.exit(return_code)
if __name__ == "__main__":
    # Script entry point: parse the CLI options, run the check, and report any
    # unexpected exception as the Nagios UNKNOWN state.
    parser = argparse.ArgumentParser(description='')
    parser.add_argument(
        '--scrutiny-endpoint',
        required=True,
        help='Base URL for scrutiny.',
    )
    parser.add_argument(
        '--warn-non-critical',
        action='store_true',
        help='Warn when a non-critical metric is marked as failed.',
    )
    args = parser.parse_args()
    # Remove slashes surrounding the base URL.
    args.scrutiny_endpoint = args.scrutiny_endpoint.strip('/')

    try:
        main(args)
    except Exception as exc:
        import traceback

        print(f'UNKNOWN: exception "{exc}"')
        print(traceback.format_exc())
        sys.exit(nagios.UNKNOWN)