icinga2-checks/check_scrutiny_disks.py

162 lines
6.1 KiB
Python
Executable File

#!/usr/bin/env python3
import argparse
import subprocess
import sys
from typing import List
import requests
from checker import nagios
def get_disk_wwn_ids() -> List[str]:
    """Return the WWNs of local disks that report SMART support as enabled.

    Lists block devices with ``lsblk`` (NAME, WWN and TYPE columns, no
    header, full device paths, no partitions/children) and asks
    ``sudo smartctl -i`` about every entry of type ``disk``.

    Returns:
        The WWN strings, in ``lsblk`` order, of every disk whose
        ``smartctl -i`` output contains ``SMART support is: Enabled``.

    Exits:
        Prints an ``UNKNOWN`` line and exits with ``nagios.UNKNOWN`` when
        either command returns a non-zero exit status.
    """
    found: List[str] = []
    try:
        listing = subprocess.check_output(["lsblk", "-o", "NAME,WWN,TYPE", "-d", "-n", "-p"])
        for row in listing.decode("utf-8").strip().split("\n"):
            fields = row.split()
            # Only rows with exactly three whitespace-separated fields are
            # usable; presumably a row with an empty WWN column collapses to
            # two fields and is dropped here.
            if len(fields) != 3:
                continue
            device_path, wwn, kind = fields
            if wwn == "0" or kind != "disk":
                continue
            info = subprocess.check_output(["sudo", "smartctl", "-i", device_path]).decode("utf-8")
            if "SMART support is: Enabled" in info:
                found.append(wwn)
    except subprocess.CalledProcessError as err:
        print(f"UNKNOWN: subprocess Error - {err}")
        sys.exit(nagios.UNKNOWN)
    return found
def get_smart_health(wwn_id: str, scrutiny_endpoint: str, timeout: float = 10) -> dict:
    """Fetch the Scrutiny device-details document for one disk.

    Args:
        wwn_id: WWN of the disk, used verbatim in the request path.
        scrutiny_endpoint: Base URL of the Scrutiny server, without a
            trailing slash.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` waits indefinitely, which would hang the
            whole check if Scrutiny stops responding.

    Returns:
        The decoded JSON body on HTTP 200. For any other status code a
        message is printed and an empty dict is returned.

    Raises:
        requests.exceptions.RequestException: on connection failures or
            when the timeout expires.
    """
    url = f"{scrutiny_endpoint}/api/device/{wwn_id}/details"
    response = requests.get(url, timeout=timeout)
    if response.status_code == 200:
        return response.json()

    if response.status_code == 404:
        print(f"Disk {wwn_id} not found on Scrutiny")
    else:
        print(f"Scrutiny Error {response.status_code} for disk {wwn_id}: {response.text}")
    return {}
def _plural(count, singular):
    """Return *singular* when count is exactly one, otherwise *singular* + "s"."""
    return singular if count == 1 else f'{singular}s'


def _collect_failed_attributes(smart_health):
    """Return the attrs of the first ``smart_results`` entry whose status is non-zero.

    Each returned attribute dict gets two extra keys: ``attribute_name`` and
    ``metadata`` (the matching metadata entry without ``observed_thresholds``).
    """
    metadata = smart_health.get('metadata') or {}
    failed = []
    for attribute_id, values in smart_health['data']['smart_results'][0]['attrs'].items():
        if values['status'] == 0:
            continue
        # Work on a copy so the response document itself is not mutated.
        attr_meta = {
            key: value
            for key, value in metadata.get(attribute_id, {}).items()
            if key != 'observed_thresholds'
        }
        # An attribute without a metadata entry is still reported, under its id.
        attr_meta.setdefault('display_name', str(attribute_id))
        values['attribute_name'] = attr_meta['display_name']
        values['metadata'] = attr_meta
        failed.append(values)
    return failed


def _group_by_severity(results, warn_non_critical):
    """Split failed attributes into ``(crit_disks, warn_disks)`` keyed by device name."""
    crit_disks = {}
    warn_disks = {}
    for disk, values in results.items():
        for item in values['failed_attributes']:
            entry = {
                'raw_value': item['raw_value'],
                'display_name': item['metadata']['display_name'],
            }
            # Status 2 only counts as a warning when --warn-non-critical is set.
            if item['status'] == 2 and warn_non_critical:
                warn_disks.setdefault(disk, []).append(entry)
            # Status 4 is always critical.
            if item['status'] == 4:
                crit_disks.setdefault(disk, []).append(entry)
            # NOTE(review): any other non-zero status is collected but never
            # reported - confirm against Scrutiny's status values.
    return crit_disks, warn_disks


def _print_disk_section(title, disks, results, args):
    """Print one ``<dl>`` section listing every disk in *disks* with its attribute names."""
    dt = '<dt>' if args.html else ''
    dts = '</dt>' if args.html else ''
    dd = '<dd>' if args.html else '\t- '
    dds = '</dd>' if args.html else ''

    print('<dl>')
    print(f'{dt}{title}{dts}')
    for disk, attrs in disks.items():
        if args.html:
            disk_name = f'- <a href="{args.pretty_url}/web/device/{results[disk]["wwn_id"]}">/dev/{disk}</a>'
        else:
            disk_name = f'\t- /dev/{disk}'
        print(f'{dd}{disk_name}: {", ".join(attr["display_name"] for attr in attrs)}{dds}')
    # No trailing newline: the caller appends more output to this line.
    print('</dl>', end='')


def main(args):
    """Check every local SMART disk against Scrutiny and exit with a Nagios status.

    Args:
        args: Parsed command-line namespace; reads ``scrutiny_endpoint``,
            ``warn_non_critical``, ``html`` and ``pretty_url``.

    Exits:
        ``nagios.CRITICAL`` if any disk has a status-4 attribute,
        ``nagios.WARNING`` if ``warn_non_critical`` is set and any disk has a
        status-2 attribute, otherwise ``nagios.OK``. Performance data is
        always printed last.
    """
    results = {}
    for wwn_id in get_disk_wwn_ids():
        smart_health = get_smart_health(wwn_id, args.scrutiny_endpoint)
        # get_smart_health() returns {} (after printing why) when Scrutiny has
        # no usable answer for this disk; skip it instead of raising KeyError.
        if not smart_health:
            continue
        device_name = smart_health['data']['device']['device_name']
        results[device_name] = {
            'wwn_id': wwn_id,
            'failed_attributes': _collect_failed_attributes(smart_health),
        }

    crit_disks, warn_disks = _group_by_severity(results, args.warn_non_critical)

    return_code = nagios.OK
    if crit_disks:
        return_code = nagios.CRITICAL
        headline = f'CRITICAL: {len(crit_disks)} {_plural(len(crit_disks), "error")}'
        if warn_disks:
            headline += f' and {len(warn_disks)} {_plural(len(warn_disks), "warning")}'
        print(headline)
        _print_disk_section('Disks with Errors:', crit_disks, results, args)

    if warn_disks:
        if return_code < nagios.CRITICAL:
            return_code = nagios.WARNING
            # Only needed when there is no CRITICAL headline; that one already
            # carries the warning count.
            print(f'WARNING: {len(warn_disks)} {_plural(len(warn_disks), "warning")}')
        _print_disk_section('Disks with issues:', warn_disks, results, args)

    if not crit_disks and not warn_disks:
        print(f'OK: all {len(results)} {_plural(len(results), "disk")} are healthy!', end='')

    print(f"|'warnings'={len(warn_disks)};;; 'errors'={len(crit_disks)};;; 'num_disks'={len(results)};;;")
    sys.exit(return_code)
if __name__ == "__main__":
    # Command-line entry point: parse options, normalise the URLs and run the
    # check. Any unexpected exception is reported as UNKNOWN.
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('--scrutiny-endpoint', required=True, help='Base URL for scrutiny.')
    parser.add_argument('--warn-non-critical', action='store_true', help='Warn when a non-critical metric is marked as failed.')
    parser.add_argument('--html', action='store_true', help='Print HTML.')
    parser.add_argument('--pretty-url', help='The pretty URL to link to when printing HTML.')
    args = parser.parse_args()

    # HTML output links every disk to the Scrutiny web UI, so it needs the URL.
    if args.html and not args.pretty_url:
        print('UNKNOWN: when using --html you must also set --pretty-url')
        sys.exit(nagios.UNKNOWN)

    # URLs are later joined with "/..." paths, so drop surrounding slashes.
    args.scrutiny_endpoint = args.scrutiny_endpoint.strip('/')
    args.pretty_url = args.pretty_url.strip('/') if args.pretty_url else None

    try:
        # main() ends with sys.exit(); SystemExit is not an Exception subclass,
        # so the chosen status code passes through this handler untouched.
        main(args)
    except Exception as e:
        print(f'UNKNOWN: exception "{e}"')
        import traceback

        print(traceback.format_exc())
        sys.exit(nagios.UNKNOWN)