finish check_scrutiny_disks
This commit is contained in:
parent
357f1f2d9e
commit
0ae80b939f
|
@ -1,13 +1,13 @@
|
||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
import argparse
|
import argparse
|
||||||
import json
|
|
||||||
import subprocess
|
import subprocess
|
||||||
import sys
|
import sys
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
from checker import nagios
|
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
|
from checker import nagios
|
||||||
|
|
||||||
|
|
||||||
def get_disk_wwn_ids() -> List[str]:
|
def get_disk_wwn_ids() -> List[str]:
|
||||||
wwn_ids = []
|
wwn_ids = []
|
||||||
|
@ -18,11 +18,12 @@ def get_disk_wwn_ids() -> List[str]:
|
||||||
if len(parts) == 3:
|
if len(parts) == 3:
|
||||||
name, wwn, disk_type = parts
|
name, wwn, disk_type = parts
|
||||||
if wwn != "0" and disk_type == "disk":
|
if wwn != "0" and disk_type == "disk":
|
||||||
smart_supported = subprocess.check_output(["smartctl", "-i", name]).decode("utf-8")
|
smart_supported = subprocess.check_output(["sudo", "smartctl", "-i", name]).decode("utf-8")
|
||||||
if "SMART support is: Enabled" in smart_supported:
|
if "SMART support is: Enabled" in smart_supported:
|
||||||
wwn_ids.append(wwn)
|
wwn_ids.append(wwn)
|
||||||
except subprocess.CalledProcessError as e:
|
except subprocess.CalledProcessError as e:
|
||||||
print(f"Subprocess Error: {e}")
|
print(f"UNKNOWN: subprocess Error - {e}")
|
||||||
|
sys.exit(nagios.UNKNOWN)
|
||||||
return wwn_ids
|
return wwn_ids
|
||||||
|
|
||||||
|
|
||||||
|
@ -39,30 +40,88 @@ def get_smart_health(wwn_id: str, scrutiny_endpoint: str) -> dict:
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
|
|
||||||
def main(args):
    """Check all SMART-capable disks against Scrutiny and exit with a Nagios status.

    For every WWN returned by ``get_disk_wwn_ids()`` the Scrutiny record is
    fetched with ``get_smart_health()``. Attributes whose ``status`` is non-zero
    are collected per device, then sorted into critical (``status == 4``) and,
    when ``args.warn_non_critical`` is set, warning (``status == 2``) buckets.
    A human-readable summary plus a perfdata line is printed and the process
    exits via ``sys.exit()`` with the matching ``nagios`` return code.

    Args:
        args: Parsed command-line namespace. Reads ``args.scrutiny_endpoint``
            (base URL passed to ``get_smart_health``) and
            ``args.warn_non_critical`` (bool).

    Raises:
        SystemExit: Always; the exit code is ``nagios.OK``, ``nagios.WARNING``
            or ``nagios.CRITICAL``.
    """
    results = {}
    for wwn_id in get_disk_wwn_ids():
        smart_health = get_smart_health(wwn_id, args.scrutiny_endpoint)
        if not smart_health:
            # get_smart_health() can return an empty dict; there is nothing to
            # inspect for this disk, and indexing into it would raise KeyError.
            continue

        disk_results = {
            'wwn_id': wwn_id,
            'failed_attributes': [],
        }
        metadata = smart_health['metadata']

        # Only the first entry of 'smart_results' is inspected.
        # NOTE(review): presumably that is the most recent reading - confirm
        # against the Scrutiny API.
        for attribute_id, values in smart_health['data']['smart_results'][0]['attrs'].items():
            if values['status'] == 0:
                continue
            # Work on a copy so the shared response metadata is not modified
            # when the bulky 'observed_thresholds' entry is dropped.
            attribute_meta = dict(metadata[attribute_id])
            attribute_meta.pop('observed_thresholds', None)
            values['attribute_name'] = attribute_meta['display_name']
            values['metadata'] = attribute_meta
            disk_results['failed_attributes'].append(values)

        results[smart_health['data']['device']['device_name']] = disk_results

    crit_disks = {}
    warn_disks = {}
    for disk, values in results.items():
        for item in values['failed_attributes']:
            summary = {
                'raw_value': item['raw_value'],
                'display_name': item['metadata']['display_name'],
            }
            # NOTE(review): only the exact status values 2 and 4 are reported;
            # any other non-zero status is collected above but ignored here.
            # Verify this matches Scrutiny's status encoding.
            if item['status'] == 2 and args.warn_non_critical:
                warn_disks.setdefault(disk, []).append(summary)
            if item['status'] == 4:
                crit_disks.setdefault(disk, []).append(summary)

    return_code = nagios.OK
    if crit_disks:
        return_code = nagios.CRITICAL
        crit_count = len(crit_disks)
        print(f'CRITICAL: {crit_count} {"error" if crit_count == 1 else "errors"} - {args.scrutiny_endpoint}')
        print('Disks with Errors:')
        for disk, errors in crit_disks.items():
            print(f'\t- /dev/{disk}: {", ".join([x["display_name"] for x in errors])}')
    if warn_disks:
        # Never downgrade an already-critical result to a warning.
        if return_code < nagios.CRITICAL:
            return_code = nagios.WARNING
        warn_count = len(warn_disks)
        print(f'WARNING: {warn_count} {"warning" if warn_count == 1 else "warnings"} - {args.scrutiny_endpoint}')
        print('Disks with issues:')
        for disk, warns in warn_disks.items():
            print(f'\t- /dev/{disk}: {", ".join([x["display_name"] for x in warns])}')

    if not crit_disks and not warn_disks:
        # No trailing newline so the perfdata below lands on the same line.
        print(f'OK: all {len(results)} {"disk" if len(results) == 1 else "disks"} are healthy!', end='')

    print(f"|'warnings'={len(warn_disks)};;; 'errors'={len(crit_disks)};;; 'num_disks'={len(results)};;;")
    sys.exit(return_code)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Script entry point: parse the CLI options, normalise the endpoint URL and
    # run the check. Any unexpected exception is reported as Nagios UNKNOWN.
    cli = argparse.ArgumentParser(description='')
    cli.add_argument(
        '--scrutiny-endpoint',
        required=True,
        help='Base URL for scrutiny.',
    )
    cli.add_argument(
        '--warn-non-critical',
        action='store_true',
        help='Warn when a non-critical metric is marked as failed.',
    )
    args = cli.parse_args()
    # Drop surrounding slashes from the base URL given on the command line.
    args.scrutiny_endpoint = args.scrutiny_endpoint.strip('/')
    try:
        main(args)
    except Exception as err:
        print(f'UNKNOWN: exception "{err}"')
        import traceback

        # Include the full traceback after the status line to aid debugging.
        print(traceback.format_exc())
        sys.exit(nagios.UNKNOWN)
|
||||||
|
|
Loading…
Reference in New Issue