#!/usr/bin/env python3
"""Nagios-style check for a ZFS pool and its cache/log devices.

Usage::

    check_zfs_zpool.py POOL {status,cache,log} [--warning-free N] ...

``status`` evaluates the pool itself; ``cache`` and ``log`` evaluate the
devices listed under the matching section of ``zpool list -v``.  The first
line printed is the plugin status line and the process exit code carries the
severity.
"""
import argparse
import re
import subprocess
import sys

# NOTE: the project-local ``checker`` package is imported inside main() so
# that a broken installation is caught by the handler at the bottom of the
# file (and reported as UNKNOWN) instead of crashing at import time.

# Standard plugin return code for UNKNOWN; kept as a literal so the top-level
# handler works even when ``checker.nagios`` cannot be imported.
_UNKNOWN_EXIT_CODE = 3

# Internal severity ranks, ordered so that max() picks the worst one.
_SEVERITY_OK = 0
_SEVERITY_WARNING = 1
_SEVERITY_CRITICAL = 2
_STATUS_LABELS = ('OK', 'WARNING', 'CRITICAL')
_STATE_NAMES = ('ok', 'warning', 'critical')
_ISSUE_NAMES = {'cap': 'capacity', 'frag': 'fragmentation', 'health': 'health'}

# Columns read from a ``zpool list -v`` device row: name, size, alloc, free,
# (ckpoint, expandsz,) frag, cap, (dedup,) health -> at least 10 fields.
_MIN_LIST_COLUMNS = 10

# Multipliers relative to KiB, matching the unit the original helper used.
_SIZE_UNITS = {
    '': 1 / 1024,
    'b': 1 / 1024,
    'k': 1,
    'm': 1024,
    'g': 1024 ** 2,
    't': 1024 ** 3,
    'p': 1024 ** 4,
    'e': 1024 ** 5,
}
_SIZE_PATTERN = re.compile(r'(\d+(?:\.\d*)?|\.\d+)\s*([bkmgtpe]?)')


def parse_size(size_str):
    """Convert a human-readable size such as ``1.81T`` to a float in KiB.

    A value without a unit suffix (or with ``B``) is taken as bytes.

    Raises:
        ValueError: if *size_str* is not a number with an optional unit.
    """
    match = _SIZE_PATTERN.fullmatch(size_str.strip().lower())
    if match is None:
        raise ValueError(f'unrecognised size {size_str!r}')
    number, unit = match.groups()
    return float(number) * _SIZE_UNITS[unit]


def percent_to_float(percent_str: str):
    """Convert ``'50%'`` to ``0.5``; raises ValueError for non-numeric text."""
    return float(percent_str.strip().strip('%')) / 100


def float_to_percent(float_value: float):
    """Convert ``0.5`` to ``'50.0%'``.

    The value is rounded to two decimals so binary floating point noise
    (e.g. ``0.07 * 100 == 7.000000000000001``) does not leak into the output.
    """
    return f"{round(float_value * 100, 2)}%"


def clean_device_list(in_str: str):
    """Trim *in_str* and collapse every whitespace run to a single space."""
    return ' '.join(in_str.split())


def extract_vdev_section(listing: str, vdev_type: str, header: bool = False):
    """Return the indented rows below the *vdev_type* section of a listing.

    *listing* is the text printed by ``zpool list -v``.  A section starts at a
    non-indented row whose first field is *vdev_type* or its plural (``log``
    matches ``logs``) and ends at the next non-indented row.  The first line
    of the listing is always treated as the column header; it is included in
    the result only when *header* is true.  Every returned row ends with a
    newline; the result is ``''`` when nothing matches.
    """
    section_names = {vdev_type, f'{vdev_type}s'}
    lines = listing.splitlines()
    selected = lines[:1] if header else []
    in_section = False
    for line in lines[1:]:
        if line[:1] in (' ', '\t'):
            if in_section:
                selected.append(line)
            continue
        fields = line.split()
        in_section = bool(fields) and fields[0] in section_names
    return ''.join(f'{line}\n' for line in selected)


def zpool_list(zpool: str, vdev_type: str, header: bool = False):
    """Run ``zpool list -v`` for *zpool* and return its *vdev_type* rows.

    The command is executed without a shell, so the pool name is never
    interpreted by one.

    Raises:
        subprocess.CalledProcessError: if ``zpool`` exits non-zero.
    """
    result = subprocess.run(['zpool', 'list', '-v', zpool], capture_output=True, text=True, check=True)
    return extract_vdev_section(result.stdout, vdev_type, header)


def parse_vdev_rows(section: str):
    """Parse device rows (as returned by extract_vdev_section) into dicts.

    Each dict has the keys ``device``, ``size``, ``alloc``, ``free``,
    ``frag``, ``cap`` and ``health``; all values are the raw column strings.

    Raises:
        ValueError: if a non-empty row has fewer columns than expected.
    """
    devices = []
    for line in section.splitlines():
        fields = clean_device_list(line).split()
        if not fields:
            continue
        if len(fields) < _MIN_LIST_COLUMNS:
            raise ValueError(f'unexpected "zpool list -v" row: {line!r}')
        devices.append({
            'device': fields[0],
            'size': fields[1],
            'alloc': fields[2],
            'free': fields[3],
            'frag': fields[6],
            'cap': fields[7],
            'health': fields[9],
        })
    return devices


def get_vdev_info(zpool: str, vdev_type: str):
    """Return one dict per device in the *vdev_type* section of *zpool*."""
    return parse_vdev_rows(zpool_list(zpool, vdev_type))


def get_zfs_pool_status(pool_name):
    """Collect the pool-level figures and the state of its first log device.

    Returns a dict with the keys ``name``, ``size``, ``allocated``, ``free``,
    ``capacity``, ``fragmentation``, ``health``, ``log_device_status`` and
    ``log_device_alloc``.  The two log entries are ``None`` when the pool has
    no log section.  If a ``zpool`` command fails, an error line is printed
    and the process exits with status 2.

    Raises:
        ValueError: if ``zpool list`` prints fewer fields than requested.
    """
    try:
        result = subprocess.run(
            ['zpool', 'list', '-H', '-o', 'name,size,alloc,free,cap,frag,health', pool_name],
            capture_output=True, text=True, check=True,
        )
        # Health and allocation are both columns of "zpool list -v";
        # "zpool status" has no allocation column at all.
        log_devices = get_vdev_info(pool_name, 'log')
    except subprocess.CalledProcessError as e:
        detail = (e.stderr or '').strip()
        print(f"Error: {e}" + (f" ({detail})" if detail else ''))
        sys.exit(2)

    pool_info = result.stdout.strip().split('\t')
    if len(pool_info) < 7:
        raise ValueError(f'unexpected "zpool list" output: {result.stdout!r}')

    first_log = log_devices[0] if log_devices else None
    return {
        'name': pool_info[0],
        'size': pool_info[1],
        'allocated': pool_info[2],
        'free': pool_info[3],
        'capacity': pool_info[4],
        'fragmentation': pool_info[5],
        'health': pool_info[6],
        'log_device_status': first_log['health'] if first_log else None,
        'log_device_alloc': first_log['alloc'] if first_log else None,
    }


def _optional_percent(value):
    """Return *value* as a fraction, or None for the ``-`` placeholder."""
    value = value.strip()
    if value in ('', '-'):
        return None
    return percent_to_float(value)


def _format_optional(fraction):
    """Render a fraction as a percentage, or ``-`` when it is unknown."""
    return '-' if fraction is None else float_to_percent(fraction)


def classify_usage(cap, frag, health, warn_cap, crit_cap, warn_frag, crit_frag):
    """Rate one pool or device against the thresholds.

    *cap* and *frag* are fractions (or None when unknown); the thresholds are
    fractions of used capacity / fragmentation.  Returns a dict mapping the
    issue key (``'health'``, ``'cap'``, ``'frag'``) to its severity rank; an
    empty dict means nothing is wrong.  Anything other than ``ONLINE`` health
    is critical, and warning-level usage is not reported for an unhealthy
    device because it is critical already.
    """
    issues = {}
    if health != 'ONLINE':
        issues['health'] = _SEVERITY_CRITICAL
    for issue, value, warn, crit in (('cap', cap, warn_cap, crit_cap), ('frag', frag, warn_frag, crit_frag)):
        if value is None:
            continue
        if value >= crit:
            issues[issue] = _SEVERITY_CRITICAL
        elif value >= warn and 'health' not in issues:
            issues[issue] = _SEVERITY_WARNING
    return issues


def _usage_reason(prefix, flagged):
    """Build e.g. ``'critical capacity and fragmentation'`` from issue keys."""
    kinds = [_ISSUE_NAMES[key] for key in ('cap', 'frag') if key in flagged]
    return f'{prefix} ' + ' and '.join(kinds)


def _flagged_devices(*groups):
    """Flatten issue->devices mappings into a de-duplicated, ordered list."""
    names = [name for group in groups for devices in group.values() for name in devices]
    return list(dict.fromkeys(names))


def summarize_vdevs(vdev_devices, check_type, warn_cap, crit_cap, warn_frag, crit_frag):
    """Evaluate every device and build the plugin output for them.

    Returns ``(severity, summary, table)`` where *severity* is 0 (ok),
    1 (warning) or 2 (critical), *summary* is the status line and *table* is
    a list of row tuples starting with the header row.
    """
    table = [('Device', 'Size', 'Alloc', 'Free', 'Frag', 'Cap', 'Health', 'State')]
    critical = {}
    warning = {}
    for device in vdev_devices:
        cap = _optional_percent(device['cap'])
        frag = _optional_percent(device['frag'])
        issues = classify_usage(cap, frag, device['health'], warn_cap, crit_cap, warn_frag, crit_frag)
        for issue, severity in issues.items():
            bucket = critical if severity == _SEVERITY_CRITICAL else warning
            bucket.setdefault(issue, []).append(device['device'])
        # The row shows the worst finding for the device.
        state = _STATE_NAMES[max(issues.values(), default=_SEVERITY_OK)]
        table.append((
            device['device'], device['size'], device['alloc'], device['free'],
            _format_optional(frag), _format_optional(cap), device['health'], state,
        ))

    if critical:
        severity = _SEVERITY_CRITICAL
        if warning:
            reason = 'multiple issues'
        elif 'health' in critical:
            reason = 'unhealthy state'
        else:
            reason = _usage_reason('critical', critical)
        drives = _flagged_devices(critical, warning)
    elif warning:
        severity = _SEVERITY_WARNING
        reason = _usage_reason('high', warning)
        drives = _flagged_devices(warning)
    else:
        return _SEVERITY_OK, f'OK - {len(vdev_devices)} {check_type} devices are healthy', table

    noun = 'drives' if len(drives) > 1 else 'drive'
    summary = f'{_STATUS_LABELS[severity]} - {reason} for {noun} {", ".join(drives)}'
    return severity, summary, table


def main():
    """Parse the command line, run the requested check and exit.

    The status line is printed first, followed by the details; the process
    exits with the ``checker.nagios`` code matching the worst finding.
    """
    from checker import nagios
    from checker.markdown import list_to_markdown_table

    parser = argparse.ArgumentParser(description='Check ZFS pool status')
    parser.add_argument('pool_name', help='Name of the ZFS pool to check')
    parser.add_argument('check_type', choices=['status', 'cache', 'log'], help='What to check.')
    # The option names are kept for compatibility; the values are compared
    # against the used-capacity ("CAP") column.
    parser.add_argument('--warning-free', type=int, default=65, help='Warning level for used capacity percentage (default: 65)')
    parser.add_argument('--critical-free', type=int, default=80, help='Critical level for used capacity percentage (default: 80)')
    parser.add_argument('--warning-frag', type=int, default=50, help='Warning level for fragmentation percentage (default: 50)')
    parser.add_argument('--critical-frag', type=int, default=75, help='Critical level for fragmentation percentage (default: 75)')
    args = parser.parse_args()

    # Work in fractions throughout so every comparison uses the same scale.
    warn_cap = args.warning_free / 100
    crit_cap = args.critical_free / 100
    warn_frag = args.warning_frag / 100
    crit_frag = args.critical_frag / 100
    exit_codes = (nagios.OK, nagios.WARNING, nagios.CRITICAL)

    if args.check_type == 'status':
        pool_status = get_zfs_pool_status(args.pool_name)
        issues = classify_usage(
            _optional_percent(pool_status['capacity']),
            _optional_percent(pool_status['fragmentation']),
            pool_status['health'],
            warn_cap, crit_cap, warn_frag, crit_frag,
        )
        severity = max(issues.values(), default=_SEVERITY_OK)
        if issues:
            problems = ', '.join(_ISSUE_NAMES[issue] for issue in issues)
            print(f"{_STATUS_LABELS[severity]} - pool {pool_status['name']}: {problems}")
        else:
            print(f"OK - pool {pool_status['name']} is healthy")

        print(f"Pool Name: {pool_status['name']}")
        print(f"Size: {pool_status['size']}")
        print(f"Allocated: {pool_status['allocated']}")
        print(f"Free: {pool_status['free']}")
        print(f"Capacity: {pool_status['capacity']}")
        print(f"Fragmentation: {pool_status['fragmentation']}")
        print(f"Health: {pool_status['health']}")

        if pool_status['log_device_status'] is not None:
            print(f"Log Device Status: {pool_status['log_device_status']}")
            print(f"Log Device Allocation: {pool_status['log_device_alloc']}")
        else:
            print("No log devices found")

        sys.exit(exit_codes[severity])
    elif args.check_type in ['cache', 'log']:
        vdev_devices = get_vdev_info(args.pool_name, args.check_type)
        severity, summary, table_data = summarize_vdevs(
            vdev_devices, args.check_type, warn_cap, crit_cap, warn_frag, crit_frag,
        )
        print(summary)
        print(list_to_markdown_table(table_data, align='left', seperator='!', borders=False))
        sys.exit(exit_codes[severity])


if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        import traceback

        print(f'UNKNOWN: exception "{e}"')
        print(traceback.format_exc())
        sys.exit(_UNKNOWN_EXIT_CODE)