#!/usr/bin/env python3
"""Nagios-style check for a ZFS pool and its cache/log vdevs.

``--check-type status`` evaluates the pool's own capacity, fragmentation and
health (read through ``zfslib``) and prints a table of the pool's devices.
``--check-type cache`` / ``log`` evaluates the devices listed under the
matching section of ``zpool list -v``.

The process exit code is one of the ``checker.nagios`` state constants.
"""
import argparse
import re
import subprocess
import sys
import traceback

import zfslib as zfs

from checker import nagios, dict_to_perfdata
from checker.markdown import list_to_markdown_table
from checker.nagios import state_to_txt
from checker.units import filesize

# TODO: add perfdata

# awk program selecting the indented rows of ``zpool list -v`` that appear
# before the first line mentioning "logs" or "cache".
# GPT-4's original awk command was this:
# awk '/logs/ || /cache/ {{exit}} /^[[:space:]]+[^[:space:]]/ || /^[[:space:]]{2,}ata-/'
_POOL_AWK = '/logs/ || /cache/ {exit} /^[[:space:]]+[^[:space:]]/'

# Row labels indexed by severity (0 = ok, 1 = warning, 2 = critical).
_SEVERITY_LABELS = ('[OK]', '[WARNING]', '[CRITICAL]')

_TABLE_HEADER = ('Device', 'Size', 'Alloc', 'Free', 'Frag', 'Cap', 'Health', 'State')

# Minimum number of whitespace-separated columns get_vdev_info() indexes into.
_MIN_COLUMNS = 10


def parse_size(size_str):
    """Convert a size string with a one-letter unit suffix into a float.

    The last character selects the multiplier (``k`` = 1, ``m`` = 1024,
    ``g`` = 1024**2, ...), so the result is expressed in units of the ``k``
    suffix. Matching is case-insensitive.

    Raises:
        KeyError: the last character is not one of ``k m g t p``.
        ValueError: the part before the suffix is not a number.
    """
    multipliers = {'k': 1, 'm': 1024, 'g': 1024 ** 2, 't': 1024 ** 3, 'p': 1024 ** 4}
    text = size_str.lower()
    return float(text[:-1]) * multipliers[text[-1]]


def percent_to_float(percent_str: str):
    """Turn ``'65%'`` (or ``'65'``) into the fraction ``0.65``."""
    return float(percent_str.strip('%')) / 100


def float_to_percent(float_value):
    """Turn a fraction such as ``0.65`` into the string ``'65.0%'``.

    The percentage is rounded to two decimal places.
    """
    return f"{round(float(float_value) * 100, 2)}%"


def _percent_int(fraction):
    """Return a fraction as a whole-number percentage (truncated), for perfdata."""
    return int(round(float(fraction) * 100, 2))


def clean_device_list(in_str: str):
    """Drop leading whitespace and collapse every whitespace run to one space."""
    return re.sub(r'\s+', ' ', re.sub(r'^\s+', '', in_str))


def zpool_list(zpool: str, vdev_type: str, header: bool = False):
    """Return the rows of ``zpool list -v <zpool>`` belonging to *vdev_type*.

    Args:
        zpool: pool name passed to ``zpool list -v``.
        vdev_type: ``'pool'`` for the rows before the logs/cache sections, or
            ``'cache'`` / ``'log'`` for the indented rows following the line
            that matches that word.
        header: also emit the first line of the listing (cache/log only).

    Returns:
        The filtered listing decoded as UTF-8.

    Raises:
        NotImplementedError: unknown *vdev_type*, or ``header=True`` together
            with ``vdev_type='pool'``.

    If ``zpool`` or ``awk`` exits non-zero, an UNKNOWN line is printed and the
    process exits with ``nagios.UNKNOWN``.
    """
    if vdev_type == 'pool':
        if header:
            raise NotImplementedError('not implemented for pool')
        awk_program = _POOL_AWK
    elif vdev_type in ('cache', 'log'):
        awk_program = ('/' + vdev_type + '/ '
                       '{while(getline && substr($0, 1, 1) ~ /[[:blank:]]/) print}')
        if header:
            awk_program = 'NR==1 {print} ' + awk_program
    else:
        raise NotImplementedError

    try:
        # Argument lists instead of a shell string: the pool name can no
        # longer be interpreted by a shell, and a failing `zpool` is detected
        # (a shell pipeline only reports awk's exit status).
        listing = subprocess.run(['zpool', 'list', '-v', zpool], check=True,
                                 stdout=subprocess.PIPE, stderr=subprocess.PIPE).stdout
        filtered = subprocess.run(['awk', awk_program], input=listing, check=True,
                                  stdout=subprocess.PIPE, stderr=subprocess.PIPE).stdout
    except subprocess.CalledProcessError as e:
        print('UNKNOWN - failed to check pool:', e.stderr.decode(sys.getfilesystemencoding()))
        sys.exit(nagios.UNKNOWN)
    return filtered.decode('utf-8')


def check_vdev_devices(vdev_devices: list, critical_free, warning_free, critical_frag, warning_frag):
    """Classify every device by health, capacity and fragmentation.

    Args:
        vdev_devices: device dicts as produced by :func:`get_vdev_info`.
        critical_free, warning_free: capacity thresholds as fractions; a
            device's ``cap`` at or above the threshold triggers.
        critical_frag, warning_frag: fragmentation thresholds as fractions.

    Returns:
        ``(critical, warning, states)`` where ``critical`` maps
        ``'cap'``/``'frag'``/``'health'`` and ``warning`` maps
        ``'cap'``/``'frag'`` to lists of device names, and ``states`` maps
        each device name to ``'[OK]'``, ``'[WARNING]'`` or ``'[CRITICAL]'``.

    Side effect: ``cap`` and ``frag`` values other than ``'-'`` are replaced
    in place by their float fraction; ``main`` relies on this when it formats
    the table.
    """
    critical = {'cap': [], 'frag': [], 'health': []}
    warning = {'cap': [], 'frag': []}
    states = {}
    limits = (
        ('cap', critical_free, warning_free),
        ('frag', critical_frag, warning_frag),
    )
    for device in vdev_devices:
        name = device['device']
        severity = 0
        unhealthy = device['health'] != 'ONLINE'
        if unhealthy:
            critical['health'].append(name)
            severity = 2
        for key, crit_limit, warn_limit in limits:
            if device[key] == '-':
                continue
            device[key] = percent_to_float(device[key])
            if device[key] >= crit_limit:
                critical[key].append(name)
                severity = 2
            elif not unhealthy and device[key] >= warn_limit:
                # Warnings are skipped only for a device that is itself
                # unhealthy, so the result does not depend on device order.
                warning[key].append(name)
                # Never downgrade a state that is already critical.
                severity = max(severity, 1)
        states[name] = _SEVERITY_LABELS[severity]
    return critical, warning, states


def is_dash(string: str):
    """Return True when *string* is exactly ``'-'``."""
    return string == '-'


def get_vdev_info(zpool: str, vdev_type: str):
    """Parse the ``zpool list -v`` rows for *vdev_type* into device dicts.

    Each dict has the keys ``pool``, ``device``, ``size``, ``alloc``,
    ``free``, ``frag``, ``cap`` and ``health``, taken from columns
    0, 1, 2, 3, 6, 7 and 9 of the row.
    NOTE(review): this assumes the column layout
    NAME SIZE ALLOC FREE CKPOINT EXPANDSZ FRAG CAP DEDUP HEALTH -- confirm
    against the installed ZFS version.

    Raises:
        ValueError: a non-blank row has fewer columns than expected.
    """
    zpool_vdev_devices = []
    for line in zpool_list(zpool, vdev_type).split('\n'):
        data = clean_device_list(line).split()
        if not data:
            continue  # empty or whitespace-only line
        if len(data) < _MIN_COLUMNS:
            raise ValueError(f'unexpected zpool list output line: {line!r}')
        zpool_vdev_devices.append({
            # TODO: better pool detection, e.g.
            # not (is_dash(data[2]) and is_dash(data[3]) and is_dash(data[6]) and is_dash(data[7]))
            'pool': False,
            'device': data[0],
            'size': data[1],
            'alloc': data[2],
            'free': data[3],
            'frag': data[6],
            'cap': data[7],
            'health': data[9],
        })
    return zpool_vdev_devices


def get_zpool_zfs_properties(pool_name: str):
    """Fetch selected pool properties through ``zfslib``.

    Returns:
        A dict with the keys ``allocated``, ``capacity``, ``fragmentation``,
        ``free``, ``health`` and ``size``, each holding whatever
        ``pool.pool.get_property`` returns for that name.
    """
    conn = zfs.Connection(host='localhost')
    poolset = conn.load_poolset()
    pool = poolset.get_pool(pool_name)
    wanted = ('allocated', 'capacity', 'fragmentation', 'free', 'health', 'size')
    return {prop: pool.pool.get_property(prop) for prop in wanted}


def _join_words(words):
    """Join words as 'a', 'a and b' or 'a, b and c'."""
    if len(words) <= 2:
        return ' and '.join(words)
    return ', '.join(words[:-1]) + ' and ' + words[-1]


def _describe_problems(label, adjective, problems, pool_name):
    """Build the status-line parts for the non-empty entries of *problems*.

    *problems* is one of the dicts returned by check_vdev_devices(). Devices
    and issue names are listed in a stable order, without duplicates.
    """
    titles = (('cap', 'capacity'), ('frag', 'fragmentation'), ('health', 'health'))
    issues = [title for key, title in titles if problems.get(key)]
    drives = list(dict.fromkeys(
        name for key, _ in titles for name in problems.get(key, [])))
    noun = 'devices' if len(drives) > 1 else 'device'
    return [label, '-', f'{adjective} {_join_words(issues)}',
            f'for {noun} for {pool_name}:', ', '.join(drives)]


def _device_row(device, states):
    """Return one table row for *device*."""
    return (device['device'], device['size'], device['alloc'], device['free'],
            device['frag'], device['cap'], device['health'], states[device['device']])


def _check_pool_status(args, warning_free, critical_free, warning_frag, critical_frag):
    """Run the ``status`` check: print status line, table and perfdata, then exit."""
    vdev_devices = [x for x in get_vdev_info(args.pool_name, 'pool') if not x['pool']]
    if not vdev_devices:
        print('UNKNOWN - no devices found')
        sys.exit(nagios.UNKNOWN)

    pool_status = get_zpool_zfs_properties(args.pool_name)
    pool_status['capacity'] = percent_to_float(f"{pool_status['capacity']}%")
    pool_status['fragmentation'] = percent_to_float(f"{pool_status['fragmentation']}%")
    capacity = pool_status['capacity']
    fragmentation = pool_status['fragmentation']

    # Collect every critical issue rather than only the first one found.
    exit_code = nagios.OK
    issues = []
    if capacity >= critical_free:
        issues.append('capacity')
    if fragmentation >= critical_frag:
        issues.append('fragmentation')
    if pool_status['health'] != 'ONLINE':
        issues.append('health')
    if issues:
        exit_code = nagios.CRITICAL
    else:
        # Warnings are only considered when nothing is critical.
        if capacity >= warning_free:
            issues.append('capacity')
        if fragmentation >= warning_frag:
            issues.append('fragmentation')
        if issues:
            exit_code = nagios.WARNING

    if exit_code == nagios.CRITICAL:
        print('CRITICAL - pool', args.pool_name, 'is unhealthy:', ', '.join(issues))
    elif exit_code == nagios.WARNING:
        print('WARNING - pool', args.pool_name, 'is unhealthy:', ', '.join(issues))
    elif exit_code == nagios.OK:
        print('OK - pool', args.pool_name, 'is healthy')

    # Per-device states are shown in the table only; they do not change the
    # exit code of the status check.
    _critical, _warning, states = check_vdev_devices(
        vdev_devices, critical_free, warning_free, critical_frag, warning_frag)
    table_data = [
        _TABLE_HEADER,
        (args.pool_name,
         filesize(pool_status['size'], spaces=False, formatter=False),
         filesize(pool_status['allocated'], spaces=False, formatter=False),
         filesize(pool_status['free'], spaces=False, formatter=False),
         float_to_percent(fragmentation),
         float_to_percent(capacity),
         pool_status['health'],
         f"[{state_to_txt(exit_code).upper()}]"),
    ]
    for device in vdev_devices:
        # check_vdev_devices() left floats where it could parse a percentage.
        if isinstance(device['frag'], float):
            device['frag'] = float_to_percent(device['frag'])
        if isinstance(device['cap'], float):
            device['cap'] = float_to_percent(device['cap'])
        table_data.append(_device_row(device, states))

    perf_data = {
        'capacity': {
            'value': _percent_int(capacity),
            'warn': args.warning_free,
            'crit': args.critical_free,
            'min': 0,
            'unit': '%',
        },
        'size': {
            'value': filesize(pool_status['size'], spaces=False, formatter=True),
            'min': 0,
        },
        'fragmentation': {
            'value': _percent_int(fragmentation),
            'warn': args.warning_frag,
            'crit': args.critical_frag,
            'min': 0,
            'unit': '%',
        },
    }
    perf_data_str = dict_to_perfdata(perf_data)
    print(list_to_markdown_table(table_data, align='center', seperator='!', borders=False),
          '|', perf_data_str)
    sys.exit(exit_code)


def _check_vdevs(args, warning_free, critical_free, warning_frag, critical_frag):
    """Run the ``cache`` / ``log`` check: print status line and table, then exit."""
    vdev_devices = get_vdev_info(args.pool_name, args.check_type)
    if not vdev_devices:
        print('UNKNOWN - no devices found')
        sys.exit(nagios.UNKNOWN)

    critical, warning, states = check_vdev_devices(
        vdev_devices, critical_free, warning_free, critical_frag, warning_frag)

    table_data = [_TABLE_HEADER]
    for device in vdev_devices:
        if device['frag'] != '-':
            device['frag'] = float_to_percent(device['frag'])
        if device['cap'] != '-':
            device['cap'] = float_to_percent(device['cap'])
        table_data.append(_device_row(device, states))

    if any(critical.values()):
        exit_code = nagios.CRITICAL
        out_str = _describe_problems('CRITICAL', 'critical', critical, args.pool_name)
    elif any(warning.values()):
        exit_code = nagios.WARNING
        out_str = _describe_problems('WARNING', 'warning-level', warning, args.pool_name)
    else:
        exit_code = nagios.OK
        # Rows whose name starts with "mirror" are not counted as devices.
        device_count = sum(
            1 for x in vdev_devices if not x['device'].startswith('mirror'))
        out_str = ['OK', '-',
                   f'{device_count} {args.check_type} devices for {args.pool_name} are healthy']

    print(*out_str)
    print(list_to_markdown_table(table_data, align='left', seperator='!', borders=False))
    sys.exit(exit_code)


def main(args):
    """Dispatch to the check selected by ``args.check_type``.

    Reads ``pool_name``, ``check_type``, ``warning_free``, ``critical_free``,
    ``warning_frag`` and ``critical_frag`` from *args*. The thresholds are
    whole-number percentages. Always terminates through ``sys.exit``.
    """
    warning_free_float = percent_to_float(f'{args.warning_free}%')
    critical_free_float = percent_to_float(f'{args.critical_free}%')
    warning_frag_float = percent_to_float(f'{args.warning_frag}%')
    critical_frag_float = percent_to_float(f'{args.critical_frag}%')

    if args.check_type == 'status':
        _check_pool_status(args, warning_free_float, critical_free_float,
                           warning_frag_float, critical_frag_float)
    elif args.check_type in ('cache', 'log'):
        _check_vdevs(args, warning_free_float, critical_free_float,
                     warning_frag_float, critical_frag_float)
    else:
        print(f'UNKNOWN - unsupported check type: {args.check_type}')
        sys.exit(nagios.UNKNOWN)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Check ZFS pool status')
    parser.add_argument('--pool-name', required=True, help='Name of the ZFS pool to check.')
    parser.add_argument('--check-type', required=True, choices=['status', 'cache', 'log'],
                        help='What to check.')
    # The "free" thresholds are compared against the used-capacity value.
    parser.add_argument('--warning-free', type=int, default=65,
                        help='Warning level for used capacity percentage (default: 65)')
    parser.add_argument('--critical-free', type=int, default=80,
                        help='Critical level for used capacity percentage (default: 80)')
    parser.add_argument('--warning-frag', type=int, default=50,
                        help='Warning level for fragmentation percentage (default: 50)')
    parser.add_argument('--critical-frag', type=int, default=75,
                        help='Critical level for fragmentation percentage (default: 75)')
    args = parser.parse_args()

    try:
        main(args)
    except Exception as e:
        # Any unexpected failure must still map to a Nagios state.
        # sys.exit() raises SystemExit, which is not caught here.
        print(f'UNKNOWN: exception "{e}"')
        print(traceback.format_exc())
        sys.exit(nagios.UNKNOWN)