#!/usr/bin/env python3
"""Nagios-style check for a ZFS pool or its cache/log vdevs.

Usage: ``<script> POOL {status,cache,log} [threshold options]``.

* ``status`` prints the pool's ``zpool list`` figures and exits 0/1/2
  depending on used capacity and fragmentation.
* ``cache`` / ``log`` inspect the matching section of ``zpool list -v`` and
  report per-device capacity, fragmentation and health, followed by a table.
"""
import argparse
import re
import subprocess
import sys

from checker import nagios
from checker.markdown import list_to_markdown_table


def parse_size(size_str):
    """Convert a zpool size string such as ``'1.5T'`` to a number of KiB.

    The suffix (case-insensitive) selects the multiplier; ``k`` is the base
    unit so the result is only meaningful relative to other parsed sizes.
    A ``b`` suffix or a bare number is interpreted as bytes.

    Raises:
        ValueError: if the numeric part cannot be parsed.
    """
    size_str = size_str.strip().lower()
    size_map = {
        'b': 1 / 1024,
        'k': 1,
        'm': 1024,
        'g': 1024 ** 2,
        't': 1024 ** 3,
        'p': 1024 ** 4,
    }
    suffix = size_str[-1:]
    if suffix in size_map:
        return float(size_str[:-1]) * size_map[suffix]
    # No unit suffix: treat the value as a plain byte count.
    return float(size_str) / 1024


def percent_to_float(percent_str: str):
    """Convert a percentage string such as ``'50%'`` to a fraction (0.5)."""
    return float(percent_str.strip('%')) / 100


def float_to_percent(float_value: float):
    """Format a fraction (0.5) as a percentage string (``'50.0%'``).

    The value is rounded to two decimals so binary floating point noise
    (e.g. ``0.57 * 100 == 56.99999999999999``) does not leak into output.
    """
    return f"{round(float_value * 100, 2)}%"


def clean_device_list(in_str: str):
    """Strip leading whitespace and collapse each whitespace run to one space."""
    return re.sub(r'\s+', ' ', re.sub(r'^\s+', '', in_str))


def _optional_fraction(percent_str: str):
    """Return ``percent_str`` as a fraction, or None when it is ``'-'``/empty."""
    if percent_str.strip() in ('', '-'):
        return None
    return percent_to_float(percent_str)


def _unique(items):
    """Return ``items`` without duplicates, keeping first-seen order."""
    return list(dict.fromkeys(items))


def _select_vdev_rows(listing: str, vdev_type: str, header: bool = False):
    """Pick the indented rows that follow each line matching ``vdev_type``.

    This mirrors the awk filter the script used previously: ``vdev_type`` is
    searched as a regular expression anywhere in a line; after a match, the
    following lines are emitted for as long as they start with a space or
    tab.  The first non-indented line ends the run and is itself not tested
    against the pattern.  With ``header`` the first line of ``listing`` is
    emitted as well.

    Returns the selected lines, each terminated by a newline.
    """
    lines = listing.split('\n')
    if lines and lines[-1] == '':
        lines.pop()  # artefact of the trailing newline, not a real record

    selected = []
    if header and lines:
        selected.append(lines[0])

    pattern = re.compile(vdev_type)
    index = 0
    while index < len(lines):
        line = lines[index]
        index += 1
        if not pattern.search(line):
            continue
        while index < len(lines):
            candidate = lines[index]
            index += 1
            if candidate[:1] not in (' ', '\t'):
                break
            selected.append(candidate)
    return ''.join(f'{row}\n' for row in selected)


def zpool_list(zpool: str, vdev_type: str, header: bool = False):
    """Return the ``zpool list -v`` rows belonging to a vdev section.

    Args:
        zpool: pool name, passed to ``zpool`` as a single argument (no shell).
        vdev_type: regular expression identifying the section line,
            e.g. ``'cache'`` or ``'log'``.
        header: also include the first line of the ``zpool list -v`` output.

    Raises:
        subprocess.CalledProcessError: if ``zpool`` exits non-zero.  The
            earlier shell pipeline hid this behind awk's exit status.
    """
    listing = subprocess.check_output(['zpool', 'list', '-v', zpool]).decode('utf-8')
    return _select_vdev_rows(listing, vdev_type, header)


def get_vdev_info(zpool: str, vdev_type: str):
    """Parse the rows of a vdev section into a list of dicts.

    Each dict has the keys ``device``, ``size``, ``alloc``, ``free``,
    ``frag``, ``cap`` and ``health``, all as the strings zpool printed.

    Raises:
        ValueError: if a row has fewer than the ten columns indexed below.
    """
    zpool_vdev_devices = []
    for line in zpool_list(zpool, vdev_type).split('\n'):
        data = clean_device_list(line).split()
        if not data:
            continue
        # Columns are addressed by position; index 9 is the last one read.
        if len(data) < 10:
            raise ValueError(f'unexpected `zpool list -v` row: {line!r}')
        zpool_vdev_devices.append({
            'device': data[0],
            'size': data[1],
            'alloc': data[2],
            'free': data[3],
            'frag': data[6],
            'cap': data[7],
            'health': data[9],
        })
    return zpool_vdev_devices


def get_zfs_pool_status(pool_name):
    """Collect pool-level figures plus the first row of the ``logs`` section.

    Returns a dict with ``name``, ``size``, ``allocated``, ``free``,
    ``capacity``, ``fragmentation``, ``health`` (from ``zpool list -H``) and
    ``log_device_status`` / ``log_device_alloc`` (from ``zpool status -v``;
    both None when no ``logs`` section is found).

    If either ``zpool`` call fails, the error is printed and the process
    exits with status 2.
    """
    try:
        listing = subprocess.run(
            ['zpool', 'list', '-H', '-o', 'name,size,alloc,free,cap,frag,health', pool_name],
            capture_output=True, text=True, check=True)
        status = subprocess.run(
            ['zpool', 'status', '-v', pool_name],
            capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error: {e}")
        sys.exit(2)

    pool_info = listing.stdout.strip().split('\t')
    pool_status = {
        'name': pool_info[0],
        'size': pool_info[1],
        'allocated': pool_info[2],
        'free': pool_info[3],
        'capacity': pool_info[4],
        'fragmentation': pool_info[5],
        'health': pool_info[6],
    }

    log_device_status = None
    log_device_alloc = None
    status_lines = iter(status.stdout.strip().split('\n'))
    for line in status_lines:
        # Only a line consisting solely of "logs" starts the section; a
        # substring test would also fire on e.g. a pool named "mylogs".
        if line.split() != ['logs']:
            continue
        fields = next(status_lines, '').split()
        if len(fields) >= 2:
            # NOTE(review): if the `zpool status` config table is
            # NAME STATE READ WRITE CKSUM, these are the CKSUM and STATE
            # columns respectively -- confirm this is what is intended.
            log_device_status = fields[-1]
            log_device_alloc = fields[1]
        break
    pool_status['log_device_status'] = log_device_status
    pool_status['log_device_alloc'] = log_device_alloc
    return pool_status


def _check_pool_status(pool_name, cap_warn, cap_crit, frag_warn, frag_crit):
    """Print the pool summary and exit 0/1/2; thresholds are fractions."""
    pool_status = get_zfs_pool_status(pool_name)
    print(f"Pool Name: {pool_status['name']}")
    print(f"Size: {pool_status['size']}")
    print(f"Allocated: {pool_status['allocated']}")
    print(f"Free: {pool_status['free']}")
    print(f"Capacity: {pool_status['capacity']}")
    print(f"Fragmentation: {pool_status['fragmentation']}")
    print(f"Health: {pool_status['health']}")
    if pool_status['log_device_status'] is not None:
        print(f"Log Device Status: {pool_status['log_device_status']}")
        print(f"Log Device Allocation: {pool_status['log_device_alloc']}")
    else:
        print("No log devices found")

    size = parse_size(pool_status['size'])
    free = parse_size(pool_status['free'])
    # Thresholds grow from warning to critical, so they are compared against
    # the used share of the pool, the same way the cache/log check uses CAP.
    used = 1 - (free / size) if size else 0.0
    frag = _optional_fraction(pool_status['fragmentation'])

    if used >= cap_crit or (frag is not None and frag >= frag_crit):
        print("CRITICAL")
        sys.exit(2)
    elif used >= cap_warn or (frag is not None and frag >= frag_warn):
        print("WARNING")
        sys.exit(1)
    else:
        print("OK")
        sys.exit(0)


def _evaluate_vdevs(vdev_devices, cap_warn, cap_crit, frag_warn, frag_crit):
    """Classify each device; return ``(table_data, critical, warning)``.

    ``critical`` and ``warning`` map a finding kind to the device names that
    triggered it.  Thresholds are fractions.  A ``-`` in the frag/cap column
    is shown as ``-`` and never triggers a finding.
    """
    table_data = [('Device', 'Size', 'Alloc', 'Free', 'Frag', 'Cap', 'Health', 'State')]
    critical = {'cap': [], 'frag': [], 'health': []}
    warning = {'cap': [], 'frag': []}

    for device in vdev_devices:
        name = device['device']
        cap = _optional_fraction(device['cap'])
        frag = _optional_fraction(device['frag'])
        healthy = device['health'] == 'ONLINE'
        cap_critical = cap is not None and cap >= cap_crit
        frag_critical = frag is not None and frag >= frag_crit

        if cap_critical:
            critical['cap'].append(name)
        if frag_critical:
            critical['frag'].append(name)
        if not healthy:
            critical['health'].append(name)
        state = 'critical' if (cap_critical or frag_critical or not healthy) else 'ok'

        # Warnings are only evaluated for a device that is itself healthy,
        # and never for a metric that is already critical on that device.
        if healthy:
            cap_warning = cap is not None and cap >= cap_warn and not cap_critical
            frag_warning = frag is not None and frag >= frag_warn and not frag_critical
            if cap_warning:
                warning['cap'].append(name)
            if frag_warning:
                warning['frag'].append(name)
            if state == 'ok' and (cap_warning or frag_warning):
                state = 'warning'  # never downgrade a critical device

        table_data.append((
            name, device['size'], device['alloc'], device['free'],
            float_to_percent(frag) if frag is not None else '-',
            float_to_percent(cap) if cap is not None else '-',
            device['health'], state,
        ))
    return table_data, critical, warning


def _summarise_vdevs(critical, warning, device_count, check_type):
    """Turn the findings into ``(exit_code, words_of_the_status_line)``."""
    has_warning = bool(warning['cap'] or warning['frag'])

    if critical['cap'] or critical['frag'] or critical['health']:
        if critical['health']:
            # A health problem takes priority over capacity/fragmentation.
            info_str = 'unhealthy state'
        elif critical['cap'] and critical['frag']:
            info_str = 'critical capacity and fragmentation'
        elif critical['cap']:
            info_str = 'critical capacity'
        else:
            info_str = 'critical fragmentation'
        if has_warning and not critical['health']:
            info_str = 'multiple issues'
        drives = _unique([*critical['cap'], *critical['frag'], *critical['health']])
        return nagios.CRITICAL, [
            'CRITICAL', '-', info_str,
            f'for {"drives" if len(drives) > 1 else "drive"}',
            ', '.join(drives),
        ]

    if has_warning:
        if warning['cap'] and warning['frag']:
            info_str = 'high capacity and fragmentation'
        elif warning['cap']:
            info_str = 'high capacity'
        else:
            info_str = 'high fragmentation'
        drives = _unique([*warning['cap'], *warning['frag']])
        return nagios.WARNING, [
            'WARNING', '-', info_str,
            f'for {"drives" if len(drives) > 1 else "drive"}',
            ', '.join(drives),
        ]

    return nagios.OK, ['OK', '-', f'{device_count} {check_type} devices are healthy']


def main():
    """Parse the command line, run the requested check and exit with its status."""
    parser = argparse.ArgumentParser(description='Check ZFS pool status')
    parser.add_argument('pool_name', help='Name of the ZFS pool to check')
    parser.add_argument('check_type', choices=['status', 'cache', 'log'], help='What to check.')
    parser.add_argument('--warning-free', type=int, default=65,
                        help='Warning level for used capacity percentage (default: 65)')
    parser.add_argument('--critical-free', type=int, default=80,
                        help='Critical level for used capacity percentage (default: 80)')
    parser.add_argument('--warning-frag', type=int, default=50,
                        help='Warning level for fragmentation percentage (default: 50)')
    parser.add_argument('--critical-frag', type=int, default=75,
                        help='Critical level for fragmentation percentage (default: 75)')
    args = parser.parse_args()

    # All comparisons below work on fractions (0.0 - 1.0).
    cap_warn = args.warning_free / 100
    cap_crit = args.critical_free / 100
    frag_warn = args.warning_frag / 100
    frag_crit = args.critical_frag / 100

    if args.check_type == 'status':
        _check_pool_status(args.pool_name, cap_warn, cap_crit, frag_warn, frag_crit)
    elif args.check_type in ['cache', 'log']:
        vdev_devices = get_vdev_info(args.pool_name, args.check_type)
        table_data, critical, warning = _evaluate_vdevs(
            vdev_devices, cap_warn, cap_crit, frag_warn, frag_crit)
        exit_code, out_str = _summarise_vdevs(
            critical, warning, len(vdev_devices), args.check_type)
        print(*out_str)
        print(list_to_markdown_table(table_data, align='left', seperator='!', borders=False))
        sys.exit(exit_code)


if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        print(f'UNKNOWN: exception "{e}"')
        import traceback

        print(traceback.format_exc())
        sys.exit(nagios.UNKNOWN)