diff --git a/check_zfs_zpool.py b/check_zfs_zpool.py index d713a5f..b7aaae6 100755 --- a/check_zfs_zpool.py +++ b/check_zfs_zpool.py @@ -4,8 +4,14 @@ import re import subprocess import sys +import zfslib as zfs + from checker import nagios from checker.markdown import list_to_markdown_table +from checker.units import filesize + + +# TODO: add perfdata def parse_size(size_str): @@ -21,7 +27,7 @@ def percent_to_float(percent_str: str): def float_to_percent(float_value: float): - percent = float_value * 100 + percent = round(float_value * 100, 2) return f"{percent}%" @@ -30,10 +36,45 @@ def clean_device_list(in_str: str): def zpool_list(zpool: str, vdev_type: str, header: bool = False): - if not header: - return subprocess.check_output(f"zpool list -v {zpool} | awk '/{vdev_type}/ {{while(getline && substr($0, 1, 1) ~ /[[:blank:]]/) print}}'", shell=True).decode('utf-8') - else: - return subprocess.check_output(f"zpool list -v {zpool} | awk 'NR==1 {{print}} /{vdev_type}/ {{while(getline && substr($0, 1, 1) ~ /[[:blank:]]/) print}}'", shell=True).decode('utf-8') + try: + if not header: + return subprocess.check_output(f"zpool list -v {zpool} | awk '/{vdev_type}/ {{while(getline && substr($0, 1, 1) ~ /[[:blank:]]/) print}}'", shell=True, stderr=subprocess.PIPE).decode('utf-8') + else: + return subprocess.check_output(f"zpool list -v {zpool} | awk 'NR==1 {{print}} /{vdev_type}/ {{while(getline && substr($0, 1, 1) ~ /[[:blank:]]/) print}}'", shell=True, stderr=subprocess.PIPE).decode('utf-8') + except subprocess.CalledProcessError as e: + print('UNKNOWN - failed to check pool:', e.stderr.decode(sys.getfilesystemencoding())) + sys.exit(nagios.UNKNOWN) + + +def check_vdev_devices(vdev_devices: list, critical_free, warning_free, critical_frag, warning_frag): + critical = {'cap': [], 'frag': [], 'health': []} + warning = {'cap': [], 'frag': []} + states = {} + for device in vdev_devices: + state = '[OK]' + + if device['health'] != 'ONLINE': + critical['health'].append(device['device']) + state = '[CRITICAL]' + + if device['cap'] != '-': + device['cap'] = percent_to_float(device['cap']) + if device['cap'] >= critical_free: + critical['cap'].append(device['device']) + state = '[CRITICAL]' + elif not len(critical['health']) and device['cap'] >= warning_free: + warning['cap'].append(device['device']) + state = '[WARNING]' + if device['frag'] != '-': + device['frag'] = percent_to_float(device['frag']) + if device['frag'] >= critical_frag: + critical['frag'].append(device['device']) + state = '[CRITICAL]' + elif not len(critical['health']) and device['frag'] >= warning_frag: + warning['frag'].append(device['device']) + state = '[WARNING]' + states[device['device']] = state + return critical, warning, states def get_vdev_info(zpool: str, vdev_type: str): @@ -53,147 +94,146 @@ def get_vdev_info(zpool: str, vdev_type: str): return zpool_vdev_devices -def get_zfs_pool_status(pool_name): - try: - result = subprocess.run(['zpool', 'list', '-H', '-o', 'name,size,alloc,free,cap,frag,health', pool_name], capture_output=True, text=True, check=True) - pool_info = result.stdout.strip().split('\t') - pool_status = { - 'name': pool_info[0], - 'size': pool_info[1], - 'allocated': pool_info[2], - 'free': pool_info[3], - 'capacity': pool_info[4], - 'fragmentation': pool_info[5], - 'health': pool_info[6] - } +def get_zpool_zfs_properties(pool_name: str): + conn = zfs.Connection(host='localhost') + poolset = conn.load_poolset() + zfs_properties = { + 'allocated': None, + 'capacity': None, + 'fragmentation': None, + 'free': None, + 'health': None, + 'size': None, + } - result = subprocess.run(['zpool', 'status', '-v', pool_name], capture_output=True, text=True, check=True) - pool_status_lines = result.stdout.strip().split('\n') - for i in range(len(pool_status_lines)): - pool_status_lines[i] = re.sub(r'\\t\s*', '', pool_status_lines[i]) + pool = poolset.get_pool(pool_name) + for prop, value in zfs_properties.items(): + zfs_properties[prop] = pool.pool.get_property(prop) - print(pool_status_lines) - - log_device_status = None - log_device_alloc = None - log_device_found = False - for line in pool_status_lines: - if 'logs' in line: - log_device_found = True - elif log_device_found: - log_device_status = line.strip().split()[-1] - log_device_alloc = line.strip().split()[1] - break - - pool_status['log_device_status'] = log_device_status - pool_status['log_device_alloc'] = log_device_alloc - - return pool_status - - except subprocess.CalledProcessError as e: - print(f"Error: {e}") - sys.exit(2) + return zfs_properties def main(): parser = argparse.ArgumentParser(description='Check ZFS pool status') - parser.add_argument('pool_name', help='Name of the ZFS pool to check') - parser.add_argument('check_type', choices=['status', 'cache', 'log'], help='What to check.') + parser.add_argument('--pool-name', required=True, help='Name of the ZFS pool to check.') + parser.add_argument('--check-type', required=True, choices=['status', 'cache', 'log'], help='What to check.') parser.add_argument('--warning-free', type=int, default=65, help='Warning level for free space percentage (default: 65)') parser.add_argument('--critical-free', type=int, default=80, help='Critical level for free space percentage (default: 80)') parser.add_argument('--warning-frag', type=int, default=50, help='Warning level for fragmentation percentage (default: 50)') parser.add_argument('--critical-frag', type=int, default=75, help='Critical level for fragmentation percentage (default: 75)') args = parser.parse_args() - args.warning_free = float(f'0.{args.warning_free}') - args.critical_free = float(f'0.{args.critical_free}') + args.warning_free = percent_to_float(f'{args.warning_free}%') + args.critical_free = percent_to_float(f'{args.critical_free}%') + args.warning_frag = percent_to_float(f'{args.warning_frag}%') + args.critical_frag = percent_to_float(f'{args.critical_frag}%') if args.check_type == 'status': - pool_status = get_zfs_pool_status(args.pool_name) + vdev_devices = [x for x in get_vdev_info(args.pool_name, args.pool_name) if not x['device'].startswith('mirror-')] + if not len(vdev_devices): + print('UNKNOWN - no devices found') + sys.exit(nagios.UNKNOWN) - print(f"Pool Name: {pool_status['name']}") - print(f"Size: {pool_status['size']}") - print(f"Allocated: {pool_status['allocated']}") - print(f"Free: {pool_status['free']}") - print(f"Capacity: {pool_status['capacity']}") - print(f"Fragmentation: {pool_status['fragmentation']}") - print(f"Health: {pool_status['health']}") + pool_status = get_zpool_zfs_properties(args.pool_name) + exit_code = nagios.OK + issues = [] - if pool_status['log_device_status'] is not None: - print(f"Log Device Status: {pool_status['log_device_status']}") - print(f"Log Device Allocation: {pool_status['log_device_alloc']}") - else: - print("No log devices found") + pool_status['capacity'] = percent_to_float(f"{pool_status['capacity']}%") + pool_status['fragmentation'] = percent_to_float(f"{pool_status['fragmentation']}%") - size_bytes = parse_size(pool_status['size']) - free_bytes = parse_size(pool_status['free']) - free_percentage = (free_bytes / size_bytes) * 100 - fragmentation_percentage = int(pool_status['fragmentation'].rstrip('%')) + # Check for critical + if pool_status['capacity'] >= args.critical_free: + exit_code = nagios.CRITICAL + issues.append('capacity') + elif pool_status['fragmentation'] >= args.critical_frag: + exit_code = nagios.CRITICAL + issues.append('fragmentation') + elif pool_status['health'] != 'ONLINE': + exit_code = nagios.CRITICAL + issues.append('health') - if free_percentage <= args.critical_free or fragmentation_percentage >= args.critical_frag: - print("CRITICAL") - sys.exit(2) - elif free_percentage <= args.warning_free or fragmentation_percentage >= args.warning_frag: - print("WARNING") - sys.exit(1) - else: - print("OK") - sys.exit(0) - elif args.check_type in ['cache', 'log']: - vdev_devices = get_vdev_info(args.pool_name, args.check_type) - table_data = [('Device', 'Size', 'Alloc', 'Free', 'Frag', 'Cap', 'Health', 'State')] - critical = {'cap': [], 'frag': [], 'health': []} - warning = {'cap': [], 'frag': []} + # Check for warnings + if exit_code == nagios.OK: + if pool_status['capacity'] >= args.warning_free: + exit_code = nagios.WARNING + issues.append('capacity') + elif pool_status['fragmentation'] >= args.warning_frag: + exit_code = nagios.WARNING + issues.append('fragmentation') + + # Print the status + if exit_code == nagios.CRITICAL: + print('CRITICAL - pool', args.pool_name, 'is unhealthy:', ', '.join(issues)) + elif exit_code == nagios.WARNING: + print('WARNING - pool', args.pool_name, 'is unhealthy:', ', '.join(issues)) + elif exit_code == nagios.OK: + print('OK - pool', args.pool_name, 'is healthy') + + # Build the table + critical, warning, states = check_vdev_devices(vdev_devices, args.critical_free, args.warning_free, args.critical_frag, args.warning_frag) + table_data = [ + ('Device', 'Size', 'Alloc', 'Free', 'Frag', 'Cap', 'Health', 'State'), + (args.pool_name, filesize(pool_status['size'], spaces=False, formatter=False), filesize(pool_status['allocated'], spaces=False, formatter=False), filesize(pool_status['free'], spaces=False, formatter=False), float_to_percent(pool_status['fragmentation']), + float_to_percent(pool_status['capacity']), + pool_status['health'], f"[{('ok' if exit_code == nagios.OK else 'critical').upper()}]") + ] for device in vdev_devices: - device['cap'] = percent_to_float(device['cap']) - device['frag'] = percent_to_float(device['frag']) - state = 'ok' + for device in vdev_devices: + if isinstance(device['frag'], float): + device['frag'] = float_to_percent(device['frag']) + if isinstance(device['cap'], float): + device['cap'] = float_to_percent(device['cap']) + table_data.append((device['device'], device['size'], device['alloc'], device['free'], device['frag'], device['cap'], device['health'], states[device['device']])) - if device['cap'] >= args.critical_free: - critical['cap'].append(device['device']) - state = 'critical' - if device['frag'] >= args.critical_frag: - critical['frag'].append(device['device']) - state = 'critical' - if device['health'] != 'ONLINE': - critical['health'].append(device['device']) - state = 'critical' + print(list_to_markdown_table(table_data, align='left', seperator='!', borders=False)) + sys.exit(exit_code) - if not len(critical['health']): - if device['cap'] >= args.warning_free and device['device'] not in critical['cap']: - warning['cap'].append(device['device']) - state = 'warning' - if device['frag'] >= args.warning_frag and device['device'] not in critical['frag']: - warning['frag'].append(device['device']) - state = 'warning' + elif args.check_type in ['cache', 'log']: - table_data.append((device['device'], device['size'], device['alloc'], device['free'], float_to_percent(device['frag']), float_to_percent(device['cap']), device['health'], state)) + vdev_devices = get_vdev_info(args.pool_name, args.check_type) + if not len(vdev_devices): + print('UNKNOWN - no devices found') + sys.exit(nagios.UNKNOWN) + table_data = [('Device', 'Size', 'Alloc', 'Free', 'Frag', 'Cap', 'Health', 'State')] + critical, warning, states = check_vdev_devices(vdev_devices, args.critical_free, args.warning_free, args.critical_frag, args.warning_frag) + + for device in vdev_devices: + table_data.append((device['device'], device['size'], device['alloc'], device['free'], float_to_percent(device['frag']), float_to_percent(device['cap']), device['health'], states[device['device']])) exit_code = nagios.OK out_str = None info_str = None crit_drives = [] warn_drives = [] + issues = set() if len(critical['cap']) or len(critical['frag']) or len(critical['health']): exit_code = nagios.CRITICAL if len(critical['cap']) and len(critical['frag']): info_str = 'critical capacity and fragmentation' crit_drives = [*critical['cap'], *critical['frag']] + issues.add('capacity') + issues.add('fragmentation') elif len(critical['cap']) and not len(critical['frag']): info_str = 'critical capacity' crit_drives = critical['cap'] + issues.add('capacity') elif not len(critical['cap']) and len(critical['frag']): info_str = 'critical fragmentation' crit_drives = critical['frag'] + issues.add('fragmentation') if len(critical['health']): info_str = "shit's fucked" crit_drives = crit_drives + critical['health'] - out_str = ['CRITICAL', '-', info_str, f'for {"drives" if len(crit_drives) > 1 else "drive"}', ', '.join([*set(crit_drives)])] + issues.add('health') + out_str = ['CRITICAL', '-', info_str, f'for {"devices" if len(crit_drives) > 1 else "devices"} for {args.pool_name}:', ', '.join([*set(crit_drives)])] if len(warning['cap']) or len(warning['frag']) and not len(critical['health']): if exit_code < nagios.WARNING: exit_code = nagios.WARNING elif exit_code == nagios.CRITICAL: - out_str[2] = 'multiple issues' + if len(issues) > 1: # if there's only one issue, don't assume there are also warnings + out_str[2] = 'multiple issues: ' + ' and '.join(issues) + del out_str[3] + del out_str[3] else: if len(warning['cap']) and len(warning['frag']): info_str = 'critical capacity and fragmentation' @@ -204,14 +244,13 @@ def main(): elif not len(warning['cap']) and len(warning['frag']): info_str = 'critical fragmentation' warn_drives = warning['frag'] - out_str = ['WARNING', '-', info_str, f'for {"drives" if len(warn_drives) > 1 else "drive"}', ', '.join([*set(warn_drives)])] + out_str = ['WARNING', '-', info_str, f'for {"devices" if len(warn_drives) > 1 else "devices"} for {args.pool_name}:', ', '.join([*set(warn_drives)])] if not len(warn_drives) and not len(crit_drives): - out_str = ['OK', '-', f'{len(vdev_devices)} {args.check_type} devices are healthy'] + out_str = ['OK', '-', f'{len(vdev_devices)} {args.check_type} devices for {args.pool_name} are healthy'] print(*out_str) print(list_to_markdown_table(table_data, align='left', seperator='!', borders=False)) - # print(zpool_list(args.pool_name, args.check_type, True)) # for testing sys.exit(exit_code) diff --git a/checker/markdown.py b/checker/markdown.py index 73d6d93..59a99b6 100644 --- a/checker/markdown.py +++ b/checker/markdown.py @@ -1,4 +1,4 @@ -def list_to_markdown_table(array, align: str = None, seperator: str = '|', borders: bool = True): +def list_to_markdown_table(array, align: str = None, seperator: str = '|', borders: bool = True, right_align_first_item: bool = True): """ https://gist.github.com/OsKaR31415/955b166f4a286ed427f667cb21d57bfd Args: @@ -7,6 +7,7 @@ def list_to_markdown_table(array, align: str = None, seperator: str = '|', borde align: The alignment of the cells : 'left', 'center' or 'right'. seperator: borders: + right_align_first_item: """ # make sure every elements are strings array = [[str(elt) for elt in line] for line in array] @@ -14,8 +15,12 @@ def list_to_markdown_table(array, align: str = None, seperator: str = '|', borde widths = [max(len(line[i]) for line in array) for i in range(len(array[0]))] # make every width at least 3 colmuns, because the separator needs it widths = [max(w, 3) for w in widths] + # center text according to the widths - array = [[elt.center(w) for elt, w in zip(line, widths)] for line in array] + if right_align_first_item: + array = [[elt.ljust(w) if i == 0 else elt.center(w) for i, (elt, w) in enumerate(zip(line, widths))] for line in array] + else: + array = [[elt.center(w) for elt, w in zip(line, widths)] for line in array] # separate the header and the body array_head, *array_body = array @@ -25,7 +30,10 @@ def list_to_markdown_table(array, align: str = None, seperator: str = '|', borde else: edge_seperator = '' - header = ((edge_seperator + ' ') if borders else '') + f' {seperator} '.join(array_head) + ((' ' + edge_seperator) if borders else '') + if right_align_first_item: + header = ((edge_seperator + ' ') if borders else '') + array_head[0].ljust(widths[0]) + f' {seperator} ' + f' {seperator} '.join([elt.center(w) for elt, w in zip(array_head[1:], widths[1:])]) + ((' ' + edge_seperator) if borders else '') + else: + header = ((edge_seperator + ' ') if borders else '') + f' {seperator} '.join(array_head) + ((' ' + edge_seperator) if borders else '') # alignment of the cells align = str(align).lower() # make sure `align` is a lowercase string diff --git a/checker/units.py b/checker/units.py index 1e59612..a9a4643 100644 --- a/checker/units.py +++ b/checker/units.py @@ -1,15 +1,19 @@ from hurry.filesize import size -def filesize(bytes: int, spaces: bool = True): - system = [ - (1024 ** 5, ' PB'), - (1024 ** 4, ' TB'), - (1024 ** 3, ' GB'), - (1024 ** 2, ' MB'), - (1024 ** 1, ' KB'), - (1024 ** 0, ' B'), - ] - x = size(bytes, system=system) + +def filesize(bytes: int, spaces: bool = True, formatter: bool = True): + if formatter: + system = [ + (1024 ** 5, ' PB'), + (1024 ** 4, ' TB'), + (1024 ** 3, ' GB'), + (1024 ** 2, ' MB'), + (1024 ** 1, ' KB'), + (1024 ** 0, ' B'), + ] + x = size(bytes, system=system) + else: + x = size(bytes) if spaces: return x else: diff --git a/requirements.txt b/requirements.txt index 0c37cf4..43953ac 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,5 +12,6 @@ aiofiles~=0.6.0 markdown~=3.4.1 psutil~=5.9.4 hurry.filesize -certifi -cloudflarepycli +certifi~=2022.12.7 +cloudflarepycli~=1.7.0 +zfslib~=0.11.0 \ No newline at end of file