icinga2-checks/check_zfs_zpool.py

303 lines
13 KiB
Python
Executable File

#!/usr/bin/env python3
import argparse
import re
import subprocess
import sys
import zfslib as zfs
from checker import nagios
from checker.markdown import list_to_markdown_table
from checker.units import filesize
# TODO: add perfdata
def parse_size(size_str):
    """Convert a size string with a one-letter suffix into a float.

    The last character selects a binary multiplier (case-insensitive):
    ``k`` = 1, ``m`` = 1024, ``g`` = 1024**2, ``t`` = 1024**3, ``p`` = 1024**4.
    The result is therefore expressed in multiples of the ``k`` unit.

    :param size_str: number followed by one of ``k/m/g/t/p``, e.g. ``'1.5G'``.
    :return: the scaled value as a float.
    :raises ValueError: if the numeric part cannot be parsed.
    :raises KeyError: if the suffix is not one of the known letters.
    """
    # Each successive suffix is another factor of 1024 above 'k'.
    multipliers = {suffix: 1024 ** power for power, suffix in enumerate('kmgtp')}
    normalized = size_str.lower()
    # Parse the number first so malformed input fails on the numeric part.
    magnitude = float(normalized[:-1])
    return magnitude * multipliers[normalized[-1]]
def percent_to_float(percent_str: str):
    """Turn a percentage string into a fraction.

    Leading and trailing ``%`` characters are removed before parsing, so
    ``'65%'`` and ``'65'`` both yield ``0.65``.

    :param percent_str: percentage text such as ``'65%'``.
    :return: the percentage divided by 100, as a float.
    :raises ValueError: if the remaining text is not a valid number.
    """
    return float(percent_str.strip('%')) / 100
def float_to_percent(float_value):
    """Format a fraction as a percentage string.

    The value is coerced with ``float()``, multiplied by 100 and rounded to
    two decimal places, e.g. ``0.5`` -> ``'50.0%'``.

    :param float_value: fraction to format (anything ``float()`` accepts).
    :return: the percentage followed by ``%``.
    """
    scaled = float(float_value) * 100
    return f"{round(scaled, 2)}%"
def clean_device_list(in_str: str):
    """Normalise the whitespace of one line of text.

    Leading whitespace is removed and every remaining run of whitespace is
    collapsed to a single space. Trailing whitespace is not stripped; it ends
    up as one trailing space.

    :param in_str: the raw line.
    :return: the line with normalised whitespace.
    """
    # The trailing '|' adds an empty alternative to the pattern. Replacing an
    # empty match with '' changes nothing, so the net effect of this step is
    # simply dropping the leading whitespace.
    without_indent = re.sub(r'^\s*|', '', in_str)
    return re.sub(r'\s+', ' ', without_indent)
def zpool_list(zpool: str, vdev_type: str, header: bool = False):
    """Run ``zpool list -v`` for a pool and return the lines for one section.

    The listing is filtered with awk:

    * ``'pool'``: indented lines are printed until a line containing ``logs``
      or ``cache`` is reached.
    * ``'cache'`` / ``'log'``: after a line matching the section name, the
      following lines are printed for as long as they start with a blank.
      With ``header=True`` the first line of the listing is printed as well.

    Both commands are started without a shell, so the pool name is passed to
    ``zpool`` as a single argument and is never interpreted by ``sh``. Running
    ``zpool`` as its own process (instead of the left side of a shell
    pipeline) also means its exit status is checked rather than masked by
    awk's.

    :param zpool: name of the pool to list.
    :param vdev_type: ``'pool'``, ``'cache'`` or ``'log'``.
    :param header: include the listing's first line (cache/log only).
    :return: the filtered listing decoded as UTF-8.
    :raises NotImplementedError: for ``header=True`` with ``'pool'`` or for an
        unknown ``vdev_type``.

    If either command fails, an ``UNKNOWN`` line is printed and the process
    exits with ``nagios.UNKNOWN``.
    """
    if vdev_type == 'pool':
        if header:
            raise NotImplementedError('not implemented for pool')
        # GPT-4's original awk command was this:
        # awk '/logs/ || /cache/ {{exit}} /^[[:space:]]+[^[:space:]]/ || /^[[:space:]]{2,}ata-/'
        awk_program = '/logs/ || /cache/ {exit} /^[[:space:]]+[^[:space:]]/'
    elif vdev_type in ('cache', 'log'):
        # vdev_type is restricted to the two literals above, so interpolating
        # it into the awk program is safe.
        awk_program = f'/{vdev_type}/ {{while(getline && substr($0, 1, 1) ~ /[[:blank:]]/) print}}'
        if header:
            awk_program = 'NR==1 {print} ' + awk_program
    else:
        raise NotImplementedError

    try:
        listing = subprocess.run(['zpool', 'list', '-v', zpool], stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE, check=True)
        filtered = subprocess.run(['awk', awk_program], input=listing.stdout, stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE, check=True)
    except subprocess.CalledProcessError as e:
        print('UNKNOWN - failed to check pool:', e.stderr.decode(sys.getfilesystemencoding()))
        sys.exit(nagios.UNKNOWN)
    except OSError as e:
        # Raised when zpool or awk cannot be started at all (e.g. not installed).
        print('UNKNOWN - failed to check pool:', e)
        sys.exit(nagios.UNKNOWN)
    return filtered.stdout.decode('utf-8')
def check_vdev_devices(vdev_devices: list, critical_free, warning_free, critical_frag, warning_frag):
    """Classify each device by health, capacity and fragmentation.

    Side effect: for every device whose ``'cap'`` / ``'frag'`` value is not
    ``'-'``, the value is replaced in place by its fraction (via
    ``percent_to_float``).

    A warning is only recorded while no device processed so far has failed
    the health check.

    :param vdev_devices: list of dicts with at least the keys ``'device'``,
        ``'health'``, ``'cap'`` and ``'frag'``.
    :param critical_free: capacity fraction at or above which a device is critical.
    :param warning_free: capacity fraction at or above which a device is a warning.
    :param critical_frag: fragmentation fraction at or above which a device is critical.
    :param warning_frag: fragmentation fraction at or above which a device is a warning.
    :return: ``(critical, warning, states)`` where ``critical`` maps
        ``'cap'``/``'frag'``/``'health'`` to device names, ``warning`` maps
        ``'cap'``/``'frag'`` to device names, and ``states`` maps each device
        name to ``'[OK]'``, ``'[WARNING]'`` or ``'[CRITICAL]'``.
    """
    critical = {'cap': [], 'frag': [], 'health': []}
    warning = {'cap': [], 'frag': []}
    states = {}
    thresholds = (
        ('cap', critical_free, warning_free),
        ('frag', critical_frag, warning_frag),
    )
    for device in vdev_devices:
        name = device['device']
        state = '[OK]'
        if device['health'] != 'ONLINE':
            critical['health'].append(name)
            state = '[CRITICAL]'
        for metric, critical_limit, warning_limit in thresholds:
            if device[metric] == '-':
                continue
            device[metric] = percent_to_float(device[metric])
            if device[metric] >= critical_limit:
                critical[metric].append(name)
                state = '[CRITICAL]'
            elif not critical['health'] and device[metric] >= warning_limit:
                warning[metric].append(name)
                # A warning on one metric must not downgrade a device that
                # another check already marked as critical.
                if state != '[CRITICAL]':
                    state = '[WARNING]'
        states[name] = state
    return critical, warning, states
def is_dash(string: str):
    """Return True when *string* is exactly a single ``-`` character."""
    placeholder = '-'
    return string == placeholder
def get_vdev_info(zpool: str, vdev_type: str):
    """Parse the ``zpool_list`` output for one section into a list of dicts.

    Every non-empty line is split on whitespace and the columns at positions
    0, 1, 2, 3, 6, 7 and 9 are stored as ``device``, ``size``, ``alloc``,
    ``free``, ``frag``, ``cap`` and ``health``. The ``pool`` flag is False
    only when alloc, free, frag and cap are all ``'-'``.

    NOTE(review): the column positions presumably follow the ``zpool list -v``
    layout of the ZFS version in use -- confirm when upgrading ZFS.

    :param zpool: name of the pool.
    :param vdev_type: section to read, passed through to ``zpool_list``.
    :return: list of per-line dicts, in output order.
    :raises IndexError: if a line has fewer than ten columns.
    """
    raw_listing = zpool_list(zpool, vdev_type)
    devices = []
    for row in raw_listing.split('\n'):
        if not row:
            continue
        columns = [token for token in clean_device_list(row).split(' ') if token]
        # Equivalent to "not every one of these columns is a dash".
        has_values = any(columns[index] != '-' for index in (2, 3, 6, 7))
        devices.append({
            'pool': has_values,
            'device': columns[0],
            'size': columns[1],
            'alloc': columns[2],
            'free': columns[3],
            'frag': columns[6],
            'cap': columns[7],
            'health': columns[9],
        })
    return devices
def get_zpool_zfs_properties(pool_name: str):
    """Read a fixed set of properties for a pool through ``zfslib``.

    Connects to ``localhost``, loads the pool set, looks up *pool_name* and
    queries each property with ``pool.pool.get_property``.

    :param pool_name: name of the pool to query.
    :return: dict with the keys ``allocated``, ``capacity``,
        ``fragmentation``, ``free``, ``health`` and ``size``.
    """
    property_names = ('allocated', 'capacity', 'fragmentation', 'free', 'health', 'size')
    connection = zfs.Connection(host='localhost')
    pool = connection.load_poolset().get_pool(pool_name)
    return {name: pool.pool.get_property(name) for name in property_names}
_TABLE_HEADER = ('Device', 'Size', 'Alloc', 'Free', 'Frag', 'Cap', 'Health', 'State')


def _status_label(exit_code):
    """Return the bracketed table label for a Nagios exit code."""
    if exit_code == nagios.OK:
        return '[OK]'
    if exit_code == nagios.WARNING:
        return '[WARNING]'
    return '[CRITICAL]'


def _unique(items):
    """Drop duplicates while keeping first-seen order (deterministic output)."""
    return list(dict.fromkeys(items))


def _describe(level, cap_devices, frag_devices):
    """Build e.g. 'critical capacity and fragmentation' from the non-empty lists."""
    affected = [label for label, devices in (('capacity', cap_devices), ('fragmentation', frag_devices))
                if devices]
    return f"{level} {' and '.join(affected)}" if affected else None


def _device_summary(status, info_str, drives, pool_name):
    """Assemble the words of the first output line for a list of affected drives."""
    noun = 'devices' if len(drives) > 1 else 'device'
    return [status, '-', info_str, f'for {noun} for {pool_name}:', ', '.join(drives)]


def _device_row(device, states):
    """Return one table row for a device, formatting float frag/cap values as percentages."""
    frag = float_to_percent(device['frag']) if isinstance(device['frag'], float) else device['frag']
    cap = float_to_percent(device['cap']) if isinstance(device['cap'], float) else device['cap']
    return (device['device'], device['size'], device['alloc'], device['free'], frag, cap,
            device['health'], states[device['device']])


def _check_pool_status(args):
    """Run the 'status' check: evaluate the pool itself and list its devices. Exits the process."""
    vdev_devices = [x for x in get_vdev_info(args.pool_name, 'pool') if not x['pool']]
    if not vdev_devices:
        print('UNKNOWN - no devices found')
        sys.exit(nagios.UNKNOWN)

    pool_status = get_zpool_zfs_properties(args.pool_name)
    pool_status['capacity'] = percent_to_float(f"{pool_status['capacity']}%")
    pool_status['fragmentation'] = percent_to_float(f"{pool_status['fragmentation']}%")

    exit_code = nagios.OK
    issues = []

    # Every critical condition is checked independently so that all of them
    # are reported, not just the first one that matches.
    if pool_status['capacity'] >= args.critical_free:
        issues.append('capacity')
    if pool_status['fragmentation'] >= args.critical_frag:
        issues.append('fragmentation')
    if pool_status['health'] != 'ONLINE':
        issues.append('health')

    if issues:
        exit_code = nagios.CRITICAL
    else:
        # Warnings are only evaluated when nothing is critical.
        if pool_status['capacity'] >= args.warning_free:
            issues.append('capacity')
        if pool_status['fragmentation'] >= args.warning_frag:
            issues.append('fragmentation')
        if issues:
            exit_code = nagios.WARNING

    if exit_code == nagios.CRITICAL:
        print('CRITICAL - pool', args.pool_name, 'is unhealthy:', ', '.join(issues))
    elif exit_code == nagios.WARNING:
        print('WARNING - pool', args.pool_name, 'is unhealthy:', ', '.join(issues))
    elif exit_code == nagios.OK:
        print('OK - pool', args.pool_name, 'is healthy')

    # The per-device results only feed the table; they do not change the exit code.
    critical, warning, states = check_vdev_devices(vdev_devices, args.critical_free, args.warning_free,
                                                   args.critical_frag, args.warning_frag)
    table_data = [
        _TABLE_HEADER,
        (args.pool_name, filesize(pool_status['size'], spaces=False, formatter=False),
         filesize(pool_status['allocated'], spaces=False, formatter=False),
         filesize(pool_status['free'], spaces=False, formatter=False),
         float_to_percent(pool_status['fragmentation']),
         float_to_percent(pool_status['capacity']),
         pool_status['health'], _status_label(exit_code)),
    ]
    table_data.extend(_device_row(device, states) for device in vdev_devices)
    print(list_to_markdown_table(table_data, align='left', seperator='!', borders=False))
    sys.exit(exit_code)


def _check_vdev_group(args):
    """Run the 'cache' / 'log' check on the devices of that section. Exits the process."""
    vdev_devices = get_vdev_info(args.pool_name, args.check_type)
    if not vdev_devices:
        print('UNKNOWN - no devices found')
        sys.exit(nagios.UNKNOWN)

    critical, warning, states = check_vdev_devices(vdev_devices, args.critical_free, args.warning_free,
                                                   args.critical_frag, args.warning_frag)
    table_data = [_TABLE_HEADER]
    table_data.extend(_device_row(device, states) for device in vdev_devices)

    exit_code = nagios.OK
    out_str = ['OK', '-', f'{len(vdev_devices)} {args.check_type} devices for {args.pool_name} are healthy']

    # Kinds of critical problems found, in a fixed order.
    issues = [label for label, key in (('capacity', 'cap'), ('fragmentation', 'frag'), ('health', 'health'))
              if critical[key]]
    if issues:
        exit_code = nagios.CRITICAL
        if critical['health']:
            # A health failure takes over the description.
            info_str = "shit's fucked"
        else:
            info_str = _describe('critical', critical['cap'], critical['frag'])
        crit_drives = _unique([*critical['cap'], *critical['frag'], *critical['health']])
        out_str = _device_summary('CRITICAL', info_str, crit_drives, args.pool_name)

    # Warnings are ignored entirely when any device failed its health check.
    has_warnings = bool(warning['cap'] or warning['frag']) and not critical['health']
    if has_warnings:
        if exit_code == nagios.CRITICAL:
            if len(issues) > 1:  # if there's only one issue, don't assume there are also warnings
                out_str = ['CRITICAL', '-', 'multiple issues: ' + ' and '.join(issues)]
        else:
            exit_code = nagios.WARNING
            warn_drives = _unique([*warning['cap'], *warning['frag']])
            out_str = _device_summary('WARNING', _describe('warning', warning['cap'], warning['frag']),
                                      warn_drives, args.pool_name)

    print(*out_str)
    print(list_to_markdown_table(table_data, align='left', seperator='!', borders=False))
    sys.exit(exit_code)


def main():
    """Parse the command line and run the requested ZFS pool check.

    ``--check-type status`` evaluates the pool's own capacity, fragmentation
    and health; ``cache`` and ``log`` evaluate the devices of that section.
    Each check prints a one-line summary followed by a table and terminates
    the process with the matching Nagios exit code.

    The threshold options are given as whole percentages and converted to
    fractions before use. Although the capacity options are named
    ``--*-free``, they are compared against the used-capacity value.
    """
    parser = argparse.ArgumentParser(description='Check ZFS pool status')
    parser.add_argument('--pool-name', required=True, help='Name of the ZFS pool to check.')
    parser.add_argument('--check-type', required=True, choices=['status', 'cache', 'log'], help='What to check.')
    parser.add_argument('--warning-free', type=int, default=65,
                        help='Warning level for used capacity percentage (default: 65)')
    parser.add_argument('--critical-free', type=int, default=80,
                        help='Critical level for used capacity percentage (default: 80)')
    parser.add_argument('--warning-frag', type=int, default=50,
                        help='Warning level for fragmentation percentage (default: 50)')
    parser.add_argument('--critical-frag', type=int, default=75,
                        help='Critical level for fragmentation percentage (default: 75)')
    args = parser.parse_args()

    args.warning_free = percent_to_float(f'{args.warning_free}%')
    args.critical_free = percent_to_float(f'{args.critical_free}%')
    args.warning_frag = percent_to_float(f'{args.warning_frag}%')
    args.critical_frag = percent_to_float(f'{args.critical_frag}%')

    if args.check_type == 'status':
        _check_pool_status(args)
    elif args.check_type in ['cache', 'log']:
        _check_vdev_group(args)
# Script entry point: any unexpected exception is reported as UNKNOWN together
# with its traceback. sys.exit() raises SystemExit, which is not an Exception
# subclass, so the exit codes chosen inside main() pass through untouched.
if __name__ == "__main__":
    try:
        main()
    except Exception as exc:
        import traceback

        print(f'UNKNOWN: exception "{exc}"')
        print(traceback.format_exc())
        sys.exit(nagios.UNKNOWN)