2023-05-29 23:01:58 -06:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
import argparse
|
|
|
|
import re
|
|
|
|
import subprocess
|
|
|
|
import sys
|
|
|
|
|
2023-05-30 00:46:53 -06:00
|
|
|
import zfslib as zfs
|
|
|
|
|
2024-02-23 10:57:26 -07:00
|
|
|
from checker import nagios, dict_to_perfdata
|
2023-05-29 23:01:58 -06:00
|
|
|
from checker.markdown import list_to_markdown_table
|
2023-07-06 09:44:37 -06:00
|
|
|
from checker.nagios import state_to_txt
|
2023-05-30 00:46:53 -06:00
|
|
|
from checker.units import filesize
|
2024-02-23 10:57:26 -07:00
|
|
|
from checker.zfs import zfs_get_free
|
2023-05-30 00:46:53 -06:00
|
|
|
|
|
|
|
|
|
|
|
# TODO: add perfdata to the cache and log checks (only the status check emits it so far)
|
2023-05-29 23:01:58 -06:00
|
|
|
|
|
|
|
|
|
|
|
def parse_size(size_str):
    """Convert a size string with a one-letter unit suffix into a float.

    The last character (case-insensitive) selects the multiplier applied to
    the leading number: ``k`` -> 1, ``m`` -> 1024, ``g`` -> 1024**2,
    ``t`` -> 1024**3, ``p`` -> 1024**4. The result is therefore expressed in
    multiples of the ``k`` unit.

    :param size_str: text such as ``"10G"`` or ``"1.5m"``.
    :return: the scaled value as a float.
    :raises ValueError: if everything before the last character is not a number.
    :raises KeyError: if the last character is not one of ``k/m/g/t/p``.
    """
    multipliers = {'k': 1, 'm': 1024, 'g': 1024 ** 2, 't': 1024 ** 3, 'p': 1024 ** 4}
    lowered = size_str.lower()
    # Parse the number first so a malformed number is reported before a bad suffix.
    magnitude = float(lowered[:-1])
    return magnitude * multipliers[lowered[-1]]
|
|
|
|
|
|
|
|
|
|
|
|
def percent_to_float(percent_str: str):
    """Turn a percentage string into a fraction.

    Any leading or trailing ``%`` characters are removed, the remainder is
    parsed as a float and divided by 100 (``"50%"`` -> ``0.5``).

    :param percent_str: text such as ``"65%"`` or ``"65"``.
    :return: the percentage as a float fraction.
    :raises ValueError: if the text without ``%`` is not a number.
    """
    return float(percent_str.strip('%')) / 100
|
|
|
|
|
|
|
|
|
2023-05-30 12:38:25 -06:00
|
|
|
def float_to_percent(float_value):
    """Format a fraction as a percentage string.

    The value is coerced with ``float()``, multiplied by 100, rounded to two
    decimal places and suffixed with ``%`` (``0.5`` -> ``"50.0%"``).

    :param float_value: a number, or anything ``float()`` accepts.
    :return: the percentage text.
    """
    scaled = float(float_value) * 100
    return f"{round(scaled, 2)}%"
|
|
|
|
|
|
|
|
|
|
|
|
def clean_device_list(in_str: str):
    """Normalise the whitespace of one line of tabular command output.

    Leading whitespace is removed and every remaining run of whitespace is
    collapsed to a single space. Trailing whitespace is *not* removed; it
    becomes one trailing space, so callers that split on ``' '`` should drop
    empty fields.

    The previous implementation used the pattern ``^\\s*|`` for the first
    step; its dangling empty alternative matched the empty string at every
    position and only the leading-whitespace branch had any effect.
    ``str.lstrip()`` expresses that directly with identical output.

    :param in_str: a raw line of text.
    :return: the line with whitespace normalised as described above.
    """
    return re.sub(r'\s+', ' ', in_str.lstrip())
|
|
|
|
|
|
|
|
|
|
|
|
def zpool_list(zpool: str, vdev_type: str, header: bool = False):
    """Return the rows of ``zpool list -v <zpool>`` that belong to one section.

    The listing is filtered with awk:

    * ``vdev_type == 'pool'``: print lines that start with whitespace followed
      by a non-space character, stopping at the first line that contains
      ``logs`` or ``cache``.
    * ``vdev_type in ('cache', 'log')``: for each line matching the vdev type,
      print the following lines for as long as they start with a blank. With
      ``header=True`` the first line of the listing is printed as well.

    Both commands are started from argument lists rather than through a shell
    pipeline. The pool name is therefore never interpreted by a shell, and a
    non-zero exit status from ``zpool`` itself reaches the error handler
    below (in a shell pipeline only awk's exit status would be visible).

    :param zpool: name of the pool to list.
    :param vdev_type: ``'pool'``, ``'cache'`` or ``'log'``.
    :param header: also include the listing's first line (cache/log only).
    :return: the filtered listing decoded as UTF-8 text.
    :raises NotImplementedError: for an unknown ``vdev_type``, or for
        ``vdev_type='pool'`` together with ``header=True``.

    If either command exits non-zero, an ``UNKNOWN`` line with the command's
    stderr is printed and the process exits with ``nagios.UNKNOWN``.
    """
    if vdev_type == 'pool':
        if header:
            raise NotImplementedError('not implemented for pool')
        # GPT-4's original awk command was this:
        # awk '/logs/ || /cache/ {exit} /^[[:space:]]+[^[:space:]]/ || /^[[:space:]]{2,}ata-/'
        awk_program = '/logs/ || /cache/ {exit} /^[[:space:]]+[^[:space:]]/'
    elif vdev_type in ('cache', 'log'):
        awk_program = f'/{vdev_type}/ {{while(getline && substr($0, 1, 1) ~ /[[:blank:]]/) print}}'
        if header:
            awk_program = 'NR==1 {print} ' + awk_program
    else:
        raise NotImplementedError

    try:
        listing = subprocess.run(['zpool', 'list', '-v', zpool], check=True,
                                 stdout=subprocess.PIPE, stderr=subprocess.PIPE).stdout
        filtered = subprocess.run(['awk', awk_program], input=listing, check=True,
                                  stdout=subprocess.PIPE, stderr=subprocess.PIPE).stdout
    except subprocess.CalledProcessError as e:
        print('UNKNOWN - failed to check pool:', e.stderr.decode(sys.getfilesystemencoding()))
        sys.exit(nagios.UNKNOWN)
    return filtered.decode('utf-8')
|
|
|
|
|
|
|
|
|
|
|
|
def check_vdev_devices(vdev_devices: list, critical_free, warning_free, critical_frag, warning_frag):
    """Classify every device by health, capacity and fragmentation.

    Each device dict must provide the keys ``device``, ``health``, ``cap`` and
    ``frag``. ``cap`` and ``frag`` are percentage strings or ``'-'``; values
    other than ``'-'`` are converted **in place** to float fractions with
    ``percent_to_float`` (callers rely on that when they format the table).

    Rules, applied per device in this order:

    * ``health != 'ONLINE'``  -> critical (``health``)
    * ``cap >= critical_free`` -> critical (``cap``), else
      ``cap >= warning_free`` -> warning (``cap``)
    * ``frag >= critical_frag`` -> critical (``frag``), else
      ``frag >= warning_frag`` -> warning (``frag``)

    Warnings are only recorded while no device seen so far has failed the
    health check (this mirrors the original behaviour and depends on device
    order).

    The per-device state only ever escalates: a device that is already
    ``[CRITICAL]`` is no longer downgraded to ``[WARNING]`` by a later
    warning-level metric.

    :param vdev_devices: list of device dicts (mutated as described above).
    :param critical_free: capacity fraction at/above which a device is critical.
    :param warning_free: capacity fraction at/above which a device is a warning.
    :param critical_frag: fragmentation fraction at/above which a device is critical.
    :param warning_frag: fragmentation fraction at/above which a device is a warning.
    :return: ``(critical, warning, states)`` where ``critical`` maps
        ``cap``/``frag``/``health`` to device-name lists, ``warning`` maps
        ``cap``/``frag`` to device-name lists, and ``states`` maps each device
        name to ``'[OK]'``, ``'[WARNING]'`` or ``'[CRITICAL]'``.
    """
    critical = {'cap': [], 'frag': [], 'health': []}
    warning = {'cap': [], 'frag': []}
    states = {}
    thresholds = (
        ('cap', critical_free, warning_free),
        ('frag', critical_frag, warning_frag),
    )

    for device in vdev_devices:
        name = device['device']
        state = '[OK]'

        if device['health'] != 'ONLINE':
            critical['health'].append(name)
            state = '[CRITICAL]'

        for metric, critical_level, warning_level in thresholds:
            if device[metric] == '-':
                # zpool reports '-' when the metric does not apply to this row.
                continue
            device[metric] = percent_to_float(device[metric])
            if device[metric] >= critical_level:
                critical[metric].append(name)
                state = '[CRITICAL]'
            elif not critical['health'] and device[metric] >= warning_level:
                warning[metric].append(name)
                if state != '[CRITICAL]':
                    # Never downgrade a device that is already critical.
                    state = '[WARNING]'

        states[name] = state

    return critical, warning, states
|
2023-05-29 23:01:58 -06:00
|
|
|
|
2023-07-06 09:44:37 -06:00
|
|
|
|
|
|
|
def is_dash(string: str):
    """Report whether ``string`` is exactly the one-character text ``-``."""
    placeholder = '-'
    return string == placeholder
|
2023-05-29 23:01:58 -06:00
|
|
|
|
2023-07-06 09:44:37 -06:00
|
|
|
|
2023-05-29 23:01:58 -06:00
|
|
|
def get_vdev_info(zpool: str, vdev_type: str):
    """Parse the filtered ``zpool list -v`` rows into one dict per device.

    The text returned by ``zpool_list(zpool, vdev_type)`` is split into lines;
    empty lines are skipped. Each remaining line is whitespace-normalised with
    ``clean_device_list`` and split on spaces (empty fields dropped). Fields
    0, 1, 2, 3, 6, 7 and 9 are stored as ``device``, ``size``, ``alloc``,
    ``free``, ``frag``, ``cap`` and ``health``.

    :param zpool: name of the pool.
    :param vdev_type: section to read, passed through to ``zpool_list``.
    :return: list of device dicts, in listing order.
    :raises IndexError: if a non-empty line has fewer than ten fields.
    """
    listing = zpool_list(zpool, vdev_type)
    devices = []
    for row in listing.split('\n'):
        if not row:
            continue
        columns = [field for field in clean_device_list(row).split(' ') if field]
        devices.append({
            # TODO: better pool detection
            # Disabled heuristic: not (is_dash(columns[2]) and is_dash(columns[3])
            #                          and is_dash(columns[6]) and is_dash(columns[7]))
            'pool': False,
            'device': columns[0],
            'size': columns[1],
            'alloc': columns[2],
            'free': columns[3],
            'frag': columns[6],
            'cap': columns[7],
            'health': columns[9],
        })
    return devices
|
|
|
|
|
|
|
|
|
2023-05-30 00:46:53 -06:00
|
|
|
def get_zpool_zfs_properties(pool_name: str):
    """Fetch a fixed set of pool properties through ``zfslib``.

    Opens a ``zfslib`` connection to ``localhost``, loads the pool set, looks
    up ``pool_name`` and reads each property with ``pool.pool.get_property``.

    :param pool_name: name of the pool to query.
    :return: dict with the keys ``allocated``, ``capacity``, ``fragmentation``,
        ``free``, ``health`` and ``size`` (in that order), holding whatever
        ``get_property`` returned for each.
    """
    property_names = ('allocated', 'capacity', 'fragmentation', 'free', 'health', 'size')
    poolset = zfs.Connection(host='localhost').load_poolset()
    pool = poolset.get_pool(pool_name)
    return {name: pool.pool.get_property(name) for name in property_names}
|
2023-05-29 23:01:58 -06:00
|
|
|
|
|
|
|
|
|
|
|
def main():
    """Parse the command line and run the requested ZFS pool check.

    ``--check-type status`` checks the pool as a whole; ``cache`` and ``log``
    check the devices of that section. The percentage thresholds given on the
    command line are converted to fractions (``65`` -> ``0.65``) before use.
    Every check path ends by calling ``sys.exit`` with a Nagios exit code.
    """
    parser = argparse.ArgumentParser(description='Check ZFS pool status')
    parser.add_argument('--pool-name', required=True, help='Name of the ZFS pool to check.')
    parser.add_argument('--check-type', required=True, choices=['status', 'cache', 'log'], help='What to check.')
    # NOTE(review): the "free" thresholds are compared against the *capacity*
    # (used) percentage further down, so the help text looks misleading - confirm.
    parser.add_argument('--warning-free', type=int, default=65,
                        help='Warning level for free space percentage (default: 65)')
    parser.add_argument('--critical-free', type=int, default=80,
                        help='Critical level for free space percentage (default: 80)')
    parser.add_argument('--warning-frag', type=int, default=50,
                        help='Warning level for fragmentation percentage (default: 50)')
    parser.add_argument('--critical-frag', type=int, default=75,
                        help='Critical level for fragmentation percentage (default: 75)')
    args = parser.parse_args()

    # Work with fractions internally so thresholds compare directly against
    # the values produced by percent_to_float().
    args.warning_free = percent_to_float(f'{args.warning_free}%')
    args.critical_free = percent_to_float(f'{args.critical_free}%')
    args.warning_frag = percent_to_float(f'{args.warning_frag}%')
    args.critical_frag = percent_to_float(f'{args.critical_frag}%')

    if args.check_type == 'status':
        _check_pool_status(args)
    elif args.check_type in ['cache', 'log']:
        _check_section_devices(args)


def _check_pool_status(args):
    """Run the ``status`` check: pool-wide capacity, fragmentation and health.

    Prints the status line, then a device table followed by perfdata, and
    exits with the resulting Nagios code.
    """
    vdev_devices = [x for x in get_vdev_info(args.pool_name, 'pool') if not x['pool']]
    if not vdev_devices:
        print('UNKNOWN - no devices found')
        sys.exit(nagios.UNKNOWN)

    pool_status = get_zpool_zfs_properties(args.pool_name)
    pool_status['capacity'] = percent_to_float(f"{pool_status['capacity']}%")
    pool_status['fragmentation'] = percent_to_float(f"{pool_status['fragmentation']}%")

    exit_code = nagios.OK
    issues = []

    # Check for critical. Each condition is evaluated on its own so that every
    # failing metric is named in the status line, not just the first one.
    if pool_status['capacity'] >= args.critical_free:
        issues.append('capacity')
    if pool_status['fragmentation'] >= args.critical_frag:
        issues.append('fragmentation')
    if pool_status['health'] != 'ONLINE':
        issues.append('health')

    if issues:
        exit_code = nagios.CRITICAL
    else:
        # Check for warnings (only when nothing is critical).
        if pool_status['capacity'] >= args.warning_free:
            issues.append('capacity')
        if pool_status['fragmentation'] >= args.warning_frag:
            issues.append('fragmentation')
        if issues:
            exit_code = nagios.WARNING

    # Print the status
    if exit_code == nagios.CRITICAL:
        print('CRITICAL - pool', args.pool_name, 'is unhealthy:', ', '.join(issues))
    elif exit_code == nagios.WARNING:
        print('WARNING - pool', args.pool_name, 'is unhealthy:', ', '.join(issues))
    elif exit_code == nagios.OK:
        print('OK - pool', args.pool_name, 'is healthy')

    # Build the table. Only the per-device states are needed here; the call
    # also converts each device's cap/frag to floats in place.
    _, _, states = check_vdev_devices(vdev_devices, args.critical_free, args.warning_free,
                                      args.critical_frag, args.warning_frag)
    table_data = [
        ('Device', 'Size', 'Alloc', 'Free', 'Frag', 'Cap', 'Health', 'State'),
        (args.pool_name, filesize(pool_status['size'], spaces=False, formatter=False),
         filesize(pool_status['allocated'], spaces=False, formatter=False),
         filesize(pool_status['free'], spaces=False, formatter=False),
         float_to_percent(pool_status['fragmentation']),
         float_to_percent(pool_status['capacity']),
         pool_status['health'], f"[{state_to_txt(exit_code).upper()}]")
    ]
    for device in vdev_devices:
        if isinstance(device['frag'], float):
            device['frag'] = float_to_percent(device['frag'])
        if isinstance(device['cap'], float):
            device['cap'] = float_to_percent(device['cap'])
        table_data.append((device['device'], device['size'], device['alloc'], device['free'], device['frag'],
                           device['cap'], device['health'], states[device['device']]))

    zpool_size, zpool_free = zfs_get_free(args.pool_name)
    # NOTE(review): 'free' reports zpool_size and 'size' reports zpool_free.
    # Either these two values or the unpacking order above look swapped;
    # verify against checker.zfs.zfs_get_free before changing.
    perf_data = {
        'free': {
            'value': zpool_size, 'warn': int(zpool_size * args.warning_free),
            'crit': int(zpool_size * args.critical_free), 'min': 0, 'unit': 'GB'
        },
        'size': {
            'value': zpool_free, 'warn': int(zpool_size * args.warning_free),
            'crit': int(zpool_size * args.critical_free), 'min': 0, 'unit': 'GB'
        },
        'fragmentation': {
            'value': int(float(float_to_percent(pool_status['fragmentation']).strip('%'))),
            'warn': float_to_percent(args.warning_frag), 'crit': float_to_percent(args.critical_frag),
            'min': 0, 'unit': '%'
        }
    }
    perf_data_str = dict_to_perfdata(perf_data)

    print(list_to_markdown_table(table_data, align='center', seperator='!', borders=False), '|', perf_data_str)
    sys.exit(exit_code)


def _check_section_devices(args):
    """Run the ``cache`` / ``log`` check for the devices of that section.

    Prints a one-line summary and a device table, then exits with the
    resulting Nagios code.
    """
    vdev_devices = get_vdev_info(args.pool_name, args.check_type)
    if not vdev_devices:
        print('UNKNOWN - no devices found')
        sys.exit(nagios.UNKNOWN)

    table_data = [('Device', 'Size', 'Alloc', 'Free', 'Frag', 'Cap', 'Health', 'State')]
    critical, warning, states = check_vdev_devices(vdev_devices, args.critical_free, args.warning_free,
                                                   args.critical_frag, args.warning_frag)
    for device in vdev_devices:
        # check_vdev_devices() replaced every non-'-' value with a float fraction.
        if device['frag'] != '-':
            device['frag'] = float_to_percent(device['frag'])
        if device['cap'] != '-':
            device['cap'] = float_to_percent(device['cap'])
        table_data.append((device['device'], device['size'], device['alloc'], device['free'], device['frag'],
                           device['cap'], device['health'], states[device['device']]))

    exit_code = nagios.OK
    out_str = None
    issues = set()

    if critical['cap'] or critical['frag'] or critical['health']:
        exit_code = nagios.CRITICAL
        info_str = None
        crit_drives = []
        if critical['cap'] and critical['frag']:
            info_str = 'critical capacity and fragmentation'
            crit_drives = [*critical['cap'], *critical['frag']]
            issues.update(('capacity', 'fragmentation'))
        elif critical['cap']:
            info_str = 'critical capacity'
            crit_drives = critical['cap']
            issues.add('capacity')
        elif critical['frag']:
            info_str = 'critical fragmentation'
            crit_drives = critical['frag']
            issues.add('fragmentation')
        if critical['health']:
            # A failed health check takes over the summary text.
            info_str = "shit's fucked"
            crit_drives = crit_drives + critical['health']
            issues.add('health')
        # De-duplicate while keeping first-seen order so the output is stable.
        unique_crit = list(dict.fromkeys(crit_drives))
        out_str = ['CRITICAL', '-', info_str,
                   f'for {"devices" if len(unique_crit) > 1 else "device"} for {args.pool_name}:',
                   ', '.join(unique_crit)]

    # `and` binds tighter than `or`; the parentheses spell out how the
    # original condition was evaluated.
    # NOTE(review): `(cap or frag) and not health` may have been intended.
    if warning['cap'] or (warning['frag'] and not critical['health']):
        if exit_code == nagios.CRITICAL:
            if len(issues) > 1:  # if there's only one issue, don't assume there are also warnings
                out_str[2] = 'multiple issues: ' + ' and '.join(sorted(issues))
                # Drop the "for device(s) ..." fragment and the device list.
                del out_str[3:5]
        else:
            # Warnings only: set the exit code AND build the warning message.
            # (Previously the message branch was unreachable, so the check
            # printed "OK ... healthy" while exiting with WARNING.)
            exit_code = nagios.WARNING
            if warning['cap'] and warning['frag']:
                info_str = 'high capacity and fragmentation'
                warn_drives = [*warning['cap'], *warning['frag']]
            elif warning['cap']:
                info_str = 'high capacity'
                warn_drives = warning['cap']
            else:
                info_str = 'high fragmentation'
                warn_drives = warning['frag']
            unique_warn = list(dict.fromkeys(warn_drives))
            out_str = ['WARNING', '-', info_str,
                       f'for {"devices" if len(unique_warn) > 1 else "device"} for {args.pool_name}:',
                       ', '.join(unique_warn)]

    if out_str is None:
        # Neither a critical nor a warning message was produced.
        out_str = ['OK', '-', f'{len(vdev_devices)} {args.check_type} devices for {args.pool_name} are healthy']

    print(*out_str)
    print(list_to_markdown_table(table_data, align='left', seperator='!', borders=False))
    sys.exit(exit_code)
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: any unexpected exception is reported on stdout and
    # mapped to the UNKNOWN exit code. sys.exit() calls made inside main()
    # raise SystemExit, which `except Exception` does not intercept.
    try:
        main()
    except Exception as err:
        import traceback

        print(f'UNKNOWN: exception "{err}"')
        print(traceback.format_exc())
        sys.exit(nagios.UNKNOWN)
|