add check_zfs_zpool pool status

This commit is contained in:
Cyberes 2023-05-30 00:46:53 -06:00
parent 48da1bb35f
commit 465b9ea3a9
4 changed files with 170 additions and 118 deletions

View File

@ -4,8 +4,14 @@ import re
import subprocess
import sys
import zfslib as zfs
from checker import nagios
from checker.markdown import list_to_markdown_table
from checker.units import filesize
# TODO: add perfdata
def parse_size(size_str):
@ -21,7 +27,7 @@ def percent_to_float(percent_str: str):
def float_to_percent(float_value: float) -> str:
    """Format a fraction as a percentage string.

    The value is multiplied by 100, rounded to two decimal places and
    suffixed with ``%`` (for example ``0.1234`` becomes ``"12.34%"``).

    Args:
        float_value: The fraction to format.

    Returns:
        The rounded percentage followed by a percent sign.
    """
    # Rounding keeps float noise such as 7.000000000000001 out of the output.
    percent = round(float_value * 100, 2)
    return f"{percent}%"
@ -30,10 +36,45 @@ def clean_device_list(in_str: str):
def _select_vdev_rows(lines, vdev_type: str, header: bool):
    """Return the indented rows that follow each line matching ``vdev_type``."""
    pattern = re.compile(vdev_type)
    selected = []
    remaining = iter(lines)
    for number, line in enumerate(remaining, start=1):
        if header and number == 1:
            selected.append(line)
        if pattern.search(line):
            # Consume the rows below the match. The first row that does not
            # start with a space or tab ends the section and is discarded
            # without being matched itself, as in the former awk filter.
            for child in remaining:
                if not child.startswith((' ', '\t')):
                    break
                selected.append(child)
    return selected


def zpool_list(zpool: str, vdev_type: str, header: bool = False):
    """Return the rows of one section of ``zpool list -v`` output.

    Runs ``zpool list -v <zpool>`` and keeps every line that follows a line
    matching ``vdev_type``, for as long as those lines start with a space or
    a tab.

    ``zpool`` is invoked directly with an argument list and the filtering is
    done in Python. Piping through awk with ``shell=True`` reported awk's
    exit status, so a failing ``zpool`` was never detected, and the pool name
    was interpolated into a shell command line.

    Args:
        zpool: Name of the pool to list.
        vdev_type: Regular expression marking the section header line. It is
            now interpreted by Python's ``re`` module rather than by awk.
        header: When true, the first output line is returned as well.

    Returns:
        The selected lines, each terminated by a newline, as one string.

    Exits:
        Prints an ``UNKNOWN`` message and exits with ``nagios.UNKNOWN`` when
        ``zpool`` cannot be started or returns a non-zero status.
    """
    try:
        result = subprocess.run(['zpool', 'list', '-v', zpool], stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
    except subprocess.CalledProcessError as e:
        print('UNKNOWN - failed to check pool:', e.stderr.decode(sys.getfilesystemencoding()))
        sys.exit(nagios.UNKNOWN)
    except OSError as e:
        # Raised when the zpool binary is missing or not executable.
        print('UNKNOWN - failed to check pool:', e)
        sys.exit(nagios.UNKNOWN)

    rows = _select_vdev_rows(result.stdout.decode('utf-8').splitlines(), vdev_type, header)
    return ''.join(f'{row}\n' for row in rows)
def check_vdev_devices(vdev_devices: list, critical_free, warning_free, critical_frag, warning_frag):
    """Classify every device by health, capacity and fragmentation.

    Each device is a dict with at least the keys ``device``, ``health``,
    ``cap`` and ``frag``. A ``cap`` or ``frag`` value of ``'-'`` is skipped.
    Any other value is replaced in place by the result of
    ``percent_to_float`` and compared against the thresholds.

    Warnings are only recorded while no device has been found with a health
    other than ``ONLINE``.

    The reported state of a device is the most severe finding for it. A later
    warning no longer overwrites an earlier critical result.

    Args:
        vdev_devices: Device dicts. Their ``cap`` and ``frag`` entries are
            modified in place.
        critical_free: Capacity value at or above which a device is critical.
        warning_free: Capacity value at or above which a device is a warning.
        critical_frag: Fragmentation value at or above which a device is critical.
        warning_frag: Fragmentation value at or above which a device is a warning.

    Returns:
        A ``(critical, warning, states)`` tuple. ``critical`` maps ``cap``,
        ``frag`` and ``health`` to lists of device names, ``warning`` maps
        ``cap`` and ``frag`` to lists of device names, and ``states`` maps
        each device name to ``[OK]``, ``[WARNING]`` or ``[CRITICAL]``.
    """
    critical = {'cap': [], 'frag': [], 'health': []}
    warning = {'cap': [], 'frag': []}
    states = {}
    thresholds = (
        ('cap', critical_free, warning_free),
        ('frag', critical_frag, warning_frag),
    )

    for device in vdev_devices:
        name = device['device']
        is_critical = False
        is_warning = False

        if device['health'] != 'ONLINE':
            critical['health'].append(name)
            is_critical = True

        for key, critical_limit, warning_limit in thresholds:
            if device[key] == '-':
                continue
            device[key] = percent_to_float(device[key])
            if device[key] >= critical_limit:
                critical[key].append(name)
                is_critical = True
            elif not critical['health'] and device[key] >= warning_limit:
                warning[key].append(name)
                is_warning = True

        # Severity only escalates: critical beats warning beats ok.
        if is_critical:
            states[name] = '[CRITICAL]'
        elif is_warning:
            states[name] = '[WARNING]'
        else:
            states[name] = '[OK]'

    return critical, warning, states
def get_vdev_info(zpool: str, vdev_type: str):
@ -53,147 +94,146 @@ def get_vdev_info(zpool: str, vdev_type: str):
return zpool_vdev_devices
def get_zfs_pool_status(pool_name):
    """Read pool figures and the first log-device row through the ``zpool`` CLI.

    Runs ``zpool list -H -o name,size,alloc,free,cap,frag,health`` for the
    pool, then scans ``zpool status -v`` for the line after the first one
    containing ``logs``.

    Args:
        pool_name: Name of the pool to query.

    Returns:
        A dict with the keys ``name``, ``size``, ``allocated``, ``free``,
        ``capacity``, ``fragmentation``, ``health``, ``log_device_status``
        and ``log_device_alloc``. The last two are ``None`` when no line
        follows a ``logs`` line.

    Exits:
        Prints the error and exits with status 2 when either ``zpool``
        command returns a non-zero status.
    """
    try:
        result = subprocess.run(['zpool', 'list', '-H', '-o', 'name,size,alloc,free,cap,frag,health', pool_name], capture_output=True, text=True, check=True)
        pool_info = result.stdout.strip().split('\t')
        pool_status = {
            'name': pool_info[0],
            'size': pool_info[1],
            'allocated': pool_info[2],
            'free': pool_info[3],
            'capacity': pool_info[4],
            'fragmentation': pool_info[5],
            'health': pool_info[6]
        }

        result = subprocess.run(['zpool', 'status', '-v', pool_name], capture_output=True, text=True, check=True)
        # The pattern matches a literal backslash followed by "t" (plus any
        # trailing whitespace), not a tab character.
        pool_status_lines = [re.sub(r'\\t\s*', '', line) for line in result.stdout.strip().split('\n')]

        # NOTE(review): looks like leftover debug output; confirm before removing.
        print(pool_status_lines)

        log_device_status = None
        log_device_alloc = None
        log_device_found = False
        for line in pool_status_lines:
            if 'logs' in line:
                log_device_found = True
            elif log_device_found:
                # First line after the "logs" marker: last and second column.
                log_device_status = line.strip().split()[-1]
                log_device_alloc = line.strip().split()[1]
                break

        pool_status['log_device_status'] = log_device_status
        pool_status['log_device_alloc'] = log_device_alloc

        return pool_status
    except subprocess.CalledProcessError as e:
        print(f"Error: {e}")
        sys.exit(2)


def get_zpool_zfs_properties(pool_name: str):
    """Fetch a fixed set of pool properties through ``zfslib``.

    Opens a ``zfs.Connection`` to ``localhost``, loads the pool set, looks up
    ``pool_name`` and reads each property with ``pool.pool.get_property``.

    Args:
        pool_name: Name of the pool to query.

    Returns:
        A dict with the keys ``allocated``, ``capacity``, ``fragmentation``,
        ``free``, ``health`` and ``size``, in that order, each mapped to the
        value returned by ``get_property``.
    """
    property_names = ('allocated', 'capacity', 'fragmentation', 'free', 'health', 'size')

    conn = zfs.Connection(host='localhost')
    poolset = conn.load_poolset()
    pool = poolset.get_pool(pool_name)

    return {prop: pool.pool.get_property(prop) for prop in property_names}
def main():
parser = argparse.ArgumentParser(description='Check ZFS pool status')
parser.add_argument('pool_name', help='Name of the ZFS pool to check')
parser.add_argument('check_type', choices=['status', 'cache', 'log'], help='What to check.')
parser.add_argument('--pool-name', required=True, help='Name of the ZFS pool to check.')
parser.add_argument('--check-type', required=True, choices=['status', 'cache', 'log'], help='What to check.')
parser.add_argument('--warning-free', type=int, default=65, help='Warning level for free space percentage (default: 65)')
parser.add_argument('--critical-free', type=int, default=80, help='Critical level for free space percentage (default: 80)')
parser.add_argument('--warning-frag', type=int, default=50, help='Warning level for fragmentation percentage (default: 50)')
parser.add_argument('--critical-frag', type=int, default=75, help='Critical level for fragmentation percentage (default: 75)')
args = parser.parse_args()
args.warning_free = float(f'0.{args.warning_free}')
args.critical_free = float(f'0.{args.critical_free}')
args.warning_free = percent_to_float(f'{args.warning_free}%')
args.critical_free = percent_to_float(f'{args.critical_free}%')
args.warning_frag = percent_to_float(f'{args.warning_frag}%')
args.critical_frag = percent_to_float(f'{args.critical_frag}%')
if args.check_type == 'status':
pool_status = get_zfs_pool_status(args.pool_name)
vdev_devices = [x for x in get_vdev_info(args.pool_name, args.pool_name) if not x['device'].startswith('mirror-')]
if not len(vdev_devices):
print('UNKNOWN - no devices found')
sys.exit(nagios.UNKNOWN)
print(f"Pool Name: {pool_status['name']}")
print(f"Size: {pool_status['size']}")
print(f"Allocated: {pool_status['allocated']}")
print(f"Free: {pool_status['free']}")
print(f"Capacity: {pool_status['capacity']}")
print(f"Fragmentation: {pool_status['fragmentation']}")
print(f"Health: {pool_status['health']}")
pool_status = get_zpool_zfs_properties(args.pool_name)
exit_code = nagios.OK
issues = []
if pool_status['log_device_status'] is not None:
print(f"Log Device Status: {pool_status['log_device_status']}")
print(f"Log Device Allocation: {pool_status['log_device_alloc']}")
else:
print("No log devices found")
pool_status['capacity'] = percent_to_float(f"{pool_status['capacity']}%")
pool_status['fragmentation'] = percent_to_float(f"{pool_status['fragmentation']}%")
size_bytes = parse_size(pool_status['size'])
free_bytes = parse_size(pool_status['free'])
free_percentage = (free_bytes / size_bytes) * 100
fragmentation_percentage = int(pool_status['fragmentation'].rstrip('%'))
# Check for critical
if pool_status['capacity'] >= args.critical_free:
exit_code = nagios.CRITICAL
issues.append('capacity')
elif pool_status['fragmentation'] >= args.critical_frag:
exit_code = nagios.CRITICAL
issues.append('fragmentation')
elif pool_status['health'] != 'ONLINE':
exit_code = nagios.CRITICAL
issues.append('health')
if free_percentage <= args.critical_free or fragmentation_percentage >= args.critical_frag:
print("CRITICAL")
sys.exit(2)
elif free_percentage <= args.warning_free or fragmentation_percentage >= args.warning_frag:
print("WARNING")
sys.exit(1)
else:
print("OK")
sys.exit(0)
elif args.check_type in ['cache', 'log']:
vdev_devices = get_vdev_info(args.pool_name, args.check_type)
table_data = [('Device', 'Size', 'Alloc', 'Free', 'Frag', 'Cap', 'Health', 'State')]
critical = {'cap': [], 'frag': [], 'health': []}
warning = {'cap': [], 'frag': []}
# Check for warnings
if exit_code == nagios.OK:
if pool_status['capacity'] >= args.warning_free:
exit_code = nagios.WARNING
issues.append('capacity')
elif pool_status['fragmentation'] >= args.warning_frag:
exit_code = nagios.WARNING
issues.append('fragmentation')
# Print the status
if exit_code == nagios.CRITICAL:
print('CRITICAL - pool', args.pool_name, 'is unhealthy:', ', '.join(issues))
elif exit_code == nagios.WARNING:
print('WARNING - pool', args.pool_name, 'is unhealthy:', ', '.join(issues))
elif exit_code == nagios.OK:
print('OK - pool', args.pool_name, 'is healthy')
# Build the table
critical, warning, states = check_vdev_devices(vdev_devices, args.critical_free, args.warning_free, args.critical_frag, args.warning_frag)
table_data = [
('Device', 'Size', 'Alloc', 'Free', 'Frag', 'Cap', 'Health', 'State'),
(args.pool_name, filesize(pool_status['size'], spaces=False, formatter=False), filesize(pool_status['allocated'], spaces=False, formatter=False), filesize(pool_status['free'], spaces=False, formatter=False), float_to_percent(pool_status['fragmentation']),
float_to_percent(pool_status['capacity']),
pool_status['health'], f"[{('ok' if exit_code == nagios.OK else 'critical').upper()}]")
]
for device in vdev_devices:
device['cap'] = percent_to_float(device['cap'])
device['frag'] = percent_to_float(device['frag'])
state = 'ok'
for device in vdev_devices:
if isinstance(device['frag'], float):
device['frag'] = float_to_percent(device['frag'])
if isinstance(device['cap'], float):
device['cap'] = float_to_percent(device['cap'])
table_data.append((device['device'], device['size'], device['alloc'], device['free'], device['frag'], device['cap'], device['health'], states[device['device']]))
if device['cap'] >= args.critical_free:
critical['cap'].append(device['device'])
state = 'critical'
if device['frag'] >= args.critical_frag:
critical['frag'].append(device['device'])
state = 'critical'
if device['health'] != 'ONLINE':
critical['health'].append(device['device'])
state = 'critical'
print(list_to_markdown_table(table_data, align='left', seperator='!', borders=False))
sys.exit(exit_code)
if not len(critical['health']):
if device['cap'] >= args.warning_free and device['device'] not in critical['cap']:
warning['cap'].append(device['device'])
state = 'warning'
if device['frag'] >= args.warning_frag and device['device'] not in critical['frag']:
warning['frag'].append(device['device'])
state = 'warning'
elif args.check_type in ['cache', 'log']:
table_data.append((device['device'], device['size'], device['alloc'], device['free'], float_to_percent(device['frag']), float_to_percent(device['cap']), device['health'], state))
vdev_devices = get_vdev_info(args.pool_name, args.check_type)
if not len(vdev_devices):
print('UNKNOWN - no devices found')
sys.exit(nagios.UNKNOWN)
table_data = [('Device', 'Size', 'Alloc', 'Free', 'Frag', 'Cap', 'Health', 'State')]
critical, warning, states = check_vdev_devices(vdev_devices, args.critical_free, args.warning_free, args.critical_frag, args.warning_frag)
for device in vdev_devices:
table_data.append((device['device'], device['size'], device['alloc'], device['free'], float_to_percent(device['frag']), float_to_percent(device['cap']), device['health'], states[device['device']]))
exit_code = nagios.OK
out_str = None
info_str = None
crit_drives = []
warn_drives = []
issues = set()
if len(critical['cap']) or len(critical['frag']) or len(critical['health']):
exit_code = nagios.CRITICAL
if len(critical['cap']) and len(critical['frag']):
info_str = 'critical capacity and fragmentation'
crit_drives = [*critical['cap'], *critical['frag']]
issues.add('capacity')
issues.add('fragmentation')
elif len(critical['cap']) and not len(critical['frag']):
info_str = 'critical capacity'
crit_drives = critical['cap']
issues.add('capacity')
elif not len(critical['cap']) and len(critical['frag']):
info_str = 'critical fragmentation'
crit_drives = critical['frag']
issues.add('fragmentation')
if len(critical['health']):
info_str = "shit's fucked"
crit_drives = crit_drives + critical['health']
out_str = ['CRITICAL', '-', info_str, f'for {"drives" if len(crit_drives) > 1 else "drive"}', ', '.join([*set(crit_drives)])]
issues.add('health')
out_str = ['CRITICAL', '-', info_str, f'for {"devices" if len(crit_drives) > 1 else "devices"} for {args.pool_name}:', ', '.join([*set(crit_drives)])]
if len(warning['cap']) or len(warning['frag']) and not len(critical['health']):
if exit_code < nagios.WARNING:
exit_code = nagios.WARNING
elif exit_code == nagios.CRITICAL:
out_str[2] = 'multiple issues'
if len(issues) > 1: # if there's only one issue, don't assume there are also warnings
out_str[2] = 'multiple issues: ' + ' and '.join(issues)
del out_str[3]
del out_str[3]
else:
if len(warning['cap']) and len(warning['frag']):
info_str = 'critical capacity and fragmentation'
@ -204,14 +244,13 @@ def main():
elif not len(warning['cap']) and len(warning['frag']):
info_str = 'critical fragmentation'
warn_drives = warning['frag']
out_str = ['WARNING', '-', info_str, f'for {"drives" if len(warn_drives) > 1 else "drive"}', ', '.join([*set(warn_drives)])]
out_str = ['WARNING', '-', info_str, f'for {"devices" if len(warn_drives) > 1 else "devices"} for {args.pool_name}:', ', '.join([*set(warn_drives)])]
if not len(warn_drives) and not len(crit_drives):
out_str = ['OK', '-', f'{len(vdev_devices)} {args.check_type} devices are healthy']
out_str = ['OK', '-', f'{len(vdev_devices)} {args.check_type} devices for {args.pool_name} are healthy']
print(*out_str)
print(list_to_markdown_table(table_data, align='left', seperator='!', borders=False))
# print(zpool_list(args.pool_name, args.check_type, True)) # for testing
sys.exit(exit_code)

View File

@ -1,4 +1,4 @@
def list_to_markdown_table(array, align: str = None, seperator: str = '|', borders: bool = True):
def list_to_markdown_table(array, align: str = None, seperator: str = '|', borders: bool = True, right_align_first_item: bool = True):
"""
https://gist.github.com/OsKaR31415/955b166f4a286ed427f667cb21d57bfd
Args:
@ -7,6 +7,7 @@ def list_to_markdown_table(array, align: str = None, seperator: str = '|', borde
align: The alignment of the cells : 'left', 'center' or 'right'.
seperator:
borders:
right_align_first_item:
"""
# make sure every elements are strings
array = [[str(elt) for elt in line] for line in array]
@ -14,8 +15,12 @@ def list_to_markdown_table(array, align: str = None, seperator: str = '|', borde
widths = [max(len(line[i]) for line in array) for i in range(len(array[0]))]
# make every width at least 3 colmuns, because the separator needs it
widths = [max(w, 3) for w in widths]
# center text according to the widths
array = [[elt.center(w) for elt, w in zip(line, widths)] for line in array]
if right_align_first_item:
array = [[elt.ljust(w) if i == 0 else elt.center(w) for i, (elt, w) in enumerate(zip(line, widths))] for line in array]
else:
array = [[elt.center(w) for elt, w in zip(line, widths)] for line in array]
# separate the header and the body
array_head, *array_body = array
@ -25,7 +30,10 @@ def list_to_markdown_table(array, align: str = None, seperator: str = '|', borde
else:
edge_seperator = ''
header = ((edge_seperator + ' ') if borders else '') + f' {seperator} '.join(array_head) + ((' ' + edge_seperator) if borders else '')
if right_align_first_item:
header = ((edge_seperator + ' ') if borders else '') + array_head[0].ljust(widths[0]) + f' {seperator} ' + f' {seperator} '.join([elt.center(w) for elt, w in zip(array_head[1:], widths[1:])]) + ((' ' + edge_seperator) if borders else '')
else:
header = ((edge_seperator + ' ') if borders else '') + f' {seperator} '.join(array_head) + ((' ' + edge_seperator) if borders else '')
# alignment of the cells
align = str(align).lower() # make sure `align` is a lowercase string

View File

@ -1,15 +1,19 @@
from hurry.filesize import size
def filesize(bytes: int, spaces: bool = True):
system = [
(1024 ** 5, ' PB'),
(1024 ** 4, ' TB'),
(1024 ** 3, ' GB'),
(1024 ** 2, ' MB'),
(1024 ** 1, ' KB'),
(1024 ** 0, ' B'),
]
x = size(bytes, system=system)
def filesize(bytes: int, spaces: bool = True, formatter: bool = True):
if formatter:
system = [
(1024 ** 5, ' PB'),
(1024 ** 4, ' TB'),
(1024 ** 3, ' GB'),
(1024 ** 2, ' MB'),
(1024 ** 1, ' KB'),
(1024 ** 0, ' B'),
]
x = size(bytes, system=system)
else:
x = size(bytes)
if spaces:
return x
else:

View File

@ -12,5 +12,6 @@ aiofiles~=0.6.0
markdown~=3.4.1
psutil~=5.9.4
hurry.filesize
certifi
cloudflarepycli
certifi~=2022.12.7
cloudflarepycli~=1.7.0
zfslib~=0.11.0