add check_zfs_zpool pool status

This commit is contained in:
Cyberes 2023-05-30 00:46:53 -06:00
parent 48da1bb35f
commit 465b9ea3a9
4 changed files with 170 additions and 118 deletions

View File

@ -4,8 +4,14 @@ import re
import subprocess import subprocess
import sys import sys
import zfslib as zfs
from checker import nagios from checker import nagios
from checker.markdown import list_to_markdown_table from checker.markdown import list_to_markdown_table
from checker.units import filesize
# TODO: add perfdata
def parse_size(size_str): def parse_size(size_str):
@ -21,7 +27,7 @@ def percent_to_float(percent_str: str):
def float_to_percent(float_value: float): def float_to_percent(float_value: float):
percent = float_value * 100 percent = round(float_value * 100, 2)
return f"{percent}%" return f"{percent}%"
@ -30,10 +36,45 @@ def clean_device_list(in_str: str):
def zpool_list(zpool: str, vdev_type: str, header: bool = False): def zpool_list(zpool: str, vdev_type: str, header: bool = False):
try:
if not header: if not header:
return subprocess.check_output(f"zpool list -v {zpool} | awk '/{vdev_type}/ {{while(getline && substr($0, 1, 1) ~ /[[:blank:]]/) print}}'", shell=True).decode('utf-8') return subprocess.check_output(f"zpool list -v {zpool} | awk '/{vdev_type}/ {{while(getline && substr($0, 1, 1) ~ /[[:blank:]]/) print}}'", shell=True, stderr=subprocess.PIPE).decode('utf-8')
else: else:
return subprocess.check_output(f"zpool list -v {zpool} | awk 'NR==1 {{print}} /{vdev_type}/ {{while(getline && substr($0, 1, 1) ~ /[[:blank:]]/) print}}'", shell=True).decode('utf-8') return subprocess.check_output(f"zpool list -v {zpool} | awk 'NR==1 {{print}} /{vdev_type}/ {{while(getline && substr($0, 1, 1) ~ /[[:blank:]]/) print}}'", shell=True, stderr=subprocess.PIPE).decode('utf-8')
except subprocess.CalledProcessError as e:
print('UNKNOWN - failed to check pool:', e.stderr.decode(sys.getfilesystemencoding()))
sys.exit(nagios.UNKNOWN)
def check_vdev_devices(vdev_devices: list, critical_free, warning_free, critical_frag, warning_frag):
    """
    Classify every vdev device by health, capacity and fragmentation.

    The device dicts are updated in place: a 'cap' or 'frag' value other
    than '-' is replaced by the result of `percent_to_float`. Callers read
    those converted values afterwards, so this side effect is part of the
    contract.

    Args:
        vdev_devices: device dicts with the keys 'device', 'health', 'cap' and 'frag'.
        critical_free: capacity at or above which a device is critical.
        warning_free: capacity at or above which a device is a warning.
        critical_frag: fragmentation at or above which a device is critical.
        warning_frag: fragmentation at or above which a device is a warning.
            All thresholds are compared against `percent_to_float` output.

    Returns:
        A `(critical, warning, states)` tuple. `critical` maps 'cap', 'frag'
        and 'health' to lists of device names, `warning` maps 'cap' and
        'frag' to lists of device names, and `states` maps each device name
        to '[OK]', '[WARNING]' or '[CRITICAL]'.
    """
    critical = {'cap': [], 'frag': [], 'health': []}
    warning = {'cap': [], 'frag': []}
    states = {}
    # (device key, critical threshold, warning threshold), checked in this order
    thresholds = (
        ('cap', critical_free, warning_free),
        ('frag', critical_frag, warning_frag),
    )

    for device in vdev_devices:
        name = device['device']
        state = '[OK]'

        if device['health'] != 'ONLINE':
            critical['health'].append(name)
            state = '[CRITICAL]'

        for key, critical_level, warning_level in thresholds:
            # '-' means zpool reported no value for this column
            if device[key] == '-':
                continue
            device[key] = percent_to_float(device[key])
            if device[key] >= critical_level:
                critical[key].append(name)
                state = '[CRITICAL]'
            # NOTE(review): warnings are suppressed once any device seen so far
            # is unhealthy, so the result depends on device order - confirm intent.
            elif not critical['health'] and device[key] >= warning_level:
                warning[key].append(name)
                # never downgrade a device that is already critical
                if state != '[CRITICAL]':
                    state = '[WARNING]'

        states[name] = state

    return critical, warning, states
def get_vdev_info(zpool: str, vdev_type: str): def get_vdev_info(zpool: str, vdev_type: str):
@ -53,147 +94,146 @@ def get_vdev_info(zpool: str, vdev_type: str):
return zpool_vdev_devices return zpool_vdev_devices
def get_zfs_pool_status(pool_name): def get_zpool_zfs_properties(pool_name: str):
try: conn = zfs.Connection(host='localhost')
result = subprocess.run(['zpool', 'list', '-H', '-o', 'name,size,alloc,free,cap,frag,health', pool_name], capture_output=True, text=True, check=True) poolset = conn.load_poolset()
pool_info = result.stdout.strip().split('\t') zfs_properties = {
pool_status = { 'allocated': None,
'name': pool_info[0], 'capacity': None,
'size': pool_info[1], 'fragmentation': None,
'allocated': pool_info[2], 'free': None,
'free': pool_info[3], 'health': None,
'capacity': pool_info[4], 'size': None,
'fragmentation': pool_info[5],
'health': pool_info[6]
} }
result = subprocess.run(['zpool', 'status', '-v', pool_name], capture_output=True, text=True, check=True) pool = poolset.get_pool(pool_name)
pool_status_lines = result.stdout.strip().split('\n') for prop, value in zfs_properties.items():
for i in range(len(pool_status_lines)): zfs_properties[prop] = pool.pool.get_property(prop)
pool_status_lines[i] = re.sub(r'\\t\s*', '', pool_status_lines[i])
print(pool_status_lines) return zfs_properties
log_device_status = None
log_device_alloc = None
log_device_found = False
for line in pool_status_lines:
if 'logs' in line:
log_device_found = True
elif log_device_found:
log_device_status = line.strip().split()[-1]
log_device_alloc = line.strip().split()[1]
break
pool_status['log_device_status'] = log_device_status
pool_status['log_device_alloc'] = log_device_alloc
return pool_status
except subprocess.CalledProcessError as e:
print(f"Error: {e}")
sys.exit(2)
def main(): def main():
parser = argparse.ArgumentParser(description='Check ZFS pool status') parser = argparse.ArgumentParser(description='Check ZFS pool status')
parser.add_argument('pool_name', help='Name of the ZFS pool to check') parser.add_argument('--pool-name', required=True, help='Name of the ZFS pool to check.')
parser.add_argument('check_type', choices=['status', 'cache', 'log'], help='What to check.') parser.add_argument('--check-type', required=True, choices=['status', 'cache', 'log'], help='What to check.')
parser.add_argument('--warning-free', type=int, default=65, help='Warning level for free space percentage (default: 65)') parser.add_argument('--warning-free', type=int, default=65, help='Warning level for free space percentage (default: 65)')
parser.add_argument('--critical-free', type=int, default=80, help='Critical level for free space percentage (default: 80)') parser.add_argument('--critical-free', type=int, default=80, help='Critical level for free space percentage (default: 80)')
parser.add_argument('--warning-frag', type=int, default=50, help='Warning level for fragmentation percentage (default: 50)') parser.add_argument('--warning-frag', type=int, default=50, help='Warning level for fragmentation percentage (default: 50)')
parser.add_argument('--critical-frag', type=int, default=75, help='Critical level for fragmentation percentage (default: 75)') parser.add_argument('--critical-frag', type=int, default=75, help='Critical level for fragmentation percentage (default: 75)')
args = parser.parse_args() args = parser.parse_args()
args.warning_free = float(f'0.{args.warning_free}') args.warning_free = percent_to_float(f'{args.warning_free}%')
args.critical_free = float(f'0.{args.critical_free}') args.critical_free = percent_to_float(f'{args.critical_free}%')
args.warning_frag = percent_to_float(f'{args.warning_frag}%')
args.critical_frag = percent_to_float(f'{args.critical_frag}%')
if args.check_type == 'status': if args.check_type == 'status':
pool_status = get_zfs_pool_status(args.pool_name) vdev_devices = [x for x in get_vdev_info(args.pool_name, args.pool_name) if not x['device'].startswith('mirror-')]
if not len(vdev_devices):
print('UNKNOWN - no devices found')
sys.exit(nagios.UNKNOWN)
print(f"Pool Name: {pool_status['name']}") pool_status = get_zpool_zfs_properties(args.pool_name)
print(f"Size: {pool_status['size']}") exit_code = nagios.OK
print(f"Allocated: {pool_status['allocated']}") issues = []
print(f"Free: {pool_status['free']}")
print(f"Capacity: {pool_status['capacity']}")
print(f"Fragmentation: {pool_status['fragmentation']}")
print(f"Health: {pool_status['health']}")
if pool_status['log_device_status'] is not None: pool_status['capacity'] = percent_to_float(f"{pool_status['capacity']}%")
print(f"Log Device Status: {pool_status['log_device_status']}") pool_status['fragmentation'] = percent_to_float(f"{pool_status['fragmentation']}%")
print(f"Log Device Allocation: {pool_status['log_device_alloc']}")
else:
print("No log devices found")
size_bytes = parse_size(pool_status['size']) # Check for critical
free_bytes = parse_size(pool_status['free']) if pool_status['capacity'] >= args.critical_free:
free_percentage = (free_bytes / size_bytes) * 100 exit_code = nagios.CRITICAL
fragmentation_percentage = int(pool_status['fragmentation'].rstrip('%')) issues.append('capacity')
elif pool_status['fragmentation'] >= args.critical_frag:
exit_code = nagios.CRITICAL
issues.append('fragmentation')
elif pool_status['health'] != 'ONLINE':
exit_code = nagios.CRITICAL
issues.append('health')
if free_percentage <= args.critical_free or fragmentation_percentage >= args.critical_frag: # Check for warnings
print("CRITICAL") if exit_code == nagios.OK:
sys.exit(2) if pool_status['capacity'] >= args.warning_free:
elif free_percentage <= args.warning_free or fragmentation_percentage >= args.warning_frag: exit_code = nagios.WARNING
print("WARNING") issues.append('capacity')
sys.exit(1) elif pool_status['fragmentation'] >= args.warning_frag:
else: exit_code = nagios.WARNING
print("OK") issues.append('fragmentation')
sys.exit(0)
elif args.check_type in ['cache', 'log']: # Print the status
vdev_devices = get_vdev_info(args.pool_name, args.check_type) if exit_code == nagios.CRITICAL:
table_data = [('Device', 'Size', 'Alloc', 'Free', 'Frag', 'Cap', 'Health', 'State')] print('CRITICAL - pool', args.pool_name, 'is unhealthy:', ', '.join(issues))
critical = {'cap': [], 'frag': [], 'health': []} elif exit_code == nagios.WARNING:
warning = {'cap': [], 'frag': []} print('WARNING - pool', args.pool_name, 'is unhealthy:', ', '.join(issues))
elif exit_code == nagios.OK:
print('OK - pool', args.pool_name, 'is healthy')
# Build the table
critical, warning, states = check_vdev_devices(vdev_devices, args.critical_free, args.warning_free, args.critical_frag, args.warning_frag)
table_data = [
('Device', 'Size', 'Alloc', 'Free', 'Frag', 'Cap', 'Health', 'State'),
(args.pool_name, filesize(pool_status['size'], spaces=False, formatter=False), filesize(pool_status['allocated'], spaces=False, formatter=False), filesize(pool_status['free'], spaces=False, formatter=False), float_to_percent(pool_status['fragmentation']),
float_to_percent(pool_status['capacity']),
pool_status['health'], f"[{('ok' if exit_code == nagios.OK else 'critical').upper()}]")
]
for device in vdev_devices: for device in vdev_devices:
device['cap'] = percent_to_float(device['cap']) for device in vdev_devices:
device['frag'] = percent_to_float(device['frag']) if isinstance(device['frag'], float):
state = 'ok' device['frag'] = float_to_percent(device['frag'])
if isinstance(device['cap'], float):
device['cap'] = float_to_percent(device['cap'])
table_data.append((device['device'], device['size'], device['alloc'], device['free'], device['frag'], device['cap'], device['health'], states[device['device']]))
if device['cap'] >= args.critical_free: print(list_to_markdown_table(table_data, align='left', seperator='!', borders=False))
critical['cap'].append(device['device']) sys.exit(exit_code)
state = 'critical'
if device['frag'] >= args.critical_frag:
critical['frag'].append(device['device'])
state = 'critical'
if device['health'] != 'ONLINE':
critical['health'].append(device['device'])
state = 'critical'
if not len(critical['health']): elif args.check_type in ['cache', 'log']:
if device['cap'] >= args.warning_free and device['device'] not in critical['cap']:
warning['cap'].append(device['device'])
state = 'warning'
if device['frag'] >= args.warning_frag and device['device'] not in critical['frag']:
warning['frag'].append(device['device'])
state = 'warning'
table_data.append((device['device'], device['size'], device['alloc'], device['free'], float_to_percent(device['frag']), float_to_percent(device['cap']), device['health'], state)) vdev_devices = get_vdev_info(args.pool_name, args.check_type)
if not len(vdev_devices):
print('UNKNOWN - no devices found')
sys.exit(nagios.UNKNOWN)
table_data = [('Device', 'Size', 'Alloc', 'Free', 'Frag', 'Cap', 'Health', 'State')]
critical, warning, states = check_vdev_devices(vdev_devices, args.critical_free, args.warning_free, args.critical_frag, args.warning_frag)
for device in vdev_devices:
table_data.append((device['device'], device['size'], device['alloc'], device['free'], float_to_percent(device['frag']), float_to_percent(device['cap']), device['health'], states[device['device']]))
exit_code = nagios.OK exit_code = nagios.OK
out_str = None out_str = None
info_str = None info_str = None
crit_drives = [] crit_drives = []
warn_drives = [] warn_drives = []
issues = set()
if len(critical['cap']) or len(critical['frag']) or len(critical['health']): if len(critical['cap']) or len(critical['frag']) or len(critical['health']):
exit_code = nagios.CRITICAL exit_code = nagios.CRITICAL
if len(critical['cap']) and len(critical['frag']): if len(critical['cap']) and len(critical['frag']):
info_str = 'critical capacity and fragmentation' info_str = 'critical capacity and fragmentation'
crit_drives = [*critical['cap'], *critical['frag']] crit_drives = [*critical['cap'], *critical['frag']]
issues.add('capacity')
issues.add('fragmentation')
elif len(critical['cap']) and not len(critical['frag']): elif len(critical['cap']) and not len(critical['frag']):
info_str = 'critical capacity' info_str = 'critical capacity'
crit_drives = critical['cap'] crit_drives = critical['cap']
issues.add('capacity')
elif not len(critical['cap']) and len(critical['frag']): elif not len(critical['cap']) and len(critical['frag']):
info_str = 'critical fragmentation' info_str = 'critical fragmentation'
crit_drives = critical['frag'] crit_drives = critical['frag']
issues.add('fragmentation')
if len(critical['health']): if len(critical['health']):
info_str = "shit's fucked" info_str = "shit's fucked"
crit_drives = crit_drives + critical['health'] crit_drives = crit_drives + critical['health']
out_str = ['CRITICAL', '-', info_str, f'for {"drives" if len(crit_drives) > 1 else "drive"}', ', '.join([*set(crit_drives)])] issues.add('health')
out_str = ['CRITICAL', '-', info_str, f'for {"devices" if len(crit_drives) > 1 else "devices"} for {args.pool_name}:', ', '.join([*set(crit_drives)])]
if len(warning['cap']) or len(warning['frag']) and not len(critical['health']): if len(warning['cap']) or len(warning['frag']) and not len(critical['health']):
if exit_code < nagios.WARNING: if exit_code < nagios.WARNING:
exit_code = nagios.WARNING exit_code = nagios.WARNING
elif exit_code == nagios.CRITICAL: elif exit_code == nagios.CRITICAL:
out_str[2] = 'multiple issues' if len(issues) > 1: # if there's only one issue, don't assume there are also warnings
out_str[2] = 'multiple issues: ' + ' and '.join(issues)
del out_str[3]
del out_str[3]
else: else:
if len(warning['cap']) and len(warning['frag']): if len(warning['cap']) and len(warning['frag']):
info_str = 'critical capacity and fragmentation' info_str = 'critical capacity and fragmentation'
@ -204,14 +244,13 @@ def main():
elif not len(warning['cap']) and len(warning['frag']): elif not len(warning['cap']) and len(warning['frag']):
info_str = 'critical fragmentation' info_str = 'critical fragmentation'
warn_drives = warning['frag'] warn_drives = warning['frag']
out_str = ['WARNING', '-', info_str, f'for {"drives" if len(warn_drives) > 1 else "drive"}', ', '.join([*set(warn_drives)])] out_str = ['WARNING', '-', info_str, f'for {"devices" if len(warn_drives) > 1 else "devices"} for {args.pool_name}:', ', '.join([*set(warn_drives)])]
if not len(warn_drives) and not len(crit_drives): if not len(warn_drives) and not len(crit_drives):
out_str = ['OK', '-', f'{len(vdev_devices)} {args.check_type} devices are healthy'] out_str = ['OK', '-', f'{len(vdev_devices)} {args.check_type} devices for {args.pool_name} are healthy']
print(*out_str) print(*out_str)
print(list_to_markdown_table(table_data, align='left', seperator='!', borders=False)) print(list_to_markdown_table(table_data, align='left', seperator='!', borders=False))
# print(zpool_list(args.pool_name, args.check_type, True)) # for testing
sys.exit(exit_code) sys.exit(exit_code)

View File

@ -1,4 +1,4 @@
def list_to_markdown_table(array, align: str = None, seperator: str = '|', borders: bool = True): def list_to_markdown_table(array, align: str = None, seperator: str = '|', borders: bool = True, right_align_first_item: bool = True):
""" """
https://gist.github.com/OsKaR31415/955b166f4a286ed427f667cb21d57bfd https://gist.github.com/OsKaR31415/955b166f4a286ed427f667cb21d57bfd
Args: Args:
@ -7,6 +7,7 @@ def list_to_markdown_table(array, align: str = None, seperator: str = '|', borde
align: The alignment of the cells : 'left', 'center' or 'right'. align: The alignment of the cells : 'left', 'center' or 'right'.
seperator: seperator:
borders: borders:
right_align_first_item:
""" """
# make sure every elements are strings # make sure every elements are strings
array = [[str(elt) for elt in line] for line in array] array = [[str(elt) for elt in line] for line in array]
@ -14,7 +15,11 @@ def list_to_markdown_table(array, align: str = None, seperator: str = '|', borde
widths = [max(len(line[i]) for line in array) for i in range(len(array[0]))] widths = [max(len(line[i]) for line in array) for i in range(len(array[0]))]
# make every width at least 3 colmuns, because the separator needs it # make every width at least 3 colmuns, because the separator needs it
widths = [max(w, 3) for w in widths] widths = [max(w, 3) for w in widths]
# center text according to the widths # center text according to the widths
if right_align_first_item:
array = [[elt.ljust(w) if i == 0 else elt.center(w) for i, (elt, w) in enumerate(zip(line, widths))] for line in array]
else:
array = [[elt.center(w) for elt, w in zip(line, widths)] for line in array] array = [[elt.center(w) for elt, w in zip(line, widths)] for line in array]
# separate the header and the body # separate the header and the body
@ -25,6 +30,9 @@ def list_to_markdown_table(array, align: str = None, seperator: str = '|', borde
else: else:
edge_seperator = '' edge_seperator = ''
if right_align_first_item:
header = ((edge_seperator + ' ') if borders else '') + array_head[0].ljust(widths[0]) + f' {seperator} ' + f' {seperator} '.join([elt.center(w) for elt, w in zip(array_head[1:], widths[1:])]) + ((' ' + edge_seperator) if borders else '')
else:
header = ((edge_seperator + ' ') if borders else '') + f' {seperator} '.join(array_head) + ((' ' + edge_seperator) if borders else '') header = ((edge_seperator + ' ') if borders else '') + f' {seperator} '.join(array_head) + ((' ' + edge_seperator) if borders else '')
# alignment of the cells # alignment of the cells

View File

@ -1,6 +1,8 @@
from hurry.filesize import size from hurry.filesize import size
def filesize(bytes: int, spaces: bool = True):
def filesize(bytes: int, spaces: bool = True, formatter: bool = True):
if formatter:
system = [ system = [
(1024 ** 5, ' PB'), (1024 ** 5, ' PB'),
(1024 ** 4, ' TB'), (1024 ** 4, ' TB'),
@ -10,6 +12,8 @@ def filesize(bytes: int, spaces: bool = True):
(1024 ** 0, ' B'), (1024 ** 0, ' B'),
] ]
x = size(bytes, system=system) x = size(bytes, system=system)
else:
x = size(bytes)
if spaces: if spaces:
return x return x
else: else:

View File

@ -12,5 +12,6 @@ aiofiles~=0.6.0
markdown~=3.4.1 markdown~=3.4.1
psutil~=5.9.4 psutil~=5.9.4
hurry.filesize hurry.filesize
certifi certifi~=2022.12.7
cloudflarepycli cloudflarepycli~=1.7.0
zfslib~=0.11.0