add check_zfs_zpool pool status
This commit is contained in:
parent
48da1bb35f
commit
465b9ea3a9
|
@ -4,8 +4,14 @@ import re
|
|||
import subprocess
|
||||
import sys
|
||||
|
||||
import zfslib as zfs
|
||||
|
||||
from checker import nagios
|
||||
from checker.markdown import list_to_markdown_table
|
||||
from checker.units import filesize
|
||||
|
||||
|
||||
# TODO: add perfdata
|
||||
|
||||
|
||||
def parse_size(size_str):
|
||||
|
@ -21,7 +27,7 @@ def percent_to_float(percent_str: str):
|
|||
|
||||
|
||||
def float_to_percent(float_value: float):
    """Format a fraction as a percentage string.

    The value is multiplied by 100, rounded to two decimal places and
    suffixed with ``%``; for example ``0.5`` becomes ``"50.0%"``.

    Args:
        float_value: Fraction to format, where 1.0 corresponds to 100%.

    Returns:
        The rounded percentage followed by a percent sign.
    """
    percent = round(float_value * 100, 2)
    return f"{percent}%"
|
||||
|
||||
|
||||
|
@ -30,10 +36,45 @@ def clean_device_list(in_str: str):
|
|||
|
||||
|
||||
def zpool_list(zpool: str, vdev_type: str, header: bool = False):
    """Return the ``zpool list -v`` rows that follow a matching section line.

    Runs ``zpool list -v <zpool>`` and filters its output with awk: for every
    line matching ``vdev_type``, the following lines are emitted for as long
    as they start with a blank character. Neither command goes through a
    shell, so the pool name and pattern cannot inject shell syntax, and a
    failing ``zpool`` is reported instead of being hidden behind awk's exit
    status.

    Args:
        zpool: Name of the pool passed to ``zpool list -v``.
        vdev_type: Pattern identifying the section line; awk treats it as an
            extended regular expression.
        header: When true, the first line of the listing is emitted as well.

    Returns:
        The filtered listing decoded as UTF-8.

    Exits:
        Prints an ``UNKNOWN`` message and exits with ``nagios.UNKNOWN`` when
        either command fails or cannot be started.
    """
    # The pattern is handed to awk as a variable rather than spliced into the
    # program text, so it cannot alter the awk program itself.
    section_rule = '$0 ~ vdev_re {while(getline && substr($0, 1, 1) ~ /[[:blank:]]/) print}'
    awk_program = f'NR==1 {{print}} {section_rule}' if header else section_rule
    try:
        listing = subprocess.run(['zpool', 'list', '-v', zpool], capture_output=True, check=True)
        filtered = subprocess.run(['awk', '-v', f'vdev_re={vdev_type}', awk_program], input=listing.stdout, capture_output=True, check=True)
        return filtered.stdout.decode('utf-8')
    except subprocess.CalledProcessError as e:
        print('UNKNOWN - failed to check pool:', e.stderr.decode(sys.getfilesystemencoding()))
        sys.exit(nagios.UNKNOWN)
    except FileNotFoundError as e:
        # zpool or awk is not installed / not on PATH.
        print('UNKNOWN - failed to check pool:', e)
        sys.exit(nagios.UNKNOWN)
|
||||
|
||||
|
||||
def check_vdev_devices(vdev_devices: list, critical_free, warning_free, critical_frag, warning_frag):
    """Classify vdev devices by health, capacity and fragmentation.

    Each device is a dict read through its 'device', 'health', 'cap' and
    'frag' keys. A 'cap' or 'frag' value other than '-' is replaced in place
    by the result of ``percent_to_float`` before being compared, so callers
    see the converted values afterwards.

    Args:
        vdev_devices: Device dicts to inspect (mutated as described above).
        critical_free: Capacity level at or above which a device is critical.
        warning_free: Capacity level at or above which a device is a warning.
        critical_frag: Fragmentation level at or above which a device is critical.
        warning_frag: Fragmentation level at or above which a device is a warning.
            All four thresholds are compared directly against the
            ``percent_to_float`` results and must be on the same scale.

    Returns:
        A ``(critical, warning, states)`` tuple: ``critical`` maps 'cap',
        'frag' and 'health' to lists of device names, ``warning`` maps 'cap'
        and 'frag' to lists of device names, and ``states`` maps every device
        name to '[OK]', '[WARNING]' or '[CRITICAL]'.
    """
    critical = {'cap': [], 'frag': [], 'health': []}
    warning = {'cap': [], 'frag': []}
    states = {}
    thresholds = (
        ('cap', critical_free, warning_free),
        ('frag', critical_frag, warning_frag),
    )
    for device in vdev_devices:
        name = device['device']
        is_critical = False
        is_warning = False

        if device['health'] != 'ONLINE':
            critical['health'].append(name)
            is_critical = True

        for metric, critical_level, warning_level in thresholds:
            if device[metric] == '-':
                # No value reported for this metric; nothing to compare.
                continue
            device[metric] = percent_to_float(device[metric])
            if device[metric] >= critical_level:
                critical[metric].append(name)
                is_critical = True
            elif not critical['health'] and device[metric] >= warning_level:
                # Warnings are only recorded while no device seen so far has
                # a health problem.
                warning[metric].append(name)
                is_warning = True

        # Severity only escalates: a warning on one metric must not hide a
        # critical finding on another.
        if is_critical:
            states[name] = '[CRITICAL]'
        elif is_warning:
            states[name] = '[WARNING]'
        else:
            states[name] = '[OK]'
    return critical, warning, states
|
||||
|
||||
|
||||
def get_vdev_info(zpool: str, vdev_type: str):
|
||||
|
@ -53,147 +94,146 @@ def get_vdev_info(zpool: str, vdev_type: str):
|
|||
return zpool_vdev_devices
|
||||
|
||||
|
||||
def get_zfs_pool_status(pool_name):
|
||||
try:
|
||||
result = subprocess.run(['zpool', 'list', '-H', '-o', 'name,size,alloc,free,cap,frag,health', pool_name], capture_output=True, text=True, check=True)
|
||||
pool_info = result.stdout.strip().split('\t')
|
||||
pool_status = {
|
||||
'name': pool_info[0],
|
||||
'size': pool_info[1],
|
||||
'allocated': pool_info[2],
|
||||
'free': pool_info[3],
|
||||
'capacity': pool_info[4],
|
||||
'fragmentation': pool_info[5],
|
||||
'health': pool_info[6]
|
||||
def get_zpool_zfs_properties(pool_name: str):
|
||||
conn = zfs.Connection(host='localhost')
|
||||
poolset = conn.load_poolset()
|
||||
zfs_properties = {
|
||||
'allocated': None,
|
||||
'capacity': None,
|
||||
'fragmentation': None,
|
||||
'free': None,
|
||||
'health': None,
|
||||
'size': None,
|
||||
}
|
||||
|
||||
result = subprocess.run(['zpool', 'status', '-v', pool_name], capture_output=True, text=True, check=True)
|
||||
pool_status_lines = result.stdout.strip().split('\n')
|
||||
for i in range(len(pool_status_lines)):
|
||||
pool_status_lines[i] = re.sub(r'\\t\s*', '', pool_status_lines[i])
|
||||
pool = poolset.get_pool(pool_name)
|
||||
for prop, value in zfs_properties.items():
|
||||
zfs_properties[prop] = pool.pool.get_property(prop)
|
||||
|
||||
print(pool_status_lines)
|
||||
|
||||
log_device_status = None
|
||||
log_device_alloc = None
|
||||
log_device_found = False
|
||||
for line in pool_status_lines:
|
||||
if 'logs' in line:
|
||||
log_device_found = True
|
||||
elif log_device_found:
|
||||
log_device_status = line.strip().split()[-1]
|
||||
log_device_alloc = line.strip().split()[1]
|
||||
break
|
||||
|
||||
pool_status['log_device_status'] = log_device_status
|
||||
pool_status['log_device_alloc'] = log_device_alloc
|
||||
|
||||
return pool_status
|
||||
|
||||
except subprocess.CalledProcessError as e:
|
||||
print(f"Error: {e}")
|
||||
sys.exit(2)
|
||||
return zfs_properties
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description='Check ZFS pool status')
|
||||
parser.add_argument('pool_name', help='Name of the ZFS pool to check')
|
||||
parser.add_argument('check_type', choices=['status', 'cache', 'log'], help='What to check.')
|
||||
parser.add_argument('--pool-name', required=True, help='Name of the ZFS pool to check.')
|
||||
parser.add_argument('--check-type', required=True, choices=['status', 'cache', 'log'], help='What to check.')
|
||||
parser.add_argument('--warning-free', type=int, default=65, help='Warning level for free space percentage (default: 65)')
|
||||
parser.add_argument('--critical-free', type=int, default=80, help='Critical level for free space percentage (default: 80)')
|
||||
parser.add_argument('--warning-frag', type=int, default=50, help='Warning level for fragmentation percentage (default: 50)')
|
||||
parser.add_argument('--critical-frag', type=int, default=75, help='Critical level for fragmentation percentage (default: 75)')
|
||||
args = parser.parse_args()
|
||||
|
||||
args.warning_free = float(f'0.{args.warning_free}')
|
||||
args.critical_free = float(f'0.{args.critical_free}')
|
||||
args.warning_free = percent_to_float(f'{args.warning_free}%')
|
||||
args.critical_free = percent_to_float(f'{args.critical_free}%')
|
||||
args.warning_frag = percent_to_float(f'{args.warning_frag}%')
|
||||
args.critical_frag = percent_to_float(f'{args.critical_frag}%')
|
||||
|
||||
if args.check_type == 'status':
|
||||
pool_status = get_zfs_pool_status(args.pool_name)
|
||||
vdev_devices = [x for x in get_vdev_info(args.pool_name, args.pool_name) if not x['device'].startswith('mirror-')]
|
||||
if not len(vdev_devices):
|
||||
print('UNKNOWN - no devices found')
|
||||
sys.exit(nagios.UNKNOWN)
|
||||
|
||||
print(f"Pool Name: {pool_status['name']}")
|
||||
print(f"Size: {pool_status['size']}")
|
||||
print(f"Allocated: {pool_status['allocated']}")
|
||||
print(f"Free: {pool_status['free']}")
|
||||
print(f"Capacity: {pool_status['capacity']}")
|
||||
print(f"Fragmentation: {pool_status['fragmentation']}")
|
||||
print(f"Health: {pool_status['health']}")
|
||||
pool_status = get_zpool_zfs_properties(args.pool_name)
|
||||
exit_code = nagios.OK
|
||||
issues = []
|
||||
|
||||
if pool_status['log_device_status'] is not None:
|
||||
print(f"Log Device Status: {pool_status['log_device_status']}")
|
||||
print(f"Log Device Allocation: {pool_status['log_device_alloc']}")
|
||||
else:
|
||||
print("No log devices found")
|
||||
pool_status['capacity'] = percent_to_float(f"{pool_status['capacity']}%")
|
||||
pool_status['fragmentation'] = percent_to_float(f"{pool_status['fragmentation']}%")
|
||||
|
||||
size_bytes = parse_size(pool_status['size'])
|
||||
free_bytes = parse_size(pool_status['free'])
|
||||
free_percentage = (free_bytes / size_bytes) * 100
|
||||
fragmentation_percentage = int(pool_status['fragmentation'].rstrip('%'))
|
||||
# Check for critical
|
||||
if pool_status['capacity'] >= args.critical_free:
|
||||
exit_code = nagios.CRITICAL
|
||||
issues.append('capacity')
|
||||
elif pool_status['fragmentation'] >= args.critical_frag:
|
||||
exit_code = nagios.CRITICAL
|
||||
issues.append('fragmentation')
|
||||
elif pool_status['health'] != 'ONLINE':
|
||||
exit_code = nagios.CRITICAL
|
||||
issues.append('health')
|
||||
|
||||
if free_percentage <= args.critical_free or fragmentation_percentage >= args.critical_frag:
|
||||
print("CRITICAL")
|
||||
sys.exit(2)
|
||||
elif free_percentage <= args.warning_free or fragmentation_percentage >= args.warning_frag:
|
||||
print("WARNING")
|
||||
sys.exit(1)
|
||||
else:
|
||||
print("OK")
|
||||
sys.exit(0)
|
||||
elif args.check_type in ['cache', 'log']:
|
||||
vdev_devices = get_vdev_info(args.pool_name, args.check_type)
|
||||
table_data = [('Device', 'Size', 'Alloc', 'Free', 'Frag', 'Cap', 'Health', 'State')]
|
||||
critical = {'cap': [], 'frag': [], 'health': []}
|
||||
warning = {'cap': [], 'frag': []}
|
||||
# Check for warnings
|
||||
if exit_code == nagios.OK:
|
||||
if pool_status['capacity'] >= args.warning_free:
|
||||
exit_code = nagios.WARNING
|
||||
issues.append('capacity')
|
||||
elif pool_status['fragmentation'] >= args.warning_frag:
|
||||
exit_code = nagios.WARNING
|
||||
issues.append('fragmentation')
|
||||
|
||||
# Print the status
|
||||
if exit_code == nagios.CRITICAL:
|
||||
print('CRITICAL - pool', args.pool_name, 'is unhealthy:', ', '.join(issues))
|
||||
elif exit_code == nagios.WARNING:
|
||||
print('WARNING - pool', args.pool_name, 'is unhealthy:', ', '.join(issues))
|
||||
elif exit_code == nagios.OK:
|
||||
print('OK - pool', args.pool_name, 'is healthy')
|
||||
|
||||
# Build the table
|
||||
critical, warning, states = check_vdev_devices(vdev_devices, args.critical_free, args.warning_free, args.critical_frag, args.warning_frag)
|
||||
table_data = [
|
||||
('Device', 'Size', 'Alloc', 'Free', 'Frag', 'Cap', 'Health', 'State'),
|
||||
(args.pool_name, filesize(pool_status['size'], spaces=False, formatter=False), filesize(pool_status['allocated'], spaces=False, formatter=False), filesize(pool_status['free'], spaces=False, formatter=False), float_to_percent(pool_status['fragmentation']),
|
||||
float_to_percent(pool_status['capacity']),
|
||||
pool_status['health'], f"[{('ok' if exit_code == nagios.OK else 'critical').upper()}]")
|
||||
]
|
||||
for device in vdev_devices:
|
||||
device['cap'] = percent_to_float(device['cap'])
|
||||
device['frag'] = percent_to_float(device['frag'])
|
||||
state = 'ok'
|
||||
for device in vdev_devices:
|
||||
if isinstance(device['frag'], float):
|
||||
device['frag'] = float_to_percent(device['frag'])
|
||||
if isinstance(device['cap'], float):
|
||||
device['cap'] = float_to_percent(device['cap'])
|
||||
table_data.append((device['device'], device['size'], device['alloc'], device['free'], device['frag'], device['cap'], device['health'], states[device['device']]))
|
||||
|
||||
if device['cap'] >= args.critical_free:
|
||||
critical['cap'].append(device['device'])
|
||||
state = 'critical'
|
||||
if device['frag'] >= args.critical_frag:
|
||||
critical['frag'].append(device['device'])
|
||||
state = 'critical'
|
||||
if device['health'] != 'ONLINE':
|
||||
critical['health'].append(device['device'])
|
||||
state = 'critical'
|
||||
print(list_to_markdown_table(table_data, align='left', seperator='!', borders=False))
|
||||
sys.exit(exit_code)
|
||||
|
||||
if not len(critical['health']):
|
||||
if device['cap'] >= args.warning_free and device['device'] not in critical['cap']:
|
||||
warning['cap'].append(device['device'])
|
||||
state = 'warning'
|
||||
if device['frag'] >= args.warning_frag and device['device'] not in critical['frag']:
|
||||
warning['frag'].append(device['device'])
|
||||
state = 'warning'
|
||||
elif args.check_type in ['cache', 'log']:
|
||||
|
||||
table_data.append((device['device'], device['size'], device['alloc'], device['free'], float_to_percent(device['frag']), float_to_percent(device['cap']), device['health'], state))
|
||||
vdev_devices = get_vdev_info(args.pool_name, args.check_type)
|
||||
if not len(vdev_devices):
|
||||
print('UNKNOWN - no devices found')
|
||||
sys.exit(nagios.UNKNOWN)
|
||||
table_data = [('Device', 'Size', 'Alloc', 'Free', 'Frag', 'Cap', 'Health', 'State')]
|
||||
critical, warning, states = check_vdev_devices(vdev_devices, args.critical_free, args.warning_free, args.critical_frag, args.warning_frag)
|
||||
|
||||
for device in vdev_devices:
|
||||
table_data.append((device['device'], device['size'], device['alloc'], device['free'], float_to_percent(device['frag']), float_to_percent(device['cap']), device['health'], states[device['device']]))
|
||||
|
||||
exit_code = nagios.OK
|
||||
out_str = None
|
||||
info_str = None
|
||||
crit_drives = []
|
||||
warn_drives = []
|
||||
issues = set()
|
||||
if len(critical['cap']) or len(critical['frag']) or len(critical['health']):
|
||||
exit_code = nagios.CRITICAL
|
||||
if len(critical['cap']) and len(critical['frag']):
|
||||
info_str = 'critical capacity and fragmentation'
|
||||
crit_drives = [*critical['cap'], *critical['frag']]
|
||||
issues.add('capacity')
|
||||
issues.add('fragmentation')
|
||||
elif len(critical['cap']) and not len(critical['frag']):
|
||||
info_str = 'critical capacity'
|
||||
crit_drives = critical['cap']
|
||||
issues.add('capacity')
|
||||
elif not len(critical['cap']) and len(critical['frag']):
|
||||
info_str = 'critical fragmentation'
|
||||
crit_drives = critical['frag']
|
||||
issues.add('fragmentation')
|
||||
if len(critical['health']):
|
||||
info_str = "shit's fucked"
|
||||
crit_drives = crit_drives + critical['health']
|
||||
out_str = ['CRITICAL', '-', info_str, f'for {"drives" if len(crit_drives) > 1 else "drive"}', ', '.join([*set(crit_drives)])]
|
||||
issues.add('health')
|
||||
out_str = ['CRITICAL', '-', info_str, f'for {"devices" if len(crit_drives) > 1 else "devices"} for {args.pool_name}:', ', '.join([*set(crit_drives)])]
|
||||
if len(warning['cap']) or len(warning['frag']) and not len(critical['health']):
|
||||
if exit_code < nagios.WARNING:
|
||||
exit_code = nagios.WARNING
|
||||
elif exit_code == nagios.CRITICAL:
|
||||
out_str[2] = 'multiple issues'
|
||||
if len(issues) > 1: # if there's only one issue, don't assume there are also warnings
|
||||
out_str[2] = 'multiple issues: ' + ' and '.join(issues)
|
||||
del out_str[3]
|
||||
del out_str[3]
|
||||
else:
|
||||
if len(warning['cap']) and len(warning['frag']):
|
||||
info_str = 'critical capacity and fragmentation'
|
||||
|
@ -204,14 +244,13 @@ def main():
|
|||
elif not len(warning['cap']) and len(warning['frag']):
|
||||
info_str = 'critical fragmentation'
|
||||
warn_drives = warning['frag']
|
||||
out_str = ['WARNING', '-', info_str, f'for {"drives" if len(warn_drives) > 1 else "drive"}', ', '.join([*set(warn_drives)])]
|
||||
out_str = ['WARNING', '-', info_str, f'for {"devices" if len(warn_drives) > 1 else "devices"} for {args.pool_name}:', ', '.join([*set(warn_drives)])]
|
||||
|
||||
if not len(warn_drives) and not len(crit_drives):
|
||||
out_str = ['OK', '-', f'{len(vdev_devices)} {args.check_type} devices are healthy']
|
||||
out_str = ['OK', '-', f'{len(vdev_devices)} {args.check_type} devices for {args.pool_name} are healthy']
|
||||
|
||||
print(*out_str)
|
||||
print(list_to_markdown_table(table_data, align='left', seperator='!', borders=False))
|
||||
# print(zpool_list(args.pool_name, args.check_type, True)) # for testing
|
||||
sys.exit(exit_code)
|
||||
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
def list_to_markdown_table(array, align: str = None, seperator: str = '|', borders: bool = True):
|
||||
def list_to_markdown_table(array, align: str = None, seperator: str = '|', borders: bool = True, right_align_first_item: bool = True):
|
||||
"""
|
||||
https://gist.github.com/OsKaR31415/955b166f4a286ed427f667cb21d57bfd
|
||||
Args:
|
||||
|
@ -7,6 +7,7 @@ def list_to_markdown_table(array, align: str = None, seperator: str = '|', borde
|
|||
align: The alignment of the cells : 'left', 'center' or 'right'.
|
||||
seperator:
|
||||
borders:
|
||||
right_align_first_item:
|
||||
"""
|
||||
# make sure every element is a string
|
||||
array = [[str(elt) for elt in line] for line in array]
|
||||
|
@ -14,7 +15,11 @@ def list_to_markdown_table(array, align: str = None, seperator: str = '|', borde
|
|||
widths = [max(len(line[i]) for line in array) for i in range(len(array[0]))]
|
||||
# make every width at least 3 columns, because the separator needs it
|
||||
widths = [max(w, 3) for w in widths]
|
||||
|
||||
# center text according to the widths
|
||||
if right_align_first_item:
|
||||
array = [[elt.ljust(w) if i == 0 else elt.center(w) for i, (elt, w) in enumerate(zip(line, widths))] for line in array]
|
||||
else:
|
||||
array = [[elt.center(w) for elt, w in zip(line, widths)] for line in array]
|
||||
|
||||
# separate the header and the body
|
||||
|
@ -25,6 +30,9 @@ def list_to_markdown_table(array, align: str = None, seperator: str = '|', borde
|
|||
else:
|
||||
edge_seperator = ''
|
||||
|
||||
if right_align_first_item:
|
||||
header = ((edge_seperator + ' ') if borders else '') + array_head[0].ljust(widths[0]) + f' {seperator} ' + f' {seperator} '.join([elt.center(w) for elt, w in zip(array_head[1:], widths[1:])]) + ((' ' + edge_seperator) if borders else '')
|
||||
else:
|
||||
header = ((edge_seperator + ' ') if borders else '') + f' {seperator} '.join(array_head) + ((' ' + edge_seperator) if borders else '')
|
||||
|
||||
# alignment of the cells
|
||||
|
|
|
@ -1,6 +1,8 @@
|
|||
from hurry.filesize import size
|
||||
|
||||
def filesize(bytes: int, spaces: bool = True):
|
||||
|
||||
def filesize(bytes: int, spaces: bool = True, formatter: bool = True):
|
||||
if formatter:
|
||||
system = [
|
||||
(1024 ** 5, ' PB'),
|
||||
(1024 ** 4, ' TB'),
|
||||
|
@ -10,6 +12,8 @@ def filesize(bytes: int, spaces: bool = True):
|
|||
(1024 ** 0, ' B'),
|
||||
]
|
||||
x = size(bytes, system=system)
|
||||
else:
|
||||
x = size(bytes)
|
||||
if spaces:
|
||||
return x
|
||||
else:
|
||||
|
|
|
@ -12,5 +12,6 @@ aiofiles~=0.6.0
|
|||
markdown~=3.4.1
|
||||
psutil~=5.9.4
|
||||
hurry.filesize
|
||||
certifi
|
||||
cloudflarepycli
|
||||
certifi~=2022.12.7
|
||||
cloudflarepycli~=1.7.0
|
||||
zfslib~=0.11.0
|
Loading…
Reference in New Issue