icinga2-checks/check_zfs_zpool.py

227 lines
9.5 KiB
Python
Raw Normal View History

2023-05-29 23:01:58 -06:00
#!/usr/bin/env python3
import argparse
import re
import subprocess
import sys
from checker import nagios
from checker.markdown import list_to_markdown_table
def parse_size(size_str):
    """Convert a human-readable size such as ``"1.5T"`` into a number of KiB.

    The unit letter is case-insensitive. ``K`` maps to 1, and every further
    letter (``M``, ``G``, ``T``, ``P``, ``E``) multiplies by another 1024.
    A bare number, or a number followed by ``B``, is treated as a byte count
    and scaled down to KiB so every result is in the same unit.

    Args:
        size_str: Size text, e.g. ``"928G"``, ``"96K"``, ``"512"`` or ``"0"``.

    Returns:
        The size as a float, expressed in KiB.

    Raises:
        ValueError: If the text is empty or its numeric part cannot be parsed.
    """
    size_map = {
        'b': 1 / 1024,
        'k': 1,
        'm': 1024,
        'g': 1024 ** 2,
        't': 1024 ** 3,
        'p': 1024 ** 4,
        'e': 1024 ** 5,
    }
    text = size_str.strip().lower()
    if not text:
        raise ValueError('empty size string')

    suffix = text[-1]
    if suffix in size_map:
        number, factor = text[:-1], size_map[suffix]
    else:
        # No unit letter at all: the value is a plain byte count.
        number, factor = text, size_map['b']

    try:
        return float(number) * factor
    except ValueError:
        raise ValueError(f'cannot parse size {size_str!r}') from None
def percent_to_float(percent_str: str):
    """Turn a percentage string such as ``"42%"`` into the fraction ``0.42``.

    Any ``%`` characters at either end of the string are removed before the
    remaining text is parsed with ``float``.
    """
    return float(percent_str.strip('%')) / 100
def float_to_percent(float_value: float):
    """Format a fraction (``0.5``) as a percentage string (``"50.0%"``).

    The scaled value is rounded to two decimals so binary floating-point
    noise does not leak into the output; without it ``0.07`` would render
    as ``"7.000000000000001%"``.
    """
    percent = round(float_value * 100, 2)
    return f"{percent}%"
def clean_device_list(in_str: str):
    """Normalise the whitespace of one line of text.

    Leading whitespace is dropped and every remaining run of whitespace is
    collapsed into a single space. Trailing whitespace is therefore reduced
    to one space rather than removed.
    """
    without_indent = in_str.lstrip()
    return re.sub(r'\s+', ' ', without_indent)
def zpool_list(zpool: str, vdev_type: str, header: bool = False):
    """Return the indented rows that follow a matching line of ``zpool list -v``.

    ``zpool list -v <zpool>`` is executed directly, without a shell, so the
    pool name can never be interpreted as shell syntax. Its output is then
    filtered in Python: whenever a row matches ``vdev_type`` (used as a
    regular expression), the rows after it are collected for as long as
    they start with a space or tab. The first row that does not start with
    a blank ends that run and is itself discarded.

    Args:
        zpool: Name of the pool passed to ``zpool list -v``.
        vdev_type: Regular expression identifying the section line,
            e.g. ``"log"`` or ``"cache"``.
        header: When true, the first row of the listing is emitted first.

    Returns:
        The selected rows, each terminated by a newline, as one string.

    Raises:
        subprocess.CalledProcessError: If ``zpool`` exits with a non-zero
            status. The previous shell pipeline only reported the exit
            status of its last stage, so such failures went unnoticed.
    """
    listing = subprocess.check_output(['zpool', 'list', '-v', zpool]).decode('utf-8')
    rows = listing.split('\n')
    if rows and rows[-1] == '':
        # A trailing newline does not constitute an additional row.
        rows.pop()

    section = re.compile(vdev_type)
    selected = []
    if header and rows:
        selected.append(rows[0])

    # One shared iterator: the inner loop consumes the child rows, and the
    # row that terminates it, so the outer loop resumes right after them.
    remaining = iter(rows)
    for row in remaining:
        if not section.search(row):
            continue
        for child in remaining:
            if not child.startswith((' ', '\t')):
                break
            selected.append(child)

    return ''.join(f'{row}\n' for row in selected)
def get_vdev_info(zpool: str, vdev_type: str):
    """Collect per-device fields from the rows returned by ``zpool_list``.

    Every non-empty row is whitespace-normalised with ``clean_device_list``
    and split into tokens; selected token positions are then stored under
    descriptive keys.

    Args:
        zpool: Pool name forwarded to ``zpool_list``.
        vdev_type: Section selector forwarded to ``zpool_list``.

    Returns:
        A list with one dict per row, holding the keys ``device``, ``size``,
        ``alloc``, ``free``, ``frag``, ``cap`` and ``health`` (all strings).

    Raises:
        IndexError: If a row has fewer than ten tokens.
    """
    # Token position of each field within a split row.
    # NOTE(review): presumably these match the column order of
    # `zpool list -v` on the target system - confirm.
    field_positions = (
        ('device', 0),
        ('size', 1),
        ('alloc', 2),
        ('free', 3),
        ('frag', 6),
        ('cap', 7),
        ('health', 9),
    )
    devices = []
    for row in zpool_list(zpool, vdev_type).split('\n'):
        if not row:
            continue
        tokens = [tok for tok in clean_device_list(row).split(' ') if tok]
        devices.append({key: tokens[pos] for key, pos in field_positions})
    return devices
def get_zfs_pool_status(pool_name):
    """Query ``zpool`` for the summary figures of one pool.

    Two commands are run: ``zpool list -H -o ...`` for the tab-separated
    summary fields and ``zpool status -v`` to look for a log device. If
    either command exits non-zero, an error line is printed and the
    process exits with status 2.

    Args:
        pool_name: Name of the pool to query.

    Returns:
        A dict with the keys ``name``, ``size``, ``allocated``, ``free``,
        ``capacity``, ``fragmentation``, ``health``, ``log_device_status``
        and ``log_device_alloc``. The last two are ``None`` when no usable
        line follows a ``logs`` line in the status output.

    Raises:
        IndexError: If the summary line has fewer than seven fields.
    """
    try:
        summary = subprocess.run(
            ['zpool', 'list', '-H', '-o', 'name,size,alloc,free,cap,frag,health', pool_name],
            capture_output=True, text=True, check=True,
        )
        detail = subprocess.run(
            ['zpool', 'status', '-v', pool_name],
            capture_output=True, text=True, check=True,
        )
    except subprocess.CalledProcessError as e:
        print(f"Error: {e}")
        sys.exit(2)

    pool_info = summary.stdout.strip().split('\t')
    pool_status = {
        'name': pool_info[0],
        'size': pool_info[1],
        'allocated': pool_info[2],
        'free': pool_info[3],
        'capacity': pool_info[4],
        'fragmentation': pool_info[5],
        'health': pool_info[6],
    }

    # Strip the indentation of every status line. The earlier pattern
    # matched a literal backslash followed by "t", not a tab, so it never
    # removed anything.
    status_lines = [line.strip() for line in detail.stdout.strip().split('\n')]

    log_device_status = None
    log_device_alloc = None
    log_section_seen = False
    for line in status_lines:
        if 'logs' in line:
            log_section_seen = True
        elif log_section_seen:
            fields = line.split()
            # A blank or one-token line carries no device data; treat it as
            # "no log device" instead of failing with IndexError.
            if len(fields) >= 2:
                # NOTE(review): the last token is stored as the "status" and
                # the second token as the "alloc". If the row layout is
                # NAME STATE READ WRITE CKSUM these labels look mismatched -
                # confirm against real `zpool status` output before relying
                # on them.
                log_device_status = fields[-1]
                log_device_alloc = fields[1]
            break

    pool_status['log_device_status'] = log_device_status
    pool_status['log_device_alloc'] = log_device_alloc
    return pool_status
def _check_pool_status(args, warning_cap, critical_cap, warning_frag, critical_frag):
    """Print the pool summary, then a status word, and exit with the matching code."""
    pool_status = get_zfs_pool_status(args.pool_name)
    print(f"Pool Name: {pool_status['name']}")
    print(f"Size: {pool_status['size']}")
    print(f"Allocated: {pool_status['allocated']}")
    print(f"Free: {pool_status['free']}")
    print(f"Capacity: {pool_status['capacity']}")
    print(f"Fragmentation: {pool_status['fragmentation']}")
    print(f"Health: {pool_status['health']}")
    if pool_status['log_device_status'] is not None:
        print(f"Log Device Status: {pool_status['log_device_status']}")
        print(f"Log Device Allocation: {pool_status['log_device_alloc']}")
    else:
        print("No log devices found")

    size = parse_size(pool_status['size'])
    free = parse_size(pool_status['free'])
    # The thresholds describe how full the pool may get (warning < critical),
    # so compare against the used share rather than the free share.
    used_fraction = 1 - (free / size)
    frag_fraction = percent_to_float(pool_status['fragmentation'])

    if used_fraction >= critical_cap or frag_fraction >= critical_frag:
        print("CRITICAL")
        sys.exit(2)
    elif used_fraction >= warning_cap or frag_fraction >= warning_frag:
        print("WARNING")
        sys.exit(1)
    else:
        print("OK")
        sys.exit(0)


def _check_vdev_devices(args, warning_cap, critical_cap, warning_frag, critical_frag):
    """Evaluate every cache/log device, print a summary line and table, and exit."""
    vdev_devices = get_vdev_info(args.pool_name, args.check_type)
    table_data = [('Device', 'Size', 'Alloc', 'Free', 'Frag', 'Cap', 'Health', 'State')]
    critical = {'cap': [], 'frag': [], 'health': []}
    warning = {'cap': [], 'frag': []}

    for device in vdev_devices:
        device['cap'] = percent_to_float(device['cap'])
        device['frag'] = percent_to_float(device['frag'])
        name = device['device']
        state = 'ok'
        if device['cap'] >= critical_cap:
            critical['cap'].append(name)
            state = 'critical'
        if device['frag'] >= critical_frag:
            critical['frag'].append(name)
            state = 'critical'
        if device['health'] != 'ONLINE':
            critical['health'].append(name)
            state = 'critical'
        # Warnings are only collected while no device has a health problem.
        if not critical['health']:
            if device['cap'] >= warning_cap and name not in critical['cap']:
                warning['cap'].append(name)
                # Never downgrade a row that is already critical.
                if state == 'ok':
                    state = 'warning'
            if device['frag'] >= warning_frag and name not in critical['frag']:
                warning['frag'].append(name)
                if state == 'ok':
                    state = 'warning'
        table_data.append((
            name, device['size'], device['alloc'], device['free'],
            float_to_percent(device['frag']), float_to_percent(device['cap']),
            device['health'], state,
        ))

    has_critical = bool(critical['cap'] or critical['frag'] or critical['health'])
    # Parenthesised on purpose: a health problem suppresses *all* warnings.
    has_warning = bool(warning['cap'] or warning['frag']) and not critical['health']

    exit_code = nagios.OK
    out_str = ['OK', '-', f'{len(vdev_devices)} {args.check_type} devices are healthy']

    if has_critical:
        exit_code = nagios.CRITICAL
        if critical['health']:
            info_str = 'unhealthy state'
        elif critical['cap'] and critical['frag']:
            info_str = 'critical capacity and fragmentation'
        elif critical['cap']:
            info_str = 'critical capacity'
        else:
            info_str = 'critical fragmentation'
        if has_warning:
            info_str = 'multiple issues'
        # dict.fromkeys de-duplicates while keeping a stable order.
        crit_drives = list(dict.fromkeys(
            [*critical['cap'], *critical['frag'], *critical['health']]
        ))
        noun = 'drives' if len(crit_drives) > 1 else 'drive'
        out_str = ['CRITICAL', '-', info_str, f'for {noun}', ', '.join(crit_drives)]
    elif has_warning:
        exit_code = nagios.WARNING
        if warning['cap'] and warning['frag']:
            info_str = 'high capacity and fragmentation'
        elif warning['cap']:
            info_str = 'high capacity'
        else:
            info_str = 'high fragmentation'
        warn_drives = list(dict.fromkeys([*warning['cap'], *warning['frag']]))
        noun = 'drives' if len(warn_drives) > 1 else 'drive'
        out_str = ['WARNING', '-', info_str, f'for {noun}', ', '.join(warn_drives)]

    print(*out_str)
    print(list_to_markdown_table(table_data, align='left', seperator='!', borders=False))
    sys.exit(exit_code)


def main():
    """Parse the command line, run the requested check and exit with its status.

    Positional arguments are the pool name and the check type (``status``,
    ``cache`` or ``log``). The four threshold options are whole percentages;
    they are converted to fractions once and handed to the check helpers.
    Every check terminates the process through ``sys.exit``.
    """
    parser = argparse.ArgumentParser(description='Check ZFS pool status')
    parser.add_argument('pool_name', help='Name of the ZFS pool to check')
    parser.add_argument('check_type', choices=['status', 'cache', 'log'], help='What to check.')
    parser.add_argument('--warning-free', type=int, default=65, help='Warning level for used capacity percentage (default: 65)')
    parser.add_argument('--critical-free', type=int, default=80, help='Critical level for used capacity percentage (default: 80)')
    parser.add_argument('--warning-frag', type=int, default=50, help='Warning level for fragmentation percentage (default: 50)')
    parser.add_argument('--critical-frag', type=int, default=75, help='Critical level for fragmentation percentage (default: 75)')
    args = parser.parse_args()

    # Divide instead of pasting the digits behind "0.": the latter turned
    # 5 into 0.5 and 100 into 0.1. The fragmentation limits are scaled too,
    # so they are comparable with the fractions the checks work with.
    thresholds = (
        args.warning_free / 100,
        args.critical_free / 100,
        args.warning_frag / 100,
        args.critical_frag / 100,
    )

    if args.check_type == 'status':
        _check_pool_status(args, *thresholds)
    elif args.check_type in ['cache', 'log']:
        _check_vdev_devices(args, *thresholds)
if __name__ == "__main__":
    # Script entry point: run the check and report any unexpected exception
    # as UNKNOWN, followed by its traceback, on standard output.
    import traceback

    try:
        main()
    except Exception as err:
        print(f'UNKNOWN: exception "{err}"')
        print(traceback.format_exc())
        sys.exit(nagios.UNKNOWN)