227 lines
9.5 KiB
Python
227 lines
9.5 KiB
Python
|
#!/usr/bin/env python3
|
||
|
import argparse
|
||
|
import re
|
||
|
import subprocess
|
||
|
import sys
|
||
|
|
||
|
from checker import nagios
|
||
|
from checker.markdown import list_to_markdown_table
|
||
|
|
||
|
|
||
|
def parse_size(size_str):
    """Convert a human-readable size such as ``'1.5T'`` into KiB.

    The result is expressed in KiB (``'1K'`` -> ``1.0``) so that values can
    be compared or divided against each other.

    Args:
        size_str: A number followed by an optional unit suffix
            (``B``, ``K``, ``M``, ``G``, ``T``, ``P`` or ``E``, any case).
            A bare number (for example ``'0'`` or an exact byte count) is
            interpreted as bytes.

    Returns:
        The size as a float, in KiB.

    Raises:
        ValueError: If the string is empty, has an unknown suffix, or its
            numeric part cannot be parsed.
    """
    text = size_str.strip().lower()
    if not text:
        raise ValueError('empty size string')

    # Multipliers relative to one KiB, matching the original K-based scale.
    size_map = {
        'b': 1 / 1024,
        'k': 1,
        'm': 1024,
        'g': 1024 ** 2,
        't': 1024 ** 3,
        'p': 1024 ** 4,
        'e': 1024 ** 5,
    }

    suffix = text[-1]
    if suffix in size_map:
        return float(text[:-1]) * size_map[suffix]
    if suffix.isdigit() or suffix == '.':
        # No unit at all: treat the value as a plain byte count.
        return float(text) / 1024
    raise ValueError(f'unknown size suffix in {size_str!r}')
|
||
|
|
||
|
|
||
|
def percent_to_float(percent_str: str):
    """Turn a percentage string such as ``'47%'`` into a fraction (``0.47``).

    Any ``%`` characters at either end of the string are removed before the
    remainder is parsed as a float and divided by 100.
    """
    digits = percent_str.strip('%')
    return float(digits) / 100
|
||
|
|
||
|
|
||
|
def float_to_percent(float_value: float):
    """Format a fraction (``0.5``) as a percentage string (``'50.0%'``).

    The scaled value is rounded to two decimal places before formatting so
    that binary floating-point noise does not leak into the output
    (``0.07 * 100`` is ``7.000000000000001`` without rounding).

    Args:
        float_value: The fraction to format, where ``1.0`` means 100%.

    Returns:
        The percentage as a string with a trailing ``%``.
    """
    percent = round(float_value * 100, 2)
    return f"{percent}%"
|
||
|
|
||
|
|
||
|
def clean_device_list(in_str: str):
    """Normalise the whitespace of one listing row.

    Leading whitespace is dropped first, then every remaining run of
    whitespace (including a trailing one) is collapsed to a single space.
    """
    without_indent = re.sub(r'^\s*|', '', in_str)
    collapsed = re.sub(r'\s+', ' ', without_indent)
    return collapsed
|
||
|
|
||
|
|
||
|
def _select_vdev_lines(listing: str, vdev_type: str, header: bool = False):
    """Pick the indented rows that follow each line matching *vdev_type*.

    This mirrors the awk filter the script previously shelled out to: when a
    line matches the pattern, the following lines are emitted for as long as
    they start with a space or tab. The first line that does not is consumed
    without being examined any further.
    """
    section = re.compile(vdev_type)
    rows = iter(listing.splitlines())
    selected = []
    first = True
    for row in rows:
        if header and first:
            selected.append(row)
        first = False
        if section.search(row):
            # Keep pulling from the same iterator so the terminating row is
            # swallowed, exactly like awk's getline loop.
            for child in rows:
                if child[:1] not in (' ', '\t'):
                    break
                selected.append(child)
    return ''.join(f'{row}\n' for row in selected)


def zpool_list(zpool: str, vdev_type: str, header: bool = False):
    """Return the ``zpool list -v`` rows belonging to one vdev section.

    The command is started with an argument list instead of a shell
    pipeline, so neither the pool name nor the pattern can inject shell
    syntax, and a failing ``zpool`` is reported instead of being hidden
    behind the exit status of a trailing ``awk``.

    Args:
        zpool: Name of the pool to list.
        vdev_type: Regular expression identifying the section line
            (the callers in this file pass ``'cache'`` or ``'log'``).
        header: When true, the first line of the listing is included too.

    Returns:
        The selected rows as one string, each terminated by a newline.

    Raises:
        subprocess.CalledProcessError: If ``zpool list`` exits non-zero.
    """
    listing = subprocess.check_output(['zpool', 'list', '-v', zpool]).decode('utf-8')
    return _select_vdev_lines(listing, vdev_type, header)
|
||
|
|
||
|
|
||
|
def get_vdev_info(zpool: str, vdev_type: str):
    """Parse the rows returned by ``zpool_list`` into one dict per device.

    Each non-empty row is whitespace-normalised, split into columns, and the
    columns at fixed positions are copied into a dict with the keys
    ``device``, ``size``, ``alloc``, ``free``, ``frag``, ``cap`` and
    ``health``. All values stay as strings.

    Raises:
        IndexError: If a row has fewer than ten columns.
    """
    # Column positions within a row.
    # NOTE(review): presumably these follow the `zpool list -v` column order
    # (NAME SIZE ALLOC FREE CKPOINT EXPANDSZ FRAG CAP DEDUP HEALTH) - confirm
    # against the ZFS version in use.
    column_of = (
        ('device', 0),
        ('size', 1),
        ('alloc', 2),
        ('free', 3),
        ('frag', 6),
        ('cap', 7),
        ('health', 9),
    )

    devices = []
    for row in zpool_list(zpool, vdev_type).split('\n'):
        if not row:
            continue
        columns = [col for col in clean_device_list(row).split(' ') if col]
        devices.append({key: columns[position] for key, position in column_of})
    return devices
|
||
|
|
||
|
|
||
|
def get_zfs_pool_status(pool_name):
    """Collect pool-wide figures and details of the first log device.

    Runs ``zpool list`` for the pool summary and ``zpool status -v`` to find
    the first entry listed under the ``logs`` section.

    Args:
        pool_name: Name of the pool to query.

    Returns:
        A dict with the keys ``name``, ``size``, ``allocated``, ``free``,
        ``capacity``, ``fragmentation`` and ``health`` (strings as printed
        by ``zpool list``), plus ``log_device_status`` and
        ``log_device_alloc``, each ``None`` when it could not be determined.

    If a ``zpool`` command fails, the error is printed and the process exits
    with status 2.
    """
    try:
        result = subprocess.run(['zpool', 'list', '-H', '-o', 'name,size,alloc,free,cap,frag,health', pool_name], capture_output=True, text=True, check=True)
        pool_info = result.stdout.strip().split('\t')
        pool_status = {
            'name': pool_info[0],
            'size': pool_info[1],
            'allocated': pool_info[2],
            'free': pool_info[3],
            'capacity': pool_info[4],
            'fragmentation': pool_info[5],
            'health': pool_info[6]
        }

        result = subprocess.run(['zpool', 'status', '-v', pool_name], capture_output=True, text=True, check=True)

        log_device_status = None
        in_logs_section = False
        for line in result.stdout.splitlines():
            fields = line.split()
            if in_logs_section:
                # Config rows read "NAME STATE READ WRITE CKSUM", so the
                # device state is the second column. A short or blank row
                # leaves the status unknown instead of raising IndexError.
                if len(fields) >= 2:
                    log_device_status = fields[1]
                break
            # Match only the bare section header, not any line that merely
            # contains the text "logs" (e.g. a pool called "syslogs").
            if fields == ['logs']:
                in_logs_section = True

        # `zpool status` carries no allocation column, so the figure is taken
        # from the `zpool list -v` parser instead. This is best effort: a
        # parsing problem there must not break the pool status check.
        log_device_alloc = None
        if log_device_status is not None:
            try:
                log_vdevs = get_vdev_info(pool_name, 'log')
            except (IndexError, subprocess.CalledProcessError):
                log_vdevs = []
            if log_vdevs:
                log_device_alloc = log_vdevs[0]['alloc']

        pool_status['log_device_status'] = log_device_status
        pool_status['log_device_alloc'] = log_device_alloc

        return pool_status

    except subprocess.CalledProcessError as e:
        print(f"Error: {e}")
        sys.exit(2)
|
||
|
|
||
|
|
||
|
def _unique(names):
    """Return *names* without duplicates, keeping first-seen order."""
    return list(dict.fromkeys(names))


def _describe_problem(severity, cap_devices, frag_devices):
    """Build the summary phrase for capacity and/or fragmentation problems."""
    if cap_devices and frag_devices:
        return f'{severity} capacity and fragmentation'
    if cap_devices:
        return f'{severity} capacity'
    return f'{severity} fragmentation'


def _drive_clause(drives):
    """Return 'for drive' or 'for drives' depending on how many are listed."""
    return f'for {"drives" if len(drives) > 1 else "drive"}'


def _check_pool_status(pool_name, warning_cap, critical_cap, warning_frag, critical_frag):
    """Print the pool summary, then exit 0, 1 or 2 according to the thresholds.

    All thresholds are fractions (0.8 means 80%).
    """
    pool_status = get_zfs_pool_status(pool_name)

    print(f"Pool Name: {pool_status['name']}")
    print(f"Size: {pool_status['size']}")
    print(f"Allocated: {pool_status['allocated']}")
    print(f"Free: {pool_status['free']}")
    print(f"Capacity: {pool_status['capacity']}")
    print(f"Fragmentation: {pool_status['fragmentation']}")
    print(f"Health: {pool_status['health']}")

    if pool_status['log_device_status'] is not None:
        print(f"Log Device Status: {pool_status['log_device_status']}")
        print(f"Log Device Allocation: {pool_status['log_device_alloc']}")
    else:
        print("No log devices found")

    # Measurements and thresholds are both fractions of 1, and both describe
    # how full / fragmented the pool is, so ">=" means "worse".
    used = percent_to_float(pool_status['capacity'])
    fragmentation = percent_to_float(pool_status['fragmentation'])

    # NOTE(review): pool health is printed but does not influence the exit
    # status here - confirm whether a non-ONLINE pool should be CRITICAL.
    if used >= critical_cap or fragmentation >= critical_frag:
        print("CRITICAL")
        sys.exit(2)
    if used >= warning_cap or fragmentation >= warning_frag:
        print("WARNING")
        sys.exit(1)
    print("OK")
    sys.exit(0)


def _check_vdevs(pool_name, check_type, warning_cap, critical_cap, warning_frag, critical_frag):
    """Evaluate every cache/log device, print summary and table, then exit.

    All thresholds are fractions (0.8 means 80%).
    """
    vdev_devices = get_vdev_info(pool_name, check_type)
    table_data = [('Device', 'Size', 'Alloc', 'Free', 'Frag', 'Cap', 'Health', 'State')]
    critical = {'cap': [], 'frag': [], 'health': []}
    warning = {'cap': [], 'frag': []}

    for device in vdev_devices:
        name = device['device']
        cap = percent_to_float(device['cap'])
        frag = percent_to_float(device['frag'])
        is_critical = False
        is_warning = False

        if cap >= critical_cap:
            critical['cap'].append(name)
            is_critical = True
        elif cap >= warning_cap:
            warning['cap'].append(name)
            is_warning = True

        if frag >= critical_frag:
            critical['frag'].append(name)
            is_critical = True
        elif frag >= warning_frag:
            warning['frag'].append(name)
            is_warning = True

        if device['health'] != 'ONLINE':
            critical['health'].append(name)
            is_critical = True

        # The table shows the worst state found for the device; a later
        # warning must never hide an earlier critical finding.
        if is_critical:
            state = 'critical'
        elif is_warning:
            state = 'warning'
        else:
            state = 'ok'

        table_data.append((name, device['size'], device['alloc'], device['free'], float_to_percent(frag), float_to_percent(cap), device['health'], state))

    # De-duplicate before counting so that one drive with two problems is
    # still reported in the singular, and keep a stable order.
    crit_drives = _unique([*critical['cap'], *critical['frag'], *critical['health']])
    warn_drives = _unique([*warning['cap'], *warning['frag']])

    if crit_drives:
        exit_code = nagios.CRITICAL
        if critical['health']:
            # A device that is not ONLINE outranks every threshold finding.
            info_str = 'bad health'
        elif warn_drives:
            info_str = 'multiple issues'
        else:
            info_str = _describe_problem('critical', critical['cap'], critical['frag'])
        out_str = ['CRITICAL', '-', info_str, _drive_clause(crit_drives), ', '.join(crit_drives)]
    elif warn_drives:
        exit_code = nagios.WARNING
        info_str = _describe_problem('elevated', warning['cap'], warning['frag'])
        out_str = ['WARNING', '-', info_str, _drive_clause(warn_drives), ', '.join(warn_drives)]
    else:
        exit_code = nagios.OK
        out_str = ['OK', '-', f'{len(vdev_devices)} {check_type} devices are healthy']

    print(*out_str)
    print(list_to_markdown_table(table_data, align='left', seperator='!', borders=False))
    sys.exit(exit_code)


def main():
    """Parse the command line and run the requested ZFS check.

    ``status`` checks the pool as a whole; ``cache`` and ``log`` check the
    devices of that vdev class. The threshold options are whole percentages
    and are converted to fractions once, so every comparison is made on the
    same 0-1 scale. ``--warning-free`` / ``--critical-free`` keep their
    historical names but act as used-capacity limits: the check fires when
    usage is at or above them.

    The function does not return; it ends the process with the Nagios exit
    status of the check.
    """
    parser = argparse.ArgumentParser(description='Check ZFS pool status')
    parser.add_argument('pool_name', help='Name of the ZFS pool to check')
    parser.add_argument('check_type', choices=['status', 'cache', 'log'], help='What to check.')
    parser.add_argument('--warning-free', type=int, default=65, help='Warning level for used capacity percentage (default: 65)')
    parser.add_argument('--critical-free', type=int, default=80, help='Critical level for used capacity percentage (default: 80)')
    parser.add_argument('--warning-frag', type=int, default=50, help='Warning level for fragmentation percentage (default: 50)')
    parser.add_argument('--critical-frag', type=int, default=75, help='Critical level for fragmentation percentage (default: 75)')
    args = parser.parse_args()

    # Divide instead of pasting digits after "0." - the latter turns 5 into
    # 0.5 and 100 into 0.1.
    warning_cap = args.warning_free / 100
    critical_cap = args.critical_free / 100
    warning_frag = args.warning_frag / 100
    critical_frag = args.critical_frag / 100

    if args.check_type == 'status':
        _check_pool_status(args.pool_name, warning_cap, critical_cap, warning_frag, critical_frag)
    elif args.check_type in ['cache', 'log']:
        _check_vdevs(args.pool_name, args.check_type, warning_cap, critical_cap, warning_frag, critical_frag)
|
||
|
|
||
|
|
||
|
if __name__ == "__main__":
    # Script entry point: any unexpected exception is reported on stdout
    # together with its traceback, and the process exits with the UNKNOWN
    # status. SystemExit raised by main() is not an Exception and passes
    # through untouched.
    try:
        main()
    except Exception as e:
        print(f'UNKNOWN: exception "{e}"')
        from traceback import format_exc

        print(format_exc())
        sys.exit(nagios.UNKNOWN)
|