add check_zfs_zpool cache/log device

This commit is contained in:
Cyberes 2023-05-29 23:01:58 -06:00
parent f41e65c73c
commit 48da1bb35f
2 changed files with 226 additions and 113 deletions

View File

@ -1,113 +0,0 @@
#!/usr/bin/env python3
# check_nginx is a Nagios plugin to monitor nginx status.
# The version is 1.0.2
# Fixed by Nikolay Kandalintsev (twitter: @nicloay)
# Based on the original by yangzi2008@126.com from http://www.nginxs.com,
# which is available here: http://exchange.nagios.org/directory/Plugins/Web-Servers/nginx/check_nginx/details
import getopt
import string
import sys
import traceback
import urllib
from urllib.request import urlopen
def usage():
    """Print the command-line help text, then terminate with exit status 3.

    This function never returns: it always ends in ``sys.exit(3)``.
    """
    help_text = """check_nginx is a Nagios to monitor nginx status
Usage:
check_nginx [-h|--help][-U|--url][-P|--path][-u|--user][-p|--passwd][-w|--warning][-c|--critical]
Options:
--help|-h)
print check_nginx help.
--url|-U)
Sets nginx status url.
--path|-P)
Sets nginx status url path. Default is: off
--user|-u)
Sets nginx status BasicAuth user. Default is: off
--passwd|-p)
Sets nginx status BasicAuth passwd. Default is: off
--warning|-w)
Sets a warning level for nginx Active connections. Default is: off
--critical|-c)
Sets a critical level for nginx Active connections. Default is: off
Example:
The url is www.nginxs.com/status
./check_nginx -U www.nginxs.com -P /status -u eric -p nginx -w 1000 -c 2000
if dont't have password:
./check_nginx -U www.nginxs.com -P /status -w 1000 -c 2000
if don't have path and password:
./check_nginx -U www.nginxs.com -w 1000 -c 2000"""
    print(help_text)
    sys.exit(3)
# --- Command-line parsing ---------------------------------------------------
# Every option starts out as None so later code can test "was it given?"
# directly instead of probing the namespace with dir().
url = path = user = passwd = warning = critical = None

try:
    options, args = getopt.getopt(sys.argv[1:], "hU:P:u:p:w:c:", ["help", "url=", "path=", "user=", "passwd=", "warning=", "critical="])
except getopt.GetoptError:
    usage()
    sys.exit(3)

for name, value in options:
    if name in ("-h", "--help"):
        usage()
    elif name in ("-U", "--url"):
        url = "http://" + value
    elif name in ("-P", "--path"):
        path = value
    elif name in ("-u", "--user"):
        user = value
    elif name in ("-p", "--passwd"):
        passwd = value
    elif name in ("-w", "--warning"):
        warning = value
    elif name in ("-c", "--critical"):
        critical = value

# Thresholds only take effect when both are supplied. Validate them up front
# so a typo yields UNKNOWN (3) instead of an uncaught traceback, which would
# make Python exit with 1 and be read by Nagios as WARNING.
thresholds = None
if warning is not None and critical is not None:
    try:
        thresholds = (int(warning), int(critical))
    except ValueError:
        print("NGINX STATUS unknown: warning/critical thresholds must be integers")
        sys.exit(3)

# --- Fetch and parse the status page ----------------------------------------
try:
    if url is None:
        raise ValueError("no status url given (use -U/--url)")
    target = url if path is None else url + path

    # The Request/handler classes live in urllib.request on Python 3; the
    # submodule is already loaded by the file's `from urllib.request import
    # urlopen`, so it is reachable through the `urllib` package name.
    req = urllib.request.Request(target)
    if user is not None and passwd is not None:
        passman = urllib.request.HTTPPasswordMgrWithDefaultRealm()
        # Register the credentials for the URL actually requested (this used
        # to be url + path, which failed when no path was given).
        passman.add_password(None, target, user, passwd)
        authhandler = urllib.request.HTTPBasicAuthHandler(passman)
        opener = urllib.request.build_opener(authhandler)
        urllib.request.install_opener(opener)

    # The parsing below expects four lines:
    #   line 1: "Active connections: N"
    #   line 4: "Reading: R Writing: W Waiting: Q"
    # The context manager closes the connection even if reading fails.
    with urlopen(req) as response:
        status_lines = [response.readline().decode("utf-8", "replace") for _ in range(4)]

    ActiveConn = status_lines[0].split()[2]
    counters = status_lines[3].split()
    reading = counters[1]
    writing = counters[3]
    waiting = counters[5]
    active_connections = int(ActiveConn)

    output = 'ActiveConn:%s,reading:%s,writing:%s,waiting:%s' % (ActiveConn, reading, writing, waiting)
    # NOTE(review): Nagios perfdata normally separates metrics with spaces;
    # ';' separates the warn/crit/min/max fields of one metric. The separator
    # is left unchanged here in case existing consumers rely on it - confirm.
    perfdata = 'ActiveConn=%s;reading=%s;writing=%s;waiting=%s' % (ActiveConn, reading, writing, waiting)
except Exception:
    print("NGINX STATUS unknown: Error while getting Connection")
    print(traceback.format_exc())
    sys.exit(3)

# --- Evaluate thresholds and report -----------------------------------------
if thresholds is not None:
    warning_level, critical_level = thresholds
    if active_connections >= critical_level:
        print('CRITICAL - %s|%s' % (output, perfdata))
        sys.exit(2)
    elif active_connections >= warning_level:
        print('WARNING - %s|%s' % (output, perfdata))
        sys.exit(1)

print('OK - %s|%s' % (output, perfdata))
sys.exit(0)

226
check_zfs_zpool.py Executable file
View File

@ -0,0 +1,226 @@
#!/usr/bin/env python3
import argparse
import re
import subprocess
import sys
from checker import nagios
from checker.markdown import list_to_markdown_table
def parse_size(size_str):
    """Convert a human-readable size such as ``1.81T`` into a float in KiB.

    The result stays in KiB (``'1K' -> 1.0``) as before, so ratios computed
    from two parsed values are unaffected.

    Accepted suffixes (case-insensitive): B, K, M, G, T, P, E. A value
    without a suffix is treated as a plain byte count.

    Raises:
        ValueError: if the string is empty or has no parseable number.
    """
    size_map = {
        'b': 1 / 1024,
        'k': 1,
        'm': 1024,
        'g': 1024 ** 2,
        't': 1024 ** 3,
        'p': 1024 ** 4,
        'e': 1024 ** 5,
    }
    text = size_str.strip().lower()
    if not text:
        raise ValueError('empty size string')

    if text[-1] in size_map:
        number, factor = text[:-1], size_map[text[-1]]
    else:
        # No unit suffix: interpret the whole string as bytes.
        number, factor = text, size_map['b']

    try:
        return float(number) * factor
    except ValueError:
        raise ValueError(f'cannot parse size {size_str!r}') from None
def percent_to_float(percent_str: str):
    """Turn a percentage string such as ``"42%"`` into the fraction ``0.42``.

    Any ``%`` characters at either end are dropped before the number is
    parsed; a string that is not numeric afterwards raises ``ValueError``.
    """
    return float(percent_str.strip('%')) / 100
def float_to_percent(float_value: float):
    """Format a fraction as a percentage string, e.g. ``0.5 -> "50.0%"``.

    The scaled value is rounded to two decimals so binary floating-point
    noise does not reach the output (``0.07 * 100`` is
    ``7.000000000000001``, which is now rendered as ``"7.0%"``).
    """
    percent = round(float_value * 100, 2)
    return f"{percent}%"
def clean_device_list(in_str: str):
    """Normalise one table row: trim both ends and collapse whitespace runs.

    Every run of whitespace becomes a single space, so the result can be
    split on ``' '`` without producing empty fields. The previous pattern
    ``r'^\\s*|'`` contained a stray empty alternative and left a trailing
    space when the row ended in whitespace.
    """
    return ' '.join(in_str.split())
def zpool_list(zpool: str, vdev_type: str, header: bool = False):
    """Return the indented device rows under a vdev section of ``zpool list -v``.

    A section starts at any row containing ``vdev_type`` (e.g. ``cache`` or
    ``log``); the rows that follow are collected for as long as they begin
    with a space or tab. The row that ends a section is consumed and is not
    itself examined as a possible section start.

    The command is run with an argument list instead of a shell pipeline, so
    the pool name can no longer be interpreted by a shell. ``vdev_type`` is
    matched as a literal substring rather than as an awk regular expression.

    Args:
        zpool: Name of the pool to list.
        vdev_type: Text identifying the section header row.
        header: When true, the first line of the listing (the column
            header) is emitted before the device rows.

    Returns:
        The selected rows, each terminated by a newline ('' if none).

    Raises:
        subprocess.CalledProcessError: if ``zpool list`` exits non-zero.
            Previously the pipeline hid this and returned empty output.
    """
    listing = subprocess.check_output(['zpool', 'list', '-v', zpool]).decode('utf-8')
    rows = listing.split('\n')
    if rows and rows[-1] == '':
        # Drop the empty piece produced by the trailing newline.
        rows.pop()

    selected = []
    if header and rows:
        selected.append(rows[0])

    remaining = iter(rows)
    for row in remaining:
        if vdev_type not in row:
            continue
        # Consume the rows below the section header from the same iterator.
        for child in remaining:
            if child[:1] not in (' ', '\t'):
                break
            selected.append(child)

    return ''.join(f'{row}\n' for row in selected)
def get_vdev_info(zpool: str, vdev_type: str):
    """Collect per-device figures for one vdev section of a pool.

    Each non-empty row returned by ``zpool_list`` is normalised with
    ``clean_device_list`` and split on spaces; selected columns are then
    picked out by position.

    Returns:
        A list of dicts with the keys ``device``, ``size``, ``alloc``,
        ``free``, ``frag``, ``cap`` and ``health`` (all strings).

    Raises:
        IndexError: if a row has fewer columns than the positions used here.
    """
    # Position of each reported field within a split row.
    # NOTE(review): assumes the column order NAME SIZE ALLOC FREE CKPOINT
    # EXPANDSZ FRAG CAP DEDUP HEALTH - confirm for the deployed ZFS version.
    column_of = {
        'device': 0,
        'size': 1,
        'alloc': 2,
        'free': 3,
        'frag': 6,
        'cap': 7,
        'health': 9,
    }
    devices = []
    for row in zpool_list(zpool, vdev_type).split('\n'):
        if not row:
            continue
        fields = [token for token in clean_device_list(row).split(' ') if token]
        devices.append({key: fields[index] for key, index in column_of.items()})
    return devices
def get_zfs_pool_status(pool_name):
    """Gather summary figures for a pool plus the state of its first log vdev.

    Runs ``zpool list -H`` for the pool-wide numbers and ``zpool status -v``
    to find the ``logs`` section.

    Returns:
        A dict with the keys ``name``, ``size``, ``allocated``, ``free``,
        ``capacity``, ``fragmentation``, ``health``, ``log_device_status``
        and ``log_device_alloc``. The two ``log_device_*`` values are
        ``None`` when no log device is found or its figures are unavailable.

    On a failing ``zpool`` command, prints the error and exits with status 2.
    """
    try:
        result = subprocess.run(['zpool', 'list', '-H', '-o', 'name,size,alloc,free,cap,frag,health', pool_name], capture_output=True, text=True, check=True)
        pool_info = result.stdout.strip().split('\t')
        pool_status = {
            'name': pool_info[0],
            'size': pool_info[1],
            'allocated': pool_info[2],
            'free': pool_info[3],
            'capacity': pool_info[4],
            'fragmentation': pool_info[5],
            'health': pool_info[6]
        }
        result = subprocess.run(['zpool', 'status', '-v', pool_name], capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error: {e}")
        sys.exit(2)

    # Splitting on whitespace also discards the tab indentation that
    # `zpool status` puts in front of every config row.
    status_rows = [line.split() for line in result.stdout.split('\n')]

    log_device_status = None
    in_logs_section = False
    for fields in status_rows:
        if in_logs_section:
            # Config rows read NAME STATE READ WRITE CKSUM, so the state is
            # the second column (the last column is the checksum counter).
            if len(fields) >= 2:
                log_device_status = fields[1]
            break
        # Require the row to be exactly "logs": a substring test also fires
        # on pool or file names that merely contain the word.
        in_logs_section = fields == ['logs']

    # `zpool status` has no allocation column; take it from the vdev listing
    # on a best-effort basis and leave it as None if that cannot be parsed.
    log_device_alloc = None
    if log_device_status is not None:
        try:
            log_vdevs = get_vdev_info(pool_name, 'log')
        except (IndexError, subprocess.CalledProcessError):
            log_vdevs = []
        if log_vdevs:
            log_device_alloc = log_vdevs[0]['alloc']

    pool_status['log_device_status'] = log_device_status
    pool_status['log_device_alloc'] = log_device_alloc
    return pool_status
def _unique(names):
    """Return *names* without duplicates, keeping first-seen order."""
    return list(dict.fromkeys(names))


def _headline(level, problems, drives):
    """Build the words of a status line naming the affected drives."""
    noun = 'drives' if len(drives) > 1 else 'drive'
    return [level, '-', problems, f'for {noun}', ', '.join(drives)]


def _check_pool(args, limits):
    """Print the pool summary and exit 0/1/2 based on capacity and fragmentation."""
    pool_status = get_zfs_pool_status(args.pool_name)
    print(f"Pool Name: {pool_status['name']}")
    print(f"Size: {pool_status['size']}")
    print(f"Allocated: {pool_status['allocated']}")
    print(f"Free: {pool_status['free']}")
    print(f"Capacity: {pool_status['capacity']}")
    print(f"Fragmentation: {pool_status['fragmentation']}")
    print(f"Health: {pool_status['health']}")
    if pool_status['log_device_status'] is not None:
        print(f"Log Device Status: {pool_status['log_device_status']}")
        print(f"Log Device Allocation: {pool_status['log_device_alloc']}")
    else:
        print("No log devices found")

    # Compare fractions with fractions. The thresholds are treated as *used*
    # capacity, the same way the cache/log check applies them; the earlier
    # code compared a 0-100 free percentage against a 0-1 threshold.
    used = percent_to_float(pool_status['capacity'])
    frag = percent_to_float(pool_status['fragmentation'])

    if used >= limits['critical_cap'] or frag >= limits['critical_frag']:
        print("CRITICAL")
        sys.exit(2)
    elif used >= limits['warning_cap'] or frag >= limits['warning_frag']:
        print("WARNING")
        sys.exit(1)
    else:
        print("OK")
        sys.exit(0)


def _check_vdevs(args, limits):
    """Evaluate every cache/log device, print a headline and table, then exit."""
    vdev_devices = get_vdev_info(args.pool_name, args.check_type)
    table_data = [('Device', 'Size', 'Alloc', 'Free', 'Frag', 'Cap', 'Health', 'State')]
    critical = {'cap': [], 'frag': [], 'health': []}
    warning = {'cap': [], 'frag': []}

    for device in vdev_devices:
        name = device['device']
        values = {
            'cap': percent_to_float(device['cap']),
            'frag': percent_to_float(device['frag']),
        }
        unhealthy = device['health'] != 'ONLINE'
        state = 'ok'
        for metric, value in values.items():
            if value >= limits[f'critical_{metric}']:
                critical[metric].append(name)
                state = 'critical'
            elif value >= limits[f'warning_{metric}'] and not unhealthy:
                # An unhealthy device is already critical; no extra warning.
                warning[metric].append(name)
                if state == 'ok':
                    # Never downgrade a critical state to warning.
                    state = 'warning'
        if unhealthy:
            critical['health'].append(name)
            state = 'critical'
        table_data.append((name, device['size'], device['alloc'], device['free'], float_to_percent(values['frag']), float_to_percent(values['cap']), device['health'], state))

    labels = {'cap': 'capacity', 'frag': 'fragmentation', 'health': 'health'}
    crit_kinds = [labels[key] for key in ('cap', 'frag', 'health') if critical[key]]
    warn_kinds = [labels[key] for key in ('cap', 'frag') if warning[key]]
    crit_drives = _unique(critical['cap'] + critical['frag'] + critical['health'])
    warn_drives = _unique(warning['cap'] + warning['frag'])

    if crit_kinds:
        exit_code = nagios.CRITICAL
        # Warnings are only mentioned alongside criticals when no device has
        # failed outright; a failed device takes precedence in the headline.
        if warn_kinds and not critical['health']:
            problems = 'multiple issues'
        else:
            problems = 'critical ' + ' and '.join(crit_kinds)
        out_str = _headline('CRITICAL', problems, crit_drives)
    elif warn_kinds:
        exit_code = nagios.WARNING
        out_str = _headline('WARNING', 'high ' + ' and '.join(warn_kinds), warn_drives)
    else:
        exit_code = nagios.OK
        out_str = ['OK', '-', f'{len(vdev_devices)} {args.check_type} devices are healthy']

    print(*out_str)
    print(list_to_markdown_table(table_data, align='left', seperator='!', borders=False))
    sys.exit(exit_code)


def main():
    """Parse the command line and run the requested ZFS pool check.

    ``status`` reports the pool itself; ``cache`` and ``log`` report the
    devices of that vdev class. Every path ends in ``sys.exit`` with the
    plugin's status code.
    """
    parser = argparse.ArgumentParser(description='Check ZFS pool status')
    parser.add_argument('pool_name', help='Name of the ZFS pool to check')
    parser.add_argument('check_type', choices=['status', 'cache', 'log'], help='What to check.')
    parser.add_argument('--warning-free', type=int, default=65, help='Warning level for used capacity percentage (default: 65)')
    parser.add_argument('--critical-free', type=int, default=80, help='Critical level for used capacity percentage (default: 80)')
    parser.add_argument('--warning-frag', type=int, default=50, help='Warning level for fragmentation percentage (default: 50)')
    parser.add_argument('--critical-frag', type=int, default=75, help='Critical level for fragmentation percentage (default: 75)')
    args = parser.parse_args()

    # Thresholds arrive as whole percentages; all comparisons use fractions.
    # Dividing is required here: float(f'0.{n}') maps 5 to 0.5 and 100 to 0.1.
    limits = {
        'warning_cap': args.warning_free / 100,
        'critical_cap': args.critical_free / 100,
        'warning_frag': args.warning_frag / 100,
        'critical_frag': args.critical_frag / 100,
    }

    if args.check_type == 'status':
        _check_pool(args, limits)
    elif args.check_type in ['cache', 'log']:
        _check_vdevs(args, limits)
if __name__ == "__main__":
    import traceback

    try:
        main()
    except Exception as e:
        # Any unexpected failure is reported as UNKNOWN, followed by the
        # traceback for troubleshooting. sys.exit() raised inside main() is
        # a SystemExit, not an Exception, so it passes straight through.
        report = (f'UNKNOWN: exception "{e}"', traceback.format_exc())
        for chunk in report:
            print(chunk)
        sys.exit(nagios.UNKNOWN)