add check_zfs_zpool cache/log device
This commit is contained in:
parent
f41e65c73c
commit
48da1bb35f
113
check_nginx
113
check_nginx
|
@ -1,113 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
# check_nginx is a Nagios plugin to monitor nginx status
|
||||
# The version is 1.0.2
|
||||
# fixed by Nikolay Kandalintsev (twitter: @nicloay)
|
||||
# Based on yangzi2008@126.com from http://www.nginxs.com
|
||||
# which is available here: http://exchange.nagios.org/directory/Plugins/Web-Servers/nginx/check_nginx/details
|
||||
|
||||
import getopt
|
||||
import string
|
||||
import sys
|
||||
import traceback
|
||||
|
||||
import urllib
|
||||
from urllib.request import urlopen
|
||||
|
||||
|
||||
def usage():
    """Print the command-line help for check_nginx and exit.

    The process always terminates with exit status 3 after the help
    text has been written to standard output.
    """
    print("""check_nginx is a Nagios to monitor nginx status
Usage:

check_nginx [-h|--help][-U|--url][-P|--path][-u|--user][-p|--passwd][-w|--warning][-c|--critical]

Options:
    --help|-h)
        print check_nginx help.
    --url|-U)
        Sets nginx status url.
    --path|-P)
        Sets nginx status url path. Default is: off
    --user|-u)
        Sets nginx status BasicAuth user. Default is: off
    --passwd|-p)
        Sets nginx status BasicAuth passwd. Default is: off
    --warning|-w)
        Sets a warning level for nginx Active connections. Default is: off
    --critical|-c)
        Sets a critical level for nginx Active connections. Default is: off
Example:
    The url is www.nginxs.com/status
    ./check_nginx -U www.nginxs.com -P /status -u eric -p nginx -w 1000 -c 2000
    if don't have password:
    ./check_nginx -U www.nginxs.com -P /status -w 1000 -c 2000
    if don't have path and password:
    ./check_nginx -U www.nginxs.com -w 1000 -c 2000""")

    sys.exit(3)
|
||||
|
||||
|
||||
# --- Command-line parsing -------------------------------------------------
# Unknown options print the help text and exit with status 3.
try:
    options, args = getopt.getopt(sys.argv[1:], "hU:P:u:p:w:c:", ["help", "url=", "path=", "user=", "passwd=", "warning=", "critical="])
except getopt.GetoptError:
    usage()
    sys.exit(3)

# Explicit defaults replace the former "'name' in dir()" checks, which left
# the variables unbound (and crashed the BasicAuth branch when -P was omitted).
url = None
path = ''
user = None
passwd = None
warning = None
critical = None

for name, value in options:
    if name in ("-h", "--help"):
        usage()
    if name in ("-U", "--url"):
        url = "http://" + value
    if name in ("-P", "--path"):
        path = value
    if name in ("-u", "--user"):
        user = value
    if name in ("-p", "--passwd"):
        passwd = value
    if name in ("-w", "--warning"):
        warning = value
    if name in ("-c", "--critical"):
        critical = value

# The status URL is mandatory; without it there is nothing to query.
if url is None:
    usage()
    sys.exit(3)

# --- Fetch and parse the status page --------------------------------------
# Any failure in this section is reported as UNKNOWN (exit status 3).
try:
    target = url + path
    # In Python 3 the request machinery lives in urllib.request, not urllib.
    req = urllib.request.Request(target)
    if user is not None and passwd is not None:
        passman = urllib.request.HTTPPasswordMgrWithDefaultRealm()
        passman.add_password(None, target, user, passwd)
        authhandler = urllib.request.HTTPBasicAuthHandler(passman)
        opener = urllib.request.build_opener(authhandler)
        urllib.request.install_opener(opener)

    # Only the first four lines of the response are read; the context
    # manager closes the connection even when reading fails.
    with urlopen(req) as response:
        status_lines = [response.readline().decode('utf-8', 'replace') for _ in range(4)]

    # First line: the third whitespace-separated token is the active
    # connection count.
    ActiveConn = status_lines[0].split()[2]
    # Fourth line: tokens 1, 3 and 5 are the reading/writing/waiting values.
    counters = status_lines[3].split()
    reading = counters[1]
    writing = counters[3]
    waiting = counters[5]
    output = 'ActiveConn:%s,reading:%s,writing:%s,waiting:%s' % (ActiveConn, reading, writing, waiting)
    perfdata = 'ActiveConn=%s;reading=%s;writing=%s;waiting=%s' % (ActiveConn, reading, writing, waiting)

except Exception:
    print("NGINX STATUS unknown: Error while getting Connection")
    print(traceback.format_exc())
    sys.exit(3)

# --- Threshold evaluation -------------------------------------------------
# Thresholds are only applied when both -w and -c were supplied.
if warning is not None and critical is not None:
    if int(ActiveConn) >= int(critical):
        print('CRITICAL - %s|%s' % (output, perfdata))
        sys.exit(2)
    elif int(ActiveConn) >= int(warning):
        print('WARNING - %s|%s' % (output, perfdata))
        sys.exit(1)
    else:
        print('OK - %s|%s' % (output, perfdata))
        sys.exit(0)
else:
    print('OK - %s|%s' % (output, perfdata))
    sys.exit(0)
|
|
@ -0,0 +1,226 @@
|
|||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
from checker import nagios
|
||||
from checker.markdown import list_to_markdown_table
|
||||
|
||||
|
||||
def parse_size(size_str):
    """Convert a human-readable size string into a number of KiB.

    The string is a decimal number followed by a single, case-insensitive
    unit letter: ``K``, ``M``, ``G``, ``T`` or ``P`` (powers of 1024, with
    ``K`` equal to 1). Surrounding whitespace is ignored.

    A value without a unit letter (for example ``"0"``), or with a ``B``
    suffix, is treated as a byte count and scaled down to KiB.
    NOTE(review): the byte interpretation of bare numbers is an assumption;
    confirm against the ``zpool`` output this is fed with.

    Args:
        size_str: The size text, e.g. ``"1.81T"`` or ``"928G"``.

    Returns:
        The size as a float, expressed in KiB.

    Raises:
        ValueError: If the string is empty, the number is malformed or the
            unit letter is not recognised.
    """
    size_map = {'b': 1 / 1024, 'k': 1, 'm': 1024, 'g': 1024 ** 2, 't': 1024 ** 3, 'p': 1024 ** 4}
    text = size_str.strip().lower()
    if not text:
        raise ValueError('empty size string')

    suffix = text[-1]
    if suffix.isdigit() or suffix == '.':
        # No unit letter at all: a plain byte count.
        return float(text) / 1024
    if suffix not in size_map:
        raise ValueError(f'unknown size unit in {size_str!r}')
    return float(text[:-1]) * size_map[suffix]
|
||||
|
||||
|
||||
def percent_to_float(percent_str: str):
    """Turn a percentage string such as ``"45%"`` into a fraction (``0.45``).

    Percent signs at either end of the string are removed before the
    number is parsed; the parsed value is then divided by 100.
    """
    return float(percent_str.strip('%')) / 100
|
||||
|
||||
|
||||
def float_to_percent(float_value: float):
    """Format a fraction as a percentage string, e.g. ``0.5`` -> ``"50.0%"``.

    The scaled value is rounded to two decimal places so that binary
    floating-point artefacts do not leak into the output (``0.56 * 100``
    would otherwise be rendered as ``56.00000000000001%``).

    Args:
        float_value: The fraction to format (``1.0`` means 100%).

    Returns:
        The percentage followed by a ``%`` sign.
    """
    percent = round(float_value * 100, 2)
    return f"{percent}%"
|
||||
|
||||
|
||||
def clean_device_list(in_str: str):
    """Normalise the whitespace of one line of device output.

    Leading whitespace is removed and every remaining run of whitespace is
    collapsed into a single space. Trailing whitespace is collapsed but not
    removed, so a line ending in blanks ends in exactly one space.

    The previous pattern ``^\\s*|`` carried a stray empty alternation that
    matched at every position; stripping the left side directly gives the
    same result without it.
    """
    return re.sub(r'\s+', ' ', in_str.lstrip())
|
||||
|
||||
|
||||
def zpool_list(zpool: str, vdev_type: str, header: bool = False):
    """Return the ``zpool list -v`` rows that belong to one vdev section.

    ``zpool`` is now passed as a separate argument instead of being pasted
    into a shell pipeline, so a crafted pool name can no longer inject shell
    commands. The filtering that ``awk`` used to do is performed in Python.

    Args:
        zpool: Name of the pool to list.
        vdev_type: Regular expression identifying the section line
            (the callers in this file pass ``'cache'`` or ``'log'``).
        header: When true, the first line of the listing is included too.

    Returns:
        The selected lines, each terminated by a newline, as one string
        (empty when nothing matched).

    Raises:
        subprocess.CalledProcessError: If ``zpool list`` exits non-zero.
            The old shell pipeline only saw awk's exit status, so such
            failures used to go unnoticed.
    """
    listing = subprocess.check_output(['zpool', 'list', '-v', zpool]).decode('utf-8')
    return _vdev_section_lines(listing, vdev_type, header)


def _vdev_section_lines(listing: str, vdev_type: str, header: bool = False) -> str:
    """Pick the indented rows that follow each line matching ``vdev_type``."""
    pattern = re.compile(vdev_type)
    selected = []
    rows = iter(listing.splitlines())
    first = True
    for row in rows:
        if header and first:
            selected.append(row)
        first = False
        if not pattern.search(row):
            continue
        # Consume the rows below the section line for as long as they start
        # with a blank. The first row that does not is consumed as well and
        # is not matched itself, mirroring the former awk getline loop.
        # NOTE(review): the match is a substring/regex search over the whole
        # row, so a pool or device name containing the pattern also starts a
        # section; kept as-is for compatibility.
        for child in rows:
            if child[:1] not in (' ', '\t'):
                break
            selected.append(child)
    return ''.join(f'{row}\n' for row in selected)
|
||||
|
||||
|
||||
def get_vdev_info(zpool: str, vdev_type: str):
    """Collect the per-device columns of one vdev section of a pool.

    The section rows come from ``zpool_list``; each non-empty row is
    whitespace-normalised with ``clean_device_list`` and split into fields.

    Returns:
        A list with one dict per row, holding the keys ``device``, ``size``,
        ``alloc``, ``free``, ``frag``, ``cap`` and ``health`` as strings.
    """
    # Position of every reported value within the split row.
    field_index = {
        'device': 0,
        'size': 1,
        'alloc': 2,
        'free': 3,
        'frag': 6,
        'cap': 7,
        'health': 9,
    }

    devices = []
    for raw_row in zpool_list(zpool, vdev_type).split('\n'):
        if not raw_row:
            continue
        fields = [token for token in clean_device_list(raw_row).split(' ') if token]
        devices.append({key: fields[position] for key, position in field_index.items()})
    return devices
|
||||
|
||||
|
||||
def get_zfs_pool_status(pool_name):
    """Query ``zpool`` for the summary of a pool and its first log device.

    Two commands are run: ``zpool list -H -o name,size,alloc,free,cap,frag,health``
    for the summary columns and ``zpool status -v`` to look for a ``logs``
    section.

    Args:
        pool_name: Name of the pool to inspect.

    Returns:
        A dict with the keys ``name``, ``size``, ``allocated``, ``free``,
        ``capacity``, ``fragmentation``, ``health``, ``log_device_status``
        and ``log_device_alloc``. The two log entries are ``None`` when no
        log device line was found.

    If either command exits non-zero, the error is printed and the process
    exits with status 2.
    """
    try:
        result = subprocess.run(['zpool', 'list', '-H', '-o', 'name,size,alloc,free,cap,frag,health', pool_name], capture_output=True, text=True, check=True)
        pool_info = result.stdout.strip().split('\t')
        pool_status = {
            'name': pool_info[0],
            'size': pool_info[1],
            'allocated': pool_info[2],
            'free': pool_info[3],
            'capacity': pool_info[4],
            'fragmentation': pool_info[5],
            'health': pool_info[6]
        }

        result = subprocess.run(['zpool', 'status', '-v', pool_name], capture_output=True, text=True, check=True)
        # The former debug print of the raw status lines was removed: it put
        # a Python list repr at the top of the plugin output.
        pool_status_lines = result.stdout.strip().split('\n')

        log_device_status, log_device_alloc = _find_log_device(pool_status_lines)
        pool_status['log_device_status'] = log_device_status
        pool_status['log_device_alloc'] = log_device_alloc

        return pool_status

    except subprocess.CalledProcessError as e:
        print(f"Error: {e}")
        sys.exit(2)


def _find_log_device(status_lines):
    """Return ``(status, alloc)`` taken from the line following ``logs``."""
    in_logs_section = False
    for line in status_lines:
        fields = line.split()
        if in_logs_section:
            if len(fields) < 2:
                # Nothing usable directly below the section header.
                return None, None
            # NOTE(review): the last column is reported as "status" and the
            # second column as "alloc", exactly as before. That looks
            # swapped for `zpool status` rows -- confirm the intended columns.
            return fields[-1], fields[1]
        # Only a line whose first token is "logs" opens the section; a plain
        # substring test also fired on pool or device names containing it.
        if fields and fields[0] == 'logs':
            in_logs_section = True
    return None, None
|
||||
|
||||
|
||||
def main():
    """Run the requested ZFS check and exit with a Nagios status code.

    Command line: ``pool_name check_type`` where ``check_type`` is one of
    ``status``, ``cache`` or ``log``, plus the optional thresholds
    ``--warning-free``, ``--critical-free``, ``--warning-frag`` and
    ``--critical-frag`` (whole percentages).

    The ``*-free`` thresholds are applied to the *used* share of the
    capacity and the ``*-frag`` thresholds to the fragmentation; a value at
    or above a threshold triggers the corresponding state. With the defaults
    (warning 65, critical 80) no other reading lets WARNING fire before
    CRITICAL, and the cache/log branch already compared them with the ``cap``
    column.

    * ``status``: prints the pool summary from ``get_zfs_pool_status`` and
      exits 2 (CRITICAL), 1 (WARNING) or 0 (OK).
    * ``cache`` / ``log``: evaluates every device returned by
      ``get_vdev_info``, prints a one-line summary followed by a table and
      exits with the matching ``nagios`` constant. A device whose health is
      not ``ONLINE`` is always critical.
    """
    parser = argparse.ArgumentParser(description='Check ZFS pool status')
    parser.add_argument('pool_name', help='Name of the ZFS pool to check')
    parser.add_argument('check_type', choices=['status', 'cache', 'log'], help='What to check.')
    parser.add_argument('--warning-free', type=int, default=65, help='Warning level for free space percentage (default: 65)')
    parser.add_argument('--critical-free', type=int, default=80, help='Critical level for free space percentage (default: 80)')
    parser.add_argument('--warning-frag', type=int, default=50, help='Warning level for fragmentation percentage (default: 50)')
    parser.add_argument('--critical-frag', type=int, default=75, help='Critical level for fragmentation percentage (default: 75)')
    args = parser.parse_args()

    # Work with fractions everywhere. Dividing by 100 replaces the former
    # float(f'0.{n}') trick, which turned 5 into 0.5 and 100 into 0.1, and
    # the fragmentation thresholds are now scaled as well.
    warning_cap = args.warning_free / 100
    critical_cap = args.critical_free / 100
    warning_frag = args.warning_frag / 100
    critical_frag = args.critical_frag / 100

    if args.check_type == 'status':
        pool_status = get_zfs_pool_status(args.pool_name)

        print(f"Pool Name: {pool_status['name']}")
        print(f"Size: {pool_status['size']}")
        print(f"Allocated: {pool_status['allocated']}")
        print(f"Free: {pool_status['free']}")
        print(f"Capacity: {pool_status['capacity']}")
        print(f"Fragmentation: {pool_status['fragmentation']}")
        print(f"Health: {pool_status['health']}")

        if pool_status['log_device_status'] is not None:
            print(f"Log Device Status: {pool_status['log_device_status']}")
            print(f"Log Device Allocation: {pool_status['log_device_alloc']}")
        else:
            print("No log devices found")

        size = parse_size(pool_status['size'])
        free = parse_size(pool_status['free'])
        # Share of the pool that is in use, on the same 0..1 scale as the
        # thresholds.
        used_fraction = 1 - (free / size)
        frag_fraction = int(pool_status['fragmentation'].rstrip('%')) / 100

        if used_fraction >= critical_cap or frag_fraction >= critical_frag:
            print("CRITICAL")
            sys.exit(2)
        elif used_fraction >= warning_cap or frag_fraction >= warning_frag:
            print("WARNING")
            sys.exit(1)
        else:
            print("OK")
            sys.exit(0)
    elif args.check_type in ['cache', 'log']:
        vdev_devices = get_vdev_info(args.pool_name, args.check_type)
        table_data = [('Device', 'Size', 'Alloc', 'Free', 'Frag', 'Cap', 'Health', 'State')]
        critical = {'cap': [], 'frag': [], 'health': []}
        warning = {'cap': [], 'frag': []}
        limits = (
            ('cap', critical_cap, warning_cap),
            ('frag', critical_frag, warning_frag),
        )
        for device in vdev_devices:
            device['cap'] = percent_to_float(device['cap'])
            device['frag'] = percent_to_float(device['frag'])
            name = device['device']
            state = 'ok'

            for metric, critical_level, warning_level in limits:
                if device[metric] >= critical_level:
                    critical[metric].append(name)
                    state = 'critical'
                elif device[metric] >= warning_level:
                    warning[metric].append(name)
                    # Never downgrade a device already marked critical.
                    if state == 'ok':
                        state = 'warning'
            if device['health'] != 'ONLINE':
                critical['health'].append(name)
                state = 'critical'

            table_data.append((name, device['size'], device['alloc'], device['free'], float_to_percent(device['frag']), float_to_percent(device['cap']), device['health'], state))

        # De-duplicate while keeping the listing order, so the output is
        # deterministic and a single device is not counted twice.
        crit_drives = list(dict.fromkeys([*critical['cap'], *critical['frag'], *critical['health']]))
        warn_drives = list(dict.fromkeys([*warning['cap'], *warning['frag']]))

        if crit_drives:
            exit_code = nagios.CRITICAL
            if warn_drives:
                info_str = 'multiple issues'
            else:
                info_str = _issue_summary('critical', critical)
            affected = list(dict.fromkeys([*crit_drives, *warn_drives]))
            out_str = ['CRITICAL', '-', info_str, f'for {"drives" if len(affected) > 1 else "drive"}', ', '.join(affected)]
        elif warn_drives:
            exit_code = nagios.WARNING
            info_str = _issue_summary('warning', warning)
            out_str = ['WARNING', '-', info_str, f'for {"drives" if len(warn_drives) > 1 else "drive"}', ', '.join(warn_drives)]
        else:
            exit_code = nagios.OK
            out_str = ['OK', '-', f'{len(vdev_devices)} {args.check_type} devices are healthy']

        print(*out_str)
        print(list_to_markdown_table(table_data, align='left', seperator='!', borders=False))
        sys.exit(exit_code)


def _issue_summary(level, issues):
    """Describe which metrics have entries, e.g. ``'critical capacity and fragmentation'``."""
    labels = (('cap', 'capacity'), ('frag', 'fragmentation'), ('health', 'health'))
    affected = [label for key, label in labels if issues.get(key)]
    return f'{level} ' + ' and '.join(affected)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: any exception escaping main() is reported together
    # with its traceback, and the process exits with nagios.UNKNOWN.
    try:
        main()
    except Exception as exc:
        import traceback

        print(f'UNKNOWN: exception "{exc}"')
        print(traceback.format_exc())
        sys.exit(nagios.UNKNOWN)
|
Loading…
Reference in New Issue