add idrac checkers
This commit is contained in:
parent
a25ec2e08d
commit
17d3fb9fa0
|
@ -2,4 +2,5 @@
|
||||||
|
|
||||||
My custom Icinga2 checks.
|
My custom Icinga2 checks.
|
||||||
|
|
||||||
Useful: https://nagios-plugins.org/doc/guidelines.html#AEN200
|
Useful: https://nagios-plugins.org/doc/guidelines.html#AEN200
|
||||||
|
https://icinga.com/docs/icinga-2/latest/doc/05-service-monitoring/#performance-data-metrics
|
|
@ -6,13 +6,7 @@ import traceback
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
from checker import nagios
|
from checker import nagios
|
||||||
from checker import print_icinga2_check_status
|
from checker.result import quit_check
|
||||||
|
|
||||||
|
|
||||||
def quit_check(text_result, exit_code):
|
|
||||||
print_icinga2_check_status(text_result, exit_code)
|
|
||||||
sys.exit(exit_code)
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
parser = argparse.ArgumentParser(description='Validate Hauk tracking functionality.')
|
parser = argparse.ArgumentParser(description='Validate Hauk tracking functionality.')
|
||||||
|
|
|
@ -0,0 +1,50 @@
|
||||||
|
import argparse
|
||||||
|
import sys
|
||||||
|
import traceback
|
||||||
|
|
||||||
|
from checker import nagios
|
||||||
|
from checker.result import quit_check
|
||||||
|
from checker.snmp import get_snmp_value
|
||||||
|
|
||||||
|
# TODO: support iDRAC 8
|
||||||
|
|
||||||
|
# https://github.com/ilovepancakes95/idrac_snmp-grafana/blob/master/idrac-input.conf
|
||||||
|
FAN_SPEED_OID = '.1.3.6.1.4.1.674.10892.5.4.700.12.1.6.1.'
|
||||||
|
|
||||||
|
|
||||||
|
def main(args):
    """Query each fan's speed over SNMP and grade the slowest one against the thresholds.

    Args:
        args: Parsed CLI namespace providing ``ip``, ``community``, ``fan_num``,
            ``warn`` and ``crit``.

    The check finishes through ``quit_check``: CRIT when the slowest fan is at
    or below ``args.crit``, WARN when it is at or below ``args.warn``, OK
    otherwise, and UNKNOWN when no fan returned a reading at all.
    """
    readings = {}
    for fan_index in range(1, args.fan_num + 1):
        value = get_snmp_value(FAN_SPEED_OID + str(fan_index), args.ip, args.community)
        # Skip only fans that returned nothing. A reading of 0 is a real value
        # (a stalled fan) and must be evaluated, not silently dropped.
        if value is None:
            continue
        readings[f'Fan{fan_index}'] = value

    if not readings:
        # min() on an empty sequence raises ValueError; report UNKNOWN instead.
        quit_check('no fan speed readings were returned', nagios.STATE_UNKNOWN)
        return

    text_result = ', '.join(f'{name}: {value}' for name, value in readings.items())
    perf_data = {
        name: {'value': value, 'warn': args.warn, 'crit': args.crit}
        for name, value in readings.items()
    }

    # Low RPM is the failure mode, so the slowest fan decides the state.
    slowest = min(readings.values())
    if slowest <= args.crit:
        exit_code = nagios.STATE_CRIT
    elif slowest <= args.warn:
        exit_code = nagios.STATE_WARN
    else:
        exit_code = nagios.STATE_OK

    quit_check(text_result, exit_code, perfdata=perf_data)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # CLI entry point for the fan speed check: parse the options, run the
    # check, and turn any unexpected exception into an UNKNOWN result.
    parser = argparse.ArgumentParser(description='Check iDRAC fan speeds over SNMP.')
    parser.add_argument('--ip', required=True, help='The iDRAC IP to query.')
    parser.add_argument('--community', default='public', help='Your SNMP community. Default: public')
    parser.add_argument('--fan-num', type=int, default=1, help='Number of fans. Default: 1')
    parser.add_argument('--warn', type=int, default=840, help='RPM warning level. Default: 840')
    parser.add_argument('--crit', type=int, default=600, help='RPM critical level. Default: 600')
    args = parser.parse_args()
    try:
        main(args)
    except Exception as e:
        print(f"UNKNOWN: exception\n{e}")
        print(traceback.format_exc())
        sys.exit(nagios.STATE_UNKNOWN)
|
|
@ -0,0 +1,52 @@
|
||||||
|
import argparse
|
||||||
|
import sys
|
||||||
|
import traceback
|
||||||
|
|
||||||
|
from checker import nagios
|
||||||
|
from checker.result import quit_check
|
||||||
|
from checker.snmp import get_snmp_value
|
||||||
|
|
||||||
|
# TODO: support iDRAC 8
|
||||||
|
|
||||||
|
# https://github.com/ilovepancakes95/idrac_snmp-grafana/blob/master/idrac-input.conf
|
||||||
|
GLOBAL_SYSTEM_STATUS_OID = '.1.3.6.1.4.1.674.10892.5.2.1.0'
|
||||||
|
|
||||||
|
|
||||||
|
def main(args):
    """Read the global system status over SNMP and map it to a check result.

    Args:
        args: Parsed CLI namespace providing ``ip`` and ``community``.

    The check finishes through ``quit_check``. Values 1-6 map to the fixed
    states below; anything else (including no value at all) is reported as
    UNKNOWN together with the value that was received.
    """
    # Status value -> (exit code, message).
    status_table = {
        1: (nagios.STATE_UNKNOWN, 'status is other'),
        2: (nagios.STATE_UNKNOWN, 'status is unknown'),
        3: (nagios.STATE_OK, 'status is nominal'),
        4: (nagios.STATE_WARN, 'status is non-critical'),
        5: (nagios.STATE_CRIT, 'status is critical'),
        6: (nagios.STATE_CRIT, 'status is non-recoverable'),
    }

    system_status = get_snmp_value(GLOBAL_SYSTEM_STATUS_OID, args.ip, args.community)

    if system_status in status_table:
        exit_code, text_result = status_table[system_status]
    else:
        # An unexpected value is not "critical"; say what was actually received
        # so the message agrees with the UNKNOWN exit code.
        exit_code = nagios.STATE_UNKNOWN
        text_result = f'unrecognized status value: {system_status}'

    quit_check(text_result, exit_code)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # CLI entry point for the global status check: parse the options, run the
    # check, and turn any unexpected exception into an UNKNOWN result.
    parser = argparse.ArgumentParser(description='Check the iDRAC global system status over SNMP.')
    parser.add_argument('--ip', required=True, help='The iDRAC IP to query.')
    parser.add_argument('--community', default='public', help='Your SNMP community. Default: public')
    args = parser.parse_args()
    try:
        main(args)
    except Exception as e:
        print(f"UNKNOWN: exception\n{e}")
        print(traceback.format_exc())
        sys.exit(nagios.STATE_UNKNOWN)
|
|
@ -0,0 +1,89 @@
|
||||||
|
import argparse
|
||||||
|
import sys
|
||||||
|
import traceback
|
||||||
|
|
||||||
|
from checker import nagios
|
||||||
|
from checker.result import quit_check
|
||||||
|
from checker.snmp import get_snmp_value
|
||||||
|
from checker.units import c_to_f
|
||||||
|
|
||||||
|
# TODO: support iDRAC 8
|
||||||
|
|
||||||
|
# https://github.com/ilovepancakes95/idrac_snmp-grafana/blob/master/idrac-input.conf
|
||||||
|
INLET_TEMP_OID = '.1.3.6.1.4.1.674.10892.5.4.700.20.1.6.1.1'
|
||||||
|
EXHAUST_TEMP_OID = '.1.3.6.1.4.1.674.10892.5.4.700.20.1.6.1.2'
|
||||||
|
CPU1_TEMP_OID = '.1.3.6.1.4.1.674.10892.5.4.700.20.1.6.1.3'
|
||||||
|
CPU2_TEMP_OID = '.1.3.6.1.4.1.674.10892.5.4.700.20.1.6.1.4'
|
||||||
|
|
||||||
|
|
||||||
|
def get_snmp_temp(oid, ip, community):
    """Fetch a temperature reading over SNMP and return it in degrees Fahrenheit.

    Args:
        oid: OID of the temperature probe to read.
        ip: Address of the device to query.
        community: SNMP community string.

    Returns:
        The reading converted with ``c_to_f``.

    Raises:
        ValueError: If the device returned no value for ``oid``.
    """
    value = get_snmp_value(oid, ip, community)
    if value is None:
        raise ValueError(f'no temperature reading returned for OID {oid}')
    # The previous code inserted a decimal point before the third digit, i.e.
    # it treated the reading as tenths of a degree Celsius. Dividing by 10 does
    # the same for any number of digits and also works on the integer that
    # get_snmp_value returns (indexing an int raised TypeError).
    return c_to_f(float(value) / 10)
|
||||||
|
|
||||||
|
|
||||||
|
def main(args):
    """Read inlet, exhaust and CPU temperatures and grade them against the thresholds.

    Args:
        args: Parsed CLI namespace providing ``ip``, ``community``, ``cpu_num``
            and the ``*_warn`` / ``*_crit`` thresholds for inlet, exhaust and CPU.

    The worst state across the three sensor groups wins; the hottest CPU
    represents the CPU group. The check finishes through ``quit_check``.
    """
    ip, community = args.ip, args.community

    inlet_temp = get_snmp_temp(INLET_TEMP_OID, ip, community)
    exhaust_temp = get_snmp_temp(EXHAUST_TEMP_OID, ip, community)

    cpu_temps = [get_snmp_temp(CPU1_TEMP_OID, ip, community)]
    if args.cpu_num > 1:
        cpu_temps.append(get_snmp_temp(CPU2_TEMP_OID, ip, community))
    has_second_cpu = len(cpu_temps) > 1

    # (reading, warning threshold, critical threshold) per sensor group.
    graded = (
        (inlet_temp, args.inlet_warn, args.inlet_crit),
        (exhaust_temp, args.exhaust_warn, args.exhaust_crit),
        (max(cpu_temps), args.cpu_warn, args.cpu_crit),
    )
    exit_code = nagios.STATE_OK
    for reading, warn_level, crit_level in graded:
        if reading >= crit_level:
            exit_code = max(nagios.STATE_CRIT, exit_code)
        elif reading >= warn_level:
            exit_code = max(nagios.STATE_WARN, exit_code)

    summary = [f'CPU1: {cpu_temps[0]}']
    if has_second_cpu:
        summary.append(f'CPU2: {cpu_temps[1]}')
    summary.append(f'Inlet: {inlet_temp}')
    summary.append(f'Exhaust: {exhaust_temp}')
    text_result = ', '.join(summary)

    def metric(value, warn_level, crit_level):
        # One perfdata entry; all temperatures are reported in Fahrenheit.
        return {'value': value, 'warn': warn_level, 'crit': crit_level, 'unit': 'F'}

    perf_data = {
        'cpu1': metric(cpu_temps[0], args.cpu_warn, args.cpu_crit),
        'inlet': metric(inlet_temp, args.inlet_warn, args.inlet_crit),
        'exhaust': metric(exhaust_temp, args.exhaust_warn, args.exhaust_crit),
    }
    if has_second_cpu:
        perf_data['cpu2'] = metric(cpu_temps[1], args.cpu_warn, args.cpu_crit)

    quit_check(text_result, exit_code, perfdata=perf_data)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # CLI entry point for the temperature check: parse the options, run the
    # check, and turn any unexpected exception into an UNKNOWN result.
    parser = argparse.ArgumentParser(description='Check iDRAC temperature sensors over SNMP.')
    parser.add_argument('--ip', required=True, help='The iDRAC IP to query.')
    parser.add_argument('--community', default='public', help='Your SNMP community. Default: public')
    parser.add_argument('--cpu-num', type=int, default=1, help='Number of CPU nodes. Default: 1')
    parser.add_argument('--inlet-warn', type=int, default=108, help='System Board Inlet Temp warning level in F. Default: 108')
    parser.add_argument('--inlet-crit', type=int, default=116, help='System Board Inlet Temp critical level in F. Default: 116')
    parser.add_argument('--exhaust-warn', type=int, default=158, help='System Board Exhaust Temp warning level in F. Default: 158')
    parser.add_argument('--exhaust-crit', type=int, default=167, help='System Board Exhaust Temp critical level in F. Default: 167')
    # This help text previously said "critical level" for the warning option.
    parser.add_argument('--cpu-warn', type=int, default=186, help='CPU temp warning level in F. Default: 186')
    parser.add_argument('--cpu-crit', type=int, default=195, help='CPU temp critical level in F. Default: 195')
    args = parser.parse_args()
    try:
        main(args)
    except Exception as e:
        print(f"UNKNOWN: exception\n{e}")
        print(traceback.format_exc())
        sys.exit(nagios.STATE_UNKNOWN)
|
|
@ -4,9 +4,10 @@ WARNING_THRESHOLD=15
|
||||||
CRITICAL_THRESHOLD=25
|
CRITICAL_THRESHOLD=25
|
||||||
AVERAGE_SECONDS=5
|
AVERAGE_SECONDS=5
|
||||||
SHOW_TOP_PROCESSES=false
|
SHOW_TOP_PROCESSES=false
|
||||||
|
PROXMOX_CALCULATION=false
|
||||||
|
|
||||||
# Parse command line arguments
|
# Parse command line arguments
|
||||||
while getopts "w:c:n:t" opt; do
|
while getopts "w:c:n:tph" opt; do
|
||||||
case $opt in
|
case $opt in
|
||||||
w)
|
w)
|
||||||
WARNING_THRESHOLD="$OPTARG"
|
WARNING_THRESHOLD="$OPTARG"
|
||||||
|
@ -20,8 +21,22 @@ while getopts "w:c:n:t" opt; do
|
||||||
t)
|
t)
|
||||||
SHOW_TOP_PROCESSES=true
|
SHOW_TOP_PROCESSES=true
|
||||||
;;
|
;;
|
||||||
|
p)
|
||||||
|
PROXMOX_CALCULATION=true
|
||||||
|
;;
|
||||||
|
h)
|
||||||
|
echo "Usage: check_iowait.sh [-w warning_threshold] [-c critical_threshold] [-n average_seconds] [-t] [-p] [-h]"
|
||||||
|
echo "Options:"
|
||||||
|
echo " -w Set warning threshold"
|
||||||
|
echo " -c Set critical threshold"
|
||||||
|
echo " -n Set average seconds"
|
||||||
|
echo " -t Show top processes"
|
||||||
|
echo " -p Enable iowait calculation similar to Proxmox"
|
||||||
|
echo " -h Print this help message"
|
||||||
|
exit 0
|
||||||
|
;;
|
||||||
\?)
|
\?)
|
||||||
echo "Usage: check_iowait.sh [-w warning_threshold] [-c critical_threshold] [-n average_seconds] [-t]"
|
echo "Usage: check_iowait.sh [-w warning_threshold] [-c critical_threshold] [-n average_seconds] [-t] [-p] [-h]"
|
||||||
exit 1
|
exit 1
|
||||||
;;
|
;;
|
||||||
esac
|
esac
|
||||||
|
@ -40,6 +55,11 @@ if uname | grep -q "BSD"; then
|
||||||
fi
|
fi
|
||||||
else
|
else
|
||||||
iowait=$(iostat -c $AVERAGE_SECONDS 2 | awk 'NR==4 {print $4}')
|
iowait=$(iostat -c $AVERAGE_SECONDS 2 | awk 'NR==4 {print $4}')
|
||||||
|
if $PROXMOX_CALCULATION; then
|
||||||
|
idle=$(iostat -c $AVERAGE_SECONDS 2 | awk 'NR==4 {print $6}')
|
||||||
|
non_idle=$(echo "100 - $idle" | bc -l)
|
||||||
|
iowait=$(echo "$iowait / $non_idle * 100" | bc -l)
|
||||||
|
fi
|
||||||
if $SHOW_TOP_PROCESSES; then
|
if $SHOW_TOP_PROCESSES; then
|
||||||
top_processes=$(pidstat -d -l -u -r 1 1 | awk 'NR>4 {print $1, $NF, $8}' | sort -k3 -nr | head -n 3 | awk '{printf "%s%s", sep, $2; sep=", "} END {print ""}')
|
top_processes=$(pidstat -d -l -u -r 1 1 | awk 'NR>4 {print $1, $NF, $8}' | sort -k3 -nr | head -n 3 | awk '{printf "%s%s", sep, $2; sep=", "} END {print ""}')
|
||||||
fi
|
fi
|
||||||
|
|
|
@ -1,2 +1,2 @@
|
||||||
from .print import print_icinga2_check_status, dict_to_perfdata, create_description_list
|
from .result import print_icinga2_check_status, dict_to_perfdata, create_description_list
|
||||||
from .markdown import list_to_markdown_table
|
from .markdown import list_to_markdown_table
|
||||||
|
|
|
@ -4,7 +4,7 @@ from time import sleep
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
from . import nagios
|
from . import nagios
|
||||||
from .print import print_icinga2_check_status
|
from .result import print_icinga2_check_status
|
||||||
|
|
||||||
|
|
||||||
def fetch_with_retry(url, method: str = 'get', retries=3, delay=1, **kwargs):
|
def fetch_with_retry(url, method: str = 'get', retries=3, delay=1, **kwargs):
|
||||||
|
|
|
@ -1,4 +1,8 @@
|
||||||
# TODO: remove non STATE_ vars
|
# TODO: remove non STATE_ vars
|
||||||
|
import sys
|
||||||
|
|
||||||
|
from checker import print_icinga2_check_status
|
||||||
|
|
||||||
UNKNOWN = STATE_UNKNOWN = -1
|
UNKNOWN = STATE_UNKNOWN = -1
|
||||||
OK = STATE_OK = 0
|
OK = STATE_OK = 0
|
||||||
WARNING = STATE_WARN = 1
|
WARNING = STATE_WARN = 1
|
||||||
|
|
|
@ -1,3 +1,6 @@
|
||||||
|
import sys
|
||||||
|
|
||||||
|
|
||||||
def create_description_list(data: list):
|
def create_description_list(data: list):
|
||||||
"""
|
"""
|
||||||
Create a Description List HTML element based on the input list.
|
Create a Description List HTML element based on the input list.
|
||||||
|
@ -107,3 +110,8 @@ def print_icinga2_check_status(text_result: str, return_code: int, perfdata=None
|
||||||
status = status_codes[return_code]
|
status = status_codes[return_code]
|
||||||
perfdata_str = f' | {dict_to_perfdata(perfdata)}' if perfdata else ''
|
perfdata_str = f' | {dict_to_perfdata(perfdata)}' if perfdata else ''
|
||||||
print(f"{status} - {text_result.strip()}{perfdata_str}")
|
print(f"{status} - {text_result.strip()}{perfdata_str}")
|
||||||
|
|
||||||
|
|
||||||
|
def quit_check(text_result: str, exit_code: int, perfdata=None):
    """Print the check status line and terminate the process.

    Args:
        text_result: Human-readable result text for the status line.
        exit_code: Status code to report; also used as the process exit code.
        perfdata: Optional performance data passed through to the status printer.

    Raises:
        SystemExit: Always, carrying ``exit_code``.
    """
    print_icinga2_check_status(
        text_result,
        exit_code,
        perfdata=perfdata,
    )
    # Same effect as sys.exit(exit_code), which raises SystemExit itself.
    raise SystemExit(exit_code)
|
|
@ -0,0 +1,25 @@
|
||||||
|
import pysnmp
|
||||||
|
from pysnmp.hlapi import *
|
||||||
|
|
||||||
|
|
||||||
|
def get_snmp_value(oid, ip, community):
    """Perform a single SNMP GET against ``ip`` on UDP port 161.

    Args:
        oid: OID to read.
        ip: Address of the device to query.
        community: SNMP community string.

    Returns:
        The value as ``int`` when the response is an SNMP Integer. ``None``
        when the agent reports that the object does not exist, or when the
        value is of any other type.

    Raises:
        RuntimeError: If the request fails or the agent returns an error status.
    """
    errorIndication, errorStatus, errorIndex, varBinds = next(
        getCmd(SnmpEngine(),
               CommunityData(community),
               UdpTransportTarget((ip, 161)),
               ContextData(),
               ObjectType(ObjectIdentity(oid)))
    )

    # The previous bare `raise` ran outside any `except` block, so it only
    # produced "RuntimeError: No active exception to reraise", and the real
    # error was printed to stdout ahead of the plugin's status line. Raise the
    # same exception type explicitly with the error text as its message.
    if errorIndication:
        raise RuntimeError(f'SNMP request to {ip} failed: {errorIndication}')
    if errorStatus:
        failed_at = varBinds[int(errorIndex) - 1][0] if errorIndex else '?'
        raise RuntimeError('%s at %s' % (errorStatus.prettyPrint(), failed_at))

    value = varBinds[0][1]
    if isinstance(value, pysnmp.proto.rfc1905.NoSuchObject):
        return None
    if isinstance(value, pysnmp.proto.rfc1902.Integer):
        return int(value)
    # NOTE(review): non-Integer values (strings, gauges, ...) have always come
    # back as None here; made explicit. Confirm whether callers need them.
    return None
|
|
@ -57,3 +57,7 @@ def human_readable_size(size: Union[int, float], bits=False, decimal_places: int
|
||||||
size = round(size, decimal_places)
|
size = round(size, decimal_places)
|
||||||
|
|
||||||
return f'{size} {units[bits][base][exp]}'
|
return f'{size} {units[bits][base][exp]}'
|
||||||
|
|
||||||
|
|
||||||
|
def c_to_f(c):
    """Convert a temperature from degrees Celsius to degrees Fahrenheit.

    Args:
        c: Temperature in degrees Celsius.

    Returns:
        The temperature in degrees Fahrenheit, rounded to two decimal places.
    """
    fahrenheit = c * 1.8 + 32
    return round(fahrenheit, 2)
|
||||||
|
|
Loading…
Reference in New Issue