add idrac checkers
This commit is contained in:
parent
a25ec2e08d
commit
17d3fb9fa0
|
@ -3,3 +3,4 @@
|
|||
My custom Icinga2 checks.
|
||||
|
||||
Useful: https://nagios-plugins.org/doc/guidelines.html#AEN200
|
||||
https://icinga.com/docs/icinga-2/latest/doc/05-service-monitoring/#performance-data-metrics
|
|
@ -6,13 +6,7 @@ import traceback
|
|||
import requests
|
||||
|
||||
from checker import nagios
|
||||
from checker import print_icinga2_check_status
|
||||
|
||||
|
||||
def quit_check(text_result, exit_code):
|
||||
print_icinga2_check_status(text_result, exit_code)
|
||||
sys.exit(exit_code)
|
||||
|
||||
from checker.result import quit_check
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description='Validate Hauk tracking functionality.')
|
||||
|
|
|
@ -0,0 +1,50 @@
|
|||
import argparse
|
||||
import sys
|
||||
import traceback
|
||||
|
||||
from checker import nagios
|
||||
from checker.result import quit_check
|
||||
from checker.snmp import get_snmp_value
|
||||
|
||||
# TODO: support iDRAC 8
|
||||
|
||||
# https://github.com/ilovepancakes95/idrac_snmp-grafana/blob/master/idrac-input.conf
|
||||
FAN_SPEED_OID = '.1.3.6.1.4.1.674.10892.5.4.700.12.1.6.1.'
|
||||
|
||||
|
||||
def main(args):
    """Check iDRAC fan speeds over SNMP and exit with a Nagios status.

    Queries ``FAN_SPEED_OID`` + fan index for fans 1..``args.fan_num`` and
    compares the slowest reported speed against ``args.crit`` / ``args.warn``
    (lower is worse). Never returns normally: ``quit_check`` prints the
    status line and exits.

    :param args: parsed CLI namespace with ``ip``, ``community``,
        ``fan_num``, ``warn`` and ``crit``.
    """
    fan_speeds = []
    text_parts = []
    perf_data = {}

    for fan_index in range(1, args.fan_num + 1):
        value = get_snmp_value(FAN_SPEED_OID + str(fan_index), args.ip, args.community)
        # Skip only fans the iDRAC did not report at all. A stalled fan reads
        # 0 RPM, which is falsy but must still count towards the thresholds.
        if value is None:
            continue
        fan_speeds.append(value)
        text_parts.append(f'Fan{fan_index}: {value}')
        perf_data[f'Fan{fan_index}'] = {'value': value, 'warn': args.warn, 'crit': args.crit}

    if not fan_speeds:
        # min() on an empty list would raise; report the condition explicitly.
        quit_check('no fan speeds were returned', nagios.STATE_UNKNOWN)
        return

    slowest = min(fan_speeds)
    if slowest <= args.crit:
        exit_code = nagios.STATE_CRIT
    elif slowest <= args.warn:
        exit_code = nagios.STATE_WARN
    else:
        exit_code = nagios.STATE_OK

    quit_check(', '.join(text_parts), exit_code, perfdata=perf_data)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # CLI entry point for the fan-speed check.
    parser = argparse.ArgumentParser(description='Check iDRAC fan speeds over SNMP.')
    parser.add_argument('--ip', required=True, help='The iDRAC IP to query.')
    parser.add_argument('--community', default='public', help='Your SNMP community. Default: public')
    parser.add_argument('--fan-num', type=int, default=1, help='Number of fans. Default: 1')
    parser.add_argument('--warn', type=int, default=840, help='RPM warning level. Default: 840')
    parser.add_argument('--crit', type=int, default=600, help='RPM critical level. Default: 600')
    args = parser.parse_args()
    try:
        main(args)
    except Exception as e:
        # Any unexpected failure is reported as UNKNOWN with the traceback
        # attached. SystemExit raised via sys.exit() is not an Exception
        # subclass, so a normal exit passes through untouched.
        print(f"UNKNOWN: exception\n{e}")
        print(traceback.format_exc())
        sys.exit(nagios.STATE_UNKNOWN)
|
|
@ -0,0 +1,52 @@
|
|||
import argparse
|
||||
import sys
|
||||
import traceback
|
||||
|
||||
from checker import nagios
|
||||
from checker.result import quit_check
|
||||
from checker.snmp import get_snmp_value
|
||||
|
||||
# TODO: support iDRAC 8
|
||||
|
||||
# https://github.com/ilovepancakes95/idrac_snmp-grafana/blob/master/idrac-input.conf
|
||||
GLOBAL_SYSTEM_STATUS_OID = '.1.3.6.1.4.1.674.10892.5.2.1.0'
|
||||
|
||||
|
||||
def main(args):
    """Report the iDRAC global system status as a Nagios state.

    Reads ``GLOBAL_SYSTEM_STATUS_OID`` and maps the numeric status to an exit
    code and message. Any value outside 1-6 (including ``None``) is reported
    as UNKNOWN together with the value that was received. Never returns
    normally: ``quit_check`` prints the status line and exits.

    :param args: parsed CLI namespace with ``ip`` and ``community``.
    """
    status_table = {
        1: (nagios.STATE_UNKNOWN, 'status is other'),
        2: (nagios.STATE_UNKNOWN, 'status is unknown'),
        3: (nagios.STATE_OK, 'status is nominal'),
        4: (nagios.STATE_WARN, 'status is non-critical'),
        5: (nagios.STATE_CRIT, 'status is critical'),
        6: (nagios.STATE_CRIT, 'status is non-recoverable'),
    }

    system_status = get_snmp_value(GLOBAL_SYSTEM_STATUS_OID, args.ip, args.community)
    # The fallback used to say "critical" while exiting UNKNOWN; make the
    # message match the state and show what was actually received.
    fallback = (nagios.STATE_UNKNOWN, f'unexpected status value: {system_status}')
    exit_code, text_result = status_table.get(system_status, fallback)

    quit_check(text_result, exit_code)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # CLI entry point for the global system status check.
    parser = argparse.ArgumentParser(description='Check the iDRAC global system status over SNMP.')
    parser.add_argument('--ip', required=True, help='The iDRAC IP to query.')
    parser.add_argument('--community', default='public', help='Your SNMP community. Default: public')
    args = parser.parse_args()
    try:
        main(args)
    except Exception as e:
        # Any unexpected failure is reported as UNKNOWN with the traceback
        # attached. SystemExit raised via sys.exit() is not an Exception
        # subclass, so a normal exit passes through untouched.
        print(f"UNKNOWN: exception\n{e}")
        print(traceback.format_exc())
        sys.exit(nagios.STATE_UNKNOWN)
|
|
@ -0,0 +1,89 @@
|
|||
import argparse
|
||||
import sys
|
||||
import traceback
|
||||
|
||||
from checker import nagios
|
||||
from checker.result import quit_check
|
||||
from checker.snmp import get_snmp_value
|
||||
from checker.units import c_to_f
|
||||
|
||||
# TODO: support iDRAC 8
|
||||
|
||||
# https://github.com/ilovepancakes95/idrac_snmp-grafana/blob/master/idrac-input.conf
|
||||
INLET_TEMP_OID = '.1.3.6.1.4.1.674.10892.5.4.700.20.1.6.1.1'
|
||||
EXHAUST_TEMP_OID = '.1.3.6.1.4.1.674.10892.5.4.700.20.1.6.1.2'
|
||||
CPU1_TEMP_OID = '.1.3.6.1.4.1.674.10892.5.4.700.20.1.6.1.3'
|
||||
CPU2_TEMP_OID = '.1.3.6.1.4.1.674.10892.5.4.700.20.1.6.1.4'
|
||||
|
||||
|
||||
def get_snmp_temp(oid, ip, community):
    """Fetch a temperature probe reading and return it in Fahrenheit.

    The raw reading is treated as tenths of a degree Celsius, matching the
    two-digits-then-one-decimal split this function previously did by string
    indexing (NOTE(review): confirm against the Dell MIB for the probe OIDs).

    :param oid: full OID of the temperature probe reading.
    :param ip: iDRAC address to query.
    :param community: SNMP community string.
    :return: temperature in Fahrenheit, as returned by ``c_to_f``.
    :raises ValueError: if no reading was returned for ``oid``.
    """
    value = get_snmp_value(oid, ip, community)
    if value is None:
        raise ValueError(f'no temperature reading returned for OID {oid}')
    # get_snmp_value returns an int, so scale arithmetically instead of
    # indexing digits (which fails on ints and on non-three-digit readings).
    return c_to_f(float(value) / 10)
|
||||
|
||||
|
||||
def main(args):
    """Check iDRAC inlet, exhaust and CPU temperatures and exit with a status.

    Reads the inlet and exhaust probes, CPU1, and CPU2 when
    ``args.cpu_num > 1``. Each reading is compared against its own
    warn/crit pair (higher is worse); the hottest CPU is used for the CPU
    thresholds. The worst resulting state becomes the exit code. Never
    returns normally: ``quit_check`` prints the status line and exits.

    :param args: parsed CLI namespace with ``ip``, ``community``, ``cpu_num``
        and the ``*_warn`` / ``*_crit`` thresholds.
    """
    ip, community = args.ip, args.community

    inlet_temp = get_snmp_temp(INLET_TEMP_OID, ip, community)
    exhaust_temp = get_snmp_temp(EXHAUST_TEMP_OID, ip, community)
    cpu_temps = [get_snmp_temp(CPU1_TEMP_OID, ip, community)]
    if args.cpu_num > 1:
        cpu_temps.append(get_snmp_temp(CPU2_TEMP_OID, ip, community))
    has_second_cpu = len(cpu_temps) > 1

    # (reading, warn, crit) for every sensor group that is evaluated.
    checks = (
        (inlet_temp, args.inlet_warn, args.inlet_crit),
        (exhaust_temp, args.exhaust_warn, args.exhaust_crit),
        (max(cpu_temps), args.cpu_warn, args.cpu_crit),
    )
    exit_code = nagios.STATE_OK
    for reading, warn, crit in checks:
        if reading >= crit:
            exit_code = max(nagios.STATE_CRIT, exit_code)
        elif reading >= warn:
            exit_code = max(nagios.STATE_WARN, exit_code)

    cpu2_text = f', CPU2: {cpu_temps[1]}' if has_second_cpu else ''
    text_result = f'CPU1: {cpu_temps[0]}' + cpu2_text + f', Inlet: {inlet_temp}, Exhaust: {exhaust_temp}'

    def perf_entry(value, warn, crit):
        # One perfdata record; every temperature here is reported in F.
        return {'value': value, 'warn': warn, 'crit': crit, 'unit': 'F'}

    perf_data = {
        'cpu1': perf_entry(cpu_temps[0], args.cpu_warn, args.cpu_crit),
        'inlet': perf_entry(inlet_temp, args.inlet_warn, args.inlet_crit),
        'exhaust': perf_entry(exhaust_temp, args.exhaust_warn, args.exhaust_crit),
    }
    if has_second_cpu:
        perf_data['cpu2'] = perf_entry(cpu_temps[1], args.cpu_warn, args.cpu_crit)

    quit_check(text_result, exit_code, perfdata=perf_data)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # CLI entry point for the temperature check; all thresholds are in F.
    parser = argparse.ArgumentParser(description='Check iDRAC inlet, exhaust and CPU temperatures over SNMP.')
    parser.add_argument('--ip', required=True, help='The iDRAC IP to query.')
    parser.add_argument('--community', default='public', help='Your SNMP community. Default: public')
    parser.add_argument('--cpu-num', type=int, default=1, help='Number of CPU nodes. Default: 1')
    parser.add_argument('--inlet-warn', type=int, default=108, help='System Board Inlet Temp warning level in F. Default: 108')
    parser.add_argument('--inlet-crit', type=int, default=116, help='System Board Inlet Temp critical level in F. Default: 116')
    parser.add_argument('--exhaust-warn', type=int, default=158, help='System Board Exhaust Temp warning level in F. Default: 158')
    parser.add_argument('--exhaust-crit', type=int, default=167, help='System Board Exhaust Temp critical level in F. Default: 167')
    # This help text previously said "critical" for the warning threshold.
    parser.add_argument('--cpu-warn', type=int, default=186, help='CPU temp warning level in F. Default: 186')
    parser.add_argument('--cpu-crit', type=int, default=195, help='CPU temp critical level in F. Default: 195')
    args = parser.parse_args()
    try:
        main(args)
    except Exception as e:
        # Any unexpected failure is reported as UNKNOWN with the traceback
        # attached. SystemExit raised via sys.exit() is not an Exception
        # subclass, so a normal exit passes through untouched.
        print(f"UNKNOWN: exception\n{e}")
        print(traceback.format_exc())
        sys.exit(nagios.STATE_UNKNOWN)
|
|
@ -4,9 +4,10 @@ WARNING_THRESHOLD=15
|
|||
CRITICAL_THRESHOLD=25
|
||||
AVERAGE_SECONDS=5
|
||||
SHOW_TOP_PROCESSES=false
|
||||
PROXMOX_CALCULATION=false
|
||||
|
||||
# Parse command line arguments
|
||||
while getopts "w:c:n:t" opt; do
|
||||
while getopts "w:c:n:tph" opt; do
|
||||
case $opt in
|
||||
w)
|
||||
WARNING_THRESHOLD="$OPTARG"
|
||||
|
@ -20,8 +21,22 @@ while getopts "w:c:n:t" opt; do
|
|||
t)
|
||||
SHOW_TOP_PROCESSES=true
|
||||
;;
|
||||
p)
|
||||
PROXMOX_CALCULATION=true
|
||||
;;
|
||||
h)
|
||||
echo "Usage: check_iowait.sh [-w warning_threshold] [-c critical_threshold] [-n average_seconds] [-t] [-p] [-h]"
|
||||
echo "Options:"
|
||||
echo " -w Set warning threshold"
|
||||
echo " -c Set critical threshold"
|
||||
echo " -n Set average seconds"
|
||||
echo " -t Show top processes"
|
||||
echo " -p Enable iowait calculation similar to Proxmox"
|
||||
echo " -h Print this help message"
|
||||
exit 0
|
||||
;;
|
||||
\?)
|
||||
echo "Usage: check_iowait.sh [-w warning_threshold] [-c critical_threshold] [-n average_seconds] [-t]"
|
||||
echo "Usage: check_iowait.sh [-w warning_threshold] [-c critical_threshold] [-n average_seconds] [-t] [-p] [-h]"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
|
@ -40,6 +55,11 @@ if uname | grep -q "BSD"; then
|
|||
fi
|
||||
else
|
||||
iowait=$(iostat -c $AVERAGE_SECONDS 2 | awk 'NR==4 {print $4}')
|
||||
if $PROXMOX_CALCULATION; then
|
||||
idle=$(iostat -c $AVERAGE_SECONDS 2 | awk 'NR==4 {print $6}')
|
||||
non_idle=$(echo "100 - $idle" | bc -l)
|
||||
iowait=$(echo "$iowait / $non_idle * 100" | bc -l)
|
||||
fi
|
||||
if $SHOW_TOP_PROCESSES; then
|
||||
top_processes=$(pidstat -d -l -u -r 1 1 | awk 'NR>4 {print $1, $NF, $8}' | sort -k3 -nr | head -n 3 | awk '{printf "%s%s", sep, $2; sep=", "} END {print ""}')
|
||||
fi
|
||||
|
|
|
@ -1,2 +1,2 @@
|
|||
from .print import print_icinga2_check_status, dict_to_perfdata, create_description_list
|
||||
from .result import print_icinga2_check_status, dict_to_perfdata, create_description_list
|
||||
from .markdown import list_to_markdown_table
|
||||
|
|
|
@ -4,7 +4,7 @@ from time import sleep
|
|||
import requests
|
||||
|
||||
from . import nagios
|
||||
from .print import print_icinga2_check_status
|
||||
from .result import print_icinga2_check_status
|
||||
|
||||
|
||||
def fetch_with_retry(url, method: str = 'get', retries=3, delay=1, **kwargs):
|
||||
|
|
|
@ -1,4 +1,8 @@
|
|||
# TODO: remove non STATE_ vars
|
||||
import sys
|
||||
|
||||
from checker import print_icinga2_check_status
|
||||
|
||||
UNKNOWN = STATE_UNKNOWN = -1
|
||||
OK = STATE_OK = 0
|
||||
WARNING = STATE_WARN = 1
|
||||
|
|
|
@ -1,3 +1,6 @@
|
|||
import sys
|
||||
|
||||
|
||||
def create_description_list(data: list):
|
||||
"""
|
||||
Create a Description List HTML element based on the input list.
|
||||
|
@ -107,3 +110,8 @@ def print_icinga2_check_status(text_result: str, return_code: int, perfdata=None
|
|||
status = status_codes[return_code]
|
||||
perfdata_str = f' | {dict_to_perfdata(perfdata)}' if perfdata else ''
|
||||
print(f"{status} - {text_result.strip()}{perfdata_str}")
|
||||
|
||||
|
||||
def quit_check(text_result: str, exit_code: int, perfdata=None):
    """Print the check status line, then terminate the process.

    :param text_result: human-readable check output.
    :param exit_code: status code, used both for the printed status and as
        the process exit code.
    :param perfdata: optional perfdata mapping forwarded to
        ``print_icinga2_check_status``.
    """
    print_icinga2_check_status(text_result, exit_code, perfdata=perfdata)
    # Equivalent to sys.exit(exit_code), which raises SystemExit itself.
    raise SystemExit(exit_code)
|
|
@ -0,0 +1,25 @@
|
|||
import pysnmp
|
||||
from pysnmp.hlapi import *
|
||||
|
||||
|
||||
def get_snmp_value(oid, ip, community):
    """Perform a single SNMP GET against ``ip`` on UDP port 161.

    :param oid: OID to fetch.
    :param ip: address of the SNMP agent.
    :param community: SNMP community string.
    :return: the value as ``int`` when it is an ``rfc1902.Integer``;
        ``None`` when the agent answers ``NoSuchObject`` or returns any
        other value type.
    :raises RuntimeError: if the request fails or the agent reports an
        error status. The message carries the SNMP error details.
    """
    error_indication, error_status, error_index, var_binds = next(
        getCmd(
            SnmpEngine(),
            CommunityData(community),
            UdpTransportTarget((ip, 161)),
            ContextData(),
            ObjectType(ObjectIdentity(oid)),
        )
    )

    # A bare `raise` outside an except block only yields
    # "No active exception to re-raise"; raise with the real cause instead.
    # Nothing is printed here so that the caller's status line stays first
    # in the plugin output.
    if error_indication:
        raise RuntimeError(f'SNMP request to {ip} failed: {error_indication}')
    if error_status:
        failed_at = (error_index and var_binds[int(error_index) - 1][0]) or '?'
        raise RuntimeError(f'{error_status.prettyPrint()} at {failed_at}')

    value = var_binds[0][1]
    if isinstance(value, pysnmp.proto.rfc1905.NoSuchObject):
        return None
    if isinstance(value, pysnmp.proto.rfc1902.Integer):
        return int(value)
    # NOTE(review): non-integer values (strings, gauges, ...) are not
    # converted and come back as None, as before — extend here if needed.
    return None
|
|
@ -57,3 +57,7 @@ def human_readable_size(size: Union[int, float], bits=False, decimal_places: int
|
|||
size = round(size, decimal_places)
|
||||
|
||||
return f'{size} {units[bits][base][exp]}'
|
||||
|
||||
|
||||
def c_to_f(c):
    """Convert degrees Celsius to Fahrenheit, rounded to two decimals."""
    fahrenheit = c * 1.8 + 32
    return round(fahrenheit, 2)
|
||||
|
|
Loading…
Reference in New Issue