add idrac checkers

This commit is contained in:
Cyberes 2023-11-26 23:15:27 -07:00
parent a25ec2e08d
commit 17d3fb9fa0
12 changed files with 259 additions and 12 deletions

View File

@ -3,3 +3,4 @@
My custom Icinga2 checks.
Useful: https://nagios-plugins.org/doc/guidelines.html#AEN200
https://icinga.com/docs/icinga-2/latest/doc/05-service-monitoring/#performance-data-metrics

View File

@ -6,13 +6,7 @@ import traceback
import requests
from checker import nagios
from checker import print_icinga2_check_status
def quit_check(text_result, exit_code):
    """Print the Icinga2 status line, then end the process with ``exit_code``."""
    print_icinga2_check_status(text_result, exit_code)
    # Equivalent to sys.exit(exit_code): both raise SystemExit with this code.
    raise SystemExit(exit_code)
from checker.result import quit_check
def main():
parser = argparse.ArgumentParser(description='Validate Hauk tracking functionality.')

50
check_idrac_fans.py Normal file
View File

@ -0,0 +1,50 @@
import argparse
import sys
import traceback
from checker import nagios
from checker.result import quit_check
from checker.snmp import get_snmp_value
# TODO: support iDRAC 8
# https://github.com/ilovepancakes95/idrac_snmp-grafana/blob/master/idrac-input.conf
# OID prefix for fan speed readings. It deliberately ends with a dot: the
# 1-based fan number is appended to it to form the full OID of each fan.
FAN_SPEED_OID = '.1.3.6.1.4.1.674.10892.5.4.700.12.1.6.1.'
def main(args):
    """Query every fan's speed over SNMP and exit with the matching Nagios state.

    Fans ``1..args.fan_num`` are read from ``args.ip`` using ``args.community``.
    The slowest fan decides the result: at or below ``args.crit`` is CRITICAL,
    at or below ``args.warn`` is WARNING, otherwise OK. If no fan returns a
    reading the check exits UNKNOWN.

    This function does not return; ``quit_check`` prints the result and exits.
    """
    perf_data = {}
    fan_speeds = []
    summaries = []
    for fan_index in range(1, args.fan_num + 1):
        value = get_snmp_value(FAN_SPEED_OID + str(fan_index), args.ip, args.community)
        # Skip only fans that returned no reading at all. A stopped fan reports
        # 0 RPM, which is falsy but must still be checked against thresholds.
        if value is None:
            continue
        fan_speeds.append(value)
        summaries.append(f'Fan{fan_index}: {value}')
        perf_data[f'Fan{fan_index}'] = {'value': value, 'warn': args.warn, 'crit': args.crit}

    if not fan_speeds:
        # min() of an empty list raises ValueError; report the situation instead.
        quit_check('no fan readings returned', nagios.STATE_UNKNOWN)

    slowest = min(fan_speeds)
    if slowest <= args.crit:
        exit_code = nagios.STATE_CRIT
    elif slowest <= args.warn:
        exit_code = nagios.STATE_WARN
    else:
        exit_code = nagios.STATE_OK

    quit_check(', '.join(summaries), exit_code, perfdata=perf_data)
if __name__ == "__main__":
    # Command-line entry point: parse the options, run the check, and turn any
    # unexpected exception into an UNKNOWN result instead of a bare traceback.
    parser = argparse.ArgumentParser(description='Check iDRAC fan speeds via SNMP.')
    parser.add_argument('--ip', required=True, help='The iDRAC IP to query.')
    parser.add_argument('--community', default='public', help='Your SNMP community. Default: public')
    parser.add_argument('--fan-num', type=int, default=1, help='Number of fans. Default: 1')
    parser.add_argument('--warn', type=int, default=840, help='RPM warning level. Default: 840')
    parser.add_argument('--crit', type=int, default=600, help='RPM critical level. Default: 600')
    args = parser.parse_args()

    try:
        main(args)
    except Exception as e:
        print(f"UNKNOWN: exception\n{e}")
        print(traceback.format_exc())
        sys.exit(nagios.STATE_UNKNOWN)

52
check_idrac_status.py Normal file
View File

@ -0,0 +1,52 @@
import argparse
import sys
import traceback
from checker import nagios
from checker.result import quit_check
from checker.snmp import get_snmp_value
# TODO: support iDRAC 8
# https://github.com/ilovepancakes95/idrac_snmp-grafana/blob/master/idrac-input.conf
# OID read by main() to obtain the numeric global system status code.
GLOBAL_SYSTEM_STATUS_OID = '.1.3.6.1.4.1.674.10892.5.2.1.0'
def main(args):
    """Read the global system status over SNMP and exit with the mapped Nagios state.

    The numeric status returned for ``GLOBAL_SYSTEM_STATUS_OID`` is translated
    through a lookup table. Any value outside the table (including a missing
    reading) is reported as UNKNOWN together with the raw value.

    This function does not return; ``quit_check`` prints the result and exits.
    """
    status_table = {
        1: (nagios.STATE_UNKNOWN, 'status is other'),
        2: (nagios.STATE_UNKNOWN, 'status is unknown'),
        3: (nagios.STATE_OK, 'status is nominal'),
        4: (nagios.STATE_WARN, 'status is non-critical'),
        5: (nagios.STATE_CRIT, 'status is critical'),
        6: (nagios.STATE_CRIT, 'status is non-recoverable'),
    }

    system_status = get_snmp_value(GLOBAL_SYSTEM_STATUS_OID, args.ip, args.community)

    # The fallback text names the unexpected value; it must not claim
    # "critical" while the exit code says UNKNOWN.
    fallback = (nagios.STATE_UNKNOWN, f'unrecognized status value: {system_status}')
    exit_code, text_result = status_table.get(system_status, fallback)

    quit_check(text_result, exit_code)
if __name__ == "__main__":
    # Command-line entry point: parse the options, run the check, and turn any
    # unexpected exception into an UNKNOWN result instead of a bare traceback.
    parser = argparse.ArgumentParser(description='Check the iDRAC global system status via SNMP.')
    parser.add_argument('--ip', required=True, help='The iDRAC IP to query.')
    parser.add_argument('--community', default='public', help='Your SNMP community. Default: public')
    args = parser.parse_args()

    try:
        main(args)
    except Exception as e:
        print(f"UNKNOWN: exception\n{e}")
        print(traceback.format_exc())
        sys.exit(nagios.STATE_UNKNOWN)

89
check_idrac_temps.py Normal file
View File

@ -0,0 +1,89 @@
import argparse
import sys
import traceback
from checker import nagios
from checker.result import quit_check
from checker.snmp import get_snmp_value
from checker.units import c_to_f
# TODO: support iDRAC 8
# https://github.com/ilovepancakes95/idrac_snmp-grafana/blob/master/idrac-input.conf
# Temperature reading OIDs. They share one prefix and differ only in the last
# component (1-4), which this script maps to inlet, exhaust, CPU1 and CPU2.
INLET_TEMP_OID = '.1.3.6.1.4.1.674.10892.5.4.700.20.1.6.1.1'
EXHAUST_TEMP_OID = '.1.3.6.1.4.1.674.10892.5.4.700.20.1.6.1.2'
CPU1_TEMP_OID = '.1.3.6.1.4.1.674.10892.5.4.700.20.1.6.1.3'
CPU2_TEMP_OID = '.1.3.6.1.4.1.674.10892.5.4.700.20.1.6.1.4'
def get_snmp_temp(oid, ip, community):
    """Read one temperature over SNMP and return it in degrees Fahrenheit.

    The raw reading is interpreted as tenths of a degree Celsius
    (for example ``235`` means 23.5 C) and converted with ``c_to_f``.

    Raises:
        ValueError: if no reading is returned for ``oid``.
    """
    value = get_snmp_value(oid, ip, community)
    if value is None:
        raise ValueError(f'no temperature reading returned for OID {oid}')
    # Scale numerically rather than slicing characters: this accepts an int as
    # well as a numeric string, and stays correct for negative readings and
    # for readings that are not exactly three digits long.
    return c_to_f(float(value) / 10)
def main(args):
    """Check inlet, exhaust and CPU temperatures and exit with the worst state.

    Each reading is compared against its own warning/critical thresholds from
    ``args``; for CPUs the hottest one is used. A second CPU is only queried
    when ``args.cpu_num`` is greater than 1. All temperatures are in Fahrenheit.

    This function does not return; ``quit_check`` prints the result and exits.
    """
    ip, community = args.ip, args.community

    def metric(reading, warn, crit):
        # One perfdata entry; every temperature here is reported in Fahrenheit.
        return {'value': reading, 'warn': warn, 'crit': crit, 'unit': 'F'}

    inlet_temp = get_snmp_temp(INLET_TEMP_OID, ip, community)
    exhaust_temp = get_snmp_temp(EXHAUST_TEMP_OID, ip, community)
    cpu_temps = [get_snmp_temp(CPU1_TEMP_OID, ip, community)]
    if args.cpu_num > 1:
        cpu_temps.append(get_snmp_temp(CPU2_TEMP_OID, ip, community))
    dual_cpu = len(cpu_temps) > 1

    # Escalate the exit code to the worst state seen across all three checks.
    exit_code = nagios.STATE_OK
    checks = (
        (inlet_temp, args.inlet_warn, args.inlet_crit),
        (exhaust_temp, args.exhaust_warn, args.exhaust_crit),
        (max(cpu_temps), args.cpu_warn, args.cpu_crit),
    )
    for reading, warn, crit in checks:
        if reading >= crit:
            exit_code = max(nagios.STATE_CRIT, exit_code)
        elif reading >= warn:
            exit_code = max(nagios.STATE_WARN, exit_code)

    parts = [f'CPU1: {cpu_temps[0]}']
    if dual_cpu:
        parts.append(f'CPU2: {cpu_temps[1]}')
    parts.append(f'Inlet: {inlet_temp}')
    parts.append(f'Exhaust: {exhaust_temp}')
    text_result = ', '.join(parts)

    perf_data = {
        'cpu1': metric(cpu_temps[0], args.cpu_warn, args.cpu_crit),
        'inlet': metric(inlet_temp, args.inlet_warn, args.inlet_crit),
        'exhaust': metric(exhaust_temp, args.exhaust_warn, args.exhaust_crit),
    }
    if dual_cpu:
        perf_data['cpu2'] = metric(cpu_temps[1], args.cpu_warn, args.cpu_crit)

    quit_check(text_result, exit_code, perfdata=perf_data)
if __name__ == "__main__":
    # Command-line entry point: parse the options, run the check, and turn any
    # unexpected exception into an UNKNOWN result instead of a bare traceback.
    parser = argparse.ArgumentParser(description='Check iDRAC temperature readings via SNMP.')
    parser.add_argument('--ip', required=True, help='The iDRAC IP to query.')
    parser.add_argument('--community', default='public', help='Your SNMP community. Default: public')
    parser.add_argument('--cpu-num', type=int, default=1, help='Number of CPU nodes. Default: 1')
    parser.add_argument('--inlet-warn', type=int, default=108, help='System Board Inlet Temp warning level in F. Default: 108')
    parser.add_argument('--inlet-crit', type=int, default=116, help='System Board Inlet Temp critical level in F. Default: 116')
    parser.add_argument('--exhaust-warn', type=int, default=158, help='System Board Exhaust Temp warning level in F. Default: 158')
    parser.add_argument('--exhaust-crit', type=int, default=167, help='System Board Exhaust Temp critical level in F. Default: 167')
    parser.add_argument('--cpu-warn', type=int, default=186, help='CPU temp warning level in F. Default: 186')
    parser.add_argument('--cpu-crit', type=int, default=195, help='CPU temp critical level in F. Default: 195')
    args = parser.parse_args()

    try:
        main(args)
    except Exception as e:
        print(f"UNKNOWN: exception\n{e}")
        print(traceback.format_exc())
        sys.exit(nagios.STATE_UNKNOWN)

View File

@ -4,9 +4,10 @@ WARNING_THRESHOLD=15
CRITICAL_THRESHOLD=25
AVERAGE_SECONDS=5
SHOW_TOP_PROCESSES=false
PROXMOX_CALCULATION=false
# Parse command line arguments
while getopts "w:c:n:t" opt; do
while getopts "w:c:n:tph" opt; do
case $opt in
w)
WARNING_THRESHOLD="$OPTARG"
@ -20,8 +21,22 @@ while getopts "w:c:n:t" opt; do
t)
SHOW_TOP_PROCESSES=true
;;
p)
PROXMOX_CALCULATION=true
;;
h)
echo "Usage: check_iowait.sh [-w warning_threshold] [-c critical_threshold] [-n average_seconds] [-t] [-p] [-h]"
echo "Options:"
echo " -w Set warning threshold"
echo " -c Set critical threshold"
echo " -n Set average seconds"
echo " -t Show top processes"
echo " -p Enable iowait calculation similar to Proxmox"
echo " -h Print this help message"
exit 0
;;
\?)
echo "Usage: check_iowait.sh [-w warning_threshold] [-c critical_threshold] [-n average_seconds] [-t]"
echo "Usage: check_iowait.sh [-w warning_threshold] [-c critical_threshold] [-n average_seconds] [-t] [-p] [-h]"
exit 1
;;
esac
@ -40,6 +55,11 @@ if uname | grep -q "BSD"; then
fi
else
iowait=$(iostat -c $AVERAGE_SECONDS 2 | awk 'NR==4 {print $4}')
if $PROXMOX_CALCULATION; then
idle=$(iostat -c $AVERAGE_SECONDS 2 | awk 'NR==4 {print $6}')
non_idle=$(echo "100 - $idle" | bc -l)
iowait=$(echo "$iowait / $non_idle * 100" | bc -l)
fi
if $SHOW_TOP_PROCESSES; then
top_processes=$(pidstat -d -l -u -r 1 1 | awk 'NR>4 {print $1, $NF, $8}' | sort -k3 -nr | head -n 3 | awk '{printf "%s%s", sep, $2; sep=", "} END {print ""}')
fi

View File

@ -1,2 +1,2 @@
from .print import print_icinga2_check_status, dict_to_perfdata, create_description_list
from .result import print_icinga2_check_status, dict_to_perfdata, create_description_list
from .markdown import list_to_markdown_table

View File

@ -4,7 +4,7 @@ from time import sleep
import requests
from . import nagios
from .print import print_icinga2_check_status
from .result import print_icinga2_check_status
def fetch_with_retry(url, method: str = 'get', retries=3, delay=1, **kwargs):

View File

@ -1,4 +1,8 @@
# TODO: remove non STATE_ vars
import sys
from checker import print_icinga2_check_status
UNKNOWN = STATE_UNKNOWN = -1
OK = STATE_OK = 0
WARNING = STATE_WARN = 1

View File

@ -1,3 +1,6 @@
import sys
def create_description_list(data: list):
"""
Create a Description List HTML element based on the input list.
@ -107,3 +110,8 @@ def print_icinga2_check_status(text_result: str, return_code: int, perfdata=None
status = status_codes[return_code]
perfdata_str = f' | {dict_to_perfdata(perfdata)}' if perfdata else ''
print(f"{status} - {text_result.strip()}{perfdata_str}")
def quit_check(text_result: str, exit_code: int, perfdata=None):
    """Print the Icinga2 status line, then end the process with ``exit_code``.

    ``perfdata`` is passed through unchanged to ``print_icinga2_check_status``.
    """
    print_icinga2_check_status(text_result, exit_code, perfdata=perfdata)
    # Equivalent to sys.exit(exit_code): both raise SystemExit with this code.
    raise SystemExit(exit_code)

25
checker/snmp.py Normal file
View File

@ -0,0 +1,25 @@
import pysnmp
from pysnmp.hlapi import *
def get_snmp_value(oid, ip, community):
    """Fetch a single OID from ``ip`` (UDP port 161) with an SNMP GET.

    Returns:
        The value as an ``int`` when the agent answers with an integer.
        ``None`` when the agent reports that the object does not exist, or
        when the value is of any non-integer type.

    Raises:
        RuntimeError: if the request fails or the agent returns an SNMP error
            status. The message carries the underlying error description.
    """
    error_indication, error_status, error_index, var_binds = next(
        getCmd(
            SnmpEngine(),
            CommunityData(community),
            UdpTransportTarget((ip, 161)),
            ContextData(),
            ObjectType(ObjectIdentity(oid)),
        )
    )

    # Raise a real exception carrying the message. A bare ``raise`` outside an
    # ``except`` block only produces "No active exception to reraise", and
    # printing here would put text ahead of the plugin's status line.
    if error_indication:
        raise RuntimeError(f'SNMP request to {ip} failed: {error_indication}')
    if error_status:
        failed_oid = var_binds[int(error_index) - 1][0] if error_index else '?'
        raise RuntimeError('%s at %s' % (error_status.prettyPrint(), failed_oid))

    value = var_binds[0][1]
    if isinstance(value, pysnmp.proto.rfc1905.NoSuchObject):
        return None
    if isinstance(value, pysnmp.proto.rfc1902.Integer):
        return int(value)
    # Only integer values are converted; every other type yields None.
    return None

View File

@ -57,3 +57,7 @@ def human_readable_size(size: Union[int, float], bits=False, decimal_places: int
size = round(size, decimal_places)
return f'{size} {units[bits][base][exp]}'
def c_to_f(c):
    """Convert degrees Celsius to degrees Fahrenheit, rounded to 2 decimals."""
    fahrenheit = c * 1.8 + 32
    return round(fahrenheit, 2)