From 17d3fb9fa07340f87e9edd4d37e5c1ba92ab58c4 Mon Sep 17 00:00:00 2001 From: Cyberes Date: Sun, 26 Nov 2023 23:15:27 -0700 Subject: [PATCH] add idrac checkers --- README.md | 3 +- check_hauk.py | 8 +-- check_idrac_fans.py | 50 ++++++++++++++++++ check_idrac_status.py | 52 +++++++++++++++++++ check_idrac_temps.py | 89 +++++++++++++++++++++++++++++++++ check_iowait.sh | 24 ++++++++- checker/__init__.py | 2 +- checker/http.py | 2 +- checker/nagios.py | 4 ++ checker/{print.py => result.py} | 8 +++ checker/snmp.py | 25 +++++++++ checker/units.py | 4 ++ 12 files changed, 259 insertions(+), 12 deletions(-) create mode 100644 check_idrac_fans.py create mode 100644 check_idrac_status.py create mode 100644 check_idrac_temps.py rename checker/{print.py => result.py} (95%) create mode 100644 checker/snmp.py diff --git a/README.md b/README.md index e189fb9..1f3a6a8 100644 --- a/README.md +++ b/README.md @@ -2,4 +2,5 @@ My custom Icinga2 checks. -Useful: https://nagios-plugins.org/doc/guidelines.html#AEN200 \ No newline at end of file +Useful: https://nagios-plugins.org/doc/guidelines.html#AEN200 +https://icinga.com/docs/icinga-2/latest/doc/05-service-monitoring/#performance-data-metrics \ No newline at end of file diff --git a/check_hauk.py b/check_hauk.py index b0341aa..fce2744 100755 --- a/check_hauk.py +++ b/check_hauk.py @@ -6,13 +6,7 @@ import traceback import requests from checker import nagios -from checker import print_icinga2_check_status - - -def quit_check(text_result, exit_code): - print_icinga2_check_status(text_result, exit_code) - sys.exit(exit_code) - +from checker.result import quit_check def main(): parser = argparse.ArgumentParser(description='Validate Hauk tracking functionality.') diff --git a/check_idrac_fans.py b/check_idrac_fans.py new file mode 100644 index 0000000..042201c --- /dev/null +++ b/check_idrac_fans.py @@ -0,0 +1,50 @@ +import argparse +import sys +import traceback + +from checker import nagios +from checker.result import quit_check +from checker.snmp import get_snmp_value + +# TODO: support iDRAC 8 + +# https://github.com/ilovepancakes95/idrac_snmp-grafana/blob/master/idrac-input.conf +FAN_SPEED_OID = '.1.3.6.1.4.1.674.10892.5.4.700.12.1.6.1.' + + +def main(args): + text_result = '' + perf_data = {} + fan_speeds = [] + for i in range(args.fan_num): + value = get_snmp_value(FAN_SPEED_OID + str(i + 1), args.ip, args.community) + if not value: + continue + fan_speeds.append(value) + text_result += f'Fan{i + 1}: {value}, ' + perf_data[f'Fan{i + 1}'] = {'value': value, 'warn': args.warn, 'crit': args.crit} + text_result = text_result.strip(', ') + + exit_code = nagios.STATE_OK + if min(fan_speeds) <= args.crit: + exit_code = max(nagios.STATE_CRIT, exit_code) + elif min(fan_speeds) <= args.warn: + exit_code = max(nagios.STATE_WARN, exit_code) + + quit_check(text_result, exit_code, perfdata=perf_data) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Validate Hauk tracking functionality.') + parser.add_argument('--ip', required=True, help='The iDRAC IP to query.') + parser.add_argument('--community', default='public', help='Your SNMP community. Default: public') + parser.add_argument('--fan-num', type=int, default=1, help='Number of fans. Default: 1') + parser.add_argument('--warn', type=int, default=840, help='RPM warning level. Default: 840') + parser.add_argument('--crit', type=int, default=600, help='RPM critical level. Default: 600') + args = parser.parse_args() + try: + main(args) + except Exception as e: + print(f"UNKNOWN: exception\n{e}") + print(traceback.format_exc()) + sys.exit(nagios.STATE_UNKNOWN) diff --git a/check_idrac_status.py b/check_idrac_status.py new file mode 100644 index 0000000..2daf130 --- /dev/null +++ b/check_idrac_status.py @@ -0,0 +1,52 @@ +import argparse +import sys +import traceback + +from checker import nagios +from checker.result import quit_check +from checker.snmp import get_snmp_value + +# TODO: support iDRAC 8 + +# https://github.com/ilovepancakes95/idrac_snmp-grafana/blob/master/idrac-input.conf +GLOBAL_SYSTEM_STATUS_OID = '.1.3.6.1.4.1.674.10892.5.2.1.0' + + +def main(args): + system_status = get_snmp_value(GLOBAL_SYSTEM_STATUS_OID, args.ip, args.community) + if system_status == 1: + exit_code = nagios.STATE_UNKNOWN + text_result = 'status is other' + elif system_status == 2: + exit_code = nagios.STATE_UNKNOWN + text_result = 'status is unknown' + elif system_status == 3: + exit_code = nagios.STATE_OK + text_result = 'status is nominal' + elif system_status == 4: + exit_code = nagios.STATE_WARN + text_result = 'status is non-critical' + elif system_status == 5: + exit_code = nagios.STATE_CRIT + text_result = 'status is critical' + elif system_status == 6: + exit_code = nagios.STATE_CRIT + text_result = 'status is non-recoverable' + else: + exit_code = nagios.STATE_UNKNOWN + text_result = 'status is critical' + + quit_check(text_result, exit_code) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Validate Hauk tracking functionality.') + parser.add_argument('--ip', required=True, help='The iDRAC IP to query.') + parser.add_argument('--community', default='public', help='Your SNMP community. Default: public') + args = parser.parse_args() + try: + main(args) + except Exception as e: + print(f"UNKNOWN: exception\n{e}") + print(traceback.format_exc()) + sys.exit(nagios.STATE_UNKNOWN) diff --git a/check_idrac_temps.py b/check_idrac_temps.py new file mode 100644 index 0000000..c8b35b6 --- /dev/null +++ b/check_idrac_temps.py @@ -0,0 +1,89 @@ +import argparse +import sys +import traceback + +from checker import nagios +from checker.result import quit_check +from checker.snmp import get_snmp_value +from checker.units import c_to_f + +# TODO: support iDRAC 8 + +# https://github.com/ilovepancakes95/idrac_snmp-grafana/blob/master/idrac-input.conf +INLET_TEMP_OID = '.1.3.6.1.4.1.674.10892.5.4.700.20.1.6.1.1' +EXHAUST_TEMP_OID = '.1.3.6.1.4.1.674.10892.5.4.700.20.1.6.1.2' +CPU1_TEMP_OID = '.1.3.6.1.4.1.674.10892.5.4.700.20.1.6.1.3' +CPU2_TEMP_OID = '.1.3.6.1.4.1.674.10892.5.4.700.20.1.6.1.4' + + +def get_snmp_temp(oid, ip, community): + value = get_snmp_value(oid, ip, community) + return c_to_f(float(value[0] + value[1] + '.' + value[2])) + + +def main(args): + inlet_temp = get_snmp_temp(INLET_TEMP_OID, args.ip, args.community) + exhaust_temp = get_snmp_temp(EXHAUST_TEMP_OID, args.ip, args.community) + + cpu_temps = [] + cpu_temps.append(get_snmp_temp(CPU1_TEMP_OID, args.ip, args.community)) + if args.cpu_num > 1: + cpu_temps.append(get_snmp_temp(CPU2_TEMP_OID, args.ip, args.community)) + + exit_code = nagios.STATE_OK + if inlet_temp >= args.inlet_crit: + exit_code = max(nagios.STATE_CRIT, exit_code) + elif inlet_temp >= args.inlet_warn: + exit_code = max(nagios.STATE_WARN, exit_code) + + if exhaust_temp >= args.exhaust_crit: + exit_code = max(nagios.STATE_CRIT, exit_code) + elif exhaust_temp >= args.exhaust_warn: + exit_code = max(nagios.STATE_WARN, exit_code) + + if max(cpu_temps) >= args.cpu_crit: + exit_code = max(nagios.STATE_CRIT, exit_code) + elif max(cpu_temps) >= args.cpu_warn: + exit_code = max(nagios.STATE_WARN, exit_code) + + text_result = f'CPU1: {cpu_temps[0]}' + if len(cpu_temps) > 1: + text_result += f', CPU2: {cpu_temps[1]}' + text_result += f', Inlet: {inlet_temp}, Exhaust: {exhaust_temp}' + + perf_data = { + 'cpu1': { + 'value': cpu_temps[0], 'warn': args.cpu_warn, 'crit': args.cpu_crit, 'unit': 'F' + }, + 'inlet': { + 'value': inlet_temp, 'warn': args.inlet_warn, 'crit': args.inlet_crit, 'unit': 'F' + }, + 'exhaust': { + 'value': exhaust_temp, 'warn': args.exhaust_warn, 'crit': args.exhaust_crit, 'unit': 'F' + } + } + + if len(cpu_temps) > 1: + perf_data['cpu2'] = {'value': cpu_temps[1], 'warn': args.cpu_warn, 'crit': args.cpu_crit, 'unit': 'F'} + + quit_check(text_result, exit_code, perfdata=perf_data) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Validate Hauk tracking functionality.') + parser.add_argument('--ip', required=True, help='The iDRAC IP to query.') + parser.add_argument('--community', default='public', help='Your SNMP community. Default: public') + parser.add_argument('--cpu-num', type=int, default=1, help='Number of CPU nodes. Default: 1') + parser.add_argument('--inlet-warn', type=int, default=108, help='System Board Inlet Temp warning level in F. Default: 108') + parser.add_argument('--inlet-crit', type=int, default=116, help='System Board Inlet Temp critical level in F. Default: 116') + parser.add_argument('--exhaust-warn', type=int, default=158, help='System Board Exhaust Temp warning level in F. Default: 158') + parser.add_argument('--exhaust-crit', type=int, default=167, help='System Board Exhaust Temp critical level in F. Default: 167') + parser.add_argument('--cpu-warn', type=int, default=186, help='CPU temp critical level in F. Default: 186') + parser.add_argument('--cpu-crit', type=int, default=195, help='CPU temp critical level in F. Default: 195') + args = parser.parse_args() + try: + main(args) + except Exception as e: + print(f"UNKNOWN: exception\n{e}") + print(traceback.format_exc()) + sys.exit(nagios.STATE_UNKNOWN) diff --git a/check_iowait.sh b/check_iowait.sh index 991d4e2..fd2a122 100755 --- a/check_iowait.sh +++ b/check_iowait.sh @@ -4,9 +4,10 @@ WARNING_THRESHOLD=15 CRITICAL_THRESHOLD=25 AVERAGE_SECONDS=5 SHOW_TOP_PROCESSES=false +PROXMOX_CALCULATION=false # Parse command line arguments -while getopts "w:c:n:t" opt; do +while getopts "w:c:n:tph" opt; do case $opt in w) WARNING_THRESHOLD="$OPTARG" @@ -20,8 +21,22 @@ while getopts "w:c:n:t" opt; do t) SHOW_TOP_PROCESSES=true ;; + p) + PROXMOX_CALCULATION=true + ;; + h) + echo "Usage: check_iowait.sh [-w warning_threshold] [-c critical_threshold] [-n average_seconds] [-t] [-p] [-h]" + echo "Options:" + echo " -w Set warning threshold" + echo " -c Set critical threshold" + echo " -n Set average seconds" + echo " -t Show top processes" + echo " -p Enable iowait calculation similar to Proxmox" + echo " -h Print this help message" + exit 0 + ;; \?) - echo "Usage: check_iowait.sh [-w warning_threshold] [-c critical_threshold] [-n average_seconds] [-t]" + echo "Usage: check_iowait.sh [-w warning_threshold] [-c critical_threshold] [-n average_seconds] [-t] [-p] [-h]" exit 1 ;; esac @@ -40,6 +55,11 @@ if uname | grep -q "BSD"; then fi else iowait=$(iostat -c $AVERAGE_SECONDS 2 | awk 'NR==4 {print $4}') + if $PROXMOX_CALCULATION; then + idle=$(iostat -c $AVERAGE_SECONDS 2 | awk 'NR==4 {print $6}') + non_idle=$(echo "100 - $idle" | bc -l) + iowait=$(echo "$iowait / $non_idle * 100" | bc -l) + fi if $SHOW_TOP_PROCESSES; then top_processes=$(pidstat -d -l -u -r 1 1 | awk 'NR>4 {print $1, $NF, $8}' | sort -k3 -nr | head -n 3 | awk '{printf "%s%s", sep, $2; sep=", "} END {print ""}') fi diff --git a/checker/__init__.py b/checker/__init__.py index 860b257..4f56427 100644 --- a/checker/__init__.py +++ b/checker/__init__.py @@ -1,2 +1,2 @@ -from .print import print_icinga2_check_status, dict_to_perfdata, create_description_list +from .result import print_icinga2_check_status, dict_to_perfdata, create_description_list from .markdown import list_to_markdown_table diff --git a/checker/http.py b/checker/http.py index ca4a8b3..1039452 100644 --- a/checker/http.py +++ b/checker/http.py @@ -4,7 +4,7 @@ from time import sleep import requests from . import nagios -from .print import print_icinga2_check_status +from .result import print_icinga2_check_status def fetch_with_retry(url, method: str = 'get', retries=3, delay=1, **kwargs): diff --git a/checker/nagios.py b/checker/nagios.py index 4701313..021391e 100644 --- a/checker/nagios.py +++ b/checker/nagios.py @@ -1,4 +1,8 @@ # TODO: remove non STATE_ vars +import sys + +from checker import print_icinga2_check_status + UNKNOWN = STATE_UNKNOWN = -1 OK = STATE_OK = 0 WARNING = STATE_WARN = 1 diff --git a/checker/print.py b/checker/result.py similarity index 95% rename from checker/print.py rename to checker/result.py index 7101b0d..d02c724 100644 --- a/checker/print.py +++ b/checker/result.py @@ -1,3 +1,6 @@ +import sys + + def create_description_list(data: list): """ Create a Description List HTML element based on the input list. @@ -107,3 +110,8 @@ def print_icinga2_check_status(text_result: str, return_code: int, perfdata=None status = status_codes[return_code] perfdata_str = f' | {dict_to_perfdata(perfdata)}' if perfdata else '' print(f"{status} - {text_result.strip()}{perfdata_str}") + + +def quit_check(text_result: str, exit_code: int, perfdata=None): + print_icinga2_check_status(text_result, exit_code, perfdata=perfdata) + sys.exit(exit_code) diff --git a/checker/snmp.py b/checker/snmp.py new file mode 100644 index 0000000..ddb9487 --- /dev/null +++ b/checker/snmp.py @@ -0,0 +1,25 @@ +import pysnmp +from pysnmp.hlapi import * + + +def get_snmp_value(oid, ip, community): + errorIndication, errorStatus, errorIndex, varBinds = next( + getCmd(SnmpEngine(), + CommunityData(community), + UdpTransportTarget((ip, 161)), + ContextData(), + ObjectType(ObjectIdentity(oid))) + ) + + if errorIndication: + print(errorIndication) + raise + elif errorStatus: + print('%s at %s' % (errorStatus.prettyPrint(), errorIndex and varBinds[int(errorIndex) - 1][0] or '?')) + raise + else: + if isinstance(varBinds[0][1], pysnmp.proto.rfc1905.NoSuchObject): + return None + value = varBinds[0][1] + if isinstance(value, pysnmp.proto.rfc1902.Integer): + return int(value) diff --git a/checker/units.py b/checker/units.py index cdd0396..fe128ea 100644 --- a/checker/units.py +++ b/checker/units.py @@ -57,3 +57,7 @@ def human_readable_size(size: Union[int, float], bits=False, decimal_places: int size = round(size, decimal_places) return f'{size} {units[bits][base][exp]}' + + +def c_to_f(c): + return round(c * 1.8 + 32, 2)