From d431bd5d7a7feeed8038fb0fa3bb1c52d69fc7a1 Mon Sep 17 00:00:00 2001 From: Cyberes Date: Tue, 27 Jun 2023 14:32:10 -0600 Subject: [PATCH] check_greylog: alert notifications, better metric names, better filesizes --- check_graylog.py | 44 +++++++++++++++++++++++++------------------- checker/units.py | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+), 19 deletions(-) diff --git a/check_graylog.py b/check_graylog.py index 82b523c..be1a548 100755 --- a/check_graylog.py +++ b/check_graylog.py @@ -10,6 +10,7 @@ from urllib3.exceptions import InsecureRequestWarning import checker.nagios from checker import print_icinga2_check_status from checker.linuxfabric.base import get_state +from checker.units import human_readable_size def main(): @@ -20,6 +21,7 @@ def main(): parser.add_argument('--warn-mem', type=int, default=75, help='Percentage of JVM memory used for warm') parser.add_argument('--crit-mem', type=int, default=100, help='Percentage of JVM memory used for critical') parser.add_argument('--insecure', action='store_false', help="Don't verify SSL") + parser.add_argument('--crit-notif', action='store_true', help='Return critical when there are notifications') args = parser.parse_args() if not args.insecure: @@ -78,27 +80,19 @@ def main(): '.').replace('-', '_').replace('.', '_') value = None if 'value' in metric['metric']: - # perfdata.append(f'{name}={metric["metric"]["value"]}') value = metric["metric"]["value"] elif 'count' in metric['metric']: - # perfdata.append(f'{name}={metric["metric"]["count"]}') value = metric["metric"]["count"] elif 'rate' in metric['metric']: - # perfdata.append(f'{name}_total={metric["metric"]["rate"]["total"]}') - # perfdata.append(f'{name}_mean={metric["metric"]["rate"]["mean"]}') - # perfdata.append(f'{name}_five_minute={metric["metric"]["rate"]["five_minute"]}') - # perfdata.append(f'{name}_fifteen_minute={metric["metric"]["rate"]["fifteen_minute"]}') - # 
perfdata.append(f'{name}_one_minute={metric["metric"]["rate"]["one_minute"]}') value = metric["metric"]["rate"]["one_minute"] name = f'{name}_one_minute' - # if isinstance(value, float): - # value = round(value, 1) value = int(value) metrics_data[name] = value jvm_mem_usage_warn = int(metrics_data['jvm_memory_heap_max'] / int(100 / args.warn_mem)) jvm_mem_usage_crit = int(metrics_data['jvm_memory_heap_max'] / int(100 / args.crit_mem)) + # Some metric names are changed for better readability perfdata = { 'throughput_input_1_sec_rate': { 'value': int(metrics_data['throughput_input_1_sec_rate']), @@ -108,28 +102,34 @@ def main(): 'value': int(metrics_data['throughput_output_1_sec_rate']), 'min': 0, }, - 'incoming_messages_one_minute': { + 'incoming_messages_rate_per_sec_one_minute': { 'value': metrics_data['incomingMessages_one_minute'], 'min': 0, }, - 'open_connections': { + 'connections': { 'value': metrics_data['open_connections'], 'min': 0, }, - 'total_connections': { - 'value': metrics_data['total_connections'], - 'min': 0, - }, - 'written_bytes_1sec': { + 'network_out_total_1sec': { 'value': metrics_data['written_bytes_1sec'], 'min': 0, 'unit': 'B', }, - 'read_bytes_1sec': { + 'network_out_total_total': { + 'value': metrics_data['written_bytes_total'], + 'min': 0, + 'unit': 'B', + }, + 'network_in_1sec': { 'value': metrics_data['read_bytes_1sec'], 'min': 0, 'unit': 'B', }, + 'network_in_total': { + 'value': metrics_data['read_bytes_total'], + 'min': 0, + 'unit': 'B', + }, 'entries_uncommitted': { 'value': metrics_data['journal_entries_uncommitted'], 'min': 0, @@ -173,9 +173,15 @@ def main(): if jvm_mem_usage_state != checker.nagios.STATE_OK: text_result += f' JVM memory usage is high!' 
def human_readable_size(size: Union[int, float], bits: bool = False, decimal_places: int = 2, base: int = 10) -> str:
    """Format a byte count as a human-readable string such as ``'1.5 KB'``.

    Args:
        size: Quantity in bytes. Negative values keep their sign; the unit is
            chosen from the absolute value.
        bits: When true, convert the byte count to bits (multiply by 8) and
            use bit unit names (``Kb``, ``Mib``, ...).
        decimal_places: Digits to keep after the decimal point. ``0``
            truncates the scaled value to an integer (no rounding).
        base: ``10`` for SI units (steps of 1000: KB, MB, ...) or ``2`` for
            IEC units (steps of 1024: KiB, MiB, ...).

    Returns:
        ``'<number> <unit>'``. A size of zero is always rendered as
        ``'0 bytes'`` / ``'0 bits'``.

    Raises:
        ValueError: If ``base`` is neither 2 nor 10.
    """
    unit_table = {
        False: {
            2: ['bytes', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'],
            10: ['bytes', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB'],
        },
        True: {
            2: ['bits', 'Kib', 'Mib', 'Gib', 'Tib', 'Pib', 'Eib', 'Zib', 'Yib'],
            10: ['bits', 'Kb', 'Mb', 'Gb', 'Tb', 'Pb', 'Eb', 'Zb', 'Yb'],
        },
    }

    # Validate up front so an unsupported base fails the same way for every
    # size (including zero) instead of surfacing as a KeyError.
    if base not in (2, 10):
        raise ValueError("Invalid base. Use either 2 or 10.")
    unit_names = unit_table[bool(bits)][base]

    # Convert bytes to bits if needed.
    if bits:
        size *= 8

    if size == 0:
        return '0 ' + unit_names[0]

    # Pick the largest unit that does not exceed the magnitude. Comparing
    # against exact integer powers avoids float-logarithm error, keeps the
    # exponent >= 0 for fractional sizes, supports negative sizes, and caps
    # the exponent at the largest known unit.
    step = 1024 if base == 2 else 1000
    magnitude = abs(size)
    exp = 0
    while exp < len(unit_names) - 1 and magnitude >= step ** (exp + 1):
        exp += 1

    scaled = size / step ** exp

    if decimal_places == 0:
        scaled = int(scaled)
    else:
        # round() returns a new value; it must be assigned to take effect.
        scaled = round(scaled, decimal_places)

    return f'{scaled} {unit_names[exp]}'