From 96fc88737e0cc2deaf4b4af6cc8be3b4b5f4ad72 Mon Sep 17 00:00:00 2001
From: Cyberes
Date: Tue, 27 Jun 2023 13:13:52 -0600
Subject: [PATCH] add check_graylog.py

---
Review notes (not part of the commit message): the check_graylog.py blob id
on its `index` line below was not recomputed after the review fixes.

 check_graylog.py            | 201 ++++++++++++++++++++++++++++++++++++
 check_speedtest.py          |   6 +-
 checker/linuxfabric/base.py |  20 ++--
 3 files changed, 214 insertions(+), 13 deletions(-)
 create mode 100755 check_graylog.py

diff --git a/check_graylog.py b/check_graylog.py
new file mode 100755
index 0000000..82b523c
--- /dev/null
+++ b/check_graylog.py
@@ -0,0 +1,201 @@
+#!/usr/bin/env python3
+
+import argparse
+import sys
+import traceback
+
+import requests
+from urllib3.exceptions import InsecureRequestWarning
+
+import checker.nagios
+from checker import print_icinga2_check_status
+from checker.linuxfabric.base import get_state
+
+
+def main():
+    """Check the health of one Graylog input and report it to Icinga2.
+
+    Queries the Graylog REST API for the input state, a set of throughput,
+    journal and JVM heap metrics, and the pending system notifications. Prints
+    the result with perfdata and exits with a Nagios state: critical when the
+    API cannot be queried or the input is not running, otherwise the state of
+    the JVM heap usage compared to --warn-mem/--crit-mem.
+    """
+    parser = argparse.ArgumentParser(description='Check Graylog input health')
+    parser.add_argument('-u', '--url', required=True, help='Graylog API URL')
+    parser.add_argument('-t', '--token', required=True, help='Graylog API token')
+    parser.add_argument('-i', '--input', required=True, help='Input ID to check')
+    parser.add_argument('--warn-mem', type=int, default=75, help='Percentage of JVM memory used for warning')
+    parser.add_argument('--crit-mem', type=int, default=100, help='Percentage of JVM memory used for critical')
+    parser.add_argument('--timeout', type=float, default=10, help='Seconds to wait for each API request')
+    parser.add_argument('--insecure', action='store_true', help="Don't verify SSL")
+    args = parser.parse_args()
+
+    if args.insecure:
+        # urllib3 warns on every unverified HTTPS request; keep the plugin output clean.
+        requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning)
+
+    # Keyword arguments shared by every API request.
+    request_kwargs = {
+        'headers': {
+            'Accept': 'application/json',
+            'X-Requested-By': 'XMLHttpRequest',
+        },
+        'auth': (args.token, 'token'),
+        'verify': not args.insecure,
+        # Without a timeout a hung Graylog node would block the check forever.
+        'timeout': args.timeout,
+    }
+
+    try:
+        response = requests.get(f'{args.url}/system/inputstates/{args.input}', **request_kwargs)
+        response.raise_for_status()
+    except requests.exceptions.RequestException as e:
+        print(f'CRITICAL - Unable to query Graylog API: {e}')
+        sys.exit(checker.nagios.STATE_CRIT)
+
+    input_data = response.json()
+    input_type = input_data['message_input']['type']
+
+    metrics_json = {
+        'metrics': [
+            'org.graylog2.throughput.input.1-sec-rate',
+            'org.graylog2.throughput.output.1-sec-rate',
+            f'{input_type}.{args.input}.incomingMessages',
+            f'{input_type}.{args.input}.open_connections',
+            f'{input_type}.{args.input}.total_connections',
+            f'{input_type}.{args.input}.written_bytes_1sec',
+            f'{input_type}.{args.input}.written_bytes_total',
+            f'{input_type}.{args.input}.read_bytes_1sec',
+            f'{input_type}.{args.input}.read_bytes_total',
+            "org.graylog2.journal.append.1-sec-rate",
+            "org.graylog2.journal.read.1-sec-rate",
+            "org.graylog2.journal.segments",
+            "org.graylog2.journal.entries-uncommitted",
+            "jvm.memory.heap.used",
+            "jvm.memory.heap.committed",
+            "jvm.memory.heap.max"
+        ],
+    }
+    try:
+        response = requests.post(f'{args.url}/cluster/metrics/multiple', json=metrics_json, **request_kwargs)
+        response.raise_for_status()
+        # Only the first entry of the reply is used (presumably one entry per node - TODO confirm).
+        input_metrics = next(iter(response.json().values()))['metrics']
+    except requests.exceptions.RequestException as e:
+        # Report the body attached to this failure; `response` may still hold the previous request's reply.
+        error_body = e.response.text if e.response is not None else ''
+        print(f'CRITICAL - Unable to query Graylog API: {e}\n{error_body}')
+        sys.exit(checker.nagios.STATE_CRIT)
+
+    # Format the metrics
+    metrics_data = {}
+    for metric in input_metrics:
+        name = metric['full_name'].replace(input_type, '').replace('org.graylog2.', '').replace(args.input, '').strip(
+            '.').replace('-', '_').replace('.', '_')
+        value = None
+        if 'value' in metric['metric']:
+            value = metric['metric']['value']
+        elif 'count' in metric['metric']:
+            value = metric['metric']['count']
+        elif 'rate' in metric['metric']:
+            # Of the rates a meter exposes, only the one-minute rate is reported.
+            value = metric['metric']['rate']['one_minute']
+            name = f'{name}_one_minute'
+        if value is None:
+            # No usable number for this metric; int(None) would crash the check.
+            continue
+        metrics_data[name] = int(value)
+
+    heap_max = int(metrics_data['jvm_memory_heap_max'])
+    # Thresholds are given as a percentage of the maximum heap; convert them to the heap's unit.
+    jvm_mem_usage_warn = heap_max * args.warn_mem // 100
+    jvm_mem_usage_crit = heap_max * args.crit_mem // 100
+
+    perfdata = {
+        'throughput_input_1_sec_rate': {
+            'value': int(metrics_data['throughput_input_1_sec_rate']),
+            'min': 0,
+        },
+        'throughput_output_1_sec_rate': {
+            'value': int(metrics_data['throughput_output_1_sec_rate']),
+            'min': 0,
+        },
+        'incoming_messages_one_minute': {
+            'value': metrics_data['incomingMessages_one_minute'],
+            'min': 0,
+        },
+        'open_connections': {
+            'value': metrics_data['open_connections'],
+            'min': 0,
+        },
+        'total_connections': {
+            'value': metrics_data['total_connections'],
+            'min': 0,
+        },
+        'written_bytes_1sec': {
+            'value': metrics_data['written_bytes_1sec'],
+            'min': 0,
+            'unit': 'B',
+        },
+        'read_bytes_1sec': {
+            'value': metrics_data['read_bytes_1sec'],
+            'min': 0,
+            'unit': 'B',
+        },
+        'entries_uncommitted': {
+            'value': metrics_data['journal_entries_uncommitted'],
+            'min': 0,
+        },
+        'jvm_memory_used': {
+            'value': metrics_data['jvm_memory_heap_used'],
+            'min': 0,
+            'warn': jvm_mem_usage_warn,
+            'crit': jvm_mem_usage_crit,
+            'max': heap_max,
+            'unit': 'B',
+        },
+    }
+
+    jvm_mem_usage_state = get_state(int(metrics_data['jvm_memory_heap_used']), jvm_mem_usage_warn, jvm_mem_usage_crit,
+                                    operator='gt')
+
+    try:
+        response = requests.get(f'{args.url}/system/notifications', **request_kwargs)
+        response.raise_for_status()
+    except requests.exceptions.RequestException as e:
+        print(f'CRITICAL - Unable to query Graylog API: {e}')
+        sys.exit(checker.nagios.STATE_CRIT)
+    notifications = response.json()
+    if notifications['total'] > 0:
+        notif_str = 'Notifications:'
+        for notification in notifications['notifications']:
+            notif_str = notif_str + f'\n{notification["type"]}: {notification["description"]}'
+    else:
+        notif_str = 'No notifications'
+
+    if input_data['state'] == 'RUNNING':
+        text_result = f'Input "{input_data["message_input"]["title"]}" is running.'
+    else:
+        text_result = f'Input "{input_data["message_input"]["title"]}" is not running!'
+        print(text_result)
+        print(notif_str)
+        sys.exit(checker.nagios.STATE_CRIT)
+
+    if jvm_mem_usage_state != checker.nagios.STATE_OK:
+        text_result += ' JVM memory usage is high!'
+
+    text_result = text_result + f' JVM memory usage: {int((perfdata["jvm_memory_used"]["value"] / heap_max) * 100)}%, incoming_messages_one_minute: {perfdata["incoming_messages_one_minute"]["value"]}, open_connections: {perfdata["open_connections"]["value"]}' + '\n' + notif_str
+
+    exit_code = max(checker.nagios.STATE_OK, jvm_mem_usage_state)
+    print_icinga2_check_status(text_result, exit_code, perfdata)
+    sys.exit(exit_code)
+
+
+if __name__ == "__main__":
+    try:
+        main()
+    except Exception as e:
+        print(f'UNKNOWN: exception "{e}"')
+        print(traceback.format_exc())
+        sys.exit(checker.nagios.STATE_UNKNOWN)
diff --git a/check_speedtest.py b/check_speedtest.py
index 9991e3c..34ff399 100755
--- a/check_speedtest.py
+++ b/check_speedtest.py
@@ -87,9 +87,9 @@ def main():
     warnings.simplefilter("ignore", category=RuntimeWarning)
     speedtest_results = run_speedtest()
 
-    upload_speed_state = get_state(speedtest_results['upload_speed'], args.warn_up, args.critical_up, _operator='le')
-    download_speed_state = get_state(speedtest_results['download_speed'], args.warn_down, args.critical_down, _operator='le')
-    latency_state = get_state(speedtest_results['latency'], args.warn_latency, args.critical_latency, _operator='ge')
+    upload_speed_state = get_state(speedtest_results['upload_speed'], args.warn_up, args.critical_up, operator='le')
+    download_speed_state = get_state(speedtest_results['download_speed'], args.warn_down, args.critical_down, operator='le')
+    latency_state = get_state(speedtest_results['latency'], args.warn_latency, args.critical_latency, operator='ge')
 
     exit_code = max(upload_speed_state, download_speed_state, latency_state)
     text_result = f"upload: {speedtest_results['upload_speed']:.1f} Mbps, download: {speedtest_results['download_speed']:.1f} Mbps, latency: {speedtest_results['latency']:.1f} ms, jitter: {speedtest_results['jitter']:.1f} ms"
diff --git a/checker/linuxfabric/base.py b/checker/linuxfabric/base.py
index d66db23..281a15a 100644
--- a/checker/linuxfabric/base.py
+++ b/checker/linuxfabric/base.py
@@ -106,7 +106,7 @@ def get_perfdata(label, value, uom=None, warn=None, crit=None, _min=None, _max=N
     return msg
 
 
-def get_state(value, warn, crit, _operator='ge'):
+def get_state(value, warn, crit, operator='ge'):
     """Returns the STATE by comparing `value` to the given thresholds using
-    a comparison `_operator`. `warn` and `crit` threshold may also be `None`.
+    a comparison `operator`. `warn` and `crit` threshold may also be `None`.
 
@@ -123,7 +123,7 @@ def get_state(value, warn, crit, _operator='ge'):
         Numeric warning threshold
     crit : float
         Numeric critical threshold
-    _operator : string
+    operator : string
         `eq` = equal to
         `ge` = greater or equal
         `gt` = greater than
@@ -139,7 +139,7 @@ def get_state(value, warn, crit, _operator='ge'):
     """
     # make sure to use float comparison
     value = float(value)
-    if _operator == 'ge':
+    if operator == 'ge':
         if crit is not None:
             if value >= float(crit):
                 return STATE_CRIT
@@ -148,7 +148,7 @@ def get_state(value, warn, crit, _operator='ge'):
                 return STATE_WARN
         return STATE_OK
 
-    if _operator == 'gt':
+    if operator == 'gt':
         if crit is not None:
             if value > float(crit):
                 return STATE_CRIT
@@ -157,7 +157,7 @@ def get_state(value, warn, crit, _operator='ge'):
                 return STATE_WARN
         return STATE_OK
 
-    if _operator == 'le':
+    if operator == 'le':
         if crit is not None:
             if value <= float(crit):
                 return STATE_CRIT
@@ -166,7 +166,7 @@ def get_state(value, warn, crit, _operator='ge'):
                 return STATE_WARN
         return STATE_OK
 
-    if _operator == 'lt':
+    if operator == 'lt':
         if crit is not None:
             if value < float(crit):
                 return STATE_CRIT
@@ -175,7 +175,7 @@ def get_state(value, warn, crit, _operator='ge'):
                 return STATE_WARN
         return STATE_OK
 
-    if _operator == 'eq':
+    if operator == 'eq':
         if crit is not None:
             if value == float(crit):
                 return STATE_CRIT
@@ -184,7 +184,7 @@ def get_state(value, warn, crit, _operator='ge'):
                 return STATE_WARN
         return STATE_OK
 
-    if _operator == 'ne':
+    if operator == 'ne':
         if crit is not None:
             if value != float(crit):
                 return STATE_CRIT
@@ -193,7 +193,7 @@ def get_state(value, warn, crit, _operator='ge'):
                 return STATE_WARN
         return STATE_OK
 
-    if _operator == 'range':
+    if operator == 'range':
         if crit is not None:
             if not contine_or_exit(match_range(value, crit)):
                 return STATE_CRIT