diff --git a/check_graylog.py b/check_graylog.py
index be1a548..42ca6a5 100755
--- a/check_graylog.py
+++ b/check_graylog.py
@@ -3,27 +3,49 @@
 import argparse
 import sys
 import traceback
+from datetime import datetime
 
 import requests
 from urllib3.exceptions import InsecureRequestWarning
 
-import checker.nagios
+from checker import nagios
 from checker import print_icinga2_check_status
+from checker.http import fetch_with_retry
 from checker.linuxfabric.base import get_state
 from checker.units import human_readable_size
 
 
+def transform_inputs(old_dict):
+    new_dict = {}
+    for key in old_dict:
+        for item in old_dict[key]:
+            new_key = item['id']
+            new_dict[new_key] = item
+    return new_dict
+
+
+def parse_traffic_ts(ts: str):
+    datetime_obj = datetime.strptime(ts, '%Y-%m-%dT%H:%M:%S.%fZ')
+    current_time = datetime.now()
+    time_diff = current_time - datetime_obj
+    return time_diff.total_seconds() < 24 * 60 * 60  # less than 24 hrs ago?
+
+
 def main():
     parser = argparse.ArgumentParser(description='Check Graylog input health')
-    parser.add_argument('-u', '--url', required=True, help='Graylog API URL')
+    parser.add_argument('-u', '--url', required=True, help='The base Graylog URL')
     parser.add_argument('-t', '--token', required=True, help='Graylog API token')
     parser.add_argument('-i', '--input', required=True, help='Input ID to check')
     parser.add_argument('--warn-mem', type=int, default=75, help='Percentage of JVM memory used for warm')
     parser.add_argument('--crit-mem', type=int, default=100, help='Percentage of JVM memory used for critical')
     parser.add_argument('--insecure', action='store_false', help="Don't verify SSL")
     parser.add_argument('--crit-notif', action='store_true', help='Return critical when there are notifications')
+    parser.add_argument('--html', action='store_true', help='Print HTML')
+    parser.add_argument('--cluster-metrics', action='store_true', help='Also gather cluster metrics and check for notifications.')
     args = parser.parse_args()
 
+    base_url = args.url.strip('/')
+
     if not args.insecure:
         requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning)
 
@@ -32,48 +54,52 @@ def main():
         'X-Requested-By': 'XMLHttpRequest',
     }
 
-    try:
-        response = requests.get(f'{args.url}/system/inputstates/{args.input}', headers=headers,
-                                auth=(args.token, 'token'), verify=args.insecure)
-        response.raise_for_status()
-    except requests.exceptions.RequestException as e:
-        print(f'CRITICAL - Unable to query Graylog API: {e}')
-        sys.exit(2)
+    # Get the basic input metadata
+    input_data = transform_inputs(fetch_with_retry(f'{base_url}/api/cluster/inputstates', headers=headers, auth=(args.token, 'token'), verify=args.insecure).json()).get(args.input, {})
 
-    input_data = response.json()
+    # Get it over with
+    if bool(input_data) and input_data.get('state') == 'RUNNING':
+        input_name = input_data["message_input"]["title"]
+        text_result = f'Graylog input "{input_name}" is running.'
+    else:
+        input_name = args.input
+        if args.html:
+            text_result = f'Graylog input "{input_name}" is not running!'
+        else:
+            text_result = f'Graylog input "{input_name}" is not running!'
+        print_icinga2_check_status(text_result, nagios.STATE_CRIT)
+        sys.exit(nagios.STATE_CRIT)
+
+    # If the input is running, continue gathering metrics and other health checks
+    input_name_clean = input_name.lower().replace(' ', '_').replace('-', '_')
+
+    # Get metrics for the input
     type = input_data['message_input']['type']
+    metrics_json = {
+        'metrics': [
+            'org.graylog2.throughput.input.1-sec-rate',
+            'org.graylog2.throughput.output.1-sec-rate',
+            f'{type}.{args.input}.incomingMessages',
+            f'{type}.{args.input}.open_connections',
+            f'{type}.{args.input}.total_connections',
+            f'{type}.{args.input}.written_bytes_1sec',
+            f'{type}.{args.input}.written_bytes_total',
+            f'{type}.{args.input}.read_bytes_1sec',
+            f'{type}.{args.input}.read_bytes_total',
+            "org.graylog2.journal.append.1-sec-rate",
+            "org.graylog2.journal.read.1-sec-rate",
+            "org.graylog2.journal.segments",
+            "org.graylog2.journal.entries-uncommitted",
+            "jvm.memory.heap.used",
+            "jvm.memory.heap.committed",
+            "jvm.memory.heap.max"
+        ],
+    }
+    r = fetch_with_retry(f'{base_url}/api/cluster/metrics/multiple', method='post', headers=headers, auth=(args.token, 'token'),
+                         verify=args.insecure, json=metrics_json).json()
+    input_metrics = r[list(r.keys())[0]]['metrics']
 
-    try:
-        metrics_json = {
-            'metrics': [
-                'org.graylog2.throughput.input.1-sec-rate',
-                'org.graylog2.throughput.output.1-sec-rate',
-                f'{type}.{args.input}.incomingMessages',
-                f'{type}.{args.input}.open_connections',
-                f'{type}.{args.input}.total_connections',
-                f'{type}.{args.input}.written_bytes_1sec',
-                f'{type}.{args.input}.written_bytes_total',
-                f'{type}.{args.input}.read_bytes_1sec',
-                f'{type}.{args.input}.read_bytes_total',
-                "org.graylog2.journal.append.1-sec-rate",
-                "org.graylog2.journal.read.1-sec-rate",
-                "org.graylog2.journal.segments",
-                "org.graylog2.journal.entries-uncommitted",
-                "jvm.memory.heap.used",
-                "jvm.memory.heap.committed",
-                "jvm.memory.heap.max"
-            ],
-        }
-        response = requests.post(f'{args.url}/cluster/metrics/multiple', headers=headers, auth=(args.token, 'token'),
-                                 verify=args.insecure,
-                                 json=metrics_json)
-        response.raise_for_status()
-        input_metrics = response.json()[list(response.json().keys())[0]]['metrics']
-    except requests.exceptions.RequestException as e:
-        print(f'CRITICAL - Unable to query Graylog API: {e}\n{response.text}')
-        sys.exit(2)
-
-    # Format the metrics
+    # Format the metrics for later
     metrics_data = {}
     for metric in input_metrics:
         name = metric['full_name'].replace(type, '').replace('org.graylog2.', '').replace(args.input, '').strip(
@@ -92,95 +118,138 @@ def main():
     jvm_mem_usage_warn = int(metrics_data['jvm_memory_heap_max'] / int(100 / args.warn_mem))
     jvm_mem_usage_crit = int(metrics_data['jvm_memory_heap_max'] / int(100 / args.crit_mem))
 
+    # Get traffic data for last 24 hrs
+    traffic_last_24_hrs = fetch_with_retry(f'{base_url}/api/system/cluster/traffic?daily=false', headers=headers, auth=(args.token, 'token'), verify=args.insecure).json()
+    input_traffic_avg = sum([v for k, v in traffic_last_24_hrs['input'].items() if parse_traffic_ts(k)])
+    output_traffic_avg = sum([v for k, v in traffic_last_24_hrs['output'].items() if parse_traffic_ts(k)])
+
+    elastisearch_health = fetch_with_retry(f'{base_url}/api/system/indexer/cluster/health', headers=headers, auth=(args.token, 'token'), verify=args.insecure).json()
+    elastisearch_status = elastisearch_health['status'].lower()
+    elastisearch_active_shards = elastisearch_health['shards']['active']
+
+    indexer_failures = fetch_with_retry(f'{base_url}/api/system/indexer/failures?limit=10&offset=0', headers=headers, auth=(args.token, 'token'), verify=args.insecure).json()
+
+    # Some metric names are changed for better readability
     perfdata = {
-        'throughput_input_1_sec_rate': {
+        f'{input_name_clean}_throughput_input_1_sec_rate': {
             'value': int(metrics_data['throughput_input_1_sec_rate']),
             'min': 0,
         },
-        'throughput_output_1_sec_rate': {
+        f'{input_name_clean}_throughput_output_1_sec_rate': {
             'value': int(metrics_data['throughput_output_1_sec_rate']),
             'min': 0,
         },
-        'incoming_messages_rate_per_sec_one_minute': {
+        f'{input_name_clean}_incoming_messages_rate_per_sec_one_minute': {
             'value': metrics_data['incomingMessages_one_minute'],
             'min': 0,
         },
-        'connections': {
+        f'{input_name_clean}_connections': {
            'value': metrics_data['open_connections'],
            'min': 0,
         },
-        'network_out_total_1sec': {
+        f'{input_name_clean}_network_out_total_1sec': {
            'value': metrics_data['written_bytes_1sec'],
            'min': 0,
            'unit': 'B',
         },
-        'network_out_total_total': {
+        f'{input_name_clean}_network_out_total_total': {
            'value': metrics_data['written_bytes_total'],
            'min': 0,
            'unit': 'B',
         },
-        'network_in_1sec': {
+        f'{input_name_clean}_network_in_1sec': {
            'value': metrics_data['read_bytes_1sec'],
            'min': 0,
            'unit': 'B',
         },
-        'network_in_total': {
+        f'{input_name_clean}_network_in_total': {
            'value': metrics_data['read_bytes_total'],
            'min': 0,
            'unit': 'B',
-        },
-        'entries_uncommitted': {
-            'value': metrics_data['journal_entries_uncommitted'],
-            'min': 0,
-        },
-        'jvm_memory_used': {
-            'value': metrics_data['jvm_memory_heap_used'],
-            'min': 0,
-            'warn': jvm_mem_usage_warn,
-            'crit': jvm_mem_usage_crit,
-            'max': int(metrics_data['jvm_memory_heap_max']),
-            'unit': 'B',
-        },
+        }
     }
 
-    jvm_mem_usage_state = get_state(int(metrics_data['jvm_memory_heap_used']), jvm_mem_usage_warn, jvm_mem_usage_crit,
-                                    operator='gt')
+    if args.cluster_metrics:
+        perfdata.update({
+            'entries_uncommitted': {
+                'value': metrics_data['journal_entries_uncommitted'],
+                'min': 0,
+            },
+            'jvm_memory_used': {
+                'value': metrics_data['jvm_memory_heap_used'],
+                'min': 0,
+                'warn': jvm_mem_usage_warn,
+                'crit': jvm_mem_usage_crit,
+                'max': int(metrics_data['jvm_memory_heap_max']),
+                'unit': 'B',
+            },
+            'from_network_traffic_avg': {
+                'value': input_traffic_avg,
+                'min': 0,
+                'unit': 'B',
+            },
+            'to_elasticsearch_24hrs_avg': {
+                'value': output_traffic_avg,
+                'min': 0,
+                'unit': 'B',
+            },
+            'elastisearch_active_shards': {
+                'value': elastisearch_active_shards,
+                'min': 0
+            },
+            'indexder_failures': {
+                'value': indexer_failures['total'],
+                'min': 0,
+            },
+        })
 
-    try:
-        response = requests.get(f'{args.url}/system/notifications', headers=headers, auth=(args.token, 'token'),
-                                verify=args.insecure)
-        response.raise_for_status()
-    except requests.exceptions.RequestException as e:
-        print(f'CRITICAL - Unable to query Graylog API: {e}')
-        sys.exit(2)
-    notifications = response.json()
-    if notifications['total'] > 0:
-        notif_str = 'Notifications:'
-        for notification in notifications['notifications']:
-            notif_str = notif_str + f'\n{notification["type"]}: {notification["description"]}'
+    # Check for notifications
+    if args.cluster_metrics:
+        notifications = fetch_with_retry(f'{base_url}/api/system/notifications', headers=headers, auth=(args.token, 'token'), verify=args.insecure).json()
+        if notifications['total'] > 0:
+            notif_str = 'Notifications:'
+            for notification in notifications['notifications']:
+                notif_str = notif_str + f'\n{notification["type"]}: {notification["description"]}'
+        else:
+            notif_str = 'No notifications'
+
+        if indexer_failures['total'] > 0:
+            indexer_failures_exit = nagios.STATE_CRIT
+            if args.html:
+                text_result += f' There are {indexer_failures["total"]} indexer failures!'
+            else:
+                text_result += f' There are {indexer_failures["total"]} indexer failures!'
+        else:
+            indexer_failures_exit = nagios.STATE_OK
+
+        # https://go2docs.graylog.org/5-0/setting_up_graylog/elasticsearch.htm#ClusterStatusExplained
+        if elastisearch_status == 'yellow':
+            elastisearch_exit_code = nagios.STATE_WARN
+            text_result += ' Elasticsearch is condition YELLOW!'
+        elif elastisearch_status == 'red':
+            elastisearch_exit_code = nagios.STATE_CRIT
+            text_result += ' Elasticsearch is condition RED!'
+        elif elastisearch_status == 'green':
+            elastisearch_exit_code = nagios.STATE_OK
+        else:
+            print_icinga2_check_status(f'unknown Elasticsearch health: {elastisearch_status}', nagios.STATE_UNKNOWN)
+            sys.exit(nagios.STATE_UNKNOWN)
+
+        jvm_mem_usage_state = get_state(int(metrics_data['jvm_memory_heap_used']), jvm_mem_usage_warn, jvm_mem_usage_crit, operator='gt')
+        if jvm_mem_usage_state != nagios.STATE_OK:
+            text_result += f' JVM memory usage is high!'
+
+        exit_code = max(nagios.STATE_OK, jvm_mem_usage_state, elastisearch_exit_code, indexer_failures_exit)
+
+        if notifications['total'] > 0:
+            text_result += f' There are notifications!'
+            if args.crit_notif:
+                exit_code = nagios.STATE_CRIT  # force crit
+
+        text_result = text_result + f' JVM memory usage: {int((perfdata["jvm_memory_used"]["value"] / metrics_data["jvm_memory_heap_max"]) * 100)}%, input incoming rate (events/second for last minute): {perfdata[f"{input_name_clean}_incoming_messages_rate_per_sec_one_minute"]["value"]}, input connections: {perfdata[f"{input_name_clean}_connections"]["value"]}, input total network in: {human_readable_size(perfdata[f"{input_name_clean}_network_in_total"]["value"], decimal_places=0)}' + '\n' + notif_str
     else:
-        notif_str = 'No notifications'
-
-    if input_data['state'] == 'RUNNING':
-        text_result = f'Input "{input_data["message_input"]["title"]}" is running.'
-    else:
-        text_result = f'Input "{input_data["message_input"]["title"]}" is not running!'
-        print(text_result)
-        print(notif_str)
-        sys.exit(checker.nagios.STATE_CRIT)
-
-    if jvm_mem_usage_state != checker.nagios.STATE_OK:
-        text_result += f' JVM memory usage is high!'
-
-    exit_code = max(checker.nagios.STATE_OK, jvm_mem_usage_state)
-
-    if notifications['total'] > 0:
-        text_result += f' There are notifications!'
-        if args.crit_notif:
-            exit_code = checker.nagios.STATE_CRIT
-
-    text_result = text_result + f' JVM memory usage: {int((perfdata["jvm_memory_used"]["value"] / metrics_data["jvm_memory_heap_max"]) * 100)}%, incoming rate (events/second for last minute): {perfdata["incoming_messages_rate_per_sec_one_minute"]["value"]}, connections: {perfdata["connections"]["value"]}, total network in: {human_readable_size(perfdata["network_in_total"]["value"], decimal_places=0)}' + '\n' + notif_str
+        text_result = text_result + f' {input_name_clean}_incoming_messages_rate_per_sec_one_minute (events/second for last minute): {perfdata[f"{input_name_clean}_incoming_messages_rate_per_sec_one_minute"]["value"]}, {input_name_clean}_connections: {perfdata[f"{input_name_clean}_connections"]["value"]}, {input_name_clean}_network_in_total: {human_readable_size(perfdata[f"{input_name_clean}_network_in_total"]["value"], decimal_places=0)}' + '\n'
+        exit_code = nagios.STATE_OK
 
     print_icinga2_check_status(text_result, exit_code, perfdata)
     sys.exit(exit_code)
@@ -192,4 +261,4 @@ if __name__ == "__main__":
     except Exception as e:
         print(f'UNKNOWN: exception "{e}"')
         print(traceback.format_exc())
-        sys.exit(checker.nagios.STATE_UNKNOWN)
+        sys.exit(nagios.STATE_UNKNOWN)
diff --git a/check_nginx.py b/check_nginx.py
index 2de95d4..4e731da 100755
--- a/check_nginx.py
+++ b/check_nginx.py
@@ -5,7 +5,7 @@ import sys
 import traceback
 
 from checker import nagios, print_icinga2_check_status
-from checker.http import get_with_retry
+from checker.http import fetch_with_retry
 from checker.linuxfabric.base import get_state
 
 
@@ -42,7 +42,7 @@ def main():
     parser.add_argument("--warning-waiting", type=int, default=None,
                         help="Warning threshold for waiting connections. Default: 0 (disabled)")
     args = parser.parse_args()
-    status = get_with_retry(args.url).text
+    status = fetch_with_retry(args.url).text
     data = parse_nginx_status(status)
 
     perfdata_dict = {
diff --git a/check_opnsense_traffic_for_host.py b/check_opnsense_traffic_for_host.py
index 8e37802..815ab79 100755
--- a/check_opnsense_traffic_for_host.py
+++ b/check_opnsense_traffic_for_host.py
@@ -10,7 +10,7 @@ from urllib3.exceptions import InsecureRequestWarning
 
 import checker.nagios as nagios
 from checker import print_icinga2_check_status
-from checker.http import get_with_retry
+from checker.http import fetch_with_retry
 from checker.linuxfabric.base import get_state
 from checker.markdown import list_to_markdown_table
 from checker.units import filesize
@@ -26,9 +26,9 @@ def is_internet_traffic(ip):
 
 
 def get_traffic_top(args, interface):
-    response = get_with_retry(f'https://{args.opnsense}/api/diagnostics/traffic/top/{interface}',
-                              headers={'Accept': 'application/json'}, auth=(args.key, args.secret), verify=False,
-                              timeout=args.timeout)
+    response = fetch_with_retry(f'https://{args.opnsense}/api/diagnostics/traffic/top/{interface}',
+                                headers={'Accept': 'application/json'}, auth=(args.key, args.secret), verify=False,
+                                timeout=args.timeout)
     if response.status_code != 200:
         print(f'UNKNOWN: unable to query OPNsense API for {interface}: {response.status_code}\n{response.text}')
         sys.exit(nagios.UNKNOWN)
@@ -81,9 +81,9 @@ def main():
     traffic_data = []
     for _ in range(args.duration):
         # start_time = time.time()
-        response = get_with_retry(f'https://{args.opnsense}/api/diagnostics/traffic/top/{interface}',
-                                  headers={'Accept': 'application/json'}, auth=(args.key, args.secret), verify=False,
-                                  timeout=args.timeout)
+        response = fetch_with_retry(f'https://{args.opnsense}/api/diagnostics/traffic/top/{interface}',
+                                    headers={'Accept': 'application/json'}, auth=(args.key, args.secret), verify=False,
+                                    timeout=args.timeout)
         # end_time = time.time()
         # api_request_time = end_time - start_time
 
diff --git a/check_scrutiny_disks.py b/check_scrutiny_disks.py
index 169eacb..9052243 100755
--- a/check_scrutiny_disks.py
+++ b/check_scrutiny_disks.py
@@ -8,7 +8,7 @@ from typing import List
 import requests
 
 from checker import nagios
-from checker.http import get_with_retry
+from checker.http import fetch_with_retry
 
 
 def get_disk_wwn_ids(ignore_non_smart: bool = False) -> List[str] or bool:
@@ -43,7 +43,7 @@ def get_disk_wwn_ids(ignore_non_smart: bool = False) -> List[str] or bool:
 
 def get_smart_health(wwn_id: str, scrutiny_endpoint: str) -> dict:
     url = f"{scrutiny_endpoint}/api/device/{wwn_id}/details"
-    response = get_with_retry(url)
+    response = fetch_with_retry(url)
     if response.status_code == 200:
         return response.json()
     elif response.status_code == 404:
diff --git a/checker/http.py b/checker/http.py
index 7bf3b58..ca4a8b3 100644
--- a/checker/http.py
+++ b/checker/http.py
@@ -7,19 +7,26 @@ from . import nagios
 from .print import print_icinga2_check_status
 
 
-def get_with_retry(url, retries=3, delay=1, **kwargs):
+def fetch_with_retry(url, method: str = 'get', retries=3, delay=1, **kwargs):
     """
     Wrapper function for requests.get() with a retry mechanism.
 
+    :param method: HTTP request type: get, post
     :param url: URL to send the GET request
     :param retries: Number of retries in case of HTTP failures (default: 3)
     :param delay: Time delay between retries in seconds (default: 1)
     :param kwargs: Additional keyword arguments for requests.get()
     :return: Response object
     """
+
     for i in range(retries):
         try:
-            response = requests.get(url, **kwargs)
+            if method == 'get':
+                response = requests.get(url, **kwargs)
+            elif method == 'post':
+                response = requests.post(url, **kwargs)
+            else:
+                raise ValueError('Invalid method! Must be get or post.')
             response.raise_for_status()
             return response
         except requests.exceptions.RequestException as e:
diff --git a/checker/units.py b/checker/units.py
index 79127a4..cdd0396 100644
--- a/checker/units.py
+++ b/checker/units.py
@@ -54,6 +54,6 @@ def human_readable_size(size: Union[int, float], bits=False, decimal_places: int
     if decimal_places == 0:
         size = int(size)
     else:
-        round(size, decimal_places)
+        size = round(size, decimal_places)
 
     return f'{size} {units[bits][base][exp]}'
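
For reference, a minimal usage sketch of the renamed fetch_with_retry() helper as check_graylog.py calls it above. This is illustrative only: the Graylog hostname and token below are placeholders (not part of the diff), and it assumes the checker package from this repository is importable. Retries are handled inside the helper, which wraps requests.get()/requests.post() and forwards any extra keyword arguments.

# Illustrative sketch; graylog.example.com and PLACEHOLDER_TOKEN are made-up values.
from checker.http import fetch_with_retry

headers = {'Accept': 'application/json', 'X-Requested-By': 'XMLHttpRequest'}

# GET (the default method), as used for /api/cluster/inputstates and the health lookups
states = fetch_with_retry('https://graylog.example.com/api/cluster/inputstates',
                          headers=headers, auth=('PLACEHOLDER_TOKEN', 'token'), verify=False).json()

# POST with a JSON body, as used for /api/cluster/metrics/multiple
metrics = fetch_with_retry('https://graylog.example.com/api/cluster/metrics/multiple',
                           method='post', headers=headers, auth=('PLACEHOLDER_TOKEN', 'token'),
                           verify=False, json={'metrics': ['jvm.memory.heap.used']}).json()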