#!/usr/bin/env python3
import argparse
import sys
import traceback
from datetime import datetime

import requests
from urllib3.exceptions import InsecureRequestWarning

from checker import nagios
from checker import print_icinga2_check_status
from checker.http import fetch_with_retry
from checker.linuxfabric.base import get_state
from checker.units import human_readable_size


def transform_inputs(old_dict):
    # Flatten the per-node input states into a single dict keyed by input ID.
    new_dict = {}
    for key in old_dict:
        for item in old_dict[key]:
            new_key = item['id']
            new_dict[new_key] = item
    return new_dict


def parse_traffic_ts(ts: str):
    # Graylog reports traffic timestamps in UTC ('Z' suffix), so compare against UTC now.
    datetime_obj = datetime.strptime(ts, '%Y-%m-%dT%H:%M:%S.%fZ')
    current_time = datetime.utcnow()
    time_diff = current_time - datetime_obj
    return time_diff.total_seconds() < 24 * 60 * 60  # less than 24 hrs ago?


def main():
    parser = argparse.ArgumentParser(description='Check Graylog input health')
    parser.add_argument('-u', '--url', required=True, help='The base Graylog URL')
    parser.add_argument('-t', '--token', required=True, help='Graylog API token')
    parser.add_argument('-i', '--input', help='Input ID to check. If unset, will check cluster metrics')
    parser.add_argument('--warn-mem', type=int, default=75, help='Percentage of JVM memory used for warning')
    parser.add_argument('--crit-mem', type=int, default=100, help='Percentage of JVM memory used for critical')
    # --insecure stores False, so args.insecure can be passed directly as the "verify" argument of requests.
    parser.add_argument('--insecure', action='store_false', help="Don't verify SSL")
    parser.add_argument('--crit-notif', action='store_true', help='Return critical when there are notifications')
    parser.add_argument('--ignore-update-notif', action='store_true', help='Ignore any update notifications')
    parser.add_argument('--html', action='store_true', help='Print HTML')
    parser.add_argument('--cluster-metrics', action='store_true', help='Also gather cluster metrics and check for notifications')
    args = parser.parse_args()

    base_url = args.url.strip('/')

    if not args.input:
        args.cluster_metrics = True

    if not args.insecure:
        requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning)

    headers = {
        'Accept': 'application/json',
        'X-Requested-By': 'XMLHttpRequest',
    }

    text_result = ''
    metrics_json = {
        'metrics': [
            'org.graylog2.throughput.input.1-sec-rate',
            'org.graylog2.throughput.output.1-sec-rate',
            'org.graylog2.journal.append.1-sec-rate',
            'org.graylog2.journal.read.1-sec-rate',
            'org.graylog2.journal.segments',
            'org.graylog2.journal.entries-uncommitted',
            'jvm.memory.heap.used',
            'jvm.memory.heap.committed',
            'jvm.memory.heap.max',
        ],
    }

    if args.input:
        input_data = transform_inputs(
            fetch_with_retry(f'{base_url}/api/cluster/inputstates', headers=headers, auth=(args.token, 'token'),
                             verify=args.insecure).json()).get(args.input, {})

        # Get it over with
        if bool(input_data) and input_data.get('state') == 'RUNNING':
            input_name = input_data['message_input']['title']
            text_result = f'Graylog input "{input_name}" is running.'
        else:
            input_name = args.input
            text_result = f'Graylog input "{input_name}" is not running!'
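            # A stopped or missing input is reported immediately as CRITICAL and the check exits.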
            print_icinga2_check_status(text_result, nagios.STATE_CRIT)
            sys.exit(nagios.STATE_CRIT)

        # If the input is running, continue gathering metrics and other health checks
        input_name_clean = input_name.lower().replace(' ', '_').replace('-', '_')
        input_type = input_data['message_input']['type']
        metrics_json['metrics'] = metrics_json['metrics'] + [
            f'{input_type}.{args.input}.incomingMessages',
            f'{input_type}.{args.input}.open_connections',
            f'{input_type}.{args.input}.total_connections',
            f'{input_type}.{args.input}.written_bytes_1sec',
            f'{input_type}.{args.input}.written_bytes_total',
            f'{input_type}.{args.input}.read_bytes_1sec',
            f'{input_type}.{args.input}.read_bytes_total',
        ]

    r = fetch_with_retry(f'{base_url}/api/cluster/metrics/multiple', method='post', headers=headers,
                         auth=(args.token, 'token'), verify=args.insecure, json=metrics_json).json()
    # Metrics are reported per node; use the first node in the response.
    input_metrics = r[list(r.keys())[0]]['metrics']

    # Format the metrics for later
    metrics_data = {}
    for metric in input_metrics:
        if args.input:
            name = metric['full_name'].replace(input_type, '').replace('org.graylog2.', '').replace(args.input, '')
        else:
            name = metric['full_name'].replace('org.graylog2.', '')
        name = name.strip('.').replace('-', '_').replace('.', '_')
        value = None
        if 'value' in metric['metric']:
            value = metric['metric']['value']
        elif 'count' in metric['metric']:
            value = metric['metric']['count']
        elif 'rate' in metric['metric']:
            value = metric['metric']['rate']['one_minute']
            name = f'{name}_one_minute'
        value = int(value)
        metrics_data[name] = value

    perfdata = {}
    if args.input:
        # Some metric names are changed for better readability
        perfdata.update({
            f'{input_name_clean}_incoming_messages_rate_per_sec_1min': {
                'value': metrics_data['incomingMessages_one_minute'],
                'min': 0,
            },
            f'{input_name_clean}_connections': {
                'value': metrics_data['open_connections'],
                'min': 0,
            },
            f'{input_name_clean}_network_out_1sec': {
                'value': metrics_data['written_bytes_1sec'],
                'min': 0,
                'unit': 'B',
            },
            f'{input_name_clean}_network_out_total': {
                'value': metrics_data['written_bytes_total'],
                'min': 0,
                'unit': 'B',
            },
            f'{input_name_clean}_network_in_1sec': {
                'value': metrics_data['read_bytes_1sec'],
                'min': 0,
                'unit': 'B',
            },
            f'{input_name_clean}_network_in_total': {
                'value': metrics_data['read_bytes_total'],
                'min': 0,
                'unit': 'B',
            },
        })

    if args.cluster_metrics:
        # The warn/crit thresholds are a percentage of the maximum heap size.
        jvm_mem_usage_warn = int(metrics_data['jvm_memory_heap_max'] * args.warn_mem / 100)
        jvm_mem_usage_crit = int(metrics_data['jvm_memory_heap_max'] * args.crit_mem / 100)

        # Get traffic data for last 24 hrs
        traffic_last_24_hrs = fetch_with_retry(f'{base_url}/api/system/cluster/traffic?daily=false', headers=headers,
                                               auth=(args.token, 'token'), verify=args.insecure).json()
        input_traffic_avg = sum([v for k, v in traffic_last_24_hrs['input'].items() if parse_traffic_ts(k)])
        output_traffic_avg = sum([v for k, v in traffic_last_24_hrs['output'].items() if parse_traffic_ts(k)])

        elasticsearch_health = fetch_with_retry(f'{base_url}/api/system/indexer/cluster/health', headers=headers,
                                                auth=(args.token, 'token'), verify=args.insecure).json()
        elasticsearch_status = elasticsearch_health['status'].lower()
        elasticsearch_active_shards = elasticsearch_health['shards']['active']

        indexer_failures = fetch_with_retry(f'{base_url}/api/system/indexer/failures?limit=10&offset=0', headers=headers,
                                            auth=(args.token, 'token'), verify=args.insecure).json()

        perfdata.update({
            'throughput_input_1_sec_rate': {
                'value': int(metrics_data['throughput_input_1_sec_rate']),
                'min': 0,
            },
            'throughput_output_1_sec_rate': {
                'value': int(metrics_data['throughput_output_1_sec_rate']),
                'min': 0,
            },
            'entries_uncommitted': {
                'value': metrics_data['journal_entries_uncommitted'],
                'min': 0,
            },
            'jvm_memory_used': {
                'value': metrics_data['jvm_memory_heap_used'],
                'min': 0,
                'warn': jvm_mem_usage_warn,
                'crit': jvm_mem_usage_crit,
                'max': int(metrics_data['jvm_memory_heap_max']),
                'unit': 'B',
            },
            'network_traffic_in_avg': {
                'value': input_traffic_avg,
                'min': 0,
                'unit': 'B',
            },
            'to_elasticsearch_24hrs_avg': {
                'value': output_traffic_avg,
                'min': 0,
                'unit': 'B',
            },
            'elasticsearch_active_shards': {
                'value': elasticsearch_active_shards,
                'min': 0,
            },
            'indexer_failures': {
                'value': indexer_failures['total'],
                'warn': 1,
                'crit': 1,
                'min': 0,
            },
        })

    # Check for notifications
    if args.cluster_metrics:
        notifications_query = fetch_with_retry(f'{base_url}/api/system/notifications', headers=headers,
                                               auth=(args.token, 'token'), verify=args.insecure).json()
        notifications = []
        for notif in notifications_query['notifications']:
            if notif['type'] == 'outdated_version' and not args.ignore_update_notif:
                notifications.append(notif)
            elif notif['type'] != 'outdated_version':
                notifications.append(notif)

        if len(notifications):
            notif_word = 'notifications' if len(notifications) > 1 else 'notification'
            are = 'are' if len(notifications) > 1 else 'is'
            notif_str = f'There {are} {len(notifications)} {notif_word}.'
        else:
            notif_str = 'No notifications'

        if indexer_failures['total'] > 0:
            indexer_failures_exit = nagios.STATE_CRIT
            text_result += f' There are {indexer_failures["total"]} indexer failures!'
        else:
            indexer_failures_exit = nagios.STATE_OK

        # https://go2docs.graylog.org/5-0/setting_up_graylog/elasticsearch.htm#ClusterStatusExplained
        if elasticsearch_status == 'yellow':
            elasticsearch_exit_code = nagios.STATE_WARN
            text_result += ' Elasticsearch is condition YELLOW!'
        elif elasticsearch_status == 'red':
            elasticsearch_exit_code = nagios.STATE_CRIT
            text_result += ' Elasticsearch is condition RED!'
        elif elasticsearch_status == 'green':
            elasticsearch_exit_code = nagios.STATE_OK
        else:
            print_icinga2_check_status(f'unknown Elasticsearch health: {elasticsearch_status}', nagios.STATE_UNKNOWN)
            sys.exit(nagios.STATE_UNKNOWN)

        jvm_mem_usage_state = get_state(int(metrics_data['jvm_memory_heap_used']), jvm_mem_usage_warn, jvm_mem_usage_crit, operator='gt')
        if jvm_mem_usage_state != nagios.STATE_OK:
            text_result += ' JVM memory usage is high!'

        exit_code = max(nagios.STATE_OK, jvm_mem_usage_state, elasticsearch_exit_code, indexer_failures_exit)

        if len(notifications):
            text_result += f' There {are} {len(notifications)} {notif_word}!'
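            # With --crit-notif, any active notification forces the final state to CRITICAL.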
            if args.crit_notif:
                exit_code = nagios.STATE_CRIT  # force crit

        if args.input:
            # show less data
            text_result = text_result + f' JVM memory usage: {int((perfdata["jvm_memory_used"]["value"] / metrics_data["jvm_memory_heap_max"]) * 100)}%'
        else:
            # show more data
            text_result = text_result + f' JVM memory usage: {int((perfdata["jvm_memory_used"]["value"] / metrics_data["jvm_memory_heap_max"]) * 100)}%, throughput last 1 second: {human_readable_size(perfdata["throughput_input_1_sec_rate"]["value"])} in - {human_readable_size(perfdata["throughput_output_1_sec_rate"]["value"])} out, Elasticsearch active shards: {perfdata["elasticsearch_active_shards"]["value"]}'

    if args.input:
        text_result = text_result + f' {input_name_clean} events/second for last minute: {perfdata[f"{input_name_clean}_incoming_messages_rate_per_sec_1min"]["value"]}, {input_name_clean}_connections: {perfdata[f"{input_name_clean}_connections"]["value"]}, {input_name_clean}_network_in_total: {human_readable_size(perfdata[f"{input_name_clean}_network_in_total"]["value"], decimal_places=0)}'
        exit_code = nagios.STATE_OK
    else:
        text_result = text_result + '\n' + notif_str

    print_icinga2_check_status(text_result, exit_code, perfdata)
    sys.exit(exit_code)


if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        print(f'UNKNOWN: exception "{e}"')
        print(traceback.format_exc())
        sys.exit(nagios.STATE_UNKNOWN)
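
# Example invocations (the script name, URL, token, and input ID below are placeholders):
#   ./check_graylog.py --url https://graylog.example.com --token <api-token> --cluster-metrics
#   ./check_graylog.py --url https://graylog.example.com --token <api-token> --input <input-id> --insecure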