diff --git a/check_graylog.py b/check_graylog.py
index 42ca6a5..0429eeb 100755
--- a/check_graylog.py
+++ b/check_graylog.py
@@ -35,16 +35,18 @@ def main():
     parser = argparse.ArgumentParser(description='Check Graylog input health')
     parser.add_argument('-u', '--url', required=True, help='The base Graylog URL')
     parser.add_argument('-t', '--token', required=True, help='Graylog API token')
-    parser.add_argument('-i', '--input', required=True, help='Input ID to check')
+    parser.add_argument('-i', '--input', help='Input ID to check. If unset, will check cluster metrics')
     parser.add_argument('--warn-mem', type=int, default=75, help='Percentage of JVM memory used for warm')
     parser.add_argument('--crit-mem', type=int, default=100, help='Percentage of JVM memory used for critical')
     parser.add_argument('--insecure', action='store_false', help="Don't verify SSL")
     parser.add_argument('--crit-notif', action='store_true', help='Return critical when there are notifications')
     parser.add_argument('--html', action='store_true', help='Print HTML')
-    parser.add_argument('--cluster-metrics', action='store_true', help='Also gather cluster metrics and check for notifications.')
+    parser.add_argument('--cluster-metrics', action='store_true', help='Also gather cluster metrics and check for notifications')
     args = parser.parse_args()
 
     base_url = args.url.strip('/')
+    if not args.input:
+        args.cluster_metrics = True
 
     if not args.insecure:
         requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning)
@@ -54,38 +56,11 @@ def main():
         'X-Requested-By': 'XMLHttpRequest',
     }
 
-    # Get the basic input metadata
-    input_data = transform_inputs(fetch_with_retry(f'{base_url}/api/cluster/inputstates', headers=headers, auth=(args.token, 'token'), verify=args.insecure).json()).get(args.input, {})
-
-    # Get it over with
-    if bool(input_data) and input_data.get('state') == 'RUNNING':
-        input_name = input_data["message_input"]["title"]
-        text_result = f'Graylog input "{input_name}" is running.'
-    else:
-        input_name = args.input
-        if args.html:
-            text_result = f'Graylog input "{input_name}" is not running!'
-        else:
-            text_result = f'Graylog input "{input_name}" is not running!'
-        print_icinga2_check_status(text_result, nagios.STATE_CRIT)
-        sys.exit(nagios.STATE_CRIT)
-
-    # If the input is running, continue gathering metrics and other health checks
-    input_name_clean = input_name.lower().replace(' ', '_').replace('-', '_')
-
-    # Get metrics for the input
-    type = input_data['message_input']['type']
+    text_result = ''
     metrics_json = {
         'metrics': [
             'org.graylog2.throughput.input.1-sec-rate',
             'org.graylog2.throughput.output.1-sec-rate',
-            f'{type}.{args.input}.incomingMessages',
-            f'{type}.{args.input}.open_connections',
-            f'{type}.{args.input}.total_connections',
-            f'{type}.{args.input}.written_bytes_1sec',
-            f'{type}.{args.input}.written_bytes_total',
-            f'{type}.{args.input}.read_bytes_1sec',
-            f'{type}.{args.input}.read_bytes_total',
             "org.graylog2.journal.append.1-sec-rate",
             "org.graylog2.journal.read.1-sec-rate",
             "org.graylog2.journal.segments",
@@ -95,6 +70,21 @@ def main():
             "jvm.memory.heap.max"
         ],
     }
+
+    if args.input:
+        # Get the basic input metadata
+        input_data = transform_inputs(fetch_with_retry(f'{base_url}/api/cluster/inputstates', headers=headers, auth=(args.token, 'token'), verify=args.insecure).json()).get(args.input, {})
+        type = input_data['message_input']['type']
+        metrics_json['metrics'] = metrics_json['metrics'] + [
+            f'{type}.{args.input}.incomingMessages',
+            f'{type}.{args.input}.open_connections',
+            f'{type}.{args.input}.total_connections',
+            f'{type}.{args.input}.written_bytes_1sec',
+            f'{type}.{args.input}.written_bytes_total',
+            f'{type}.{args.input}.read_bytes_1sec',
+            f'{type}.{args.input}.read_bytes_total',
+        ]
+
     r = fetch_with_retry(f'{base_url}/api/cluster/metrics/multiple', method='post', headers=headers, auth=(args.token, 'token'), verify=args.insecure, json=metrics_json).json()
     input_metrics = r[list(r.keys())[0]]['metrics']
 
@@ -102,8 +92,11 @@ def main():
     # Format the metrics for later
     metrics_data = {}
     for metric in input_metrics:
-        name = metric['full_name'].replace(type, '').replace('org.graylog2.', '').replace(args.input, '').strip(
-            '.').replace('-', '_').replace('.', '_')
+        if args.input:
+            name = metric['full_name'].replace(type, '').replace('org.graylog2.', '').replace(args.input, '')
+        else:
+            name = metric['full_name'].replace('org.graylog2.', '')
+        name = name.strip('.').replace('-', '_').replace('.', '_')
         value = None
         if 'value' in metric['metric']:
            value = metric["metric"]["value"]
@@ -115,62 +108,81 @@ def main():
             value = int(value)
         metrics_data[name] = value
 
-    jvm_mem_usage_warn = int(metrics_data['jvm_memory_heap_max'] / int(100 / args.warn_mem))
-    jvm_mem_usage_crit = int(metrics_data['jvm_memory_heap_max'] / int(100 / args.crit_mem))
+    perfdata = {}
 
-    # Get traffic data for last 24 hrs
-    traffic_last_24_hrs = fetch_with_retry(f'{base_url}/api/system/cluster/traffic?daily=false', headers=headers, auth=(args.token, 'token'), verify=args.insecure).json()
-    input_traffic_avg = sum([v for k, v in traffic_last_24_hrs['input'].items() if parse_traffic_ts(k)])
-    output_traffic_avg = sum([v for k, v in traffic_last_24_hrs['output'].items() if parse_traffic_ts(k)])
+    if args.input:
+        # Get it over with
+        if bool(input_data) and input_data.get('state') == 'RUNNING':
+            input_name = input_data["message_input"]["title"]
+            text_result = f'Graylog input "{input_name}" is running.'
+        else:
+            input_name = args.input
+            if args.html:
+                text_result = f'Graylog input "{input_name}" is not running!'
+            else:
+                text_result = f'Graylog input "{input_name}" is not running!'
+            print_icinga2_check_status(text_result, nagios.STATE_CRIT)
+            sys.exit(nagios.STATE_CRIT)
 
-    elastisearch_health = fetch_with_retry(f'{base_url}/api/system/indexer/cluster/health', headers=headers, auth=(args.token, 'token'), verify=args.insecure).json()
-    elastisearch_status = elastisearch_health['status'].lower()
-    elastisearch_active_shards = elastisearch_health['shards']['active']
+        # If the input is running, continue gathering metrics and other health checks
+        input_name_clean = input_name.lower().replace(' ', '_').replace('-', '_')
 
-    indexer_failures = fetch_with_retry(f'{base_url}/api/system/indexer/failures?limit=10&offset=0', headers=headers, auth=(args.token, 'token'), verify=args.insecure).json()
-
-    # Some metric names are changed for better readability
-    perfdata = {
-        f'{input_name_clean}_throughput_input_1_sec_rate': {
-            'value': int(metrics_data['throughput_input_1_sec_rate']),
-            'min': 0,
-        },
-        f'{input_name_clean}_throughput_output_1_sec_rate': {
-            'value': int(metrics_data['throughput_output_1_sec_rate']),
-            'min': 0,
-        },
-        f'{input_name_clean}_incoming_messages_rate_per_sec_one_minute': {
-            'value': metrics_data['incomingMessages_one_minute'],
-            'min': 0,
-        },
-        f'{input_name_clean}_connections': {
-            'value': metrics_data['open_connections'],
-            'min': 0,
-        },
-        f'{input_name_clean}_network_out_total_1sec': {
-            'value': metrics_data['written_bytes_1sec'],
-            'min': 0,
-            'unit': 'B',
-        },
-        f'{input_name_clean}_network_out_total_total': {
-            'value': metrics_data['written_bytes_total'],
-            'min': 0,
-            'unit': 'B',
-        },
-        f'{input_name_clean}_network_in_1sec': {
-            'value': metrics_data['read_bytes_1sec'],
-            'min': 0,
-            'unit': 'B',
-        },
-        f'{input_name_clean}_network_in_total': {
-            'value': metrics_data['read_bytes_total'],
-            'min': 0,
-            'unit': 'B',
-        }
-    }
+        # Some metric names are changed for better readability
+        perfdata.update({
+            f'{input_name_clean}_incoming_messages_rate_per_sec_one_minute': {
+                'value': metrics_data['incomingMessages_one_minute'],
+                'min': 0,
+            },
+            f'{input_name_clean}_connections': {
+                'value': metrics_data['open_connections'],
+                'min': 0,
+            },
+            f'{input_name_clean}_network_out_total_1sec': {
+                'value': metrics_data['written_bytes_1sec'],
+                'min': 0,
+                'unit': 'B',
+            },
+            f'{input_name_clean}_network_out_total_total': {
+                'value': metrics_data['written_bytes_total'],
+                'min': 0,
+                'unit': 'B',
+            },
+            f'{input_name_clean}_network_in_1sec': {
+                'value': metrics_data['read_bytes_1sec'],
+                'min': 0,
+                'unit': 'B',
+            },
+            f'{input_name_clean}_network_in_total': {
+                'value': metrics_data['read_bytes_total'],
+                'min': 0,
+                'unit': 'B',
+            }
+        })
 
     if args.cluster_metrics:
+        jvm_mem_usage_warn = int(metrics_data['jvm_memory_heap_max'] / int(100 / args.warn_mem))
+        jvm_mem_usage_crit = int(metrics_data['jvm_memory_heap_max'] / int(100 / args.crit_mem))
+
+        # Get traffic data for last 24 hrs
+        traffic_last_24_hrs = fetch_with_retry(f'{base_url}/api/system/cluster/traffic?daily=false', headers=headers, auth=(args.token, 'token'), verify=args.insecure).json()
+        input_traffic_avg = sum([v for k, v in traffic_last_24_hrs['input'].items() if parse_traffic_ts(k)])
+        output_traffic_avg = sum([v for k, v in traffic_last_24_hrs['output'].items() if parse_traffic_ts(k)])
+
+        elastisearch_health = fetch_with_retry(f'{base_url}/api/system/indexer/cluster/health', headers=headers, auth=(args.token, 'token'), verify=args.insecure).json()
+        elastisearch_status = elastisearch_health['status'].lower()
+        elastisearch_active_shards = elastisearch_health['shards']['active']
+
+        indexer_failures = fetch_with_retry(f'{base_url}/api/system/indexer/failures?limit=10&offset=0', headers=headers, auth=(args.token, 'token'), verify=args.insecure).json()
+
         perfdata.update({
+            'throughput_input_1_sec_rate': {
+                'value': int(metrics_data['throughput_input_1_sec_rate']),
+                'min': 0,
+            },
+            'throughput_output_1_sec_rate': {
+                'value': int(metrics_data['throughput_output_1_sec_rate']),
+                'min': 0,
+            },
             'entries_uncommitted': {
                 'value': metrics_data['journal_entries_uncommitted'],
                 'min': 0,
@@ -183,7 +195,7 @@ def main():
                 'max': int(metrics_data['jvm_memory_heap_max']),
                 'unit': 'B',
             },
-            'from_network_traffic_avg': {
+            'network_traffic_in_avg': {
                 'value': input_traffic_avg,
                 'min': 0,
                 'unit': 'B',
@@ -246,10 +258,13 @@ def main():
 
         if args.crit_notif:
             exit_code = nagios.STATE_CRIT  # force crit
-        text_result = text_result + f' JVM memory usage: {int((perfdata["jvm_memory_used"]["value"] / metrics_data["jvm_memory_heap_max"]) * 100)}%, input incoming rate (events/second for last minute): {perfdata[f"{input_name_clean}_incoming_messages_rate_per_sec_one_minute"]["value"]}, input connections: {perfdata[f"{input_name_clean}_connections"]["value"]}, input total network in: {human_readable_size(perfdata[f"{input_name_clean}_network_in_total"]["value"], decimal_places=0)}' + '\n' + notif_str
-    else:
-        text_result = text_result + f' {input_name_clean}_incoming_messages_rate_per_sec_one_minute (events/second for last minute): {perfdata[f"{input_name_clean}_incoming_messages_rate_per_sec_one_minute"]["value"]}, {input_name_clean}_connections: {perfdata[f"{input_name_clean}_connections"]["value"]}, {input_name_clean}_network_in_total: {human_readable_size(perfdata[f"{input_name_clean}_network_in_total"]["value"], decimal_places=0)}' + '\n'
+        text_result = text_result + f' JVM memory usage: {int((perfdata["jvm_memory_used"]["value"] / metrics_data["jvm_memory_heap_max"]) * 100)}%'
+
+    if args.input:
+        text_result = text_result + f' {input_name_clean} events/second for last minute: {perfdata[f"{input_name_clean}_incoming_messages_rate_per_sec_one_minute"]["value"]}, {input_name_clean}_connections: {perfdata[f"{input_name_clean}_connections"]["value"]}, {input_name_clean}_network_in_total: {human_readable_size(perfdata[f"{input_name_clean}_network_in_total"]["value"], decimal_places=0)}'
         exit_code = nagios.STATE_OK
+    else:
+        text_result = text_result + '\n' + notif_str
 
     print_icinga2_check_status(text_result, exit_code, perfdata)
     sys.exit(exit_code)
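For reference, the core behavioral change is the branch added to the metric-name normalization: with `-i`/`--input` set, the input's type and ID are stripped from each metric `full_name`; without it, only the `org.graylog2.` prefix is removed. A minimal standalone sketch of that logic follows — the input type and ID below are illustrative placeholders, not values taken from the patch:

import re  # not required; shown plain-string style to mirror the patch

# Sketch of the normalization performed inside `for metric in input_metrics:`.
def clean_name(full_name, input_id=None, input_type=None):
    if input_id:
        # Input mode: strip the input's type class and ID from the metric name.
        name = full_name.replace(input_type, '').replace('org.graylog2.', '').replace(input_id, '')
    else:
        # Cluster mode: only the common prefix is stripped.
        name = full_name.replace('org.graylog2.', '')
    return name.strip('.').replace('-', '_').replace('.', '_')

# Cluster mode: prints 'journal_append_1_sec_rate'
print(clean_name('org.graylog2.journal.append.1-sec-rate'))

# Input mode (hypothetical GELF TCP input with ID 'abc123'): prints 'incomingMessages'
print(clean_name('org.graylog2.inputs.gelf.tcp.GELFTCPInput.abc123.incomingMessages',
                 input_id='abc123', input_type='org.graylog2.inputs.gelf.tcp.GELFTCPInput'))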