#!/usr/bin/env python3
"""Icinga2/Nagios check for the health of a single Graylog input.

Queries the Graylog REST API for the state of one input, a set of
throughput/journal/JVM metrics and the pending system notifications, then
prints a status line with perfdata and exits with the matching state code.
"""
import argparse
import sys
import traceback

import requests
from urllib3.exceptions import InsecureRequestWarning

import checker.nagios
from checker import print_icinga2_check_status
from checker.linuxfabric.base import get_state
from checker.units import human_readable_size


def _percent_of(total, percent):
    """Return ``percent`` percent of ``total``, truncated to an int."""
    # Multiply before dividing: the previous ``total / int(100 / percent)``
    # truncated the divisor, so e.g. 75% silently became 100%, and any
    # percentage above 100 (or 0) raised ZeroDivisionError.
    return int(total * percent / 100)


def _normalise_metrics(raw_metrics, input_type, input_id):
    """Flatten the API metric list into a ``{short_name: int}`` mapping.

    The input type, the ``org.graylog2.`` prefix and the input ID are removed
    from each ``full_name``; remaining dots and dashes become underscores.
    Metrics exposing a ``rate`` get the ``_one_minute`` suffix because that is
    the rate that is read. Metrics that carry none of ``value``, ``count`` or
    ``rate`` (or whose value is ``None``) are skipped instead of crashing.
    """
    metrics = {}
    for metric in raw_metrics:
        name = (
            metric['full_name']
            .replace(input_type, '')
            .replace('org.graylog2.', '')
            .replace(input_id, '')
            .strip('.')
            .replace('-', '_')
            .replace('.', '_')
        )
        payload = metric['metric']
        if 'value' in payload:
            value = payload['value']
        elif 'count' in payload:
            value = payload['count']
        elif 'rate' in payload:
            value = payload['rate']['one_minute']
            name = f'{name}_one_minute'
        else:
            continue
        if value is None:
            continue
        metrics[name] = int(value)
    return metrics


def _build_perfdata(metrics_data, jvm_warn, jvm_crit):
    """Build the perfdata mapping handed to ``print_icinga2_check_status``."""
    # Some metric names are changed for better readability
    return {
        'throughput_input_1_sec_rate': {
            'value': metrics_data['throughput_input_1_sec_rate'],
            'min': 0,
        },
        'throughput_output_1_sec_rate': {
            'value': metrics_data['throughput_output_1_sec_rate'],
            'min': 0,
        },
        'incoming_messages_rate_per_sec_one_minute': {
            'value': metrics_data['incomingMessages_one_minute'],
            'min': 0,
        },
        'connections': {
            'value': metrics_data['open_connections'],
            'min': 0,
        },
        'network_out_total_1sec': {
            'value': metrics_data['written_bytes_1sec'],
            'min': 0,
            'unit': 'B',
        },
        'network_out_total_total': {
            'value': metrics_data['written_bytes_total'],
            'min': 0,
            'unit': 'B',
        },
        'network_in_1sec': {
            'value': metrics_data['read_bytes_1sec'],
            'min': 0,
            'unit': 'B',
        },
        'network_in_total': {
            'value': metrics_data['read_bytes_total'],
            'min': 0,
            'unit': 'B',
        },
        'entries_uncommitted': {
            'value': metrics_data['journal_entries_uncommitted'],
            'min': 0,
        },
        'jvm_memory_used': {
            'value': metrics_data['jvm_memory_heap_used'],
            'min': 0,
            'warn': jvm_warn,
            'crit': jvm_crit,
            'max': metrics_data['jvm_memory_heap_max'],
            'unit': 'B',
        },
    }


def _format_notifications(notifications):
    """Render the notifications payload as the trailing text block."""
    if notifications['total'] > 0:
        return 'Notifications:' + ''.join(
            f'\n{notification["type"]}: {notification["description"]}'
            for notification in notifications['notifications']
        )
    return 'No notifications'


def main():
    """Run the check: parse arguments, query Graylog, print status and exit.

    Exits with 2 when an API request fails or the input is not running;
    otherwise with the JVM heap state, raised to critical when
    ``--crit-notif`` is set and notifications exist.
    """
    parser = argparse.ArgumentParser(description='Check Graylog input health')
    parser.add_argument('-u', '--url', required=True, help='Graylog API URL')
    parser.add_argument('-t', '--token', required=True, help='Graylog API token')
    parser.add_argument('-i', '--input', required=True, help='Input ID to check')
    parser.add_argument('--warn-mem', type=int, default=75, help='Percentage of JVM memory used for warn')
    parser.add_argument('--crit-mem', type=int, default=100, help='Percentage of JVM memory used for critical')
    # ``store_false``: the stored value is the *verify* flag, True by default
    # and False once ``--insecure`` is passed.
    parser.add_argument('--insecure', action='store_false', dest='verify_ssl', help="Don't verify SSL")
    parser.add_argument('--crit-notif', action='store_true', help='Return critical when there are notifications')
    args = parser.parse_args()

    if not args.verify_ssl:
        requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning)

    headers = {
        'Accept': 'application/json',
        'X-Requested-By': 'XMLHttpRequest',
    }
    auth = (args.token, 'token')

    try:
        state_response = requests.get(f'{args.url}/system/inputstates/{args.input}', headers=headers, auth=auth,
                                      verify=args.verify_ssl)
        state_response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f'CRITICAL - Unable to query Graylog API: {e}')
        sys.exit(2)

    input_data = state_response.json()
    input_type = input_data['message_input']['type']
    input_prefix = f'{input_type}.{args.input}'

    metrics_json = {
        'metrics': [
            'org.graylog2.throughput.input.1-sec-rate',
            'org.graylog2.throughput.output.1-sec-rate',
            f'{input_prefix}.incomingMessages',
            f'{input_prefix}.open_connections',
            f'{input_prefix}.total_connections',
            f'{input_prefix}.written_bytes_1sec',
            f'{input_prefix}.written_bytes_total',
            f'{input_prefix}.read_bytes_1sec',
            f'{input_prefix}.read_bytes_total',
            'org.graylog2.journal.append.1-sec-rate',
            'org.graylog2.journal.read.1-sec-rate',
            'org.graylog2.journal.segments',
            'org.graylog2.journal.entries-uncommitted',
            'jvm.memory.heap.used',
            'jvm.memory.heap.committed',
            'jvm.memory.heap.max',
        ],
    }

    # Tracked separately so the error path never prints the body of the
    # earlier input-state request when the POST itself fails.
    metrics_response = None
    try:
        metrics_response = requests.post(f'{args.url}/cluster/metrics/multiple', headers=headers, auth=auth,
                                         verify=args.verify_ssl, json=metrics_json)
        metrics_response.raise_for_status()
        # The reply is keyed per node; only the first entry is used.
        first_node = next(iter(metrics_response.json().values()))
        input_metrics = first_node['metrics']
    except requests.exceptions.RequestException as e:
        body = metrics_response.text if metrics_response is not None else ''
        print(f'CRITICAL - Unable to query Graylog API: {e}\n{body}')
        sys.exit(2)

    # Format the metrics
    metrics_data = _normalise_metrics(input_metrics, input_type, args.input)

    heap_max = metrics_data['jvm_memory_heap_max']
    jvm_mem_usage_warn = _percent_of(heap_max, args.warn_mem)
    jvm_mem_usage_crit = _percent_of(heap_max, args.crit_mem)

    perfdata = _build_perfdata(metrics_data, jvm_mem_usage_warn, jvm_mem_usage_crit)

    jvm_mem_usage_state = get_state(metrics_data['jvm_memory_heap_used'], jvm_mem_usage_warn, jvm_mem_usage_crit,
                                    operator='gt')

    try:
        notif_response = requests.get(f'{args.url}/system/notifications', headers=headers, auth=auth,
                                      verify=args.verify_ssl)
        notif_response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f'CRITICAL - Unable to query Graylog API: {e}')
        sys.exit(2)

    notifications = notif_response.json()
    notif_str = _format_notifications(notifications)

    input_title = input_data['message_input']['title']
    if input_data['state'] != 'RUNNING':
        # A stopped input is always critical; no perfdata is emitted.
        print(f'Input "{input_title}" is not running!')
        print(notif_str)
        sys.exit(checker.nagios.STATE_CRIT)
    text_result = f'Input "{input_title}" is running.'

    if jvm_mem_usage_state != checker.nagios.STATE_OK:
        text_result += ' JVM memory usage is high!'

    exit_code = max(checker.nagios.STATE_OK, jvm_mem_usage_state)

    if notifications['total'] > 0:
        text_result += ' There are notifications!'
        if args.crit_notif:
            exit_code = checker.nagios.STATE_CRIT

    heap_used_percent = int((perfdata['jvm_memory_used']['value'] / heap_max) * 100)
    text_result += (
        f' JVM memory usage: {heap_used_percent}%,'
        f' incoming rate (events/second for last minute): {perfdata["incoming_messages_rate_per_sec_one_minute"]["value"]},'
        f' connections: {perfdata["connections"]["value"]},'
        f' total network in: {human_readable_size(perfdata["network_in_total"]["value"], decimal_places=0)}'
        f'\n{notif_str}'
    )

    print_icinga2_check_status(text_result, exit_code, perfdata)
    sys.exit(exit_code)


if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        # Last-resort boundary: any unexpected failure is reported as UNKNOWN
        # with the traceback so the monitoring system shows the cause.
        print(f'UNKNOWN: exception "{e}"')
        print(traceback.format_exc())
        sys.exit(checker.nagios.STATE_UNKNOWN)