diff --git a/check_graylog.py b/check_graylog.py
index 0c55006..16079ea 100755
--- a/check_graylog.py
+++ b/check_graylog.py
@@ -41,7 +41,8 @@ def main():
     parser.add_argument('--insecure', action='store_false', help="Don't verify SSL")
     parser.add_argument('--crit-notif', action='store_true', help='Return critical when there are notifications')
     parser.add_argument('--html', action='store_true', help='Print HTML')
-    parser.add_argument('--cluster-metrics', action='store_true', help='Also gather cluster metrics and check for notifications')
+    parser.add_argument('--cluster-metrics', action='store_true',
+                        help='Also gather cluster metrics and check for notifications')
     args = parser.parse_args()

     base_url = args.url.strip('/')
@@ -72,8 +73,25 @@ def main():
     }

     if args.input:
-        # Get the basic input metadata
-        input_data = transform_inputs(fetch_with_retry(f'{base_url}/api/cluster/inputstates', headers=headers, auth=(args.token, 'token'), verify=args.insecure).json()).get(args.input, {})
+        input_data = transform_inputs(
+            fetch_with_retry(f'{base_url}/api/cluster/inputstates', headers=headers, auth=(args.token, 'token'),
+                             verify=args.insecure).json()).get(args.input, {})
+        # Get it over with
+        if bool(input_data) and input_data.get('state') == 'RUNNING':
+            input_name = input_data["message_input"]["title"]
+            text_result = f'Graylog input "{input_name}" is running.'
+        else:
+            input_name = args.input
+            if args.html:
+                text_result = f'Graylog input "{input_name}" is not running!'
+            else:
+                text_result = f'Graylog input "{input_name}" is not running!'
+            print_icinga2_check_status(text_result, nagios.STATE_CRIT)
+            sys.exit(nagios.STATE_CRIT)
+
+        # If the input is running, continue gathering metrics and other health checks
+        input_name_clean = input_name.lower().replace(' ', '_').replace('-', '_')
+
         type = input_data['message_input']['type']
         metrics_json['metrics'] = metrics_json['metrics'] + [
             f'{type}.{args.input}.incomingMessages',
@@ -85,7 +103,8 @@ def main():
             f'{type}.{args.input}.read_bytes_total',
         ]

-        r = fetch_with_retry(f'{base_url}/api/cluster/metrics/multiple', method='post', headers=headers, auth=(args.token, 'token'),
+        r = fetch_with_retry(f'{base_url}/api/cluster/metrics/multiple', method='post', headers=headers,
+                             auth=(args.token, 'token'),
                              verify=args.insecure, json=metrics_json).json()
         input_metrics = r[list(r.keys())[0]]['metrics']

@@ -111,22 +130,6 @@ def main():
     perfdata = {}

     if args.input:
-        # Get it over with
-        if bool(input_data) and input_data.get('state') == 'RUNNING':
-            input_name = input_data["message_input"]["title"]
-            text_result = f'Graylog input "{input_name}" is running.'
-        else:
-            input_name = args.input
-            if args.html:
-                text_result = f'Graylog input "{input_name}" is not running!'
-            else:
-                text_result = f'Graylog input "{input_name}" is not running!'
-            print_icinga2_check_status(text_result, nagios.STATE_CRIT)
-            sys.exit(nagios.STATE_CRIT)
-
-        # If the input is running, continue gathering metrics and other health checks
-        input_name_clean = input_name.lower().replace(' ', '_').replace('-', '_')
-
         # Some metric names are changed for better readability
         perfdata.update({
             f'{input_name_clean}_incoming_messages_rate_per_sec_1min': {
@@ -164,15 +167,18 @@ def main():
         jvm_mem_usage_crit = int(metrics_data['jvm_memory_heap_max'] / int(100 / args.crit_mem))

         # Get traffic data for last 24 hrs
-        traffic_last_24_hrs = fetch_with_retry(f'{base_url}/api/system/cluster/traffic?daily=false', headers=headers, auth=(args.token, 'token'), verify=args.insecure).json()
+        traffic_last_24_hrs = fetch_with_retry(f'{base_url}/api/system/cluster/traffic?daily=false', headers=headers,
+                                               auth=(args.token, 'token'), verify=args.insecure).json()
         input_traffic_avg = sum([v for k, v in traffic_last_24_hrs['input'].items() if parse_traffic_ts(k)])
         output_traffic_avg = sum([v for k, v in traffic_last_24_hrs['output'].items() if parse_traffic_ts(k)])

-        elasticsearch_health = fetch_with_retry(f'{base_url}/api/system/indexer/cluster/health', headers=headers, auth=(args.token, 'token'), verify=args.insecure).json()
+        elasticsearch_health = fetch_with_retry(f'{base_url}/api/system/indexer/cluster/health', headers=headers,
+                                                auth=(args.token, 'token'), verify=args.insecure).json()
         elasticsearch_status = elasticsearch_health['status'].lower()
         elasticsearch_active_shards = elasticsearch_health['shards']['active']

-        indexer_failures = fetch_with_retry(f'{base_url}/api/system/indexer/failures?limit=10&offset=0', headers=headers, auth=(args.token, 'token'), verify=args.insecure).json()
+        indexer_failures = fetch_with_retry(f'{base_url}/api/system/indexer/failures?limit=10&offset=0',
+                                            headers=headers, auth=(args.token, 'token'), verify=args.insecure).json()

         perfdata.update({
             'throughput_input_1_sec_rate': {
@@ -219,7 +225,8 @@ def main():

     # Check for notifications
     if args.cluster_metrics:
-        notifications = fetch_with_retry(f'{base_url}/api/system/notifications', headers=headers, auth=(args.token, 'token'), verify=args.insecure).json()
+        notifications = fetch_with_retry(f'{base_url}/api/system/notifications', headers=headers,
+                                         auth=(args.token, 'token'), verify=args.insecure).json()
         if notifications['total'] > 0:
             notif_str = 'Notifications:'
             for notification in notifications['notifications']:
@@ -249,7 +256,8 @@ def main():
         print_icinga2_check_status(f'unknown Elasticsearch health: {elasticsearch_status}', nagios.STATE_UNKNOWN)
         sys.exit(nagios.STATE_UNKNOWN)

-    jvm_mem_usage_state = get_state(int(metrics_data['jvm_memory_heap_used']), jvm_mem_usage_warn, jvm_mem_usage_crit, operator='gt')
+    jvm_mem_usage_state = get_state(int(metrics_data['jvm_memory_heap_used']), jvm_mem_usage_warn,
+                                    jvm_mem_usage_crit, operator='gt')
     if jvm_mem_usage_state != nagios.STATE_OK:
         text_result += f' JVM memory usage is high!'