icinga2-checks/check_graylog.py

#!/usr/bin/env python3

import argparse
import sys
import traceback
from datetime import datetime, timezone

import requests
from urllib3.exceptions import InsecureRequestWarning

from checker import nagios
from checker import print_icinga2_check_status
from checker.http import fetch_with_retry
from checker.linuxfabric.base import get_state
from checker.units import human_readable_size


def transform_inputs(old_dict):
    """Flatten a mapping of item lists into a single dict keyed by item ``'id'``.

    ``old_dict`` maps arbitrary keys to iterables of dicts that each carry an
    ``'id'`` entry. The outer keys are discarded. When the same ID occurs more
    than once, the item seen last (in iteration order) wins.
    """
    return {
        entry['id']: entry
        for group_key in old_dict
        for entry in old_dict[group_key]
    }


def parse_traffic_ts(ts: str) -> bool:
    """Return True when the UTC timestamp ``ts`` is less than 24 hours old.

    ``ts`` must look like ``2023-06-27T17:00:00.000Z``; the trailing ``Z``
    marks it as UTC. It is therefore compared against the current time in
    UTC rather than the host's local time, which would shift the 24-hour
    window by the local UTC offset.

    Raises:
        ValueError: if ``ts`` does not match the expected format.
    """
    stamp = datetime.strptime(ts, '%Y-%m-%dT%H:%M:%S.%fZ').replace(tzinfo=timezone.utc)
    age = datetime.now(timezone.utc) - stamp
    return age.total_seconds() < 24 * 60 * 60  # less than 24 hrs ago?


def main():
    """Run the Graylog check and exit with a Nagios/Icinga2 status code.

    Two modes exist and can be combined:

    * Input mode (``--input ID``): exits CRITICAL right away unless the input
      is in state ``RUNNING``; otherwise reports the input's message rate,
      open connections and byte counters as perfdata.
    * Cluster mode (``--cluster-metrics``, forced on when no input is given):
      evaluates JVM heap usage against ``--warn-mem``/``--crit-mem``, the
      indexer cluster health, indexer failures and system notifications.

    The function always terminates through ``sys.exit``; any exception raised
    on the way propagates to the caller.
    """
    parser = argparse.ArgumentParser(description='Check Graylog input health')
    parser.add_argument('-u', '--url', required=True, help='The base Graylog URL')
    parser.add_argument('-t', '--token', required=True, help='Graylog API token')
    parser.add_argument('-i', '--input', help='Input ID to check. If unset, will check cluster metrics')
    parser.add_argument('--warn-mem', type=int, default=75, help='Percentage of JVM memory used for warm')
    parser.add_argument('--crit-mem', type=int, default=100, help='Percentage of JVM memory used for critical')
    parser.add_argument('--insecure', action='store_false', help="Don't verify SSL")
    parser.add_argument('--crit-notif', action='store_true', help='Return critical when there are notifications')
    parser.add_argument('--ignore-update-notif', action='store_true', help='Ignore any update notifications')
    parser.add_argument('--html', action='store_true', help='Print HTML')
    parser.add_argument('--cluster-metrics', action='store_true', help='Also gather cluster metrics and check for notifications')
    args = parser.parse_args()

    base_url = args.url.strip('/')
    if not args.input:
        # With no input to inspect, the cluster check is the only thing left to do.
        args.cluster_metrics = True

    # --insecure is a store_false flag: args.insecure is True by default and
    # becomes False when the flag is passed, so it doubles as the `verify=` value.
    verify_tls = args.insecure
    if not verify_tls:
        requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning)

    headers = {
        'Accept': 'application/json',
        'X-Requested-By': 'XMLHttpRequest',
    }
    # The API token is sent as the basic-auth user with the literal password 'token'.
    auth = (args.token, 'token')

    text_result = ''
    # Start from OK; the cluster checks below may raise the state.
    exit_code = nagios.STATE_OK
    metric_names = [
        'org.graylog2.throughput.input.1-sec-rate',
        'org.graylog2.throughput.output.1-sec-rate',
        "org.graylog2.journal.append.1-sec-rate",
        "org.graylog2.journal.read.1-sec-rate",
        "org.graylog2.journal.segments",
        "org.graylog2.journal.entries-uncommitted",
        "jvm.memory.heap.used",
        "jvm.memory.heap.committed",
        "jvm.memory.heap.max",
    ]

    input_type = None
    input_name_clean = None
    if args.input:
        input_states = fetch_with_retry(f'{base_url}/api/cluster/inputstates', headers=headers, auth=auth,
                                        verify=verify_tls).json()
        input_data = transform_inputs(input_states).get(args.input, {})

        # Get it over with: an unknown or non-running input is an immediate CRITICAL.
        if not input_data or input_data.get('state') != 'RUNNING':
            input_name = args.input
            if args.html:
                text_result = f'Graylog input <a href="{base_url}/system/inputs" target="_blank">"{input_name}" is not running!</a>'
            else:
                text_result = f'Graylog input "{input_name}" is not running!'
            print_icinga2_check_status(text_result, nagios.STATE_CRIT)
            sys.exit(nagios.STATE_CRIT)

        # The input is running: continue gathering metrics and other health checks.
        input_name = input_data["message_input"]["title"]
        text_result = f'Graylog input "{input_name}" is running.'
        input_name_clean = input_name.lower().replace(' ', '_').replace('-', '_')

        input_type = input_data['message_input']['type']
        metric_names += [
            f'{input_type}.{args.input}.incomingMessages',
            f'{input_type}.{args.input}.open_connections',
            f'{input_type}.{args.input}.total_connections',
            f'{input_type}.{args.input}.written_bytes_1sec',
            f'{input_type}.{args.input}.written_bytes_total',
            f'{input_type}.{args.input}.read_bytes_1sec',
            f'{input_type}.{args.input}.read_bytes_total',
        ]

    r = fetch_with_retry(f'{base_url}/api/cluster/metrics/multiple', method='post', headers=headers,
                         auth=auth, verify=verify_tls, json={'metrics': metric_names}).json()
    # Only the first entry of the response mapping is evaluated
    # (presumably one entry per Graylog node -- TODO confirm).
    node_metrics = next(iter(r.values()))['metrics']

    # Format the metrics for later: shorten the names and reduce each metric to one int.
    metrics_data = {}
    for metric in node_metrics:
        if args.input:
            # Strip the input type first: it may itself contain the 'org.graylog2.' prefix.
            name = metric['full_name'].replace(input_type, '').replace('org.graylog2.', '').replace(args.input, '')
        else:
            name = metric['full_name'].replace('org.graylog2.', '')
        name = name.strip('.').replace('-', '_').replace('.', '_')

        payload = metric['metric']
        if 'value' in payload:
            value = payload['value']
        elif 'count' in payload:
            value = payload['count']
        elif 'rate' in payload:
            value = payload['rate']['one_minute']
            name = f'{name}_one_minute'
        else:
            # Unrecognised metric shape: skip it instead of crashing on int(None).
            # If the metric is actually needed, the lookup below raises KeyError.
            continue
        metrics_data[name] = int(value)

    perfdata = {}

    if args.input:
        # Some metric names are changed for better readability
        perfdata.update({
            f'{input_name_clean}_incoming_messages_rate_per_sec_1min': {
                'value': metrics_data['incomingMessages_one_minute'],
                'min': 0,
            },
            f'{input_name_clean}_connections': {
                'value': metrics_data['open_connections'],
                'min': 0,
            },
            f'{input_name_clean}_network_out_total_1sec': {
                'value': metrics_data['written_bytes_1sec'],
                'min': 0,
                'unit': 'B',
            },
            f'{input_name_clean}_network_out_total_total': {
                'value': metrics_data['written_bytes_total'],
                'min': 0,
                'unit': 'B',
            },
            f'{input_name_clean}_network_in_1sec': {
                'value': metrics_data['read_bytes_1sec'],
                'min': 0,
                'unit': 'B',
            },
            f'{input_name_clean}_network_in_total': {
                'value': metrics_data['read_bytes_total'],
                'min': 0,
                'unit': 'B',
            },
        })

    if args.cluster_metrics:
        heap_max = metrics_data['jvm_memory_heap_max']
        heap_used = metrics_data['jvm_memory_heap_used']
        # Thresholds are a percentage of the maximum heap. Multiply before dividing:
        # `heap_max / int(100 / pct)` truncates (75% became 100%) and divides by
        # zero for any percentage above 100.
        jvm_mem_usage_warn = int(heap_max * args.warn_mem / 100)
        jvm_mem_usage_crit = int(heap_max * args.crit_mem / 100)

        # Get traffic data for last 24 hrs
        traffic_last_24_hrs = fetch_with_retry(f'{base_url}/api/system/cluster/traffic?daily=false', headers=headers,
                                               auth=auth, verify=verify_tls).json()
        input_traffic_avg = sum(v for k, v in traffic_last_24_hrs['input'].items() if parse_traffic_ts(k))
        output_traffic_avg = sum(v for k, v in traffic_last_24_hrs['output'].items() if parse_traffic_ts(k))

        elasticsearch_health = fetch_with_retry(f'{base_url}/api/system/indexer/cluster/health', headers=headers,
                                                auth=auth, verify=verify_tls).json()
        elasticsearch_status = elasticsearch_health['status'].lower()
        elasticsearch_active_shards = elasticsearch_health['shards']['active']

        indexer_failures = fetch_with_retry(f'{base_url}/api/system/indexer/failures?limit=10&offset=0',
                                            headers=headers, auth=auth, verify=verify_tls).json()
        indexer_failure_count = indexer_failures['total']

        perfdata.update({
            'throughput_input_1_sec_rate': {
                'value': metrics_data['throughput_input_1_sec_rate'],
                'min': 0,
            },
            'throughput_output_1_sec_rate': {
                'value': metrics_data['throughput_output_1_sec_rate'],
                'min': 0,
            },
            'entries_uncommitted': {
                'value': metrics_data['journal_entries_uncommitted'],
                'min': 0,
            },
            'jvm_memory_used': {
                'value': heap_used,
                'min': 0,
                'warn': jvm_mem_usage_warn,
                'crit': jvm_mem_usage_crit,
                'max': heap_max,
                'unit': 'B',
            },
            'network_traffic_in_avg': {
                'value': input_traffic_avg,
                'min': 0,
                'unit': 'B',
            },
            'to_elasticsearch_24hrs_avg': {
                'value': output_traffic_avg,
                'min': 0,
                'unit': 'B',
            },
            'elasticsearch_active_shards': {
                'value': elasticsearch_active_shards,
                'min': 0,
            },
            'indexer_failures': {
                'value': indexer_failure_count,
                'warn': 1,
                'crit': 1,
                'min': 0,
            },
        })

        # Check for notifications
        notifications_query = fetch_with_retry(f'{base_url}/api/system/notifications', headers=headers,
                                               auth=auth, verify=verify_tls).json()
        # Update notices are dropped only when --ignore-update-notif is set.
        notifications = [
            entry for entry in notifications_query['notifications']
            if entry['type'] != 'outdated_version' or not args.ignore_update_notif
        ]
        notif_count = len(notifications)

        if notif_count:
            # Use the singular form for exactly one notification.
            notif = "notification" if notif_count == 1 else "notifications"
            are = "is" if notif_count == 1 else "are"
            if args.html:
                notif_str = f'<a href="{base_url}/system/overview" target="_blank">There {are} {notif_count} {notif}.</a>'
            else:
                notif_str = f'There {are} {notif_count} {notif}.'
        else:
            notif_str = 'No notifications'

        if indexer_failure_count > 0:
            indexer_failures_exit = nagios.STATE_CRIT
            if args.html:
                text_result += f' <a href="{base_url}/system/indices/failures" target="_blank">There are {indexer_failure_count} indexer failures!</a>'
            else:
                text_result += f' There are {indexer_failure_count} indexer failures!'
        else:
            indexer_failures_exit = nagios.STATE_OK

        # https://go2docs.graylog.org/5-0/setting_up_graylog/elasticsearch.htm#ClusterStatusExplained
        if elasticsearch_status == 'yellow':
            elasticsearch_exit_code = nagios.STATE_WARN
            text_result += ' Elasticsearch is condition YELLOW!'
        elif elasticsearch_status == 'red':
            elasticsearch_exit_code = nagios.STATE_CRIT
            text_result += ' Elasticsearch is condition RED!'
        elif elasticsearch_status == 'green':
            elasticsearch_exit_code = nagios.STATE_OK
        else:
            print_icinga2_check_status(f'unknown Elasticsearch health: {elasticsearch_status}', nagios.STATE_UNKNOWN)
            sys.exit(nagios.STATE_UNKNOWN)

        jvm_mem_usage_state = get_state(heap_used, jvm_mem_usage_warn, jvm_mem_usage_crit, operator='gt')
        if jvm_mem_usage_state != nagios.STATE_OK:
            text_result += ' JVM memory usage is high!'

        # The overall state is the worst of the individual cluster checks.
        exit_code = max(nagios.STATE_OK, jvm_mem_usage_state, elasticsearch_exit_code, indexer_failures_exit)

        if notif_count:
            text_result += f' There {are} {notif_count} {notif}!'
            if args.crit_notif:
                exit_code = nagios.STATE_CRIT  # force crit

        jvm_mem_percent = int((heap_used / heap_max) * 100)
        if args.input:
            # show less data
            text_result += f' JVM memory usage: {jvm_mem_percent}%'
        else:
            # show more data
            throughput_in = human_readable_size(metrics_data['throughput_input_1_sec_rate'])
            throughput_out = human_readable_size(metrics_data['throughput_output_1_sec_rate'])
            text_result += (
                f' JVM memory usage: {jvm_mem_percent}%, throughput last 1 second: {throughput_in} in'
                f' - {throughput_out} out, Elasticsearch active shards: {elasticsearch_active_shards}'
            )

    if args.input:
        events_per_sec = metrics_data['incomingMessages_one_minute']
        open_connections = metrics_data['open_connections']
        read_total = human_readable_size(metrics_data['read_bytes_total'], decimal_places=0)
        text_result += (
            f' {input_name_clean} events/second for last minute: {events_per_sec},'
            f' {input_name_clean}_connections: {open_connections},'
            f' {input_name_clean}_network_in_total: {read_total}'
        )
        # exit_code is deliberately not reset here: when --cluster-metrics is
        # combined with --input, the cluster state computed above must survive.
    else:
        # No input implies cluster mode, so notif_str is always defined here.
        text_result = text_result + '\n' + notif_str

    print_icinga2_check_status(text_result, exit_code, perfdata)
    sys.exit(exit_code)


if __name__ == "__main__":
    # Script entry point. sys.exit() inside main() raises SystemExit, which is
    # not an Exception subclass and therefore passes through untouched; every
    # other failure is reported on stdout and mapped to the UNKNOWN state.
    try:
        main()
    except Exception as exc:
        print(f'UNKNOWN: exception "{exc}"', traceback.format_exc(), sep='\n')
        sys.exit(nagios.STATE_UNKNOWN)
add check_greylog.py 2023-06-27 13:13:52 -06:00			`#!/usr/bin/env python3`

			`import argparse`
			`import sys`
			`import traceback`
add method to get_with_retry() check_graylog: add more healthchecks and perfdata 2023-06-27 17:56:40 -06:00			`from datetime import datetime`
add check_greylog.py 2023-06-27 13:13:52 -06:00
			`import requests`
			`from urllib3.exceptions import InsecureRequestWarning`

add method to get_with_retry() check_graylog: add more healthchecks and perfdata 2023-06-27 17:56:40 -06:00			`from checker import nagios`
add check_greylog.py 2023-06-27 13:13:52 -06:00			`from checker import print_icinga2_check_status`
add method to get_with_retry() check_graylog: add more healthchecks and perfdata 2023-06-27 17:56:40 -06:00			`from checker.http import fetch_with_retry`
add check_greylog.py 2023-06-27 13:13:52 -06:00			`from checker.linuxfabric.base import get_state`
check_greylog: alert notifications, better metric names, better filesizes 2023-06-27 14:32:10 -06:00			`from checker.units import human_readable_size`
add check_greylog.py 2023-06-27 13:13:52 -06:00

add method to get_with_retry() check_graylog: add more healthchecks and perfdata 2023-06-27 17:56:40 -06:00			`def transform_inputs(old_dict):`
			`new_dict = {}`
			`for key in old_dict:`
			`for item in old_dict[key]:`
			`new_key = item['id']`
			`new_dict[new_key] = item`
			`return new_dict`


			`def parse_traffic_ts(ts: str):`
			`datetime_obj = datetime.strptime(ts, '%Y-%m-%dT%H:%M:%S.%fZ')`
			`current_time = datetime.now()`
			`time_diff = current_time - datetime_obj`
			`return time_diff.total_seconds() < 24 * 60 * 60 # less than 24 hrs ago?`


add check_greylog.py 2023-06-27 13:13:52 -06:00			`def main():`
			`parser = argparse.ArgumentParser(description='Check Graylog input health')`
add method to get_with_retry() check_graylog: add more healthchecks and perfdata 2023-06-27 17:56:40 -06:00			`parser.add_argument('-u', '--url', required=True, help='The base Graylog URL')`
add check_greylog.py 2023-06-27 13:13:52 -06:00			`parser.add_argument('-t', '--token', required=True, help='Graylog API token')`
check_graylog: split input and cluster metrics 2023-06-27 18:16:25 -06:00			`parser.add_argument('-i', '--input', help='Input ID to check. If unset, will check cluster metrics')`
add check_greylog.py 2023-06-27 13:13:52 -06:00			`parser.add_argument('--warn-mem', type=int, default=75, help='Percentage of JVM memory used for warm')`
			`parser.add_argument('--crit-mem', type=int, default=100, help='Percentage of JVM memory used for critical')`
			`parser.add_argument('--insecure', action='store_false', help="Don't verify SSL")`
check_greylog: alert notifications, better metric names, better filesizes 2023-06-27 14:32:10 -06:00			`parser.add_argument('--crit-notif', action='store_true', help='Return critical when there are notifications')`
check_graylog: option to ignore update notifications 2023-07-25 10:22:55 -06:00			`parser.add_argument('--ignore-update-notif', action='store_true', help='Ignore any update notifications')`
add method to get_with_retry() check_graylog: add more healthchecks and perfdata 2023-06-27 17:56:40 -06:00			`parser.add_argument('--html', action='store_true', help='Print HTML')`
add check_graylog_index_size, fix other stuff 2024-04-09 21:23:07 -06:00			`parser.add_argument('--cluster-metrics', action='store_true', help='Also gather cluster metrics and check for notifications')`
add check_greylog.py 2023-06-27 13:13:52 -06:00			`args = parser.parse_args()`

add method to get_with_retry() check_graylog: add more healthchecks and perfdata 2023-06-27 17:56:40 -06:00			`base_url = args.url.strip('/')`
check_graylog: split input and cluster metrics 2023-06-27 18:16:25 -06:00			`if not args.input:`
			`args.cluster_metrics = True`
add method to get_with_retry() check_graylog: add more healthchecks and perfdata 2023-06-27 17:56:40 -06:00
add check_greylog.py 2023-06-27 13:13:52 -06:00			`if not args.insecure:`
			`requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning)`

			`headers = {`
			`'Accept': 'application/json',`
			`'X-Requested-By': 'XMLHttpRequest',`
			`}`

check_graylog: split input and cluster metrics 2023-06-27 18:16:25 -06:00			`text_result = ''`
add method to get_with_retry() check_graylog: add more healthchecks and perfdata 2023-06-27 17:56:40 -06:00			`metrics_json = {`
			`'metrics': [`
			`'org.graylog2.throughput.input.1-sec-rate',`
			`'org.graylog2.throughput.output.1-sec-rate',`
			`"org.graylog2.journal.append.1-sec-rate",`
			`"org.graylog2.journal.read.1-sec-rate",`
			`"org.graylog2.journal.segments",`
			`"org.graylog2.journal.entries-uncommitted",`
			`"jvm.memory.heap.used",`
			`"jvm.memory.heap.committed",`
			`"jvm.memory.heap.max"`
			`],`
			`}`
check_graylog: split input and cluster metrics 2023-06-27 18:16:25 -06:00
			`if args.input:`
check_graylog: fix 2023-06-27 18:28:49 -06:00			`input_data = transform_inputs(`
			`fetch_with_retry(f'{base_url}/api/cluster/inputstates', headers=headers, auth=(args.token, 'token'),`
			`verify=args.insecure).json()).get(args.input, {})`
			`# Get it over with`
			`if bool(input_data) and input_data.get('state') == 'RUNNING':`
			`input_name = input_data["message_input"]["title"]`
			`text_result = f'Graylog input "{input_name}" is running.'`
			`else:`
			`input_name = args.input`
			`if args.html:`
			`text_result = f'Graylog input <a href="{base_url}/system/inputs" target="_blank">"{input_name}" is not running!</a>'`
			`else:`
			`text_result = f'Graylog input "{input_name}" is not running!'`
			`print_icinga2_check_status(text_result, nagios.STATE_CRIT)`
			`sys.exit(nagios.STATE_CRIT)`

			`# If the input is running, continue gathering metrics and other health checks`
			`input_name_clean = input_name.lower().replace(' ', '_').replace('-', '_')`

check_graylog: split input and cluster metrics 2023-06-27 18:16:25 -06:00			`type = input_data['message_input']['type']`
			`metrics_json['metrics'] = metrics_json['metrics'] + [`
			`f'{type}.{args.input}.incomingMessages',`
			`f'{type}.{args.input}.open_connections',`
			`f'{type}.{args.input}.total_connections',`
			`f'{type}.{args.input}.written_bytes_1sec',`
			`f'{type}.{args.input}.written_bytes_total',`
			`f'{type}.{args.input}.read_bytes_1sec',`
			`f'{type}.{args.input}.read_bytes_total',`
			`]`

check_graylog: fix 2023-06-27 18:28:49 -06:00			`r = fetch_with_retry(f'{base_url}/api/cluster/metrics/multiple', method='post', headers=headers,`
			`auth=(args.token, 'token'),`
add method to get_with_retry() check_graylog: add more healthchecks and perfdata 2023-06-27 17:56:40 -06:00			`verify=args.insecure, json=metrics_json).json()`
			`input_metrics = r[list(r.keys())[0]]['metrics']`
add check_greylog.py 2023-06-27 13:13:52 -06:00
add method to get_with_retry() check_graylog: add more healthchecks and perfdata 2023-06-27 17:56:40 -06:00			`# Format the metrics for later`
add check_greylog.py 2023-06-27 13:13:52 -06:00			`metrics_data = {}`
			`for metric in input_metrics:`
check_graylog: split input and cluster metrics 2023-06-27 18:16:25 -06:00			`if args.input:`
			`name = metric['full_name'].replace(type, '').replace('org.graylog2.', '').replace(args.input, '')`
			`else:`
			`name = metric['full_name'].replace('org.graylog2.', '')`
			`name = name.strip('.').replace('-', '_').replace('.', '_')`
add check_greylog.py 2023-06-27 13:13:52 -06:00			`value = None`
			`if 'value' in metric['metric']:`
			`value = metric["metric"]["value"]`
			`elif 'count' in metric['metric']:`
			`value = metric["metric"]["count"]`
			`elif 'rate' in metric['metric']:`
			`value = metric["metric"]["rate"]["one_minute"]`
			`name = f'{name}_one_minute'`
			`value = int(value)`
			`metrics_data[name] = value`

check_graylog: split input and cluster metrics 2023-06-27 18:16:25 -06:00			`perfdata = {}`

			`if args.input:`
			`# Some metric names are changed for better readability`
			`perfdata.update({`
check_graylog: update perfdata name 2023-06-27 18:18:06 -06:00			`f'{input_name_clean}_incoming_messages_rate_per_sec_1min': {`
check_graylog: split input and cluster metrics 2023-06-27 18:16:25 -06:00			`'value': metrics_data['incomingMessages_one_minute'],`
			`'min': 0,`
			`},`
			`f'{input_name_clean}_connections': {`
			`'value': metrics_data['open_connections'],`
			`'min': 0,`
			`},`
			`f'{input_name_clean}_network_out_total_1sec': {`
			`'value': metrics_data['written_bytes_1sec'],`
			`'min': 0,`
			`'unit': 'B',`
			`},`
			`f'{input_name_clean}_network_out_total_total': {`
			`'value': metrics_data['written_bytes_total'],`
			`'min': 0,`
			`'unit': 'B',`
			`},`
			`f'{input_name_clean}_network_in_1sec': {`
			`'value': metrics_data['read_bytes_1sec'],`
			`'min': 0,`
			`'unit': 'B',`
			`},`
			`f'{input_name_clean}_network_in_total': {`
			`'value': metrics_data['read_bytes_total'],`
			`'min': 0,`
			`'unit': 'B',`
			`}`
			`})`
add check_greylog.py 2023-06-27 13:13:52 -06:00
add method to get_with_retry() check_graylog: add more healthchecks and perfdata 2023-06-27 17:56:40 -06:00			`if args.cluster_metrics:`
check_graylog: split input and cluster metrics 2023-06-27 18:16:25 -06:00			`jvm_mem_usage_warn = int(metrics_data['jvm_memory_heap_max'] / int(100 / args.warn_mem))`
			`jvm_mem_usage_crit = int(metrics_data['jvm_memory_heap_max'] / int(100 / args.crit_mem))`

			`# Get traffic data for last 24 hrs`
check_graylog: fix 2023-06-27 18:28:49 -06:00			`traffic_last_24_hrs = fetch_with_retry(f'{base_url}/api/system/cluster/traffic?daily=false', headers=headers,`
			`auth=(args.token, 'token'), verify=args.insecure).json()`
check_graylog: split input and cluster metrics 2023-06-27 18:16:25 -06:00			`input_traffic_avg = sum([v for k, v in traffic_last_24_hrs['input'].items() if parse_traffic_ts(k)])`
			`output_traffic_avg = sum([v for k, v in traffic_last_24_hrs['output'].items() if parse_traffic_ts(k)])`

check_graylog: fix 2023-06-27 18:28:49 -06:00			`elasticsearch_health = fetch_with_retry(f'{base_url}/api/system/indexer/cluster/health', headers=headers,`
			`auth=(args.token, 'token'), verify=args.insecure).json()`
check_graylog: spelling 2023-06-27 18:24:31 -06:00			`elasticsearch_status = elasticsearch_health['status'].lower()`
			`elasticsearch_active_shards = elasticsearch_health['shards']['active']`
check_graylog: split input and cluster metrics 2023-06-27 18:16:25 -06:00
check_graylog: fix 2023-06-27 18:28:49 -06:00			`indexer_failures = fetch_with_retry(f'{base_url}/api/system/indexer/failures?limit=10&offset=0',`
			`headers=headers, auth=(args.token, 'token'), verify=args.insecure).json()`
check_graylog: split input and cluster metrics 2023-06-27 18:16:25 -06:00
add method to get_with_retry() check_graylog: add more healthchecks and perfdata 2023-06-27 17:56:40 -06:00			`perfdata.update({`
check_graylog: split input and cluster metrics 2023-06-27 18:16:25 -06:00			`'throughput_input_1_sec_rate': {`
			`'value': int(metrics_data['throughput_input_1_sec_rate']),`
			`'min': 0,`
			`},`
			`'throughput_output_1_sec_rate': {`
			`'value': int(metrics_data['throughput_output_1_sec_rate']),`
			`'min': 0,`
			`},`
add method to get_with_retry() check_graylog: add more healthchecks and perfdata 2023-06-27 17:56:40 -06:00			`'entries_uncommitted': {`
			`'value': metrics_data['journal_entries_uncommitted'],`
			`'min': 0,`
			`},`
			`'jvm_memory_used': {`
			`'value': metrics_data['jvm_memory_heap_used'],`
			`'min': 0,`
			`'warn': jvm_mem_usage_warn,`
			`'crit': jvm_mem_usage_crit,`
			`'max': int(metrics_data['jvm_memory_heap_max']),`
			`'unit': 'B',`
			`},`
check_graylog: split input and cluster metrics 2023-06-27 18:16:25 -06:00			`'network_traffic_in_avg': {`
add method to get_with_retry() check_graylog: add more healthchecks and perfdata 2023-06-27 17:56:40 -06:00			`'value': input_traffic_avg,`
			`'min': 0,`
			`'unit': 'B',`
			`},`
			`'to_elasticsearch_24hrs_avg': {`
			`'value': output_traffic_avg,`
			`'min': 0,`
			`'unit': 'B',`
			`},`
check_graylog: spelling 2023-06-27 18:24:31 -06:00			`'elasticsearch_active_shards': {`
			`'value': elasticsearch_active_shards,`
add method to get_with_retry() check_graylog: add more healthchecks and perfdata 2023-06-27 17:56:40 -06:00			`'min': 0`
			`},`
check_graylog: show more data when only cluster mode 2023-06-27 18:22:46 -06:00			`'indexer_failures': {`
add method to get_with_retry() check_graylog: add more healthchecks and perfdata 2023-06-27 17:56:40 -06:00			`'value': indexer_failures['total'],`
minor 2023-06-27 18:25:49 -06:00			`'warn': 1,`
			`'crit': 1,`
add method to get_with_retry() check_graylog: add more healthchecks and perfdata 2023-06-27 17:56:40 -06:00			`'min': 0,`
			`},`
			`})`
add check_greylog.py 2023-06-27 13:13:52 -06:00
add method to get_with_retry() check_graylog: add more healthchecks and perfdata 2023-06-27 17:56:40 -06:00			`# Check for notifications`
			`if args.cluster_metrics:`
check_graylog: option to ignore update notifications 2023-07-25 10:22:55 -06:00			`notifications_query = fetch_with_retry(f'{base_url}/api/system/notifications', headers=headers,`
check_graylog: fix 2023-06-27 18:28:49 -06:00			`auth=(args.token, 'token'), verify=args.insecure).json()`
check_graylog: option to ignore update notifications 2023-07-25 10:22:55 -06:00
			`notifications = []`
			`for notif in notifications_query['notifications']:`
			`if notif['type'] == 'outdated_version' and not args.ignore_update_notif:`
			`notifications.append(notif)`
			`elif notif['type'] != 'outdated_version':`
			`notifications.append(notif)`

			`if len(notifications):`
			`notif = "notifications" if len(notifications) else "notification"`
			`are = "are" if len(notifications) else "is"`
check_graylog: fix notifications 2023-06-27 18:34:00 -06:00			`if args.html:`
check_graylog: option to ignore update notifications 2023-07-25 10:22:55 -06:00			`notif_str = f'<a href="{base_url}/system/overview" target="_blank">There {are} {len(notifications)} {notif}.</a>'`
check_graylog: fix notifications 2023-06-27 18:34:00 -06:00			`else:`
check_graylog: option to ignore update notifications 2023-07-25 10:22:55 -06:00			`notif_str = f'There {are} {len(notifications)} {notif}.'`
add method to get_with_retry() check_graylog: add more healthchecks and perfdata 2023-06-27 17:56:40 -06:00			`else:`
			`notif_str = 'No notifications'`
add check_greylog.py 2023-06-27 13:13:52 -06:00
add method to get_with_retry() check_graylog: add more healthchecks and perfdata 2023-06-27 17:56:40 -06:00			`if indexer_failures['total'] > 0:`
			`indexer_failures_exit = nagios.STATE_CRIT`
			`if args.html:`
			`text_result += f' <a href="{base_url}/system/indices/failures" target="_blank">There are {indexer_failures["total"]} indexer failures!</a>'`
			`else:`
			`text_result += f' There are {indexer_failures["total"]} indexer failures!'`
			`else:`
			`indexer_failures_exit = nagios.STATE_OK`

			`# https://go2docs.graylog.org/5-0/setting_up_graylog/elasticsearch.htm#ClusterStatusExplained`
check_graylog: spelling 2023-06-27 18:24:31 -06:00			`if elasticsearch_status == 'yellow':`
			`elasticsearch_exit_code = nagios.STATE_WARN`
add method to get_with_retry() check_graylog: add more healthchecks and perfdata 2023-06-27 17:56:40 -06:00			`text_result += ' Elasticsearch is condition YELLOW!'`
check_graylog: spelling 2023-06-27 18:24:31 -06:00			`elif elasticsearch_status == 'red':`
			`elasticsearch_exit_code = nagios.STATE_CRIT`
add method to get_with_retry() check_graylog: add more healthchecks and perfdata 2023-06-27 17:56:40 -06:00			`text_result += ' Elasticsearch is condition RED!'`
check_graylog: spelling 2023-06-27 18:24:31 -06:00			`elif elasticsearch_status == 'green':`
			`elasticsearch_exit_code = nagios.STATE_OK`
add method to get_with_retry() check_graylog: add more healthchecks and perfdata 2023-06-27 17:56:40 -06:00			`else:`
check_graylog: spelling 2023-06-27 18:24:31 -06:00			`print_icinga2_check_status(f'unknown Elasticsearch health: {elasticsearch_status}', nagios.STATE_UNKNOWN)`
add method to get_with_retry() check_graylog: add more healthchecks and perfdata 2023-06-27 17:56:40 -06:00			`sys.exit(nagios.STATE_UNKNOWN)`
add check_greylog.py 2023-06-27 13:13:52 -06:00
check_graylog: fix 2023-06-27 18:28:49 -06:00			`jvm_mem_usage_state = get_state(int(metrics_data['jvm_memory_heap_used']), jvm_mem_usage_warn,`
			`jvm_mem_usage_crit, operator='gt')`
add method to get_with_retry() check_graylog: add more healthchecks and perfdata 2023-06-27 17:56:40 -06:00			`if jvm_mem_usage_state != nagios.STATE_OK:`
			`text_result += f' JVM memory usage is high!'`
add check_greylog.py 2023-06-27 13:13:52 -06:00
check_graylog: spelling 2023-06-27 18:24:31 -06:00			`exit_code = max(nagios.STATE_OK, jvm_mem_usage_state, elasticsearch_exit_code, indexer_failures_exit)`
check_greylog: alert notifications, better metric names, better filesizes 2023-06-27 14:32:10 -06:00
check_graylog: option to ignore update notifications 2023-07-25 10:22:55 -06:00			`if len(notifications):`
			`text_result += f' There {are} {len(notifications)} {notif}!'`
add method to get_with_retry() check_graylog: add more healthchecks and perfdata 2023-06-27 17:56:40 -06:00			`if args.crit_notif:`
			`exit_code = nagios.STATE_CRIT # force crit`
check_greylog: alert notifications, better metric names, better filesizes 2023-06-27 14:32:10 -06:00
check_graylog: show more data when only cluster mode 2023-06-27 18:22:46 -06:00			`if args.input:`
			`# show less data`
			`text_result = text_result + f' JVM memory usage: {int((perfdata["jvm_memory_used"]["value"] / metrics_data["jvm_memory_heap_max"]) * 100)}%'`
			`else:`
			`# show more data`
check_graylog: spelling 2023-06-27 18:24:31 -06:00			`text_result = text_result + f' JVM memory usage: {int((perfdata["jvm_memory_used"]["value"] / metrics_data["jvm_memory_heap_max"]) * 100)}%, throughput last 1 second: {human_readable_size(perfdata["throughput_input_1_sec_rate"]["value"])} in - {human_readable_size(perfdata["throughput_output_1_sec_rate"]["value"])} out, Elasticsearch active shards: {perfdata["elasticsearch_active_shards"]["value"]}'`
check_graylog: split input and cluster metrics 2023-06-27 18:16:25 -06:00
			`if args.input:`
check_graylog: update perfdata name 2023-06-27 18:18:06 -06:00			`text_result = text_result + f' {input_name_clean} events/second for last minute: {perfdata[f"{input_name_clean}_incoming_messages_rate_per_sec_1min"]["value"]}, {input_name_clean}_connections: {perfdata[f"{input_name_clean}_connections"]["value"]}, {input_name_clean}_network_in_total: {human_readable_size(perfdata[f"{input_name_clean}_network_in_total"]["value"], decimal_places=0)}'`
add method to get_with_retry() check_graylog: add more healthchecks and perfdata 2023-06-27 17:56:40 -06:00			`exit_code = nagios.STATE_OK`
check_graylog: split input and cluster metrics 2023-06-27 18:16:25 -06:00			`else:`
			`text_result = text_result + '\n' + notif_str`
check_greylog: alert notifications, better metric names, better filesizes 2023-06-27 14:32:10 -06:00
add check_greylog.py 2023-06-27 13:13:52 -06:00			`print_icinga2_check_status(text_result, exit_code, perfdata)`
			`sys.exit(exit_code)`


			`if __name__ == "__main__":`
			`try:`
			`main()`
			`except Exception as e:`
			`print(f'UNKNOWN: exception "{e}"')`
			`print(traceback.format_exc())`
add method to get_with_retry() check_graylog: add more healthchecks and perfdata 2023-06-27 17:56:40 -06:00			`sys.exit(nagios.STATE_UNKNOWN)`