diff --git a/check_graylog.py b/check_graylog.py
index 42ca6a5..0429eeb 100755
--- a/check_graylog.py
+++ b/check_graylog.py
@@ -35,16 +35,18 @@ def main():
parser = argparse.ArgumentParser(description='Check Graylog input health')
parser.add_argument('-u', '--url', required=True, help='The base Graylog URL')
parser.add_argument('-t', '--token', required=True, help='Graylog API token')
- parser.add_argument('-i', '--input', required=True, help='Input ID to check')
+ parser.add_argument('-i', '--input', help='Input ID to check. If unset, only cluster-wide metrics are checked')
- parser.add_argument('--warn-mem', type=int, default=75, help='Percentage of JVM memory used for warm')
+ parser.add_argument('--warn-mem', type=int, default=75, help='Percentage of JVM memory used for warning')
parser.add_argument('--crit-mem', type=int, default=100, help='Percentage of JVM memory used for critical')
parser.add_argument('--insecure', action='store_false', help="Don't verify SSL")
parser.add_argument('--crit-notif', action='store_true', help='Return critical when there are notifications')
parser.add_argument('--html', action='store_true', help='Print HTML')
- parser.add_argument('--cluster-metrics', action='store_true', help='Also gather cluster metrics and check for notifications.')
+ parser.add_argument('--cluster-metrics', action='store_true', help='Also gather cluster metrics and check for notifications')
args = parser.parse_args()
base_url = args.url.strip('/')
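+ # No input ID means there is nothing input-specific to check, so fall back
+ # to cluster-wide metrics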
+ if not args.input:
+ args.cluster_metrics = True
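+ # --insecure uses store_false, so args.insecure defaults to True and becomes
+ # False when the flag is passed; it doubles as the requests verify= value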
if not args.insecure:
requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning)
@@ -54,38 +56,11 @@ def main():
'X-Requested-By': 'XMLHttpRequest',
}
- # Get the basic input metadata
- input_data = transform_inputs(fetch_with_retry(f'{base_url}/api/cluster/inputstates', headers=headers, auth=(args.token, 'token'), verify=args.insecure).json()).get(args.input, {})
-
- # Get it over with
- if bool(input_data) and input_data.get('state') == 'RUNNING':
- input_name = input_data["message_input"]["title"]
- text_result = f'Graylog input "{input_name}" is running.'
- else:
- input_name = args.input
- if args.html:
- text_result = f'Graylog input "{input_name}" is not running!'
- else:
- text_result = f'Graylog input "{input_name}" is not running!'
- print_icinga2_check_status(text_result, nagios.STATE_CRIT)
- sys.exit(nagios.STATE_CRIT)
-
- # If the input is running, continue gathering metrics and other health checks
- input_name_clean = input_name.lower().replace(' ', '_').replace('-', '_')
-
- # Get metrics for the input
- type = input_data['message_input']['type']
+ text_result = ''
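+ # Metrics requested on every run; input-specific metric names are appended
+ # below when an input ID was given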
metrics_json = {
'metrics': [
'org.graylog2.throughput.input.1-sec-rate',
'org.graylog2.throughput.output.1-sec-rate',
- f'{type}.{args.input}.incomingMessages',
- f'{type}.{args.input}.open_connections',
- f'{type}.{args.input}.total_connections',
- f'{type}.{args.input}.written_bytes_1sec',
- f'{type}.{args.input}.written_bytes_total',
- f'{type}.{args.input}.read_bytes_1sec',
- f'{type}.{args.input}.read_bytes_total',
"org.graylog2.journal.append.1-sec-rate",
"org.graylog2.journal.read.1-sec-rate",
"org.graylog2.journal.segments",
@@ -95,6 +70,21 @@ def main():
"jvm.memory.heap.max"
],
}
+
+ if args.input:
+ # Get the basic input metadata
+ input_data = transform_inputs(fetch_with_retry(f'{base_url}/api/cluster/inputstates', headers=headers, auth=(args.token, 'token'), verify=args.insecure).json()).get(args.input, {})
+ # An unknown input ID yields empty metadata; fall back to '' so building
+ # the metric names below can't raise a KeyError. The state check further
+ # down still reports such an input as not running
+ input_type = input_data.get('message_input', {}).get('type', '')
+ metrics_json['metrics'] += [
+ f'{input_type}.{args.input}.incomingMessages',
+ f'{input_type}.{args.input}.open_connections',
+ f'{input_type}.{args.input}.total_connections',
+ f'{input_type}.{args.input}.written_bytes_1sec',
+ f'{input_type}.{args.input}.written_bytes_total',
+ f'{input_type}.{args.input}.read_bytes_1sec',
+ f'{input_type}.{args.input}.read_bytes_total',
+ ]
+
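+ # Fetch all requested metrics in a single call; the response is keyed by
+ # node ID, so the first key below selects that node's metrics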
r = fetch_with_retry(f'{base_url}/api/cluster/metrics/multiple', method='post', headers=headers, auth=(args.token, 'token'),
verify=args.insecure, json=metrics_json).json()
input_metrics = r[list(r.keys())[0]]['metrics']
@@ -102,8 +92,11 @@ def main():
# Format the metrics for later
metrics_data = {}
for metric in input_metrics:
- name = metric['full_name'].replace(type, '').replace('org.graylog2.', '').replace(args.input, '').strip(
- '.').replace('-', '_').replace('.', '_')
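+ # Normalize metric names: strip the input-type and input-ID prefixes in
+ # input mode, then convert the rest to snake_case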
+ if args.input:
+ name = metric['full_name'].replace(input_type, '').replace('org.graylog2.', '').replace(args.input, '')
+ else:
+ name = metric['full_name'].replace('org.graylog2.', '')
+ name = name.strip('.').replace('-', '_').replace('.', '_')
value = None
if 'value' in metric['metric']:
value = metric["metric"]["value"]
@@ -115,62 +108,81 @@ def main():
value = int(value)
metrics_data[name] = value
- jvm_mem_usage_warn = int(metrics_data['jvm_memory_heap_max'] / int(100 / args.warn_mem))
- jvm_mem_usage_crit = int(metrics_data['jvm_memory_heap_max'] / int(100 / args.crit_mem))
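+ # Performance data accumulates here and is emitted with the final status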
+ perfdata = {}
- # Get traffic data for last 24 hrs
- traffic_last_24_hrs = fetch_with_retry(f'{base_url}/api/system/cluster/traffic?daily=false', headers=headers, auth=(args.token, 'token'), verify=args.insecure).json()
- input_traffic_avg = sum([v for k, v in traffic_last_24_hrs['input'].items() if parse_traffic_ts(k)])
- output_traffic_avg = sum([v for k, v in traffic_last_24_hrs['output'].items() if parse_traffic_ts(k)])
+ if args.input:
+ # Verify the input exists and is running before reporting anything else
+ if input_data and input_data.get('state') == 'RUNNING':
+ input_name = input_data["message_input"]["title"]
+ text_result = f'Graylog input "{input_name}" is running.'
+ else:
+ input_name = args.input
+ text_result = f'Graylog input "{input_name}" is not running!'
+ print_icinga2_check_status(text_result, nagios.STATE_CRIT)
+ sys.exit(nagios.STATE_CRIT)
- elastisearch_health = fetch_with_retry(f'{base_url}/api/system/indexer/cluster/health', headers=headers, auth=(args.token, 'token'), verify=args.insecure).json()
- elastisearch_status = elastisearch_health['status'].lower()
- elastisearch_active_shards = elastisearch_health['shards']['active']
+ # If the input is running, continue gathering metrics and other health checks
+ input_name_clean = input_name.lower().replace(' ', '_').replace('-', '_')
- indexer_failures = fetch_with_retry(f'{base_url}/api/system/indexer/failures?limit=10&offset=0', headers=headers, auth=(args.token, 'token'), verify=args.insecure).json()
-
- # Some metric names are changed for better readability
- perfdata = {
- f'{input_name_clean}_throughput_input_1_sec_rate': {
- 'value': int(metrics_data['throughput_input_1_sec_rate']),
- 'min': 0,
- },
- f'{input_name_clean}_throughput_output_1_sec_rate': {
- 'value': int(metrics_data['throughput_output_1_sec_rate']),
- 'min': 0,
- },
- f'{input_name_clean}_incoming_messages_rate_per_sec_one_minute': {
- 'value': metrics_data['incomingMessages_one_minute'],
- 'min': 0,
- },
- f'{input_name_clean}_connections': {
- 'value': metrics_data['open_connections'],
- 'min': 0,
- },
- f'{input_name_clean}_network_out_total_1sec': {
- 'value': metrics_data['written_bytes_1sec'],
- 'min': 0,
- 'unit': 'B',
- },
- f'{input_name_clean}_network_out_total_total': {
- 'value': metrics_data['written_bytes_total'],
- 'min': 0,
- 'unit': 'B',
- },
- f'{input_name_clean}_network_in_1sec': {
- 'value': metrics_data['read_bytes_1sec'],
- 'min': 0,
- 'unit': 'B',
- },
- f'{input_name_clean}_network_in_total': {
- 'value': metrics_data['read_bytes_total'],
- 'min': 0,
- 'unit': 'B',
- }
- }
+ # Some metric names are changed for better readability
+ perfdata.update({
+ f'{input_name_clean}_incoming_messages_rate_per_sec_one_minute': {
+ 'value': metrics_data['incomingMessages_one_minute'],
+ 'min': 0,
+ },
+ f'{input_name_clean}_connections': {
+ 'value': metrics_data['open_connections'],
+ 'min': 0,
+ },
+ f'{input_name_clean}_network_out_1sec': {
+ 'value': metrics_data['written_bytes_1sec'],
+ 'min': 0,
+ 'unit': 'B',
+ },
+ f'{input_name_clean}_network_out_total': {
+ 'value': metrics_data['written_bytes_total'],
+ 'min': 0,
+ 'unit': 'B',
+ },
+ f'{input_name_clean}_network_in_1sec': {
+ 'value': metrics_data['read_bytes_1sec'],
+ 'min': 0,
+ 'unit': 'B',
+ },
+ f'{input_name_clean}_network_in_total': {
+ 'value': metrics_data['read_bytes_total'],
+ 'min': 0,
+ 'unit': 'B',
+ }
+ })
if args.cluster_metrics:
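+ # Express the memory thresholds as byte values relative to the heap maximum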
+ jvm_mem_usage_warn = int(metrics_data['jvm_memory_heap_max'] * args.warn_mem / 100)
+ jvm_mem_usage_crit = int(metrics_data['jvm_memory_heap_max'] * args.crit_mem / 100)
+
+ # Get traffic data for last 24 hrs
+ traffic_last_24_hrs = fetch_with_retry(f'{base_url}/api/system/cluster/traffic?daily=false', headers=headers, auth=(args.token, 'token'), verify=args.insecure).json()
+ input_traffic_avg = sum(v for k, v in traffic_last_24_hrs['input'].items() if parse_traffic_ts(k))
+ output_traffic_avg = sum(v for k, v in traffic_last_24_hrs['output'].items() if parse_traffic_ts(k))
+
+ elastisearch_health = fetch_with_retry(f'{base_url}/api/system/indexer/cluster/health', headers=headers, auth=(args.token, 'token'), verify=args.insecure).json()
+ elastisearch_status = elastisearch_health['status'].lower()
+ elastisearch_active_shards = elastisearch_health['shards']['active']
+
+ indexer_failures = fetch_with_retry(f'{base_url}/api/system/indexer/failures?limit=10&offset=0', headers=headers, auth=(args.token, 'token'), verify=args.insecure).json()
+
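+ # Cluster-wide performance data: throughput, journal, JVM memory, and traffic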
perfdata.update({
+ 'throughput_input_1_sec_rate': {
+ 'value': int(metrics_data['throughput_input_1_sec_rate']),
+ 'min': 0,
+ },
+ 'throughput_output_1_sec_rate': {
+ 'value': int(metrics_data['throughput_output_1_sec_rate']),
+ 'min': 0,
+ },
'entries_uncommitted': {
'value': metrics_data['journal_entries_uncommitted'],
'min': 0,
@@ -183,7 +195,7 @@ def main():
'max': int(metrics_data['jvm_memory_heap_max']),
'unit': 'B',
},
- 'from_network_traffic_avg': {
+ 'network_traffic_in_avg': {
'value': input_traffic_avg,
'min': 0,
'unit': 'B',
@@ -246,10 +258,13 @@ def main():
if args.crit_notif:
exit_code = nagios.STATE_CRIT # force crit
- text_result = text_result + f' JVM memory usage: {int((perfdata["jvm_memory_used"]["value"] / metrics_data["jvm_memory_heap_max"]) * 100)}%, input incoming rate (events/second for last minute): {perfdata[f"{input_name_clean}_incoming_messages_rate_per_sec_one_minute"]["value"]}, input connections: {perfdata[f"{input_name_clean}_connections"]["value"]}, input total network in: {human_readable_size(perfdata[f"{input_name_clean}_network_in_total"]["value"], decimal_places=0)}' + '\n' + notif_str
- else:
- text_result = text_result + f' {input_name_clean}_incoming_messages_rate_per_sec_one_minute (events/second for last minute): {perfdata[f"{input_name_clean}_incoming_messages_rate_per_sec_one_minute"]["value"]}, {input_name_clean}_connections: {perfdata[f"{input_name_clean}_connections"]["value"]}, {input_name_clean}_network_in_total: {human_readable_size(perfdata[f"{input_name_clean}_network_in_total"]["value"], decimal_places=0)}' + '\n'
+ if args.cluster_metrics:
+ # jvm_memory_used perfdata is only collected alongside cluster metrics
+ text_result = text_result + f' JVM memory usage: {int((perfdata["jvm_memory_used"]["value"] / metrics_data["jvm_memory_heap_max"]) * 100)}%'
+
- exit_code = nagios.STATE_OK
+ if args.input:
+ text_result = text_result + f' {input_name_clean} events/second for last minute: {perfdata[f"{input_name_clean}_incoming_messages_rate_per_sec_one_minute"]["value"]}, {input_name_clean}_connections: {perfdata[f"{input_name_clean}_connections"]["value"]}, {input_name_clean}_network_in_total: {human_readable_size(perfdata[f"{input_name_clean}_network_in_total"]["value"], decimal_places=0)}'
+ if not args.crit_notif:
+ # Don't downgrade an exit code that --crit-notif forced to critical
+ exit_code = nagios.STATE_OK
+ else:
+ text_result = text_result + '\n' + notif_str
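+ # Emit the Icinga2-formatted status with perfdata and exit accordingly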
print_icinga2_check_status(text_result, exit_code, perfdata)
sys.exit(exit_code)