#!/usr/bin/env python3
import argparse
import sys
import traceback
from datetime import datetime, timezone

import requests
from urllib3.exceptions import InsecureRequestWarning

from checker import nagios
from checker import print_icinga2_check_status
from checker.http import fetch_with_retry
from checker.linuxfabric.base import get_state
from checker.units import human_readable_size
def transform_inputs(old_dict):
    """Flatten a node -> [input state, ...] mapping into input ID -> input state.

    When the same input ID appears under several nodes, the last one seen wins,
    matching plain dict assignment order.
    """
    return {entry['id']: entry for entries in old_dict.values() for entry in entries}
def parse_traffic_ts(ts: str) -> bool:
    """Return True when the timestamp string *ts* is less than 24 hours old.

    *ts* is a Graylog traffic-bucket timestamp such as
    '2023-05-01T12:00:00.000Z'; the trailing 'Z' marks it as UTC.

    Bug fixed: the original compared the parsed UTC timestamp against naive
    *local* time (datetime.now()), which shifted the 24-hour window by the
    host's UTC offset. Both sides are now timezone-aware UTC.
    """
    parsed = datetime.strptime(ts, '%Y-%m-%dT%H:%M:%S.%fZ').replace(tzinfo=timezone.utc)
    age = datetime.now(timezone.utc) - parsed
    return age.total_seconds() < 24 * 60 * 60  # less than 24 hrs ago?
def main():
    """Query the Graylog API and emit an Icinga2/Nagios check result.

    Checks a single input's state (when --input is given) and/or cluster-wide
    health: JVM heap usage, throughput, journal metrics, Elasticsearch status,
    indexer failures and notifications. Prints one status line plus perfdata
    via print_icinga2_check_status() and exits with the Nagios state code.

    NOTE(review): this file's indentation was damaged in transit; the block
    structure below is a careful reconstruction — confirm against history.
    """
    parser = argparse.ArgumentParser(description='Check Graylog input health')
    parser.add_argument('-u', '--url', required=True, help='The base Graylog URL')
    parser.add_argument('-t', '--token', required=True, help='Graylog API token')
    parser.add_argument('-i', '--input', help='Input ID to check. If unset, will check cluster metrics')
    parser.add_argument('--warn-mem', type=int, default=75, help='Percentage of JVM memory used for warm')
    parser.add_argument('--crit-mem', type=int, default=100, help='Percentage of JVM memory used for critical')
    # store_false: args.insecure defaults to True and is passed directly as
    # `verify=` to the HTTP helper, so passing --insecure disables TLS verify.
    parser.add_argument('--insecure', action='store_false', help="Don't verify SSL")
    parser.add_argument('--crit-notif', action='store_true', help='Return critical when there are notifications')
    parser.add_argument('--ignore-update-notif', action='store_true', help='Ignore any update notifications')
    parser.add_argument('--html', action='store_true', help='Print HTML')
    parser.add_argument('--cluster-metrics', action='store_true', help='Also gather cluster metrics and check for notifications')
    args = parser.parse_args()

    base_url = args.url.strip('/')

    # With no specific input to check, cluster metrics are the only useful work.
    if not args.input:
        args.cluster_metrics = True

    if not args.insecure:
        requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning)

    headers = {
        'Accept': 'application/json',
        'X-Requested-By': 'XMLHttpRequest',
    }

    text_result = ''

    # Cluster-level metric names fetched on every run; input-specific names
    # are appended below when --input is set.
    metrics_json = {
        'metrics': [
            'org.graylog2.throughput.input.1-sec-rate',
            'org.graylog2.throughput.output.1-sec-rate',
            "org.graylog2.journal.append.1-sec-rate",
            "org.graylog2.journal.read.1-sec-rate",
            "org.graylog2.journal.segments",
            "org.graylog2.journal.entries-uncommitted",
            "jvm.memory.heap.used",
            "jvm.memory.heap.committed",
            "jvm.memory.heap.max"
        ],
    }

    if args.input:
        # Look up this input's state across the cluster (keyed by input ID).
        input_data = transform_inputs(
            fetch_with_retry(f'{base_url}/api/cluster/inputstates', headers=headers, auth=(args.token, 'token'),
                             verify=args.insecure).json()).get(args.input, {})

        # Get it over with
        if bool(input_data) and input_data.get('state') == 'RUNNING':
            input_name = input_data["message_input"]["title"]
            text_result = f'Graylog input "{input_name}" is running.'
        else:
            # Input unknown or not RUNNING: report CRITICAL and stop here.
            input_name = args.input
            if args.html:
                text_result = f'Graylog input <a href="{base_url}/system/inputs" target="_blank">"{input_name}" is not running!</a>'
            else:
                text_result = f'Graylog input "{input_name}" is not running!'
            print_icinga2_check_status(text_result, nagios.STATE_CRIT)
            sys.exit(nagios.STATE_CRIT)

        # If the input is running, continue gathering metrics and other health checks
        input_name_clean = input_name.lower().replace(' ', '_').replace('-', '_')
        # NOTE(review): `type` shadows the builtin; left unchanged here.
        type = input_data['message_input']['type']
        metrics_json['metrics'] = metrics_json['metrics'] + [
            f'{type}.{args.input}.incomingMessages',
            f'{type}.{args.input}.open_connections',
            f'{type}.{args.input}.total_connections',
            f'{type}.{args.input}.written_bytes_1sec',
            f'{type}.{args.input}.written_bytes_total',
            f'{type}.{args.input}.read_bytes_1sec',
            f'{type}.{args.input}.read_bytes_total',
        ]

    r = fetch_with_retry(f'{base_url}/api/cluster/metrics/multiple', method='post', headers=headers,
                         auth=(args.token, 'token'),
                         verify=args.insecure, json=metrics_json).json()
    # Response is keyed by node ID; use the first node's metric list.
    input_metrics = r[list(r.keys())[0]]['metrics']

    # Format the metrics for later
    metrics_data = {}
    for metric in input_metrics:
        # Strip prefixes and normalize to snake_case keys, e.g.
        # 'org.graylog2.journal.entries-uncommitted' -> 'journal_entries_uncommitted'.
        if args.input:
            name = metric['full_name'].replace(type, '').replace('org.graylog2.', '').replace(args.input, '')
        else:
            name = metric['full_name'].replace('org.graylog2.', '')
        name = name.strip('.').replace('-', '_').replace('.', '_')
        value = None
        if 'value' in metric['metric']:
            value = metric["metric"]["value"]
        elif 'count' in metric['metric']:
            value = metric["metric"]["count"]
        elif 'rate' in metric['metric']:
            value = metric["metric"]["rate"]["one_minute"]
            name = f'{name}_one_minute'
        # NOTE(review): raises TypeError if none of the branches above matched
        # (value stays None) — presumably every metric has one of those keys.
        value = int(value)
        metrics_data[name] = value

    perfdata = {}

    if args.input:
        # Some metric names are changed for better readability
        perfdata.update({
            f'{input_name_clean}_incoming_messages_rate_per_sec_1min': {
                'value': metrics_data['incomingMessages_one_minute'],
                'min': 0,
            },
            f'{input_name_clean}_connections': {
                'value': metrics_data['open_connections'],
                'min': 0,
            },
            f'{input_name_clean}_network_out_total_1sec': {
                'value': metrics_data['written_bytes_1sec'],
                'min': 0,
                'unit': 'B',
            },
            f'{input_name_clean}_network_out_total_total': {
                'value': metrics_data['written_bytes_total'],
                'min': 0,
                'unit': 'B',
            },
            f'{input_name_clean}_network_in_1sec': {
                'value': metrics_data['read_bytes_1sec'],
                'min': 0,
                'unit': 'B',
            },
            f'{input_name_clean}_network_in_total': {
                'value': metrics_data['read_bytes_total'],
                'min': 0,
                'unit': 'B',
            }
        })

    if args.cluster_metrics:
        # NOTE(review): integer truncation makes these thresholds wrong for
        # most percentages — with the default warn of 75, int(100 / 75) == 1,
        # so the warn threshold equals 100% of heap. Presumably intended:
        # heap_max * args.warn_mem // 100. Confirm before changing behavior.
        jvm_mem_usage_warn = int(metrics_data['jvm_memory_heap_max'] / int(100 / args.warn_mem))
        jvm_mem_usage_crit = int(metrics_data['jvm_memory_heap_max'] / int(100 / args.crit_mem))

        # Get traffic data for last 24 hrs
        traffic_last_24_hrs = fetch_with_retry(f'{base_url}/api/system/cluster/traffic?daily=false', headers=headers,
                                               auth=(args.token, 'token'), verify=args.insecure).json()
        # Sum only the buckets whose timestamp falls within the last 24 hours.
        input_traffic_avg = sum([v for k, v in traffic_last_24_hrs['input'].items() if parse_traffic_ts(k)])
        output_traffic_avg = sum([v for k, v in traffic_last_24_hrs['output'].items() if parse_traffic_ts(k)])

        elasticsearch_health = fetch_with_retry(f'{base_url}/api/system/indexer/cluster/health', headers=headers,
                                                auth=(args.token, 'token'), verify=args.insecure).json()
        elasticsearch_status = elasticsearch_health['status'].lower()
        elasticsearch_active_shards = elasticsearch_health['shards']['active']

        indexer_failures = fetch_with_retry(f'{base_url}/api/system/indexer/failures?limit=10&offset=0',
                                            headers=headers, auth=(args.token, 'token'), verify=args.insecure).json()

        perfdata.update({
            'throughput_input_1_sec_rate': {
                'value': int(metrics_data['throughput_input_1_sec_rate']),
                'min': 0,
            },
            'throughput_output_1_sec_rate': {
                'value': int(metrics_data['throughput_output_1_sec_rate']),
                'min': 0,
            },
            'entries_uncommitted': {
                'value': metrics_data['journal_entries_uncommitted'],
                'min': 0,
            },
            'jvm_memory_used': {
                'value': metrics_data['jvm_memory_heap_used'],
                'min': 0,
                'warn': jvm_mem_usage_warn,
                'crit': jvm_mem_usage_crit,
                'max': int(metrics_data['jvm_memory_heap_max']),
                'unit': 'B',
            },
            'network_traffic_in_avg': {
                'value': input_traffic_avg,
                'min': 0,
                'unit': 'B',
            },
            'to_elasticsearch_24hrs_avg': {
                'value': output_traffic_avg,
                'min': 0,
                'unit': 'B',
            },
            'elasticsearch_active_shards': {
                'value': elasticsearch_active_shards,
                'min': 0
            },
            'indexer_failures': {
                'value': indexer_failures['total'],
                'warn': 1,
                'crit': 1,
                'min': 0,
            },
        })

    # Check for notifications
    if args.cluster_metrics:
        notifications_query = fetch_with_retry(f'{base_url}/api/system/notifications', headers=headers,
                                               auth=(args.token, 'token'), verify=args.insecure).json()
        notifications = []
        for notif in notifications_query['notifications']:
            # Update ('outdated_version') notifications are included unless
            # --ignore-update-notif was passed; everything else always counts.
            if notif['type'] == 'outdated_version' and not args.ignore_update_notif:
                notifications.append(notif)
            elif notif['type'] != 'outdated_version':
                notifications.append(notif)

        if len(notifications):
            # NOTE(review): any non-empty list is truthy, so these ternaries
            # always pick the plural forms ("are 1 notifications"); the
            # condition was presumably meant to be len(notifications) > 1.
            notif = "notifications" if len(notifications) else "notification"
            are = "are" if len(notifications) else "is"
            if args.html:
                notif_str = f'<a href="{base_url}/system/overview" target="_blank">There {are} {len(notifications)} {notif}.</a>'
            else:
                notif_str = f'There {are} {len(notifications)} {notif}.'
        else:
            notif_str = 'No notifications'

        if indexer_failures['total'] > 0:
            indexer_failures_exit = nagios.STATE_CRIT
            if args.html:
                text_result += f' <a href="{base_url}/system/indices/failures" target="_blank">There are {indexer_failures["total"]} indexer failures!</a>'
            else:
                text_result += f' There are {indexer_failures["total"]} indexer failures!'
        else:
            indexer_failures_exit = nagios.STATE_OK

        # https://go2docs.graylog.org/5-0/setting_up_graylog/elasticsearch.htm#ClusterStatusExplained
        if elasticsearch_status == 'yellow':
            elasticsearch_exit_code = nagios.STATE_WARN
            text_result += ' Elasticsearch is condition YELLOW!'
        elif elasticsearch_status == 'red':
            elasticsearch_exit_code = nagios.STATE_CRIT
            text_result += ' Elasticsearch is condition RED!'
        elif elasticsearch_status == 'green':
            elasticsearch_exit_code = nagios.STATE_OK
        else:
            # Unrecognized health string: bail out as UNKNOWN.
            print_icinga2_check_status(f'unknown Elasticsearch health: {elasticsearch_status}', nagios.STATE_UNKNOWN)
            sys.exit(nagios.STATE_UNKNOWN)

        jvm_mem_usage_state = get_state(int(metrics_data['jvm_memory_heap_used']), jvm_mem_usage_warn,
                                        jvm_mem_usage_crit, operator='gt')
        if jvm_mem_usage_state != nagios.STATE_OK:
            text_result += f' JVM memory usage is high!'

        # Worst individual state wins (OK < WARN < CRIT numerically).
        exit_code = max(nagios.STATE_OK, jvm_mem_usage_state, elasticsearch_exit_code, indexer_failures_exit)

        if len(notifications):
            text_result += f' There {are} {len(notifications)} {notif}!'
            if args.crit_notif:
                exit_code = nagios.STATE_CRIT  # force crit

        if args.input:
            # show less data
            text_result = text_result + f' JVM memory usage: {int((perfdata["jvm_memory_used"]["value"] / metrics_data["jvm_memory_heap_max"]) * 100)}%'
        else:
            # show more data
            text_result = text_result + f' JVM memory usage: {int((perfdata["jvm_memory_used"]["value"] / metrics_data["jvm_memory_heap_max"]) * 100)}%, throughput last 1 second: {human_readable_size(perfdata["throughput_input_1_sec_rate"]["value"])} in - {human_readable_size(perfdata["throughput_output_1_sec_rate"]["value"])} out, Elasticsearch active shards: {perfdata["elasticsearch_active_shards"]["value"]}'

    if args.input:
        text_result = text_result + f' {input_name_clean} events/second for last minute: {perfdata[f"{input_name_clean}_incoming_messages_rate_per_sec_1min"]["value"]}, {input_name_clean}_connections: {perfdata[f"{input_name_clean}_connections"]["value"]}, {input_name_clean}_network_in_total: {human_readable_size(perfdata[f"{input_name_clean}_network_in_total"]["value"], decimal_places=0)}'
        # NOTE(review): this unconditionally reports OK for input checks, even
        # when the cluster-metrics pass above computed a worse exit_code —
        # confirm that input checks are meant to ignore cluster state.
        exit_code = nagios.STATE_OK
    else:
        text_result = text_result + '\n' + notif_str

    print_icinga2_check_status(text_result, exit_code, perfdata)
    sys.exit(exit_code)
if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        # Surface any unhandled exception as UNKNOWN (with traceback printed
        # for the check output) so Icinga records a check failure rather than
        # an unexplained non-Nagios exit code.
        print(f'UNKNOWN: exception "{e}"')
        print(traceback.format_exc())
        sys.exit(nagios.STATE_UNKNOWN)