icinga2-checks/check_graylog.py

196 lines
7.7 KiB
Python
Raw Normal View History

2023-06-27 13:13:52 -06:00
#!/usr/bin/env python3
import argparse
import sys
import traceback
import requests
from urllib3.exceptions import InsecureRequestWarning
import checker.nagios
from checker import print_icinga2_check_status
from checker.linuxfabric.base import get_state
from checker.units import human_readable_size
2023-06-27 13:13:52 -06:00
def _percent_of(total, percent):
    """Return ``percent`` percent of ``total``, truncated to an integer.

    Multiplying before dividing keeps the percentage exact. Truncating the
    divisor instead (``total / int(100 / percent)``) collapses e.g. 75% to
    100% and divides by zero for any percentage above 100 or equal to 0.
    """
    return int(total * percent / 100)


def _query_graylog(method, url, token, verify, headers, payload=None, show_body=False):
    """Send one request to the Graylog API and return the response.

    Any ``requests`` failure (connection error, HTTP error status, ...) prints
    a CRITICAL line and terminates the process with exit code 2.

    :param method: HTTP method name, e.g. ``'get'`` or ``'post'``.
    :param url: Full URL of the API endpoint.
    :param token: API token, sent as the basic-auth user with password ``'token'``.
    :param verify: Passed to ``requests`` as ``verify`` (TLS verification).
    :param headers: Dict of HTTP headers to send.
    :param payload: Optional JSON-serialisable request body.
    :param show_body: When true, append the server's response body to the
        error message if the server actually answered.
    """
    try:
        response = requests.request(method, url, headers=headers, auth=(token, 'token'),
                                    verify=verify, json=payload)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        message = f'CRITICAL - Unable to query Graylog API: {e}'
        # The exception only carries a response when the server replied;
        # on connection errors there is no body to show.
        if show_body and e.response is not None:
            message += f'\n{e.response.text}'
        print(message)
        sys.exit(2)
    return response


def _flatten_metrics(raw_metrics, input_type, input_id):
    """Turn the API's metric list into a ``{short_name: int_value}`` dict.

    The short name is the metric's ``full_name`` with the input type, the
    ``org.graylog2.`` prefix and the input ID removed, and ``-``/``.`` turned
    into ``_``. Rate metrics use their one-minute rate and get a
    ``_one_minute`` suffix.
    """
    flattened = {}
    for metric in raw_metrics:
        name = (metric['full_name']
                .replace(input_type, '')
                .replace('org.graylog2.', '')
                .replace(input_id, '')
                .strip('.')
                .replace('-', '_')
                .replace('.', '_'))
        body = metric['metric']
        if 'value' in body:
            value = body['value']
        elif 'count' in body:
            value = body['count']
        elif 'rate' in body:
            value = body['rate']['one_minute']
            name = f'{name}_one_minute'
        else:
            # Unrecognised metric shape: skip it rather than crash on
            # int(None). A metric that is actually needed later will still
            # surface as a KeyError.
            continue
        flattened[name] = int(value)
    return flattened


def main():
    """Check the health of one Graylog input and report it to Icinga2.

    Queries the input state, a set of throughput/journal/JVM metrics and the
    system notifications from the Graylog API, then prints the check result
    and exits with the matching Nagios state:

    * exit code 2 if any API request fails,
    * ``STATE_CRIT`` if the input is not in state ``RUNNING``,
    * otherwise the JVM heap usage state, raised to ``STATE_CRIT`` when
      ``--crit-notif`` is given and notifications exist.
    """
    parser = argparse.ArgumentParser(description='Check Graylog input health')
    parser.add_argument('-u', '--url', required=True, help='Graylog API URL')
    parser.add_argument('-t', '--token', required=True, help='Graylog API token')
    parser.add_argument('-i', '--input', required=True, help='Input ID to check')
    parser.add_argument('--warn-mem', type=int, default=75, help='Percentage of JVM memory used for warning')
    parser.add_argument('--crit-mem', type=int, default=100, help='Percentage of JVM memory used for critical')
    # store_false: the stored value is True (verify TLS) unless --insecure is passed.
    parser.add_argument('--insecure', dest='verify_tls', action='store_false', help="Don't verify SSL")
    parser.add_argument('--crit-notif', action='store_true', help='Return critical when there are notifications')
    args = parser.parse_args()

    if not args.verify_tls:
        requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning)

    headers = {
        'Accept': 'application/json',
        'X-Requested-By': 'XMLHttpRequest',
    }

    # 1. State of the input itself.
    response = _query_graylog('get', f'{args.url}/system/inputstates/{args.input}',
                              args.token, args.verify_tls, headers)
    input_data = response.json()
    input_type = input_data['message_input']['type']

    # 2. Metrics: global throughput/journal/JVM plus the per-input counters.
    metrics_json = {
        'metrics': [
            'org.graylog2.throughput.input.1-sec-rate',
            'org.graylog2.throughput.output.1-sec-rate',
            f'{input_type}.{args.input}.incomingMessages',
            f'{input_type}.{args.input}.open_connections',
            f'{input_type}.{args.input}.total_connections',
            f'{input_type}.{args.input}.written_bytes_1sec',
            f'{input_type}.{args.input}.written_bytes_total',
            f'{input_type}.{args.input}.read_bytes_1sec',
            f'{input_type}.{args.input}.read_bytes_total',
            "org.graylog2.journal.append.1-sec-rate",
            "org.graylog2.journal.read.1-sec-rate",
            "org.graylog2.journal.segments",
            "org.graylog2.journal.entries-uncommitted",
            "jvm.memory.heap.used",
            "jvm.memory.heap.committed",
            "jvm.memory.heap.max",
        ],
    }
    response = _query_graylog('post', f'{args.url}/cluster/metrics/multiple',
                              args.token, args.verify_tls, headers,
                              payload=metrics_json, show_body=True)
    cluster_metrics = response.json()
    if not cluster_metrics:
        print('UNKNOWN - Graylog returned no cluster metrics')
        sys.exit(checker.nagios.STATE_UNKNOWN)
    # Only the first entry of the reply is evaluated.
    # NOTE(review): presumably the reply is keyed by node ID - confirm.
    input_metrics = next(iter(cluster_metrics.values()))['metrics']

    metrics_data = _flatten_metrics(input_metrics, input_type, args.input)

    heap_max = metrics_data['jvm_memory_heap_max']
    jvm_mem_usage_warn = _percent_of(heap_max, args.warn_mem)
    jvm_mem_usage_crit = _percent_of(heap_max, args.crit_mem)

    # Some metric names are changed for better readability
    perfdata = {
        'throughput_input_1_sec_rate': {
            'value': int(metrics_data['throughput_input_1_sec_rate']),
            'min': 0,
        },
        'throughput_output_1_sec_rate': {
            'value': int(metrics_data['throughput_output_1_sec_rate']),
            'min': 0,
        },
        'incoming_messages_rate_per_sec_one_minute': {
            'value': metrics_data['incomingMessages_one_minute'],
            'min': 0,
        },
        'connections': {
            'value': metrics_data['open_connections'],
            'min': 0,
        },
        'network_out_total_1sec': {
            'value': metrics_data['written_bytes_1sec'],
            'min': 0,
            'unit': 'B',
        },
        'network_out_total_total': {
            'value': metrics_data['written_bytes_total'],
            'min': 0,
            'unit': 'B',
        },
        'network_in_1sec': {
            'value': metrics_data['read_bytes_1sec'],
            'min': 0,
            'unit': 'B',
        },
        'network_in_total': {
            'value': metrics_data['read_bytes_total'],
            'min': 0,
            'unit': 'B',
        },
        'entries_uncommitted': {
            'value': metrics_data['journal_entries_uncommitted'],
            'min': 0,
        },
        'jvm_memory_used': {
            'value': metrics_data['jvm_memory_heap_used'],
            'min': 0,
            'warn': jvm_mem_usage_warn,
            'crit': jvm_mem_usage_crit,
            'max': int(heap_max),
            'unit': 'B',
        },
    }

    jvm_mem_usage_state = get_state(int(metrics_data['jvm_memory_heap_used']), jvm_mem_usage_warn,
                                    jvm_mem_usage_crit, operator='gt')

    # 3. System notifications.
    response = _query_graylog('get', f'{args.url}/system/notifications',
                              args.token, args.verify_tls, headers)
    notifications = response.json()
    has_notifications = notifications['total'] > 0
    if has_notifications:
        notif_lines = ['Notifications:']
        notif_lines.extend(f'{notification["type"]}: {notification["description"]}'
                           for notification in notifications['notifications'])
        notif_str = '\n'.join(notif_lines)
    else:
        notif_str = 'No notifications'

    input_title = input_data["message_input"]["title"]
    if input_data['state'] != 'RUNNING':
        # A stopped input is always critical; metrics are not reported.
        print(f'Input "{input_title}" is not running!')
        print(notif_str)
        sys.exit(checker.nagios.STATE_CRIT)
    text_result = f'Input "{input_title}" is running.'

    if jvm_mem_usage_state != checker.nagios.STATE_OK:
        text_result += ' JVM memory usage is high!'
    exit_code = max(checker.nagios.STATE_OK, jvm_mem_usage_state)

    if has_notifications:
        text_result += ' There are notifications!'
        if args.crit_notif:
            exit_code = checker.nagios.STATE_CRIT

    jvm_percent = int((perfdata["jvm_memory_used"]["value"] / heap_max) * 100)
    network_in = human_readable_size(perfdata["network_in_total"]["value"], decimal_places=0)
    text_result += (
        f' JVM memory usage: {jvm_percent}%,'
        f' incoming rate (events/second for last minute):'
        f' {perfdata["incoming_messages_rate_per_sec_one_minute"]["value"]},'
        f' connections: {perfdata["connections"]["value"]},'
        f' total network in: {network_in}'
        f'\n{notif_str}'
    )

    print_icinga2_check_status(text_result, exit_code, perfdata)
    sys.exit(exit_code)
if __name__ == "__main__":
    # Script entry point. sys.exit() inside main() raises SystemExit, which is
    # not an Exception subclass, so regular check exits pass straight through;
    # only unexpected errors are reported as UNKNOWN with their traceback.
    try:
        main()
    except Exception as err:
        trace_text = traceback.format_exc()
        print(f'UNKNOWN: exception "{err}"')
        print(trace_text)
        sys.exit(checker.nagios.STATE_UNKNOWN)