2023-06-27 13:13:52 -06:00
#!/usr/bin/env python3
import argparse
import sys
import traceback
2023-06-27 17:56:40 -06:00
from datetime import datetime
2023-06-27 13:13:52 -06:00
import requests
from urllib3 . exceptions import InsecureRequestWarning
2023-06-27 17:56:40 -06:00
from checker import nagios
2023-06-27 13:13:52 -06:00
from checker import print_icinga2_check_status
2023-06-27 17:56:40 -06:00
from checker . http import fetch_with_retry
2023-06-27 13:13:52 -06:00
from checker . linuxfabric . base import get_state
2023-06-27 14:32:10 -06:00
from checker . units import human_readable_size
2023-06-27 13:13:52 -06:00
2023-06-27 17:56:40 -06:00
def transform_inputs(old_dict):
    """Flatten a mapping of item lists into a single mapping keyed by item ID.

    Every value of ``old_dict`` is iterated, and each item found is stored
    under its own ``'id'`` entry. The outer keys are discarded. If the same
    ID appears more than once, the item seen last wins.

    NOTE(review): presumably the outer keys are Graylog node IDs and the
    items are input-state records - confirm against the API response.
    """
    return {
        entry['id']: entry
        for entries in old_dict.values()
        for entry in entries
    }
def parse_traffic_ts(ts: str) -> bool:
    """Return True when ``ts`` is less than 24 hours in the past.

    ``ts`` must look like ``2023-06-27T13:00:00.000Z``. The trailing ``Z``
    marks the value as UTC, so it is compared against the current UTC time.
    Comparing against the naive local clock would shift the 24-hour window
    by the host's UTC offset.

    Timestamps in the future also yield True, because their age is negative.

    Raises:
        ValueError: if ``ts`` does not match the expected format.
    """
    # Imported here so the function does not rely on the module-level import
    # list, which only brings in ``datetime``.
    from datetime import timezone

    parsed = datetime.strptime(ts, '%Y-%m-%dT%H:%M:%S.%fZ').replace(tzinfo=timezone.utc)
    age = datetime.now(timezone.utc) - parsed
    return age.total_seconds() < 24 * 60 * 60  # less than 24 hrs ago?
2023-06-27 13:13:52 -06:00
def main():
    """Check a Graylog input and/or the Graylog cluster, then exit Nagios-style.

    Behaviour is driven by the command-line options defined below:

    * With ``--input`` the given input must be in state ``RUNNING``; otherwise
      the check reports CRITICAL and exits immediately. Per-input metrics are
      added to the perfdata.
    * With ``--cluster-metrics`` (forced on when no input is given) the check
      also queries traffic, indexer health, indexer failures and system
      notifications. The exit code is the worst of the JVM memory state, the
      Elasticsearch health and the indexer-failure state. ``--crit-notif``
      forces CRITICAL when notifications exist.

    The function does not return normally: it always ends in ``sys.exit``
    with the computed status, or propagates an exception to the caller.
    """
    parser = argparse.ArgumentParser(description='Check Graylog input health')
    parser.add_argument('-u', '--url', required=True, help='The base Graylog URL')
    parser.add_argument('-t', '--token', required=True, help='Graylog API token')
    parser.add_argument('-i', '--input', help='Input ID to check. If unset, will check cluster metrics')
    parser.add_argument('--warn-mem', type=int, default=75, help='Percentage of JVM memory used for warn')
    parser.add_argument('--crit-mem', type=int, default=100, help='Percentage of JVM memory used for critical')
    # store_false on purpose: args.insecure is True by default and is passed
    # straight through as the `verify` flag of every HTTP request.
    parser.add_argument('--insecure', action='store_false', help="Don't verify SSL")
    parser.add_argument('--crit-notif', action='store_true', help='Return critical when there are notifications')
    parser.add_argument('--ignore-update-notif', action='store_true', help='Ignore any update notifications')
    parser.add_argument('--html', action='store_true', help='Print HTML')
    parser.add_argument('--cluster-metrics', action='store_true',
                        help='Also gather cluster metrics and check for notifications')
    args = parser.parse_args()

    base_url = args.url.strip('/')

    if not args.input:
        # Without an input the cluster is the only thing left to check.
        args.cluster_metrics = True

    if not args.insecure:
        requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning)

    # Keyword arguments shared by every API request.
    request_kwargs = {
        'headers': {
            'Accept': 'application/json',
            'X-Requested-By': 'XMLHttpRequest',
        },
        'auth': (args.token, 'token'),
        'verify': args.insecure,
    }

    # Fragments of the status line; joined with single spaces at the end.
    messages = []

    metric_names = [
        'org.graylog2.throughput.input.1-sec-rate',
        'org.graylog2.throughput.output.1-sec-rate',
        'org.graylog2.journal.append.1-sec-rate',
        'org.graylog2.journal.read.1-sec-rate',
        'org.graylog2.journal.segments',
        'org.graylog2.journal.entries-uncommitted',
        'jvm.memory.heap.used',
        'jvm.memory.heap.committed',
        'jvm.memory.heap.max',
    ]

    input_type = None
    input_name_clean = None
    if args.input:
        input_states = fetch_with_retry(f'{base_url}/api/cluster/inputstates', **request_kwargs).json()
        input_data = transform_inputs(input_states).get(args.input, {})

        # Get it over with: a missing or stopped input is an immediate CRITICAL.
        if input_data.get('state') != 'RUNNING':
            input_name = args.input
            if args.html:
                failure_text = f'Graylog input <a href="{base_url}/system/inputs" target="_blank">"{input_name}" is not running!</a>'
            else:
                failure_text = f'Graylog input "{input_name}" is not running!'
            print_icinga2_check_status(failure_text, nagios.STATE_CRIT)
            sys.exit(nagios.STATE_CRIT)

        # The input is running, continue gathering metrics and other health checks.
        input_name = input_data['message_input']['title']
        messages.append(f'Graylog input "{input_name}" is running.')
        input_name_clean = input_name.lower().replace(' ', '_').replace('-', '_')

        input_type = input_data['message_input']['type']
        metric_names += [
            f'{input_type}.{args.input}.incomingMessages',
            f'{input_type}.{args.input}.open_connections',
            f'{input_type}.{args.input}.total_connections',
            f'{input_type}.{args.input}.written_bytes_1sec',
            f'{input_type}.{args.input}.written_bytes_total',
            f'{input_type}.{args.input}.read_bytes_1sec',
            f'{input_type}.{args.input}.read_bytes_total',
        ]

    metrics_response = fetch_with_retry(f'{base_url}/api/cluster/metrics/multiple', method='post',
                                        json={'metrics': metric_names}, **request_kwargs).json()
    # Only the first entry of the response mapping is used.
    # NOTE(review): presumably there is one entry per Graylog node - confirm.
    reported_metrics = next(iter(metrics_response.values()))['metrics']

    # Format the metrics for later: strip the prefixes and turn the dotted
    # names into underscore-separated keys.
    metrics_data = {}
    for metric in reported_metrics:
        name = metric['full_name']
        if args.input:
            # The input type must be removed before the 'org.graylog2.' prefix,
            # otherwise the type string would no longer match.
            name = name.replace(input_type, '').replace('org.graylog2.', '').replace(args.input, '')
        else:
            name = name.replace('org.graylog2.', '')
        name = name.strip('.').replace('-', '_').replace('.', '_')

        payload = metric['metric']
        if 'value' in payload:
            value = payload['value']
        elif 'count' in payload:
            value = payload['count']
        elif 'rate' in payload:
            value = payload['rate']['one_minute']
            name = f'{name}_one_minute'
        else:
            raise ValueError(f'metric "{metric["full_name"]}" has no value, count or rate')
        metrics_data[name] = int(value)

    heap_max = metrics_data['jvm_memory_heap_max']

    perfdata = {}
    if args.input:
        # Some metric names are changed for better readability
        perfdata.update({
            f'{input_name_clean}_incoming_messages_rate_per_sec_1min': {
                'value': metrics_data['incomingMessages_one_minute'],
                'min': 0,
            },
            f'{input_name_clean}_connections': {
                'value': metrics_data['open_connections'],
                'min': 0,
            },
            f'{input_name_clean}_network_out_total_1sec': {
                'value': metrics_data['written_bytes_1sec'],
                'min': 0,
                'unit': 'B',
            },
            f'{input_name_clean}_network_out_total_total': {
                'value': metrics_data['written_bytes_total'],
                'min': 0,
                'unit': 'B',
            },
            f'{input_name_clean}_network_in_1sec': {
                'value': metrics_data['read_bytes_1sec'],
                'min': 0,
                'unit': 'B',
            },
            f'{input_name_clean}_network_in_total': {
                'value': metrics_data['read_bytes_total'],
                'min': 0,
                'unit': 'B',
            },
        })

    # Stays OK unless the cluster checks below raise it. It is deliberately
    # not reset afterwards, so cluster problems and --crit-notif are reported
    # even when an input is being checked as well.
    exit_code = nagios.STATE_OK
    notif_str = ''

    if args.cluster_metrics:
        # Thresholds are a percentage of the maximum heap. Multiply before
        # dividing: truncating 100 / percent first would turn 75% into 100%
        # and divide by zero for percentages above 100.
        jvm_mem_usage_warn = int(heap_max * args.warn_mem / 100)
        jvm_mem_usage_crit = int(heap_max * args.crit_mem / 100)

        # Get traffic data for last 24 hrs
        traffic_last_24_hrs = fetch_with_retry(f'{base_url}/api/system/cluster/traffic?daily=false',
                                               **request_kwargs).json()
        # NOTE: despite the "_avg" names these are sums over the last 24 hours.
        input_traffic_avg = sum(v for k, v in traffic_last_24_hrs['input'].items() if parse_traffic_ts(k))
        output_traffic_avg = sum(v for k, v in traffic_last_24_hrs['output'].items() if parse_traffic_ts(k))

        elasticsearch_health = fetch_with_retry(f'{base_url}/api/system/indexer/cluster/health',
                                                **request_kwargs).json()
        elasticsearch_status = elasticsearch_health['status'].lower()
        elasticsearch_active_shards = elasticsearch_health['shards']['active']

        indexer_failures = fetch_with_retry(f'{base_url}/api/system/indexer/failures?limit=10&offset=0',
                                            **request_kwargs).json()

        perfdata.update({
            'throughput_input_1_sec_rate': {
                'value': int(metrics_data['throughput_input_1_sec_rate']),
                'min': 0,
            },
            'throughput_output_1_sec_rate': {
                'value': int(metrics_data['throughput_output_1_sec_rate']),
                'min': 0,
            },
            'entries_uncommitted': {
                'value': metrics_data['journal_entries_uncommitted'],
                'min': 0,
            },
            'jvm_memory_used': {
                'value': metrics_data['jvm_memory_heap_used'],
                'min': 0,
                'warn': jvm_mem_usage_warn,
                'crit': jvm_mem_usage_crit,
                'max': int(heap_max),
                'unit': 'B',
            },
            'network_traffic_in_avg': {
                'value': input_traffic_avg,
                'min': 0,
                'unit': 'B',
            },
            'to_elasticsearch_24hrs_avg': {
                'value': output_traffic_avg,
                'min': 0,
                'unit': 'B',
            },
            'elasticsearch_active_shards': {
                'value': elasticsearch_active_shards,
                'min': 0,
            },
            'indexer_failures': {
                'value': indexer_failures['total'],
                'warn': 1,
                'crit': 1,
                'min': 0,
            },
        })

        # Check for notifications
        notifications_query = fetch_with_retry(f'{base_url}/api/system/notifications', **request_kwargs).json()
        notifications = [
            entry for entry in notifications_query['notifications']
            if entry['type'] != 'outdated_version' or not args.ignore_update_notif
        ]
        notification_count = len(notifications)
        noun = 'notification' if notification_count == 1 else 'notifications'
        verb = 'is' if notification_count == 1 else 'are'
        if notifications:
            if args.html:
                notif_str = f'<a href="{base_url}/system/overview" target="_blank">There {verb} {notification_count} {noun}.</a>'
            else:
                notif_str = f'There {verb} {notification_count} {noun}.'
        else:
            notif_str = 'No notifications'

        if indexer_failures['total'] > 0:
            indexer_failures_exit = nagios.STATE_CRIT
            if args.html:
                messages.append(
                    f'<a href="{base_url}/system/indices/failures" target="_blank">There are {indexer_failures["total"]} indexer failures!</a>')
            else:
                messages.append(f'There are {indexer_failures["total"]} indexer failures!')
        else:
            indexer_failures_exit = nagios.STATE_OK

        # https://go2docs.graylog.org/5-0/setting_up_graylog/elasticsearch.htm#ClusterStatusExplained
        if elasticsearch_status == 'yellow':
            elasticsearch_exit_code = nagios.STATE_WARN
            messages.append('Elasticsearch is condition YELLOW!')
        elif elasticsearch_status == 'red':
            elasticsearch_exit_code = nagios.STATE_CRIT
            messages.append('Elasticsearch is condition RED!')
        elif elasticsearch_status == 'green':
            elasticsearch_exit_code = nagios.STATE_OK
        else:
            print_icinga2_check_status(f'unknown Elasticsearch health: {elasticsearch_status}', nagios.STATE_UNKNOWN)
            sys.exit(nagios.STATE_UNKNOWN)

        jvm_mem_usage_state = get_state(int(metrics_data['jvm_memory_heap_used']), jvm_mem_usage_warn,
                                        jvm_mem_usage_crit, operator='gt')
        if jvm_mem_usage_state != nagios.STATE_OK:
            messages.append('JVM memory usage is high!')

        exit_code = max(nagios.STATE_OK, jvm_mem_usage_state, elasticsearch_exit_code, indexer_failures_exit)

        if notifications:
            messages.append(f'There {verb} {notification_count} {noun}!')
            if args.crit_notif:
                exit_code = nagios.STATE_CRIT  # force crit

        jvm_used_percent = int((perfdata['jvm_memory_used']['value'] / heap_max) * 100)
        if args.input:
            # show less data
            messages.append(f'JVM memory usage: {jvm_used_percent}%')
        else:
            # show more data
            throughput_in = human_readable_size(perfdata['throughput_input_1_sec_rate']['value'])
            throughput_out = human_readable_size(perfdata['throughput_output_1_sec_rate']['value'])
            active_shards = perfdata['elasticsearch_active_shards']['value']
            messages.append(
                f'JVM memory usage: {jvm_used_percent}%, throughput last 1 second: {throughput_in} in - {throughput_out} out, Elasticsearch active shards: {active_shards}')

    if args.input:
        events_rate = perfdata[f'{input_name_clean}_incoming_messages_rate_per_sec_1min']['value']
        connections = perfdata[f'{input_name_clean}_connections']['value']
        network_in_total = human_readable_size(perfdata[f'{input_name_clean}_network_in_total']['value'],
                                               decimal_places=0)
        messages.append(
            f'{input_name_clean} events/second for last minute: {events_rate}, {input_name_clean}_connections: {connections}, {input_name_clean}_network_in_total: {network_in_total}')

    text_result = ' '.join(messages)
    if not args.input:
        # Cluster-only runs put the notification summary on its own line.
        text_result = text_result + '\n' + notif_str

    print_icinga2_check_status(text_result, exit_code, perfdata)
    sys.exit(exit_code)
if __name__ == "__main__":
    # sys.exit() inside main() raises SystemExit, which is not an Exception
    # subclass, so regular exits pass through this handler untouched.
    try:
        main()
    except Exception as exc:
        # Any unexpected failure is reported as UNKNOWN, followed by the
        # traceback on standard output.
        print(f'UNKNOWN: exception "{exc}"')
        print(traceback.format_exc())
        sys.exit(nagios.STATE_UNKNOWN)