check_graylog: fix
parent 725b7a2bab
commit 61ec9a8580
@@ -41,7 +41,8 @@ def main():
     parser.add_argument('--insecure', action='store_false', help="Don't verify SSL")
     parser.add_argument('--crit-notif', action='store_true', help='Return critical when there are notifications')
     parser.add_argument('--html', action='store_true', help='Print HTML')
-    parser.add_argument('--cluster-metrics', action='store_true', help='Also gather cluster metrics and check for notifications')
+    parser.add_argument('--cluster-metrics', action='store_true',
+                        help='Also gather cluster metrics and check for notifications')
     args = parser.parse_args()
 
     base_url = args.url.strip('/')
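Note on the surrounding options (not part of this change): '--insecure' uses action='store_false', so args.insecure defaults to True and only becomes False when the flag is passed, which is what makes the later verify=args.insecure calls behave as expected. A minimal standalone demonstration:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--insecure', action='store_false', help="Don't verify SSL")

print(parser.parse_args([]).insecure)              # True  -> SSL verification on
print(parser.parse_args(['--insecure']).insecure)  # False -> SSL verification off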
@@ -72,8 +73,25 @@ def main():
     }
 
     if args.input:
-        input_data = transform_inputs(fetch_with_retry(f'{base_url}/api/cluster/inputstates', headers=headers, auth=(args.token, 'token'), verify=args.insecure).json()).get(args.input, {})
+        # Get the basic input metadata
+        input_data = transform_inputs(
+            fetch_with_retry(f'{base_url}/api/cluster/inputstates', headers=headers, auth=(args.token, 'token'),
+                             verify=args.insecure).json()).get(args.input, {})
+        # Get it over with
+        if bool(input_data) and input_data.get('state') == 'RUNNING':
+            input_name = input_data["message_input"]["title"]
+            text_result = f'Graylog input "{input_name}" is running.'
+        else:
+            input_name = args.input
+            if args.html:
+                text_result = f'Graylog input <a href="{base_url}/system/inputs" target="_blank">"{input_name}" is not running!</a>'
+            else:
+                text_result = f'Graylog input "{input_name}" is not running!'
+            print_icinga2_check_status(text_result, nagios.STATE_CRIT)
+            sys.exit(nagios.STATE_CRIT)
+
+        # If the input is running, continue gathering metrics and other health checks
+        input_name_clean = input_name.lower().replace(' ', '_').replace('-', '_')
+
         type = input_data['message_input']['type']
         metrics_json['metrics'] = metrics_json['metrics'] + [
             f'{type}.{args.input}.incomingMessages',
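The helpers used above, fetch_with_retry() and transform_inputs(), are defined elsewhere in the script and are not part of this diff. A minimal sketch of what they are assumed to do, based only on how they are called here (retry policy, response shape, and the lookup key are assumptions):

import time
import requests


def fetch_with_retry(url, method='get', retries=3, backoff=2, **kwargs):
    # Assumed behaviour: thin wrapper around requests that retries transient failures.
    for attempt in range(retries):
        try:
            response = requests.request(method, url, **kwargs)
            response.raise_for_status()
            return response
        except requests.RequestException:
            if attempt == retries - 1:
                raise
            time.sleep(backoff * (attempt + 1))


def transform_inputs(inputstates: dict) -> dict:
    # Assumed behaviour: flatten the per-node /api/cluster/inputstates response into a
    # single dict keyed by input ID, so .get(args.input, {}) can look up one input.
    inputs = {}
    for node_id, states in inputstates.items():
        for state in states:
            inputs[state['message_input']['id']] = state
    return inputs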
@@ -85,7 +103,8 @@ def main():
             f'{type}.{args.input}.read_bytes_total',
         ]
 
-        r = fetch_with_retry(f'{base_url}/api/cluster/metrics/multiple', method='post', headers=headers, auth=(args.token, 'token'),
+        r = fetch_with_retry(f'{base_url}/api/cluster/metrics/multiple', method='post', headers=headers,
+                             auth=(args.token, 'token'),
                              verify=args.insecure, json=metrics_json).json()
         input_metrics = r[list(r.keys())[0]]['metrics']
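For context on the two lines above: the POST body is just the accumulated list of metric names, and the response is keyed by node ID, which is why the first key is taken. An illustrative (abridged) shape, with field names beyond 'metrics' being assumptions rather than confirmed API details:

# Request body sent as json=metrics_json:
metrics_json = {'metrics': ['<type>.<input_id>.incomingMessages',
                            '<type>.<input_id>.read_bytes_total']}

# Illustrative /api/cluster/metrics/multiple response, keyed by node ID:
r = {
    'node-uuid-1': {
        'total': 2,
        'metrics': [
            {'full_name': '<type>.<input_id>.incomingMessages', 'metric': {'rate': {'one_minute': 12.3}}},
            {'full_name': '<type>.<input_id>.read_bytes_total', 'metric': {'count': 456789}},
        ],
    },
}
input_metrics = r[list(r.keys())[0]]['metrics']  # metrics reported by the first node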
@@ -111,22 +130,6 @@ def main():
     perfdata = {}
 
     if args.input:
-        # Get it over with
-        if bool(input_data) and input_data.get('state') == 'RUNNING':
-            input_name = input_data["message_input"]["title"]
-            text_result = f'Graylog input "{input_name}" is running.'
-        else:
-            input_name = args.input
-            if args.html:
-                text_result = f'Graylog input <a href="{base_url}/system/inputs" target="_blank">"{input_name}" is not running!</a>'
-            else:
-                text_result = f'Graylog input "{input_name}" is not running!'
-            print_icinga2_check_status(text_result, nagios.STATE_CRIT)
-            sys.exit(nagios.STATE_CRIT)
-
-        # If the input is running, continue gathering metrics and other health checks
-        input_name_clean = input_name.lower().replace(' ', '_').replace('-', '_')
-
         # Some metric names are changed for better readability
         perfdata.update({
             f'{input_name_clean}_incoming_messages_rate_per_sec_1min': {
@@ -164,15 +167,18 @@ def main():
         jvm_mem_usage_crit = int(metrics_data['jvm_memory_heap_max'] / int(100 / args.crit_mem))
 
         # Get traffic data for last 24 hrs
-        traffic_last_24_hrs = fetch_with_retry(f'{base_url}/api/system/cluster/traffic?daily=false', headers=headers, auth=(args.token, 'token'), verify=args.insecure).json()
+        traffic_last_24_hrs = fetch_with_retry(f'{base_url}/api/system/cluster/traffic?daily=false', headers=headers,
+                                               auth=(args.token, 'token'), verify=args.insecure).json()
         input_traffic_avg = sum([v for k, v in traffic_last_24_hrs['input'].items() if parse_traffic_ts(k)])
         output_traffic_avg = sum([v for k, v in traffic_last_24_hrs['output'].items() if parse_traffic_ts(k)])
 
-        elasticsearch_health = fetch_with_retry(f'{base_url}/api/system/indexer/cluster/health', headers=headers, auth=(args.token, 'token'), verify=args.insecure).json()
+        elasticsearch_health = fetch_with_retry(f'{base_url}/api/system/indexer/cluster/health', headers=headers,
+                                                auth=(args.token, 'token'), verify=args.insecure).json()
         elasticsearch_status = elasticsearch_health['status'].lower()
         elasticsearch_active_shards = elasticsearch_health['shards']['active']
 
-        indexer_failures = fetch_with_retry(f'{base_url}/api/system/indexer/failures?limit=10&offset=0', headers=headers, auth=(args.token, 'token'), verify=args.insecure).json()
+        indexer_failures = fetch_with_retry(f'{base_url}/api/system/indexer/failures?limit=10&offset=0',
+                                            headers=headers, auth=(args.token, 'token'), verify=args.insecure).json()
 
         perfdata.update({
             'throughput_input_1_sec_rate': {
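parse_traffic_ts() is also defined outside this diff; the two sums above only make sense if it keeps the traffic buckets from the last 24 hours. A rough sketch under that assumption (timestamp format and cutoff are guesses):

from datetime import datetime, timedelta, timezone


def parse_traffic_ts(ts: str) -> bool:
    # Assumed behaviour: keep only buckets whose timestamp is within the last 24 hours.
    # Graylog's traffic endpoint returns ISO-8601 keys such as '2023-01-01T00:00:00.000Z'.
    parsed = datetime.strptime(ts, '%Y-%m-%dT%H:%M:%S.%fZ').replace(tzinfo=timezone.utc)
    return parsed > datetime.now(timezone.utc) - timedelta(hours=24)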
@@ -219,7 +225,8 @@ def main():
 
     # Check for notifications
     if args.cluster_metrics:
-        notifications = fetch_with_retry(f'{base_url}/api/system/notifications', headers=headers, auth=(args.token, 'token'), verify=args.insecure).json()
+        notifications = fetch_with_retry(f'{base_url}/api/system/notifications', headers=headers,
+                                         auth=(args.token, 'token'), verify=args.insecure).json()
         if notifications['total'] > 0:
             notif_str = 'Notifications:'
             for notification in notifications['notifications']:
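The check only relies on the 'total' count and the 'notifications' list from this endpoint. An illustrative (abridged) payload, with the exact field set being an assumption about a typical Graylog response:

# Illustrative /api/system/notifications response (abridged):
notifications = {
    'total': 1,
    'notifications': [
        {'severity': 'urgent',
         'type': 'no_input_running',
         'timestamp': '2023-01-01T00:00:00.000Z',
         'node_id': 'node-uuid-1'},
    ],
}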
@@ -249,7 +256,8 @@ def main():
             print_icinga2_check_status(f'unknown Elasticsearch health: {elasticsearch_status}', nagios.STATE_UNKNOWN)
             sys.exit(nagios.STATE_UNKNOWN)
 
-        jvm_mem_usage_state = get_state(int(metrics_data['jvm_memory_heap_used']), jvm_mem_usage_warn, jvm_mem_usage_crit, operator='gt')
+        jvm_mem_usage_state = get_state(int(metrics_data['jvm_memory_heap_used']), jvm_mem_usage_warn,
+                                        jvm_mem_usage_crit, operator='gt')
         if jvm_mem_usage_state != nagios.STATE_OK:
             text_result += f' JVM memory usage is high!'
 
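get_state() is likewise defined elsewhere; from the call above it is assumed to compare a value against warning/critical thresholds and return a Nagios state constant. A minimal sketch under that assumption (the STATE_WARN name is a guess; only STATE_OK, STATE_CRIT, and STATE_UNKNOWN appear in this diff):

import nagios  # local module assumed to define the STATE_* constants used above


def get_state(value, warn, crit, operator='gt'):
    # Assumed behaviour: 'gt' means higher is worse (e.g. JVM heap bytes used);
    # an 'lt' operator would mean lower is worse.
    if operator == 'gt':
        if value >= crit:
            return nagios.STATE_CRIT
        if value >= warn:
            return nagios.STATE_WARN
    elif operator == 'lt':
        if value <= crit:
            return nagios.STATE_CRIT
        if value <= warn:
            return nagios.STATE_WARN
    return nagios.STATE_OK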