check_graylog: split input and cluster metrics

parent 9f00479dad
commit 5cb9f2a457

check_graylog.py (159 changed lines)
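Previously the script required -i/--input and gathered input and cluster metrics in a single pass. It now builds the shared metric list (throughput, journal, JVM heap) first, appends the input-specific metrics only when an input ID is given, and runs the cluster-wide checks (traffic averages, Elasticsearch health, indexer failures) only under --cluster-metrics, which is implied when no input is set. In outline, the new flow looks like this (a condensed sketch of main(); the helper names are placeholders, not functions from the script):

    # Condensed sketch of the new control flow; helper names are illustrative.
    metrics = list(CLUSTER_WIDE_METRICS)           # throughput, journal, JVM heap names
    if not args.input:
        args.cluster_metrics = True                # no input given: cluster-only mode
    else:
        metrics += input_metric_names(args.input)  # '{type}.{input_id}.*' entries
    metrics_data = fetch_metrics(metrics)          # one POST to /api/cluster/metrics/multiple
    perfdata = {}
    if args.input:
        perfdata.update(input_perfdata(metrics_data))    # keys prefixed with the input name
    if args.cluster_metrics:
        perfdata.update(cluster_perfdata(metrics_data))  # traffic, Elasticsearch, JVM, journal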
--- a/check_graylog.py
+++ b/check_graylog.py
@@ -35,16 +35,18 @@ def main():
     parser = argparse.ArgumentParser(description='Check Graylog input health')
     parser.add_argument('-u', '--url', required=True, help='The base Graylog URL')
     parser.add_argument('-t', '--token', required=True, help='Graylog API token')
-    parser.add_argument('-i', '--input', required=True, help='Input ID to check')
+    parser.add_argument('-i', '--input', help='Input ID to check. If unset, will check cluster metrics')
     parser.add_argument('--warn-mem', type=int, default=75, help='Percentage of JVM memory used for warning')
     parser.add_argument('--crit-mem', type=int, default=100, help='Percentage of JVM memory used for critical')
     parser.add_argument('--insecure', action='store_false', help="Don't verify SSL")
     parser.add_argument('--crit-notif', action='store_true', help='Return critical when there are notifications')
     parser.add_argument('--html', action='store_true', help='Print HTML')
-    parser.add_argument('--cluster-metrics', action='store_true', help='Also gather cluster metrics and check for notifications.')
+    parser.add_argument('--cluster-metrics', action='store_true', help='Also gather cluster metrics and check for notifications')
     args = parser.parse_args()

     base_url = args.url.strip('/')
+    if not args.input:
+        args.cluster_metrics = True

     if not args.insecure:
         requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning)
@@ -54,9 +56,61 @@ def main():
         'X-Requested-By': 'XMLHttpRequest',
     }

+    text_result = ''
+    metrics_json = {
+        'metrics': [
+            'org.graylog2.throughput.input.1-sec-rate',
+            'org.graylog2.throughput.output.1-sec-rate',
+            "org.graylog2.journal.append.1-sec-rate",
+            "org.graylog2.journal.read.1-sec-rate",
+            "org.graylog2.journal.segments",
+            "org.graylog2.journal.entries-uncommitted",
+            "jvm.memory.heap.used",
+            "jvm.memory.heap.committed",
+            "jvm.memory.heap.max"
+        ],
+    }
+
+    if args.input:
         # Get the basic input metadata
         input_data = transform_inputs(fetch_with_retry(f'{base_url}/api/cluster/inputstates', headers=headers, auth=(args.token, 'token'), verify=args.insecure).json()).get(args.input, {})
+        type = input_data['message_input']['type']
+        metrics_json['metrics'] = metrics_json['metrics'] + [
+            f'{type}.{args.input}.incomingMessages',
+            f'{type}.{args.input}.open_connections',
+            f'{type}.{args.input}.total_connections',
+            f'{type}.{args.input}.written_bytes_1sec',
+            f'{type}.{args.input}.written_bytes_total',
+            f'{type}.{args.input}.read_bytes_1sec',
+            f'{type}.{args.input}.read_bytes_total',
+        ]
+
+    r = fetch_with_retry(f'{base_url}/api/cluster/metrics/multiple', method='post', headers=headers, auth=(args.token, 'token'),
+                         verify=args.insecure, json=metrics_json).json()
+    input_metrics = r[list(r.keys())[0]]['metrics']
+
+    # Format the metrics for later
+    metrics_data = {}
+    for metric in input_metrics:
+        if args.input:
+            name = metric['full_name'].replace(type, '').replace('org.graylog2.', '').replace(args.input, '')
+        else:
+            name = metric['full_name'].replace('org.graylog2.', '')
+        name = name.strip('.').replace('-', '_').replace('.', '_')
+        value = None
+        if 'value' in metric['metric']:
+            value = metric["metric"]["value"]
+        elif 'count' in metric['metric']:
+            value = metric["metric"]["count"]
+        elif 'rate' in metric['metric']:
+            value = metric["metric"]["rate"]["one_minute"]
+            name = f'{name}_one_minute'
+        value = int(value)
+        metrics_data[name] = value
+
+    perfdata = {}
+
+    if args.input:
         # Get it over with
         if bool(input_data) and input_data.get('state') == 'RUNNING':
             input_name = input_data["message_input"]["title"]
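The formatting loop in the hunk above flattens Graylog's dotted metric names into snake_case perfdata keys, stripping the org.graylog2. prefix and, in input mode, the input type and ID. A standalone illustration of that normalization:

    # Standalone illustration of the name normalization in the loop above.
    full_name = 'org.graylog2.journal.entries-uncommitted'
    name = full_name.replace('org.graylog2.', '')               # 'journal.entries-uncommitted'
    name = name.strip('.').replace('-', '_').replace('.', '_')
    print(name)                                                 # journal_entries_uncommitted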
@@ -73,72 +127,8 @@ def main():
             # If the input is running, continue gathering metrics and other health checks
             input_name_clean = input_name.lower().replace(' ', '_').replace('-', '_')
-
-        # Get metrics for the input
-        type = input_data['message_input']['type']
-        metrics_json = {
-            'metrics': [
-                'org.graylog2.throughput.input.1-sec-rate',
-                'org.graylog2.throughput.output.1-sec-rate',
-                f'{type}.{args.input}.incomingMessages',
-                f'{type}.{args.input}.open_connections',
-                f'{type}.{args.input}.total_connections',
-                f'{type}.{args.input}.written_bytes_1sec',
-                f'{type}.{args.input}.written_bytes_total',
-                f'{type}.{args.input}.read_bytes_1sec',
-                f'{type}.{args.input}.read_bytes_total',
-                "org.graylog2.journal.append.1-sec-rate",
-                "org.graylog2.journal.read.1-sec-rate",
-                "org.graylog2.journal.segments",
-                "org.graylog2.journal.entries-uncommitted",
-                "jvm.memory.heap.used",
-                "jvm.memory.heap.committed",
-                "jvm.memory.heap.max"
-            ],
-        }
-        r = fetch_with_retry(f'{base_url}/api/cluster/metrics/multiple', method='post', headers=headers, auth=(args.token, 'token'),
-                             verify=args.insecure, json=metrics_json).json()
-        input_metrics = r[list(r.keys())[0]]['metrics']
-
-        # Format the metrics for later
-        metrics_data = {}
-        for metric in input_metrics:
-            name = metric['full_name'].replace(type, '').replace('org.graylog2.', '').replace(args.input, '').strip(
-                '.').replace('-', '_').replace('.', '_')
-            value = None
-            if 'value' in metric['metric']:
-                value = metric["metric"]["value"]
-            elif 'count' in metric['metric']:
-                value = metric["metric"]["count"]
-            elif 'rate' in metric['metric']:
-                value = metric["metric"]["rate"]["one_minute"]
-                name = f'{name}_one_minute'
-            value = int(value)
-            metrics_data[name] = value
-
-        jvm_mem_usage_warn = int(metrics_data['jvm_memory_heap_max'] / int(100 / args.warn_mem))
-        jvm_mem_usage_crit = int(metrics_data['jvm_memory_heap_max'] / int(100 / args.crit_mem))
-
-        # Get traffic data for last 24 hrs
-        traffic_last_24_hrs = fetch_with_retry(f'{base_url}/api/system/cluster/traffic?daily=false', headers=headers, auth=(args.token, 'token'), verify=args.insecure).json()
-        input_traffic_avg = sum([v for k, v in traffic_last_24_hrs['input'].items() if parse_traffic_ts(k)])
-        output_traffic_avg = sum([v for k, v in traffic_last_24_hrs['output'].items() if parse_traffic_ts(k)])
-
-        elastisearch_health = fetch_with_retry(f'{base_url}/api/system/indexer/cluster/health', headers=headers, auth=(args.token, 'token'), verify=args.insecure).json()
-        elastisearch_status = elastisearch_health['status'].lower()
-        elastisearch_active_shards = elastisearch_health['shards']['active']
-
-        indexer_failures = fetch_with_retry(f'{base_url}/api/system/indexer/failures?limit=10&offset=0', headers=headers, auth=(args.token, 'token'), verify=args.insecure).json()
-
             # Some metric names are changed for better readability
-            perfdata = {
-                f'{input_name_clean}_throughput_input_1_sec_rate': {
-                    'value': int(metrics_data['throughput_input_1_sec_rate']),
-                    'min': 0,
-                },
-                f'{input_name_clean}_throughput_output_1_sec_rate': {
-                    'value': int(metrics_data['throughput_output_1_sec_rate']),
-                    'min': 0,
-                },
+            perfdata.update({
                 f'{input_name_clean}_incoming_messages_rate_per_sec_one_minute': {
                     'value': metrics_data['incomingMessages_one_minute'],
                     'min': 0,
@@ -167,10 +157,32 @@ def main():
                     'min': 0,
                     'unit': 'B',
                 }
-            }
+            })

     if args.cluster_metrics:
+        jvm_mem_usage_warn = int(metrics_data['jvm_memory_heap_max'] / int(100 / args.warn_mem))
+        jvm_mem_usage_crit = int(metrics_data['jvm_memory_heap_max'] / int(100 / args.crit_mem))
+
+        # Get traffic data for last 24 hrs
+        traffic_last_24_hrs = fetch_with_retry(f'{base_url}/api/system/cluster/traffic?daily=false', headers=headers, auth=(args.token, 'token'), verify=args.insecure).json()
+        input_traffic_avg = sum([v for k, v in traffic_last_24_hrs['input'].items() if parse_traffic_ts(k)])
+        output_traffic_avg = sum([v for k, v in traffic_last_24_hrs['output'].items() if parse_traffic_ts(k)])
+
+        elastisearch_health = fetch_with_retry(f'{base_url}/api/system/indexer/cluster/health', headers=headers, auth=(args.token, 'token'), verify=args.insecure).json()
+        elastisearch_status = elastisearch_health['status'].lower()
+        elastisearch_active_shards = elastisearch_health['shards']['active']
+
+        indexer_failures = fetch_with_retry(f'{base_url}/api/system/indexer/failures?limit=10&offset=0', headers=headers, auth=(args.token, 'token'), verify=args.insecure).json()
+
         perfdata.update({
+            'throughput_input_1_sec_rate': {
+                'value': int(metrics_data['throughput_input_1_sec_rate']),
+                'min': 0,
+            },
+            'throughput_output_1_sec_rate': {
+                'value': int(metrics_data['throughput_output_1_sec_rate']),
+                'min': 0,
+            },
             'entries_uncommitted': {
                 'value': metrics_data['journal_entries_uncommitted'],
                 'min': 0,
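A note on the threshold arithmetic moved into this hunk: int(100 / pct) truncates, so the derived byte limits are much coarser than the percentage flags suggest. A worked example with a hypothetical heap size:

    # Hypothetical 4 GB heap; --warn-mem defaults to 75.
    heap_max = 4_000_000_000
    warn_mem = 75
    warn_bytes = int(heap_max / int(100 / warn_mem))  # int(100 / 75) == 1
    print(warn_bytes)  # 4000000000 -- the whole heap, not 75% of it
    # An exact 75% cut-off would be int(heap_max * warn_mem / 100).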
@@ -183,7 +195,7 @@ def main():
                 'max': int(metrics_data['jvm_memory_heap_max']),
                 'unit': 'B',
             },
-            'from_network_traffic_avg': {
+            'network_traffic_in_avg': {
                 'value': input_traffic_avg,
                 'min': 0,
                 'unit': 'B',
@@ -246,10 +258,13 @@ def main():
         if args.crit_notif:
             exit_code = nagios.STATE_CRIT  # force crit

-        text_result = text_result + f' JVM memory usage: {int((perfdata["jvm_memory_used"]["value"] / metrics_data["jvm_memory_heap_max"]) * 100)}%, input incoming rate (events/second for last minute): {perfdata[f"{input_name_clean}_incoming_messages_rate_per_sec_one_minute"]["value"]}, input connections: {perfdata[f"{input_name_clean}_connections"]["value"]}, input total network in: {human_readable_size(perfdata[f"{input_name_clean}_network_in_total"]["value"], decimal_places=0)}' + '\n' + notif_str
-    else:
-        text_result = text_result + f' {input_name_clean}_incoming_messages_rate_per_sec_one_minute (events/second for last minute): {perfdata[f"{input_name_clean}_incoming_messages_rate_per_sec_one_minute"]["value"]}, {input_name_clean}_connections: {perfdata[f"{input_name_clean}_connections"]["value"]}, {input_name_clean}_network_in_total: {human_readable_size(perfdata[f"{input_name_clean}_network_in_total"]["value"], decimal_places=0)}' + '\n'
+        text_result = text_result + f' JVM memory usage: {int((perfdata["jvm_memory_used"]["value"] / metrics_data["jvm_memory_heap_max"]) * 100)}%'
+    if args.input:
+        text_result = text_result + f' {input_name_clean} events/second for last minute: {perfdata[f"{input_name_clean}_incoming_messages_rate_per_sec_one_minute"]["value"]}, {input_name_clean}_connections: {perfdata[f"{input_name_clean}_connections"]["value"]}, {input_name_clean}_network_in_total: {human_readable_size(perfdata[f"{input_name_clean}_network_in_total"]["value"], decimal_places=0)}'
         exit_code = nagios.STATE_OK
+    else:
+        text_result = text_result + '\n' + notif_str

     print_icinga2_check_status(text_result, exit_code, perfdata)
     sys.exit(exit_code)
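With this change either mode can be invoked on its own (the URL, token, and input ID below are placeholders):

    ./check_graylog.py -u https://graylog.example.com -t <api-token> -i <input-id>
    ./check_graylog.py -u https://graylog.example.com -t <api-token>    # cluster-only; --cluster-metrics is implied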