add check_greylog.py
This commit is contained in:
parent
ba92dddeec
commit
96fc88737e
|
@ -0,0 +1,189 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import sys
|
||||||
|
import traceback
|
||||||
|
|
||||||
|
import requests
|
||||||
|
from urllib3.exceptions import InsecureRequestWarning
|
||||||
|
|
||||||
|
import checker.nagios
|
||||||
|
from checker import print_icinga2_check_status
|
||||||
|
from checker.linuxfabric.base import get_state
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Check the health of one Graylog input and report it Icinga2-style.

    The check performs three Graylog API calls:

    1. ``/system/inputstates/<input>`` - state, type and title of the input.
    2. ``/cluster/metrics/multiple``   - throughput, connection, journal and
       JVM heap metrics (only the first node in the reply is evaluated).
    3. ``/system/notifications``       - pending system notifications, which
       are appended to the plugin output.

    Exit behaviour:

    * ``2`` when any API request fails.
    * ``checker.nagios.STATE_CRIT`` when the input is not ``RUNNING``.
    * ``checker.nagios.STATE_UNKNOWN`` when the metrics reply is empty.
    * otherwise the state returned by ``get_state`` for JVM heap usage
      compared against ``--warn-mem`` / ``--crit-mem`` (percent of heap max).
    """
    parser = argparse.ArgumentParser(description='Check Graylog input health')
    parser.add_argument('-u', '--url', required=True, help='Graylog API URL')
    parser.add_argument('-t', '--token', required=True, help='Graylog API token')
    parser.add_argument('-i', '--input', required=True, help='Input ID to check')
    parser.add_argument('--warn-mem', type=int, default=75, help='Percentage of JVM memory used for warning')
    parser.add_argument('--crit-mem', type=int, default=100, help='Percentage of JVM memory used for critical')
    parser.add_argument('--insecure', action='store_true', help="Don't verify SSL")
    parser.add_argument('--timeout', type=float, default=30,
                        help='Seconds to wait for each Graylog API request')
    args = parser.parse_args()

    verify_tls = not args.insecure
    if not verify_tls:
        # The user explicitly opted out of verification; keep the output clean.
        requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning)

    # Options shared by every API call. Without a timeout a stalled Graylog
    # node would make the check hang instead of reporting a failure.
    request_kwargs = {
        'headers': {
            'Accept': 'application/json',
            'X-Requested-By': 'XMLHttpRequest',
        },
        'auth': (args.token, 'token'),
        'verify': verify_tls,
        'timeout': args.timeout,
    }

    # --- 1. Input state ---------------------------------------------------
    try:
        response = requests.get(f'{args.url}/system/inputstates/{args.input}', **request_kwargs)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f'CRITICAL - Unable to query Graylog API: {e}')
        sys.exit(2)

    input_data = response.json()
    input_type = input_data['message_input']['type']

    # --- 2. Metrics -------------------------------------------------------
    metrics_json = {
        'metrics': [
            'org.graylog2.throughput.input.1-sec-rate',
            'org.graylog2.throughput.output.1-sec-rate',
            f'{input_type}.{args.input}.incomingMessages',
            f'{input_type}.{args.input}.open_connections',
            f'{input_type}.{args.input}.total_connections',
            f'{input_type}.{args.input}.written_bytes_1sec',
            f'{input_type}.{args.input}.written_bytes_total',
            f'{input_type}.{args.input}.read_bytes_1sec',
            f'{input_type}.{args.input}.read_bytes_total',
            'org.graylog2.journal.append.1-sec-rate',
            'org.graylog2.journal.read.1-sec-rate',
            'org.graylog2.journal.segments',
            'org.graylog2.journal.entries-uncommitted',
            'jvm.memory.heap.used',
            'jvm.memory.heap.committed',
            'jvm.memory.heap.max',
        ],
    }
    try:
        response = requests.post(f'{args.url}/cluster/metrics/multiple', json=metrics_json, **request_kwargs)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        # Only show a body that belongs to *this* request; on connection
        # errors there is none and the previous response must not be printed.
        message = f'CRITICAL - Unable to query Graylog API: {e}'
        if e.response is not None:
            message += f'\n{e.response.text}'
        print(message)
        sys.exit(2)

    cluster_metrics = response.json()
    if not cluster_metrics:
        print('UNKNOWN - Graylog returned no cluster metrics')
        sys.exit(checker.nagios.STATE_UNKNOWN)
    # The reply is keyed by node; only the first node is evaluated.
    input_metrics = next(iter(cluster_metrics.values()))['metrics']

    # Flatten the metrics into {short_name: int_value}. The short name is the
    # full metric name without the input type, the "org.graylog2." prefix and
    # the input id, with "-" and "." turned into "_".
    metrics_data = {}
    for metric in input_metrics:
        name = (
            metric['full_name']
            .replace(input_type, '')
            .replace('org.graylog2.', '')
            .replace(args.input, '')
            .strip('.')
            .replace('-', '_')
            .replace('.', '_')
        )
        payload = metric['metric']
        if 'value' in payload:
            value = payload['value']
        elif 'count' in payload:
            value = payload['count']
        elif 'rate' in payload:
            # Meters expose several rates; only the one-minute rate is used.
            value = payload['rate']['one_minute']
            name = f'{name}_one_minute'
        else:
            # Unknown metric shape: nothing usable to record.
            continue
        metrics_data[name] = int(value)

    # Thresholds are percentages of the maximum heap, expressed in bytes.
    heap_max = metrics_data['jvm_memory_heap_max']
    jvm_mem_usage_warn = int(heap_max * args.warn_mem / 100)
    jvm_mem_usage_crit = int(heap_max * args.crit_mem / 100)

    perfdata = {
        'throughput_input_1_sec_rate': {
            'value': int(metrics_data['throughput_input_1_sec_rate']),
            'min': 0,
        },
        'throughput_output_1_sec_rate': {
            'value': int(metrics_data['throughput_output_1_sec_rate']),
            'min': 0,
        },
        'incoming_messages_one_minute': {
            'value': metrics_data['incomingMessages_one_minute'],
            'min': 0,
        },
        'open_connections': {
            'value': metrics_data['open_connections'],
            'min': 0,
        },
        'total_connections': {
            'value': metrics_data['total_connections'],
            'min': 0,
        },
        'written_bytes_1sec': {
            'value': metrics_data['written_bytes_1sec'],
            'min': 0,
            'unit': 'B',
        },
        'read_bytes_1sec': {
            'value': metrics_data['read_bytes_1sec'],
            'min': 0,
            'unit': 'B',
        },
        'entries_uncommitted': {
            'value': metrics_data['journal_entries_uncommitted'],
            'min': 0,
        },
        'jvm_memory_used': {
            'value': metrics_data['jvm_memory_heap_used'],
            'min': 0,
            'warn': jvm_mem_usage_warn,
            'crit': jvm_mem_usage_crit,
            'max': int(heap_max),
            'unit': 'B',
        },
    }

    jvm_mem_usage_state = get_state(int(metrics_data['jvm_memory_heap_used']), jvm_mem_usage_warn,
                                    jvm_mem_usage_crit, operator='gt')

    # --- 3. Notifications -------------------------------------------------
    try:
        response = requests.get(f'{args.url}/system/notifications', **request_kwargs)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f'CRITICAL - Unable to query Graylog API: {e}')
        sys.exit(2)

    notifications = response.json()
    if notifications['total'] > 0:
        notif_str = 'Notifications:' + ''.join(
            f'\n{notification["type"]}: {notification["description"]}'
            for notification in notifications['notifications']
        )
    else:
        notif_str = 'No notifications'

    # --- Result -----------------------------------------------------------
    input_title = input_data['message_input']['title']
    if input_data['state'] != 'RUNNING':
        # A stopped input is always critical; metrics are irrelevant then.
        print(f'Input "{input_title}" is not running!')
        print(notif_str)
        sys.exit(checker.nagios.STATE_CRIT)

    text_result = f'Input "{input_title}" is running.'
    if jvm_mem_usage_state != checker.nagios.STATE_OK:
        text_result += ' JVM memory usage is high!'

    heap_used_percent = int((perfdata['jvm_memory_used']['value'] / heap_max) * 100)
    text_result += (
        f' JVM memory usage: {heap_used_percent}%,'
        f' incoming_messages_one_minute: {perfdata["incoming_messages_one_minute"]["value"]},'
        f' open_connections: {perfdata["open_connections"]["value"]}'
        '\n' + notif_str
    )

    exit_code = max(checker.nagios.STATE_OK, jvm_mem_usage_state)
    print_icinga2_check_status(text_result, exit_code, perfdata)
    sys.exit(exit_code)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Script entry point: run the check and convert any unexpected error
    # into an UNKNOWN plugin result instead of a bare interpreter traceback.
    try:
        main()
    except Exception as exc:
        # First line is the plugin summary, followed by the full traceback
        # for debugging. sys.exit() calls made inside main() raise SystemExit,
        # which is not an Exception subclass and therefore passes through.
        print(f'UNKNOWN: exception "{exc}"', traceback.format_exc(), sep='\n')
        sys.exit(checker.nagios.STATE_UNKNOWN)
|
|
@ -87,9 +87,9 @@ def main():
|
||||||
warnings.simplefilter("ignore", category=RuntimeWarning)
|
warnings.simplefilter("ignore", category=RuntimeWarning)
|
||||||
speedtest_results = run_speedtest()
|
speedtest_results = run_speedtest()
|
||||||
|
|
||||||
upload_speed_state = get_state(speedtest_results['upload_speed'], args.warn_up, args.critical_up, _operator='le')
|
upload_speed_state = get_state(speedtest_results['upload_speed'], args.warn_up, args.critical_up, operator='le')
|
||||||
download_speed_state = get_state(speedtest_results['download_speed'], args.warn_down, args.critical_down, _operator='le')
|
download_speed_state = get_state(speedtest_results['download_speed'], args.warn_down, args.critical_down, operator='le')
|
||||||
latency_state = get_state(speedtest_results['latency'], args.warn_latency, args.critical_latency, _operator='ge')
|
latency_state = get_state(speedtest_results['latency'], args.warn_latency, args.critical_latency, operator='ge')
|
||||||
exit_code = max(upload_speed_state, download_speed_state, latency_state)
|
exit_code = max(upload_speed_state, download_speed_state, latency_state)
|
||||||
text_result = f"upload: {speedtest_results['upload_speed']:.1f} Mbps, download: {speedtest_results['download_speed']:.1f} Mbps, latency: {speedtest_results['latency']:.1f} ms, jitter: {speedtest_results['jitter']:.1f} ms"
|
text_result = f"upload: {speedtest_results['upload_speed']:.1f} Mbps, download: {speedtest_results['download_speed']:.1f} Mbps, latency: {speedtest_results['latency']:.1f} ms, jitter: {speedtest_results['jitter']:.1f} ms"
|
||||||
|
|
||||||
|
|
|
@ -106,7 +106,7 @@ def get_perfdata(label, value, uom=None, warn=None, crit=None, _min=None, _max=N
|
||||||
return msg
|
return msg
|
||||||
|
|
||||||
|
|
||||||
def get_state(value, warn, crit, _operator='ge'):
|
def get_state(value, warn, crit, operator='ge'):
|
||||||
"""Returns the STATE by comparing `value` to the given thresholds using
|
"""Returns the STATE by comparing `value` to the given thresholds using
|
||||||
a comparison `_operator`. `warn` and `crit` threshold may also be `None`.
|
a comparison `_operator`. `warn` and `crit` threshold may also be `None`.
|
||||||
|
|
||||||
|
@ -123,7 +123,7 @@ def get_state(value, warn, crit, _operator='ge'):
|
||||||
Numeric warning threshold
|
Numeric warning threshold
|
||||||
crit : float
|
crit : float
|
||||||
Numeric critical threshold
|
Numeric critical threshold
|
||||||
_operator : string
|
operator : string
|
||||||
`eq` = equal to
|
`eq` = equal to
|
||||||
`ge` = greater or equal
|
`ge` = greater or equal
|
||||||
`gt` = greater than
|
`gt` = greater than
|
||||||
|
@ -139,7 +139,7 @@ def get_state(value, warn, crit, _operator='ge'):
|
||||||
"""
|
"""
|
||||||
# make sure to use float comparison
|
# make sure to use float comparison
|
||||||
value = float(value)
|
value = float(value)
|
||||||
if _operator == 'ge':
|
if operator == 'ge':
|
||||||
if crit is not None:
|
if crit is not None:
|
||||||
if value >= float(crit):
|
if value >= float(crit):
|
||||||
return STATE_CRIT
|
return STATE_CRIT
|
||||||
|
@ -148,7 +148,7 @@ def get_state(value, warn, crit, _operator='ge'):
|
||||||
return STATE_WARN
|
return STATE_WARN
|
||||||
return STATE_OK
|
return STATE_OK
|
||||||
|
|
||||||
if _operator == 'gt':
|
if operator == 'gt':
|
||||||
if crit is not None:
|
if crit is not None:
|
||||||
if value > float(crit):
|
if value > float(crit):
|
||||||
return STATE_CRIT
|
return STATE_CRIT
|
||||||
|
@ -157,7 +157,7 @@ def get_state(value, warn, crit, _operator='ge'):
|
||||||
return STATE_WARN
|
return STATE_WARN
|
||||||
return STATE_OK
|
return STATE_OK
|
||||||
|
|
||||||
if _operator == 'le':
|
if operator == 'le':
|
||||||
if crit is not None:
|
if crit is not None:
|
||||||
if value <= float(crit):
|
if value <= float(crit):
|
||||||
return STATE_CRIT
|
return STATE_CRIT
|
||||||
|
@ -166,7 +166,7 @@ def get_state(value, warn, crit, _operator='ge'):
|
||||||
return STATE_WARN
|
return STATE_WARN
|
||||||
return STATE_OK
|
return STATE_OK
|
||||||
|
|
||||||
if _operator == 'lt':
|
if operator == 'lt':
|
||||||
if crit is not None:
|
if crit is not None:
|
||||||
if value < float(crit):
|
if value < float(crit):
|
||||||
return STATE_CRIT
|
return STATE_CRIT
|
||||||
|
@ -175,7 +175,7 @@ def get_state(value, warn, crit, _operator='ge'):
|
||||||
return STATE_WARN
|
return STATE_WARN
|
||||||
return STATE_OK
|
return STATE_OK
|
||||||
|
|
||||||
if _operator == 'eq':
|
if operator == 'eq':
|
||||||
if crit is not None:
|
if crit is not None:
|
||||||
if value == float(crit):
|
if value == float(crit):
|
||||||
return STATE_CRIT
|
return STATE_CRIT
|
||||||
|
@ -184,7 +184,7 @@ def get_state(value, warn, crit, _operator='ge'):
|
||||||
return STATE_WARN
|
return STATE_WARN
|
||||||
return STATE_OK
|
return STATE_OK
|
||||||
|
|
||||||
if _operator == 'ne':
|
if operator == 'ne':
|
||||||
if crit is not None:
|
if crit is not None:
|
||||||
if value != float(crit):
|
if value != float(crit):
|
||||||
return STATE_CRIT
|
return STATE_CRIT
|
||||||
|
@ -193,7 +193,7 @@ def get_state(value, warn, crit, _operator='ge'):
|
||||||
return STATE_WARN
|
return STATE_WARN
|
||||||
return STATE_OK
|
return STATE_OK
|
||||||
|
|
||||||
if _operator == 'range':
|
if operator == 'range':
|
||||||
if crit is not None:
|
if crit is not None:
|
||||||
if not contine_or_exit(match_range(value, crit)):
|
if not contine_or_exit(match_range(value, crit)):
|
||||||
return STATE_CRIT
|
return STATE_CRIT
|
||||||
|
|
Loading…
Reference in New Issue