add a `method` parameter to get_with_retry() and rename it to fetch_with_retry()
check_graylog: add more healthchecks and perfdata
This commit is contained in:
parent
d431bd5d7a
commit
9f00479dad
265
check_graylog.py
265
check_graylog.py
|
@ -3,27 +3,49 @@
|
||||||
import argparse
|
import argparse
|
||||||
import sys
|
import sys
|
||||||
import traceback
|
import traceback
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
from urllib3.exceptions import InsecureRequestWarning
|
from urllib3.exceptions import InsecureRequestWarning
|
||||||
|
|
||||||
import checker.nagios
|
from checker import nagios
|
||||||
from checker import print_icinga2_check_status
|
from checker import print_icinga2_check_status
|
||||||
|
from checker.http import fetch_with_retry
|
||||||
from checker.linuxfabric.base import get_state
|
from checker.linuxfabric.base import get_state
|
||||||
from checker.units import human_readable_size
|
from checker.units import human_readable_size
|
||||||
|
|
||||||
|
|
||||||
|
def transform_inputs(old_dict):
    """
    Flatten a mapping of item lists into a single dict keyed by item id.

    :param old_dict: Mapping whose values are iterables of dicts, each dict
        carrying an ``'id'`` key (presumably the per-node input states
        returned by Graylog -- verify against the caller).
    :return: Dict mapping every item's ``'id'`` to the item itself. When the
        same id occurs more than once, the last occurrence wins.
    :raises KeyError: If an item has no ``'id'`` key.
    """
    # The outer keys are discarded; only the grouped items matter here.
    return {
        entry['id']: entry
        for group in old_dict.values()
        for entry in group
    }
|
||||||
|
|
||||||
|
|
||||||
|
def parse_traffic_ts(ts: str):
    """
    Report whether a traffic timestamp lies within the last 24 hours.

    :param ts: Timestamp string in the form ``%Y-%m-%dT%H:%M:%S.%fZ``. The
        trailing ``Z`` designates UTC.
    :return: True when ``ts`` is less than 24 hours before the current time
        (a timestamp in the future also yields True), otherwise False.
    :raises ValueError: If ``ts`` does not match the expected format.
    """
    # Local import: the module itself only imports ``datetime`` from datetime.
    from datetime import timezone

    # The literal "Z" suffix means the value is UTC. Attach that zone
    # explicitly and compare against the current UTC time; comparing with the
    # naive local ``datetime.now()`` would shift the 24-hour window by the
    # host's UTC offset.
    datetime_obj = datetime.strptime(ts, '%Y-%m-%dT%H:%M:%S.%fZ').replace(tzinfo=timezone.utc)
    time_diff = datetime.now(timezone.utc) - datetime_obj
    return time_diff.total_seconds() < 24 * 60 * 60  # less than 24 hrs ago?
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
parser = argparse.ArgumentParser(description='Check Graylog input health')
|
parser = argparse.ArgumentParser(description='Check Graylog input health')
|
||||||
parser.add_argument('-u', '--url', required=True, help='Graylog API URL')
|
parser.add_argument('-u', '--url', required=True, help='The base Graylog URL')
|
||||||
parser.add_argument('-t', '--token', required=True, help='Graylog API token')
|
parser.add_argument('-t', '--token', required=True, help='Graylog API token')
|
||||||
parser.add_argument('-i', '--input', required=True, help='Input ID to check')
|
parser.add_argument('-i', '--input', required=True, help='Input ID to check')
|
||||||
parser.add_argument('--warn-mem', type=int, default=75, help='Percentage of JVM memory used for warm')
|
parser.add_argument('--warn-mem', type=int, default=75, help='Percentage of JVM memory used for warm')
|
||||||
parser.add_argument('--crit-mem', type=int, default=100, help='Percentage of JVM memory used for critical')
|
parser.add_argument('--crit-mem', type=int, default=100, help='Percentage of JVM memory used for critical')
|
||||||
parser.add_argument('--insecure', action='store_false', help="Don't verify SSL")
|
parser.add_argument('--insecure', action='store_false', help="Don't verify SSL")
|
||||||
parser.add_argument('--crit-notif', action='store_true', help='Return critical when there are notifications')
|
parser.add_argument('--crit-notif', action='store_true', help='Return critical when there are notifications')
|
||||||
|
parser.add_argument('--html', action='store_true', help='Print HTML')
|
||||||
|
parser.add_argument('--cluster-metrics', action='store_true', help='Also gather cluster metrics and check for notifications.')
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
base_url = args.url.strip('/')
|
||||||
|
|
||||||
if not args.insecure:
|
if not args.insecure:
|
||||||
requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning)
|
requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning)
|
||||||
|
|
||||||
|
@ -32,48 +54,52 @@ def main():
|
||||||
'X-Requested-By': 'XMLHttpRequest',
|
'X-Requested-By': 'XMLHttpRequest',
|
||||||
}
|
}
|
||||||
|
|
||||||
try:
|
# Get the basic input metadata
|
||||||
response = requests.get(f'{args.url}/system/inputstates/{args.input}', headers=headers,
|
input_data = transform_inputs(fetch_with_retry(f'{base_url}/api/cluster/inputstates', headers=headers, auth=(args.token, 'token'), verify=args.insecure).json()).get(args.input, {})
|
||||||
auth=(args.token, 'token'), verify=args.insecure)
|
|
||||||
response.raise_for_status()
|
|
||||||
except requests.exceptions.RequestException as e:
|
|
||||||
print(f'CRITICAL - Unable to query Graylog API: {e}')
|
|
||||||
sys.exit(2)
|
|
||||||
|
|
||||||
input_data = response.json()
|
# Get it over with
|
||||||
|
if bool(input_data) and input_data.get('state') == 'RUNNING':
|
||||||
|
input_name = input_data["message_input"]["title"]
|
||||||
|
text_result = f'Graylog input "{input_name}" is running.'
|
||||||
|
else:
|
||||||
|
input_name = args.input
|
||||||
|
if args.html:
|
||||||
|
text_result = f'Graylog input <a href="{base_url}/system/inputs" target="_blank">"{input_name}" is not running!</a>'
|
||||||
|
else:
|
||||||
|
text_result = f'Graylog input "{input_name}" is not running!'
|
||||||
|
print_icinga2_check_status(text_result, nagios.STATE_CRIT)
|
||||||
|
sys.exit(nagios.STATE_CRIT)
|
||||||
|
|
||||||
|
# If the input is running, continue gathering metrics and other health checks
|
||||||
|
input_name_clean = input_name.lower().replace(' ', '_').replace('-', '_')
|
||||||
|
|
||||||
|
# Get metrics for the input
|
||||||
type = input_data['message_input']['type']
|
type = input_data['message_input']['type']
|
||||||
|
metrics_json = {
|
||||||
|
'metrics': [
|
||||||
|
'org.graylog2.throughput.input.1-sec-rate',
|
||||||
|
'org.graylog2.throughput.output.1-sec-rate',
|
||||||
|
f'{type}.{args.input}.incomingMessages',
|
||||||
|
f'{type}.{args.input}.open_connections',
|
||||||
|
f'{type}.{args.input}.total_connections',
|
||||||
|
f'{type}.{args.input}.written_bytes_1sec',
|
||||||
|
f'{type}.{args.input}.written_bytes_total',
|
||||||
|
f'{type}.{args.input}.read_bytes_1sec',
|
||||||
|
f'{type}.{args.input}.read_bytes_total',
|
||||||
|
"org.graylog2.journal.append.1-sec-rate",
|
||||||
|
"org.graylog2.journal.read.1-sec-rate",
|
||||||
|
"org.graylog2.journal.segments",
|
||||||
|
"org.graylog2.journal.entries-uncommitted",
|
||||||
|
"jvm.memory.heap.used",
|
||||||
|
"jvm.memory.heap.committed",
|
||||||
|
"jvm.memory.heap.max"
|
||||||
|
],
|
||||||
|
}
|
||||||
|
r = fetch_with_retry(f'{base_url}/api/cluster/metrics/multiple', method='post', headers=headers, auth=(args.token, 'token'),
|
||||||
|
verify=args.insecure, json=metrics_json).json()
|
||||||
|
input_metrics = r[list(r.keys())[0]]['metrics']
|
||||||
|
|
||||||
try:
|
# Format the metrics for later
|
||||||
metrics_json = {
|
|
||||||
'metrics': [
|
|
||||||
'org.graylog2.throughput.input.1-sec-rate',
|
|
||||||
'org.graylog2.throughput.output.1-sec-rate',
|
|
||||||
f'{type}.{args.input}.incomingMessages',
|
|
||||||
f'{type}.{args.input}.open_connections',
|
|
||||||
f'{type}.{args.input}.total_connections',
|
|
||||||
f'{type}.{args.input}.written_bytes_1sec',
|
|
||||||
f'{type}.{args.input}.written_bytes_total',
|
|
||||||
f'{type}.{args.input}.read_bytes_1sec',
|
|
||||||
f'{type}.{args.input}.read_bytes_total',
|
|
||||||
"org.graylog2.journal.append.1-sec-rate",
|
|
||||||
"org.graylog2.journal.read.1-sec-rate",
|
|
||||||
"org.graylog2.journal.segments",
|
|
||||||
"org.graylog2.journal.entries-uncommitted",
|
|
||||||
"jvm.memory.heap.used",
|
|
||||||
"jvm.memory.heap.committed",
|
|
||||||
"jvm.memory.heap.max"
|
|
||||||
],
|
|
||||||
}
|
|
||||||
response = requests.post(f'{args.url}/cluster/metrics/multiple', headers=headers, auth=(args.token, 'token'),
|
|
||||||
verify=args.insecure,
|
|
||||||
json=metrics_json)
|
|
||||||
response.raise_for_status()
|
|
||||||
input_metrics = response.json()[list(response.json().keys())[0]]['metrics']
|
|
||||||
except requests.exceptions.RequestException as e:
|
|
||||||
print(f'CRITICAL - Unable to query Graylog API: {e}\n{response.text}')
|
|
||||||
sys.exit(2)
|
|
||||||
|
|
||||||
# Format the metrics
|
|
||||||
metrics_data = {}
|
metrics_data = {}
|
||||||
for metric in input_metrics:
|
for metric in input_metrics:
|
||||||
name = metric['full_name'].replace(type, '').replace('org.graylog2.', '').replace(args.input, '').strip(
|
name = metric['full_name'].replace(type, '').replace('org.graylog2.', '').replace(args.input, '').strip(
|
||||||
|
@ -92,95 +118,138 @@ def main():
|
||||||
jvm_mem_usage_warn = int(metrics_data['jvm_memory_heap_max'] / int(100 / args.warn_mem))
|
jvm_mem_usage_warn = int(metrics_data['jvm_memory_heap_max'] / int(100 / args.warn_mem))
|
||||||
jvm_mem_usage_crit = int(metrics_data['jvm_memory_heap_max'] / int(100 / args.crit_mem))
|
jvm_mem_usage_crit = int(metrics_data['jvm_memory_heap_max'] / int(100 / args.crit_mem))
|
||||||
|
|
||||||
|
# Get traffic data for last 24 hrs
|
||||||
|
traffic_last_24_hrs = fetch_with_retry(f'{base_url}/api/system/cluster/traffic?daily=false', headers=headers, auth=(args.token, 'token'), verify=args.insecure).json()
|
||||||
|
input_traffic_avg = sum([v for k, v in traffic_last_24_hrs['input'].items() if parse_traffic_ts(k)])
|
||||||
|
output_traffic_avg = sum([v for k, v in traffic_last_24_hrs['output'].items() if parse_traffic_ts(k)])
|
||||||
|
|
||||||
|
elastisearch_health = fetch_with_retry(f'{base_url}/api/system/indexer/cluster/health', headers=headers, auth=(args.token, 'token'), verify=args.insecure).json()
|
||||||
|
elastisearch_status = elastisearch_health['status'].lower()
|
||||||
|
elastisearch_active_shards = elastisearch_health['shards']['active']
|
||||||
|
|
||||||
|
indexer_failures = fetch_with_retry(f'{base_url}/api/system/indexer/failures?limit=10&offset=0', headers=headers, auth=(args.token, 'token'), verify=args.insecure).json()
|
||||||
|
|
||||||
# Some metric names are changed for better readability
|
# Some metric names are changed for better readability
|
||||||
perfdata = {
|
perfdata = {
|
||||||
'throughput_input_1_sec_rate': {
|
f'{input_name_clean}_throughput_input_1_sec_rate': {
|
||||||
'value': int(metrics_data['throughput_input_1_sec_rate']),
|
'value': int(metrics_data['throughput_input_1_sec_rate']),
|
||||||
'min': 0,
|
'min': 0,
|
||||||
},
|
},
|
||||||
'throughput_output_1_sec_rate': {
|
f'{input_name_clean}_throughput_output_1_sec_rate': {
|
||||||
'value': int(metrics_data['throughput_output_1_sec_rate']),
|
'value': int(metrics_data['throughput_output_1_sec_rate']),
|
||||||
'min': 0,
|
'min': 0,
|
||||||
},
|
},
|
||||||
'incoming_messages_rate_per_sec_one_minute': {
|
f'{input_name_clean}_incoming_messages_rate_per_sec_one_minute': {
|
||||||
'value': metrics_data['incomingMessages_one_minute'],
|
'value': metrics_data['incomingMessages_one_minute'],
|
||||||
'min': 0,
|
'min': 0,
|
||||||
},
|
},
|
||||||
'connections': {
|
f'{input_name_clean}_connections': {
|
||||||
'value': metrics_data['open_connections'],
|
'value': metrics_data['open_connections'],
|
||||||
'min': 0,
|
'min': 0,
|
||||||
},
|
},
|
||||||
'network_out_total_1sec': {
|
f'{input_name_clean}_network_out_total_1sec': {
|
||||||
'value': metrics_data['written_bytes_1sec'],
|
'value': metrics_data['written_bytes_1sec'],
|
||||||
'min': 0,
|
'min': 0,
|
||||||
'unit': 'B',
|
'unit': 'B',
|
||||||
},
|
},
|
||||||
'network_out_total_total': {
|
f'{input_name_clean}_network_out_total_total': {
|
||||||
'value': metrics_data['written_bytes_total'],
|
'value': metrics_data['written_bytes_total'],
|
||||||
'min': 0,
|
'min': 0,
|
||||||
'unit': 'B',
|
'unit': 'B',
|
||||||
},
|
},
|
||||||
'network_in_1sec': {
|
f'{input_name_clean}_network_in_1sec': {
|
||||||
'value': metrics_data['read_bytes_1sec'],
|
'value': metrics_data['read_bytes_1sec'],
|
||||||
'min': 0,
|
'min': 0,
|
||||||
'unit': 'B',
|
'unit': 'B',
|
||||||
},
|
},
|
||||||
'network_in_total': {
|
f'{input_name_clean}_network_in_total': {
|
||||||
'value': metrics_data['read_bytes_total'],
|
'value': metrics_data['read_bytes_total'],
|
||||||
'min': 0,
|
'min': 0,
|
||||||
'unit': 'B',
|
'unit': 'B',
|
||||||
},
|
}
|
||||||
'entries_uncommitted': {
|
|
||||||
'value': metrics_data['journal_entries_uncommitted'],
|
|
||||||
'min': 0,
|
|
||||||
},
|
|
||||||
'jvm_memory_used': {
|
|
||||||
'value': metrics_data['jvm_memory_heap_used'],
|
|
||||||
'min': 0,
|
|
||||||
'warn': jvm_mem_usage_warn,
|
|
||||||
'crit': jvm_mem_usage_crit,
|
|
||||||
'max': int(metrics_data['jvm_memory_heap_max']),
|
|
||||||
'unit': 'B',
|
|
||||||
},
|
|
||||||
}
|
}
|
||||||
|
|
||||||
jvm_mem_usage_state = get_state(int(metrics_data['jvm_memory_heap_used']), jvm_mem_usage_warn, jvm_mem_usage_crit,
|
if args.cluster_metrics:
|
||||||
operator='gt')
|
perfdata.update({
|
||||||
|
'entries_uncommitted': {
|
||||||
|
'value': metrics_data['journal_entries_uncommitted'],
|
||||||
|
'min': 0,
|
||||||
|
},
|
||||||
|
'jvm_memory_used': {
|
||||||
|
'value': metrics_data['jvm_memory_heap_used'],
|
||||||
|
'min': 0,
|
||||||
|
'warn': jvm_mem_usage_warn,
|
||||||
|
'crit': jvm_mem_usage_crit,
|
||||||
|
'max': int(metrics_data['jvm_memory_heap_max']),
|
||||||
|
'unit': 'B',
|
||||||
|
},
|
||||||
|
'from_network_traffic_avg': {
|
||||||
|
'value': input_traffic_avg,
|
||||||
|
'min': 0,
|
||||||
|
'unit': 'B',
|
||||||
|
},
|
||||||
|
'to_elasticsearch_24hrs_avg': {
|
||||||
|
'value': output_traffic_avg,
|
||||||
|
'min': 0,
|
||||||
|
'unit': 'B',
|
||||||
|
},
|
||||||
|
'elastisearch_active_shards': {
|
||||||
|
'value': elastisearch_active_shards,
|
||||||
|
'min': 0
|
||||||
|
},
|
||||||
|
'indexder_failures': {
|
||||||
|
'value': indexer_failures['total'],
|
||||||
|
'min': 0,
|
||||||
|
},
|
||||||
|
})
|
||||||
|
|
||||||
try:
|
# Check for notifications
|
||||||
response = requests.get(f'{args.url}/system/notifications', headers=headers, auth=(args.token, 'token'),
|
if args.cluster_metrics:
|
||||||
verify=args.insecure)
|
notifications = fetch_with_retry(f'{base_url}/api/system/notifications', headers=headers, auth=(args.token, 'token'), verify=args.insecure).json()
|
||||||
response.raise_for_status()
|
if notifications['total'] > 0:
|
||||||
except requests.exceptions.RequestException as e:
|
notif_str = 'Notifications:'
|
||||||
print(f'CRITICAL - Unable to query Graylog API: {e}')
|
for notification in notifications['notifications']:
|
||||||
sys.exit(2)
|
notif_str = notif_str + f'\n{notification["type"]}: {notification["description"]}'
|
||||||
notifications = response.json()
|
else:
|
||||||
if notifications['total'] > 0:
|
notif_str = 'No notifications'
|
||||||
notif_str = 'Notifications:'
|
|
||||||
for notification in notifications['notifications']:
|
if indexer_failures['total'] > 0:
|
||||||
notif_str = notif_str + f'\n{notification["type"]}: {notification["description"]}'
|
indexer_failures_exit = nagios.STATE_CRIT
|
||||||
|
if args.html:
|
||||||
|
text_result += f' <a href="{base_url}/system/indices/failures" target="_blank">There are {indexer_failures["total"]} indexer failures!</a>'
|
||||||
|
else:
|
||||||
|
text_result += f' There are {indexer_failures["total"]} indexer failures!'
|
||||||
|
else:
|
||||||
|
indexer_failures_exit = nagios.STATE_OK
|
||||||
|
|
||||||
|
# https://go2docs.graylog.org/5-0/setting_up_graylog/elasticsearch.htm#ClusterStatusExplained
|
||||||
|
if elastisearch_status == 'yellow':
|
||||||
|
elastisearch_exit_code = nagios.STATE_WARN
|
||||||
|
text_result += ' Elasticsearch is condition YELLOW!'
|
||||||
|
elif elastisearch_status == 'red':
|
||||||
|
elastisearch_exit_code = nagios.STATE_CRIT
|
||||||
|
text_result += ' Elasticsearch is condition RED!'
|
||||||
|
elif elastisearch_status == 'green':
|
||||||
|
elastisearch_exit_code = nagios.STATE_OK
|
||||||
|
else:
|
||||||
|
print_icinga2_check_status(f'unknown Elasticsearch health: {elastisearch_status}', nagios.STATE_UNKNOWN)
|
||||||
|
sys.exit(nagios.STATE_UNKNOWN)
|
||||||
|
|
||||||
|
jvm_mem_usage_state = get_state(int(metrics_data['jvm_memory_heap_used']), jvm_mem_usage_warn, jvm_mem_usage_crit, operator='gt')
|
||||||
|
if jvm_mem_usage_state != nagios.STATE_OK:
|
||||||
|
text_result += f' JVM memory usage is high!'
|
||||||
|
|
||||||
|
exit_code = max(nagios.STATE_OK, jvm_mem_usage_state, elastisearch_exit_code, indexer_failures_exit)
|
||||||
|
|
||||||
|
if notifications['total'] > 0:
|
||||||
|
text_result += f' There are notifications!'
|
||||||
|
if args.crit_notif:
|
||||||
|
exit_code = nagios.STATE_CRIT # force crit
|
||||||
|
|
||||||
|
text_result = text_result + f' JVM memory usage: {int((perfdata["jvm_memory_used"]["value"] / metrics_data["jvm_memory_heap_max"]) * 100)}%, input incoming rate (events/second for last minute): {perfdata[f"{input_name_clean}_incoming_messages_rate_per_sec_one_minute"]["value"]}, input connections: {perfdata[f"{input_name_clean}_connections"]["value"]}, input total network in: {human_readable_size(perfdata[f"{input_name_clean}_network_in_total"]["value"], decimal_places=0)}' + '\n' + notif_str
|
||||||
else:
|
else:
|
||||||
notif_str = 'No notifications'
|
text_result = text_result + f' {input_name_clean}_incoming_messages_rate_per_sec_one_minute (events/second for last minute): {perfdata[f"{input_name_clean}_incoming_messages_rate_per_sec_one_minute"]["value"]}, {input_name_clean}_connections: {perfdata[f"{input_name_clean}_connections"]["value"]}, {input_name_clean}_network_in_total: {human_readable_size(perfdata[f"{input_name_clean}_network_in_total"]["value"], decimal_places=0)}' + '\n'
|
||||||
|
exit_code = nagios.STATE_OK
|
||||||
if input_data['state'] == 'RUNNING':
|
|
||||||
text_result = f'Input "{input_data["message_input"]["title"]}" is running.'
|
|
||||||
else:
|
|
||||||
text_result = f'Input "{input_data["message_input"]["title"]}" is not running!'
|
|
||||||
print(text_result)
|
|
||||||
print(notif_str)
|
|
||||||
sys.exit(checker.nagios.STATE_CRIT)
|
|
||||||
|
|
||||||
if jvm_mem_usage_state != checker.nagios.STATE_OK:
|
|
||||||
text_result += f' JVM memory usage is high!'
|
|
||||||
|
|
||||||
exit_code = max(checker.nagios.STATE_OK, jvm_mem_usage_state)
|
|
||||||
|
|
||||||
if notifications['total'] > 0:
|
|
||||||
text_result += f' There are notifications!'
|
|
||||||
if args.crit_notif:
|
|
||||||
exit_code = checker.nagios.STATE_CRIT
|
|
||||||
|
|
||||||
text_result = text_result + f' JVM memory usage: {int((perfdata["jvm_memory_used"]["value"] / metrics_data["jvm_memory_heap_max"]) * 100)}%, incoming rate (events/second for last minute): {perfdata["incoming_messages_rate_per_sec_one_minute"]["value"]}, connections: {perfdata["connections"]["value"]}, total network in: {human_readable_size(perfdata["network_in_total"]["value"], decimal_places=0)}' + '\n' + notif_str
|
|
||||||
|
|
||||||
print_icinga2_check_status(text_result, exit_code, perfdata)
|
print_icinga2_check_status(text_result, exit_code, perfdata)
|
||||||
sys.exit(exit_code)
|
sys.exit(exit_code)
|
||||||
|
@ -192,4 +261,4 @@ if __name__ == "__main__":
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f'UNKNOWN: exception "{e}"')
|
print(f'UNKNOWN: exception "{e}"')
|
||||||
print(traceback.format_exc())
|
print(traceback.format_exc())
|
||||||
sys.exit(checker.nagios.STATE_UNKNOWN)
|
sys.exit(nagios.STATE_UNKNOWN)
|
||||||
|
|
|
@ -5,7 +5,7 @@ import sys
|
||||||
import traceback
|
import traceback
|
||||||
|
|
||||||
from checker import nagios, print_icinga2_check_status
|
from checker import nagios, print_icinga2_check_status
|
||||||
from checker.http import get_with_retry
|
from checker.http import fetch_with_retry
|
||||||
from checker.linuxfabric.base import get_state
|
from checker.linuxfabric.base import get_state
|
||||||
|
|
||||||
|
|
||||||
|
@ -42,7 +42,7 @@ def main():
|
||||||
parser.add_argument("--warning-waiting", type=int, default=None, help="Warning threshold for waiting connections. Default: 0 (disabled)")
|
parser.add_argument("--warning-waiting", type=int, default=None, help="Warning threshold for waiting connections. Default: 0 (disabled)")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
status = get_with_retry(args.url).text
|
status = fetch_with_retry(args.url).text
|
||||||
data = parse_nginx_status(status)
|
data = parse_nginx_status(status)
|
||||||
|
|
||||||
perfdata_dict = {
|
perfdata_dict = {
|
||||||
|
|
|
@ -10,7 +10,7 @@ from urllib3.exceptions import InsecureRequestWarning
|
||||||
|
|
||||||
import checker.nagios as nagios
|
import checker.nagios as nagios
|
||||||
from checker import print_icinga2_check_status
|
from checker import print_icinga2_check_status
|
||||||
from checker.http import get_with_retry
|
from checker.http import fetch_with_retry
|
||||||
from checker.linuxfabric.base import get_state
|
from checker.linuxfabric.base import get_state
|
||||||
from checker.markdown import list_to_markdown_table
|
from checker.markdown import list_to_markdown_table
|
||||||
from checker.units import filesize
|
from checker.units import filesize
|
||||||
|
@ -26,9 +26,9 @@ def is_internet_traffic(ip):
|
||||||
|
|
||||||
|
|
||||||
def get_traffic_top(args, interface):
|
def get_traffic_top(args, interface):
|
||||||
response = get_with_retry(f'https://{args.opnsense}/api/diagnostics/traffic/top/{interface}',
|
response = fetch_with_retry(f'https://{args.opnsense}/api/diagnostics/traffic/top/{interface}',
|
||||||
headers={'Accept': 'application/json'}, auth=(args.key, args.secret), verify=False,
|
headers={'Accept': 'application/json'}, auth=(args.key, args.secret), verify=False,
|
||||||
timeout=args.timeout)
|
timeout=args.timeout)
|
||||||
if response.status_code != 200:
|
if response.status_code != 200:
|
||||||
print(f'UNKNOWN: unable to query OPNsense API for {interface}: {response.status_code}\n{response.text}')
|
print(f'UNKNOWN: unable to query OPNsense API for {interface}: {response.status_code}\n{response.text}')
|
||||||
sys.exit(nagios.UNKNOWN)
|
sys.exit(nagios.UNKNOWN)
|
||||||
|
@ -81,9 +81,9 @@ def main():
|
||||||
traffic_data = []
|
traffic_data = []
|
||||||
for _ in range(args.duration):
|
for _ in range(args.duration):
|
||||||
# start_time = time.time()
|
# start_time = time.time()
|
||||||
response = get_with_retry(f'https://{args.opnsense}/api/diagnostics/traffic/top/{interface}',
|
response = fetch_with_retry(f'https://{args.opnsense}/api/diagnostics/traffic/top/{interface}',
|
||||||
headers={'Accept': 'application/json'}, auth=(args.key, args.secret), verify=False,
|
headers={'Accept': 'application/json'}, auth=(args.key, args.secret), verify=False,
|
||||||
timeout=args.timeout)
|
timeout=args.timeout)
|
||||||
# end_time = time.time()
|
# end_time = time.time()
|
||||||
# api_request_time = end_time - start_time
|
# api_request_time = end_time - start_time
|
||||||
|
|
||||||
|
|
|
@ -8,7 +8,7 @@ from typing import List
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
from checker import nagios
|
from checker import nagios
|
||||||
from checker.http import get_with_retry
|
from checker.http import fetch_with_retry
|
||||||
|
|
||||||
|
|
||||||
def get_disk_wwn_ids(ignore_non_smart: bool = False) -> List[str] or bool:
|
def get_disk_wwn_ids(ignore_non_smart: bool = False) -> List[str] or bool:
|
||||||
|
@ -43,7 +43,7 @@ def get_disk_wwn_ids(ignore_non_smart: bool = False) -> List[str] or bool:
|
||||||
|
|
||||||
def get_smart_health(wwn_id: str, scrutiny_endpoint: str) -> dict:
|
def get_smart_health(wwn_id: str, scrutiny_endpoint: str) -> dict:
|
||||||
url = f"{scrutiny_endpoint}/api/device/{wwn_id}/details"
|
url = f"{scrutiny_endpoint}/api/device/{wwn_id}/details"
|
||||||
response = get_with_retry(url)
|
response = fetch_with_retry(url)
|
||||||
if response.status_code == 200:
|
if response.status_code == 200:
|
||||||
return response.json()
|
return response.json()
|
||||||
elif response.status_code == 404:
|
elif response.status_code == 404:
|
||||||
|
|
|
@ -7,19 +7,26 @@ from . import nagios
|
||||||
from .print import print_icinga2_check_status
|
from .print import print_icinga2_check_status
|
||||||
|
|
||||||
|
|
||||||
def get_with_retry(url, retries=3, delay=1, **kwargs):
|
def fetch_with_retry(url, method: str = 'get', retries=3, delay=1, **kwargs):
|
||||||
"""
|
"""
|
||||||
Wrapper function for requests.get() with a retry mechanism.
|
Wrapper function for requests.get() with a retry mechanism.
|
||||||
|
|
||||||
|
:param method: HTTP request type: get, post
|
||||||
:param url: URL to send the GET request
|
:param url: URL to send the GET request
|
||||||
:param retries: Number of retries in case of HTTP failures (default: 3)
|
:param retries: Number of retries in case of HTTP failures (default: 3)
|
||||||
:param delay: Time delay between retries in seconds (default: 1)
|
:param delay: Time delay between retries in seconds (default: 1)
|
||||||
:param kwargs: Additional keyword arguments for requests.get()
|
:param kwargs: Additional keyword arguments for requests.get()
|
||||||
:return: Response object
|
:return: Response object
|
||||||
"""
|
"""
|
||||||
|
|
||||||
for i in range(retries):
|
for i in range(retries):
|
||||||
try:
|
try:
|
||||||
response = requests.get(url, **kwargs)
|
if method == 'get':
|
||||||
|
response = requests.get(url, **kwargs)
|
||||||
|
elif method == 'post':
|
||||||
|
response = requests.post(url, **kwargs)
|
||||||
|
else:
|
||||||
|
raise ValueError('Invalid method! Must be get or post.')
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
return response
|
return response
|
||||||
except requests.exceptions.RequestException as e:
|
except requests.exceptions.RequestException as e:
|
||||||
|
|
|
@ -54,6 +54,6 @@ def human_readable_size(size: Union[int, float], bits=False, decimal_places: int
|
||||||
if decimal_places == 0:
|
if decimal_places == 0:
|
||||||
size = int(size)
|
size = int(size)
|
||||||
else:
|
else:
|
||||||
round(size, decimal_places)
|
size = round(size, decimal_places)
|
||||||
|
|
||||||
return f'{size} {units[bits][base][exp]}'
|
return f'{size} {units[bits][base][exp]}'
|
||||||
|
|
Loading…
Reference in New Issue