add 'method' argument to get_with_retry()

check_graylog: add more healthchecks and perfdata
Cyberes 2023-06-27 17:56:40 -06:00
parent d431bd5d7a
commit 9f00479dad
6 changed files with 188 additions and 112 deletions


@@ -3,27 +3,49 @@
 import argparse
 import sys
 import traceback
+from datetime import datetime

 import requests
 from urllib3.exceptions import InsecureRequestWarning

-import checker.nagios
+from checker import nagios
 from checker import print_icinga2_check_status
+from checker.http import fetch_with_retry
 from checker.linuxfabric.base import get_state
 from checker.units import human_readable_size


+def transform_inputs(old_dict):
+    new_dict = {}
+    for key in old_dict:
+        for item in old_dict[key]:
+            new_key = item['id']
+            new_dict[new_key] = item
+    return new_dict
+
+
+def parse_traffic_ts(ts: str):
+    datetime_obj = datetime.strptime(ts, '%Y-%m-%dT%H:%M:%S.%fZ')
+    current_time = datetime.now()
+    time_diff = current_time - datetime_obj
+    return time_diff.total_seconds() < 24 * 60 * 60  # less than 24 hrs ago?
+
+
 def main():
     parser = argparse.ArgumentParser(description='Check Graylog input health')
-    parser.add_argument('-u', '--url', required=True, help='Graylog API URL')
+    parser.add_argument('-u', '--url', required=True, help='The base Graylog URL')
     parser.add_argument('-t', '--token', required=True, help='Graylog API token')
     parser.add_argument('-i', '--input', required=True, help='Input ID to check')
     parser.add_argument('--warn-mem', type=int, default=75, help='Percentage of JVM memory used for warning')
     parser.add_argument('--crit-mem', type=int, default=100, help='Percentage of JVM memory used for critical')
     parser.add_argument('--insecure', action='store_false', help="Don't verify SSL")
     parser.add_argument('--crit-notif', action='store_true', help='Return critical when there are notifications')
+    parser.add_argument('--html', action='store_true', help='Print HTML')
+    parser.add_argument('--cluster-metrics', action='store_true', help='Also gather cluster metrics and check for notifications.')
     args = parser.parse_args()

+    base_url = args.url.strip('/')
+
     if not args.insecure:
         requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning)
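
For orientation, a minimal sketch of what the new transform_inputs() helper does: it flattens the per-node /api/cluster/inputstates response into a dict keyed by input ID, so a single input can be looked up directly. The node ID, input ID, and payload shape below are made up for illustration and are not real API output.

    cluster_inputstates = {
        'node-aaaa-1111': [  # hypothetical node ID
            {'id': 'input-bbbb-2222', 'state': 'RUNNING',
             'message_input': {'title': 'Syslog TCP'}},
        ],
    }

    flattened = transform_inputs(cluster_inputstates)
    print(flattened['input-bbbb-2222']['state'])  # RUNNING
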
@@ -32,48 +54,52 @@ def main():
         'X-Requested-By': 'XMLHttpRequest',
     }

-    try:
-        response = requests.get(f'{args.url}/system/inputstates/{args.input}', headers=headers,
-                                auth=(args.token, 'token'), verify=args.insecure)
-        response.raise_for_status()
-    except requests.exceptions.RequestException as e:
-        print(f'CRITICAL - Unable to query Graylog API: {e}')
-        sys.exit(2)
-    input_data = response.json()
+    # Get the basic input metadata
+    input_data = transform_inputs(fetch_with_retry(f'{base_url}/api/cluster/inputstates', headers=headers, auth=(args.token, 'token'), verify=args.insecure).json()).get(args.input, {})
+
+    # Get it over with
+    if bool(input_data) and input_data.get('state') == 'RUNNING':
+        input_name = input_data["message_input"]["title"]
+        text_result = f'Graylog input "{input_name}" is running.'
+    else:
+        input_name = args.input
+        if args.html:
+            text_result = f'Graylog input <a href="{base_url}/system/inputs" target="_blank">"{input_name}" is not running!</a>'
+        else:
+            text_result = f'Graylog input "{input_name}" is not running!'
+        print_icinga2_check_status(text_result, nagios.STATE_CRIT)
+        sys.exit(nagios.STATE_CRIT)
+
+    # If the input is running, continue gathering metrics and other health checks
+    input_name_clean = input_name.lower().replace(' ', '_').replace('-', '_')

+    # Get metrics for the input
     type = input_data['message_input']['type']
+    metrics_json = {
+        'metrics': [
+            'org.graylog2.throughput.input.1-sec-rate',
+            'org.graylog2.throughput.output.1-sec-rate',
+            f'{type}.{args.input}.incomingMessages',
+            f'{type}.{args.input}.open_connections',
+            f'{type}.{args.input}.total_connections',
+            f'{type}.{args.input}.written_bytes_1sec',
+            f'{type}.{args.input}.written_bytes_total',
+            f'{type}.{args.input}.read_bytes_1sec',
+            f'{type}.{args.input}.read_bytes_total',
+            "org.graylog2.journal.append.1-sec-rate",
+            "org.graylog2.journal.read.1-sec-rate",
+            "org.graylog2.journal.segments",
+            "org.graylog2.journal.entries-uncommitted",
+            "jvm.memory.heap.used",
+            "jvm.memory.heap.committed",
+            "jvm.memory.heap.max"
+        ],
+    }
+    r = fetch_with_retry(f'{base_url}/api/cluster/metrics/multiple', method='post', headers=headers, auth=(args.token, 'token'),
+                         verify=args.insecure, json=metrics_json).json()
+    input_metrics = r[list(r.keys())[0]]['metrics']

-    try:
-        metrics_json = {
-            'metrics': [
-                'org.graylog2.throughput.input.1-sec-rate',
-                'org.graylog2.throughput.output.1-sec-rate',
-                f'{type}.{args.input}.incomingMessages',
-                f'{type}.{args.input}.open_connections',
-                f'{type}.{args.input}.total_connections',
-                f'{type}.{args.input}.written_bytes_1sec',
-                f'{type}.{args.input}.written_bytes_total',
-                f'{type}.{args.input}.read_bytes_1sec',
-                f'{type}.{args.input}.read_bytes_total',
-                "org.graylog2.journal.append.1-sec-rate",
-                "org.graylog2.journal.read.1-sec-rate",
-                "org.graylog2.journal.segments",
-                "org.graylog2.journal.entries-uncommitted",
-                "jvm.memory.heap.used",
-                "jvm.memory.heap.committed",
-                "jvm.memory.heap.max"
-            ],
-        }
-        response = requests.post(f'{args.url}/cluster/metrics/multiple', headers=headers, auth=(args.token, 'token'),
-                                 verify=args.insecure,
-                                 json=metrics_json)
-        response.raise_for_status()
-        input_metrics = response.json()[list(response.json().keys())[0]]['metrics']
-    except requests.exceptions.RequestException as e:
-        print(f'CRITICAL - Unable to query Graylog API: {e}\n{response.text}')
-        sys.exit(2)
-
-    # Format the metrics
+    # Format the metrics for later
     metrics_data = {}
     for metric in input_metrics:
         name = metric['full_name'].replace(type, '').replace('org.graylog2.', '').replace(args.input, '').strip(
@@ -92,95 +118,138 @@ def main():
     jvm_mem_usage_warn = int(metrics_data['jvm_memory_heap_max'] / int(100 / args.warn_mem))
     jvm_mem_usage_crit = int(metrics_data['jvm_memory_heap_max'] / int(100 / args.crit_mem))

+    # Get traffic data for last 24 hrs
+    traffic_last_24_hrs = fetch_with_retry(f'{base_url}/api/system/cluster/traffic?daily=false', headers=headers, auth=(args.token, 'token'), verify=args.insecure).json()
+    input_traffic_avg = sum([v for k, v in traffic_last_24_hrs['input'].items() if parse_traffic_ts(k)])
+    output_traffic_avg = sum([v for k, v in traffic_last_24_hrs['output'].items() if parse_traffic_ts(k)])
+
+    elastisearch_health = fetch_with_retry(f'{base_url}/api/system/indexer/cluster/health', headers=headers, auth=(args.token, 'token'), verify=args.insecure).json()
+    elastisearch_status = elastisearch_health['status'].lower()
+    elastisearch_active_shards = elastisearch_health['shards']['active']
+
+    indexer_failures = fetch_with_retry(f'{base_url}/api/system/indexer/failures?limit=10&offset=0', headers=headers, auth=(args.token, 'token'), verify=args.insecure).json()
+
     # Some metric names are changed for better readability
     perfdata = {
-        'throughput_input_1_sec_rate': {
+        f'{input_name_clean}_throughput_input_1_sec_rate': {
             'value': int(metrics_data['throughput_input_1_sec_rate']),
             'min': 0,
         },
-        'throughput_output_1_sec_rate': {
+        f'{input_name_clean}_throughput_output_1_sec_rate': {
             'value': int(metrics_data['throughput_output_1_sec_rate']),
             'min': 0,
         },
-        'incoming_messages_rate_per_sec_one_minute': {
+        f'{input_name_clean}_incoming_messages_rate_per_sec_one_minute': {
             'value': metrics_data['incomingMessages_one_minute'],
             'min': 0,
         },
-        'connections': {
+        f'{input_name_clean}_connections': {
             'value': metrics_data['open_connections'],
             'min': 0,
         },
-        'network_out_total_1sec': {
+        f'{input_name_clean}_network_out_total_1sec': {
             'value': metrics_data['written_bytes_1sec'],
             'min': 0,
             'unit': 'B',
         },
-        'network_out_total_total': {
+        f'{input_name_clean}_network_out_total_total': {
             'value': metrics_data['written_bytes_total'],
             'min': 0,
             'unit': 'B',
         },
-        'network_in_1sec': {
+        f'{input_name_clean}_network_in_1sec': {
             'value': metrics_data['read_bytes_1sec'],
             'min': 0,
             'unit': 'B',
         },
-        'network_in_total': {
+        f'{input_name_clean}_network_in_total': {
             'value': metrics_data['read_bytes_total'],
             'min': 0,
             'unit': 'B',
-        },
-        'entries_uncommitted': {
-            'value': metrics_data['journal_entries_uncommitted'],
-            'min': 0,
-        },
-        'jvm_memory_used': {
-            'value': metrics_data['jvm_memory_heap_used'],
-            'min': 0,
-            'warn': jvm_mem_usage_warn,
-            'crit': jvm_mem_usage_crit,
-            'max': int(metrics_data['jvm_memory_heap_max']),
-            'unit': 'B',
-        },
+        }
     }

-    jvm_mem_usage_state = get_state(int(metrics_data['jvm_memory_heap_used']), jvm_mem_usage_warn, jvm_mem_usage_crit,
-                                    operator='gt')
+    if args.cluster_metrics:
+        perfdata.update({
+            'entries_uncommitted': {
+                'value': metrics_data['journal_entries_uncommitted'],
+                'min': 0,
+            },
+            'jvm_memory_used': {
+                'value': metrics_data['jvm_memory_heap_used'],
+                'min': 0,
+                'warn': jvm_mem_usage_warn,
+                'crit': jvm_mem_usage_crit,
+                'max': int(metrics_data['jvm_memory_heap_max']),
+                'unit': 'B',
+            },
+            'from_network_traffic_avg': {
+                'value': input_traffic_avg,
+                'min': 0,
+                'unit': 'B',
+            },
+            'to_elasticsearch_24hrs_avg': {
+                'value': output_traffic_avg,
+                'min': 0,
+                'unit': 'B',
+            },
+            'elastisearch_active_shards': {
+                'value': elastisearch_active_shards,
+                'min': 0
+            },
+            'indexder_failures': {
+                'value': indexer_failures['total'],
+                'min': 0,
+            },
+        })

-    try:
-        response = requests.get(f'{args.url}/system/notifications', headers=headers, auth=(args.token, 'token'),
-                                verify=args.insecure)
-        response.raise_for_status()
-    except requests.exceptions.RequestException as e:
-        print(f'CRITICAL - Unable to query Graylog API: {e}')
-        sys.exit(2)
-    notifications = response.json()
-    if notifications['total'] > 0:
-        notif_str = 'Notifications:'
-        for notification in notifications['notifications']:
-            notif_str = notif_str + f'\n{notification["type"]}: {notification["description"]}'
-    else:
-        notif_str = 'No notifications'
-
-    if input_data['state'] == 'RUNNING':
-        text_result = f'Input "{input_data["message_input"]["title"]}" is running.'
-    else:
-        text_result = f'Input "{input_data["message_input"]["title"]}" is not running!'
-        print(text_result)
-        print(notif_str)
-        sys.exit(checker.nagios.STATE_CRIT)
-
-    if jvm_mem_usage_state != checker.nagios.STATE_OK:
-        text_result += f' JVM memory usage is high!'
-
-    exit_code = max(checker.nagios.STATE_OK, jvm_mem_usage_state)
-
-    if notifications['total'] > 0:
-        text_result += f' There are notifications!'
-        if args.crit_notif:
-            exit_code = checker.nagios.STATE_CRIT
-
-    text_result = text_result + f' JVM memory usage: {int((perfdata["jvm_memory_used"]["value"] / metrics_data["jvm_memory_heap_max"]) * 100)}%, incoming rate (events/second for last minute): {perfdata["incoming_messages_rate_per_sec_one_minute"]["value"]}, connections: {perfdata["connections"]["value"]}, total network in: {human_readable_size(perfdata["network_in_total"]["value"], decimal_places=0)}' + '\n' + notif_str
+    # Check for notifications
+    if args.cluster_metrics:
+        notifications = fetch_with_retry(f'{base_url}/api/system/notifications', headers=headers, auth=(args.token, 'token'), verify=args.insecure).json()
+        if notifications['total'] > 0:
+            notif_str = 'Notifications:'
+            for notification in notifications['notifications']:
+                notif_str = notif_str + f'\n{notification["type"]}: {notification["description"]}'
+        else:
+            notif_str = 'No notifications'
+
+        if indexer_failures['total'] > 0:
+            indexer_failures_exit = nagios.STATE_CRIT
+            if args.html:
+                text_result += f' <a href="{base_url}/system/indices/failures" target="_blank">There are {indexer_failures["total"]} indexer failures!</a>'
+            else:
+                text_result += f' There are {indexer_failures["total"]} indexer failures!'
+        else:
+            indexer_failures_exit = nagios.STATE_OK
+
+        # https://go2docs.graylog.org/5-0/setting_up_graylog/elasticsearch.htm#ClusterStatusExplained
+        if elastisearch_status == 'yellow':
+            elastisearch_exit_code = nagios.STATE_WARN
+            text_result += ' Elasticsearch is condition YELLOW!'
+        elif elastisearch_status == 'red':
+            elastisearch_exit_code = nagios.STATE_CRIT
+            text_result += ' Elasticsearch is condition RED!'
+        elif elastisearch_status == 'green':
+            elastisearch_exit_code = nagios.STATE_OK
+        else:
+            print_icinga2_check_status(f'unknown Elasticsearch health: {elastisearch_status}', nagios.STATE_UNKNOWN)
+            sys.exit(nagios.STATE_UNKNOWN)
+
+        jvm_mem_usage_state = get_state(int(metrics_data['jvm_memory_heap_used']), jvm_mem_usage_warn, jvm_mem_usage_crit, operator='gt')
+        if jvm_mem_usage_state != nagios.STATE_OK:
+            text_result += f' JVM memory usage is high!'
+
+        exit_code = max(nagios.STATE_OK, jvm_mem_usage_state, elastisearch_exit_code, indexer_failures_exit)
+
+        if notifications['total'] > 0:
+            text_result += f' There are notifications!'
+            if args.crit_notif:
+                exit_code = nagios.STATE_CRIT  # force crit
+
+        text_result = text_result + f' JVM memory usage: {int((perfdata["jvm_memory_used"]["value"] / metrics_data["jvm_memory_heap_max"]) * 100)}%, input incoming rate (events/second for last minute): {perfdata[f"{input_name_clean}_incoming_messages_rate_per_sec_one_minute"]["value"]}, input connections: {perfdata[f"{input_name_clean}_connections"]["value"]}, input total network in: {human_readable_size(perfdata[f"{input_name_clean}_network_in_total"]["value"], decimal_places=0)}' + '\n' + notif_str
+    else:
+        text_result = text_result + f' {input_name_clean}_incoming_messages_rate_per_sec_one_minute (events/second for last minute): {perfdata[f"{input_name_clean}_incoming_messages_rate_per_sec_one_minute"]["value"]}, {input_name_clean}_connections: {perfdata[f"{input_name_clean}_connections"]["value"]}, {input_name_clean}_network_in_total: {human_readable_size(perfdata[f"{input_name_clean}_network_in_total"]["value"], decimal_places=0)}' + '\n'
+        exit_code = nagios.STATE_OK

     print_icinga2_check_status(text_result, exit_code, perfdata)
     sys.exit(exit_code)

@@ -192,4 +261,4 @@ if __name__ == "__main__":
     except Exception as e:
         print(f'UNKNOWN: exception "{e}"')
         print(traceback.format_exc())
-        sys.exit(checker.nagios.STATE_UNKNOWN)
+        sys.exit(nagios.STATE_UNKNOWN)
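
A side note on the exit-code handling above: the overall plugin state is the numeric maximum of the individual states, which works because Nagios/Icinga plugin states are ordered by severity. A standalone sketch with literal values standing in for the checker.nagios constants (assuming the usual OK=0, WARNING=1, CRITICAL=2 convention):

    STATE_OK, STATE_WARN, STATE_CRIT = 0, 1, 2  # standard Nagios plugin exit codes

    # e.g. JVM memory OK, Elasticsearch YELLOW, indexer failures present:
    exit_code = max(STATE_OK, STATE_WARN, STATE_CRIT)
    print(exit_code)  # 2 -> the check reports CRITICAL overall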


@@ -5,7 +5,7 @@ import sys
 import traceback

 from checker import nagios, print_icinga2_check_status
-from checker.http import get_with_retry
+from checker.http import fetch_with_retry
 from checker.linuxfabric.base import get_state
@@ -42,7 +42,7 @@ def main():
     parser.add_argument("--warning-waiting", type=int, default=None, help="Warning threshold for waiting connections. Default: 0 (disabled)")
     args = parser.parse_args()

-    status = get_with_retry(args.url).text
+    status = fetch_with_retry(args.url).text
     data = parse_nginx_status(status)

     perfdata_dict = {


@@ -10,7 +10,7 @@ from urllib3.exceptions import InsecureRequestWarning

 import checker.nagios as nagios
 from checker import print_icinga2_check_status
-from checker.http import get_with_retry
+from checker.http import fetch_with_retry
 from checker.linuxfabric.base import get_state
 from checker.markdown import list_to_markdown_table
 from checker.units import filesize
@@ -26,9 +26,9 @@ def is_internet_traffic(ip):

 def get_traffic_top(args, interface):
-    response = get_with_retry(f'https://{args.opnsense}/api/diagnostics/traffic/top/{interface}',
-                              headers={'Accept': 'application/json'}, auth=(args.key, args.secret), verify=False,
-                              timeout=args.timeout)
+    response = fetch_with_retry(f'https://{args.opnsense}/api/diagnostics/traffic/top/{interface}',
+                                headers={'Accept': 'application/json'}, auth=(args.key, args.secret), verify=False,
+                                timeout=args.timeout)
     if response.status_code != 200:
         print(f'UNKNOWN: unable to query OPNsense API for {interface}: {response.status_code}\n{response.text}')
         sys.exit(nagios.UNKNOWN)
@@ -81,9 +81,9 @@ def main():
     traffic_data = []
     for _ in range(args.duration):
         # start_time = time.time()
-        response = get_with_retry(f'https://{args.opnsense}/api/diagnostics/traffic/top/{interface}',
-                                  headers={'Accept': 'application/json'}, auth=(args.key, args.secret), verify=False,
-                                  timeout=args.timeout)
+        response = fetch_with_retry(f'https://{args.opnsense}/api/diagnostics/traffic/top/{interface}',
+                                    headers={'Accept': 'application/json'}, auth=(args.key, args.secret), verify=False,
+                                    timeout=args.timeout)
         # end_time = time.time()
         # api_request_time = end_time - start_time


@@ -8,7 +8,7 @@ from typing import List
 import requests

 from checker import nagios
-from checker.http import get_with_retry
+from checker.http import fetch_with_retry


 def get_disk_wwn_ids(ignore_non_smart: bool = False) -> List[str] or bool:
@@ -43,7 +43,7 @@ def get_disk_wwn_ids(ignore_non_smart: bool = False) -> List[str] or bool:
 def get_smart_health(wwn_id: str, scrutiny_endpoint: str) -> dict:
     url = f"{scrutiny_endpoint}/api/device/{wwn_id}/details"
-    response = get_with_retry(url)
+    response = fetch_with_retry(url)
     if response.status_code == 200:
         return response.json()
     elif response.status_code == 404:


@@ -7,19 +7,26 @@ from . import nagios
 from .print import print_icinga2_check_status


-def get_with_retry(url, retries=3, delay=1, **kwargs):
+def fetch_with_retry(url, method: str = 'get', retries=3, delay=1, **kwargs):
     """
     Wrapper function for requests.get() with a retry mechanism.

+    :param method: HTTP request type: get, post
     :param url: URL to send the GET request
     :param retries: Number of retries in case of HTTP failures (default: 3)
     :param delay: Time delay between retries in seconds (default: 1)
     :param kwargs: Additional keyword arguments for requests.get()
     :return: Response object
     """
     for i in range(retries):
         try:
-            response = requests.get(url, **kwargs)
+            if method == 'get':
+                response = requests.get(url, **kwargs)
+            elif method == 'post':
+                response = requests.post(url, **kwargs)
+            else:
+                raise ValueError('Invalid method! Must be get or post.')
             response.raise_for_status()
             return response
         except requests.exceptions.RequestException as e:
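
With the new method argument, the same retry wrapper now covers the POST to the Graylog metrics endpoint shown earlier. A minimal usage sketch; the URL, token, and metric list are placeholders, not real values:

    from checker.http import fetch_with_retry

    # POST a JSON body through the retry wrapper (placeholder URL, token and payload).
    r = fetch_with_retry('https://graylog.example.com/api/cluster/metrics/multiple',
                         method='post',
                         headers={'Accept': 'application/json', 'X-Requested-By': 'XMLHttpRequest'},
                         auth=('api-token-here', 'token'),
                         json={'metrics': ['jvm.memory.heap.used']})
    print(r.status_code)

    # Plain GETs keep working through the default method:
    health = fetch_with_retry('https://graylog.example.com/api/system/indexer/cluster/health',
                              auth=('api-token-here', 'token')).json()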


@@ -54,6 +54,6 @@ def human_readable_size(size: Union[int, float], bits=False, decimal_places: int
     if decimal_places == 0:
         size = int(size)
     else:
-        round(size, decimal_places)
+        size = round(size, decimal_places)
     return f'{size} {units[bits][base][exp]}'
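
This units.py change fixes a silent no-op: round() returns a new value rather than rounding in place, so the old code discarded the rounded result and returned the full-precision size. A quick standalone illustration:

    size = 1000 / 1024       # 0.9765625

    # Old behaviour: round() returns a new value, which was never assigned.
    round(size, 2)           # -> 0.98, result discarded
    print(size)              # 0.9765625 (unrounded)

    # New behaviour: keep the rounded result.
    size = round(size, 2)
    print(size)              # 0.98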