catch errors, message formatting

This commit is contained in:
Cyberes 2023-04-21 23:54:16 -06:00
parent 74a4849cd8
commit 62b7cd6594
3 changed files with 216 additions and 185 deletions

View File

@ -20,95 +20,108 @@ parser.add_argument('--warn', type=float, help='Manually set warn level.')
parser.add_argument('--crit', type=float, help='Manually set critical level.') parser.add_argument('--crit', type=float, help='Manually set critical level.')
args = parser.parse_args() args = parser.parse_args()
# TODO: add warn support # TODO: add warn support
if args.type == 'gc-time': def main():
# in seconds if args.type == 'gc-time':
python_gc_time_sum_MAX = 0.002 if not args.crit else args.crit # in seconds
try: python_gc_time_sum_MAX = 0.002 if not args.crit else args.crit
python_gc_time_sum = np.round(np.average(get_avg_python_gc_time(args.grafana_api_key, args.interval, args.range, args.grafana_server)), 5) try:
if python_gc_time_sum >= python_gc_time_sum_MAX: python_gc_time_sum = np.round(np.average(get_avg_python_gc_time(args.grafana_api_key, args.interval, args.range, args.grafana_server)), 5)
print(f"CRITICAL: average GC time per collection is {python_gc_time_sum} sec. |'garbage-collection'={python_gc_time_sum}s;;;") if python_gc_time_sum >= python_gc_time_sum_MAX:
sys.exit(nagios.CRITICAL) print(f"CRITICAL: average GC time per collection is {python_gc_time_sum} sec. |'garbage-collection'={python_gc_time_sum}s;;;")
else: sys.exit(nagios.CRITICAL)
print(f"OK: average GC time per collection is {python_gc_time_sum} sec. |'garbage-collection'={python_gc_time_sum}s;;;") else:
sys.exit(nagios.OK) print(f"OK: average GC time per collection is {python_gc_time_sum} sec. |'garbage-collection'={python_gc_time_sum}s;;;")
except Exception as e: sys.exit(nagios.OK)
print(f'UNKNOWN: failed to check avg. GC time "{e}"') except Exception as e:
sys.exit(nagios.UNKNOWN) print(f'UNKNOWN: failed to check avg. GC time "{e}"')
elif args.type == 'response-time': sys.exit(nagios.UNKNOWN)
response_time_MAX = 1 if not args.crit else args.crit elif args.type == 'response-time':
timeout = 10 response_time_MAX = 1 if not args.crit else args.crit
try: timeout = 10
response_times = [] try:
for i in range(10): response_times = []
start = time.perf_counter() for i in range(10):
try: start = time.perf_counter()
response = requests.post(args.synapse_server, timeout=timeout, verify=False) try:
except Exception as e: response = requests.post(args.synapse_server, timeout=timeout, verify=False)
print(f'UNKNOWN: failed to ping endpoint "{e}"') except Exception as e:
sys.exit(nagios.UNKNOWN) print(f'UNKNOWN: failed to ping endpoint "{e}"')
request_time = time.perf_counter() - start sys.exit(nagios.UNKNOWN)
response_times.append(np.round(request_time, 2)) request_time = time.perf_counter() - start
time.sleep(1) response_times.append(np.round(request_time, 2))
response_time = np.round(np.average(response_times), 2) time.sleep(1)
if response_time > response_time_MAX: response_time = np.round(np.average(response_times), 2)
print(f"CRITICAL: response time is {response_time} sec. |'response-time'={response_time}s;;;") if response_time > response_time_MAX:
sys.exit(nagios.CRITICAL) print(f"CRITICAL: response time is {response_time} sec. |'response-time'={response_time}s;;;")
else: sys.exit(nagios.CRITICAL)
print(f"OK: response time is {response_time} sec. |'response-time'={response_time}s;;;") else:
sys.exit(nagios.OK) print(f"OK: response time is {response_time} sec. |'response-time'={response_time}s;;;")
except Exception as e: sys.exit(nagios.OK)
print(f'UNKNOWN: failed to check response time "{e}"') except Exception as e:
sys.exit(nagios.UNKNOWN) print(f'UNKNOWN: failed to check response time "{e}"')
elif args.type == 'outgoing-http-rate': sys.exit(nagios.UNKNOWN)
# outgoing req/sec elif args.type == 'outgoing-http-rate':
outgoing_http_request_rate_MAX = 10 if not args.crit else args.crit # outgoing req/sec
try: outgoing_http_request_rate_MAX = 10 if not args.crit else args.crit
outgoing_http_request_rate = get_outgoing_http_request_rate(args.grafana_api_key, args.interval, args.range, args.grafana_server) try:
failed = {} outgoing_http_request_rate = get_outgoing_http_request_rate(args.grafana_api_key, args.interval, args.range, args.grafana_server)
perf_data = '|' failed = {}
for k, v in outgoing_http_request_rate.items(): perf_data = '|'
perf_data = perf_data + f"'{k}'={v}s;;; " for k, v in outgoing_http_request_rate.items():
if v > outgoing_http_request_rate_MAX: perf_data = perf_data + f"'{k}'={v}s;;; "
failed[k] = v if v > outgoing_http_request_rate_MAX:
failed[k] = v
if len(failed.keys()) > 0: if len(failed.keys()) > 0:
print(f'CRITICAL: outgoing HTTP request rate for {failed} req/sec.', perf_data) print(f'CRITICAL: outgoing HTTP request rate for {failed} req/sec.', perf_data)
sys.exit(nagios.CRITICAL) sys.exit(nagios.CRITICAL)
print(f'OK: outgoing HTTP request rate is {outgoing_http_request_rate} req/sec.', perf_data) print(f'OK: outgoing HTTP request rate is {outgoing_http_request_rate} req/sec.', perf_data)
sys.exit(nagios.OK)
except Exception as e:
print(f'UNKNOWN: failed to check outgoing HTTP request rate "{e}"')
sys.exit(nagios.UNKNOWN)
elif args.type == 'avg-send':
# Average send time in seconds
event_send_time_MAX = 1 if not args.crit else args.crit
try:
event_send_time = get_event_send_time(args.grafana_api_key, args.interval, args.range, args.grafana_server)
if event_send_time > event_send_time_MAX:
print(f"CRITICAL: average message send time is {event_send_time} sec. |'avg-send-time'={event_send_time}s;;;")
sys.exit(nagios.CRITICAL)
else:
print(f"OK: average message send time is {event_send_time} sec. |'avg-send-time'={event_send_time}s;;;")
sys.exit(nagios.OK) sys.exit(nagios.OK)
except Exception as e: except Exception as e:
print(f'UNKNOWN: failed to check average message send time "{e}"') print(f'UNKNOWN: failed to check outgoing HTTP request rate "{e}"')
sys.exit(nagios.UNKNOWN)
elif args.type == 'avg-send':
# Average send time in seconds
event_send_time_MAX = 1 if not args.crit else args.crit
try:
event_send_time = get_event_send_time(args.grafana_api_key, args.interval, args.range, args.grafana_server)
if event_send_time > event_send_time_MAX:
print(f"CRITICAL: average message send time is {event_send_time} sec. |'avg-send-time'={event_send_time}s;;;")
sys.exit(nagios.CRITICAL)
else:
print(f"OK: average message send time is {event_send_time} sec. |'avg-send-time'={event_send_time}s;;;")
sys.exit(nagios.OK)
except Exception as e:
print(f'UNKNOWN: failed to check average message send time "{e}"')
sys.exit(nagios.UNKNOWN)
elif args.type == 'db-lag':
# in seconds
db_lag_MAX = 0.01 if not args.crit else args.crit
try:
db_lag = get_waiting_for_db(args.grafana_api_key, args.interval, args.range, args.grafana_server)
if db_lag > db_lag_MAX:
print(f"CRITICAL: DB lag is {db_lag} sec. |'db-lag'={db_lag}s;;;")
sys.exit(nagios.CRITICAL)
else:
print(f"OK: DB lag is {db_lag} sec. |'db-lag'={db_lag}s;;;")
sys.exit(nagios.OK)
except Exception as e:
print(f'UNKNOWN: failed to check DB lag "{e}"')
sys.exit(nagios.UNKNOWN)
else:
print('Wrong type')
sys.exit(nagios.UNKNOWN) sys.exit(nagios.UNKNOWN)
elif args.type == 'db-lag':
# in seconds
db_lag_MAX = 0.01 if not args.crit else args.crit if __name__ == "__main__":
try: try:
db_lag = get_waiting_for_db(args.grafana_api_key, args.interval, args.range, args.grafana_server) main()
if db_lag > db_lag_MAX:
print(f"CRITICAL: DB lag is {db_lag} sec. |'db-lag'={db_lag}s;;;")
sys.exit(nagios.CRITICAL)
else:
print(f"OK: DB lag is {db_lag} sec. |'db-lag'={db_lag}s;;;")
sys.exit(nagios.OK)
except Exception as e: except Exception as e:
print(f'UNKNOWN: failed to check DB lag "{e}"') print(f'UNKNOWN: exception "{e}"')
import traceback
print(traceback.format_exc())
sys.exit(nagios.UNKNOWN) sys.exit(nagios.UNKNOWN)
else:
print('Wrong type')
sys.exit(nagios.UNKNOWN)

View File

@ -18,112 +18,130 @@ parser.add_argument('--warn', type=float, default=20, help='Manually set warn le
parser.add_argument('--crit', type=float, default=30, help='Manually set critical level.') parser.add_argument('--crit', type=float, default=30, help='Manually set critical level.')
args = parser.parse_args() args = parser.parse_args()
if args.prometheus:
from checker.prometheus import parse_metrics
r = requests.get(args.metrics_endpoint) def main():
if r.status_code != 200: if args.prometheus:
sys.exit(nagios.UNKNOWN) from checker.prometheus import parse_metrics
metrics = {} r = requests.get(args.metrics_endpoint)
for item in parse_metrics(r.text)['monbot_ping_receive_delay_seconds']['monbot_ping_receive_delay_seconds_sum']: if r.status_code != 200:
if item.labels['receivingDomain'] not in metrics.keys(): sys.exit(nagios.UNKNOWN)
metrics[item.labels['receivingDomain']] = {}
metrics[item.labels['receivingDomain']][item.labels['sourceDomain']] = item.value
pings = {'receiver': [], 'sender': [], } metrics = {}
for receiving_domain, senders in metrics.items(): for item in parse_metrics(r.text)['monbot_ping_receive_delay_seconds']['monbot_ping_receive_delay_seconds_sum']:
if receiving_domain == args.domain: if item.labels['receivingDomain'] not in metrics.keys():
for k, v in senders.items(): metrics[item.labels['receivingDomain']] = {}
pings['receiver'].append(v) metrics[item.labels['receivingDomain']][item.labels['sourceDomain']] = item.value
else:
for k, v in senders.items():
if k == args.domain:
pings['sender'].append(v)
print(json.dumps(pings)) pings = {'receiver': [], 'sender': [], }
for receiving_domain, senders in metrics.items():
if receiving_domain == args.domain:
for k, v in senders.items():
pings['receiver'].append(v)
else:
for k, v in senders.items():
if k == args.domain:
pings['sender'].append(v)
receiver_avg = np.round(np.average(pings['receiver']), 2) print(json.dumps(pings))
sender_avg = np.round(np.average(pings['sender']), 2)
print('receiver latency is', receiver_avg) receiver_avg = np.round(np.average(pings['receiver']), 2)
print('sender latency is', sender_avg) sender_avg = np.round(np.average(pings['sender']), 2)
else:
from bs4 import BeautifulSoup
import re
# Split the values since icinga will quote the args print('receiver latency is', receiver_avg)
if len(args.ignore) == 1: print('sender latency is', sender_avg)
args.ignore = args.ignore[0].strip(' ').split(' ')
def get_sec(time_str):
    """Convert an ``H:M:S`` string into a whole number of seconds.

    Raises ValueError when the string does not consist of exactly three
    colon-separated integer fields.
    """
    hours, minutes, seconds = time_str.split(':')
    return int(hours) * 3600 + int(minutes) * 60 + int(seconds)


def ms_to_s(s):
    """Convert a scraped duration string into seconds.

    Three shapes are recognised:

    * ``<int>m<number>s`` -- minutes plus seconds, returned as an int
    * ``<number>ms``      -- milliseconds, returned as a float
    * ``<number>s``       -- seconds, returned as a float

    Returns None for any other input, which is what the previous
    implementation did implicitly by falling off the end.
    Raises ValueError when the numeric part cannot be parsed.
    """
    min_m = re.match(r'^(\d+)m([\d.]+)s', s)
    if min_m:
        # Fractional seconds are truncated here, as before.
        return get_sec(f'0:{min_m.group(1)}:{int(float(min_m.group(2)))}')
    if s.endswith('ms'):
        # Divide by 1000 instead of prepending '0.' to the digits: the old
        # form turned "5ms" into 0.5 s, "1500ms" into 0.15 s, and raised
        # ValueError on "1.5ms".
        return float(s[:-2]) / 1000
    if s.endswith('s'):
        return float(s[:-1])
    return None
r = requests.get(args.metrics_endpoint)
if r.status_code != 200:
sys.exit(nagios.UNKNOWN)
soup = BeautifulSoup(r.text, 'html.parser')
tooltips = soup.find_all('span', {'class', 'tooltip'})
data = {}
for item in tooltips:
m = re.match(r'<span class="tooltip">\s*Send: (.*?)\s*<br\/>\s*Receive: (.*?)\s*<\/span>', str(item))
if m:
domain = item.parent.parent.find('span', {'class': 'domain'}).text
data[domain] = {
'send': ms_to_s(m.group(1)),
'receive': ms_to_s(m.group(2)),
}
exit_code = nagios.OK
info_str = []
data_str = []
if len(data.keys()) == 0:
print('UNKNOWN: failed to find any servers.')
sys.exit(nagios.UNKNOWN)
for domain, values in data.items():
if domain not in args.ignore:
if values['send'] >= args.crit:
info_str.append(f'CRITICAL: {domain} send is {values["send"]}s.')
exit_code = nagios.CRITICAL
elif values['send'] >= args.warn:
info_str.append(f'WARN: {domain} send is {values["send"]}s.')
if exit_code < nagios.WARNING:
exit_code = nagios.WARNING
# else:
# print(f'OK: {domain} send is {values["send"]}s.')
if values['receive'] >= args.crit:
info_str.append(f'CRITICAL: {domain} receive is {values["receive"]}s.')
exit_code = nagios.CRITICAL
elif values['receive'] >= args.warn:
info_str.append(f'WARN: {domain} receive is {values["receive"]}s.')
if exit_code < nagios.WARNING:
exit_code = nagios.WARNING
# else:
# print(f'OK: {domain} receive is {values["receive"]}s.')
data_str.append(
f"'{domain}-send'={values['send']}s;;; '{domain}-receive'={values['receive']}s;;;"
)
if any(('CRITICAL' not in s and 'WARNING' not in s) for s in info_str) or len(info_str) == 0:
print(f'OK: ping time is good.', end=' ')
else: else:
for x in info_str: from bs4 import BeautifulSoup
print(x, end=('\n' if info_str.index(x) + 1 < len(info_str) else '')) import re
print(f'|{" ".join(data_str)}')
sys.exit(exit_code) # Split the values since icinga will quote the args
if len(args.ignore) == 1:
args.ignore = args.ignore[0].strip(' ').split(' ')
def get_sec(time_str):
    """Convert an ``H:M:S`` string into a whole number of seconds.

    Raises ValueError when the string does not consist of exactly three
    colon-separated integer fields.
    """
    hours, minutes, seconds = time_str.split(':')
    return int(hours) * 3600 + int(minutes) * 60 + int(seconds)


def ms_to_s(s):
    """Convert a scraped duration string into seconds.

    Three shapes are recognised:

    * ``<int>m<number>s`` -- minutes plus seconds, returned as an int
    * ``<number>ms``      -- milliseconds, returned as a float
    * ``<number>s``       -- seconds, returned as a float

    Returns None for any other input, which is what the previous
    implementation did implicitly by falling off the end.
    Raises ValueError when the numeric part cannot be parsed.
    """
    min_m = re.match(r'^(\d+)m([\d.]+)s', s)
    if min_m:
        # Fractional seconds are truncated here, as before.
        return get_sec(f'0:{min_m.group(1)}:{int(float(min_m.group(2)))}')
    if s.endswith('ms'):
        # Divide by 1000 instead of prepending '0.' to the digits: the old
        # form turned "5ms" into 0.5 s, "1500ms" into 0.15 s, and raised
        # ValueError on "1.5ms".
        return float(s[:-2]) / 1000
    if s.endswith('s'):
        return float(s[:-1])
    return None
r = requests.get(args.metrics_endpoint)
if r.status_code != 200:
sys.exit(nagios.UNKNOWN)
soup = BeautifulSoup(r.text, 'html.parser')
tooltips = soup.find_all('span', {'class', 'tooltip'})
data = {}
for item in tooltips:
m = re.match(r'<span class="tooltip">\s*Send: (.*?)\s*<br\/>\s*Receive: (.*?)\s*<\/span>', str(item))
if m:
domain = item.parent.parent.find('span', {'class': 'domain'}).text
data[domain] = {
'send': ms_to_s(m.group(1)),
'receive': ms_to_s(m.group(2)),
}
exit_code = nagios.OK
info_str = []
data_str = []
if len(data.keys()) == 0:
print('UNKNOWN: failed to find any servers.')
sys.exit(nagios.UNKNOWN)
for domain, values in data.items():
if domain not in args.ignore:
if 'send' in values.keys():
if values['send'] >= args.crit:
info_str.append(f'CRITICAL: {domain} send is {values["send"]}s.')
exit_code = nagios.CRITICAL
elif values['send'] >= args.warn:
info_str.append(f'WARN: {domain} send is {values["send"]}s.')
if exit_code < nagios.WARNING:
exit_code = nagios.WARNING
# else:
# print(f'OK: {domain} send is {values["send"]}s.')
else:
info_str.append(f'UNKNOWN: {domain} send is empty.')
if 'receive' in values.keys():
if values['receive'] >= args.crit:
info_str.append(f'CRITICAL: {domain} receive is {values["receive"]}s.')
exit_code = nagios.CRITICAL
elif values['receive'] >= args.warn:
info_str.append(f'WARN: {domain} receive is {values["receive"]}s.')
if exit_code < nagios.WARNING:
exit_code = nagios.WARNING
# else:
# print(f'OK: {domain} receive is {values["receive"]}s.')
else:
info_str.append(f'UNKNOWN: {domain} receive is empty.')
if 'send' in values.keys() and 'receive' in values.keys():
data_str.append(
f"'{domain}-send'={values['send']}s;;; '{domain}-receive'={values['receive']}s;;;"
)
if any(('CRITICAL' not in s and 'WARNING' not in s) for s in info_str) or len(info_str) == 0:
print(f'OK: ping time is good.', end=' ')
else:
for x in info_str:
print(x, end=('\n' if info_str.index(x) + 1 < len(info_str) else ''))
print(f'|{" ".join(data_str)}')
sys.exit(exit_code)
if __name__ == "__main__":
try:
main()
except Exception as e:
print(f'UNKNOWN: exception "{e}"')
import traceback
print(traceback.format_exc())
sys.exit(nagios.UNKNOWN)

View File

@ -66,7 +66,7 @@ def build_msg(host_name, host_display_name, state, date_str, output, service_nam
elif host_name: elif host_name:
icinga2_url = f'<br>[Quick Link]({icinga2_url}/icingadb/host?name={host_name.replace(" ", "+")})' icinga2_url = f'<br>[Quick Link]({icinga2_url}/icingadb/host?name={host_name.replace(" ", "+")})'
msg = f"""{icon}&nbsp;&nbsp;&nbsp;{item} is <font color="{choose_color(state)}">{state}</font> <br> msg = f"""{icon}&nbsp;&nbsp;&nbsp;{item} is **<font color="{choose_color(state)}">{state}</font>** <br>
**When:** {date_str}. <br> **When:** {date_str}. <br>
**Info:** {newline_to_formatted_html(output)}{address}{comment}{icinga2_url}""" **Info:** {newline_to_formatted_html(output)}{address}{comment}{icinga2_url}"""
return msg return msg