catch errors, message formatting
This commit is contained in:
parent
74a4849cd8
commit
62b7cd6594
|
@ -20,95 +20,108 @@ parser.add_argument('--warn', type=float, help='Manually set warn level.')
|
|||
parser.add_argument('--crit', type=float, help='Manually set critical level.')
|
||||
args = parser.parse_args()
|
||||
|
||||
|
||||
# TODO: add warn support
|
||||
|
||||
if args.type == 'gc-time':
|
||||
# in seconds
|
||||
python_gc_time_sum_MAX = 0.002 if not args.crit else args.crit
|
||||
try:
|
||||
python_gc_time_sum = np.round(np.average(get_avg_python_gc_time(args.grafana_api_key, args.interval, args.range, args.grafana_server)), 5)
|
||||
if python_gc_time_sum >= python_gc_time_sum_MAX:
|
||||
print(f"CRITICAL: average GC time per collection is {python_gc_time_sum} sec. |'garbage-collection'={python_gc_time_sum}s;;;")
|
||||
sys.exit(nagios.CRITICAL)
|
||||
else:
|
||||
print(f"OK: average GC time per collection is {python_gc_time_sum} sec. |'garbage-collection'={python_gc_time_sum}s;;;")
|
||||
sys.exit(nagios.OK)
|
||||
except Exception as e:
|
||||
print(f'UNKNOWN: failed to check avg. GC time "{e}"')
|
||||
sys.exit(nagios.UNKNOWN)
|
||||
elif args.type == 'response-time':
|
||||
response_time_MAX = 1 if not args.crit else args.crit
|
||||
timeout = 10
|
||||
try:
|
||||
response_times = []
|
||||
for i in range(10):
|
||||
start = time.perf_counter()
|
||||
try:
|
||||
response = requests.post(args.synapse_server, timeout=timeout, verify=False)
|
||||
except Exception as e:
|
||||
print(f'UNKNOWN: failed to ping endpoint "{e}"')
|
||||
sys.exit(nagios.UNKNOWN)
|
||||
request_time = time.perf_counter() - start
|
||||
response_times.append(np.round(request_time, 2))
|
||||
time.sleep(1)
|
||||
response_time = np.round(np.average(response_times), 2)
|
||||
if response_time > response_time_MAX:
|
||||
print(f"CRITICAL: response time is {response_time} sec. |'response-time'={response_time}s;;;")
|
||||
sys.exit(nagios.CRITICAL)
|
||||
else:
|
||||
print(f"OK: response time is {response_time} sec. |'response-time'={response_time}s;;;")
|
||||
sys.exit(nagios.OK)
|
||||
except Exception as e:
|
||||
print(f'UNKNOWN: failed to check response time "{e}"')
|
||||
sys.exit(nagios.UNKNOWN)
|
||||
elif args.type == 'outgoing-http-rate':
|
||||
# outgoing req/sec
|
||||
outgoing_http_request_rate_MAX = 10 if not args.crit else args.crit
|
||||
try:
|
||||
outgoing_http_request_rate = get_outgoing_http_request_rate(args.grafana_api_key, args.interval, args.range, args.grafana_server)
|
||||
failed = {}
|
||||
perf_data = '|'
|
||||
for k, v in outgoing_http_request_rate.items():
|
||||
perf_data = perf_data + f"'{k}'={v}s;;; "
|
||||
if v > outgoing_http_request_rate_MAX:
|
||||
failed[k] = v
|
||||
def main():
|
||||
if args.type == 'gc-time':
|
||||
# in seconds
|
||||
python_gc_time_sum_MAX = 0.002 if not args.crit else args.crit
|
||||
try:
|
||||
python_gc_time_sum = np.round(np.average(get_avg_python_gc_time(args.grafana_api_key, args.interval, args.range, args.grafana_server)), 5)
|
||||
if python_gc_time_sum >= python_gc_time_sum_MAX:
|
||||
print(f"CRITICAL: average GC time per collection is {python_gc_time_sum} sec. |'garbage-collection'={python_gc_time_sum}s;;;")
|
||||
sys.exit(nagios.CRITICAL)
|
||||
else:
|
||||
print(f"OK: average GC time per collection is {python_gc_time_sum} sec. |'garbage-collection'={python_gc_time_sum}s;;;")
|
||||
sys.exit(nagios.OK)
|
||||
except Exception as e:
|
||||
print(f'UNKNOWN: failed to check avg. GC time "{e}"')
|
||||
sys.exit(nagios.UNKNOWN)
|
||||
elif args.type == 'response-time':
|
||||
response_time_MAX = 1 if not args.crit else args.crit
|
||||
timeout = 10
|
||||
try:
|
||||
response_times = []
|
||||
for i in range(10):
|
||||
start = time.perf_counter()
|
||||
try:
|
||||
response = requests.post(args.synapse_server, timeout=timeout, verify=False)
|
||||
except Exception as e:
|
||||
print(f'UNKNOWN: failed to ping endpoint "{e}"')
|
||||
sys.exit(nagios.UNKNOWN)
|
||||
request_time = time.perf_counter() - start
|
||||
response_times.append(np.round(request_time, 2))
|
||||
time.sleep(1)
|
||||
response_time = np.round(np.average(response_times), 2)
|
||||
if response_time > response_time_MAX:
|
||||
print(f"CRITICAL: response time is {response_time} sec. |'response-time'={response_time}s;;;")
|
||||
sys.exit(nagios.CRITICAL)
|
||||
else:
|
||||
print(f"OK: response time is {response_time} sec. |'response-time'={response_time}s;;;")
|
||||
sys.exit(nagios.OK)
|
||||
except Exception as e:
|
||||
print(f'UNKNOWN: failed to check response time "{e}"')
|
||||
sys.exit(nagios.UNKNOWN)
|
||||
elif args.type == 'outgoing-http-rate':
|
||||
# outgoing req/sec
|
||||
outgoing_http_request_rate_MAX = 10 if not args.crit else args.crit
|
||||
try:
|
||||
outgoing_http_request_rate = get_outgoing_http_request_rate(args.grafana_api_key, args.interval, args.range, args.grafana_server)
|
||||
failed = {}
|
||||
perf_data = '|'
|
||||
for k, v in outgoing_http_request_rate.items():
|
||||
perf_data = perf_data + f"'{k}'={v}s;;; "
|
||||
if v > outgoing_http_request_rate_MAX:
|
||||
failed[k] = v
|
||||
|
||||
if len(failed.keys()) > 0:
|
||||
print(f'CRITICAL: outgoing HTTP request rate for {failed} req/sec.', perf_data)
|
||||
sys.exit(nagios.CRITICAL)
|
||||
print(f'OK: outgoing HTTP request rate is {outgoing_http_request_rate} req/sec.', perf_data)
|
||||
sys.exit(nagios.OK)
|
||||
except Exception as e:
|
||||
print(f'UNKNOWN: failed to check outgoing HTTP request rate "{e}"')
|
||||
sys.exit(nagios.UNKNOWN)
|
||||
elif args.type == 'avg-send':
|
||||
# Average send time in seconds
|
||||
event_send_time_MAX = 1 if not args.crit else args.crit
|
||||
try:
|
||||
event_send_time = get_event_send_time(args.grafana_api_key, args.interval, args.range, args.grafana_server)
|
||||
if event_send_time > event_send_time_MAX:
|
||||
print(f"CRITICAL: average message send time is {event_send_time} sec. |'avg-send-time'={event_send_time}s;;;")
|
||||
sys.exit(nagios.CRITICAL)
|
||||
else:
|
||||
print(f"OK: average message send time is {event_send_time} sec. |'avg-send-time'={event_send_time}s;;;")
|
||||
if len(failed.keys()) > 0:
|
||||
print(f'CRITICAL: outgoing HTTP request rate for {failed} req/sec.', perf_data)
|
||||
sys.exit(nagios.CRITICAL)
|
||||
print(f'OK: outgoing HTTP request rate is {outgoing_http_request_rate} req/sec.', perf_data)
|
||||
sys.exit(nagios.OK)
|
||||
except Exception as e:
|
||||
print(f'UNKNOWN: failed to check average message send time "{e}"')
|
||||
except Exception as e:
|
||||
print(f'UNKNOWN: failed to check outgoing HTTP request rate "{e}"')
|
||||
sys.exit(nagios.UNKNOWN)
|
||||
elif args.type == 'avg-send':
|
||||
# Average send time in seconds
|
||||
event_send_time_MAX = 1 if not args.crit else args.crit
|
||||
try:
|
||||
event_send_time = get_event_send_time(args.grafana_api_key, args.interval, args.range, args.grafana_server)
|
||||
if event_send_time > event_send_time_MAX:
|
||||
print(f"CRITICAL: average message send time is {event_send_time} sec. |'avg-send-time'={event_send_time}s;;;")
|
||||
sys.exit(nagios.CRITICAL)
|
||||
else:
|
||||
print(f"OK: average message send time is {event_send_time} sec. |'avg-send-time'={event_send_time}s;;;")
|
||||
sys.exit(nagios.OK)
|
||||
except Exception as e:
|
||||
print(f'UNKNOWN: failed to check average message send time "{e}"')
|
||||
sys.exit(nagios.UNKNOWN)
|
||||
elif args.type == 'db-lag':
|
||||
# in seconds
|
||||
db_lag_MAX = 0.01 if not args.crit else args.crit
|
||||
try:
|
||||
db_lag = get_waiting_for_db(args.grafana_api_key, args.interval, args.range, args.grafana_server)
|
||||
if db_lag > db_lag_MAX:
|
||||
print(f"CRITICAL: DB lag is {db_lag} sec. |'db-lag'={db_lag}s;;;")
|
||||
sys.exit(nagios.CRITICAL)
|
||||
else:
|
||||
print(f"OK: DB lag is {db_lag} sec. |'db-lag'={db_lag}s;;;")
|
||||
sys.exit(nagios.OK)
|
||||
except Exception as e:
|
||||
print(f'UNKNOWN: failed to check DB lag "{e}"')
|
||||
sys.exit(nagios.UNKNOWN)
|
||||
else:
|
||||
print('Wrong type')
|
||||
sys.exit(nagios.UNKNOWN)
|
||||
elif args.type == 'db-lag':
|
||||
# in seconds
|
||||
db_lag_MAX = 0.01 if not args.crit else args.crit
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
try:
|
||||
db_lag = get_waiting_for_db(args.grafana_api_key, args.interval, args.range, args.grafana_server)
|
||||
if db_lag > db_lag_MAX:
|
||||
print(f"CRITICAL: DB lag is {db_lag} sec. |'db-lag'={db_lag}s;;;")
|
||||
sys.exit(nagios.CRITICAL)
|
||||
else:
|
||||
print(f"OK: DB lag is {db_lag} sec. |'db-lag'={db_lag}s;;;")
|
||||
sys.exit(nagios.OK)
|
||||
main()
|
||||
except Exception as e:
|
||||
print(f'UNKNOWN: failed to check DB lag "{e}"')
|
||||
print(f'UNKNOWN: exception "{e}"')
|
||||
import traceback
|
||||
|
||||
print(traceback.format_exc())
|
||||
sys.exit(nagios.UNKNOWN)
|
||||
else:
|
||||
print('Wrong type')
|
||||
sys.exit(nagios.UNKNOWN)
|
||||
|
|
|
@ -18,112 +18,130 @@ parser.add_argument('--warn', type=float, default=20, help='Manually set warn le
|
|||
parser.add_argument('--crit', type=float, default=30, help='Manually set critical level.')
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.prometheus:
|
||||
from checker.prometheus import parse_metrics
|
||||
|
||||
r = requests.get(args.metrics_endpoint)
|
||||
if r.status_code != 200:
|
||||
sys.exit(nagios.UNKNOWN)
|
||||
def main():
|
||||
if args.prometheus:
|
||||
from checker.prometheus import parse_metrics
|
||||
|
||||
metrics = {}
|
||||
for item in parse_metrics(r.text)['monbot_ping_receive_delay_seconds']['monbot_ping_receive_delay_seconds_sum']:
|
||||
if item.labels['receivingDomain'] not in metrics.keys():
|
||||
metrics[item.labels['receivingDomain']] = {}
|
||||
metrics[item.labels['receivingDomain']][item.labels['sourceDomain']] = item.value
|
||||
r = requests.get(args.metrics_endpoint)
|
||||
if r.status_code != 200:
|
||||
sys.exit(nagios.UNKNOWN)
|
||||
|
||||
pings = {'receiver': [], 'sender': [], }
|
||||
for receiving_domain, senders in metrics.items():
|
||||
if receiving_domain == args.domain:
|
||||
for k, v in senders.items():
|
||||
pings['receiver'].append(v)
|
||||
else:
|
||||
for k, v in senders.items():
|
||||
if k == args.domain:
|
||||
pings['sender'].append(v)
|
||||
metrics = {}
|
||||
for item in parse_metrics(r.text)['monbot_ping_receive_delay_seconds']['monbot_ping_receive_delay_seconds_sum']:
|
||||
if item.labels['receivingDomain'] not in metrics.keys():
|
||||
metrics[item.labels['receivingDomain']] = {}
|
||||
metrics[item.labels['receivingDomain']][item.labels['sourceDomain']] = item.value
|
||||
|
||||
print(json.dumps(pings))
|
||||
pings = {'receiver': [], 'sender': [], }
|
||||
for receiving_domain, senders in metrics.items():
|
||||
if receiving_domain == args.domain:
|
||||
for k, v in senders.items():
|
||||
pings['receiver'].append(v)
|
||||
else:
|
||||
for k, v in senders.items():
|
||||
if k == args.domain:
|
||||
pings['sender'].append(v)
|
||||
|
||||
receiver_avg = np.round(np.average(pings['receiver']), 2)
|
||||
sender_avg = np.round(np.average(pings['sender']), 2)
|
||||
print(json.dumps(pings))
|
||||
|
||||
print('receiver latency is', receiver_avg)
|
||||
print('sender latency is', sender_avg)
|
||||
else:
|
||||
from bs4 import BeautifulSoup
|
||||
import re
|
||||
receiver_avg = np.round(np.average(pings['receiver']), 2)
|
||||
sender_avg = np.round(np.average(pings['sender']), 2)
|
||||
|
||||
# Split the values since icinga will quote the args
|
||||
if len(args.ignore) == 1:
|
||||
args.ignore = args.ignore[0].strip(' ').split(' ')
|
||||
|
||||
|
||||
def get_sec(time_str):
    """Return the total number of seconds in an ``H:M:S`` time string.

    ``time_str`` must contain exactly three colon-separated integer
    fields (hours, minutes, seconds); anything else raises ValueError.
    """
    hours, minutes, seconds = (int(part) for part in time_str.split(':'))
    return hours * 3600 + minutes * 60 + seconds
|
||||
|
||||
|
||||
def ms_to_s(s):
    """Convert a duration string to a number of seconds.

    Supported forms are ``"<minutes>m<seconds>s"`` (e.g. ``"1m2.5s"``),
    ``"<value>ms"`` (milliseconds) and ``"<value>s"`` (seconds).

    Returns the duration in seconds as a float.
    Raises ValueError when the string is in none of these forms or its
    numeric part cannot be parsed.
    """
    text = s.strip()
    min_m = re.match(r'^(\d+)m([\d.]+)s', text)
    if min_m:
        # Keep the fractional part of the seconds instead of truncating it.
        return int(min_m.group(1)) * 60 + float(min_m.group(2))
    if text.endswith('ms'):
        # Divide by 1000 rather than prefixing '0.' to the digits: the prefix
        # trick is only right for three-digit values and breaks on "15ms"
        # (-> 0.15) or "1.5ms" (-> unparsable "0.1.5").
        return float(text[:-2]) / 1000
    if text.endswith('s'):
        # Slice off the suffix; str.strip('ms') would remove any leading or
        # trailing 'm'/'s' characters, not just the unit.
        return float(text[:-1])
    raise ValueError(f'unrecognised duration: {s!r}')
|
||||
|
||||
|
||||
r = requests.get(args.metrics_endpoint)
|
||||
if r.status_code != 200:
|
||||
sys.exit(nagios.UNKNOWN)
|
||||
soup = BeautifulSoup(r.text, 'html.parser')
|
||||
tooltips = soup.find_all('span', {'class', 'tooltip'})
|
||||
data = {}
|
||||
for item in tooltips:
|
||||
m = re.match(r'<span class="tooltip">\s*Send: (.*?)\s*<br\/>\s*Receive: (.*?)\s*<\/span>', str(item))
|
||||
if m:
|
||||
domain = item.parent.parent.find('span', {'class': 'domain'}).text
|
||||
data[domain] = {
|
||||
'send': ms_to_s(m.group(1)),
|
||||
'receive': ms_to_s(m.group(2)),
|
||||
}
|
||||
exit_code = nagios.OK
|
||||
info_str = []
|
||||
data_str = []
|
||||
|
||||
if len(data.keys()) == 0:
|
||||
print('UNKNOWN: failed to find any servers.')
|
||||
sys.exit(nagios.UNKNOWN)
|
||||
|
||||
for domain, values in data.items():
|
||||
if domain not in args.ignore:
|
||||
if values['send'] >= args.crit:
|
||||
info_str.append(f'CRITICAL: {domain} send is {values["send"]}s.')
|
||||
exit_code = nagios.CRITICAL
|
||||
elif values['send'] >= args.warn:
|
||||
info_str.append(f'WARN: {domain} send is {values["send"]}s.')
|
||||
if exit_code < nagios.WARNING:
|
||||
exit_code = nagios.WARNING
|
||||
# else:
|
||||
# print(f'OK: {domain} send is {values["send"]}s.')
|
||||
|
||||
if values['receive'] >= args.crit:
|
||||
info_str.append(f'CRITICAL: {domain} receive is {values["receive"]}s.')
|
||||
exit_code = nagios.CRITICAL
|
||||
elif values['receive'] >= args.warn:
|
||||
info_str.append(f'WARN: {domain} receive is {values["receive"]}s.')
|
||||
if exit_code < nagios.WARNING:
|
||||
exit_code = nagios.WARNING
|
||||
# else:
|
||||
# print(f'OK: {domain} receive is {values["receive"]}s.')
|
||||
data_str.append(
|
||||
f"'{domain}-send'={values['send']}s;;; '{domain}-receive'={values['receive']}s;;;"
|
||||
)
|
||||
if any(('CRITICAL' not in s and 'WARNING' not in s) for s in info_str) or len(info_str) == 0:
|
||||
print(f'OK: ping time is good.', end=' ')
|
||||
print('receiver latency is', receiver_avg)
|
||||
print('sender latency is', sender_avg)
|
||||
else:
|
||||
for x in info_str:
|
||||
print(x, end=('\n' if info_str.index(x) + 1 < len(info_str) else ''))
|
||||
print(f'|{" ".join(data_str)}')
|
||||
from bs4 import BeautifulSoup
|
||||
import re
|
||||
|
||||
sys.exit(exit_code)
|
||||
# Split the values since icinga will quote the args
|
||||
if len(args.ignore) == 1:
|
||||
args.ignore = args.ignore[0].strip(' ').split(' ')
|
||||
|
||||
def get_sec(time_str):
    """Return the total number of seconds in an ``H:M:S`` time string.

    ``time_str`` must contain exactly three colon-separated integer
    fields (hours, minutes, seconds); anything else raises ValueError.
    """
    hours, minutes, seconds = (int(part) for part in time_str.split(':'))
    return hours * 3600 + minutes * 60 + seconds
|
||||
|
||||
def ms_to_s(s):
    """Convert a duration string to a number of seconds.

    Supported forms are ``"<minutes>m<seconds>s"`` (e.g. ``"1m2.5s"``),
    ``"<value>ms"`` (milliseconds) and ``"<value>s"`` (seconds).

    Returns the duration in seconds as a float.
    Raises ValueError when the string is in none of these forms or its
    numeric part cannot be parsed.
    """
    text = s.strip()
    min_m = re.match(r'^(\d+)m([\d.]+)s', text)
    if min_m:
        # Keep the fractional part of the seconds instead of truncating it.
        return int(min_m.group(1)) * 60 + float(min_m.group(2))
    if text.endswith('ms'):
        # Divide by 1000 rather than prefixing '0.' to the digits: the prefix
        # trick is only right for three-digit values and breaks on "15ms"
        # (-> 0.15) or "1.5ms" (-> unparsable "0.1.5").
        return float(text[:-2]) / 1000
    if text.endswith('s'):
        # Slice off the suffix; str.strip('ms') would remove any leading or
        # trailing 'm'/'s' characters, not just the unit.
        return float(text[:-1])
    raise ValueError(f'unrecognised duration: {s!r}')
|
||||
|
||||
r = requests.get(args.metrics_endpoint)
|
||||
if r.status_code != 200:
|
||||
sys.exit(nagios.UNKNOWN)
|
||||
soup = BeautifulSoup(r.text, 'html.parser')
|
||||
tooltips = soup.find_all('span', {'class', 'tooltip'})
|
||||
data = {}
|
||||
for item in tooltips:
|
||||
m = re.match(r'<span class="tooltip">\s*Send: (.*?)\s*<br\/>\s*Receive: (.*?)\s*<\/span>', str(item))
|
||||
if m:
|
||||
domain = item.parent.parent.find('span', {'class': 'domain'}).text
|
||||
data[domain] = {
|
||||
'send': ms_to_s(m.group(1)),
|
||||
'receive': ms_to_s(m.group(2)),
|
||||
}
|
||||
exit_code = nagios.OK
|
||||
info_str = []
|
||||
data_str = []
|
||||
|
||||
if len(data.keys()) == 0:
|
||||
print('UNKNOWN: failed to find any servers.')
|
||||
sys.exit(nagios.UNKNOWN)
|
||||
|
||||
for domain, values in data.items():
|
||||
if domain not in args.ignore:
|
||||
if 'send' in values.keys():
|
||||
if values['send'] >= args.crit:
|
||||
info_str.append(f'CRITICAL: {domain} send is {values["send"]}s.')
|
||||
exit_code = nagios.CRITICAL
|
||||
elif values['send'] >= args.warn:
|
||||
info_str.append(f'WARN: {domain} send is {values["send"]}s.')
|
||||
if exit_code < nagios.WARNING:
|
||||
exit_code = nagios.WARNING
|
||||
# else:
|
||||
# print(f'OK: {domain} send is {values["send"]}s.')
|
||||
else:
|
||||
info_str.append(f'UNKNOWN: {domain} send is empty.')
|
||||
|
||||
if 'receive' in values.keys():
|
||||
if values['receive'] >= args.crit:
|
||||
info_str.append(f'CRITICAL: {domain} receive is {values["receive"]}s.')
|
||||
exit_code = nagios.CRITICAL
|
||||
elif values['receive'] >= args.warn:
|
||||
info_str.append(f'WARN: {domain} receive is {values["receive"]}s.')
|
||||
if exit_code < nagios.WARNING:
|
||||
exit_code = nagios.WARNING
|
||||
# else:
|
||||
# print(f'OK: {domain} receive is {values["receive"]}s.')
|
||||
else:
|
||||
info_str.append(f'UNKNOWN: {domain} receive is empty.')
|
||||
|
||||
if 'send' in values.keys() and 'receive' in values.keys():
|
||||
data_str.append(
|
||||
f"'{domain}-send'={values['send']}s;;; '{domain}-receive'={values['receive']}s;;;"
|
||||
)
|
||||
# Report OK only when no per-domain finding was recorded. Findings are
# prefixed 'CRITICAL:', 'WARN:' or 'UNKNOWN:', so testing for the substring
# 'WARNING' (and using any()) would print "OK" and hide every finding
# whenever a single WARN/UNKNOWN entry was present.
if not info_str:
    print('OK: ping time is good.', end=' ')
else:
    # One finding per line, with no trailing newline, so the perf data
    # below is appended directly to the last finding.
    print('\n'.join(info_str), end='')
print(f'|{" ".join(data_str)}')
|
||||
|
||||
sys.exit(exit_code)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Last-resort handler: any exception escaping main() is reported as
    # UNKNOWN together with its traceback. sys.exit() calls made inside
    # main() raise SystemExit, which is not an Exception subclass and so
    # passes through untouched.
    try:
        main()
    except Exception as err:
        import traceback

        print(f'UNKNOWN: exception "{err}"')
        print(traceback.format_exc())
        sys.exit(nagios.UNKNOWN)
|
||||
|
|
|
@ -66,7 +66,7 @@ def build_msg(host_name, host_display_name, state, date_str, output, service_nam
|
|||
elif host_name:
|
||||
icinga2_url = f'<br>[Quick Link]({icinga2_url}/icingadb/host?name={host_name.replace(" ", "+")})'
|
||||
|
||||
msg = f"""{icon} {item} is <font color="{choose_color(state)}">{state}</font> <br>
|
||||
msg = f"""{icon} {item} is **<font color="{choose_color(state)}">{state}</font>** <br>
|
||||
**When:** {date_str}. <br>
|
||||
**Info:** {newline_to_formatted_html(output)}{address}{comment}{icinga2_url}"""
|
||||
return msg
|
||||
|
|
Loading…
Reference in New Issue