diff --git a/check_matrix_synapse.py b/check_matrix_synapse.py index a5d916a..6ccab81 100644 --- a/check_matrix_synapse.py +++ b/check_matrix_synapse.py @@ -20,95 +20,108 @@ parser.add_argument('--warn', type=float, help='Manually set warn level.') parser.add_argument('--crit', type=float, help='Manually set critical level.') args = parser.parse_args() + # TODO: add warn suppoort -if args.type == 'gc-time': - # in seconds - python_gc_time_sum_MAX = 0.002 if not args.crit else args.crit - try: - python_gc_time_sum = np.round(np.average(get_avg_python_gc_time(args.grafana_api_key, args.interval, args.range, args.grafana_server)), 5) - if python_gc_time_sum >= python_gc_time_sum_MAX: - print(f"CRITICAL: average GC time per collection is {python_gc_time_sum} sec. |'garbage-collection'={python_gc_time_sum}s;;;") - sys.exit(nagios.CRITICAL) - else: - print(f"OK: average GC time per collection is {python_gc_time_sum} sec. |'garbage-collection'={python_gc_time_sum}s;;;") - sys.exit(nagios.OK) - except Exception as e: - print(f'UNKNOWN: failed to check avg. GC time "{e}"') - sys.exit(nagios.UNKNOWN) -elif args.type == 'response-time': - response_time_MAX = 1 if not args.crit else args.crit - timeout = 10 - try: - response_times = [] - for i in range(10): - start = time.perf_counter() - try: - response = requests.post(args.synapse_server, timeout=timeout, verify=False) - except Exception as e: - print(f'UNKNOWN: failed to ping endpoint "{e}"') - sys.exit(nagios.UNKNOWN) - request_time = time.perf_counter() - start - response_times.append(np.round(request_time, 2)) - time.sleep(1) - response_time = np.round(np.average(response_times), 2) - if response_time > response_time_MAX: - print(f"CRITICAL: response time is {response_time} sec. |'response-time'={response_time}s;;;") - sys.exit(nagios.CRITICAL) - else: - print(f"OK: response time is {response_time} sec. |'response-time'={response_time}s;;;") - sys.exit(nagios.OK) - except Exception as e: - print(f'UNKNOWN: failed to check response time "{e}"') - sys.exit(nagios.UNKNOWN) -elif args.type == 'outgoing-http-rate': - # outgoing req/sec - outgoing_http_request_rate_MAX = 10 if not args.crit else args.crit - try: - outgoing_http_request_rate = get_outgoing_http_request_rate(args.grafana_api_key, args.interval, args.range, args.grafana_server) - failed = {} - perf_data = '|' - for k, v in outgoing_http_request_rate.items(): - perf_data = perf_data + f"'{k}'={v}s;;; " - if v > outgoing_http_request_rate_MAX: - failed[k] = v +def main(): + if args.type == 'gc-time': + # in seconds + python_gc_time_sum_MAX = 0.002 if not args.crit else args.crit + try: + python_gc_time_sum = np.round(np.average(get_avg_python_gc_time(args.grafana_api_key, args.interval, args.range, args.grafana_server)), 5) + if python_gc_time_sum >= python_gc_time_sum_MAX: + print(f"CRITICAL: average GC time per collection is {python_gc_time_sum} sec. |'garbage-collection'={python_gc_time_sum}s;;;") + sys.exit(nagios.CRITICAL) + else: + print(f"OK: average GC time per collection is {python_gc_time_sum} sec. |'garbage-collection'={python_gc_time_sum}s;;;") + sys.exit(nagios.OK) + except Exception as e: + print(f'UNKNOWN: failed to check avg. GC time "{e}"') + sys.exit(nagios.UNKNOWN) + elif args.type == 'response-time': + response_time_MAX = 1 if not args.crit else args.crit + timeout = 10 + try: + response_times = [] + for i in range(10): + start = time.perf_counter() + try: + response = requests.post(args.synapse_server, timeout=timeout, verify=False) + except Exception as e: + print(f'UNKNOWN: failed to ping endpoint "{e}"') + sys.exit(nagios.UNKNOWN) + request_time = time.perf_counter() - start + response_times.append(np.round(request_time, 2)) + time.sleep(1) + response_time = np.round(np.average(response_times), 2) + if response_time > response_time_MAX: + print(f"CRITICAL: response time is {response_time} sec. |'response-time'={response_time}s;;;") + sys.exit(nagios.CRITICAL) + else: + print(f"OK: response time is {response_time} sec. |'response-time'={response_time}s;;;") + sys.exit(nagios.OK) + except Exception as e: + print(f'UNKNOWN: failed to check response time "{e}"') + sys.exit(nagios.UNKNOWN) + elif args.type == 'outgoing-http-rate': + # outgoing req/sec + outgoing_http_request_rate_MAX = 10 if not args.crit else args.crit + try: + outgoing_http_request_rate = get_outgoing_http_request_rate(args.grafana_api_key, args.interval, args.range, args.grafana_server) + failed = {} + perf_data = '|' + for k, v in outgoing_http_request_rate.items(): + perf_data = perf_data + f"'{k}'={v}s;;; " + if v > outgoing_http_request_rate_MAX: + failed[k] = v - if len(failed.keys()) > 0: - print(f'CRITICAL: outgoing HTTP request rate for {failed} req/sec.', perf_data) - sys.exit(nagios.CRITICAL) - print(f'OK: outgoing HTTP request rate is {outgoing_http_request_rate} req/sec.', perf_data) - sys.exit(nagios.OK) - except Exception as e: - print(f'UNKNOWN: failed to check outgoing HTTP request rate "{e}"') - sys.exit(nagios.UNKNOWN) -elif args.type == 'avg-send': - # Average send time in seconds - event_send_time_MAX = 1 if not args.crit else args.crit - try: - event_send_time = get_event_send_time(args.grafana_api_key, args.interval, args.range, args.grafana_server) - if event_send_time > event_send_time_MAX: - print(f"CRITICAL: average message send time is {event_send_time} sec. |'avg-send-time'={event_send_time}s;;;") - sys.exit(nagios.CRITICAL) - else: - print(f"OK: average message send time is {event_send_time} sec. |'avg-send-time'={event_send_time}s;;;") + if len(failed.keys()) > 0: + print(f'CRITICAL: outgoing HTTP request rate for {failed} req/sec.', perf_data) + sys.exit(nagios.CRITICAL) + print(f'OK: outgoing HTTP request rate is {outgoing_http_request_rate} req/sec.', perf_data) sys.exit(nagios.OK) - except Exception as e: - print(f'UNKNOWN: failed to check average message send time "{e}"') + except Exception as e: + print(f'UNKNOWN: failed to check outgoing HTTP request rate "{e}"') + sys.exit(nagios.UNKNOWN) + elif args.type == 'avg-send': + # Average send time in seconds + event_send_time_MAX = 1 if not args.crit else args.crit + try: + event_send_time = get_event_send_time(args.grafana_api_key, args.interval, args.range, args.grafana_server) + if event_send_time > event_send_time_MAX: + print(f"CRITICAL: average message send time is {event_send_time} sec. |'avg-send-time'={event_send_time}s;;;") + sys.exit(nagios.CRITICAL) + else: + print(f"OK: average message send time is {event_send_time} sec. |'avg-send-time'={event_send_time}s;;;") + sys.exit(nagios.OK) + except Exception as e: + print(f'UNKNOWN: failed to check average message send time "{e}"') + sys.exit(nagios.UNKNOWN) + elif args.type == 'db-lag': + # in seconds + db_lag_MAX = 0.01 if not args.crit else args.crit + try: + db_lag = get_waiting_for_db(args.grafana_api_key, args.interval, args.range, args.grafana_server) + if db_lag > db_lag_MAX: + print(f"CRITICAL: DB lag is {db_lag} sec. |'db-lag'={db_lag}s;;;") + sys.exit(nagios.CRITICAL) + else: + print(f"OK: DB lag is {db_lag} sec. |'db-lag'={db_lag}s;;;") + sys.exit(nagios.OK) + except Exception as e: + print(f'UNKNOWN: failed to check DB lag "{e}"') + sys.exit(nagios.UNKNOWN) + else: + print('Wrong type') sys.exit(nagios.UNKNOWN) -elif args.type == 'db-lag': - # in seconds - db_lag_MAX = 0.01 if not args.crit else args.crit + + +if __name__ == "__main__": try: - db_lag = get_waiting_for_db(args.grafana_api_key, args.interval, args.range, args.grafana_server) - if db_lag > db_lag_MAX: - print(f"CRITICAL: DB lag is {db_lag} sec. |'db-lag'={db_lag}s;;;") - sys.exit(nagios.CRITICAL) - else: - print(f"OK: DB lag is {db_lag} sec. |'db-lag'={db_lag}s;;;") - sys.exit(nagios.OK) + main() except Exception as e: - print(f'UNKNOWN: failed to check DB lag "{e}"') + print(f'UNKNOWN: exception "{e}"') + import traceback + + print(traceback.format_exc()) sys.exit(nagios.UNKNOWN) -else: - print('Wrong type') - sys.exit(nagios.UNKNOWN) diff --git a/check_monitor_bot.py b/check_monitor_bot.py index 36f6031..f2071fb 100644 --- a/check_monitor_bot.py +++ b/check_monitor_bot.py @@ -18,112 +18,130 @@ parser.add_argument('--warn', type=float, default=20, help='Manually set warn le parser.add_argument('--crit', type=float, default=30, help='Manually set critical level.') args = parser.parse_args() -if args.prometheus: - from checker.prometheus import parse_metrics - r = requests.get(args.metrics_endpoint) - if r.status_code != 200: - sys.exit(nagios.UNKNOWN) +def main(): + if args.prometheus: + from checker.prometheus import parse_metrics - metrics = {} - for item in parse_metrics(r.text)['monbot_ping_receive_delay_seconds']['monbot_ping_receive_delay_seconds_sum']: - if item.labels['receivingDomain'] not in metrics.keys(): - metrics[item.labels['receivingDomain']] = {} - metrics[item.labels['receivingDomain']][item.labels['sourceDomain']] = item.value + r = requests.get(args.metrics_endpoint) + if r.status_code != 200: + sys.exit(nagios.UNKNOWN) - pings = {'receiver': [], 'sender': [], } - for receiving_domain, senders in metrics.items(): - if receiving_domain == args.domain: - for k, v in senders.items(): - pings['receiver'].append(v) - else: - for k, v in senders.items(): - if k == args.domain: - pings['sender'].append(v) + metrics = {} + for item in parse_metrics(r.text)['monbot_ping_receive_delay_seconds']['monbot_ping_receive_delay_seconds_sum']: + if item.labels['receivingDomain'] not in metrics.keys(): + metrics[item.labels['receivingDomain']] = {} + metrics[item.labels['receivingDomain']][item.labels['sourceDomain']] = item.value - print(json.dumps(pings)) + pings = {'receiver': [], 'sender': [], } + for receiving_domain, senders in metrics.items(): + if receiving_domain == args.domain: + for k, v in senders.items(): + pings['receiver'].append(v) + else: + for k, v in senders.items(): + if k == args.domain: + pings['sender'].append(v) - receiver_avg = np.round(np.average(pings['receiver']), 2) - sender_avg = np.round(np.average(pings['sender']), 2) + print(json.dumps(pings)) - print('receiver latency is', receiver_avg) - print('sender latency is', sender_avg) -else: - from bs4 import BeautifulSoup - import re + receiver_avg = np.round(np.average(pings['receiver']), 2) + sender_avg = np.round(np.average(pings['sender']), 2) - # Split the values since icinga will quote the args - if len(args.ignore) == 1: - args.ignore = args.ignore[0].strip(' ').split(' ') - - - def get_sec(time_str): - """Get seconds from time.""" - h, m, s = time_str.split(':') - return int(h) * 3600 + int(m) * 60 + int(s) - - - def ms_to_s(s): - min_m = re.match(r'^(\d+)m([\d.]+)s', s) - if min_m: - return get_sec(f'0:{min_m.group(1)}:{int(float(min_m.group(2)))}') - elif s.endswith('ms'): - return float('0.' + s.strip('ms')) - elif s.endswith('s'): - return float(s.strip('ms')) - - - r = requests.get(args.metrics_endpoint) - if r.status_code != 200: - sys.exit(nagios.UNKNOWN) - soup = BeautifulSoup(r.text, 'html.parser') - tooltips = soup.find_all('span', {'class', 'tooltip'}) - data = {} - for item in tooltips: - m = re.match(r'\s*Send: (.*?)\s*\s*Receive: (.*?)\s*<\/span>', str(item)) - if m: - domain = item.parent.parent.find('span', {'class': 'domain'}).text - data[domain] = { - 'send': ms_to_s(m.group(1)), - 'receive': ms_to_s(m.group(2)), - } - exit_code = nagios.OK - info_str = [] - data_str = [] - - if len(data.keys()) == 0: - print('UNKNOWN: failed to find any servers.') - sys.exit(nagios.UNKNOWN) - - for domain, values in data.items(): - if domain not in args.ignore: - if values['send'] >= args.crit: - info_str.append(f'CRITICAL: {domain} send is {values["send"]}s.') - exit_code = nagios.CRITICAL - elif values['send'] >= args.warn: - info_str.append(f'WARN: {domain} send is {values["send"]}s.') - if exit_code < nagios.WARNING: - exit_code = nagios.WARNING - # else: - # print(f'OK: {domain} send is {values["send"]}s.') - - if values['receive'] >= args.crit: - info_str.append(f'CRITICAL: {domain} receive is {values["receive"]}s.') - exit_code = nagios.CRITICAL - elif values['receive'] >= args.warn: - info_str.append(f'WARN: {domain} receive is {values["receive"]}s.') - if exit_code < nagios.WARNING: - exit_code = nagios.WARNING - # else: - # print(f'OK: {domain} receive is {values["receive"]}s.') - data_str.append( - f"'{domain}-send'={values['send']}s;;; '{domain}-receive'={values['receive']}s;;;" - ) - if any(('CRITICAL' not in s and 'WARNING' not in s) for s in info_str) or len(info_str) == 0: - print(f'OK: ping time is good.', end=' ') + print('receiver latency is', receiver_avg) + print('sender latency is', sender_avg) else: - for x in info_str: - print(x, end=('\n' if info_str.index(x) + 1 < len(info_str) else '')) - print(f'|{" ".join(data_str)}') + from bs4 import BeautifulSoup + import re - sys.exit(exit_code) + # Split the values since icinga will quote the args + if len(args.ignore) == 1: + args.ignore = args.ignore[0].strip(' ').split(' ') + + def get_sec(time_str): + """Get seconds from time.""" + h, m, s = time_str.split(':') + return int(h) * 3600 + int(m) * 60 + int(s) + + def ms_to_s(s): + min_m = re.match(r'^(\d+)m([\d.]+)s', s) + if min_m: + return get_sec(f'0:{min_m.group(1)}:{int(float(min_m.group(2)))}') + elif s.endswith('ms'): + return float('0.' + s.strip('ms')) + elif s.endswith('s'): + return float(s.strip('ms')) + + r = requests.get(args.metrics_endpoint) + if r.status_code != 200: + sys.exit(nagios.UNKNOWN) + soup = BeautifulSoup(r.text, 'html.parser') + tooltips = soup.find_all('span', {'class', 'tooltip'}) + data = {} + for item in tooltips: + m = re.match(r'\s*Send: (.*?)\s*\s*Receive: (.*?)\s*<\/span>', str(item)) + if m: + domain = item.parent.parent.find('span', {'class': 'domain'}).text + data[domain] = { + 'send': ms_to_s(m.group(1)), + 'receive': ms_to_s(m.group(2)), + } + exit_code = nagios.OK + info_str = [] + data_str = [] + + if len(data.keys()) == 0: + print('UNKNOWN: failed to find any servers.') + sys.exit(nagios.UNKNOWN) + + for domain, values in data.items(): + if domain not in args.ignore: + if 'send' in values.keys(): + if values['send'] >= args.crit: + info_str.append(f'CRITICAL: {domain} send is {values["send"]}s.') + exit_code = nagios.CRITICAL + elif values['send'] >= args.warn: + info_str.append(f'WARN: {domain} send is {values["send"]}s.') + if exit_code < nagios.WARNING: + exit_code = nagios.WARNING + # else: + # print(f'OK: {domain} send is {values["send"]}s.') + else: + info_str.append(f'UNKNOWN: {domain} send is empty.') + + if 'receive' in values.keys(): + if values['receive'] >= args.crit: + info_str.append(f'CRITICAL: {domain} receive is {values["receive"]}s.') + exit_code = nagios.CRITICAL + elif values['receive'] >= args.warn: + info_str.append(f'WARN: {domain} receive is {values["receive"]}s.') + if exit_code < nagios.WARNING: + exit_code = nagios.WARNING + # else: + # print(f'OK: {domain} receive is {values["receive"]}s.') + else: + info_str.append(f'UNKNOWN: {domain} receive is empty.') + + if 'send' in values.keys() and 'receive' in values.keys(): + data_str.append( + f"'{domain}-send'={values['send']}s;;; '{domain}-receive'={values['receive']}s;;;" + ) + if any(('CRITICAL' not in s and 'WARNING' not in s) for s in info_str) or len(info_str) == 0: + print(f'OK: ping time is good.', end=' ') + else: + for x in info_str: + print(x, end=('\n' if info_str.index(x) + 1 < len(info_str) else '')) + print(f'|{" ".join(data_str)}') + + sys.exit(exit_code) + + +if __name__ == "__main__": + try: + main() + except Exception as e: + print(f'UNKNOWN: exception "{e}"') + import traceback + + print(traceback.format_exc()) + sys.exit(nagios.UNKNOWN) diff --git a/checker/notify.py b/checker/notify.py index 3ae753c..d038561 100644 --- a/checker/notify.py +++ b/checker/notify.py @@ -66,7 +66,7 @@ def build_msg(host_name, host_display_name, state, date_str, output, service_nam elif host_name: icinga2_url = f'
[Quick Link]({icinga2_url}/icingadb/host?name={host_name.replace(" ", "+")})' - msg = f"""{icon}   {item} is {state}
+ msg = f"""{icon}   {item} is **{state}**
**When:** {date_str}.
**Info:** {newline_to_formatted_html(output)}{address}{comment}{icinga2_url}""" return msg