#!/usr/bin/env python3 import argparse import sys import time import traceback import numpy as np import requests from checker import nagios from checker.synapse_grafana import get_avg_python_gc_time, get_event_send_time, get_outgoing_http_request_rate, get_waiting_for_db parser = argparse.ArgumentParser(description='Process some integers.') parser.add_argument('--grafana-server', required=True, help='Grafana server. Example: http://10.0.0.43:3000') parser.add_argument('--synapse-server', required=True, help='Matrix Synapse server.') parser.add_argument('--grafana-api-key', required=True) parser.add_argument('--interval', default=15, type=int, help='Data interval in seconds.') parser.add_argument('--range', default=2, type=int, help='Data range in minutes. Used for comparison and averaging.') parser.add_argument('--type', required=True, choices=['gc-time', 'response-time', 'outgoing-http-rate', 'avg-send', 'db-lag']) parser.add_argument('--warn', type=float, help='Manually set warn level.') parser.add_argument('--crit', type=float, help='Manually set critical level.') args = parser.parse_args() # TODO: add warn suppoort def main(): if args.type == 'gc-time': # in seconds python_gc_time_sum_MAX = 0.002 if not args.crit else args.crit try: raw_gc_time = get_avg_python_gc_time(args.grafana_api_key, args.interval, args.range, args.grafana_server) if not len(raw_gc_time): print(f"UNKNOWN: no data for average GC time") sys.exit(nagios.UNKNOWN) python_gc_time_sum = np.round(np.average(), 5) if python_gc_time_sum >= python_gc_time_sum_MAX: print(f"CRITICAL: average GC time per collection is {python_gc_time_sum} sec. |'garbage-collection'={python_gc_time_sum}s;;;") sys.exit(nagios.CRITICAL) else: print(f"OK: average GC time per collection is {python_gc_time_sum} sec. |'garbage-collection'={python_gc_time_sum}s;;;") sys.exit(nagios.OK) except Exception as e: print(f'UNKNOWN: failed to check avg. GC time "{e}"') print(traceback.format_exc()) sys.exit(nagios.UNKNOWN) elif args.type == 'response-time': response_time_MAX = 1 if not args.crit else args.crit timeout = 10 try: response_times = [] for i in range(10): start = time.perf_counter() try: response = requests.post(args.synapse_server, timeout=timeout, verify=False) except Exception as e: print(f'UNKNOWN: failed to ping endpoint "{e}"') print(traceback.format_exc()) sys.exit(nagios.UNKNOWN) request_time = time.perf_counter() - start response_times.append(np.round(request_time, 2)) time.sleep(1) response_time = np.round(np.average(response_times), 2) if response_time > response_time_MAX: print(f"CRITICAL: response time is {response_time} sec. |'response-time'={response_time}s;;;") sys.exit(nagios.CRITICAL) else: print(f"OK: response time is {response_time} sec. |'response-time'={response_time}s;;;") sys.exit(nagios.OK) except Exception as e: print(f'UNKNOWN: failed to check response time "{e}"') print(traceback.format_exc()) sys.exit(nagios.UNKNOWN) elif args.type == 'outgoing-http-rate': # outgoing req/sec outgoing_http_request_rate_MAX = 10 if not args.crit else args.crit try: outgoing_http_request_rate = get_outgoing_http_request_rate(args.grafana_api_key, args.interval, args.range, args.grafana_server) if not len(outgoing_http_request_rate.keys()): print(f"UNKNOWN: no data for outgoing HTTP request rate") sys.exit(nagios.UNKNOWN) failed = {} perf_data = '|' for k, v in outgoing_http_request_rate.items(): perf_data = perf_data + f"'{k}'={v}s;;; " if v > outgoing_http_request_rate_MAX: failed[k] = v if len(failed.keys()) > 0: print(f'CRITICAL: outgoing HTTP request rate for {failed} req/sec.', perf_data) sys.exit(nagios.CRITICAL) print(f'OK: outgoing HTTP request rate is {outgoing_http_request_rate} req/sec.', perf_data) sys.exit(nagios.OK) except Exception as e: print(f'UNKNOWN: failed to check outgoing HTTP request rate "{e}"') print(traceback.format_exc()) sys.exit(nagios.UNKNOWN) elif args.type == 'avg-send': # Average send time in seconds event_send_time_MAX = 1 if not args.crit else args.crit try: event_send_time = get_event_send_time(args.grafana_api_key, args.interval, args.range, args.grafana_server) if not event_send_time: print(f"UNKNOWN: no data for event send time") sys.exit(nagios.UNKNOWN) if event_send_time > event_send_time_MAX: print(f"CRITICAL: average message send time is {event_send_time} sec. |'avg-send-time'={event_send_time}s;;;") sys.exit(nagios.CRITICAL) else: print(f"OK: average message send time is {event_send_time} sec. |'avg-send-time'={event_send_time}s;;;") sys.exit(nagios.OK) except Exception as e: print(f'UNKNOWN: failed to check average message send time "{e}"') print(traceback.format_exc()) sys.exit(nagios.UNKNOWN) elif args.type == 'db-lag': db_lag_MAX = 0.01 if not args.crit else args.crit # in seconds try: db_lag, null_present, raw_data = get_waiting_for_db(args.grafana_api_key, args.interval, args.range, args.grafana_server) if db_lag == None: print(f"UNKNOWN: no data for DB lag") sys.exit(nagios.UNKNOWN) null_warn = f'Null data was present for this timeseries, value may be incorrect.\n{raw_data}' if null_present else '' if db_lag > db_lag_MAX: print(f"CRITICAL: DB lag is {db_lag} sec. {null_warn} |'db-lag'={db_lag}s;;;") sys.exit(nagios.CRITICAL) else: print(f"OK: DB lag is {db_lag} sec. {null_warn} |'db-lag'={db_lag}s;;;") sys.exit(nagios.OK) except Exception as e: print(f'UNKNOWN: failed to check DB lag "{e}"') print(traceback.format_exc()) sys.exit(nagios.UNKNOWN) else: print('Wrong type') sys.exit(nagios.UNKNOWN) if __name__ == "__main__": try: main() except Exception as e: print(f'UNKNOWN: exception "{e}"') print(traceback.format_exc()) sys.exit(nagios.UNKNOWN)