From 39042ba364c6acdf6ca1378878cb75b7b4dbc09e Mon Sep 17 00:00:00 2001 From: Cyberes Date: Fri, 21 Apr 2023 23:54:16 -0600 Subject: [PATCH] add code --- Checks/Matrix Synapse/__init__.py | 0 Checks/Matrix Synapse/check_federation.py | 201 +++++++++++ Checks/Matrix Synapse/check_media_cdn.py | 220 ++++++++++++ Checks/Matrix Synapse/check_monitor_bot.py | 130 +++++++ Checks/Matrix Synapse/grafana.py | 378 +++++++++++++++++++++ Checks/Matrix Synapse/matrix_synapse.py | 111 ++++++ Checks/Matrix Synapse/nagios.py | 4 + Checks/Matrix Synapse/prometheus.py | 12 + Checks/Matrix Synapse/requirements.txt | 9 + Checks/Matrix Synapse/synapse_client.py | 110 ++++++ Checks/__init__.py | 0 Checks/check_monitor_bot.sh | 4 + Checks/check_redis.py | 211 ++++++++++++ Checks/test-federation.sh | 9 + Checks/test-media-cdn.sh | 8 + Other/icinga-to-kuma.py | 85 +++++ 16 files changed, 1492 insertions(+) create mode 100644 Checks/Matrix Synapse/__init__.py create mode 100644 Checks/Matrix Synapse/check_federation.py create mode 100644 Checks/Matrix Synapse/check_media_cdn.py create mode 100644 Checks/Matrix Synapse/check_monitor_bot.py create mode 100644 Checks/Matrix Synapse/grafana.py create mode 100644 Checks/Matrix Synapse/matrix_synapse.py create mode 100644 Checks/Matrix Synapse/nagios.py create mode 100644 Checks/Matrix Synapse/prometheus.py create mode 100644 Checks/Matrix Synapse/requirements.txt create mode 100644 Checks/Matrix Synapse/synapse_client.py create mode 100644 Checks/__init__.py create mode 100644 Checks/check_monitor_bot.sh create mode 100644 Checks/check_redis.py create mode 100644 Checks/test-federation.sh create mode 100644 Checks/test-media-cdn.sh create mode 100644 Other/icinga-to-kuma.py diff --git a/Checks/Matrix Synapse/__init__.py b/Checks/Matrix Synapse/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/Checks/Matrix Synapse/check_federation.py b/Checks/Matrix Synapse/check_federation.py new file mode 100644 index 0000000..939cfee --- 
/dev/null +++ b/Checks/Matrix Synapse/check_federation.py @@ -0,0 +1,201 @@ +#!/usr/bin/env python3 +import argparse +import asyncio +import json +import os +import sys +import time +import urllib +from datetime import datetime +from uuid import uuid4 + +from nio import AsyncClient, AsyncClientConfig, JoinError, JoinResponse, LoginResponse, RoomCreateError, RoomGetEventResponse, RoomSendError + +import nagios + +parser = argparse.ArgumentParser(description='Test federation between two homeservers.') +parser.add_argument('--bot1-user', required=True, help='User ID for bot 1.') +parser.add_argument('--bot1-pw', required=True, help='Password for bot 1.') +parser.add_argument('--bot1-hs', required=True, help='Homeserver for bot 1.') +parser.add_argument('--bot1-auth-file', help="File to cache the bot's login details to.") +parser.add_argument('--bot2-user', required=True, help='User ID for bot 2.') +parser.add_argument('--bot2-pw', required=True, help='Password for bot 2.') +parser.add_argument('--bot2-hs', required=True, help='Homeserver for bot 2.') +parser.add_argument('--bot2-auth-file', help="File to cache the bot's login details to.") +parser.add_argument('--timeout', type=float, default=90, help='Request timeout limit.') +parser.add_argument('--warn', type=float, default=2.0, help='Manually set warn level.') +parser.add_argument('--crit', type=float, default=2.5, help='Manually set critical level.') +args = parser.parse_args() + +bot1_hs_domain = urllib.parse.urlparse(args.bot1_hs).netloc +bot2_hs_domain = urllib.parse.urlparse(args.bot2_hs).netloc + + +def write_details_to_disk(resp: LoginResponse, homeserver, config_file) -> None: + """Writes the required login details to disk so we can log in later without + using a password. + Arguments: + resp {LoginResponse} -- the successful client login response. + homeserver -- URL of homeserver, e.g. 
"https://matrix.example.org" + """ + # open the config file in write-mode + with open(config_file, "w") as f: + # write the login details to disk + json.dump({"homeserver": homeserver, # e.g. "https://matrix.example.org" + "user_id": resp.user_id, # e.g. "@user:example.org" + "device_id": resp.device_id, # device ID, 10 uppercase letters + "access_token": resp.access_token, # cryptogr. access token + }, f, ) + + +async def test_one_direction(sender_client, receiver_client, receiver_user_id): + # The sender creates the room and invites the receiver + test_room_name = str(uuid4()) + new_test_room = await sender_client.room_create(name=test_room_name, invite=[receiver_user_id]) + if isinstance(new_test_room, RoomCreateError): + print(new_test_room) + new_test_room_id = new_test_room.room_id + + time.sleep(2) + + # The receiver joins via invite + timeout_start = datetime.now() + while True: + resp = await receiver_client.join(new_test_room_id) + if isinstance(resp, JoinResponse): + break + elif isinstance(resp, JoinError): + return f'UNKNOWN: failed to join room "{resp}"', nagios.UNKNOWN + if (datetime.now() - timeout_start).total_seconds() >= args.timeout: + return 'UNKNOWN: failed to join room, timeout.', nagios.UNKNOWN + + time.sleep(2) + + # Sender sends the msg to room + send_msg_time = datetime.now() + msg = {'id': str(uuid4()), 'ts': send_msg_time.microsecond} + resp = (await sender_client.room_send(new_test_room_id, 'm.room.message', {'body': json.dumps(msg), 'msgtype': 'm.room.message'})) + if isinstance(resp, RoomSendError): + return f'UNKNOWN: failed to send message "{resp}', nagios.UNKNOWN + msg_event_id = resp.event_id + + # Sender watches for the message + start_check = datetime.now() + while True: + resp = await receiver_client.room_get_event(new_test_room_id, msg_event_id) + if isinstance(resp, RoomGetEventResponse): + recv_msg_time = datetime.now() + recv_msg = json.loads(resp.event.source['content']['body']) + break + if (datetime.now() - 
start_check).total_seconds() >= args.timeout: + await sender_client.room_leave(new_test_room_id) + await sender_client.room_forget(new_test_room_id) + await receiver_client.room_leave(new_test_room_id) + await receiver_client.room_forget(new_test_room_id) + return "CRITICAL: timeout - receiver did not recieve the sender's message.", nagios.CRITICAL + + # Double check everything makes sense + if not msg == recv_msg: + await sender_client.room_leave(new_test_room_id) + await sender_client.room_forget(new_test_room_id) + await receiver_client.room_leave(new_test_room_id) + await receiver_client.room_forget(new_test_room_id) + return "CRITICAL: sender's message did not match the receiver's.", nagios.CRITICAL + + # Calculate the time it took to receive the message, including sync + bot1_msg_delta = (recv_msg_time - send_msg_time).total_seconds() + + # Clean up the rooms + await sender_client.room_leave(new_test_room_id) + await sender_client.room_forget(new_test_room_id) + await receiver_client.room_leave(new_test_room_id) + await receiver_client.room_forget(new_test_room_id) + + return bot1_msg_delta, True + + +async def login(user_id, passwd, homeserver, config_file=None): + client = AsyncClient(homeserver, user_id, config=AsyncClientConfig(request_timeout=args.timeout, max_timeout_retry_wait_time=10)) + if config_file: + # If there are no previously-saved credentials, we'll use the password + if not os.path.exists(config_file): + resp = await client.login(passwd) + + # check that we logged in successfully + if isinstance(resp, LoginResponse): + write_details_to_disk(resp, homeserver, config_file) + else: + print(f'UNKNOWN: failed to log in "{resp}"') + sys.exit(nagios.UNKNOWN) + else: + # Otherwise the config file exists, so we'll use the stored credentials + with open(config_file, "r") as f: + config = json.load(f) + client = AsyncClient(config["homeserver"]) + client.access_token = config["access_token"] + client.user_id = config["user_id"] + client.device_id = 
config["device_id"] + else: + await client.login(passwd) + return client + + +async def main() -> None: + bot1 = await login(args.bot1_user, args.bot1_pw, args.bot1_hs, args.bot1_auth_file) + bot2 = await login(args.bot2_user, args.bot2_pw, args.bot2_hs, args.bot2_auth_file) + + bot1_output_msg, bot1_output_code = await test_one_direction(bot1, bot2, args.bot2_user) + bot2_output_msg, bot2_output_code = await test_one_direction(bot2, bot1, args.bot1_user) + + nagios_output = nagios.OK + + if not bot1_output_code: + print(bot1_output_msg) + nagios_output = bot1_output_code + if not bot2_output_code: + print(bot2_output_msg) + if nagios_output < bot2_output_code: + # Only set the code if our code is more severe + nagios_output = bot2_output_code + + # bot1 -> bot2 + if isinstance(bot1_output_msg, float): # only do this if the func returned a value + bot1_output_msg = round(bot1_output_msg, 2) + if bot1_output_msg >= args.crit: + if nagios_output < nagios.CRITICAL: + nagios_output = nagios.CRITICAL + print('CRITICAL:', bot1_hs_domain, '->', bot2_hs_domain, 'is', bot1_output_msg, 'seconds.') + elif bot1_output_msg >= args.warn: + if nagios_output < nagios.WARNING: + nagios_output = nagios.WARNING + print('WARNING:', bot1_hs_domain, '->', bot2_hs_domain, 'is', bot1_output_msg, 'seconds.') + else: + print('OK:', bot1_hs_domain, '->', bot2_hs_domain, 'is', bot1_output_msg, 'seconds.') + + # bot2 -> bot1 + if isinstance(bot2_output_msg, float): + bot2_output_msg = round(bot2_output_msg, 2) + if bot2_output_msg >= args.crit: + if nagios_output < nagios.CRITICAL: + nagios_output = nagios.CRITICAL + print('CRITICAL:', bot1_hs_domain, '<-', bot2_hs_domain, 'is', bot2_output_msg, 'seconds.') + elif bot2_output_msg >= args.warn: + if nagios_output < nagios.WARNING: + nagios_output = nagios.WARNING + print('WARNING:', bot1_hs_domain, '<-', bot2_hs_domain, 'is', bot2_output_msg, 'seconds.') + else: + print('OK:', bot1_hs_domain, '<-', bot2_hs_domain, 'is', bot2_output_msg, 
'seconds.') + + # Clean up + await bot1.close() + await bot2.close() + + sys.exit(nagios_output) + + +if __name__ == "__main__": + try: + asyncio.run(main()) + except Exception as e: + print(f"UNKNOWN: exception '{e}'") + sys.exit(nagios.UNKNOWN) diff --git a/Checks/Matrix Synapse/check_media_cdn.py b/Checks/Matrix Synapse/check_media_cdn.py new file mode 100644 index 0000000..2772734 --- /dev/null +++ b/Checks/Matrix Synapse/check_media_cdn.py @@ -0,0 +1,220 @@ +#!/usr/bin/env python3 +import argparse +import asyncio +import json +import os +import sys +import tempfile +import urllib + +import aiofiles.os +import magic +import numpy as np +import requests +from PIL import Image +from nio import AsyncClient, AsyncClientConfig, LoginResponse, UploadResponse +from urllib3.exceptions import InsecureRequestWarning + +import nagios + +parser = argparse.ArgumentParser(description='') +parser.add_argument('--user', required=True, help='User ID for the bot.') +parser.add_argument('--pw', required=True, help='Password for the bot.') +parser.add_argument('--hs', required=True, help='Homeserver of the bot.') +parser.add_argument('--admin-endpoint', required=True, help='Admin endpoint that will be called to purge media for this user.') +parser.add_argument('--room', required=True, help='The room the bot should send its test messages in.') +parser.add_argument('--media-cdn-domain', required=True, help='The domain to make sure it redirects to.') +parser.add_argument('--auth-file', help="File to cache the bot's login details to.") +parser.add_argument('--timeout', type=float, default=90, help='Request timeout limit.') +parser.add_argument('--warn', type=float, default=2.0, help='Manually set warn level.') +parser.add_argument('--crit', type=float, default=2.5, help='Manually set critical level.') +args = parser.parse_args() + +CONFIG_FILE = args.auth_file + + +def verify_media_header(header: str, header_dict: dict, good_value: str = None, warn_value: str = None, critical_value: 
str = None): + """ + If you don't specify good_value, warn_value, or critical_value then the header will only be checked for existence. + """ + # Convert everything to strings to prevent any weirdness + header_value = str(header_dict.get(header)) + good_value = str(good_value) + warn_value = str(warn_value) + critical_value = str(critical_value) + if not header_value: + return f'CRITICAL: missing header "{header}"', nagios.CRITICAL + elif good_value and header_value == good_value: + return f'OK: {header}: "{header_value}"', nagios.OK + elif warn_value and header_value == warn_value: + return f'WARN: {header}: "{header_value}"', nagios.WARNING + elif critical_value and header_value == critical_value: + return f'CRITICAL: {header}: "{header_value}"', nagios.CRITICAL + return f'OK: {header} is present with value "{header_value}"', nagios.OK + + +def write_details_to_disk(resp: LoginResponse, homeserver) -> None: + """Writes the required login details to disk so we can log in later without + using a password. + Arguments: + resp {LoginResponse} -- the successful client login response. + homeserver -- URL of homeserver, e.g. "https://matrix.example.org" + """ + # open the config file in write-mode + with open(CONFIG_FILE, "w") as f: + # write the login details to disk + json.dump({"homeserver": homeserver, # e.g. "https://matrix.example.org" + "user_id": resp.user_id, # e.g. "@user:example.org" + "device_id": resp.device_id, # device ID, 10 uppercase letters + "access_token": resp.access_token, # cryptogr. access token + }, f, ) + + +async def send_image(client, room_id, image): + """Send image to room. + Arguments: + --------- + client : Client + room_id : str + image : str, file name of image + This is a working example for a JPG image. 
+ "content": { + "body": "someimage.jpg", + "info": { + "size": 5420, + "mimetype": "image/jpeg", + "thumbnail_info": { + "w": 100, + "h": 100, + "mimetype": "image/jpeg", + "size": 2106 + }, + "w": 100, + "h": 100, + "thumbnail_url": "mxc://example.com/SomeStrangeThumbnailUriKey" + }, + "msgtype": "m.image", + "url": "mxc://example.com/SomeStrangeUriKey" + } + """ + mime_type = magic.from_file(image, mime=True) # e.g. "image/jpeg" + if not mime_type.startswith("image/"): + print(f'UNKNOWN: wrong mime type "{mime_type}"') + sys.exit(nagios.UNKNOWN) + + im = Image.open(image) + (width, height) = im.size # im.size returns (width,height) tuple + + # first do an upload of image, then send URI of upload to room + file_stat = await aiofiles.os.stat(image) + async with aiofiles.open(image, "r+b") as f: + resp, maybe_keys = await client.upload(f, content_type=mime_type, # image/jpeg + filename=os.path.basename(image), filesize=file_stat.st_size, ) + if not isinstance(resp, UploadResponse): + print(f'UNKNOWN: failed to upload image "{resp}"') + sys.exit(nagios.UNKNOWN) + + content = {"body": os.path.basename(image), # descriptive title + "info": {"size": file_stat.st_size, "mimetype": mime_type, "thumbnail_info": None, # TODO + "w": width, # width in pixel + "h": height, # height in pixel + "thumbnail_url": None, # TODO + }, "msgtype": "m.image", "url": resp.content_uri, } + + try: + return await client.room_send(room_id, message_type="m.room.message", content=content) + except Exception as e: + print(f"Image send of file {image} failed.") + print(f'UNKNOWN: failed to send image event "{e}"') + sys.exit(nagios.UNKNOWN) + + +async def main() -> None: + client = AsyncClient(args.hs, args.user, config=AsyncClientConfig(request_timeout=args.timeout, max_timeout_retry_wait_time=10)) + if args.auth_file: + # If there are no previously-saved credentials, we'll use the password + if not os.path.exists(CONFIG_FILE): + resp = await client.login(args.pw) + + # check that we logged in 
successfully + if isinstance(resp, LoginResponse): + write_details_to_disk(resp, args.hs) + else: + print(f'UNKNOWN: failed to log in "{resp}"') + sys.exit(nagios.UNKNOWN) + else: + # Otherwise the config file exists, so we'll use the stored credentials + with open(CONFIG_FILE, "r") as f: + config = json.load(f) + client = AsyncClient(config["homeserver"]) + client.access_token = config["access_token"] + client.user_id = config["user_id"] + client.device_id = config["device_id"] + else: + await client.login(args.pw) + + await client.join(args.room) + + # Create a random image + imarray = np.random.rand(100, 100, 3) * 255 + im = Image.fromarray(imarray.astype('uint8')).convert('RGBA') + _, test_image_path = tempfile.mkstemp() + test_image_path = test_image_path + '.png' + im.save(test_image_path) + + # Send the image and get the event ID + image_event_id = (await send_image(client, args.room, test_image_path)).event_id + + # Get the event + image_event = (await client.room_get_event(args.room, image_event_id)).event + + # convert mxc:// to http:// + target_file_url = await client.mxc_to_http(image_event.url) + + # Check the headers. Ignore the non-async thing here, it doesn't + # matter in this situation. 
+ headers = dict(requests.head(target_file_url).headers) + + exit_code = nagios.OK + + # Check domain + domain = urllib.parse.urlparse(headers['location']).netloc + if domain != args.media_cdn_domain: + exit_code = nagios.CRITICAL + print(f'CRITICAL: media CDN domain is "{domain}"') + else: + print(f'OK: media CDN domain is "{domain}"') + + results = [verify_media_header('synapse-media-local-status', headers), verify_media_header('synapse-media-s3-status', headers, good_value='200'), verify_media_header('synapse-media-server', headers, good_value='s3'), + verify_media_header('Server', headers, good_value='cloudflare')] + for header_chk, code in results: + if code != nagios.OK: + exit_code = code + print(header_chk) + + # Clean up + await client.room_redact(args.room, image_event_id) + os.remove(test_image_path) + await client.close() + + requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning) + try: + r = requests.delete(f'{args.admin_endpoint}/_synapse/admin/v1/users/{args.user}/media', headers={'Authorization': f'Bearer {client.access_token}'}, verify=False) + if r.status_code != 200: + if nagios.WARNING < exit_code: + exit_code = nagios.WARNING + print(f"WARN: failed to purge media for this user, request failed with '{r.text}'") + except Exception as e: + if nagios.WARNING < exit_code: + exit_code = nagios.WARNING + print(f"WARN: failed to purge media for this user '{e}'") + + sys.exit(exit_code) + + +if __name__ == "__main__": + try: + asyncio.run(main()) + except Exception as e: + print(f'UNKNOWN: exception "{e}"') + sys.exit(nagios.UNKNOWN) diff --git a/Checks/Matrix Synapse/check_monitor_bot.py b/Checks/Matrix Synapse/check_monitor_bot.py new file mode 100644 index 0000000..208e2c3 --- /dev/null +++ b/Checks/Matrix Synapse/check_monitor_bot.py @@ -0,0 +1,130 @@ +#!/usr/bin/env python3 +import argparse +import json +import sys + +import numpy as np +import requests + +import nagios + +parser = argparse.ArgumentParser(description='') 
+parser.add_argument('--metrics-endpoint', required=True, help='Target URL to scrape.') +parser.add_argument('--domain', required=True, help='Our domain.') +parser.add_argument('--prometheus', action='store_true', help='Use Promethus instead of scraping the status page.') +parser.add_argument('--ignore', nargs='*', default=[], help='Ignore these hosts.') +parser.add_argument('--timeout', type=float, default=90, help='Request timeout limit.') +parser.add_argument('--warn', type=float, default=20, help='Manually set warn level.') +parser.add_argument('--crit', type=float, default=30, help='Manually set critical level.') +args = parser.parse_args() + +if args.prometheus: + from prometheus import parse_metrics + + r = requests.get(args.metrics_endpoint) + if r.status_code != 200: + sys.exit(nagios.UNKNOWN) + + metrics = {} + for item in parse_metrics(r.text)['monbot_ping_receive_delay_seconds']['monbot_ping_receive_delay_seconds_sum']: + if item.labels['receivingDomain'] not in metrics.keys(): + metrics[item.labels['receivingDomain']] = {} + metrics[item.labels['receivingDomain']][item.labels['sourceDomain']] = item.value + + pings = {'receiver': [], 'sender': [], } + for receiving_domain, senders in metrics.items(): + if receiving_domain == args.domain: + for k, v in senders.items(): + pings['receiver'].append(v) + else: + for k, v in senders.items(): + if k == args.domain: + pings['sender'].append(v) + + print(json.dumps(pings)) + + receiver_avg = np.round(np.average(pings['receiver']), 2) + sender_avg = np.round(np.average(pings['sender']), 2) + + print('receiver latency is', receiver_avg) + print('sender latency is', sender_avg) +else: + from bs4 import BeautifulSoup + import re + + # Split the values since icinga will quote the args + if len(args.ignore) == 1: + args.ignore = args.ignore[0].strip(' ').split(' ') + + + def get_sec(time_str): + """Get seconds from time.""" + h, m, s = time_str.split(':') + return int(h) * 3600 + int(m) * 60 + int(s) + + + def 
ms_to_s(s): + min_m = re.match(r'^(\d+)m([\d.]+)s', s) + if min_m: + return get_sec(f'0:{min_m.group(1)}:{int(float(min_m.group(2)))}') + elif s.endswith('ms'): + return float('0.' + s.strip('ms')) + elif s.endswith('s'): + return float(s.strip('ms')) + + + r = requests.get(args.metrics_endpoint) + if r.status_code != 200: + sys.exit(nagios.UNKNOWN) + soup = BeautifulSoup(r.text, 'html.parser') + tooltips = soup.find_all('span', {'class', 'tooltip'}) + data = {} + for item in tooltips: + m = re.match(r'\s*Send: (.*?)\s*\s*Receive: (.*?)\s*<\/span>', str(item)) + print(item) + if m: + domain = item.parent.parent.find('span', {'class': 'domain'}).text + data[domain] = { + 'send': ms_to_s(m.group(1)), + 'receive': ms_to_s(m.group(2)), + } + exit_code = nagios.OK + info_str = [] + data_str = [] + + if len(data.keys()) == 0: + print('UNKNOWN: failed to find any servers.') + sys.exit(nagios.UNKNOWN) + + for domain, values in data.items(): + if domain not in args.ignore: + if values['send'] >= args.crit: + info_str.append(f'CRITICAL: {domain} send is {values["send"]}s.') + exit_code = nagios.CRITICAL + elif values['send'] >= args.warn: + info_str.append(f'WARN: {domain} send is {values["send"]}s.') + if exit_code < nagios.WARNING: + exit_code = nagios.WARNING + # else: + # print(f'OK: {domain} send is {values["send"]}s.') + + if values['receive'] >= args.crit: + info_str.append(f'CRITICAL: {domain} receive is {values["receive"]}s.') + exit_code = nagios.CRITICAL + elif values['receive'] >= args.warn: + info_str.append(f'WARN: {domain} receive is {values["receive"]}s.') + if exit_code < nagios.WARNING: + exit_code = nagios.WARNING + # else: + # print(f'OK: {domain} receive is {values["receive"]}s.') + data_str.append( + f"'{domain}-send'={values['send']}s;;; '{domain}-receive'={values['receive']}s;;;" + ) + if any(('CRITICAL' not in s and 'WARNING' not in s) for s in info_str) or len(info_str) == 0: + print(f'OK: ping time is good.', end=' ') + else: + for x in info_str: + 
print(x, end=('\n' if info_str.index(x) + 1 < len(info_str) else '')) + print(f'|{" ".join(data_str)}') + + sys.exit(exit_code) diff --git a/Checks/Matrix Synapse/grafana.py b/Checks/Matrix Synapse/grafana.py new file mode 100644 index 0000000..d172c60 --- /dev/null +++ b/Checks/Matrix Synapse/grafana.py @@ -0,0 +1,378 @@ +import json + +import numpy as np +import requests +from urllib3.exceptions import InsecureRequestWarning + +requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning) + + +def get_avg_python_gc_time(api_key, interval, data_range, endpoint): + json_data = { + 'queries': [ + { + 'datasource': { + 'type': 'prometheus', + 'uid': 'AbuT5CJ4z', + }, + 'expr': 'rate(python_gc_time_sum{instance="10.0.0.34:9000",job=~"(federation-receiver|federation-sender|initialsync|synapse|synchrotron)",index=~".*"}[30s])/rate(python_gc_time_count[30s])', + 'format': 'time_series', + 'intervalFactor': 2, + 'refId': 'A', + 'step': 20, + 'target': '', + 'interval': '', + # 'key': 'Q-7edaea76-89bd-4b29-8412-a68bf4646712-0', + 'queryType': 'timeSeriesQuery', + 'exemplar': False, + # 'requestId': 'Q-7edaea76-89bd-4b29-8412-a68bf4646712-0A', + 'utcOffsetSec': -25200, + 'legendFormat': '', + 'datasourceId': 1, + 'intervalMs': interval * 1000, + # 'maxDataPoints': 1383, + }, + ], + 'from': f'now-{data_range}m', + 'to': 'now', + } + response = requests.post(f'{endpoint}/api/ds/query', headers={'Authorization': f'Bearer {api_key}'}, json=json_data, verify=False).json() + good = [] + for i in response['results']['A']['frames']: + # This one can sometimes be null + new = [] + for x in range(len(i['data']['values'][1])): + if i['data']['values'][1][x] is not None: + new.append(i['data']['values'][1][x]) + good.append(new) + # Remove empty arrays + results = [] + for x in good: + if len(x) > 0: + results.append(x) + return [np.round(np.average(i), 5) for i in results] + + +def get_outgoing_http_request_rate(api_key, interval, data_range, endpoint): + json_data = { + 
'queries': [ + { + 'datasource': { + 'type': 'prometheus', + 'uid': 'AbuT5CJ4z', + }, + 'editorMode': 'code', + 'expr': 'rate(synapse_http_client_requests_total{job=~"(federation-receiver|federation-sender|initialsync|synapse|synchrotron)",index=~".*",instance="10.0.0.34:9000"}[2m])', + 'range': True, + 'refId': 'A', + 'interval': '', + # 'key': 'Q-8b3dabd7-358e-45ed-a9ba-7be3f5fcf274-0', + 'queryType': 'timeSeriesQuery', + 'exemplar': False, + # 'requestId': 'Q-8b3dabd7-358e-45ed-a9ba-7be3f5fcf274-0Q-c5c08c6b-7591-424c-8eac-53837fa51e89-1A', + 'utcOffsetSec': -25200, + 'legendFormat': '', + 'datasourceId': 1, + 'intervalMs': interval * 1000, + # 'maxDataPoints': 10, + }, + { + 'datasource': { + 'type': 'prometheus', + 'uid': 'AbuT5CJ4z', + }, + 'editorMode': 'code', + 'expr': 'rate(synapse_http_matrixfederationclient_requests_total{job=~"(federation-receiver|federation-sender|initialsync|synapse|synchrotron)",index=~".*",instance="10.0.0.34:9000"}[2m])', + 'range': True, + 'refId': 'B', + 'interval': '', + # 'key': 'Q-c5c08c6b-7591-424c-8eac-53837fa51e89-1', + 'queryType': 'timeSeriesQuery', + 'exemplar': False, + # 'requestId': 'Q-8b3dabd7-358e-45ed-a9ba-7be3f5fcf274-0Q-c5c08c6b-7591-424c-8eac-53837fa51e89-1B', + 'utcOffsetSec': -25200, + 'legendFormat': '', + 'datasourceId': 1, + 'intervalMs': interval * 1000, + # 'maxDataPoints': 10, + }, + ], + 'from': f'now-{data_range}m', + 'to': 'now', + } + response = requests.post(f'{endpoint}/api/ds/query', headers={'Authorization': f'Bearer {api_key}'}, json=json_data, verify=False).json() + output = {} + for letter, result in response['results'].items(): + name = result['frames'][0]['schema']['name'].split('=')[-1].strip('}').strip('"') + output[name] = np.round(np.average(result['frames'][0]['data']['values'][1]), 2) + return output + # return { + # 'GET': np.round(np.average(response['results']['A']['frames'][0]['data']['values'][1]), 2), + # 'POST': 
np.round(np.average(response['results']['A']['frames'][1]['data']['values'][1]), 2), + # 'PUT': np.round(np.average(response['results']['A']['frames'][2]['data']['values'][1]), 2), + # 'fedr_GET': np.round(np.average(response['results']['B']['frames'][0]['data']['values'][1]), 2) + # } + + +def get_event_send_time(api_key, interval, data_range, endpoint): + json_data = { + 'queries': [ + { + 'datasource': { + 'type': 'prometheus', + 'uid': 'AbuT5CJ4z', + }, + 'expr': 'histogram_quantile(0.99, sum(rate(synapse_http_server_response_time_seconds_bucket{servlet=\'RoomSendEventRestServlet\',index=~".*",instance="10.0.0.34:9000",code=~"2.."}[2m])) by (le))', + 'format': 'time_series', + 'intervalFactor': 1, + 'refId': 'D', + 'interval': '', + # 'key': 'Q-d8eb3572-9aea-4a73-92f2-e08b33c21ecb-0', + 'editorMode': 'builder', + 'range': True, + 'instant': True, + 'queryType': 'timeSeriesQuery', + 'exemplar': False, + # 'requestId': 'Q-d8eb3572-9aea-4a73-92f2-e08b33c21ecb-0Q-a9222e59-18ff-4b3b-80ae-27bea8f149a9-1Q-0378a458-1ade-410e-a4b3-ae4aaa91d709-2Q-da4c00b6-61c1-49f5-8a0a-9f19990acfb7-3Q-21254889-3cf6-4d97-8dc5-ddf68360847e-4Q-502b8ed5-4050-461c-befc-76f6796dce68-5Q-364dc896-c399-4e58-8930-cba2e3d1d579-6Q-9072e904-da8d-4b00-b454-dac45b7c38f0-7D', + 'utcOffsetSec': -25200, + 'legendFormat': '', + 'datasourceId': 1, + 'intervalMs': interval * 1000, + # 'maxDataPoints': 1383, + }, + { + 'datasource': { + 'type': 'prometheus', + 'uid': 'AbuT5CJ4z', + }, + 'expr': 'histogram_quantile(0.9, sum(rate(synapse_http_server_response_time_seconds_bucket{servlet=\'RoomSendEventRestServlet\',index=~".*",instance="10.0.0.34:9000",code=~"2.."}[2m])) by (le))', + 'format': 'time_series', + 'interval': '', + 'intervalFactor': 1, + 'refId': 'A', + # 'key': 'Q-a9222e59-18ff-4b3b-80ae-27bea8f149a9-1', + 'queryType': 'timeSeriesQuery', + 'exemplar': False, + # 'requestId': 
'Q-d8eb3572-9aea-4a73-92f2-e08b33c21ecb-0Q-a9222e59-18ff-4b3b-80ae-27bea8f149a9-1Q-0378a458-1ade-410e-a4b3-ae4aaa91d709-2Q-da4c00b6-61c1-49f5-8a0a-9f19990acfb7-3Q-21254889-3cf6-4d97-8dc5-ddf68360847e-4Q-502b8ed5-4050-461c-befc-76f6796dce68-5Q-364dc896-c399-4e58-8930-cba2e3d1d579-6Q-9072e904-da8d-4b00-b454-dac45b7c38f0-7A', + 'utcOffsetSec': -25200, + 'legendFormat': '', + 'datasourceId': 1, + 'intervalMs': interval * 1000, + # 'maxDataPoints': 1383, + }, + { + 'datasource': { + 'type': 'prometheus', + 'uid': 'AbuT5CJ4z', + }, + 'expr': 'histogram_quantile(0.75, sum(rate(synapse_http_server_response_time_seconds_bucket{servlet=\'RoomSendEventRestServlet\',index=~".*",instance="10.0.0.34:9000",code=~"2.."}[2m])) by (le))', + 'format': 'time_series', + 'intervalFactor': 1, + 'refId': 'C', + 'interval': '', + # 'key': 'Q-0378a458-1ade-410e-a4b3-ae4aaa91d709-2', + 'queryType': 'timeSeriesQuery', + 'exemplar': False, + # 'requestId': 'Q-d8eb3572-9aea-4a73-92f2-e08b33c21ecb-0Q-a9222e59-18ff-4b3b-80ae-27bea8f149a9-1Q-0378a458-1ade-410e-a4b3-ae4aaa91d709-2Q-da4c00b6-61c1-49f5-8a0a-9f19990acfb7-3Q-21254889-3cf6-4d97-8dc5-ddf68360847e-4Q-502b8ed5-4050-461c-befc-76f6796dce68-5Q-364dc896-c399-4e58-8930-cba2e3d1d579-6Q-9072e904-da8d-4b00-b454-dac45b7c38f0-7C', + 'utcOffsetSec': -25200, + 'legendFormat': '', + 'datasourceId': 1, + 'intervalMs': interval * 1000, + # 'maxDataPoints': 1383, + }, + { + 'datasource': { + 'type': 'prometheus', + 'uid': 'AbuT5CJ4z', + }, + 'expr': 'histogram_quantile(0.5, sum(rate(synapse_http_server_response_time_seconds_bucket{servlet=\'RoomSendEventRestServlet\',index=~".*",instance="10.0.0.34:9000",code=~"2.."}[2m])) by (le))', + 'format': 'time_series', + 'intervalFactor': 1, + 'refId': 'B', + 'interval': '', + # 'key': 'Q-da4c00b6-61c1-49f5-8a0a-9f19990acfb7-3', + 'queryType': 'timeSeriesQuery', + 'exemplar': False, + # 'requestId': 
'Q-d8eb3572-9aea-4a73-92f2-e08b33c21ecb-0Q-a9222e59-18ff-4b3b-80ae-27bea8f149a9-1Q-0378a458-1ade-410e-a4b3-ae4aaa91d709-2Q-da4c00b6-61c1-49f5-8a0a-9f19990acfb7-3Q-21254889-3cf6-4d97-8dc5-ddf68360847e-4Q-502b8ed5-4050-461c-befc-76f6796dce68-5Q-364dc896-c399-4e58-8930-cba2e3d1d579-6Q-9072e904-da8d-4b00-b454-dac45b7c38f0-7B', + 'utcOffsetSec': -25200, + 'legendFormat': '', + 'datasourceId': 1, + 'intervalMs': interval * 1000, + # 'maxDataPoints': 1383, + }, + { + 'datasource': { + 'type': 'prometheus', + 'uid': 'AbuT5CJ4z', + }, + 'expr': 'histogram_quantile(0.25, sum(rate(synapse_http_server_response_time_seconds_bucket{servlet=\'RoomSendEventRestServlet\',index=~".*",instance="10.0.0.34:9000",code=~"2.."}[2m])) by (le))', + 'refId': 'F', + 'interval': '', + # 'key': 'Q-21254889-3cf6-4d97-8dc5-ddf68360847e-4', + 'queryType': 'timeSeriesQuery', + 'exemplar': False, + # 'requestId': 'Q-d8eb3572-9aea-4a73-92f2-e08b33c21ecb-0Q-a9222e59-18ff-4b3b-80ae-27bea8f149a9-1Q-0378a458-1ade-410e-a4b3-ae4aaa91d709-2Q-da4c00b6-61c1-49f5-8a0a-9f19990acfb7-3Q-21254889-3cf6-4d97-8dc5-ddf68360847e-4Q-502b8ed5-4050-461c-befc-76f6796dce68-5Q-364dc896-c399-4e58-8930-cba2e3d1d579-6Q-9072e904-da8d-4b00-b454-dac45b7c38f0-7F', + 'utcOffsetSec': -25200, + 'legendFormat': '', + 'datasourceId': 1, + 'intervalMs': interval * 1000, + # 'maxDataPoints': 1383, + }, + { + 'datasource': { + 'type': 'prometheus', + 'uid': 'AbuT5CJ4z', + }, + 'expr': 'histogram_quantile(0.05, sum(rate(synapse_http_server_response_time_seconds_bucket{servlet=\'RoomSendEventRestServlet\',index=~".*",instance="10.0.0.34:9000",code=~"2.."}[2m])) by (le))', + 'refId': 'G', + 'interval': '', + # 'key': 'Q-502b8ed5-4050-461c-befc-76f6796dce68-5', + 'queryType': 'timeSeriesQuery', + 'exemplar': False, + # 'requestId': 
'Q-d8eb3572-9aea-4a73-92f2-e08b33c21ecb-0Q-a9222e59-18ff-4b3b-80ae-27bea8f149a9-1Q-0378a458-1ade-410e-a4b3-ae4aaa91d709-2Q-da4c00b6-61c1-49f5-8a0a-9f19990acfb7-3Q-21254889-3cf6-4d97-8dc5-ddf68360847e-4Q-502b8ed5-4050-461c-befc-76f6796dce68-5Q-364dc896-c399-4e58-8930-cba2e3d1d579-6Q-9072e904-da8d-4b00-b454-dac45b7c38f0-7G', + 'utcOffsetSec': -25200, + 'legendFormat': '', + 'datasourceId': 1, + 'intervalMs': interval * 1000, + # 'maxDataPoints': 1383, + }, + { + 'datasource': { + 'type': 'prometheus', + 'uid': 'AbuT5CJ4z', + }, + 'expr': 'sum(rate(synapse_http_server_response_time_seconds_sum{servlet=\'RoomSendEventRestServlet\',index=~".*",instance="10.0.0.34:9000",code=~"2.."}[2m])) / sum(rate(synapse_http_server_response_time_seconds_count{servlet=\'RoomSendEventRestServlet\',index=~".*",instance="10.0.0.34:9000",code=~"2.."}[2m]))', + 'refId': 'H', + 'interval': '', + # 'key': 'Q-364dc896-c399-4e58-8930-cba2e3d1d579-6', + 'queryType': 'timeSeriesQuery', + 'exemplar': False, + # 'requestId': 'Q-d8eb3572-9aea-4a73-92f2-e08b33c21ecb-0Q-a9222e59-18ff-4b3b-80ae-27bea8f149a9-1Q-0378a458-1ade-410e-a4b3-ae4aaa91d709-2Q-da4c00b6-61c1-49f5-8a0a-9f19990acfb7-3Q-21254889-3cf6-4d97-8dc5-ddf68360847e-4Q-502b8ed5-4050-461c-befc-76f6796dce68-5Q-364dc896-c399-4e58-8930-cba2e3d1d579-6Q-9072e904-da8d-4b00-b454-dac45b7c38f0-7H', + 'utcOffsetSec': -25200, + 'legendFormat': '', + 'datasourceId': 1, + 'intervalMs': interval * 1000, + # 'maxDataPoints': 1383, + }, + { + 'datasource': { + 'type': 'prometheus', + 'uid': 'AbuT5CJ4z', + }, + 'expr': 'sum(rate(synapse_storage_events_persisted_events_total{instance="10.0.0.34:9000"}[2m]))', + 'hide': False, + 'instant': False, + 'refId': 'E', + 'interval': '', + # 'key': 'Q-9072e904-da8d-4b00-b454-dac45b7c38f0-7', + 'editorMode': 'code', + 'queryType': 'timeSeriesQuery', + 'exemplar': False, + # 'requestId': 
def get_waiting_for_db(api_key, interval, data_range, endpoint):
    """Average time (sec) Synapse spends waiting for a DB connection.

    Queries Grafana's datasource proxy for the storage-schedule-time rate and
    returns the mean over the requested window, rounded to 5 places.

    api_key    -- Grafana API token (sent as a Bearer header).
    interval   -- scrape interval in seconds (forwarded as intervalMs).
    data_range -- look-back window in minutes.
    endpoint   -- Grafana base URL.
    """
    json_data = {
        'queries': [
            {
                'datasource': {'type': 'prometheus', 'uid': 'AbuT5CJ4z'},
                'expr': 'rate(synapse_storage_schedule_time_sum{instance="10.0.0.34:9000",job=~"(federation-receiver|federation-sender|initialsync|synapse|synchrotron)",index=~".*"}[30s])/rate(synapse_storage_schedule_time_count[30s])',
                'format': 'time_series',
                'intervalFactor': 2,
                'refId': 'A',
                'step': 20,
                'interval': '',
                'queryType': 'timeSeriesQuery',
                'exemplar': False,
                'utcOffsetSec': -25200,
                'legendFormat': '',
                'datasourceId': 1,
                'intervalMs': interval * 1000,
            },
        ],
        'from': f'now-{data_range}m',
        'to': 'now',
    }
    response = requests.post(f'{endpoint}/api/ds/query', headers={'Authorization': f'Bearer {api_key}'}, json=json_data, verify=False).json()
    return np.round(np.average(response['results']['A']['frames'][0]['data']['values'][1]), 5)


def get_stateres_worst_case(api_key, interval, data_range, endpoint):
    """
    CPU and DB time spent on the most expensive state resolution in a room,
    summed over all workers.

    This is a very rough proxy for "how fast is state res", but it doesn't
    accurately represent the system load (e.g. it completely ignores cheap
    state resolutions).

    Returns the combined (DB + CPU) average rate over the window, rounded
    to 5 places.
    """
    json_data = {
        'queries': [
            {
                'datasource': {'type': 'prometheus', 'uid': 'AbuT5CJ4z'},
                'exemplar': False,
                'expr': 'sum(rate(synapse_state_res_db_for_biggest_room_seconds_total{instance="10.0.0.34:9000"}[1m]))',
                'format': 'time_series',
                'hide': False,
                'instant': False,
                'interval': '',
                'refId': 'B',
                'queryType': 'timeSeriesQuery',
                'utcOffsetSec': -25200,
                'legendFormat': '',
                'datasourceId': 1,
                'intervalMs': 15000,
                'maxDataPoints': 1863,
            },
            {
                'datasource': {'type': 'prometheus', 'uid': 'AbuT5CJ4z'},
                'exemplar': False,
                'expr': 'sum(rate(synapse_state_res_cpu_for_biggest_room_seconds_total{instance="10.0.0.34:9000"}[1m]))',
                'format': 'time_series',
                'hide': False,
                'instant': False,
                'interval': '',
                'refId': 'C',
                'queryType': 'timeSeriesQuery',
                'utcOffsetSec': -25200,
                'legendFormat': '',
                'datasourceId': 1,
                'intervalMs': 15000,
                'maxDataPoints': 1863,
            },
        ],
        # BUG FIX: removed the stale hard-coded absolute 'range' block
        # (fixed 2023-02-23 timestamps copied from a browser session); the
        # dynamic 'from'/'to' below describe the intended window.
        'from': f'now-{data_range}m',
        'to': 'now',
    }
    response = requests.post(f'{endpoint}/api/ds/query', headers={'Authorization': f'Bearer {api_key}'}, json=json_data, verify=False).json()
    # BUG FIX: the original issued the query but fell off the end, always
    # returning None. Aggregate like the sibling helpers do.
    # NOTE(review): summing DB (refId B) + CPU (refId C) matches the
    # docstring's "CPU and DB time ... summed" — confirm against the caller.
    db_time = np.average(response['results']['B']['frames'][0]['data']['values'][1])
    cpu_time = np.average(response['results']['C']['frames'][0]['data']['values'][1])
    return np.round(db_time + cpu_time, 5)
# Nagios-style check of Matrix Synapse health metrics pulled from Grafana.
parser = argparse.ArgumentParser(description='Check Matrix Synapse health metrics via Grafana.')
parser.add_argument('--grafana-server', required=True, help='Grafana server.')
parser.add_argument('--synapse-server', required=True, help='Matrix Synapse server.')
parser.add_argument('--grafana-api-key', required=True)
parser.add_argument('--interval', default=15, type=int, help='Data interval in seconds.')
parser.add_argument('--range', default=2, type=int, help='Data range in minutes. Used for comparison and averaging.')
parser.add_argument('--type', required=True, choices=['gc-time', 'response-time', 'outgoing-http-rate', 'avg-send', 'db-lag'])
parser.add_argument('--warn', type=float, help='Manually set warn level.')
parser.add_argument('--crit', type=float, help='Manually set critical level.')
args = parser.parse_args()


def _report(label, value, default_crit, inclusive=False):
    """Print a nagios status line for *value* and exit.

    CRITICAL when value exceeds --crit (or *default_crit* if --crit was not
    given); WARNING when it exceeds --warn (only when --warn was given —
    implements the former 'add warn support' TODO); OK otherwise.
    *inclusive* makes the comparison >= instead of > (matches the original
    gc-time check).
    """
    # BUG FIX: the original used `X if not args.crit else args.crit`, which
    # silently ignored an explicit `--crit 0`.
    crit = args.crit if args.crit is not None else default_crit
    breach = (lambda v, t: v >= t) if inclusive else (lambda v, t: v > t)
    if breach(value, crit):
        print(f'CRITICAL: {label} is {value} sec.')
        sys.exit(nagios.CRITICAL)
    if args.warn is not None and breach(value, args.warn):
        print(f'WARNING: {label} is {value} sec.')
        sys.exit(nagios.WARNING)
    print(f'OK: {label} is {value} sec.')
    sys.exit(nagios.OK)


if args.type == 'gc-time':
    # Average Python GC time per collection, in seconds.
    try:
        value = np.round(np.average(get_avg_python_gc_time(args.grafana_api_key, args.interval, args.range, args.grafana_server)), 5)
    except Exception as e:
        print(f'UNKNOWN: failed to check avg. GC time "{e}"')
        sys.exit(nagios.UNKNOWN)
    _report('average GC time per collection', value, default_crit=0.002, inclusive=True)
elif args.type == 'response-time':
    # Average wall-clock time of 10 HTTP pings against the homeserver.
    timeout = 10
    try:
        samples = []
        for _ in range(10):
            start = time.perf_counter()
            try:
                requests.post(args.synapse_server, timeout=timeout, verify=False)
            except Exception as e:
                print(f'UNKNOWN: failed to ping endpoint "{e}"')
                sys.exit(nagios.UNKNOWN)
            samples.append(np.round(time.perf_counter() - start, 2))
            time.sleep(1)
        value = np.round(np.average(samples), 2)
    except Exception as e:
        print(f'UNKNOWN: failed to check response time "{e}"')
        sys.exit(nagios.UNKNOWN)
    _report('response time', value, default_crit=1)
elif args.type == 'outgoing-http-rate':
    # Outgoing req/sec, reported per destination key by the Grafana helper.
    crit = args.crit if args.crit is not None else 10
    # BUG FIX: the original's try/except was commented out, so any Grafana
    # failure crashed with a traceback instead of exiting UNKNOWN.
    try:
        rates = get_outgoing_http_request_rate(args.grafana_api_key, args.interval, args.range, args.grafana_server)
        failed = {k: v for k, v in rates.items() if v > crit}
        warned = {k: v for k, v in rates.items() if args.warn is not None and v > args.warn and k not in failed}
        if failed:
            print(f'CRITICAL: outgoing HTTP request rate for {failed} req/sec.')
            sys.exit(nagios.CRITICAL)
        if warned:
            print(f'WARNING: outgoing HTTP request rate for {warned} req/sec.')
            sys.exit(nagios.WARNING)
        print(f'OK: outgoing HTTP request rate is {rates} req/sec.')
        sys.exit(nagios.OK)
    except Exception as e:
        print(f'UNKNOWN: failed to check outgoing HTTP request rate "{e}"')
        sys.exit(nagios.UNKNOWN)
elif args.type == 'avg-send':
    # Average message send time, in seconds.
    try:
        value = get_event_send_time(args.grafana_api_key, args.interval, args.range, args.grafana_server)
    except Exception as e:
        print(f'UNKNOWN: failed to check average message send time "{e}"')
        sys.exit(nagios.UNKNOWN)
    _report('average message send time', value, default_crit=1)
elif args.type == 'db-lag':
    # Time spent waiting for a DB connection, in seconds.
    try:
        value = get_waiting_for_db(args.grafana_api_key, args.interval, args.range, args.grafana_server)
    except Exception as e:
        print(f'UNKNOWN: failed to check DB lag "{e}"')
        sys.exit(nagios.UNKNOWN)
    _report('DB lag', value, default_crit=0.01)
else:
    print('Wrong type')
    sys.exit(nagios.UNKNOWN)
def handle_err(func):
    """Decorator for helpers that return a ``(critical, result)`` pair.

    If the wrapped call raises, print an UNKNOWN line and exit; if the first
    element of the pair is truthy, print a CRITICAL line and exit; otherwise
    hand back the second element alone.
    """

    def wrapper(*args, **kwargs):
        try:
            crit, ret = func(*args, **kwargs)
        except Exception as exc:
            print(f"UNKNOWN: exception '{exc}'")
            sys.exit(nagios.UNKNOWN)
        if crit:
            print(f"CRITICAL: {crit}")
            sys.exit(nagios.CRITICAL)
        return ret

    return wrapper


@handle_err
def login(user_id: str, passwd: str, homeserver: str):
    """Password-login *user_id* against *homeserver*; returns the login JSON."""
    payload = {'type': 'm.login.password', 'user': user_id, 'password': passwd}
    resp = requests.post(f'{homeserver}/_matrix/client/r0/login', json=payload)
    if resp.status_code == 200:
        return None, resp.json()
    return f'Bad status code on login for {user_id}: {resp.status_code}\nBody: {resp.text}', None


@handle_err
def create_room(room_name, homeserver, auth_token):
    """
    Creates an unencrypted room.
    """
    payload = {
        "name": room_name,
        "preset": "private_chat",
        "visibility": "private",
        # "initial_state": [{"type": "m.room.guest_access", "state_key": "", "content": {"guest_access": "can_join"}}]
    }
    resp = requests.post(f'{homeserver}/_matrix/client/r0/createRoom?access_token={auth_token}', json=payload)
    if resp.status_code == 200:
        return None, resp.json()
    return Exception(f'Bad status code on create room for {room_name}: {resp.status_code}\nBody: {resp.text}'), None


@handle_err
def send_invite(room_id, target_user_id, homeserver, auth_token):
    """Invite *target_user_id* into *room_id*."""
    resp = requests.post(f'{homeserver}/_matrix/client/r0/rooms/{room_id}/invite?access_token={auth_token}', json={'user_id': target_user_id})
    if resp.status_code == 200:
        return None, resp.json()
    return Exception(f'Bad status code on send invite for {room_id}: {resp.status_code}\nBody: {resp.text}'), None


@handle_err
def join_room(room_id, homeserver, auth_token):
    """Join *room_id* via the generic /join endpoint."""
    resp = requests.post(f'{homeserver}/_matrix/client/r0/join/{room_id}?access_token={auth_token}', data='{}')
    if resp.status_code == 200:
        return None, resp.json()
    return Exception(f'Bad status code on join room for {room_id}: {resp.status_code}\nBody: {resp.text}'), None


@handle_err
def join_room_invite(room_id, homeserver, auth_token):
    """Accept a pending invite by joining through the room-scoped endpoint."""
    resp = requests.post(f'{homeserver}/_matrix/client/r0/rooms/{room_id}/join?access_token={auth_token}', data='{}')
    if resp.status_code == 200:
        return None, resp.json()
    return Exception(f'Bad status code on join room via invite for {room_id}: {resp.status_code}\nBody: {resp.text}'), None


@handle_err
def send_msg(message, room_id, homeserver, auth_token):
    """Send a plain m.text message into *room_id*."""
    resp = requests.post(f'{homeserver}/_matrix/client/r0/rooms/{room_id}/send/m.room.message?access_token={auth_token}', json={'msgtype': 'm.text', 'body': message})
    if resp.status_code == 200:
        return None, resp.json()
    return Exception(f'Bad status code on send message for {room_id}: {resp.status_code}\nBody: {resp.text}'), None


# errors will be handled in the other script
def get_event(event_id, room_id, homeserver, auth_token):
    """Fetch one event; returns the raw ``requests`` response (no error handling)."""
    return requests.get(f'{homeserver}/_matrix/client/v3/rooms/{room_id}/event/{event_id}?access_token={auth_token}')
@handle_err
def get_state(homeserver, auth_token, since=None):
    """Run a /sync against the homeserver; returns the sync payload.

    since -- optional sync token for an incremental sync.
    """
    if since:
        # BUG FIX: was '?since{since}&...' — the missing '=' produced a
        # malformed query string, so the since token was silently ignored
        # and every call did a full initial sync.
        url = f'{homeserver}/_matrix/client/r0/sync?since={since}&access_token={auth_token}'
    else:
        url = f'{homeserver}/_matrix/client/r0/sync?access_token={auth_token}'
    r = requests.get(url)
    if r.status_code != 200:
        return Exception(f'Bad status code on sync: {r.status_code}\nBody: {r.text}'), None
    return None, r.json()


@handle_err
def forget_room(room_id, homeserver, auth_token):
    """Forget *room_id* (the room must already have been left)."""
    r = requests.post(f'{homeserver}/_matrix/client/r0/rooms/{room_id}/forget?access_token={auth_token}', data='{}')
    if r.status_code != 200:
        # BUG FIX: error message said 'leave room' (copy-paste from leave_room).
        return Exception(f'Bad status code on forget room for {room_id}: {r.status_code}\nBody: {r.text}'), None
    return None, r.json()


@handle_err
def leave_room(room_id, homeserver, auth_token, forget=False):
    """Leave *room_id*; optionally forget it afterwards.

    forget -- when True, also call forget_room (which exits via handle_err
    on failure; its return value is intentionally unused).
    """
    r = requests.post(f'{homeserver}/_matrix/client/r0/rooms/{room_id}/leave?access_token={auth_token}', data='{}')
    if r.status_code != 200:
        return Exception(f'Bad status code on leave room for {room_id}: {r.status_code}\nBody: {r.text}'), None
    if forget:
        forget_room(room_id, homeserver, auth_token)  # was `f = ...` with f never read
    return None, r.json()
EXIT_OK = 0
EXIT_WARNING = 1
EXIT_CRITICAL = 2
EXIT_UNKNONW = 3  # (sic) name kept for backward compatibility; means UNKNOWN
EXIT_INVALID_AUTH = 3


class MonitoringPluginRedis(object):
    """Nagios/Icinga plugin that checks a redis-server via the INFO command."""

    def __init__(self):
        """Parse CLI args and connect to redis; exits CRITICAL on any
        connection failure (timeout, auth, DNS...)."""
        cli_args = self.parse_args()

        self.host = cli_args.host
        self.port = cli_args.port
        self.password = cli_args.password
        self.dbname = cli_args.dbname
        self.timeout = cli_args.timeout
        self.key = cli_args.key_value
        self.warning = cli_args.warning
        self.critical = cli_args.critical

        try:
            # Imported lazily so a missing redis module is reported as a
            # plugin failure rather than an unhandled ImportError.
            import redis
            self.conn = redis.Redis(
                host=self.host,
                port=self.port,
                password=self.password,
                socket_timeout=self.timeout
            )
            self.info_out = self.conn.info()
            self.conn.ping()

        except Exception as e:
            print(f"CRITICAL REDIS : {e}")
            sys.exit(2)

    def parse_args(self):
        """Build and evaluate the command-line interface."""
        parser = argparse.ArgumentParser(
            description="monitoring plugin for redis-server, version: 1.0"
        )
        parser.add_argument(
            "-H", "--host",
            dest="host",
            help="Redis server to connect to. (default is 127.0.0.1)",
            default="127.0.0.1"
        )
        parser.add_argument(
            "-p", "--port",
            dest="port",
            help="Redis port to connect to. (default is 6379)",
            type=int,
            default=6379
        )
        parser.add_argument(
            "-P", "--password",
            dest="password",
            help="Redis password to connect to.",
            default=''
        )
        parser.add_argument(
            "-d", "--dbname",
            dest="dbname",
            help="Redis database name (default is db0)",
            default='db0'
        )
        parser.add_argument(
            "-t", "--timeout",
            dest="timeout",
            help="Number of seconds to wait before timing out and considering redis down",
            type=int,
            default=2
        )
        parser.add_argument(
            "-w", "--warning",
            dest="warning",
            type=int,
            help="Warning threshold."
        )
        parser.add_argument(
            "-c", "--critical",
            dest="critical",
            type=int,
            help="Critical threshold."
        )
        parser.add_argument(
            "-k", "--key",
            dest="key_value",
            help="Stat to monitor (memory_mb, hit_ratio, or custom)",
            default=None
        )

        return parser.parse_args()

    def get_version(self):
        """Redis server version from INFO."""
        return f"version: {self.info_out.get('redis_version')}"

    def get_client_connection(self):
        """Number of currently connected clients."""
        return f"connected_clients: {self.info_out.get('connected_clients')}"

    def get_number_keys(self):
        """Key count for the selected logical database (e.g. db0)."""
        return f"{self.dbname}: {self.info_out.get(self.dbname).get('keys')}"

    def get_uptime(self):
        """Server uptime in days."""
        return f"uptime_in_days: {self.info_out.get('uptime_in_days')}"

    def get_used_memory(self):
        """Human-readable memory usage."""
        return f"used_memory_human: {self.info_out.get('used_memory_human')}"

    def check(self):
        """Evaluate the requested stat (or print a general status line) and
        exit with the matching nagios code."""
        number_keys = ''
        version = self.get_version()
        client_connected = self.get_client_connection()
        reverse_check = False
        exit_string = "OK"

        if self.dbname in str(self.info_out):
            number_keys = self.get_number_keys()

        memory = self.get_used_memory()
        uptime = self.get_uptime()

        if self.key:
            # A specific stat was requested: both thresholds are mandatory.
            if not self.warning or not self.critical:
                if not self.warning:
                    status = "UNKNOWN: Warning level required"
                if not self.critical:
                    # BUG FIX: message read "UNKNONW: ..." (typo).
                    status = "UNKNOWN: Critical level required"
                print(status)
                sys.exit(EXIT_UNKNONW)

            if self.key == "memory_mb":
                # Lower-is-better metric: thresholds act as floors, not ceilings.
                reverse_check = True
                info_value = int(
                    self.info_out.get("used_memory_rss") or self.info_out.get("used_memory")
                ) / (1024 * 1024)
            elif self.key == "hit_ratio":
                reverse_check = False
                hit = int(self.info_out.get("keyspace_hits"))
                miss = int(self.info_out.get("keyspace_misses"))
                # BUG FIX: the original required BOTH hits AND misses > 0, so
                # a cache with zero misses reported a 0% hit ratio.
                if hit + miss > 0:
                    info_value = 100 * hit / (hit + miss)
                else:
                    info_value = 0
            else:
                # Any other key is read straight from INFO and must be an int.
                info_value = int(self.info_out.get(self.key))

            if reverse_check:
                if int(info_value) < int(self.critical):
                    exit_string = "CRITICAL"
                elif int(info_value) < int(self.warning):
                    exit_string = "WARNING"
            else:
                if int(info_value) > int(self.critical):
                    exit_string = "CRITICAL"
                elif int(info_value) > int(self.warning):
                    exit_string = "WARNING"

            status = f"{exit_string}: Redis {self.key} is {info_value}"
            perfdata = f"{self.key}={info_value};{self.warning};{self.critical};0;{info_value}"
            print(f"{status} || {perfdata}")

        else:
            if number_keys == '':
                status = f"OK REDIS No keys, {version}, {memory}, {uptime}"
            else:
                status = f"OK REDIS {version}, {client_connected}, {number_keys}, {memory}, {uptime}"
            print(status)

        if exit_string == "OK":
            sys.exit(EXIT_OK)
        if exit_string == "WARNING":
            sys.exit(EXIT_WARNING)
        # Note: the original also tested `exit_string == "UNKNONW"`, which was
        # unreachable (the UNKNOWN path exits earlier); only CRITICAL remains.
        sys.exit(EXIT_CRITICAL)


if __name__ == "__main__":
    server = MonitoringPluginRedis()
    server.check()
import json
from pathlib import Path

from flask import Flask, Response, request
from icinga2api.client import Client

# Icinga2 API client. Credentials are hard-coded placeholders — presumably
# replaced per deployment; verify before use.
client = Client('https://localhost:8080', 'icingaweb2', 'password1234')

# Nagios-style host/service states.
OK = 0
WARNING = 1
CRITICAL = 2
UNKNOWN = 3

app = Flask(__name__)


def return_json(json_dict, start_response, status_code=200):
    # NOTE(review): raw-WSGI style helper (takes start_response); none of the
    # Flask routes below call it — looks like dead code. Confirm before removal.
    headers = [('Content-Type', 'application/json')]
    start_response(str(status_code), headers)
    return iter([json.dumps(json_dict).encode('utf-8')])


@app.route('/host')
@app.route('/host/')
@app.route("/host/")  # NOTE(review): likely meant "/host/<hostid>" — as written, hostid is never bound and every request 406s. Verify against the original file.
def get_host_state(hostid=None):
    """Return the Icinga state of *hostid* (and optionally selected services)
    as JSON shaped for an Uptime Kuma HTTP monitor.

    Query params:
      service  -- repeatable; restrict output to the named services.
      kuma     -- 'true' enables Kuma mode (non-2xx response on any failure).
    """
    path = Path(request.base_url)  # NOTE(review): unused — candidate for removal
    args_service = request.args.getlist('service')
    kuma_mode = True if request.args.get('kuma') == 'true' else False

    if not hostid:
        return Response(json.dumps({'error': 'must specify host'}), status=406, mimetype='application/json')

    result = {
        'host': {},
        'services': {},
        'failed_services': []
    }

    # Look up the host by exact name via an Icinga filter variable.
    host_status = client.objects.list('Host', filters='match(hpattern, host.name)', filter_vars={'hpattern': hostid})
    if not len(host_status):
        return Response(json.dumps({'error': 'could not find host'}), status=404, mimetype='application/json')
    else:
        host_status = host_status[0]

    result['host'] = {
        'name': host_status['name'],
        # Acknowledged problems are masked to OK (0); the raw Icinga state is
        # preserved in 'actual_state'.
        'state': 0 if (host_status['attrs']['acknowledgement'] or host_status['attrs']['acknowledgement_expiry']) else host_status['attrs']['state'],
        'actual_state': host_status['attrs']['state'],
        'attrs': {
            **host_status['attrs']
        }
    }

    # All services attached to the host; Icinga service object names are
    # "host!service", so split off the service part.
    services_status = client.objects.list('Service', filters='match(hpattern, host.name)', filter_vars={'hpattern': hostid})
    for attrs in services_status:
        name = attrs['name'].split('!')[1]
        result['services'][name] = {
            'state': 0 if (attrs['attrs']['acknowledgement'] or attrs['attrs']['acknowledgement_expiry']) else attrs['attrs']['state'],
            'actual_state': attrs['attrs']['state'],
            'attrs': {
                **attrs
            }
        }

    # Optional ?service= filter: unknown names are a 400, not silently dropped.
    if len(args_service):
        services = {}
        for service in args_service:
            if service in result['services'].keys():
                services[service] = result['services'][service]
            else:
                return Response(json.dumps({'error': 'service not found', 'service': service}), status=400, mimetype='application/json')
        result['services'] = services

    if kuma_mode:
        # Collect everything not OK so Kuma can display a failure reason.
        for name, service in result['services'].items():
            if service['state'] != OK:
                result['failed_services'].append({'name': name, 'state': service['state']})
        if result['host']['state'] != OK:
            result['failed_services'].append({'name': hostid, 'state': result['host']['state']})

    if len(result['failed_services']):
        # Non-2xx (410) makes an HTTP monitor treat the check as down —
        # presumably chosen for Uptime Kuma; confirm.
        return Response(json.dumps(result), status=410, mimetype='application/json')
    else:
        return result