From 23fb350116624fe0ae04a705214bed71613042a5 Mon Sep 17 00:00:00 2001 From: Cyberes Date: Fri, 21 Apr 2023 23:54:17 -0600 Subject: [PATCH] fix bugs, improvements --- check_federation.py | 143 ++++++++++++++++++++++++-------------- check_matrix_synapse.py | 9 ++- check_media_cdn.py | 68 +++++++++++------- checker/synapse_client.py | 37 +++++++++- 4 files changed, 178 insertions(+), 79 deletions(-) diff --git a/check_federation.py b/check_federation.py index 73626ac..5dc0ddb 100644 --- a/check_federation.py +++ b/check_federation.py @@ -5,6 +5,7 @@ import json import os import sys import time +import traceback import urllib from datetime import datetime from uuid import uuid4 @@ -12,6 +13,7 @@ from uuid import uuid4 from nio import AsyncClient, AsyncClientConfig, JoinError, JoinResponse, LoginResponse, RoomCreateError, RoomGetEventResponse, RoomSendError import checker.nagios as nagios +from checker.synapse_client import leave_all_rooms_async, leave_room_async parser = argparse.ArgumentParser(description='Test federation between two homeservers.') parser.add_argument('--bot1-user', required=True, help='User ID for bot 1.') @@ -53,7 +55,7 @@ async def test_one_direction(sender_client, receiver_client, receiver_user_id): test_room_name = str(uuid4()) new_test_room = await sender_client.room_create(name=test_room_name, invite=[receiver_user_id]) if isinstance(new_test_room, RoomCreateError): - return f'UNKNOWN: failed to create room "{new_test_room}"', nagios.UNKNOWN + return f'UNKNOWN: failed to create room "{new_test_room}"', nagios.UNKNOWN, [] new_test_room_id = new_test_room.room_id time.sleep(2) @@ -65,9 +67,19 @@ async def test_one_direction(sender_client, receiver_client, receiver_user_id): if isinstance(resp, JoinResponse): break elif isinstance(resp, JoinError): - return f'UNKNOWN: failed to join room "{vars(resp)}"', nagios.UNKNOWN + leave = [await leave_room_async(new_test_room_id, sender_client)] + leave_failures = [] + for event in leave: + if not 
event[0]: + leave_failures.append((event[1], event[2])) + return f'UNKNOWN: failed to join room "{vars(resp)}"', nagios.UNKNOWN, leave_failures if (datetime.now() - timeout_start).total_seconds() >= args.timeout: - return 'UNKNOWN: failed to join room, timeout.', nagios.UNKNOWN + leave = [await leave_room_async(new_test_room_id, sender_client)] + leave_failures = [] + for event in leave: + if not event[0]: + leave_failures.append((event[1], event[2])) + return 'UNKNOWN: failed to join room, timeout.', nagios.UNKNOWN, leave_failures time.sleep(2) @@ -76,14 +88,12 @@ async def test_one_direction(sender_client, receiver_client, receiver_user_id): msg = {'id': str(uuid4()), 'ts': send_msg_time.microsecond} resp = (await sender_client.room_send(new_test_room_id, 'm.room.message', {'body': json.dumps(msg), 'msgtype': 'm.room.message'})) if isinstance(resp, RoomSendError): - await sender_client.room_leave(new_test_room_id) - time.sleep(1) - await sender_client.room_forget(new_test_room_id) - time.sleep(1) - await receiver_client.room_leave(new_test_room_id) - time.sleep(1) - await receiver_client.room_forget(new_test_room_id) - return f'UNKNOWN: failed to send message "{resp}', nagios.UNKNOWN + leave = [await leave_room_async(new_test_room_id, sender_client), await leave_room_async(new_test_room_id, receiver_client)] + leave_failures = [] + for event in leave: + if not event[0]: + leave_failures.append((event[1], event[2])) + return f'UNKNOWN: failed to send message "{resp}', nagios.UNKNOWN, leave_failures msg_event_id = resp.event_id # Sender watches for the message @@ -95,39 +105,26 @@ async def test_one_direction(sender_client, receiver_client, receiver_user_id): recv_msg = json.loads(resp.event.source['content']['body']) break if (datetime.now() - start_check).total_seconds() >= args.timeout: - await sender_client.room_leave(new_test_room_id) - time.sleep(1) - await sender_client.room_forget(new_test_room_id) - time.sleep(1) - await 
receiver_client.room_leave(new_test_room_id) - time.sleep(1) - await receiver_client.room_forget(new_test_room_id) - return "CRITICAL: timeout - receiver did not recieve the sender's message.", nagios.CRITICAL + leave = [await leave_room_async(new_test_room_id, sender_client), await leave_room_async(new_test_room_id, receiver_client)] + leave_failures = [] + for event in leave: + if not event[0]: + leave_failures.append((event[1], event[2])) + return "CRITICAL: timeout - receiver did not recieve the sender's message.", nagios.CRITICAL, leave_failures # Double check everything makes sense if not msg == recv_msg: - await sender_client.room_leave(new_test_room_id) - time.sleep(1) - await sender_client.room_forget(new_test_room_id) - time.sleep(1) - await receiver_client.room_leave(new_test_room_id) - time.sleep(1) - await receiver_client.room_forget(new_test_room_id) - return "CRITICAL: sender's message did not match the receiver's.", nagios.CRITICAL + leave = [await leave_room_async(new_test_room_id, sender_client), await leave_room_async(new_test_room_id, receiver_client)] + leave_failures = [] + for event in leave: + if not event[0]: + leave_failures.append((event[1], event[2])) + return "CRITICAL: sender's message did not match the receiver's.", nagios.CRITICAL, leave_failures # Calculate the time it took to recieve the message, including sync bot1_msg_delta = (recv_msg_time - send_msg_time).total_seconds() - # Clean up the rooms - await sender_client.room_leave(new_test_room_id) - time.sleep(1) - await sender_client.room_forget(new_test_room_id) - time.sleep(1) - await receiver_client.room_leave(new_test_room_id) - time.sleep(1) - await receiver_client.room_forget(new_test_room_id) - - return bot1_msg_delta, nagios.OK + return bot1_msg_delta, nagios.OK, new_test_room_id async def login(user_id, passwd, homeserver, config_file=None): @@ -160,16 +157,29 @@ async def main() -> None: bot1 = await login(args.bot1_user, args.bot1_pw, args.bot1_hs, args.bot1_auth_file) 
bot2 = await login(args.bot2_user, args.bot2_pw, args.bot2_hs, args.bot2_auth_file) - bot1_output_msg, bot1_output_code = await test_one_direction(bot1, bot2, args.bot2_user) - bot2_output_msg, bot2_output_code = await test_one_direction(bot2, bot1, args.bot1_user) + bot1_output_msg, bot1_output_code, bot1_new_room_id = await test_one_direction(bot1, bot2, args.bot2_user) + bot2_output_msg, bot2_output_code, bot2_new_room_id = await test_one_direction(bot2, bot1, args.bot1_user) + + # Clean up + leave = [await leave_room_async(bot1_new_room_id, bot1), await leave_room_async(bot2_new_room_id, bot1), await leave_room_async(bot1_new_room_id, bot2), await leave_room_async(bot2_new_room_id, bot2)] + leave_failures = [] + for event in leave: + if not event[0]: + leave_failures.append((event[1], event[2])) + + bot1_leave_all_failures = await leave_all_rooms_async(bot1, exclude_starting_with='_PERM_') + bot2_leave_all_failures = await leave_all_rooms_async(bot2, exclude_starting_with='_PERM_') + await bot1.close() + await bot2.close() nagios_output = nagios.OK + prints = [] if bot1_output_code != nagios.OK: - print(bot1_output_msg) + prints.append(bot1_output_msg) nagios_output = bot1_output_code if bot2_output_code != nagios.OK: - print(bot2_output_msg) + prints.append(bot2_output_msg) if nagios_output < bot2_output_code: # Only set the code if our code is more severe nagios_output = bot2_output_code @@ -180,13 +190,13 @@ async def main() -> None: if bot1_output_msg >= args.crit: if nagios_output < nagios.CRITICAL: nagios_output = nagios.CRITICAL - print('CRITICAL:', bot1_hs_domain, '->', bot2_hs_domain, 'is', bot1_output_msg, 'seconds.') + prints.append(f'CRITICAL: {bot1_hs_domain} -> {bot2_hs_domain} is {bot1_output_msg} seconds.') elif bot1_output_msg >= args.warn: if nagios_output < nagios.WARNING: nagios_output = nagios.WARNING - print('WARNING:', bot1_hs_domain, '->', bot2_hs_domain, 'is', bot1_output_msg, 'seconds.') + prints.append(f'WARNING: {bot1_hs_domain} -> 
{bot2_hs_domain} is {bot1_output_msg} seconds.') else: - print('OK:', bot1_hs_domain, '->', bot2_hs_domain, 'is', bot1_output_msg, 'seconds.') + prints.append(f'OK: {bot1_hs_domain} -> {bot2_hs_domain} is {bot1_output_msg} seconds.') # bot2 -> bot1 if isinstance(bot2_output_msg, float): @@ -194,17 +204,47 @@ async def main() -> None: if bot2_output_msg >= args.crit: if nagios_output < nagios.CRITICAL: nagios_output = nagios.CRITICAL - print('CRITICAL:', bot1_hs_domain, '<-', bot2_hs_domain, 'is', bot2_output_msg, 'seconds.') + prints.append(f'CRITICAL: {bot1_hs_domain} <- {bot2_hs_domain} is {bot2_output_msg} seconds.') elif bot2_output_msg >= args.warn: if nagios_output < nagios.WARNING: nagios_output = nagios.WARNING - print('WARNING:', bot1_hs_domain, '<-', bot2_hs_domain, 'is', bot2_output_msg, 'seconds.') + prints.append(f'WARNING: {bot1_hs_domain} <- {bot2_hs_domain} is {bot2_output_msg} seconds.') else: - print('OK:', bot1_hs_domain, '<-', bot2_hs_domain, 'is', bot2_output_msg, 'seconds.') + prints.append(f'OK: {bot1_hs_domain} <- {bot2_hs_domain} is {bot2_output_msg} seconds.') - # Clean up - await bot1.close() - await bot2.close() + if len(leave_failures): + prints.append('=================================') + prints.append('WARN: a bot failed to leave a room:') + for err in leave_failures: + prints.append(err) + if nagios_output < nagios.WARNING: + nagios_output = nagios.WARNING + + bot1_leave_warned = False + for err in bot1_leave_all_failures: + if not err[0]: + if not bot1_leave_warned: + prints.append('=================================') + prints.append('WARN: bot1 failed to leave room:') + bot1_leave_warned = True + prints.append(err) + if nagios_output < nagios.WARNING: + nagios_output = nagios.WARNING + + bot2_leave_warned = False + for err in bot2_leave_all_failures: + if not err[0]: + if not bot2_leave_warned: + prints.append('=================================') + prints.append('WARN: bot2 failed to leave room:') + bot2_leave_warned = True + 
prints.append(err) + if nagios_output < nagios.WARNING: + nagios_output = nagios.WARNING + + for x in prints: + print(f'\n{x}', end=' ') + print(f"|'{bot1_hs_domain}_outbound'={bot1_output_msg}s;;; '{bot1_hs_domain}_inbound'={bot2_output_msg}s;;;") sys.exit(nagios_output) @@ -213,5 +253,6 @@ if __name__ == "__main__": try: asyncio.run(main()) except Exception as e: - print(f"UNKNOWN: exception '{e}'") + print(f"UNKNOWN: exception\n{e}") + print(traceback.format_exc()) sys.exit(nagios.UNKNOWN) diff --git a/check_matrix_synapse.py b/check_matrix_synapse.py index 6ccab81..a9d6900 100644 --- a/check_matrix_synapse.py +++ b/check_matrix_synapse.py @@ -2,6 +2,7 @@ import argparse import sys import time +import traceback import numpy as np import requests @@ -37,6 +38,7 @@ def main(): sys.exit(nagios.OK) except Exception as e: print(f'UNKNOWN: failed to check avg. GC time "{e}"') + print(traceback.format_exc()) sys.exit(nagios.UNKNOWN) elif args.type == 'response-time': response_time_MAX = 1 if not args.crit else args.crit @@ -49,6 +51,7 @@ def main(): response = requests.post(args.synapse_server, timeout=timeout, verify=False) except Exception as e: print(f'UNKNOWN: failed to ping endpoint "{e}"') + print(traceback.format_exc()) sys.exit(nagios.UNKNOWN) request_time = time.perf_counter() - start response_times.append(np.round(request_time, 2)) @@ -62,6 +65,7 @@ def main(): sys.exit(nagios.OK) except Exception as e: print(f'UNKNOWN: failed to check response time "{e}"') + print(traceback.format_exc()) sys.exit(nagios.UNKNOWN) elif args.type == 'outgoing-http-rate': # outgoing req/sec @@ -82,6 +86,7 @@ def main(): sys.exit(nagios.OK) except Exception as e: print(f'UNKNOWN: failed to check outgoing HTTP request rate "{e}"') + print(traceback.format_exc()) sys.exit(nagios.UNKNOWN) elif args.type == 'avg-send': # Average send time in seconds @@ -96,6 +101,7 @@ def main(): sys.exit(nagios.OK) except Exception as e: print(f'UNKNOWN: failed to check average message send time 
"{e}"') + print(traceback.format_exc()) sys.exit(nagios.UNKNOWN) elif args.type == 'db-lag': # in seconds @@ -110,6 +116,7 @@ def main(): sys.exit(nagios.OK) except Exception as e: print(f'UNKNOWN: failed to check DB lag "{e}"') + print(traceback.format_exc()) sys.exit(nagios.UNKNOWN) else: print('Wrong type') @@ -121,7 +128,5 @@ if __name__ == "__main__": main() except Exception as e: print(f'UNKNOWN: exception "{e}"') - import traceback - print(traceback.format_exc()) sys.exit(nagios.UNKNOWN) diff --git a/check_media_cdn.py b/check_media_cdn.py index 2d16594..39147b4 100644 --- a/check_media_cdn.py +++ b/check_media_cdn.py @@ -5,6 +5,7 @@ import json import os import sys import tempfile +import traceback import urllib import numpy as np @@ -52,7 +53,7 @@ def verify_media_header(header: str, header_dict: dict, good_value: str = None, warn_value = str(warn_value) critical_value = str(critical_value) if not header_value: - return f'CRITICAL: missing header "{header}"', nagios.CRITICAL + return f'CRITICAL: missing header\n"{header}"', nagios.CRITICAL if good_value: good_value = str(good_value) @@ -68,8 +69,10 @@ def verify_media_header(header: str, header_dict: dict, good_value: str = None, async def main() -> None: + exit_code = nagios.OK + async def cleanup(client, test_image_path, image_event_id=None): - global exit_code + nonlocal exit_code # Clean up if image_event_id: await client.room_redact(args.room, image_event_id) @@ -82,11 +85,13 @@ async def main() -> None: if r.status_code != 200: if nagios.WARNING < exit_code: exit_code = nagios.WARNING - print(f"WARN: failed to purge media for this user, request failed with '{r.text}'") + return f"WARN: failed to purge media for this user.\n{r.text}" + else: + return None except Exception as e: if nagios.WARNING < exit_code: exit_code = nagios.WARNING - print(f"WARN: failed to purge media for this user '{e}'") + return f"WARN: failed to purge media for this user.\n{e}" client = AsyncClient(args.hs, args.user, 
config=AsyncClientConfig(request_timeout=args.timeout, max_timeout_retry_wait_time=10)) if args.auth_file: @@ -98,8 +103,8 @@ async def main() -> None: if isinstance(resp, LoginResponse): write_login_details_to_disk(resp, args.hs, args.auth_file) else: - print(f'UNKNOWN: failed to log in "{resp}"') - sys.exit(nagios.UNKNOWN) + print(f'CRITICAL: failed to log in.\n{resp}') + sys.exit(nagios.CRITICAL) else: # Otherwise the config file exists, so we'll use the stored credentials with open(args.auth_file, "r") as f: @@ -124,8 +129,8 @@ async def main() -> None: image_event_id = (await send_image(client, args.room, test_image_path)) if isinstance(image_event_id, RoomSendError): await cleanup(client, test_image_path) - print(f'UNKNOWN: failed to send message "{image_event_id}"') - sys.exit(nagios.UNKNOWN) + print(f'CRITICAL: failed to send message.\n{image_event_id}') + sys.exit(nagios.CRITICAL) image_event_id = image_event_id.event_id # Get the event @@ -138,43 +143,43 @@ async def main() -> None: # matter in this situation. 
r = requests.head(target_file_url, allow_redirects=False) + prints = [] + if r.status_code != 200 and not args.media_cdn_redirect: await cleanup(client, test_image_path, image_event_id=image_event_id) - print(f'CRITICAL: status code was "{r.status_code}"') + print(f'CRITICAL: status code is "{r.status_code}"') sys.exit(nagios.CRITICAL) else: - print(f'OK: status code was "{r.status_code}"') + prints.append(f'OK: status code is "{r.status_code}"') headers = dict(r.headers) - exit_code = nagios.OK - # Check domain if args.media_cdn_redirect: if 'location' in headers: domain = urllib.parse.urlparse(headers['location']).netloc if domain != args.check_domain: exit_code = nagios.CRITICAL - print(f'CRITICAL: redirect to media CDN domain is "{domain}"') + prints.append(f'CRITICAL: redirect to media CDN domain is "{domain}"') else: - print(f'OK: media CDN domain is "{domain}"') + prints.append(f'OK: media CDN domain is "{domain}"') else: exit_code = nagios.CRITICAL - print(f'CRITICAL: was not redirected to the media CDN domain.') + prints.append(f'CRITICAL: was not redirected to the media CDN domain.') # Make sure we aren't redirected if we're a Synapse server test = requests.head(target_file_url, headers={'User-Agent': 'Synapse/1.77.3'}, allow_redirects=False) if test.status_code != 200: - print('CRITICAL: Synapse user-agent was redirected with status code', test.status_code) + prints.append(f'CRITICAL: Synapse user-agent is redirected with status code {test.status_code}') exit_code = nagios.CRITICAL else: - print(f'OK: Synapse user-agent is not redirected.') + prints.append(f'OK: Synapse user-agent is not redirected.') else: if 'location' in headers: exit_code = nagios.CRITICAL - print(f"CRITICAL: recieved 301 to {urllib.parse.urlparse(headers['location']).netloc}") + prints.append(f"CRITICAL: recieved 301 to {urllib.parse.urlparse(headers['location']).netloc}") else: - print(f'OK: was not redirected.') + prints.append(f'OK: is not redirected.') if
args.required_headers: # Icinga may pass the values as one string @@ -183,17 +188,32 @@ async def main() -> None: for item in args.required_headers: key, value = item.split('=') header_chk, code = verify_media_header(key, headers, good_value=value) - print(header_chk) + prints.append(header_chk) if code > exit_code: exit_code = code results = [verify_media_header('synapse-media-local-status', headers), verify_media_header('synapse-media-s3-status', headers, good_value='200'), verify_media_header('synapse-media-server', headers, good_value='s3')] for header_chk, code in results: - print(header_chk) + prints.append(header_chk) if code > exit_code: exit_code = code - await cleanup(client, test_image_path, image_event_id=image_event_id) + clean_msg = await cleanup(client, test_image_path, image_event_id=image_event_id) + + if exit_code == nagios.OK: + print('OK: media CDN is good.') + elif exit_code == nagios.UNKNOWN: + print('UNKNOWN: media CDN is bad.') + elif exit_code == nagios.WARNING: + print('WARNING: media CDN is bad.') + elif exit_code == nagios.CRITICAL: + print('CRITICAL: media CDN is bad.') + for msg in prints: + print(msg) + + if clean_msg: + print(clean_msg) + sys.exit(exit_code) @@ -201,8 +221,6 @@ if __name__ == "__main__": try: asyncio.run(main()) except Exception as e: - print(f'UNKNOWN: exception "{e}"') - import traceback - + print(f'UNKNOWN: exception\n{e}') print(traceback.format_exc()) sys.exit(nagios.UNKNOWN) diff --git a/checker/synapse_client.py b/checker/synapse_client.py index e6193e3..e2a8b04 100644 --- a/checker/synapse_client.py +++ b/checker/synapse_client.py @@ -1,13 +1,15 @@ import asyncio +import copy import json import os import sys +import time import aiofiles.os import magic import markdown from PIL import Image -from nio import AsyncClient, LoginResponse, RoomSendError, UploadResponse +from nio import AsyncClient, LoginResponse, RoomSendError, UploadResponse, MatrixRoom, RoomLeaveResponse, RoomForgetResponse from . 
import nagios @@ -143,3 +145,36 @@ def login(user, pw, hs, auth_file, room): return x, client return asyncio.run(inner(user, pw, hs, auth_file, room)) + + +async def leave_room_async(room_id, client): + """Leave then forget a room; return (success, leave_response, forget_response).""" + leave_resp = await client.room_leave(room_id) + # Non-blocking pause: time.sleep() would stall the whole event loop. + await asyncio.sleep(1) + forget_resp = await client.room_forget(room_id) + return isinstance(leave_resp, RoomLeaveResponse) and isinstance(forget_resp, RoomForgetResponse), leave_resp, forget_resp + + +async def leave_all_rooms_async(client, exclude_starting_with=None): + """Leave and forget every joined and invited room, then close the client. + Returns one (success, leave_response, forget_response) tuple per room. + NOTE: exclude_starting_with is accepted for compatibility but not applied yet, + so rooms whose names start with that prefix are left as well. + """ + results = [] + for room_id in (await client.joined_rooms()).rooms: + results.append(await leave_room_async(room_id, client)) + await asyncio.sleep(1) + await client.sync() + invited_rooms = copy.copy(client.invited_rooms) # RuntimeError: dictionary changed size during iteration + for room in invited_rooms.values(): + results.append(await leave_room_async(room.room_id, client)) + await asyncio.sleep(1) + await client.close() + return results + + +def leave_all_rooms(client, exclude_starting_with=None): + """Synchronous wrapper around leave_all_rooms_async().""" + return asyncio.run(leave_all_rooms_async(client, exclude_starting_with))