#!/usr/bin/env python3
"""Nagios-style check that tests federation between two homeservers.

Each bot in turn creates a room, invites the other bot and sends a message.
The time until the other bot can fetch that message is compared against the
--warn / --crit thresholds and reported as performance data.
"""
import argparse
import asyncio
import json
import sys
import time
import traceback
import urllib.parse
from datetime import datetime
from uuid import uuid4

from nio import JoinError, JoinResponse, RoomCreateError, RoomGetEventResponse, RoomSendError

import checker.nagios as nagios
from checker.synapse_client import leave_all_rooms_async, leave_room_async, login_and_cache

parser = argparse.ArgumentParser(description='Test federation between two homeservers.')
parser.add_argument('--bot1-user', required=True, help='User ID for bot 1.')
parser.add_argument('--bot1-pw', required=True, help='Password for bot 1.')
parser.add_argument('--bot1-hs', required=True, help='Homeserver for bot 1.')
parser.add_argument('--bot1-auth-file', help="File to cache the bot's login details to.")
parser.add_argument('--bot2-user', required=True, help='User ID for bot 2.')
parser.add_argument('--bot2-pw', required=True, help='Password for bot 2.')
parser.add_argument('--bot2-hs', required=True, help='Homeserver for bot 2.')
parser.add_argument('--bot2-auth-file', help="File to cache the bot's login details to.")
parser.add_argument('--timeout', type=float, default=90, help='Request timeout limit.')
parser.add_argument('--warn', type=float, default=2.0, help='Manually set warn level.')
parser.add_argument('--crit', type=float, default=2.5, help='Manually set critical level.')
args = parser.parse_args()

# `import urllib` alone does not guarantee the `parse` submodule is loaded,
# hence the explicit `import urllib.parse` above.
bot1_hs_domain = urllib.parse.urlparse(args.bot1_hs).netloc
bot2_hs_domain = urllib.parse.urlparse(args.bot2_hs).netloc


def _escalate(current, candidate):
    """Return whichever of the two Nagios status codes compares higher."""
    return candidate if current < candidate else current


async def _leave_and_collect(room_id, *clients):
    """Make each client leave `room_id` in order and return the failed attempts.

    `leave_room_async` results are indexed the same way the rest of this script
    indexes them: element 0 is truthy on success, elements 1 and 2 are kept as
    the failure details.
    """
    failures = []
    for client in clients:
        result = await leave_room_async(room_id, client)
        if not result[0]:
            failures.append((result[1], result[2]))
    return failures


async def test_one_direction(sender_client, receiver_client, receiver_user_id):
    """Send one message from sender to receiver through a fresh room and time it.

    Returns a 3-tuple whose shape depends on the outcome:

    * success: ``(seconds_until_received, nagios.OK, room_id)`` -- the caller is
      responsible for leaving the room.
    * failure: ``(message, nagios.UNKNOWN or nagios.CRITICAL, leave_failures)``
      where ``leave_failures`` lists the leave attempts that failed while
      cleaning up here (empty if the room was never created).
    """
    # The sender creates the room and invites the receiver
    test_room_name = str(uuid4())
    new_test_room = await sender_client.room_create(name=test_room_name, invite=[receiver_user_id])
    if isinstance(new_test_room, RoomCreateError):
        return f'UNKNOWN: failed to create room "{new_test_room}"', nagios.UNKNOWN, []
    new_test_room_id = new_test_room.room_id

    # Non-blocking pause: time.sleep() here would stall the whole event loop.
    await asyncio.sleep(2)

    # The receiver joins via invite
    join_started = time.monotonic()
    while True:
        resp = await receiver_client.join(new_test_room_id)
        if isinstance(resp, JoinResponse):
            break
        elif isinstance(resp, JoinError):
            leave_failures = await _leave_and_collect(new_test_room_id, sender_client)
            return f'UNKNOWN: failed to join room "{vars(resp)}"', nagios.UNKNOWN, leave_failures
        if time.monotonic() - join_started >= args.timeout:
            leave_failures = await _leave_and_collect(new_test_room_id, sender_client)
            return 'UNKNOWN: failed to join room, timeout.', nagios.UNKNOWN, leave_failures

    await asyncio.sleep(2)

    # Sender sends the msg to room. The latency clock is monotonic so that
    # wall-clock adjustments during the check cannot distort the measurement.
    send_started = time.monotonic()
    msg = {'id': str(uuid4()), 'ts': datetime.now().microsecond}
    # NOTE(review): 'msgtype' carries the event type string here; confirm this is intentional.
    resp = await sender_client.room_send(
        new_test_room_id,
        'm.room.message',
        {'body': json.dumps(msg), 'msgtype': 'm.room.message'},
    )
    if isinstance(resp, RoomSendError):
        leave_failures = await _leave_and_collect(new_test_room_id, sender_client, receiver_client)
        return f'UNKNOWN: failed to send message "{resp}"', nagios.UNKNOWN, leave_failures
    msg_event_id = resp.event_id

    # Receiver polls for the message until it can be fetched or the timeout expires
    wait_started = time.monotonic()
    while True:
        resp = await receiver_client.room_get_event(new_test_room_id, msg_event_id)
        if isinstance(resp, RoomGetEventResponse):
            received_at = time.monotonic()
            recv_msg = json.loads(resp.event.source['content']['body'])
            break
        if time.monotonic() - wait_started >= args.timeout:
            leave_failures = await _leave_and_collect(new_test_room_id, sender_client, receiver_client)
            return "CRITICAL: timeout - receiver did not recieve the sender's message.", nagios.CRITICAL, leave_failures

    # Double check everything makes sense
    if msg != recv_msg:
        leave_failures = await _leave_and_collect(new_test_room_id, sender_client, receiver_client)
        return "CRITICAL: sender's message did not match the receiver's.", nagios.CRITICAL, leave_failures

    # Time it took to receive the message, including the send request itself
    msg_delta = received_at - send_started
    return msg_delta, nagios.OK, new_test_room_id


def _grade_latency(seconds, arrow, status, prints):
    """Append the latency line for one direction and return the escalated status."""
    if seconds >= args.crit:
        prints.append(f'CRITICAL: {bot1_hs_domain} {arrow} {bot2_hs_domain} is {seconds} seconds.')
        return _escalate(status, nagios.CRITICAL)
    if seconds >= args.warn:
        prints.append(f'WARNING: {bot1_hs_domain} {arrow} {bot2_hs_domain} is {seconds} seconds.')
        return _escalate(status, nagios.WARNING)
    prints.append(f'OK: {bot1_hs_domain} {arrow} {bot2_hs_domain} is {seconds} seconds.')
    return status


def _report_leave_all(bot_label, results, status, prints):
    """Append every failed entry from a leave-all result and return the escalated status."""
    warned = False
    for err in results:
        if not err[0]:
            if not warned:
                prints.append('=================================')
                prints.append(f'WARN: {bot_label} failed to leave room:')
                warned = True
            prints.append(err)
            status = _escalate(status, nagios.WARNING)
    return status


def _perf_value(measurement):
    """Format a perfdata value: seconds when measured, 'U' when the test failed."""
    if isinstance(measurement, float):
        return f'{measurement}s'
    return 'U'


async def main() -> None:
    """Run the federation test in both directions, print the report and exit.

    Exits the process through ``sys.exit`` with the aggregated Nagios status.
    """
    bot1 = await login_and_cache(args.bot1_user, args.bot1_pw, args.bot1_hs, args.bot1_auth_file, request_timeout=args.timeout)
    bot2 = await login_and_cache(args.bot2_user, args.bot2_pw, args.bot2_hs, args.bot2_auth_file, request_timeout=args.timeout)

    try:
        bot1_output_msg, bot1_output_code, bot1_extra = await test_one_direction(bot1, bot2, args.bot2_user)
        bot2_output_msg, bot2_output_code, bot2_extra = await test_one_direction(bot2, bot1, args.bot1_user)

        # Clean up. The third element of each result is a room id only when the
        # direction succeeded; on failure it is already a list of leave failures
        # and must not be passed on as a room id.
        leave_failures = []
        for code, extra in ((bot1_output_code, bot1_extra), (bot2_output_code, bot2_extra)):
            if code == nagios.OK:
                leave_failures.extend(await _leave_and_collect(extra, bot1, bot2))
            else:
                leave_failures.extend(extra)

        bot1_leave_all_failures = await leave_all_rooms_async(bot1, exclude_starting_with='_PERM_')
        bot2_leave_all_failures = await leave_all_rooms_async(bot2, exclude_starting_with='_PERM_')
    finally:
        # Close the clients even when a test raised.
        await bot1.close()
        await bot2.close()

    nagios_output = nagios.OK
    prints = []

    if bot1_output_code != nagios.OK:
        prints.append(bot1_output_msg)
        nagios_output = bot1_output_code
    if bot2_output_code != nagios.OK:
        prints.append(bot2_output_msg)
        # Only set the code if our code is more severe
        nagios_output = _escalate(nagios_output, bot2_output_code)

    # bot1 -> bot2 (only graded if the test returned a measurement)
    if isinstance(bot1_output_msg, float):
        bot1_output_msg = round(bot1_output_msg, 2)
        nagios_output = _grade_latency(bot1_output_msg, '->', nagios_output, prints)

    # bot2 -> bot1
    if isinstance(bot2_output_msg, float):
        bot2_output_msg = round(bot2_output_msg, 2)
        nagios_output = _grade_latency(bot2_output_msg, '<-', nagios_output, prints)

    if leave_failures:
        prints.append('=================================')
        prints.append('WARN: a bot failed to leave a room:')
        for err in leave_failures:
            prints.append(err)
        nagios_output = _escalate(nagios_output, nagios.WARNING)

    nagios_output = _report_leave_all('bot1', bot1_leave_all_failures, nagios_output, prints)
    nagios_output = _report_leave_all('bot2', bot2_leave_all_failures, nagios_output, prints)

    for x in prints:
        print(f'\n{x}', end=' ')
    # Perfdata carries 'U' for a direction with no measurement, so an error
    # message never ends up inside the performance data.
    print(f"|'{bot1_hs_domain}_outbound'={_perf_value(bot1_output_msg)};;; '{bot1_hs_domain}_inbound'={_perf_value(bot2_output_msg)};;;")

    sys.exit(nagios_output)


if __name__ == "__main__":
    try:
        asyncio.run(main())
    except Exception as e:
        # Top-level boundary: any unexpected error becomes an UNKNOWN result.
        print(f"UNKNOWN: exception\n{e}")
        print(traceback.format_exc())
        sys.exit(nagios.UNKNOWN)