215 lines
9.6 KiB
Python
Executable File
215 lines
9.6 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
import argparse
|
|
import asyncio
|
|
import json
|
|
import sys
|
|
import time
|
|
import traceback
|
|
import urllib
|
|
from datetime import datetime
|
|
from uuid import uuid4
|
|
|
|
from nio import JoinError, JoinResponse, RoomCreateError, RoomGetEventResponse, RoomSendError
|
|
|
|
import checker.nagios as nagios
|
|
from checker.synapse_client import leave_all_rooms_async, leave_room_async, login_and_cache
|
|
|
|
# Command-line interface: credentials and homeserver for each of the two bots,
# plus the request timeout and the latency thresholds (in seconds) used to
# grade the result.
parser = argparse.ArgumentParser(description='Test federation between two homeservers.')
parser.add_argument('--bot1-user', required=True, help='User ID for bot 1.')
parser.add_argument('--bot1-pw', required=True, help='Password for bot 1.')
parser.add_argument('--bot1-hs', required=True, help='Homeserver for bot 1.')
parser.add_argument('--bot1-auth-file', help="File to cache the bot's login details to.")
parser.add_argument('--bot2-user', required=True, help='User ID for bot 2.')
parser.add_argument('--bot2-pw', required=True, help='Password for bot 2.')
parser.add_argument('--bot2-hs', required=True, help='Homeserver for bot 2.')
parser.add_argument('--bot2-auth-file', help="File to cache the bot's login details to.")
parser.add_argument('--timeout', type=float, default=90, help='Request timeout limit.')
parser.add_argument('--warn', type=float, default=2.0, help='Manually set warn level.')
parser.add_argument('--crit', type=float, default=2.5, help='Manually set critical level.')
args = parser.parse_args()


def _homeserver_domain(homeserver):
    """Return the ``host[:port]`` part of a homeserver URL for output labels.

    ``urlparse`` only fills in ``netloc`` when the URL carries a scheme (or a
    leading ``//``). A bare ``matrix.example.org`` would otherwise give an
    empty label, so the raw value is used as a fallback.
    """
    # Imported explicitly: a bare ``import urllib`` does not guarantee that the
    # ``urllib.parse`` submodule has been loaded.
    from urllib.parse import urlparse

    return urlparse(homeserver).netloc or homeserver


# Domains used to label the human-readable output and the perfdata.
bot1_hs_domain = _homeserver_domain(args.bot1_hs)
bot2_hs_domain = _homeserver_domain(args.bot2_hs)
|
|
|
|
|
|
async def _leave_and_collect_failures(room_id, *clients):
    """Have each client leave ``room_id`` and return the failed attempts.

    Each ``leave_room_async`` result is read as a 3-item sequence: a truthy
    first item means the leave succeeded; otherwise the other two items are
    kept as the failure details.
    """
    failures = []
    for client in clients:
        result = await leave_room_async(room_id, client)
        if not result[0]:
            failures.append((result[1], result[2]))
    return failures


async def test_one_direction(sender_client, receiver_client, receiver_user_id):
    """Send one message from sender to receiver through a fresh room and time it.

    The sender creates a room and invites the receiver, the receiver joins,
    the sender posts a JSON payload, and the receiver fetches that event by ID.
    Timeouts are taken from the module-level ``args.timeout``.

    Returns a 3-tuple whose shape depends on the outcome:

    * success: ``(latency_seconds, nagios.OK, room_id)``. Both clients are
      still in the room; the caller is expected to leave it.
    * failure: ``(message, nagios.UNKNOWN or nagios.CRITICAL, leave_failures)``
      where ``leave_failures`` lists the leave attempts that failed while
      backing out of the room (empty if the room was never created).
    """
    # The sender creates the room and invites the receiver
    test_room_name = str(uuid4())
    new_test_room = await sender_client.room_create(name=test_room_name, invite=[receiver_user_id])
    if isinstance(new_test_room, RoomCreateError):
        return f'UNKNOWN: failed to create room "{new_test_room}"', nagios.UNKNOWN, []
    new_test_room_id = new_test_room.room_id

    # Pause before joining. Use the asyncio sleep so the event loop is not
    # blocked while waiting.
    await asyncio.sleep(2)

    # The receiver joins via invite
    timeout_start = datetime.now()
    while True:
        resp = await receiver_client.join(new_test_room_id)
        if isinstance(resp, JoinResponse):
            break
        elif isinstance(resp, JoinError):
            # Only the sender is in the room at this point.
            leave_failures = await _leave_and_collect_failures(new_test_room_id, sender_client)
            return f'UNKNOWN: failed to join room "{vars(resp)}"', nagios.UNKNOWN, leave_failures
        if (datetime.now() - timeout_start).total_seconds() >= args.timeout:
            leave_failures = await _leave_and_collect_failures(new_test_room_id, sender_client)
            return 'UNKNOWN: failed to join room, timeout.', nagios.UNKNOWN, leave_failures

    await asyncio.sleep(2)

    # Sender sends the msg to room
    send_msg_time = datetime.now()
    msg = {'id': str(uuid4()), 'ts': send_msg_time.microsecond}
    resp = await sender_client.room_send(
        new_test_room_id,
        'm.room.message',
        {'body': json.dumps(msg), 'msgtype': 'm.room.message'},
    )
    if isinstance(resp, RoomSendError):
        leave_failures = await _leave_and_collect_failures(new_test_room_id, sender_client, receiver_client)
        return f'UNKNOWN: failed to send message "{resp}"', nagios.UNKNOWN, leave_failures
    msg_event_id = resp.event_id

    # Receiver polls for the message by event ID; each unsuccessful lookup is
    # retried immediately until the timeout is reached.
    start_check = datetime.now()
    while True:
        resp = await receiver_client.room_get_event(new_test_room_id, msg_event_id)
        if isinstance(resp, RoomGetEventResponse):
            recv_msg_time = datetime.now()
            recv_msg = json.loads(resp.event.source['content']['body'])
            break
        if (datetime.now() - start_check).total_seconds() >= args.timeout:
            leave_failures = await _leave_and_collect_failures(new_test_room_id, sender_client, receiver_client)
            return "CRITICAL: timeout - receiver did not recieve the sender's message.", nagios.CRITICAL, leave_failures

    # Double check everything makes sense
    if msg != recv_msg:
        leave_failures = await _leave_and_collect_failures(new_test_room_id, sender_client, receiver_client)
        return "CRITICAL: sender's message did not match the receiver's.", nagios.CRITICAL, leave_failures

    # Calculate the time it took to receive the message, including sync
    bot1_msg_delta = (recv_msg_time - send_msg_time).total_seconds()

    return bot1_msg_delta, nagios.OK, new_test_room_id
|
|
|
|
|
|
async def main() -> None:
    """Run the federation test in both directions and exit with a Nagios status.

    Logs both bots in, runs ``test_one_direction`` each way, leaves the test
    rooms, prints one status line per finding followed by the perfdata, and
    terminates the process via ``sys.exit`` with the most severe status seen.
    """
    bot1 = await login_and_cache(args.bot1_user, args.bot1_pw, args.bot1_hs, args.bot1_auth_file, request_timeout=args.timeout)
    bot2 = await login_and_cache(args.bot2_user, args.bot2_pw, args.bot2_hs, args.bot2_auth_file, request_timeout=args.timeout)

    bot1_output_msg, bot1_output_code, bot1_result = await test_one_direction(bot1, bot2, args.bot2_user)
    bot2_output_msg, bot2_output_code, bot2_result = await test_one_direction(bot2, bot1, args.bot1_user)

    # Clean up
    # The third element returned by test_one_direction is the room ID only on
    # success. On failure it is the list of leave failures already collected,
    # which must be reported rather than passed on as a room ID.
    leave_failures = []
    rooms_to_leave = []
    for output_code, result in ((bot1_output_code, bot1_result), (bot2_output_code, bot2_result)):
        if output_code == nagios.OK:
            rooms_to_leave.append(result)
        else:
            leave_failures.extend(result)
    for bot in (bot1, bot2):
        for room_id in rooms_to_leave:
            event = await leave_room_async(room_id, bot)
            if not event[0]:
                leave_failures.append((event[1], event[2]))

    bot1_leave_all_failures = await leave_all_rooms_async(bot1, exclude_starting_with='_PERM_')
    bot2_leave_all_failures = await leave_all_rooms_async(bot2, exclude_starting_with='_PERM_')
    await bot1.close()
    await bot2.close()

    nagios_output = nagios.OK
    prints = []

    if bot1_output_code != nagios.OK:
        prints.append(bot1_output_msg)
        nagios_output = bot1_output_code
    if bot2_output_code != nagios.OK:
        prints.append(bot2_output_msg)
        if nagios_output < bot2_output_code:
            # Only set the code if our code is more severe
            nagios_output = bot2_output_code

    # Grade the measured latency of each direction (bot1 -> bot2, then
    # bot2 -> bot1). A direction that failed returned a message instead of a
    # float and is reported as "U" (value could not be determined) in perfdata.
    perfdata_values = []
    for arrow, latency in (('->', bot1_output_msg), ('<-', bot2_output_msg)):
        if not isinstance(latency, float):  # only grade if the func returned a value
            perfdata_values.append('U')
            continue
        latency = round(latency, 2)
        perfdata_values.append(f'{latency}s')
        if latency >= args.crit:
            if nagios_output < nagios.CRITICAL:
                nagios_output = nagios.CRITICAL
            prints.append(f'CRITICAL: {bot1_hs_domain} {arrow} {bot2_hs_domain} is {latency} seconds.')
        elif latency >= args.warn:
            if nagios_output < nagios.WARNING:
                nagios_output = nagios.WARNING
            prints.append(f'WARNING: {bot1_hs_domain} {arrow} {bot2_hs_domain} is {latency} seconds.')
        else:
            prints.append(f'OK: {bot1_hs_domain} {arrow} {bot2_hs_domain} is {latency} seconds.')

    if leave_failures:
        prints.append('=================================')
        prints.append('WARN: a bot failed to leave a room:')
        for err in leave_failures:
            prints.append(err)
        if nagios_output < nagios.WARNING:
            nagios_output = nagios.WARNING

    # Report rooms each bot could not leave during the final sweep; the header
    # is printed once per bot, before its first failure.
    for bot_label, leave_all_results in (('bot1', bot1_leave_all_failures), ('bot2', bot2_leave_all_failures)):
        leave_warned = False
        for err in leave_all_results:
            if not err[0]:
                if not leave_warned:
                    prints.append('=================================')
                    prints.append(f'WARN: {bot_label} failed to leave room:')
                    leave_warned = True
                prints.append(err)
                if nagios_output < nagios.WARNING:
                    nagios_output = nagios.WARNING

    for x in prints:
        print(f'\n{x}', end=' ')
    # Both perfdata labels are named from bot1's homeserver point of view.
    print(f"|'{bot1_hs_domain}_outbound'={perfdata_values[0]};;; '{bot1_hs_domain}_inbound'={perfdata_values[1]};;;")

    sys.exit(nagios_output)
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point. main() ends the process itself through sys.exit();
    # SystemExit is not an Exception subclass, so it passes through the handler
    # below untouched. Anything else that escapes is reported as UNKNOWN.
    try:
        asyncio.run(main())
    except Exception as exc:
        print(f"UNKNOWN: exception\n{exc}")
        print(traceback.format_exc())
        sys.exit(nagios.UNKNOWN)
|