icinga2-checks/check_federation.py

215 lines
9.6 KiB
Python
Raw Normal View History

2023-04-21 23:54:16 -06:00
#!/usr/bin/env python3
import argparse
import asyncio
import json
import sys
import time
2023-04-21 23:54:17 -06:00
import traceback
2023-04-21 23:54:16 -06:00
import urllib
from datetime import datetime
from uuid import uuid4
from nio import JoinError, JoinResponse, RoomCreateError, RoomGetEventResponse, RoomSendError
2023-04-21 23:54:16 -06:00
2023-04-21 23:54:16 -06:00
import checker.nagios as nagios
from checker.synapse_client import leave_all_rooms_async, leave_room_async, login_and_cache
2023-04-21 23:54:16 -06:00
def _homeserver_domain(homeserver: str) -> str:
    """Return the network location (host[:port]) of a homeserver URL.

    Falls back to parsing the value as a scheme-less authority (e.g.
    ``matrix.example.com`` or ``localhost:8008``) so the domain used in the
    check output and perfdata labels is never an empty string.
    """
    # Import the submodule explicitly: a bare ``import urllib`` does not
    # guarantee that ``urllib.parse`` has been loaded.
    from urllib.parse import urlparse

    netloc = urlparse(homeserver).netloc
    if not netloc:
        # Without a scheme urlparse() puts the host into ``path`` (or even
        # ``scheme``); prefixing '//' makes it parse as an authority.
        netloc = urlparse(f'//{homeserver}').netloc
    return netloc or homeserver


# Command-line interface. Parsed at import time; the coroutines below read
# the resulting module-level ``args``.
parser = argparse.ArgumentParser(description='Test federation between two homeservers.')
parser.add_argument('--bot1-user', required=True, help='User ID for bot 1.')
parser.add_argument('--bot1-pw', required=True, help='Password for bot 1.')
parser.add_argument('--bot1-hs', required=True, help='Homeserver for bot 1.')
parser.add_argument('--bot1-auth-file', help="File to cache the bot's login details to.")
parser.add_argument('--bot2-user', required=True, help='User ID for bot 2.')
parser.add_argument('--bot2-pw', required=True, help='Password for bot 2.')
parser.add_argument('--bot2-hs', required=True, help='Homeserver for bot 2.')
parser.add_argument('--bot2-auth-file', help="File to cache the bot's login details to.")
parser.add_argument('--timeout', type=float, default=90, help='Request timeout limit.')
parser.add_argument('--warn', type=float, default=2.0, help='Manually set warn level.')
parser.add_argument('--crit', type=float, default=2.5, help='Manually set critical level.')
args = parser.parse_args()

# Host parts of the two homeserver URLs, used in status lines and perfdata labels.
bot1_hs_domain = _homeserver_domain(args.bot1_hs)
bot2_hs_domain = _homeserver_domain(args.bot2_hs)
async def test_one_direction(sender_client, receiver_client, receiver_user_id):
    """Send one message from the sender to the receiver and time its arrival.

    The sender creates a fresh room and invites the receiver, the receiver
    joins it, the sender posts a JSON payload, and the receiver then polls
    for that event by ID until it can be fetched or ``args.timeout`` seconds
    have passed.

    Args:
        sender_client: Logged-in client object used to create the room and
            send the message (``room_create`` / ``room_send``).
        receiver_client: Logged-in client object used to join the room and
            fetch the event (``join`` / ``room_get_event``).
        receiver_user_id: User ID of the receiver, passed as the invitee.

    Returns:
        A 3-tuple whose shape depends on the outcome:

        * success: ``(elapsed_seconds, nagios.OK, room_id)`` where
          ``elapsed_seconds`` is a float. The room is NOT left here; the
          caller is expected to clean it up.
        * failure: ``(message, nagios_code, leave_failures)`` where
          ``leave_failures`` is a list of 2-tuples built from elements 1 and
          2 of each unsuccessful ``leave_room_async`` result (empty when room
          creation itself failed).
    """
    # The sender creates the room and invites the receiver
    test_room_name = str(uuid4())
    new_test_room = await sender_client.room_create(name=test_room_name, invite=[receiver_user_id])
    if isinstance(new_test_room, RoomCreateError):
        return f'UNKNOWN: failed to create room "{new_test_room}"', nagios.UNKNOWN, []
    new_test_room_id = new_test_room.room_id

    async def _fail(message, code, *clients):
        """Leave the test room with each given client and build the failure tuple."""
        leave_failures = []
        for client in clients:
            outcome = await leave_room_async(new_test_room_id, client)
            # Element 0 is treated as the success flag; 1 and 2 are kept as details.
            if not outcome[0]:
                leave_failures.append((outcome[1], outcome[2]))
        return message, code, leave_failures

    # Non-blocking pause so the event loop stays responsive (was time.sleep()).
    await asyncio.sleep(2)

    # The receiver joins via invite
    join_started = time.monotonic()
    while True:
        resp = await receiver_client.join(new_test_room_id)
        if isinstance(resp, JoinResponse):
            break
        if isinstance(resp, JoinError):
            return await _fail(f'UNKNOWN: failed to join room "{vars(resp)}"', nagios.UNKNOWN, sender_client)
        if time.monotonic() - join_started >= args.timeout:
            return await _fail('UNKNOWN: failed to join room, timeout.', nagios.UNKNOWN, sender_client)

    # NOTE(review): indentation was lost in the reviewed copy; this pause is
    # assumed to follow the join loop rather than sit inside it - confirm.
    await asyncio.sleep(2)

    # Sender sends the msg to room
    # A monotonic clock is used for the measurement so wall-clock adjustments
    # (NTP steps, DST) cannot distort or negate the reported latency.
    send_started = time.monotonic()
    msg = {'id': str(uuid4()), 'ts': datetime.now().microsecond}
    resp = await sender_client.room_send(new_test_room_id, 'm.room.message', {'body': json.dumps(msg), 'msgtype': 'm.room.message'})
    if isinstance(resp, RoomSendError):
        return await _fail(f'UNKNOWN: failed to send message "{resp}"', nagios.UNKNOWN, sender_client, receiver_client)
    msg_event_id = resp.event_id

    # Receiver polls for the message by event ID until it is retrievable
    poll_started = time.monotonic()
    while True:
        resp = await receiver_client.room_get_event(new_test_room_id, msg_event_id)
        if isinstance(resp, RoomGetEventResponse):
            received_at = time.monotonic()
            recv_msg = json.loads(resp.event.source['content']['body'])
            break
        if time.monotonic() - poll_started >= args.timeout:
            return await _fail("CRITICAL: timeout - receiver did not recieve the sender's message.", nagios.CRITICAL, sender_client, receiver_client)

    # Double check everything makes sense
    if msg != recv_msg:
        return await _fail("CRITICAL: sender's message did not match the receiver's.", nagios.CRITICAL, sender_client, receiver_client)

    # Time from just before the send until the receiver could fetch the event
    bot1_msg_delta = received_at - send_started
    return bot1_msg_delta, nagios.OK, new_test_room_id
2023-04-21 23:54:16 -06:00
async def main() -> None:
    """Run the federation check in both directions and exit with a Nagios code.

    Logs both bots in, runs ``test_one_direction`` bot1 -> bot2 and then
    bot2 -> bot1, leaves the rooms that were created, compares the measured
    latencies against ``args.warn`` / ``args.crit``, prints the status lines
    followed by perfdata, and terminates the process via ``sys.exit`` with
    the most severe status seen.

    Raises:
        SystemExit: always, carrying the final Nagios status code.
    """
    bot1 = await login_and_cache(args.bot1_user, args.bot1_pw, args.bot1_hs, args.bot1_auth_file, request_timeout=args.timeout)
    bot2 = await login_and_cache(args.bot2_user, args.bot2_pw, args.bot2_hs, args.bot2_auth_file, request_timeout=args.timeout)

    try:
        bot1_output_msg, bot1_output_code, bot1_extra = await test_one_direction(bot1, bot2, args.bot2_user)
        bot2_output_msg, bot2_output_code, bot2_extra = await test_one_direction(bot2, bot1, args.bot1_user)

        # The third element is a room ID only when the direction succeeded;
        # on failure it is the list of leave failures already collected by
        # test_one_direction, which must be reported, not used as a room ID.
        leave_failures = []
        rooms_to_leave = []
        for code, extra in ((bot1_output_code, bot1_extra), (bot2_output_code, bot2_extra)):
            if code == nagios.OK:
                rooms_to_leave.append(extra)
            else:
                leave_failures.extend(extra)

        # Clean up: both bots leave every room that is still open
        for client in (bot1, bot2):
            for room_id in rooms_to_leave:
                outcome = await leave_room_async(room_id, client)
                if not outcome[0]:
                    leave_failures.append((outcome[1], outcome[2]))

        bot1_leave_all_failures = await leave_all_rooms_async(bot1, exclude_starting_with='_PERM_')
        bot2_leave_all_failures = await leave_all_rooms_async(bot2, exclude_starting_with='_PERM_')
    finally:
        # Close the clients even when a check raised, so no session is leaked.
        await bot1.close()
        await bot2.close()

    nagios_output = nagios.OK
    prints = []
    perfdata = []

    def escalate(code):
        """Raise the overall status to ``code`` if it is more severe."""
        nonlocal nagios_output
        if nagios_output < code:
            nagios_output = code

    def report_latency(seconds, arrow, perf_label):
        """Grade one latency against the thresholds and record its output."""
        seconds = round(seconds, 2)
        if seconds >= args.crit:
            label, code = 'CRITICAL', nagios.CRITICAL
        elif seconds >= args.warn:
            label, code = 'WARNING', nagios.WARNING
        else:
            label, code = 'OK', nagios.OK
        escalate(code)
        prints.append(f'{label}: {bot1_hs_domain} {arrow} {bot2_hs_domain} is {seconds} seconds.')
        perfdata.append(f"'{perf_label}'={seconds}s;;;")

    # Failure messages from either direction come first
    if bot1_output_code != nagios.OK:
        prints.append(bot1_output_msg)
        escalate(bot1_output_code)
    if bot2_output_code != nagios.OK:
        prints.append(bot2_output_msg)
        escalate(bot2_output_code)

    # A float means the direction produced a latency; anything else is an
    # error message and must not be emitted as a perfdata value.
    # bot1 -> bot2
    if isinstance(bot1_output_msg, float):
        report_latency(bot1_output_msg, '->', f'{bot1_hs_domain}_outbound')
    # bot2 -> bot1
    if isinstance(bot2_output_msg, float):
        report_latency(bot2_output_msg, '<-', f'{bot1_hs_domain}_inbound')

    if leave_failures:
        prints.append('=================================')
        prints.append('WARN: a bot failed to leave a room:')
        for err in leave_failures:
            prints.append(err)
        escalate(nagios.WARNING)

    for bot_name, results in (('bot1', bot1_leave_all_failures), ('bot2', bot2_leave_all_failures)):
        header_printed = False
        for err in results:
            # Element 0 is treated as the success flag of each leave attempt.
            if not err[0]:
                if not header_printed:
                    prints.append('=================================')
                    prints.append(f'WARN: {bot_name} failed to leave room:')
                    header_printed = True
                prints.append(err)
                escalate(nagios.WARNING)

    for line in prints:
        print(f'\n{line}', end=' ')
    # Perfdata follows the pipe; omit it entirely when nothing was measured.
    print(('|' + ' '.join(perfdata)) if perfdata else '')

    sys.exit(nagios_output)
if __name__ == "__main__":
    # Top-level boundary: any unexpected error is reported to the monitoring
    # system as UNKNOWN together with its traceback, instead of crashing.
    try:
        asyncio.run(main())
    except Exception as exc:
        trace_text = traceback.format_exc()
        print(f"UNKNOWN: exception\n{exc}")
        print(trace_text)
        sys.exit(nagios.UNKNOWN)