fix bugs, improvements

This commit is contained in:
Cyberes 2023-04-21 23:54:17 -06:00
parent df52844870
commit 23fb350116
4 changed files with 178 additions and 79 deletions

View File

@ -5,6 +5,7 @@ import json
import os
import sys
import time
import traceback
import urllib
from datetime import datetime
from uuid import uuid4
@ -12,6 +13,7 @@ from uuid import uuid4
from nio import AsyncClient, AsyncClientConfig, JoinError, JoinResponse, LoginResponse, RoomCreateError, RoomGetEventResponse, RoomSendError
import checker.nagios as nagios
from checker.synapse_client import leave_all_rooms_async, leave_room_async
parser = argparse.ArgumentParser(description='Test federation between two homeservers.')
parser.add_argument('--bot1-user', required=True, help='User ID for bot 1.')
@ -53,7 +55,7 @@ async def test_one_direction(sender_client, receiver_client, receiver_user_id):
test_room_name = str(uuid4())
new_test_room = await sender_client.room_create(name=test_room_name, invite=[receiver_user_id])
if isinstance(new_test_room, RoomCreateError):
return f'UNKNOWN: failed to create room "{new_test_room}"', nagios.UNKNOWN
return f'UNKNOWN: failed to create room "{new_test_room}"', nagios.UNKNOWN, []
new_test_room_id = new_test_room.room_id
time.sleep(2)
@ -65,9 +67,19 @@ async def test_one_direction(sender_client, receiver_client, receiver_user_id):
if isinstance(resp, JoinResponse):
break
elif isinstance(resp, JoinError):
return f'UNKNOWN: failed to join room "{vars(resp)}"', nagios.UNKNOWN
leave = [await leave_room_async(new_test_room_id, sender_client)]
leave_failures = []
for event in leave:
if not event[0]:
leave_failures.append((event[1], event[2]))
return f'UNKNOWN: failed to join room "{vars(resp)}"', nagios.UNKNOWN, leave_failures
if (datetime.now() - timeout_start).total_seconds() >= args.timeout:
return 'UNKNOWN: failed to join room, timeout.', nagios.UNKNOWN
leave = [await leave_room_async(new_test_room_id, sender_client)]
leave_failures = []
for event in leave:
if not event[0]:
leave_failures.append((event[1], event[2]))
return 'UNKNOWN: failed to join room, timeout.', nagios.UNKNOWN, leave_failures
time.sleep(2)
@ -76,14 +88,12 @@ async def test_one_direction(sender_client, receiver_client, receiver_user_id):
msg = {'id': str(uuid4()), 'ts': send_msg_time.microsecond}
resp = (await sender_client.room_send(new_test_room_id, 'm.room.message', {'body': json.dumps(msg), 'msgtype': 'm.room.message'}))
if isinstance(resp, RoomSendError):
await sender_client.room_leave(new_test_room_id)
time.sleep(1)
await sender_client.room_forget(new_test_room_id)
time.sleep(1)
await receiver_client.room_leave(new_test_room_id)
time.sleep(1)
await receiver_client.room_forget(new_test_room_id)
return f'UNKNOWN: failed to send message "{resp}', nagios.UNKNOWN
leave = [await leave_room_async(new_test_room_id, sender_client), await leave_room_async(new_test_room_id, receiver_client)]
leave_failures = []
for event in leave:
if not event[0]:
leave_failures.append((event[1], event[2]))
return f'UNKNOWN: failed to send message "{resp}', nagios.UNKNOWN, leave_failures
msg_event_id = resp.event_id
# Sender watches for the message
@ -95,39 +105,26 @@ async def test_one_direction(sender_client, receiver_client, receiver_user_id):
recv_msg = json.loads(resp.event.source['content']['body'])
break
if (datetime.now() - start_check).total_seconds() >= args.timeout:
await sender_client.room_leave(new_test_room_id)
time.sleep(1)
await sender_client.room_forget(new_test_room_id)
time.sleep(1)
await receiver_client.room_leave(new_test_room_id)
time.sleep(1)
await receiver_client.room_forget(new_test_room_id)
return "CRITICAL: timeout - receiver did not recieve the sender's message.", nagios.CRITICAL
leave = [await leave_room_async(new_test_room_id, sender_client), await leave_room_async(new_test_room_id, receiver_client)]
leave_failures = []
for event in leave:
if not event[0]:
leave_failures.append((event[1], event[2]))
return "CRITICAL: timeout - receiver did not recieve the sender's message.", nagios.CRITICAL, leave_failures
# Double check everything makes sense
if not msg == recv_msg:
await sender_client.room_leave(new_test_room_id)
time.sleep(1)
await sender_client.room_forget(new_test_room_id)
time.sleep(1)
await receiver_client.room_leave(new_test_room_id)
time.sleep(1)
await receiver_client.room_forget(new_test_room_id)
return "CRITICAL: sender's message did not match the receiver's.", nagios.CRITICAL
leave = [await leave_room_async(new_test_room_id, sender_client), await leave_room_async(new_test_room_id, receiver_client)]
leave_failures = []
for event in leave:
if not event[0]:
leave_failures.append((event[1], event[2]))
return "CRITICAL: sender's message did not match the receiver's.", nagios.CRITICAL, leave_failures
# Calculate the time it took to receive the message, including sync
bot1_msg_delta = (recv_msg_time - send_msg_time).total_seconds()
# Clean up the rooms
await sender_client.room_leave(new_test_room_id)
time.sleep(1)
await sender_client.room_forget(new_test_room_id)
time.sleep(1)
await receiver_client.room_leave(new_test_room_id)
time.sleep(1)
await receiver_client.room_forget(new_test_room_id)
return bot1_msg_delta, nagios.OK
return bot1_msg_delta, nagios.OK, new_test_room_id
async def login(user_id, passwd, homeserver, config_file=None):
@ -160,16 +157,29 @@ async def main() -> None:
bot1 = await login(args.bot1_user, args.bot1_pw, args.bot1_hs, args.bot1_auth_file)
bot2 = await login(args.bot2_user, args.bot2_pw, args.bot2_hs, args.bot2_auth_file)
bot1_output_msg, bot1_output_code = await test_one_direction(bot1, bot2, args.bot2_user)
bot2_output_msg, bot2_output_code = await test_one_direction(bot2, bot1, args.bot1_user)
bot1_output_msg, bot1_output_code, bot1_new_room_id = await test_one_direction(bot1, bot2, args.bot2_user)
bot2_output_msg, bot2_output_code, bot2_new_room_id = await test_one_direction(bot2, bot1, args.bot1_user)
# Clean up
leave = [await leave_room_async(bot1_new_room_id, bot1), await leave_room_async(bot2_new_room_id, bot1), await leave_room_async(bot1_new_room_id, bot2), await leave_room_async(bot2_new_room_id, bot2)]
leave_failures = []
for event in leave:
if not event[0]:
leave_failures.append((event[1], event[2]))
bot1_leave_all_failures = await leave_all_rooms_async(bot1, exclude_starting_with='_PERM_')
bot2_leave_all_failures = await leave_all_rooms_async(bot2, exclude_starting_with='_PERM_')
await bot1.close()
await bot2.close()
nagios_output = nagios.OK
prints = []
if bot1_output_code != nagios.OK:
print(bot1_output_msg)
prints.append(bot1_output_msg)
nagios_output = bot1_output_code
if bot2_output_code != nagios.OK:
print(bot2_output_msg)
prints.append(bot2_output_msg)
if nagios_output < bot2_output_code:
# Only set the code if our code is more severe
nagios_output = bot2_output_code
@ -180,13 +190,13 @@ async def main() -> None:
if bot1_output_msg >= args.crit:
if nagios_output < nagios.CRITICAL:
nagios_output = nagios.CRITICAL
print('CRITICAL:', bot1_hs_domain, '->', bot2_hs_domain, 'is', bot1_output_msg, 'seconds.')
prints.append(f'CRITICAL: {bot1_hs_domain} -> {bot2_hs_domain} is {bot1_output_msg} seconds.')
elif bot1_output_msg >= args.warn:
if nagios_output < nagios.WARNING:
nagios_output = nagios.WARNING
print('WARNING:', bot1_hs_domain, '->', bot2_hs_domain, 'is', bot1_output_msg, 'seconds.')
prints.append(f'WARNING: {bot1_hs_domain} -> {bot2_hs_domain} is {bot1_output_msg} seconds.')
else:
print('OK:', bot1_hs_domain, '->', bot2_hs_domain, 'is', bot1_output_msg, 'seconds.')
prints.append(f'OK: {bot1_hs_domain} -> {bot2_hs_domain} is {bot1_output_msg} seconds.')
# bot2 -> bot1
if isinstance(bot2_output_msg, float):
@ -194,17 +204,47 @@ async def main() -> None:
if bot2_output_msg >= args.crit:
if nagios_output < nagios.CRITICAL:
nagios_output = nagios.CRITICAL
print('CRITICAL:', bot1_hs_domain, '<-', bot2_hs_domain, 'is', bot2_output_msg, 'seconds.')
prints.append(f'CRITICAL: {bot1_hs_domain} <- {bot2_hs_domain} is {bot2_output_msg} seconds.')
elif bot2_output_msg >= args.warn:
if nagios_output < nagios.WARNING:
nagios_output = nagios.WARNING
print('WARNING:', bot1_hs_domain, '<-', bot2_hs_domain, 'is', bot2_output_msg, 'seconds.')
prints.append(f'WARNING: {bot1_hs_domain} <- {bot2_hs_domain} is {bot2_output_msg} seconds.')
else:
print('OK:', bot1_hs_domain, '<-', bot2_hs_domain, 'is', bot2_output_msg, 'seconds.')
prints.append(f'OK: {bot1_hs_domain} <- {bot2_hs_domain} is {bot2_output_msg} seconds.')
# Clean up
await bot1.close()
await bot2.close()
if len(leave_failures):
prints.append('=================================')
prints.append('WARN: a bot failed to leave a room:')
for err in leave_failures:
prints.append(err)
if nagios_output < nagios.WARNING:
nagios_output = nagios.WARNING
bot1_leave_warned = False
for err in bot1_leave_all_failures:
if not err[0]:
if not bot1_leave_warned:
prints.append('=================================')
prints.append('WARN: bot1 failed to leave room:')
bot1_leave_warned = True
prints.append(err)
if nagios_output < nagios.WARNING:
nagios_output = nagios.WARNING
bot2_leave_warned = False
for err in bot2_leave_all_failures:
if not err[0]:
if not bot2_leave_warned:
prints.append('=================================')
prints.append('WARN: bot2 failed to leave room:')
bot2_leave_warned = True
prints.append(err)
if nagios_output < nagios.WARNING:
nagios_output = nagios.WARNING
for x in prints:
print(f'\n{x}', end=' ')
print(f"|'{bot1_hs_domain}_outbound'={bot1_output_msg}s;;; '{bot1_hs_domain}_inbound'={bot2_output_msg}s;;;")
sys.exit(nagios_output)
@ -213,5 +253,6 @@ if __name__ == "__main__":
try:
asyncio.run(main())
except Exception as e:
print(f"UNKNOWN: exception '{e}'")
print(f"UNKNOWN: exception\n{e}")
print(traceback.format_exc())
sys.exit(nagios.UNKNOWN)

View File

@ -2,6 +2,7 @@
import argparse
import sys
import time
import traceback
import numpy as np
import requests
@ -37,6 +38,7 @@ def main():
sys.exit(nagios.OK)
except Exception as e:
print(f'UNKNOWN: failed to check avg. GC time "{e}"')
print(traceback.format_exc())
sys.exit(nagios.UNKNOWN)
elif args.type == 'response-time':
response_time_MAX = 1 if not args.crit else args.crit
@ -49,6 +51,7 @@ def main():
response = requests.post(args.synapse_server, timeout=timeout, verify=False)
except Exception as e:
print(f'UNKNOWN: failed to ping endpoint "{e}"')
print(traceback.format_exc())
sys.exit(nagios.UNKNOWN)
request_time = time.perf_counter() - start
response_times.append(np.round(request_time, 2))
@ -62,6 +65,7 @@ def main():
sys.exit(nagios.OK)
except Exception as e:
print(f'UNKNOWN: failed to check response time "{e}"')
print(traceback.format_exc())
sys.exit(nagios.UNKNOWN)
elif args.type == 'outgoing-http-rate':
# outgoing req/sec
@ -82,6 +86,7 @@ def main():
sys.exit(nagios.OK)
except Exception as e:
print(f'UNKNOWN: failed to check outgoing HTTP request rate "{e}"')
print(traceback.format_exc())
sys.exit(nagios.UNKNOWN)
elif args.type == 'avg-send':
# Average send time in seconds
@ -96,6 +101,7 @@ def main():
sys.exit(nagios.OK)
except Exception as e:
print(f'UNKNOWN: failed to check average message send time "{e}"')
print(traceback.format_exc())
sys.exit(nagios.UNKNOWN)
elif args.type == 'db-lag':
# in seconds
@ -110,6 +116,7 @@ def main():
sys.exit(nagios.OK)
except Exception as e:
print(f'UNKNOWN: failed to check DB lag "{e}"')
print(traceback.format_exc())
sys.exit(nagios.UNKNOWN)
else:
print('Wrong type')
@ -121,7 +128,5 @@ if __name__ == "__main__":
main()
except Exception as e:
print(f'UNKNOWN: exception "{e}"')
import traceback
print(traceback.format_exc())
sys.exit(nagios.UNKNOWN)

View File

@ -5,6 +5,7 @@ import json
import os
import sys
import tempfile
import traceback
import urllib
import numpy as np
@ -52,7 +53,7 @@ def verify_media_header(header: str, header_dict: dict, good_value: str = None,
warn_value = str(warn_value)
critical_value = str(critical_value)
if not header_value:
return f'CRITICAL: missing header "{header}"', nagios.CRITICAL
return f'CRITICAL: missing header\n"{header}"', nagios.CRITICAL
if good_value:
good_value = str(good_value)
@ -68,8 +69,10 @@ def verify_media_header(header: str, header_dict: dict, good_value: str = None,
async def main() -> None:
exit_code = nagios.OK
async def cleanup(client, test_image_path, image_event_id=None):
global exit_code
nonlocal exit_code
# Clean up
if image_event_id:
await client.room_redact(args.room, image_event_id)
@ -82,11 +85,13 @@ async def main() -> None:
if r.status_code != 200:
if nagios.WARNING < exit_code:
exit_code = nagios.WARNING
print(f"WARN: failed to purge media for this user, request failed with '{r.text}'")
return f"WARN: failed to purge media for this user.\n{r.text}"
else:
return None
except Exception as e:
if nagios.WARNING < exit_code:
exit_code = nagios.WARNING
print(f"WARN: failed to purge media for this user '{e}'")
return f"WARN: failed to purge media for this user.\n{e}"
client = AsyncClient(args.hs, args.user, config=AsyncClientConfig(request_timeout=args.timeout, max_timeout_retry_wait_time=10))
if args.auth_file:
@ -98,8 +103,8 @@ async def main() -> None:
if isinstance(resp, LoginResponse):
write_login_details_to_disk(resp, args.hs, args.auth_file)
else:
print(f'UNKNOWN: failed to log in "{resp}"')
sys.exit(nagios.UNKNOWN)
print(f'CRITICAL: failed to log in.\n{resp}')
sys.exit(nagios.CRITICAL)
else:
# Otherwise the config file exists, so we'll use the stored credentials
with open(args.auth_file, "r") as f:
@ -124,8 +129,8 @@ async def main() -> None:
image_event_id = (await send_image(client, args.room, test_image_path))
if isinstance(image_event_id, RoomSendError):
await cleanup(client, test_image_path)
print(f'UNKNOWN: failed to send message "{image_event_id}"')
sys.exit(nagios.UNKNOWN)
print(f'CRITICAL: failed to send message.\n{image_event_id}')
sys.exit(nagios.CRITICAL)
image_event_id = image_event_id.event_id
# Get the event
@ -138,43 +143,43 @@ async def main() -> None:
# matter in this situation.
r = requests.head(target_file_url, allow_redirects=False)
prints = []
if r.status_code != 200 and not args.media_cdn_redirect:
await cleanup(client, test_image_path, image_event_id=image_event_id)
print(f'CRITICAL: status code was "{r.status_code}"')
prints.append(f'CRITICAL: status code is "{r.status_code}"')
sys.exit(nagios.CRITICAL)
else:
print(f'OK: status code was "{r.status_code}"')
prints.append(f'OK: status code is "{r.status_code}"')
headers = dict(r.headers)
exit_code = nagios.OK
# Check domain
if args.media_cdn_redirect:
if 'location' in headers:
domain = urllib.parse.urlparse(headers['location']).netloc
if domain != args.check_domain:
exit_code = nagios.CRITICAL
print(f'CRITICAL: redirect to media CDN domain is "{domain}"')
prints.append(f'CRITICAL: redirect to media CDN domain is "{domain}"')
else:
print(f'OK: media CDN domain is "{domain}"')
prints.append(f'OK: media CDN domain is "{domain}"')
else:
exit_code = nagios.CRITICAL
print(f'CRITICAL: was not redirected to the media CDN domain.')
prints.append(f'CRITICAL: was not redirected to the media CDN domain.')
# Make sure we aren't redirected if we're a Synapse server
test = requests.head(target_file_url, headers={'User-Agent': 'Synapse/1.77.3'}, allow_redirects=False)
if test.status_code != 200:
print('CRITICAL: Synapse user-agent was redirected with status code', test.status_code)
prints.append('CRITICAL: Synapse user-agent is redirected with status code', test.status_code)
exit_code = nagios.CRITICAL
else:
print(f'OK: Synapse user-agent is not redirected.')
prints.append(f'OK: Synapse user-agent is not redirected.')
else:
if 'location' in headers:
exit_code = nagios.CRITICAL
print(f"CRITICAL: recieved 301 to {urllib.parse.urlparse(headers['location']).netloc}")
prints.append(f"CRITICAL: recieved 301 to {urllib.parse.urlparse(headers['location']).netloc}")
else:
print(f'OK: was not redirected.')
prints.append(f'OK: is not redirected.')
if args.required_headers:
# Icinga may pass the values as one string
@ -183,17 +188,32 @@ async def main() -> None:
for item in args.required_headers:
key, value = item.split('=')
header_chk, code = verify_media_header(key, headers, good_value=value)
print(header_chk)
prints.append(header_chk)
if code > exit_code:
exit_code = code
results = [verify_media_header('synapse-media-local-status', headers), verify_media_header('synapse-media-s3-status', headers, good_value='200'), verify_media_header('synapse-media-server', headers, good_value='s3')]
for header_chk, code in results:
print(header_chk)
prints.append(header_chk)
if code > exit_code:
exit_code = code
await cleanup(client, test_image_path, image_event_id=image_event_id)
clean_msg = await cleanup(client, test_image_path, image_event_id=image_event_id)
if exit_code == nagios.OK:
print('OK: media CDN is good.')
elif exit_code == nagios.UNKNOWN:
print('UNKNOWN: media CDN is bad.')
elif exit_code == nagios.WARNING:
print('WARNING: media CDN is bad.')
elif exit_code == nagios.CRITICAL:
print('CRITICAL: media CDN is bad.')
for msg in prints:
print(msg)
if clean_msg:
print(clean_msg)
sys.exit(exit_code)
@ -201,8 +221,6 @@ if __name__ == "__main__":
try:
asyncio.run(main())
except Exception as e:
print(f'UNKNOWN: exception "{e}"')
import traceback
print(f'UNKNOWN: exception\n{e}')
print(traceback.format_exc())
sys.exit(nagios.UNKNOWN)

View File

@ -1,13 +1,15 @@
import asyncio
import copy
import json
import os
import sys
import time
import aiofiles.os
import magic
import markdown
from PIL import Image
from nio import AsyncClient, LoginResponse, RoomSendError, UploadResponse
from nio import AsyncClient, LoginResponse, RoomSendError, UploadResponse, MatrixRoom, RoomLeaveResponse, RoomForgetResponse
from . import nagios
@ -143,3 +145,36 @@ def login(user, pw, hs, auth_file, room):
return x, client
return asyncio.run(inner(user, pw, hs, auth_file, room))
async def leave_room_async(room_id, client):
    """Leave a room and then forget it, reporting whether both steps succeeded.

    Args:
        room_id: ID of the room to leave and forget.
        client: Object exposing awaitable ``room_leave`` and ``room_forget``
            methods (presumably a nio ``AsyncClient`` -- confirm against callers).

    Returns:
        A 3-tuple ``(ok, leave_resp, forget_resp)``. ``ok`` is True only when
        the leave call returned a ``RoomLeaveResponse`` and the forget call
        returned a ``RoomForgetResponse``. Both raw responses are handed back
        so the caller can report what went wrong.
    """
    leave_resp = await client.room_leave(room_id)
    # Pause between the two requests. This is awaited rather than time.sleep()
    # so the event loop is not blocked while we wait.
    await asyncio.sleep(1)
    forget_resp = await client.room_forget(room_id)
    ok = isinstance(leave_resp, RoomLeaveResponse) and isinstance(forget_resp, RoomForgetResponse)
    return ok, leave_resp, forget_resp
async def leave_all_rooms_async(client, exclude_starting_with=None):
    """Leave and forget every joined room, then every room with a pending invite.

    Args:
        client: Object exposing awaitable ``joined_rooms``, ``sync`` and
            ``close`` methods plus an ``invited_rooms`` mapping (presumably a
            nio ``AsyncClient`` -- confirm against callers).
        exclude_starting_with: Accepted for interface compatibility but
            currently NOT applied: no room is skipped, whatever its name.

    Returns:
        A list of ``(ok, leave_resp, forget_resp)`` tuples as produced by
        ``leave_room_async``, one per room processed, joined rooms first.

    Note:
        The client is closed before this function returns.
    """
    results = []

    joined = await client.joined_rooms()
    for room_id in joined.rooms:
        results.append(await leave_room_async(room_id, client))
        # Awaited pause between rooms so the event loop is not blocked.
        await asyncio.sleep(1)

    # Sync before reading client.invited_rooms (presumably this is what
    # populates the invite list -- TODO confirm).
    await client.sync()

    # Iterate over a snapshot: iterating the live mapping raised
    # "RuntimeError: dictionary changed size during iteration".
    invited_rooms = copy.copy(client.invited_rooms)
    for room in invited_rooms.values():
        # NOTE(review): this writes to stdout ahead of whatever the caller
        # prints; confirm that is intended for the check scripts' output.
        print(room.room_id)
        results.append(await leave_room_async(room.room_id, client))
        await asyncio.sleep(1)

    await client.close()
    return results
def leave_all_rooms(client, exclude_starting_with=None):
    """Blocking wrapper that runs ``leave_all_rooms_async`` to completion.

    Starts a fresh event loop via ``asyncio.run`` and returns whatever the
    coroutine returns.
    """
    pending = leave_all_rooms_async(client, exclude_starting_with)
    return asyncio.run(pending)