icinga2-checks/check_matrix_media_cdn.py

259 lines
11 KiB
Python
Executable File

#!/usr/bin/env python3
import argparse
import asyncio
import json
import os
import sys
import tempfile
import time
import traceback
import urllib
import numpy as np
import requests
from PIL import Image
from nio import AsyncClient, AsyncClientConfig, LoginResponse, RoomSendError
from urllib3.exceptions import InsecureRequestWarning
from checker import nagios
from checker.synapse_client import send_image, write_login_details_to_disk
def verify_media_header(header: str, header_dict: dict, good_value: str = None, warn_value: str = None, critical_value: str = None):
    """
    Check one HTTP response header and return a Nagios-style result.

    Header names are matched case-insensitively. Header values are compared
    as exact strings.

    If you don't specify good_value then the header is only checked for
    existence. warn_value and critical_value are accepted so existing callers
    keep working, but they are not evaluated.

    :param header: name of the header to look for.
    :param header_dict: mapping of response header names to their values.
    :param good_value: if set (truthy), the header value must equal this.
    :param warn_value: unused.
    :param critical_value: unused.
    :return: tuple of (message, nagios status code).
    """
    # Header names are case-insensitive, so normalise both sides of the lookup.
    header = header.lower()
    lowered_headers = {k.lower(): v for k, v in header_dict.items()}
    raw_value = lowered_headers.get(header)

    # Test for absence BEFORE converting to a string: str(None) is the truthy
    # string "None", which would make a missing header look like a present one.
    if raw_value is None or str(raw_value) == '':
        return f'CRITICAL: missing header\n"{header}"', nagios.CRITICAL
    header_value = str(raw_value)

    if good_value:
        good_value = str(good_value)
        if header_value == good_value:
            return f'OK: {header}: "{header_value}"', nagios.OK
        return f'CRITICAL: {header} is not "{good_value}", is "{header_value}"', nagios.CRITICAL

    # No expected value was given, so being present is enough.
    return f'OK: {header} is present', nagios.OK
def _parse_required_headers(raw_headers):
    """Turn the raw --required-headers values into (name, expected_value) pairs."""
    if not raw_headers:
        # The option was omitted (None) or given without values.
        return []
    # Icinga may pass the values as one string
    if len(raw_headers) == 1:
        raw_headers = raw_headers[0].split(' ')
    pairs = []
    for item in raw_headers:
        if not item:
            # Repeated spaces in the single-string form produce empty items.
            continue
        # Split on the first '=' only so the expected value may contain '='.
        # An item without '=' yields an empty value, i.e. a presence-only check.
        name, _, expected = item.partition('=')
        pairs.append((name, expected))
    return pairs


async def main(args) -> None:
    """
    Upload a random test image to a Matrix room and check how it is served.

    Steps, in order:
      1. Log in (or reuse the credentials cached in args.auth_file) and join args.room.
      2. Generate a random PNG, send it to the room and resolve its mxc:// URL to HTTP.
      3. Request the file up to args.retries times until every required header matches.
      4. Check the status code, the redirect behaviour and the required headers.
      5. Redact the event, delete the temp file, purge the user's media, print the
         report and exit with the resulting Nagios status code.

    :param args: parsed command-line namespace (see the argument parser in this file).
    :return: never returns normally; always ends in sys.exit().
    """
    # `import urllib` alone does not guarantee that the `parse` submodule is loaded.
    from urllib.parse import urlparse

    exit_code = nagios.OK

    async def cleanup(client, test_image_path, image_event_id=None):
        """Remove everything the check created; return a warning message if the media purge failed."""
        nonlocal exit_code
        # Clean up
        if image_event_id:
            await client.room_redact(args.room, image_event_id)
        os.remove(test_image_path)
        await client.close()

        requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning)
        try:
            # NOTE(review): TLS verification is disabled for the admin endpoint - confirm this is intended.
            r = requests.delete(
                f'{args.admin_endpoint}/_synapse/admin/v1/users/{args.user}/media',
                headers={'Authorization': f'Bearer {client.access_token}'},
                verify=False,
                timeout=args.timeout,
            )
            if r.status_code == 200:
                return None
            failure = r.text
        except Exception as e:
            failure = e
        # A failed purge only raises an otherwise-OK result to WARNING;
        # it must never mask a more severe status that was already recorded.
        if exit_code == nagios.OK:
            exit_code = nagios.WARNING
        return f"WARN: failed to purge media for this user.\n{failure}"

    client_config = AsyncClientConfig(request_timeout=args.timeout, max_timeout_retry_wait_time=10)
    if args.auth_file and os.path.exists(args.auth_file):
        # The config file exists, so we'll use the stored credentials
        with open(args.auth_file, "r") as f:
            config = json.load(f)
        # Pass the same client config so the request timeout also applies here.
        client = AsyncClient(config["homeserver"], config=client_config)
        client.access_token = config["access_token"]
        client.user_id = config["user_id"]
        client.device_id = config["device_id"]
    else:
        # If there are no previously-saved credentials, we'll use the password
        client = AsyncClient(args.hs, args.user, config=client_config)
        resp = await client.login(args.pw)
        # check that we logged in successfully
        if not isinstance(resp, LoginResponse):
            await client.close()
            print(f'CRITICAL: failed to log in.\n{resp}')
            sys.exit(nagios.CRITICAL)
        if args.auth_file:
            write_login_details_to_disk(resp, args.hs, args.auth_file)

    await client.join(args.room)

    # Create a random image
    imarray = np.random.rand(100, 100, 3) * 255
    im = Image.fromarray(imarray.astype('uint8')).convert('RGBA')
    # Ask mkstemp for the final name and close its descriptor, so no stray
    # suffix-less temp file or open file descriptor is left behind.
    fd, test_image_path = tempfile.mkstemp(suffix='.png')
    os.close(fd)
    im.save(test_image_path)

    # Send the image and get the event ID
    image_event_id = (await send_image(client, args.room, test_image_path))
    if isinstance(image_event_id, RoomSendError):
        await cleanup(client, test_image_path)
        print(f'CRITICAL: failed to send message.\n{image_event_id}')
        sys.exit(nagios.CRITICAL)
    image_event_id = image_event_id.event_id

    # Get the event
    image_event = (await client.room_get_event(args.room, image_event_id)).event

    # convert mxc:// to http://
    target_file_url = await client.mxc_to_http(image_event.url)

    required_headers = _parse_required_headers(args.required_headers)

    # Check the file. Ignore the non-async thing here, it doesn't matter in this situation.
    # Remember: Cloudflare does not cache non-GET requests.
    start = time.time()
    retried = 0
    # Always make at least one attempt so `r` and `headers` are defined below.
    for i in range(max(1, args.retries)):
        if i % 5 == 0 and i != 0:
            r = requests.get(target_file_url, allow_redirects=False, timeout=args.timeout)
        else:
            r = requests.head(target_file_url, allow_redirects=False, timeout=args.timeout)
        # Lower-case the names: a plain dict loses the case-insensitive lookup
        # of the response headers, and the checks below look for 'location'.
        headers = {k.lower(): v for k, v in r.headers.items()}
        if all(verify_media_header(key, headers, good_value=value)[1] == nagios.OK for key, value in required_headers):
            break
        time.sleep(1)
        retried += 1
    end = time.time()

    if args.ignore_first_attempt and retried > 0:
        retried -= 1

    prints = []

    if r.status_code != 200 and not args.media_cdn_redirect:
        await cleanup(client, test_image_path, image_event_id=image_event_id)
        print(f'CRITICAL: status code for CDN image is "{r.status_code}"')
        print(f'Hosted URL is {r.url}')
        sys.exit(nagios.CRITICAL)
    prints.append(f'OK: status code for CDN image is "{r.status_code}"')

    # Check domain
    if args.media_cdn_redirect:
        if 'location' in headers:
            domain = urlparse(headers['location']).netloc
            if domain != args.check_domain:
                exit_code = nagios.CRITICAL
                prints.append(f'CRITICAL: redirect to media CDN domain is "{domain}"')
            else:
                prints.append(f'OK: media CDN domain is "{domain}"')
        else:
            exit_code = nagios.CRITICAL
            prints.append('CRITICAL: was not redirected to the media CDN domain.')

        # Make sure we aren't redirected if we're a Synapse server
        test = requests.head(target_file_url, headers={'User-Agent': 'Synapse/1.77.3'}, allow_redirects=False, timeout=args.timeout)
        if test.status_code != 200:
            # list.append() takes exactly one argument, so build a single message.
            prints.append(f'CRITICAL: Synapse user-agent is redirected with status code {test.status_code}')
            exit_code = nagios.CRITICAL
        else:
            prints.append('OK: Synapse user-agent is not redirected.')
    else:
        if 'location' in headers:
            exit_code = nagios.CRITICAL
            prints.append(f"CRITICAL: recieved 301 to {urlparse(headers['location']).netloc}")
        else:
            prints.append('OK: was not redirected.')

    # Report every required header and keep the most severe status seen.
    for key, value in required_headers:
        header_chk, code = verify_media_header(key, headers, good_value=value)
        prints.append(header_chk)
        if code > exit_code:
            exit_code = code

    clean_msg = await cleanup(client, test_image_path, image_event_id=image_event_id)

    if exit_code == nagios.OK:
        print('OK: media CDN is good.')
    elif exit_code == nagios.UNKNOWN:
        print('UNKNOWN: media CDN is bad.')
    elif exit_code == nagios.WARNING:
        print('WARNING: media CDN is bad.')
    elif exit_code == nagios.CRITICAL:
        print('CRITICAL: media CDN is bad.')
    else:
        raise Exception('No exit code matched')

    for msg in prints:
        print(msg)

    if clean_msg:
        print(clean_msg)

    # Timing and performance data are only reported for a fully OK result.
    if exit_code == nagios.OK:
        print(f'Took {int(end - start)} seconds to fetch the uploaded image with {retried} retries. | retries={retried}')

    sys.exit(exit_code)
if __name__ == "__main__":
    # Command-line options as (flag, keyword arguments), in the order they are registered.
    cli_options = (
        ('--user', dict(required=True, help='User ID for the bot.')),
        ('--pw', dict(required=True, help='Password for the bot.')),
        ('--hs', dict(required=True, help='Homeserver of the bot.')),
        ('--admin-endpoint', dict(required=True, help='Admin endpoint that will be called to purge media for this user.')),
        ('--room', dict(required=True, help='The room the bot should send its test messages in.')),
        ('--check-domain', dict(required=True, help='The domain that should be present.')),
        ('--media-cdn-redirect', dict(default='true', help='If set, the server must respond with a redirect to the media CDN domain.')),
        ('--required-headers', dict(nargs='*', help="If these headers aren't set to the correct value, critical. Use the format 'key=value")),
        ('--auth-file', dict(help="File to cache the bot's login details to.")),
        ('--retries', dict(type=int, default=11, help="It may take a few seconds for Synapse to send the uploaded file to S3. Retry this many times with 1 second interval between.")),
        ('--timeout', dict(type=float, default=90, help='Request timeout limit.')),
        ('--warn', dict(type=float, default=2.0, help='Manually set warn level.')),
        ('--crit', dict(type=float, default=2.5, help='Manually set critical level.')),
        ('--ignore-first-attempt', dict(action='store_true', help='Ignore the first attempt at fetching the image from the media CDN because the server may not have cached the file if it has never been requested before.')),
    )

    parser = argparse.ArgumentParser(description='')
    for flag, options in cli_options:
        parser.add_argument(flag, **options)
    args = parser.parse_args()

    # --media-cdn-redirect arrives as text; only the exact strings
    # 'true' and 'false' are accepted, anything else is UNKNOWN.
    redirect_expected = {'true': True, 'false': False}.get(args.media_cdn_redirect)
    if redirect_expected is None:
        print('UNKNOWN: could not parse the value for --media-cdn-redirect')
        sys.exit(nagios.UNKNOWN)
    args.media_cdn_redirect = redirect_expected

    # Any unexpected error is reported as UNKNOWN, with the traceback for debugging.
    try:
        asyncio.run(main(args))
    except Exception as e:
        print(f'UNKNOWN: exception\n{e}')
        traceback.print_exc()
        sys.exit(nagios.UNKNOWN)