From df52844870ea2d7cdc6a42571335cb4287bf277f Mon Sep 17 00:00:00 2001 From: Cyberes Date: Fri, 21 Apr 2023 23:54:17 -0600 Subject: [PATCH] fix media cdn check, fix monitor bot check, add ignore and exclude services to icinga2kuma.py --- check_federation.py | 2 +- check_media_cdn.py | 105 ++++++++---- check_monitor_bot.py | 196 ++++++++++------------ checker/synapse_client.py | 2 +- Other/icinga-to-kuma.py => icinga2kuma.py | 57 ++++--- 5 files changed, 203 insertions(+), 159 deletions(-) rename Other/icinga-to-kuma.py => icinga2kuma.py (53%) diff --git a/check_federation.py b/check_federation.py index 6378613..73626ac 100644 --- a/check_federation.py +++ b/check_federation.py @@ -65,7 +65,7 @@ async def test_one_direction(sender_client, receiver_client, receiver_user_id): if isinstance(resp, JoinResponse): break elif isinstance(resp, JoinError): - return f'UNKNOWN: failed to join room "{resp}"', nagios.UNKNOWN + return f'UNKNOWN: failed to join room "{vars(resp)}"', nagios.UNKNOWN if (datetime.now() - timeout_start).total_seconds() >= args.timeout: return 'UNKNOWN: failed to join room, timeout.', nagios.UNKNOWN diff --git a/check_media_cdn.py b/check_media_cdn.py index fe866a4..2d16594 100644 --- a/check_media_cdn.py +++ b/check_media_cdn.py @@ -22,32 +22,49 @@ parser.add_argument('--pw', required=True, help='Password for the bot.') parser.add_argument('--hs', required=True, help='Homeserver of the bot.') parser.add_argument('--admin-endpoint', required=True, help='Admin endpoint that will be called to purge media for this user.') parser.add_argument('--room', required=True, help='The room the bot should send its test messages in.') -parser.add_argument('--media-cdn-domain', required=True, help='The domain to make sure it redirects to.') +parser.add_argument('--check-domain', required=True, help='The domain that should be present.') +parser.add_argument('--media-cdn-redirect', default='true', help='If set, the server must respond with a redirect to the media CDN domain.') +parser.add_argument('--required-headers', nargs='*', help="If these headers aren't set to the correct value, critical. Use the format 'key=value") parser.add_argument('--auth-file', help="File to cache the bot's login details to.") parser.add_argument('--timeout', type=float, default=90, help='Request timeout limit.') parser.add_argument('--warn', type=float, default=2.0, help='Manually set warn level.') parser.add_argument('--crit', type=float, default=2.5, help='Manually set critical level.') args = parser.parse_args() +if args.media_cdn_redirect == 'true': + args.media_cdn_redirect = True +elif args.media_cdn_redirect == 'false': + args.media_cdn_redirect = False +else: + print('UNKNOWN: could not parse the value for --media-cdn-redirect') + sys.exit(nagios.UNKNOWN) + def verify_media_header(header: str, header_dict: dict, good_value: str = None, warn_value: str = None, critical_value: str = None): """ If you don't specify good_value, warn_value, or critical_value then the header will only be checked for existience. """ - # Convert everything to strings to prevent any wierdness + + # Convert everything to lowercase strings to prevent any wierdness + header_dict = {k.lower(): v for k, v in header_dict.items()} + header = header.lower() header_value = str(header_dict.get(header)) - good_value = str(good_value) warn_value = str(warn_value) critical_value = str(critical_value) if not header_value: return f'CRITICAL: missing header "{header}"', nagios.CRITICAL - elif good_value and header_value == good_value: - return f'OK: {header}: "{header_value}"', nagios.OK - elif warn_value and header_value == warn_value: - return f'WARN: {header}: "{header_value}"', nagios.WARNING - elif critical_value and header_value == critical_value: - return f'CRITICAL: {header}: "{header_value}"', nagios.CRITICAL - return f'OK: {header} is present with value "{header_value}"', nagios.OK + + if good_value: + good_value = str(good_value) + if header_value == good_value: + return f'OK: {header}: "{header_value}"', nagios.OK + else: + return f'CRITICAL: {header} is not "{good_value}", is "{header_value}"', nagios.CRITICAL + # elif warn_value and header_value == warn_value: + # return f'WARN: {header}: "{header_value}"', nagios.WARNING + # elif critical_value and header_value == critical_value: + # return f'CRITICAL: {header}: "{header_value}"', nagios.CRITICAL + return f'OK: {header} is present', nagios.OK # with value "{header_value}"' async def main() -> None: @@ -119,32 +136,62 @@ async def main() -> None: # Check the headers. Ignore the non-async thing here, it doesn't # matter in this situation. - headers = dict(requests.head(target_file_url).headers) + r = requests.head(target_file_url, allow_redirects=False) + + if r.status_code != 200 and not args.media_cdn_redirect: + await cleanup(client, test_image_path, image_event_id=image_event_id) + print(f'CRITICAL: status code was "{r.status_code}"') + sys.exit(nagios.CRITICAL) + else: + print(f'OK: status code was "{r.status_code}"') + + headers = dict(r.headers) exit_code = nagios.OK # Check domain - domain = urllib.parse.urlparse(headers['location']).netloc - if domain != args.media_cdn_domain: - exit_code = nagios.CRITICAL - print(f'CRITICAL: media CDN domain is "{domain}"') - else: - print(f'OK: media CDN domain is "{domain}"') + if args.media_cdn_redirect: + if 'location' in headers: + domain = urllib.parse.urlparse(headers['location']).netloc + if domain != args.check_domain: + exit_code = nagios.CRITICAL + print(f'CRITICAL: redirect to media CDN domain is "{domain}"') + else: + print(f'OK: media CDN domain is "{domain}"') + else: + exit_code = nagios.CRITICAL + print(f'CRITICAL: was not redirected to the media CDN domain.') - results = [verify_media_header('synapse-media-local-status', headers), verify_media_header('synapse-media-s3-status', headers, good_value='200'), verify_media_header('synapse-media-server', headers, good_value='s3'), - verify_media_header('Server', headers, good_value='cloudflare')] + # Make sure we aren't redirected if we're a Synapse server + test = requests.head(target_file_url, headers={'User-Agent': 'Synapse/1.77.3'}, allow_redirects=False) + if test.status_code != 200: + print('CRITICAL: Synapse user-agent was redirected with status code', test.status_code) + exit_code = nagios.CRITICAL + else: + print(f'OK: Synapse user-agent is not redirected.') + else: + if 'location' in headers: + exit_code = nagios.CRITICAL + print(f"CRITICAL: recieved 301 to {urllib.parse.urlparse(headers['location']).netloc}") + else: + print(f'OK: was not redirected.') + + if args.required_headers: + # Icinga may pass the values as one string + if len(args.required_headers) == 1: + args.required_headers = args.required_headers[0].split(' ') + for item in args.required_headers: + key, value = item.split('=') + header_chk, code = verify_media_header(key, headers, good_value=value) + print(header_chk) + if code > exit_code: + exit_code = code + + results = [verify_media_header('synapse-media-local-status', headers), verify_media_header('synapse-media-s3-status', headers, good_value='200'), verify_media_header('synapse-media-server', headers, good_value='s3')] for header_chk, code in results: - if code != nagios.OK: - exit_code = code print(header_chk) - - # Make sure we aren't redirected if we're a Synapse server - test = requests.head(target_file_url, headers={'User-Agent': 'Synapse/1.77.3'}, allow_redirects=False) - if test.status_code != 200: - print('CRITICAL: Synapse user-agent redirected with status code', test.status_code) - exit_code = nagios.CRITICAL - else: - print(f'OK: Synapse user-agent not redirected.') + if code > exit_code: + exit_code = code await cleanup(client, test_image_path, image_event_id=image_event_id) sys.exit(exit_code) diff --git a/check_monitor_bot.py b/check_monitor_bot.py index f2071fb..95a3e17 100644 --- a/check_monitor_bot.py +++ b/check_monitor_bot.py @@ -1,9 +1,7 @@ #!/usr/bin/env python3 import argparse -import json import sys -import numpy as np import requests from checker import nagios @@ -11,129 +9,113 @@ from checker import nagios parser = argparse.ArgumentParser(description='') parser.add_argument('--metrics-endpoint', required=True, help='Target URL to scrape.') parser.add_argument('--domain', required=True, help='Our domain.') -parser.add_argument('--prometheus', action='store_true', help='Use Promethus instead of scraping the status page.') parser.add_argument('--ignore', nargs='*', default=[], help='Ignore these hosts.') parser.add_argument('--timeout', type=float, default=90, help='Request timeout limit.') -parser.add_argument('--warn', type=float, default=20, help='Manually set warn level.') -parser.add_argument('--crit', type=float, default=30, help='Manually set critical level.') +parser.add_argument('--warn', type=float, default=20, help='Manually set warn level for response time in seconds.') +parser.add_argument('--crit', type=float, default=30, help='Manually set critical levelfor response time in seconds.') +parser.add_argument('--warn-percent', type=int, default=30, help='Manually set warn level for the percentage of hosts that must fail the checks.') +parser.add_argument('--crit-percent', type=int, default=50, help='Manually set crit level for the percentage of hosts that must fail the checks.') args = parser.parse_args() +def make_percent(num: float): + return int(num * 100) + + def main(): - if args.prometheus: - from checker.prometheus import parse_metrics + from bs4 import BeautifulSoup + import re - r = requests.get(args.metrics_endpoint) - if r.status_code != 200: - sys.exit(nagios.UNKNOWN) + # Split the values since icinga will quote the args + if len(args.ignore) == 1: + args.ignore = args.ignore[0].strip(' ').split(' ') - metrics = {} - for item in parse_metrics(r.text)['monbot_ping_receive_delay_seconds']['monbot_ping_receive_delay_seconds_sum']: - if item.labels['receivingDomain'] not in metrics.keys(): - metrics[item.labels['receivingDomain']] = {} - metrics[item.labels['receivingDomain']][item.labels['sourceDomain']] = item.value + def get_sec(time_str): + """Get seconds from time.""" + h, m, s = time_str.split(':') + return int(h) * 3600 + int(m) * 60 + int(s) - pings = {'receiver': [], 'sender': [], } - for receiving_domain, senders in metrics.items(): - if receiving_domain == args.domain: - for k, v in senders.items(): - pings['receiver'].append(v) + def ms_to_s(s): + min_m = re.match(r'^(\d+)m([\d.]+)s', s) + if min_m: + return get_sec(f'0:{min_m.group(1)}:{int(float(min_m.group(2)))}') + elif s.endswith('ms'): + return float('0.' + s.strip('ms')) + elif s.endswith('s'): + return float(s.strip('ms')) + + r = requests.get(args.metrics_endpoint) + if r.status_code != 200: + sys.exit(nagios.UNKNOWN) + soup = BeautifulSoup(r.text, 'html.parser') + tooltips = soup.find_all('span', {'class', 'tooltip'}) + data = {} + for item in tooltips: + m = re.match(r'\s*Send: (.*?)\s*\s*Receive: (.*?)\s*<\/span>', str(item)) + if m: + domain = item.parent.parent.find('span', {'class': 'domain'}).text + data[domain] = { + 'send': ms_to_s(m.group(1)), + 'receive': ms_to_s(m.group(2)), + } + exit_code = nagios.OK + info_str = [] + data_str = [] + warn_failed_hosts = [] + crit_failed_hosts = [] + + if len(data.keys()) == 0: + print('UNKNOWN: failed to find any servers.') + sys.exit(nagios.UNKNOWN) + + for domain, values in data.items(): + if domain not in args.ignore: + if 'send' in values.keys(): + if values['send'] >= args.crit: + info_str.append(f'CRITICAL: {domain} send is {values["send"]}s.') + crit_failed_hosts.append(domain) + elif values['send'] >= args.warn: + info_str.append(f'WARN: {domain} send is {values["send"]}s.') + warn_failed_hosts.append(domain) else: - for k, v in senders.items(): - if k == args.domain: - pings['sender'].append(v) + info_str.append(f'UNKNOWN: {domain} send is empty.') - print(json.dumps(pings)) + if 'receive' in values.keys(): + if values['receive'] >= args.crit: + info_str.append(f'CRITICAL: {domain} receive is {values["receive"]}s.') + crit_failed_hosts.append(domain) + elif values['receive'] >= args.warn: + info_str.append(f'WARN: {domain} receive is {values["receive"]}s.') + warn_failed_hosts.append(domain) + else: + info_str.append(f'UNKNOWN: {domain} receive is empty.') - receiver_avg = np.round(np.average(pings['receiver']), 2) - sender_avg = np.round(np.average(pings['sender']), 2) + if 'send' in values.keys() and 'receive' in values.keys(): + data_str.append( + f"'{domain}-send'={values['send']}s;;; '{domain}-receive'={values['receive']}s;;;" + ) - print('receiver latency is', receiver_avg) - print('sender latency is', sender_avg) + if not len(crit_failed_hosts) and not len(warn_failed_hosts): + print(f'OK: ping time is good.', end=' ') else: - from bs4 import BeautifulSoup - import re + if len(crit_failed_hosts) / len(data.keys()) >= (args.crit_percent / 100): + # CRIT takes precedence + exit_code = nagios.CRITICAL + print(f'CRITICAL: {make_percent(len(crit_failed_hosts) / len(data.keys()))}% of hosts are marked as critical.') + elif len(warn_failed_hosts) / len(data.keys()) >= (args.warn_percent / 100): + exit_code = nagios.WARNING + print(f'WARN: {make_percent(len(warn_failed_hosts) / len(data.keys()))}% of hosts are marked as warn.') - # Split the values since icinga will quote the args - if len(args.ignore) == 1: - args.ignore = args.ignore[0].strip(' ').split(' ') - - def get_sec(time_str): - """Get seconds from time.""" - h, m, s = time_str.split(':') - return int(h) * 3600 + int(m) * 60 + int(s) - - def ms_to_s(s): - min_m = re.match(r'^(\d+)m([\d.]+)s', s) - if min_m: - return get_sec(f'0:{min_m.group(1)}:{int(float(min_m.group(2)))}') - elif s.endswith('ms'): - return float('0.' + s.strip('ms')) - elif s.endswith('s'): - return float(s.strip('ms')) - - r = requests.get(args.metrics_endpoint) - if r.status_code != 200: - sys.exit(nagios.UNKNOWN) - soup = BeautifulSoup(r.text, 'html.parser') - tooltips = soup.find_all('span', {'class', 'tooltip'}) - data = {} - for item in tooltips: - m = re.match(r'\s*Send: (.*?)\s*\s*Receive: (.*?)\s*<\/span>', str(item)) - if m: - domain = item.parent.parent.find('span', {'class': 'domain'}).text - data[domain] = { - 'send': ms_to_s(m.group(1)), - 'receive': ms_to_s(m.group(2)), - } - exit_code = nagios.OK - info_str = [] - data_str = [] - - if len(data.keys()) == 0: - print('UNKNOWN: failed to find any servers.') - sys.exit(nagios.UNKNOWN) - - for domain, values in data.items(): - if domain not in args.ignore: - if 'send' in values.keys(): - if values['send'] >= args.crit: - info_str.append(f'CRITICAL: {domain} send is {values["send"]}s.') - exit_code = nagios.CRITICAL - elif values['send'] >= args.warn: - info_str.append(f'WARN: {domain} send is {values["send"]}s.') - if exit_code < nagios.WARNING: - exit_code = nagios.WARNING - # else: - # print(f'OK: {domain} send is {values["send"]}s.') - else: - info_str.append(f'UNKNOWN: {domain} send is empty.') - - if 'receive' in values.keys(): - if values['receive'] >= args.crit: - info_str.append(f'CRITICAL: {domain} receive is {values["receive"]}s.') - exit_code = nagios.CRITICAL - elif values['receive'] >= args.warn: - info_str.append(f'WARN: {domain} receive is {values["receive"]}s.') - if exit_code < nagios.WARNING: - exit_code = nagios.WARNING - # else: - # print(f'OK: {domain} receive is {values["receive"]}s.') - else: - info_str.append(f'UNKNOWN: {domain} receive is empty.') - - if 'send' in values.keys() and 'receive' in values.keys(): - data_str.append( - f"'{domain}-send'={values['send']}s;;; '{domain}-receive'={values['receive']}s;;;" - ) - if any(('CRITICAL' not in s and 'WARNING' not in s) for s in info_str) or len(info_str) == 0: - print(f'OK: ping time is good.', end=' ') - else: + if exit_code != nagios.OK: for x in info_str: print(x, end=('\n' if info_str.index(x) + 1 < len(info_str) else '')) - print(f'|{" ".join(data_str)}') + else: + print('OK: ping is good') + print(f'Warn hosts: {", ".join(warn_failed_hosts) if len(warn_failed_hosts) else "none"}') + print(f'Critical hosts: {", ".join(crit_failed_hosts) if len(crit_failed_hosts) else "none"}') + print(f'|{" ".join(data_str)}') - sys.exit(exit_code) + sys.exit(exit_code) if __name__ == "__main__": diff --git a/checker/synapse_client.py b/checker/synapse_client.py index bd17e8b..e6193e3 100644 --- a/checker/synapse_client.py +++ b/checker/synapse_client.py @@ -86,7 +86,7 @@ async def send_image(client, room_id, image): resp, maybe_keys = await client.upload(f, content_type=mime_type, # image/jpeg filename=os.path.basename(image), filesize=file_stat.st_size, ) if not isinstance(resp, UploadResponse): - print(f'UNKNOWN: failed to upload image "{resp}"') + print(f'UNKNOWN: failed to upload image "{vars(resp)}"') sys.exit(nagios.UNKNOWN) content = {"body": os.path.basename(image), # descriptive title diff --git a/Other/icinga-to-kuma.py b/icinga2kuma.py similarity index 53% rename from Other/icinga-to-kuma.py rename to icinga2kuma.py index deb347c..b26c518 100644 --- a/Other/icinga-to-kuma.py +++ b/icinga2kuma.py @@ -1,19 +1,27 @@ -import argparse import json +import os +import sys from pathlib import Path +import urllib3 from flask import Flask, Response, request from icinga2api.client import Client from checker import nagios -parser = argparse.ArgumentParser(description='') -parser.add_argument('--endpoint', default='https://localhost:8080', help='Icinga2 URL for the API. Defaults to "https://localhost:8080"') -parser.add_argument('--user', default='icingaweb2', help='API username. Defaults to "icingaweb2"') -parser.add_argument('--pw', required=True, help='API password.') -args = parser.parse_args() +endpoint = 'https://localhost:8080' # Icinga2 URL for the API. Defaults to "https://localhost:8080" +icinga2_user = 'icingaweb2' # API username. Defaults to "icingaweb2" +icinga2_pw = '' # API password or set ICINGA2KUMA_ICINGA2_PW -client = Client(args.endpoint, args.user, args.pw) +if (icinga2_pw == '' or not icinga2_pw) and os.environ.get('ICINGA2KUMA_ICINGA2_PW'): + icinga2_pw = os.environ.get('ICINGA2KUMA_ICINGA2_PW') +else: + print('Must specify icinga2 API password.') + sys.exit(1) + +urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + +client = Client(endpoint, icinga2_user, icinga2_pw) app = Flask(__name__) @@ -24,6 +32,8 @@ app = Flask(__name__) def get_host_state(hostid=None): path = Path(request.base_url) args_service = request.args.getlist('service') + args_exclude_service = request.args.getlist('exclude') # do not list these services + args_ignore_service = request.args.getlist('ignore') # do not trigger a fail if these services fail kuma_mode = True if request.args.get('kuma') == 'true' else False if not hostid: @@ -32,7 +42,9 @@ def get_host_state(hostid=None): result = { 'host': {}, 'services': {}, - 'failed_services': [] + 'failed_services': [], + 'excluded_services': [], + 'ignored_services': [], } host_status = client.objects.list('Host', filters='match(hpattern, host.name)', filter_vars={'hpattern': hostid}) @@ -53,13 +65,16 @@ def get_host_state(hostid=None): services_status = client.objects.list('Service', filters='match(hpattern, host.name)', filter_vars={'hpattern': hostid}) for attrs in services_status: name = attrs['name'].split('!')[1] - result['services'][name] = { - 'state': 0 if (attrs['attrs']['acknowledgement'] or attrs['attrs']['acknowledgement_expiry']) else attrs['attrs']['state'], - 'actual_state': attrs['attrs']['state'], - 'attrs': { - **attrs + if name in args_exclude_service: + result['excluded_services'].append(name) + else: + result['services'][name] = { + 'state': 0 if (attrs['attrs']['acknowledgement'] or attrs['attrs']['acknowledgement_expiry']) else attrs['attrs']['state'], + 'actual_state': attrs['attrs']['state'], + 'attrs': { + **attrs + } } - } if len(args_service): services = {} @@ -70,14 +85,14 @@ def get_host_state(hostid=None): return Response(json.dumps({'error': 'service not found', 'service': service}), status=400, mimetype='application/json') result['services'] = services - if kuma_mode: - for name, service in result['services'].items(): - if service['state'] != nagios.OK: - result['failed_services'].append({'name': name, 'state': service['state']}) - if result['host']['state'] != nagios.OK: - result['failed_services'].append({'name': hostid, 'state': result['host']['state']}) + # if kuma_mode: + for name, service in result['services'].items(): + if service['state'] != nagios.OK and name not in args_ignore_service: + result['failed_services'].append({'name': name, 'state': service['state']}) + if result['host']['state'] != nagios.OK: + result['failed_services'].append({'name': hostid, 'state': result['host']['state']}) - if len(result['failed_services']): + if kuma_mode and len(result['failed_services']): return Response(json.dumps(result), status=410, mimetype='application/json') else: return result