fix media cdn check, fix monitor bot check, add ignore and exclude services to icinga2kuma.py

Cyberes 2023-04-21 23:54:17 -06:00
parent 62b7cd6594
commit df52844870
5 changed files with 203 additions and 159 deletions

View File

@@ -65,7 +65,7 @@ async def test_one_direction(sender_client, receiver_client, receiver_user_id):
         if isinstance(resp, JoinResponse):
             break
         elif isinstance(resp, JoinError):
-            return f'UNKNOWN: failed to join room "{resp}"', nagios.UNKNOWN
+            return f'UNKNOWN: failed to join room "{vars(resp)}"', nagios.UNKNOWN
         if (datetime.now() - timeout_start).total_seconds() >= args.timeout:
             return 'UNKNOWN: failed to join room, timeout.', nagios.UNKNOWN
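The switch from {resp} to {vars(resp)} makes the failure message show the response object's attributes instead of its default repr. A minimal illustration of the difference, using a hypothetical stand-in class rather than the real matrix-nio JoinError:

class FakeJoinError:
    def __init__(self, message, status_code):
        self.message = message
        self.status_code = status_code

err = FakeJoinError('M_FORBIDDEN', 403)
print(f'failed to join room "{err}"')        # failed to join room "<__main__.FakeJoinError object at 0x...>"
print(f'failed to join room "{vars(err)}"')  # failed to join room "{'message': 'M_FORBIDDEN', 'status_code': 403}"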

View File

@@ -22,32 +22,49 @@ parser.add_argument('--pw', required=True, help='Password for the bot.')
 parser.add_argument('--hs', required=True, help='Homeserver of the bot.')
 parser.add_argument('--admin-endpoint', required=True, help='Admin endpoint that will be called to purge media for this user.')
 parser.add_argument('--room', required=True, help='The room the bot should send its test messages in.')
-parser.add_argument('--media-cdn-domain', required=True, help='The domain to make sure it redirects to.')
+parser.add_argument('--check-domain', required=True, help='The domain that should be present.')
+parser.add_argument('--media-cdn-redirect', default='true', help='If set, the server must respond with a redirect to the media CDN domain.')
+parser.add_argument('--required-headers', nargs='*', help="If these headers aren't set to the correct value, critical. Use the format 'key=value'.")
 parser.add_argument('--auth-file', help="File to cache the bot's login details to.")
 parser.add_argument('--timeout', type=float, default=90, help='Request timeout limit.')
 parser.add_argument('--warn', type=float, default=2.0, help='Manually set warn level.')
 parser.add_argument('--crit', type=float, default=2.5, help='Manually set critical level.')
 args = parser.parse_args()

+if args.media_cdn_redirect == 'true':
+    args.media_cdn_redirect = True
+elif args.media_cdn_redirect == 'false':
+    args.media_cdn_redirect = False
+else:
+    print('UNKNOWN: could not parse the value for --media-cdn-redirect')
+    sys.exit(nagios.UNKNOWN)


 def verify_media_header(header: str, header_dict: dict, good_value: str = None, warn_value: str = None, critical_value: str = None):
     """
     If you don't specify good_value, warn_value, or critical_value then the header will only be checked for existence.
     """
-    # Convert everything to strings to prevent any weirdness
+    # Convert everything to lowercase strings to prevent any weirdness
+    header_dict = {k.lower(): v for k, v in header_dict.items()}
+    header = header.lower()
     header_value = str(header_dict.get(header))
-    good_value = str(good_value)
     warn_value = str(warn_value)
     critical_value = str(critical_value)

     if not header_value:
         return f'CRITICAL: missing header "{header}"', nagios.CRITICAL
-    elif good_value and header_value == good_value:
-        return f'OK: {header}: "{header_value}"', nagios.OK
-    elif warn_value and header_value == warn_value:
-        return f'WARN: {header}: "{header_value}"', nagios.WARNING
-    elif critical_value and header_value == critical_value:
-        return f'CRITICAL: {header}: "{header_value}"', nagios.CRITICAL
-    return f'OK: {header} is present with value "{header_value}"', nagios.OK
+    if good_value:
+        good_value = str(good_value)
+        if header_value == good_value:
+            return f'OK: {header}: "{header_value}"', nagios.OK
+        else:
+            return f'CRITICAL: {header} is not "{good_value}", is "{header_value}"', nagios.CRITICAL
+    # elif warn_value and header_value == warn_value:
+    #     return f'WARN: {header}: "{header_value}"', nagios.WARNING
+    # elif critical_value and header_value == critical_value:
+    #     return f'CRITICAL: {header}: "{header_value}"', nagios.CRITICAL
+    return f'OK: {header} is present', nagios.OK  # with value "{header_value}"'


 async def main() -> None:
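A quick sketch of how the reworked verify_media_header behaves, assuming the function and the nagios constants from this file are available; the header names mirror the ones checked later in this diff, and the values are illustrative only:

headers = {'Synapse-Media-S3-Status': '200', 'Synapse-Media-Server': 'b2'}
print(verify_media_header('synapse-media-s3-status', headers, good_value='200'))
# -> ('OK: synapse-media-s3-status: "200"', nagios.OK)
print(verify_media_header('synapse-media-server', headers, good_value='s3'))
# -> ('CRITICAL: synapse-media-server is not "s3", is "b2"', nagios.CRITICAL)
print(verify_media_header('synapse-media-local-status', headers))
# -> ('OK: synapse-media-local-status is present', nagios.OK)
# Note: a missing header falls through to the presence branch here because str(None) is the
# truthy string 'None', so the CRITICAL missing-header return does not fire with this code.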
@@ -119,32 +136,62 @@ async def main() -> None:
     # Check the headers. Ignore the non-async thing here, it doesn't
     # matter in this situation.
-    headers = dict(requests.head(target_file_url).headers)
-    exit_code = nagios.OK
-
-    # Check domain
-    domain = urllib.parse.urlparse(headers['location']).netloc
-    if domain != args.media_cdn_domain:
-        exit_code = nagios.CRITICAL
-        print(f'CRITICAL: media CDN domain is "{domain}"')
-    else:
-        print(f'OK: media CDN domain is "{domain}"')
-
-    results = [verify_media_header('synapse-media-local-status', headers), verify_media_header('synapse-media-s3-status', headers, good_value='200'), verify_media_header('synapse-media-server', headers, good_value='s3'),
-               verify_media_header('Server', headers, good_value='cloudflare')]
-    for header_chk, code in results:
-        if code != nagios.OK:
-            exit_code = code
-        print(header_chk)
-
-    # Make sure we aren't redirected if we're a Synapse server
-    test = requests.head(target_file_url, headers={'User-Agent': 'Synapse/1.77.3'}, allow_redirects=False)
-    if test.status_code != 200:
-        print('CRITICAL: Synapse user-agent redirected with status code', test.status_code)
-        exit_code = nagios.CRITICAL
-    else:
-        print(f'OK: Synapse user-agent not redirected.')
+    r = requests.head(target_file_url, allow_redirects=False)
+    if r.status_code != 200 and not args.media_cdn_redirect:
+        await cleanup(client, test_image_path, image_event_id=image_event_id)
+        print(f'CRITICAL: status code was "{r.status_code}"')
+        sys.exit(nagios.CRITICAL)
+    else:
+        print(f'OK: status code was "{r.status_code}"')
+    headers = dict(r.headers)
+    exit_code = nagios.OK
+
+    # Check domain
+    if args.media_cdn_redirect:
+        if 'location' in headers:
+            domain = urllib.parse.urlparse(headers['location']).netloc
+            if domain != args.check_domain:
+                exit_code = nagios.CRITICAL
+                print(f'CRITICAL: redirect to media CDN domain is "{domain}"')
+            else:
+                print(f'OK: media CDN domain is "{domain}"')
+        else:
+            exit_code = nagios.CRITICAL
+            print(f'CRITICAL: was not redirected to the media CDN domain.')
+
+        # Make sure we aren't redirected if we're a Synapse server
+        test = requests.head(target_file_url, headers={'User-Agent': 'Synapse/1.77.3'}, allow_redirects=False)
+        if test.status_code != 200:
+            print('CRITICAL: Synapse user-agent was redirected with status code', test.status_code)
+            exit_code = nagios.CRITICAL
+        else:
+            print(f'OK: Synapse user-agent is not redirected.')
+    else:
+        if 'location' in headers:
+            exit_code = nagios.CRITICAL
+            print(f"CRITICAL: received 301 to {urllib.parse.urlparse(headers['location']).netloc}")
+        else:
+            print(f'OK: was not redirected.')
+
+    if args.required_headers:
+        # Icinga may pass the values as one string
+        if len(args.required_headers) == 1:
+            args.required_headers = args.required_headers[0].split(' ')
+        for item in args.required_headers:
+            key, value = item.split('=')
+            header_chk, code = verify_media_header(key, headers, good_value=value)
+            print(header_chk)
+            if code > exit_code:
+                exit_code = code
+
+    results = [verify_media_header('synapse-media-local-status', headers), verify_media_header('synapse-media-s3-status', headers, good_value='200'), verify_media_header('synapse-media-server', headers, good_value='s3')]
+    for header_chk, code in results:
+        print(header_chk)
+        if code > exit_code:
+            exit_code = code

     await cleanup(client, test_image_path, image_event_id=image_event_id)
     sys.exit(exit_code)
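The new --required-headers handling above splits each entry on '='. A small sketch of how an Icinga-style single quoted argument would be interpreted; the header names and values are made up for illustration:

required_headers = ['server=cloudflare cache-control=public']  # Icinga passed everything as one quoted string
if len(required_headers) == 1:
    required_headers = required_headers[0].split(' ')
for item in required_headers:
    key, value = item.split('=')
    print(key, value)
# server cloudflare
# cache-control public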

View File

@@ -1,9 +1,7 @@
 #!/usr/bin/env python3
 import argparse
-import json
 import sys

-import numpy as np
 import requests

 from checker import nagios
@@ -11,129 +9,113 @@ from checker import nagios
 parser = argparse.ArgumentParser(description='')
 parser.add_argument('--metrics-endpoint', required=True, help='Target URL to scrape.')
 parser.add_argument('--domain', required=True, help='Our domain.')
-parser.add_argument('--prometheus', action='store_true', help='Use Promethus instead of scraping the status page.')
 parser.add_argument('--ignore', nargs='*', default=[], help='Ignore these hosts.')
 parser.add_argument('--timeout', type=float, default=90, help='Request timeout limit.')
-parser.add_argument('--warn', type=float, default=20, help='Manually set warn level.')
-parser.add_argument('--crit', type=float, default=30, help='Manually set critical level.')
+parser.add_argument('--warn', type=float, default=20, help='Manually set warn level for response time in seconds.')
+parser.add_argument('--crit', type=float, default=30, help='Manually set critical level for response time in seconds.')
+parser.add_argument('--warn-percent', type=int, default=30, help='Manually set warn level for the percentage of hosts that must fail the checks.')
+parser.add_argument('--crit-percent', type=int, default=50, help='Manually set crit level for the percentage of hosts that must fail the checks.')
 args = parser.parse_args()


+def make_percent(num: float):
+    return int(num * 100)
+
+
 def main():
-    if args.prometheus:
-        from checker.prometheus import parse_metrics
-        r = requests.get(args.metrics_endpoint)
-        if r.status_code != 200:
-            sys.exit(nagios.UNKNOWN)
-        metrics = {}
-        for item in parse_metrics(r.text)['monbot_ping_receive_delay_seconds']['monbot_ping_receive_delay_seconds_sum']:
-            if item.labels['receivingDomain'] not in metrics.keys():
-                metrics[item.labels['receivingDomain']] = {}
-            metrics[item.labels['receivingDomain']][item.labels['sourceDomain']] = item.value
-        pings = {'receiver': [], 'sender': [], }
-        for receiving_domain, senders in metrics.items():
-            if receiving_domain == args.domain:
-                for k, v in senders.items():
-                    pings['receiver'].append(v)
-            else:
-                for k, v in senders.items():
-                    if k == args.domain:
-                        pings['sender'].append(v)
-        print(json.dumps(pings))
-
-        receiver_avg = np.round(np.average(pings['receiver']), 2)
-        sender_avg = np.round(np.average(pings['sender']), 2)
-
-        print('receiver latency is', receiver_avg)
-        print('sender latency is', sender_avg)
-    else:
-        from bs4 import BeautifulSoup
-        import re
-
-        # Split the values since icinga will quote the args
-        if len(args.ignore) == 1:
-            args.ignore = args.ignore[0].strip(' ').split(' ')
-
-        def get_sec(time_str):
-            """Get seconds from time."""
-            h, m, s = time_str.split(':')
-            return int(h) * 3600 + int(m) * 60 + int(s)
-
-        def ms_to_s(s):
-            min_m = re.match(r'^(\d+)m([\d.]+)s', s)
-            if min_m:
-                return get_sec(f'0:{min_m.group(1)}:{int(float(min_m.group(2)))}')
-            elif s.endswith('ms'):
-                return float('0.' + s.strip('ms'))
-            elif s.endswith('s'):
-                return float(s.strip('ms'))
-
-        r = requests.get(args.metrics_endpoint)
-        if r.status_code != 200:
-            sys.exit(nagios.UNKNOWN)
-        soup = BeautifulSoup(r.text, 'html.parser')
-        tooltips = soup.find_all('span', {'class', 'tooltip'})
-        data = {}
-        for item in tooltips:
-            m = re.match(r'<span class="tooltip">\s*Send: (.*?)\s*<br\/>\s*Receive: (.*?)\s*<\/span>', str(item))
-            if m:
-                domain = item.parent.parent.find('span', {'class': 'domain'}).text
-                data[domain] = {
-                    'send': ms_to_s(m.group(1)),
-                    'receive': ms_to_s(m.group(2)),
-                }
-
-        exit_code = nagios.OK
-        info_str = []
-        data_str = []
-        if len(data.keys()) == 0:
-            print('UNKNOWN: failed to find any servers.')
-            sys.exit(nagios.UNKNOWN)
-        for domain, values in data.items():
-            if domain not in args.ignore:
-                if 'send' in values.keys():
-                    if values['send'] >= args.crit:
-                        info_str.append(f'CRITICAL: {domain} send is {values["send"]}s.')
-                        exit_code = nagios.CRITICAL
-                    elif values['send'] >= args.warn:
-                        info_str.append(f'WARN: {domain} send is {values["send"]}s.')
-                        if exit_code < nagios.WARNING:
-                            exit_code = nagios.WARNING
-                    # else:
-                    #     print(f'OK: {domain} send is {values["send"]}s.')
-                else:
-                    info_str.append(f'UNKNOWN: {domain} send is empty.')
-
-                if 'receive' in values.keys():
-                    if values['receive'] >= args.crit:
-                        info_str.append(f'CRITICAL: {domain} receive is {values["receive"]}s.')
-                        exit_code = nagios.CRITICAL
-                    elif values['receive'] >= args.warn:
-                        info_str.append(f'WARN: {domain} receive is {values["receive"]}s.')
-                        if exit_code < nagios.WARNING:
-                            exit_code = nagios.WARNING
-                    # else:
-                    #     print(f'OK: {domain} receive is {values["receive"]}s.')
-                else:
-                    info_str.append(f'UNKNOWN: {domain} receive is empty.')
-
-                if 'send' in values.keys() and 'receive' in values.keys():
-                    data_str.append(
-                        f"'{domain}-send'={values['send']}s;;; '{domain}-receive'={values['receive']}s;;;"
-                    )
-
-        if any(('CRITICAL' not in s and 'WARNING' not in s) for s in info_str) or len(info_str) == 0:
-            print(f'OK: ping time is good.', end=' ')
-        else:
-            for x in info_str:
-                print(x, end=('\n' if info_str.index(x) + 1 < len(info_str) else ''))
-        print(f'|{" ".join(data_str)}')
-        sys.exit(exit_code)
+    from bs4 import BeautifulSoup
+    import re
+
+    # Split the values since icinga will quote the args
+    if len(args.ignore) == 1:
+        args.ignore = args.ignore[0].strip(' ').split(' ')
+
+    def get_sec(time_str):
+        """Get seconds from time."""
+        h, m, s = time_str.split(':')
+        return int(h) * 3600 + int(m) * 60 + int(s)
+
+    def ms_to_s(s):
+        min_m = re.match(r'^(\d+)m([\d.]+)s', s)
+        if min_m:
+            return get_sec(f'0:{min_m.group(1)}:{int(float(min_m.group(2)))}')
+        elif s.endswith('ms'):
+            return float('0.' + s.strip('ms'))
+        elif s.endswith('s'):
+            return float(s.strip('ms'))
+
+    r = requests.get(args.metrics_endpoint)
+    if r.status_code != 200:
+        sys.exit(nagios.UNKNOWN)
+    soup = BeautifulSoup(r.text, 'html.parser')
+    tooltips = soup.find_all('span', {'class', 'tooltip'})
+    data = {}
+    for item in tooltips:
+        m = re.match(r'<span class="tooltip">\s*Send: (.*?)\s*<br\/>\s*Receive: (.*?)\s*<\/span>', str(item))
+        if m:
+            domain = item.parent.parent.find('span', {'class': 'domain'}).text
+            data[domain] = {
+                'send': ms_to_s(m.group(1)),
+                'receive': ms_to_s(m.group(2)),
+            }
+
+    exit_code = nagios.OK
+    info_str = []
+    data_str = []
+    warn_failed_hosts = []
+    crit_failed_hosts = []
+    if len(data.keys()) == 0:
+        print('UNKNOWN: failed to find any servers.')
+        sys.exit(nagios.UNKNOWN)
+    for domain, values in data.items():
+        if domain not in args.ignore:
+            if 'send' in values.keys():
+                if values['send'] >= args.crit:
+                    info_str.append(f'CRITICAL: {domain} send is {values["send"]}s.')
+                    crit_failed_hosts.append(domain)
+                elif values['send'] >= args.warn:
+                    info_str.append(f'WARN: {domain} send is {values["send"]}s.')
+                    warn_failed_hosts.append(domain)
+            else:
+                info_str.append(f'UNKNOWN: {domain} send is empty.')
+
+            if 'receive' in values.keys():
+                if values['receive'] >= args.crit:
+                    info_str.append(f'CRITICAL: {domain} receive is {values["receive"]}s.')
+                    crit_failed_hosts.append(domain)
+                elif values['receive'] >= args.warn:
+                    info_str.append(f'WARN: {domain} receive is {values["receive"]}s.')
+                    warn_failed_hosts.append(domain)
+            else:
+                info_str.append(f'UNKNOWN: {domain} receive is empty.')
+
+            if 'send' in values.keys() and 'receive' in values.keys():
+                data_str.append(
+                    f"'{domain}-send'={values['send']}s;;; '{domain}-receive'={values['receive']}s;;;"
+                )
+
+    if not len(crit_failed_hosts) and not len(warn_failed_hosts):
+        print(f'OK: ping time is good.', end=' ')
+    else:
+        if len(crit_failed_hosts) / len(data.keys()) >= (args.crit_percent / 100):
+            # CRIT takes precedence
+            exit_code = nagios.CRITICAL
+            print(f'CRITICAL: {make_percent(len(crit_failed_hosts) / len(data.keys()))}% of hosts are marked as critical.')
+        elif len(warn_failed_hosts) / len(data.keys()) >= (args.warn_percent / 100):
+            exit_code = nagios.WARNING
+            print(f'WARN: {make_percent(len(warn_failed_hosts) / len(data.keys()))}% of hosts are marked as warn.')
+
+    if exit_code != nagios.OK:
+        for x in info_str:
+            print(x, end=('\n' if info_str.index(x) + 1 < len(info_str) else ''))
+    else:
+        print('OK: ping is good')
+
+    print(f'Warn hosts: {", ".join(warn_failed_hosts) if len(warn_failed_hosts) else "none"}')
+    print(f'Critical hosts: {", ".join(crit_failed_hosts) if len(crit_failed_hosts) else "none"}')
+    print(f'|{" ".join(data_str)}')
+    sys.exit(exit_code)


 if __name__ == "__main__":
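For reference, a short sketch of what the get_sec / ms_to_s helpers in the new main() return for typical status-page latency strings; the inputs are examples, not values from a real status page:

print(ms_to_s('250ms'))   # 0.25   ('250ms'.strip('ms') -> '250', prefixed with '0.')
print(ms_to_s('1.2s'))    # 1.2
print(ms_to_s('1m2.5s'))  # 62     (regex match, then get_sec('0:1:2'))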

View File

@@ -86,7 +86,7 @@ async def send_image(client, room_id, image):
     resp, maybe_keys = await client.upload(f, content_type=mime_type,  # image/jpeg
                                            filename=os.path.basename(image), filesize=file_stat.st_size, )
     if not isinstance(resp, UploadResponse):
-        print(f'UNKNOWN: failed to upload image "{resp}"')
+        print(f'UNKNOWN: failed to upload image "{vars(resp)}"')
         sys.exit(nagios.UNKNOWN)
     content = {"body": os.path.basename(image),  # descriptive title

View File

@@ -1,19 +1,27 @@
-import argparse
 import json
+import os
+import sys
 from pathlib import Path

+import urllib3
 from flask import Flask, Response, request
 from icinga2api.client import Client

 from checker import nagios

-parser = argparse.ArgumentParser(description='')
-parser.add_argument('--endpoint', default='https://localhost:8080', help='Icinga2 URL for the API. Defaults to "https://localhost:8080"')
-parser.add_argument('--user', default='icingaweb2', help='API username. Defaults to "icingaweb2"')
-parser.add_argument('--pw', required=True, help='API password.')
-args = parser.parse_args()
+endpoint = 'https://localhost:8080'  # Icinga2 URL for the API. Defaults to "https://localhost:8080"
+icinga2_user = 'icingaweb2'  # API username. Defaults to "icingaweb2"
+icinga2_pw = ''  # API password, or set ICINGA2KUMA_ICINGA2_PW

-client = Client(args.endpoint, args.user, args.pw)
+if not icinga2_pw and os.environ.get('ICINGA2KUMA_ICINGA2_PW'):
+    icinga2_pw = os.environ.get('ICINGA2KUMA_ICINGA2_PW')
+elif not icinga2_pw:
+    print('Must specify icinga2 API password.')
+    sys.exit(1)
+
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+
+client = Client(endpoint, icinga2_user, icinga2_pw)

 app = Flask(__name__)
@@ -24,6 +32,8 @@ app = Flask(__name__)
 def get_host_state(hostid=None):
     path = Path(request.base_url)
     args_service = request.args.getlist('service')
+    args_exclude_service = request.args.getlist('exclude')  # do not list these services
+    args_ignore_service = request.args.getlist('ignore')  # do not trigger a fail if these services fail
     kuma_mode = True if request.args.get('kuma') == 'true' else False

     if not hostid:
@@ -32,7 +42,9 @@ def get_host_state(hostid=None):
     result = {
         'host': {},
         'services': {},
-        'failed_services': []
+        'failed_services': [],
+        'excluded_services': [],
+        'ignored_services': [],
     }

     host_status = client.objects.list('Host', filters='match(hpattern, host.name)', filter_vars={'hpattern': hostid})
@@ -53,13 +65,16 @@
     services_status = client.objects.list('Service', filters='match(hpattern, host.name)', filter_vars={'hpattern': hostid})
     for attrs in services_status:
         name = attrs['name'].split('!')[1]
-        result['services'][name] = {
-            'state': 0 if (attrs['attrs']['acknowledgement'] or attrs['attrs']['acknowledgement_expiry']) else attrs['attrs']['state'],
-            'actual_state': attrs['attrs']['state'],
-            'attrs': {
-                **attrs
-            }
-        }
+        if name in args_exclude_service:
+            result['excluded_services'].append(name)
+        else:
+            result['services'][name] = {
+                'state': 0 if (attrs['attrs']['acknowledgement'] or attrs['attrs']['acknowledgement_expiry']) else attrs['attrs']['state'],
+                'actual_state': attrs['attrs']['state'],
+                'attrs': {
+                    **attrs
+                }
+            }

     if len(args_service):
         services = {}
@@ -70,14 +85,14 @@ def get_host_state(hostid=None):
             return Response(json.dumps({'error': 'service not found', 'service': service}), status=400, mimetype='application/json')
         result['services'] = services

-    if kuma_mode:
-        for name, service in result['services'].items():
-            if service['state'] != nagios.OK:
-                result['failed_services'].append({'name': name, 'state': service['state']})
-        if result['host']['state'] != nagios.OK:
-            result['failed_services'].append({'name': hostid, 'state': result['host']['state']})
+    # if kuma_mode:
+    for name, service in result['services'].items():
+        if service['state'] != nagios.OK and name not in args_ignore_service:
+            result['failed_services'].append({'name': name, 'state': service['state']})
+    if result['host']['state'] != nagios.OK:
+        result['failed_services'].append({'name': hostid, 'state': result['host']['state']})

-    if len(result['failed_services']):
+    if kuma_mode and len(result['failed_services']):
         return Response(json.dumps(result), status=410, mimetype='application/json')
     else:
         return result
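A rough sketch of how the new exclude and ignore query parameters might be exercised from a monitor. Only the parameter names ('kuma', 'exclude', 'ignore') come from this commit; the URL, port, and service names are assumptions since the Flask route is not shown in the diff:

import requests

# Hypothetical URL; the handler is get_host_state(hostid) behind some @app.route.
r = requests.get(
    'http://localhost:5000/host/example-host',
    params={'kuma': 'true', 'exclude': ['example-backup-check'], 'ignore': ['example-apt-check']},
)
# With kuma=true the endpoint returns HTTP 410 when any non-ignored service (or the host itself)
# is failing, which Uptime Kuma can treat as "down"; otherwise it returns the JSON status document.
print(r.status_code, r.json().get('failed_services'))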