fix media cdn check, fix monitor bot check, add ignore and exclude services to icinga2kuma.py

This commit is contained in:
Cyberes 2023-04-21 23:54:17 -06:00
parent 62b7cd6594
commit df52844870
5 changed files with 203 additions and 159 deletions

View File

@ -65,7 +65,7 @@ async def test_one_direction(sender_client, receiver_client, receiver_user_id):
if isinstance(resp, JoinResponse): if isinstance(resp, JoinResponse):
break break
elif isinstance(resp, JoinError): elif isinstance(resp, JoinError):
return f'UNKNOWN: failed to join room "{resp}"', nagios.UNKNOWN return f'UNKNOWN: failed to join room "{vars(resp)}"', nagios.UNKNOWN
if (datetime.now() - timeout_start).total_seconds() >= args.timeout: if (datetime.now() - timeout_start).total_seconds() >= args.timeout:
return 'UNKNOWN: failed to join room, timeout.', nagios.UNKNOWN return 'UNKNOWN: failed to join room, timeout.', nagios.UNKNOWN

View File

@ -22,32 +22,49 @@ parser.add_argument('--pw', required=True, help='Password for the bot.')
parser.add_argument('--hs', required=True, help='Homeserver of the bot.') parser.add_argument('--hs', required=True, help='Homeserver of the bot.')
parser.add_argument('--admin-endpoint', required=True, help='Admin endpoint that will be called to purge media for this user.') parser.add_argument('--admin-endpoint', required=True, help='Admin endpoint that will be called to purge media for this user.')
parser.add_argument('--room', required=True, help='The room the bot should send its test messages in.') parser.add_argument('--room', required=True, help='The room the bot should send its test messages in.')
parser.add_argument('--media-cdn-domain', required=True, help='The domain to make sure it redirects to.') parser.add_argument('--check-domain', required=True, help='The domain that should be present.')
parser.add_argument('--media-cdn-redirect', default='true', help='If set, the server must respond with a redirect to the media CDN domain.')
parser.add_argument('--required-headers', nargs='*', help="If these headers aren't set to the correct value, critical. Use the format 'key=value")
parser.add_argument('--auth-file', help="File to cache the bot's login details to.") parser.add_argument('--auth-file', help="File to cache the bot's login details to.")
parser.add_argument('--timeout', type=float, default=90, help='Request timeout limit.') parser.add_argument('--timeout', type=float, default=90, help='Request timeout limit.')
parser.add_argument('--warn', type=float, default=2.0, help='Manually set warn level.') parser.add_argument('--warn', type=float, default=2.0, help='Manually set warn level.')
parser.add_argument('--crit', type=float, default=2.5, help='Manually set critical level.') parser.add_argument('--crit', type=float, default=2.5, help='Manually set critical level.')
args = parser.parse_args() args = parser.parse_args()
if args.media_cdn_redirect == 'true':
args.media_cdn_redirect = True
elif args.media_cdn_redirect == 'false':
args.media_cdn_redirect = False
else:
print('UNKNOWN: could not parse the value for --media-cdn-redirect')
sys.exit(nagios.UNKNOWN)
def verify_media_header(header: str, header_dict: dict, good_value: str = None, warn_value: str = None, critical_value: str = None):
    """
    Check a single response header and return a ``(message, nagios_code)`` tuple.

    Header names are compared case-insensitively: both ``header`` and the keys
    of ``header_dict`` are lowercased before the lookup.

    :param header: Name of the header to check.
    :param header_dict: Mapping of header name -> header value.
    :param good_value: If given (truthy), the header value must equal
        ``str(good_value)`` or the result is CRITICAL. If omitted, the header
        is only checked for existence.
    :param warn_value: Accepted for interface compatibility; not evaluated.
    :param critical_value: Accepted for interface compatibility; not evaluated.
    :return: ``(message, code)`` where ``code`` is ``nagios.OK`` or
        ``nagios.CRITICAL``.
    """
    header_dict = {k.lower(): v for k, v in header_dict.items()}
    header = header.lower()

    # Test the raw value *before* stringifying it: str(None) is the truthy
    # string 'None', which would make a missing header look present.
    raw_value = header_dict.get(header)
    header_value = '' if raw_value is None else str(raw_value)
    if not header_value:
        return f'CRITICAL: missing header "{header}"', nagios.CRITICAL

    if good_value:
        good_value = str(good_value)
        if header_value == good_value:
            return f'OK: {header}: "{header_value}"', nagios.OK
        return f'CRITICAL: {header} is not "{good_value}", is "{header_value}"', nagios.CRITICAL

    # No expected value supplied: existence alone is enough.
    return f'OK: {header} is present', nagios.OK
async def main() -> None: async def main() -> None:
@ -119,32 +136,62 @@ async def main() -> None:
# Check the headers. Ignore the non-async thing here, it doesn't # Check the headers. Ignore the non-async thing here, it doesn't
# matter in this situation. # matter in this situation.
headers = dict(requests.head(target_file_url).headers) r = requests.head(target_file_url, allow_redirects=False)
if r.status_code != 200 and not args.media_cdn_redirect:
await cleanup(client, test_image_path, image_event_id=image_event_id)
print(f'CRITICAL: status code was "{r.status_code}"')
sys.exit(nagios.CRITICAL)
else:
print(f'OK: status code was "{r.status_code}"')
headers = dict(r.headers)
exit_code = nagios.OK exit_code = nagios.OK
# Check domain # Check domain
if args.media_cdn_redirect:
if 'location' in headers:
domain = urllib.parse.urlparse(headers['location']).netloc domain = urllib.parse.urlparse(headers['location']).netloc
if domain != args.media_cdn_domain: if domain != args.check_domain:
exit_code = nagios.CRITICAL exit_code = nagios.CRITICAL
print(f'CRITICAL: media CDN domain is "{domain}"') print(f'CRITICAL: redirect to media CDN domain is "{domain}"')
else: else:
print(f'OK: media CDN domain is "{domain}"') print(f'OK: media CDN domain is "{domain}"')
else:
results = [verify_media_header('synapse-media-local-status', headers), verify_media_header('synapse-media-s3-status', headers, good_value='200'), verify_media_header('synapse-media-server', headers, good_value='s3'), exit_code = nagios.CRITICAL
verify_media_header('Server', headers, good_value='cloudflare')] print(f'CRITICAL: was not redirected to the media CDN domain.')
for header_chk, code in results:
if code != nagios.OK:
exit_code = code
print(header_chk)
# Make sure we aren't redirected if we're a Synapse server # Make sure we aren't redirected if we're a Synapse server
test = requests.head(target_file_url, headers={'User-Agent': 'Synapse/1.77.3'}, allow_redirects=False) test = requests.head(target_file_url, headers={'User-Agent': 'Synapse/1.77.3'}, allow_redirects=False)
if test.status_code != 200: if test.status_code != 200:
print('CRITICAL: Synapse user-agent redirected with status code', test.status_code) print('CRITICAL: Synapse user-agent was redirected with status code', test.status_code)
exit_code = nagios.CRITICAL exit_code = nagios.CRITICAL
else: else:
print(f'OK: Synapse user-agent not redirected.') print(f'OK: Synapse user-agent is not redirected.')
else:
if 'location' in headers:
exit_code = nagios.CRITICAL
print(f"CRITICAL: recieved 301 to {urllib.parse.urlparse(headers['location']).netloc}")
else:
print(f'OK: was not redirected.')
if args.required_headers:
# Icinga may pass the values as one string
if len(args.required_headers) == 1:
args.required_headers = args.required_headers[0].split(' ')
for item in args.required_headers:
key, value = item.split('=')
header_chk, code = verify_media_header(key, headers, good_value=value)
print(header_chk)
if code > exit_code:
exit_code = code
results = [verify_media_header('synapse-media-local-status', headers), verify_media_header('synapse-media-s3-status', headers, good_value='200'), verify_media_header('synapse-media-server', headers, good_value='s3')]
for header_chk, code in results:
print(header_chk)
if code > exit_code:
exit_code = code
await cleanup(client, test_image_path, image_event_id=image_event_id) await cleanup(client, test_image_path, image_event_id=image_event_id)
sys.exit(exit_code) sys.exit(exit_code)

View File

@ -1,9 +1,7 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import argparse import argparse
import json
import sys import sys
import numpy as np
import requests import requests
from checker import nagios from checker import nagios
@ -11,46 +9,20 @@ from checker import nagios
parser = argparse.ArgumentParser(description='') parser = argparse.ArgumentParser(description='')
parser.add_argument('--metrics-endpoint', required=True, help='Target URL to scrape.') parser.add_argument('--metrics-endpoint', required=True, help='Target URL to scrape.')
parser.add_argument('--domain', required=True, help='Our domain.') parser.add_argument('--domain', required=True, help='Our domain.')
parser.add_argument('--prometheus', action='store_true', help='Use Promethus instead of scraping the status page.')
parser.add_argument('--ignore', nargs='*', default=[], help='Ignore these hosts.') parser.add_argument('--ignore', nargs='*', default=[], help='Ignore these hosts.')
parser.add_argument('--timeout', type=float, default=90, help='Request timeout limit.') parser.add_argument('--timeout', type=float, default=90, help='Request timeout limit.')
parser.add_argument('--warn', type=float, default=20, help='Manually set warn level.') parser.add_argument('--warn', type=float, default=20, help='Manually set warn level for response time in seconds.')
parser.add_argument('--crit', type=float, default=30, help='Manually set critical level.') parser.add_argument('--crit', type=float, default=30, help='Manually set critical levelfor response time in seconds.')
parser.add_argument('--warn-percent', type=int, default=30, help='Manually set warn level for the percentage of hosts that must fail the checks.')
parser.add_argument('--crit-percent', type=int, default=50, help='Manually set crit level for the percentage of hosts that must fail the checks.')
args = parser.parse_args() args = parser.parse_args()
def make_percent(num: float):
    """
    Convert a fraction (e.g. ``0.57``) into a whole-number percentage.

    The product is rounded to the nearest integer instead of truncated, so
    binary floating-point error cannot under-report the value
    (``0.57 * 100`` evaluates to ``56.99999999999999``).

    :param num: Fraction to convert; ``1`` corresponds to 100%.
    :return: The percentage as an ``int``.
    """
    return round(num * 100)
def main(): def main():
if args.prometheus:
from checker.prometheus import parse_metrics
r = requests.get(args.metrics_endpoint)
if r.status_code != 200:
sys.exit(nagios.UNKNOWN)
metrics = {}
for item in parse_metrics(r.text)['monbot_ping_receive_delay_seconds']['monbot_ping_receive_delay_seconds_sum']:
if item.labels['receivingDomain'] not in metrics.keys():
metrics[item.labels['receivingDomain']] = {}
metrics[item.labels['receivingDomain']][item.labels['sourceDomain']] = item.value
pings = {'receiver': [], 'sender': [], }
for receiving_domain, senders in metrics.items():
if receiving_domain == args.domain:
for k, v in senders.items():
pings['receiver'].append(v)
else:
for k, v in senders.items():
if k == args.domain:
pings['sender'].append(v)
print(json.dumps(pings))
receiver_avg = np.round(np.average(pings['receiver']), 2)
sender_avg = np.round(np.average(pings['sender']), 2)
print('receiver latency is', receiver_avg)
print('sender latency is', sender_avg)
else:
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import re import re
@ -89,6 +61,8 @@ def main():
exit_code = nagios.OK exit_code = nagios.OK
info_str = [] info_str = []
data_str = [] data_str = []
warn_failed_hosts = []
crit_failed_hosts = []
if len(data.keys()) == 0: if len(data.keys()) == 0:
print('UNKNOWN: failed to find any servers.') print('UNKNOWN: failed to find any servers.')
@ -99,26 +73,20 @@ def main():
if 'send' in values.keys(): if 'send' in values.keys():
if values['send'] >= args.crit: if values['send'] >= args.crit:
info_str.append(f'CRITICAL: {domain} send is {values["send"]}s.') info_str.append(f'CRITICAL: {domain} send is {values["send"]}s.')
exit_code = nagios.CRITICAL crit_failed_hosts.append(domain)
elif values['send'] >= args.warn: elif values['send'] >= args.warn:
info_str.append(f'WARN: {domain} send is {values["send"]}s.') info_str.append(f'WARN: {domain} send is {values["send"]}s.')
if exit_code < nagios.WARNING: warn_failed_hosts.append(domain)
exit_code = nagios.WARNING
# else:
# print(f'OK: {domain} send is {values["send"]}s.')
else: else:
info_str.append(f'UNKNOWN: {domain} send is empty.') info_str.append(f'UNKNOWN: {domain} send is empty.')
if 'receive' in values.keys(): if 'receive' in values.keys():
if values['receive'] >= args.crit: if values['receive'] >= args.crit:
info_str.append(f'CRITICAL: {domain} receive is {values["receive"]}s.') info_str.append(f'CRITICAL: {domain} receive is {values["receive"]}s.')
exit_code = nagios.CRITICAL crit_failed_hosts.append(domain)
elif values['receive'] >= args.warn: elif values['receive'] >= args.warn:
info_str.append(f'WARN: {domain} receive is {values["receive"]}s.') info_str.append(f'WARN: {domain} receive is {values["receive"]}s.')
if exit_code < nagios.WARNING: warn_failed_hosts.append(domain)
exit_code = nagios.WARNING
# else:
# print(f'OK: {domain} receive is {values["receive"]}s.')
else: else:
info_str.append(f'UNKNOWN: {domain} receive is empty.') info_str.append(f'UNKNOWN: {domain} receive is empty.')
@ -126,11 +94,25 @@ def main():
data_str.append( data_str.append(
f"'{domain}-send'={values['send']}s;;; '{domain}-receive'={values['receive']}s;;;" f"'{domain}-send'={values['send']}s;;; '{domain}-receive'={values['receive']}s;;;"
) )
if any(('CRITICAL' not in s and 'WARNING' not in s) for s in info_str) or len(info_str) == 0:
if not len(crit_failed_hosts) and not len(warn_failed_hosts):
print(f'OK: ping time is good.', end=' ') print(f'OK: ping time is good.', end=' ')
else: else:
if len(crit_failed_hosts) / len(data.keys()) >= (args.crit_percent / 100):
# CRIT takes precedence
exit_code = nagios.CRITICAL
print(f'CRITICAL: {make_percent(len(crit_failed_hosts) / len(data.keys()))}% of hosts are marked as critical.')
elif len(warn_failed_hosts) / len(data.keys()) >= (args.warn_percent / 100):
exit_code = nagios.WARNING
print(f'WARN: {make_percent(len(warn_failed_hosts) / len(data.keys()))}% of hosts are marked as warn.')
if exit_code != nagios.OK:
for x in info_str: for x in info_str:
print(x, end=('\n' if info_str.index(x) + 1 < len(info_str) else '')) print(x, end=('\n' if info_str.index(x) + 1 < len(info_str) else ''))
else:
print('OK: ping is good')
print(f'Warn hosts: {", ".join(warn_failed_hosts) if len(warn_failed_hosts) else "none"}')
print(f'Critical hosts: {", ".join(crit_failed_hosts) if len(crit_failed_hosts) else "none"}')
print(f'|{" ".join(data_str)}') print(f'|{" ".join(data_str)}')
sys.exit(exit_code) sys.exit(exit_code)

View File

@ -86,7 +86,7 @@ async def send_image(client, room_id, image):
resp, maybe_keys = await client.upload(f, content_type=mime_type, # image/jpeg resp, maybe_keys = await client.upload(f, content_type=mime_type, # image/jpeg
filename=os.path.basename(image), filesize=file_stat.st_size, ) filename=os.path.basename(image), filesize=file_stat.st_size, )
if not isinstance(resp, UploadResponse): if not isinstance(resp, UploadResponse):
print(f'UNKNOWN: failed to upload image "{resp}"') print(f'UNKNOWN: failed to upload image "{vars(resp)}"')
sys.exit(nagios.UNKNOWN) sys.exit(nagios.UNKNOWN)
content = {"body": os.path.basename(image), # descriptive title content = {"body": os.path.basename(image), # descriptive title

View File

@ -1,19 +1,27 @@
import argparse
import json import json
import os
import sys
from pathlib import Path from pathlib import Path
import urllib3
from flask import Flask, Response, request from flask import Flask, Response, request
from icinga2api.client import Client from icinga2api.client import Client
from checker import nagios from checker import nagios
# Connection settings for the Icinga2 API.
endpoint = 'https://localhost:8080'  # Icinga2 URL for the API. Defaults to "https://localhost:8080"
icinga2_user = 'icingaweb2'  # API username. Defaults to "icingaweb2"
icinga2_pw = ''  # API password or set ICINGA2KUMA_ICINGA2_PW

# A password set above takes precedence; the environment variable is only a
# fallback. Validation happens afterwards so that a hard-coded password is
# not rejected.
if not icinga2_pw:
    icinga2_pw = os.environ.get('ICINGA2KUMA_ICINGA2_PW', '')
if not icinga2_pw:
    print('Must specify icinga2 API password.')
    sys.exit(1)

# NOTE(review): presumably silenced because the API endpoint uses a
# self-signed certificate -- confirm.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
client = Client(endpoint, icinga2_user, icinga2_pw)
app = Flask(__name__) app = Flask(__name__)
@ -24,6 +32,8 @@ app = Flask(__name__)
def get_host_state(hostid=None): def get_host_state(hostid=None):
path = Path(request.base_url) path = Path(request.base_url)
args_service = request.args.getlist('service') args_service = request.args.getlist('service')
args_exclude_service = request.args.getlist('exclude') # do not list these services
args_ignore_service = request.args.getlist('ignore') # do not trigger a fail if these services fail
kuma_mode = True if request.args.get('kuma') == 'true' else False kuma_mode = True if request.args.get('kuma') == 'true' else False
if not hostid: if not hostid:
@ -32,7 +42,9 @@ def get_host_state(hostid=None):
result = { result = {
'host': {}, 'host': {},
'services': {}, 'services': {},
'failed_services': [] 'failed_services': [],
'excluded_services': [],
'ignored_services': [],
} }
host_status = client.objects.list('Host', filters='match(hpattern, host.name)', filter_vars={'hpattern': hostid}) host_status = client.objects.list('Host', filters='match(hpattern, host.name)', filter_vars={'hpattern': hostid})
@ -53,6 +65,9 @@ def get_host_state(hostid=None):
services_status = client.objects.list('Service', filters='match(hpattern, host.name)', filter_vars={'hpattern': hostid}) services_status = client.objects.list('Service', filters='match(hpattern, host.name)', filter_vars={'hpattern': hostid})
for attrs in services_status: for attrs in services_status:
name = attrs['name'].split('!')[1] name = attrs['name'].split('!')[1]
if name in args_exclude_service:
result['excluded_services'].append(name)
else:
result['services'][name] = { result['services'][name] = {
'state': 0 if (attrs['attrs']['acknowledgement'] or attrs['attrs']['acknowledgement_expiry']) else attrs['attrs']['state'], 'state': 0 if (attrs['attrs']['acknowledgement'] or attrs['attrs']['acknowledgement_expiry']) else attrs['attrs']['state'],
'actual_state': attrs['attrs']['state'], 'actual_state': attrs['attrs']['state'],
@ -70,14 +85,14 @@ def get_host_state(hostid=None):
return Response(json.dumps({'error': 'service not found', 'service': service}), status=400, mimetype='application/json') return Response(json.dumps({'error': 'service not found', 'service': service}), status=400, mimetype='application/json')
result['services'] = services result['services'] = services
if kuma_mode: # if kuma_mode:
for name, service in result['services'].items(): for name, service in result['services'].items():
if service['state'] != nagios.OK: if service['state'] != nagios.OK and name not in args_ignore_service:
result['failed_services'].append({'name': name, 'state': service['state']}) result['failed_services'].append({'name': name, 'state': service['state']})
if result['host']['state'] != nagios.OK: if result['host']['state'] != nagios.OK:
result['failed_services'].append({'name': hostid, 'state': result['host']['state']}) result['failed_services'].append({'name': hostid, 'state': result['host']['state']})
if len(result['failed_services']): if kuma_mode and len(result['failed_services']):
return Response(json.dumps(result), status=410, mimetype='application/json') return Response(json.dumps(result), status=410, mimetype='application/json')
else: else:
return result return result