icinga2-checks/check_monitor_bot.py

130 lines
4.8 KiB
Python

#!/usr/bin/env python3
import argparse
import json
import sys
import numpy as np
import requests
from checker import nagios
# Command-line interface of the check. --warn and --crit are compared against
# latencies expressed in seconds later in this script.
parser = argparse.ArgumentParser(
    description='Icinga/Nagios check for monitor bot ping latency.',
)
parser.add_argument('--metrics-endpoint', required=True, help='Target URL to scrape.')
parser.add_argument('--domain', required=True, help='Our domain.')
parser.add_argument(
    '--prometheus',
    action='store_true',
    help='Use Prometheus instead of scraping the status page.',
)
# nargs='*' allows several hosts to be listed after a single --ignore flag.
parser.add_argument('--ignore', nargs='*', default=[], help='Ignore these hosts.')
parser.add_argument('--timeout', type=float, default=90, help='Request timeout limit.')
parser.add_argument('--warn', type=float, default=20, help='Manually set warn level.')
parser.add_argument('--crit', type=float, default=30, help='Manually set critical level.')
args = parser.parse_args()
if args.prometheus:
    # Prometheus mode: fetch the metrics endpoint, group the samples by
    # receiving/source domain and print the average value for pings we
    # received and pings we sent.
    from checker.prometheus import parse_metrics

    # Honour --timeout so a hung endpoint cannot block the check forever, and
    # report transport failures as UNKNOWN with a status line instead of a
    # traceback.
    try:
        r = requests.get(args.metrics_endpoint, timeout=args.timeout)
    except requests.exceptions.RequestException as e:
        print(f'UNKNOWN: failed to fetch metrics: {e}')
        sys.exit(nagios.UNKNOWN)
    if r.status_code != 200:
        print(f'UNKNOWN: metrics endpoint returned HTTP {r.status_code}.')
        sys.exit(nagios.UNKNOWN)

    # NOTE(review): a Prometheus `_sum` series is normally a running total of
    # all observations rather than a single latency; confirm this is the
    # series that should be averaged here.
    try:
        samples = parse_metrics(r.text)['monbot_ping_receive_delay_seconds'][
            'monbot_ping_receive_delay_seconds_sum'
        ]
    except KeyError as e:
        print(f'UNKNOWN: metric {e} not found in response.')
        sys.exit(nagios.UNKNOWN)

    # metrics[receivingDomain][sourceDomain] = sample value
    metrics = {}
    for item in samples:
        by_source = metrics.setdefault(item.labels['receivingDomain'], {})
        by_source[item.labels['sourceDomain']] = item.value

    pings = {'receiver': [], 'sender': []}
    for receiving_domain, senders in metrics.items():
        if receiving_domain == args.domain:
            # Everything other domains sent to us.
            pings['receiver'].extend(senders.values())
        elif args.domain in senders:
            # What we sent to this other domain.
            pings['sender'].append(senders[args.domain])

    print(json.dumps(pings))

    # np.average([]) returns nan but also emits a RuntimeWarning; produce the
    # same nan quietly when there is nothing to average.
    receiver_avg = np.round(np.average(pings['receiver']), 2) if pings['receiver'] else float('nan')
    sender_avg = np.round(np.average(pings['sender']), 2) if pings['sender'] else float('nan')
    print('receiver latency is', receiver_avg)
    print('sender latency is', sender_avg)
if not args.prometheus:
    # Status-page mode. These modules are only needed for HTML scraping, so
    # they are imported only on this path.
    from bs4 import BeautifulSoup
    import re

    # Split the values since icinga will quote the args. split() with no
    # separator also copes with repeated spaces and tabs, which the previous
    # split(' ') turned into empty host names.
    if len(args.ignore) == 1:
        args.ignore = args.ignore[0].split()
def get_sec(time_str):
    """Convert an ``H:M:S`` string into a whole number of seconds."""
    hours, minutes, seconds = (int(part) for part in time_str.split(':'))
    return (hours * 60 + minutes) * 60 + seconds
def ms_to_s(s):
    """Convert a duration string such as ``250ms``, ``1.5s`` or ``1m2.5s`` to seconds.

    The string is a sequence of ``<number><unit>`` components (units: ``h``,
    ``m``, ``s``, ``ms``, ``us``/``µs``, ``ns``), optionally preceded by a
    sign; the components are summed. Presumably these are the duration
    strings rendered on the status page -- TODO confirm the exact format.

    Args:
        s: The duration text; surrounding whitespace is ignored.

    Returns:
        The duration in seconds as a float, or ``None`` when ``s`` is not a
        recognised duration.
    """
    # Coarse units are multiplied; sub-second units are divided so that e.g.
    # 5ms is exactly 0.005 rather than an accumulated rounding error.
    multipliers = {'h': 3600.0, 'm': 60.0, 's': 1.0}
    divisors = {'ms': 1e3, 'us': 1e6, '\u00b5s': 1e6, '\u03bcs': 1e6, 'ns': 1e9}

    # Longest units first so that "5ms" is read as milliseconds, not minutes.
    units = sorted(list(multipliers) + list(divisors), key=len, reverse=True)
    component = r'(\d+(?:\.\d*)?|\.\d+)(' + '|'.join(map(re.escape, units)) + ')'

    text = s.strip()
    negative = text.startswith('-')
    if text[:1] in ('-', '+'):
        text = text[1:]

    # Reject anything that is not entirely made of duration components.
    if re.fullmatch(f'(?:{component})+', text) is None:
        return None

    total = 0.0
    for number, unit in re.findall(component, text):
        if unit in multipliers:
            total += float(number) * multipliers[unit]
        else:
            total += float(number) / divisors[unit]
    return -total if negative else total
if not args.prometheus:
    # Status-page mode: scrape per-domain send/receive latencies from the
    # HTML page, compare them with --warn/--crit and exit with the matching
    # Nagios status. Skipped entirely when --prometheus is given.

    # Honour --timeout and turn transport failures into an UNKNOWN status
    # line instead of hanging or dying with a traceback.
    try:
        r = requests.get(args.metrics_endpoint, timeout=args.timeout)
    except requests.exceptions.RequestException as e:
        print(f'UNKNOWN: failed to fetch status page: {e}')
        sys.exit(nagios.UNKNOWN)
    if r.status_code != 200:
        print(f'UNKNOWN: status page returned HTTP {r.status_code}.')
        sys.exit(nagios.UNKNOWN)

    soup = BeautifulSoup(r.text, 'html.parser')
    tooltip_re = re.compile(
        r'<span class="tooltip">\s*Send: (.*?)\s*<br\/>\s*Receive: (.*?)\s*<\/span>'
    )

    # data[domain] = {'send': seconds, 'receive': seconds}
    data = {}
    # The attribute filter must be a dict; a set only worked by accident.
    for item in soup.find_all('span', {'class': 'tooltip'}):
        m = tooltip_re.match(str(item))
        if m:
            # The domain label is looked up in the tooltip's grandparent element.
            domain = item.parent.parent.find('span', {'class': 'domain'}).text
            data[domain] = {
                'send': ms_to_s(m.group(1)),
                'receive': ms_to_s(m.group(2)),
            }

    if not data:
        print('UNKNOWN: failed to find any servers.')
        sys.exit(nagios.UNKNOWN)

    exit_code = nagios.OK
    info_str = []
    data_str = []
    unparsed = False
    for domain, values in data.items():
        if domain in args.ignore:
            continue

        # ms_to_s can return None for text it does not recognise; report that
        # instead of crashing on the threshold comparison below.
        if values['send'] is None or values['receive'] is None:
            info_str.append(f'UNKNOWN: {domain} reported a latency that could not be parsed.')
            unparsed = True
            continue

        for direction in ('send', 'receive'):
            latency = values[direction]
            if latency >= args.crit:
                info_str.append(f'CRITICAL: {domain} {direction} is {latency}s.')
                exit_code = nagios.CRITICAL
            elif latency >= args.warn:
                info_str.append(f'WARN: {domain} {direction} is {latency}s.')
                # Never downgrade an earlier CRITICAL.
                if exit_code < nagios.WARNING:
                    exit_code = nagios.WARNING

        data_str.append(
            f"'{domain}-send'={values['send']}s;;; '{domain}-receive'={values['receive']}s;;;"
        )

    # Unparseable data only decides the status when nothing else is wrong.
    if unparsed and exit_code == nagios.OK:
        exit_code = nagios.UNKNOWN

    # Print the findings whenever there are any. The previous test looked for
    # the word "WARNING" in lines labelled "WARN:", so any warning made the
    # check print "OK" while still exiting with a non-OK code.
    if info_str:
        print('\n'.join(info_str), end='')
    else:
        print('OK: ping time is good.', end=' ')
    # Performance data follows the status text after a pipe.
    print(f'|{" ".join(data_str)}')
    sys.exit(exit_code)