This commit is contained in:
Cyberes 2023-04-21 23:54:16 -06:00
parent d6c165667e
commit 39042ba364
16 changed files with 1492 additions and 0 deletions

View File

View File

@ -0,0 +1,201 @@
#!/usr/bin/env python3
import argparse
import asyncio
import json
import os
import sys
import time
import urllib
from datetime import datetime
from uuid import uuid4
from nio import AsyncClient, AsyncClientConfig, JoinError, JoinResponse, LoginResponse, RoomCreateError, RoomGetEventResponse, RoomSendError
import nagios
# Command-line interface. Two bot accounts on different homeservers are
# required so federation can be exercised in both directions.
parser = argparse.ArgumentParser(description='Test federation between two homeservers.')
parser.add_argument('--bot1-user', required=True, help='User ID for bot 1.')
parser.add_argument('--bot1-pw', required=True, help='Password for bot 1.')
parser.add_argument('--bot1-hs', required=True, help='Homeserver for bot 1.')
parser.add_argument('--bot1-auth-file', help="File to cache the bot's login details to.")
parser.add_argument('--bot2-user', required=True, help='User ID for bot 2.')
parser.add_argument('--bot2-pw', required=True, help='Password for bot 2.')
parser.add_argument('--bot2-hs', required=True, help='Homeserver for bot 2.')
parser.add_argument('--bot2-auth-file', help="File to cache the bot's login details to.")
parser.add_argument('--timeout', type=float, default=90, help='Request timeout limit.')
parser.add_argument('--warn', type=float, default=2.0, help='Manually set warn level.')
parser.add_argument('--crit', type=float, default=2.5, help='Manually set critical level.')
args = parser.parse_args()
# Bare host names (netloc) of the two homeservers, used in the status output.
bot1_hs_domain = urllib.parse.urlparse(args.bot1_hs).netloc
bot2_hs_domain = urllib.parse.urlparse(args.bot2_hs).netloc
def write_details_to_disk(resp: LoginResponse, homeserver, config_file) -> None:
    """Persist login details so later runs can authenticate without a password.

    Arguments:
    resp {LoginResponse} -- the successful client login response.
    homeserver -- URL of homeserver, e.g. "https://matrix.example.org"
    config_file -- path of the JSON file the credentials are written to.
    """
    details = {
        "homeserver": homeserver,           # e.g. "https://matrix.example.org"
        "user_id": resp.user_id,            # e.g. "@user:example.org"
        "device_id": resp.device_id,        # device ID, 10 uppercase letters
        "access_token": resp.access_token,  # cryptographic access token
    }
    # Overwrite any previous cache in place.
    with open(config_file, "w") as f:
        json.dump(details, f)
async def _leave_and_forget(client, room_id):
    """Leave and forget a room so temporary test rooms don't pile up."""
    await client.room_leave(room_id)
    await client.room_forget(room_id)


async def _cleanup_room(sender_client, receiver_client, room_id):
    """Remove the temporary test room from both bots' accounts."""
    await _leave_and_forget(sender_client, room_id)
    await _leave_and_forget(receiver_client, room_id)


async def test_one_direction(sender_client, receiver_client, receiver_user_id):
    """Time a message from sender to receiver through a freshly created room.

    Returns (seconds_taken: float, True) on success, or
    (status_message: str, nagios_code: int) on failure.
    """
    # The sender creates the room and invites the receiver.
    test_room_name = str(uuid4())
    new_test_room = await sender_client.room_create(name=test_room_name, invite=[receiver_user_id])
    if isinstance(new_test_room, RoomCreateError):
        # The original only printed this and then crashed on .room_id below.
        return f'UNKNOWN: failed to create room "{new_test_room}"', nagios.UNKNOWN
    new_test_room_id = new_test_room.room_id
    # asyncio.sleep, not time.sleep: a blocking sleep would stall the event loop.
    await asyncio.sleep(2)
    # The receiver joins via invite, retrying until it succeeds or times out.
    timeout_start = datetime.now()
    while True:
        resp = await receiver_client.join(new_test_room_id)
        if isinstance(resp, JoinResponse):
            break
        elif isinstance(resp, JoinError):
            return f'UNKNOWN: failed to join room "{resp}"', nagios.UNKNOWN
        if (datetime.now() - timeout_start).total_seconds() >= args.timeout:
            return 'UNKNOWN: failed to join room, timeout.', nagios.UNKNOWN
        await asyncio.sleep(2)
    # Sender sends a uniquely identifiable message to the room.
    send_msg_time = datetime.now()
    msg = {'id': str(uuid4()), 'ts': send_msg_time.microsecond}
    resp = (await sender_client.room_send(new_test_room_id, 'm.room.message', {'body': json.dumps(msg), 'msgtype': 'm.room.message'}))
    if isinstance(resp, RoomSendError):
        # The original message was missing its closing quote.
        return f'UNKNOWN: failed to send message "{resp}"', nagios.UNKNOWN
    msg_event_id = resp.event_id
    # Poll until the receiver can fetch the event, or we hit the timeout.
    start_check = datetime.now()
    while True:
        resp = await receiver_client.room_get_event(new_test_room_id, msg_event_id)
        if isinstance(resp, RoomGetEventResponse):
            recv_msg_time = datetime.now()
            recv_msg = json.loads(resp.event.source['content']['body'])
            break
        if (datetime.now() - start_check).total_seconds() >= args.timeout:
            await _cleanup_room(sender_client, receiver_client, new_test_room_id)
            return "CRITICAL: timeout - receiver did not receive the sender's message.", nagios.CRITICAL
    # Double check the payload round-tripped intact.
    if msg != recv_msg:
        await _cleanup_room(sender_client, receiver_client, new_test_room_id)
        return "CRITICAL: sender's message did not match the receiver's.", nagios.CRITICAL
    # Time from send to receive, including sync.
    bot1_msg_delta = (recv_msg_time - send_msg_time).total_seconds()
    await _cleanup_room(sender_client, receiver_client, new_test_room_id)
    return bot1_msg_delta, True
async def login(user_id, passwd, homeserver, config_file=None):
    """Log a bot in, optionally caching/restoring credentials via config_file.

    Returns a ready-to-use AsyncClient. Exits the process with
    nagios.UNKNOWN if a password login fails.
    """
    client_config = AsyncClientConfig(request_timeout=args.timeout, max_timeout_retry_wait_time=10)
    client = AsyncClient(homeserver, user_id, config=client_config)
    if config_file:
        # If there are no previously-saved credentials, we'll use the password
        if not os.path.exists(config_file):
            resp = await client.login(passwd)
            # check that we logged in successfully
            if isinstance(resp, LoginResponse):
                write_details_to_disk(resp, homeserver, config_file)
            else:
                print(f'UNKNOWN: failed to log in "{resp}"')
                sys.exit(nagios.UNKNOWN)
        else:
            # Otherwise the config file exists, so we'll use the stored credentials.
            with open(config_file, "r") as f:
                config = json.load(f)
            # Pass the client config here too so the timeout settings still
            # apply (the original dropped them on this path).
            client = AsyncClient(config["homeserver"], config=client_config)
            client.access_token = config["access_token"]
            client.user_id = config["user_id"]
            client.device_id = config["device_id"]
    else:
        resp = await client.login(passwd)
        # Fail loudly instead of returning a client with no access token.
        if not isinstance(resp, LoginResponse):
            print(f'UNKNOWN: failed to log in "{resp}"')
            sys.exit(nagios.UNKNOWN)
    return client
async def main() -> None:
    """Run the federation check in both directions and exit with a nagios code."""
    bot1 = await login(args.bot1_user, args.bot1_pw, args.bot1_hs, args.bot1_auth_file)
    bot2 = await login(args.bot2_user, args.bot2_pw, args.bot2_hs, args.bot2_auth_file)
    bot1_output_msg, bot1_output_code = await test_one_direction(bot1, bot2, args.bot2_user)
    bot2_output_msg, bot2_output_code = await test_one_direction(bot2, bot1, args.bot1_user)
    nagios_output = nagios.OK
    # test_one_direction() returns (delta, True) on success and
    # (message, nagios_code) on failure. The original tested `not code`,
    # which is False both for True and for the non-zero failure codes, so
    # failure messages were never printed. Test against True explicitly.
    if bot1_output_code is not True:
        print(bot1_output_msg)
        if nagios_output < bot1_output_code:
            # Only set the code if our code is more severe
            nagios_output = bot1_output_code
    if bot2_output_code is not True:
        print(bot2_output_msg)
        if nagios_output < bot2_output_code:
            # Only set the code if our code is more severe
            nagios_output = bot2_output_code
    # bot1 -> bot2
    if isinstance(bot1_output_msg, float):  # only do this if the func returned a value
        bot1_output_msg = round(bot1_output_msg, 2)
        if bot1_output_msg >= args.crit:
            if nagios_output < nagios.CRITICAL:
                nagios_output = nagios.CRITICAL
            print('CRITICAL:', bot1_hs_domain, '->', bot2_hs_domain, 'is', bot1_output_msg, 'seconds.')
        elif bot1_output_msg >= args.warn:
            if nagios_output < nagios.WARNING:
                nagios_output = nagios.WARNING
            print('WARNING:', bot1_hs_domain, '->', bot2_hs_domain, 'is', bot1_output_msg, 'seconds.')
        else:
            print('OK:', bot1_hs_domain, '->', bot2_hs_domain, 'is', bot1_output_msg, 'seconds.')
    # bot2 -> bot1
    if isinstance(bot2_output_msg, float):
        bot2_output_msg = round(bot2_output_msg, 2)
        if bot2_output_msg >= args.crit:
            if nagios_output < nagios.CRITICAL:
                nagios_output = nagios.CRITICAL
            print('CRITICAL:', bot1_hs_domain, '<-', bot2_hs_domain, 'is', bot2_output_msg, 'seconds.')
        elif bot2_output_msg >= args.warn:
            if nagios_output < nagios.WARNING:
                nagios_output = nagios.WARNING
            print('WARNING:', bot1_hs_domain, '<-', bot2_hs_domain, 'is', bot2_output_msg, 'seconds.')
        else:
            print('OK:', bot1_hs_domain, '<-', bot2_hs_domain, 'is', bot2_output_msg, 'seconds.')
    # Clean up
    await bot1.close()
    await bot2.close()
    sys.exit(nagios_output)
if __name__ == "__main__":
    try:
        asyncio.run(main())
    except Exception as e:
        # Any unexpected error is reported as the nagios UNKNOWN state.
        print(f"UNKNOWN: exception '{e}'")
        sys.exit(nagios.UNKNOWN)

View File

@ -0,0 +1,220 @@
#!/usr/bin/env python3
import argparse
import asyncio
import json
import os
import sys
import tempfile
import urllib
import aiofiles.os
import magic
import numpy as np
import requests
from PIL import Image
from nio import AsyncClient, AsyncClientConfig, LoginResponse, UploadResponse
from urllib3.exceptions import InsecureRequestWarning
import nagios
# Command-line interface for the media / media-CDN check.
parser = argparse.ArgumentParser(description='')
parser.add_argument('--user', required=True, help='User ID for the bot.')
parser.add_argument('--pw', required=True, help='Password for the bot.')
parser.add_argument('--hs', required=True, help='Homeserver of the bot.')
parser.add_argument('--admin-endpoint', required=True, help='Admin endpoint that will be called to purge media for this user.')
parser.add_argument('--room', required=True, help='The room the bot should send its test messages in.')
parser.add_argument('--media-cdn-domain', required=True, help='The domain to make sure it redirects to.')
parser.add_argument('--auth-file', help="File to cache the bot's login details to.")
parser.add_argument('--timeout', type=float, default=90, help='Request timeout limit.')
parser.add_argument('--warn', type=float, default=2.0, help='Manually set warn level.')
parser.add_argument('--crit', type=float, default=2.5, help='Manually set critical level.')
args = parser.parse_args()
# Path used to cache login credentials between runs (may be None).
CONFIG_FILE = args.auth_file
def verify_media_header(header: str, header_dict: dict, good_value: str = None, warn_value: str = None, critical_value: str = None):
    """Check one response header against expected values.

    If you don't specify good_value, warn_value, or critical_value then the
    header will only be checked for existence.

    Returns a (status message, nagios code) tuple.
    """
    raw_value = header_dict.get(header)
    # Detect a missing/empty header BEFORE any str() conversion: the original
    # stringified None to the truthy "None", so absent headers were never
    # reported — and even compared equal to an unset good_value (also "None").
    if raw_value is None or raw_value == '':
        return f'CRITICAL: missing header "{header}"', nagios.CRITICAL
    # Convert everything to strings to prevent any weirdness.
    header_value = str(raw_value)
    if good_value is not None and header_value == str(good_value):
        return f'OK: {header}: "{header_value}"', nagios.OK
    elif warn_value is not None and header_value == str(warn_value):
        return f'WARN: {header}: "{header_value}"', nagios.WARNING
    elif critical_value is not None and header_value == str(critical_value):
        return f'CRITICAL: {header}: "{header_value}"', nagios.CRITICAL
    return f'OK: {header} is present with value "{header_value}"', nagios.OK
def write_details_to_disk(resp: LoginResponse, homeserver) -> None:
    """Persist login details so later runs can authenticate without a password.

    Arguments:
    resp {LoginResponse} -- the successful client login response.
    homeserver -- URL of homeserver, e.g. "https://matrix.example.org"
    """
    details = {
        "homeserver": homeserver,           # e.g. "https://matrix.example.org"
        "user_id": resp.user_id,            # e.g. "@user:example.org"
        "device_id": resp.device_id,        # device ID, 10 uppercase letters
        "access_token": resp.access_token,  # cryptographic access token
    }
    # CONFIG_FILE is the module-level cache path derived from --auth-file.
    with open(CONFIG_FILE, "w") as f:
        json.dump(details, f)
async def send_image(client, room_id, image):
    """Upload an image file and send it to a room as an m.image event.

    Exits the process with nagios.UNKNOWN on any failure.

    Arguments:
    ---------
    client : Client
    room_id : str
    image : str, file name of image

    This is a working example for a JPG image.
        "content": {
            "body": "someimage.jpg",
            "info": {
                "size": 5420,
                "mimetype": "image/jpeg",
                "thumbnail_info": {
                    "w": 100,
                    "h": 100,
                    "mimetype": "image/jpeg",
                    "size": 2106
                },
                "w": 100,
                "h": 100,
                "thumbnail_url": "mxc://example.com/SomeStrangeThumbnailUriKey"
            },
            "msgtype": "m.image",
            "url": "mxc://example.com/SomeStrangeUriKey"
        }
    """
    # Sniff the real MIME type from file contents, e.g. "image/jpeg".
    mime_type = magic.from_file(image, mime=True)
    if not mime_type.startswith("image/"):
        print(f'UNKNOWN: wrong mime type "{mime_type}"')
        sys.exit(nagios.UNKNOWN)
    im = Image.open(image)
    (width, height) = im.size  # im.size returns (width,height) tuple
    # first do an upload of image, then send URI of upload to room
    file_stat = await aiofiles.os.stat(image)
    async with aiofiles.open(image, "r+b") as f:
        resp, maybe_keys = await client.upload(f, content_type=mime_type,  # image/jpeg
                                               filename=os.path.basename(image), filesize=file_stat.st_size, )
    if not isinstance(resp, UploadResponse):
        print(f'UNKNOWN: failed to upload image "{resp}"')
        sys.exit(nagios.UNKNOWN)
    content = {"body": os.path.basename(image),  # descriptive title
               "info": {"size": file_stat.st_size, "mimetype": mime_type, "thumbnail_info": None,  # TODO
                        "w": width,  # width in pixel
                        "h": height,  # height in pixel
                        "thumbnail_url": None,  # TODO
                        }, "msgtype": "m.image", "url": resp.content_uri, }
    try:
        return await client.room_send(room_id, message_type="m.room.message", content=content)
    except Exception as e:
        print(f"Image send of file {image} failed.")
        print(f'UNKNOWN: failed to send image event "{e}"')
        sys.exit(nagios.UNKNOWN)
async def main() -> None:
    """Upload a random test image and verify the media CDN serves it correctly.

    Checks the redirect domain and several cache/storage headers, then cleans
    the test media up. Exits with the worst nagios code observed.
    """
    client = AsyncClient(args.hs, args.user, config=AsyncClientConfig(request_timeout=args.timeout, max_timeout_retry_wait_time=10))
    if args.auth_file:
        # If there are no previously-saved credentials, we'll use the password
        if not os.path.exists(CONFIG_FILE):
            resp = await client.login(args.pw)
            # check that we logged in successfully
            if isinstance(resp, LoginResponse):
                write_details_to_disk(resp, args.hs)
            else:
                print(f'UNKNOWN: failed to log in "{resp}"')
                sys.exit(nagios.UNKNOWN)
        else:
            # Otherwise the config file exists, so we'll use the stored credentials
            with open(CONFIG_FILE, "r") as f:
                config = json.load(f)
                client = AsyncClient(config["homeserver"])
                client.access_token = config["access_token"]
                client.user_id = config["user_id"]
                client.device_id = config["device_id"]
    else:
        await client.login(args.pw)
    await client.join(args.room)
    # Create a random image
    imarray = np.random.rand(100, 100, 3) * 255
    im = Image.fromarray(imarray.astype('uint8')).convert('RGBA')
    _, test_image_path = tempfile.mkstemp()
    test_image_path = test_image_path + '.png'
    im.save(test_image_path)
    # Send the image and get the event ID
    image_event_id = (await send_image(client, args.room, test_image_path)).event_id
    # Get the event
    image_event = (await client.room_get_event(args.room, image_event_id)).event
    # convert mxc:// to http://
    target_file_url = await client.mxc_to_http(image_event.url)
    # Check the headers. Ignore the non-async thing here, it doesn't
    # matter in this situation.
    headers = dict(requests.head(target_file_url).headers)
    exit_code = nagios.OK
    # Check that the media request redirects to the expected CDN domain.
    domain = urllib.parse.urlparse(headers['location']).netloc
    if domain != args.media_cdn_domain:
        exit_code = nagios.CRITICAL
        print(f'CRITICAL: media CDN domain is "{domain}"')
    else:
        print(f'OK: media CDN domain is "{domain}"')
    results = [verify_media_header('synapse-media-local-status', headers), verify_media_header('synapse-media-s3-status', headers, good_value='200'), verify_media_header('synapse-media-server', headers, good_value='s3'),
               verify_media_header('Server', headers, good_value='cloudflare')]
    for header_chk, code in results:
        # Escalate only: the original assigned unconditionally, which let a
        # later WARNING overwrite an earlier CRITICAL.
        if code > exit_code:
            exit_code = code
        print(header_chk)
    # Clean up
    await client.room_redact(args.room, image_event_id)
    os.remove(test_image_path)
    await client.close()
    requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning)
    try:
        r = requests.delete(f'{args.admin_endpoint}/_synapse/admin/v1/users/{args.user}/media', headers={'Authorization': f'Bearer {client.access_token}'}, verify=False)
        if r.status_code != 200:
            if nagios.WARNING < exit_code:
                exit_code = nagios.WARNING
            print(f"WARN: failed to purge media for this user, request failed with '{r.text}'")
    except Exception as e:
        if nagios.WARNING < exit_code:
            exit_code = nagios.WARNING
        print(f"WARN: failed to purge media for this user '{e}'")
    sys.exit(exit_code)
if __name__ == "__main__":
    try:
        asyncio.run(main())
    except Exception as e:
        # Any unexpected error is reported as the nagios UNKNOWN state.
        print(f'UNKNOWN: exception "{e}"')
        sys.exit(nagios.UNKNOWN)

View File

@ -0,0 +1,130 @@
#!/usr/bin/env python3
import argparse
import json
import sys
import numpy as np
import requests
import nagios
# Command-line interface for the federation-ping scrape check.
parser = argparse.ArgumentParser(description='')
parser.add_argument('--metrics-endpoint', required=True, help='Target URL to scrape.')
parser.add_argument('--domain', required=True, help='Our domain.')
parser.add_argument('--prometheus', action='store_true', help='Use Promethus instead of scraping the status page.')
parser.add_argument('--ignore', nargs='*', default=[], help='Ignore these hosts.')
parser.add_argument('--timeout', type=float, default=90, help='Request timeout limit.')
parser.add_argument('--warn', type=float, default=20, help='Manually set warn level.')
parser.add_argument('--crit', type=float, default=30, help='Manually set critical level.')
args = parser.parse_args()
if args.prometheus:
    from prometheus import parse_metrics
    r = requests.get(args.metrics_endpoint)
    if r.status_code != 200:
        sys.exit(nagios.UNKNOWN)
    # Build metrics[receiving_domain][source_domain] = delay value.
    metrics = {}
    for item in parse_metrics(r.text)['monbot_ping_receive_delay_seconds']['monbot_ping_receive_delay_seconds_sum']:
        if item.labels['receivingDomain'] not in metrics.keys():
            metrics[item.labels['receivingDomain']] = {}
        metrics[item.labels['receivingDomain']][item.labels['sourceDomain']] = item.value
    # Split into delays where we are the receiver vs. where we are the sender.
    pings = {'receiver': [], 'sender': [], }
    for receiving_domain, senders in metrics.items():
        if receiving_domain == args.domain:
            for k, v in senders.items():
                pings['receiver'].append(v)
        else:
            for k, v in senders.items():
                if k == args.domain:
                    pings['sender'].append(v)
    print(json.dumps(pings))
    receiver_avg = np.round(np.average(pings['receiver']), 2)
    sender_avg = np.round(np.average(pings['sender']), 2)
    print('receiver latency is', receiver_avg)
    print('sender latency is', sender_avg)
else:
    # HTML-scraping mode: parse the public ping status page instead.
    from bs4 import BeautifulSoup
    import re
    # Split the values since icinga will quote the args
    if len(args.ignore) == 1:
        args.ignore = args.ignore[0].strip(' ').split(' ')
def get_sec(time_str):
    """Get seconds from an 'H:M:S' time string."""
    h, m, s = time_str.split(':')
    return int(h) * 3600 + int(m) * 60 + int(s)

def ms_to_s(s):
    """Convert a duration string like '1m2.3s', '450ms', or '2.5s' to seconds.

    Returns None for strings matching none of the known formats.
    """
    min_m = re.match(r'^(\d+)m([\d.]+)s', s)
    if min_m:
        return get_sec(f'0:{min_m.group(1)}:{int(float(min_m.group(2)))}')
    elif s.endswith('ms'):
        # The original built float('0.' + s.strip('ms')), which is only
        # correct for exactly three digits ('15ms' became 0.15 s instead of
        # 0.015 s) and crashed on decimals like '1.5ms'. Divide by 1000.
        return float(s[:-2]) / 1000
    elif s.endswith('s'):
        return float(s[:-1])
# Fetch and parse the status page, pulling send/receive times per domain
# out of the tooltip spans.
r = requests.get(args.metrics_endpoint)
if r.status_code != 200:
    sys.exit(nagios.UNKNOWN)
soup = BeautifulSoup(r.text, 'html.parser')
# NOTE(review): {'class', 'tooltip'} is a set literal, not a dict — presumably
# {'class': 'tooltip'} was intended; confirm against the scraped page markup.
tooltips = soup.find_all('span', {'class', 'tooltip'})
data = {}
for item in tooltips:
    m = re.match(r'<span class="tooltip">\s*Send: (.*?)\s*<br\/>\s*Receive: (.*?)\s*<\/span>', str(item))
    print(item)
    if m:
        domain = item.parent.parent.find('span', {'class': 'domain'}).text
        data[domain] = {
            'send': ms_to_s(m.group(1)),
            'receive': ms_to_s(m.group(2)),
        }
exit_code = nagios.OK
info_str = []   # human-readable CRITICAL/WARN lines
data_str = []   # nagios perfdata entries
if len(data.keys()) == 0:
    print('UNKNOWN: failed to find any servers.')
    sys.exit(nagios.UNKNOWN)
for domain, values in data.items():
    if domain not in args.ignore:
        if values['send'] >= args.crit:
            info_str.append(f'CRITICAL: {domain} send is {values["send"]}s.')
            exit_code = nagios.CRITICAL
        elif values['send'] >= args.warn:
            info_str.append(f'WARN: {domain} send is {values["send"]}s.')
            if exit_code < nagios.WARNING:
                exit_code = nagios.WARNING
        # else:
        #     print(f'OK: {domain} send is {values["send"]}s.')
        if values['receive'] >= args.crit:
            info_str.append(f'CRITICAL: {domain} receive is {values["receive"]}s.')
            exit_code = nagios.CRITICAL
        elif values['receive'] >= args.warn:
            info_str.append(f'WARN: {domain} receive is {values["receive"]}s.')
            if exit_code < nagios.WARNING:
                exit_code = nagios.WARNING
        # else:
        #     print(f'OK: {domain} receive is {values["receive"]}s.')
        data_str.append(
            f"'{domain}-send'={values['send']}s;;; '{domain}-receive'={values['receive']}s;;;"
        )
# NOTE(review): info_str entries are prefixed 'WARN:' but this searches for
# 'WARNING', so the OK summary can print even when warnings exist — and
# info_str.index(x) misbehaves with duplicate messages. Verify intent.
if any(('CRITICAL' not in s and 'WARNING' not in s) for s in info_str) or len(info_str) == 0:
    print(f'OK: ping time is good.', end=' ')
else:
    for x in info_str:
        print(x, end=('\n' if info_str.index(x) + 1 < len(info_str) else ''))
print(f'|{" ".join(data_str)}')
sys.exit(exit_code)

View File

@ -0,0 +1,378 @@
import json
import numpy as np
import requests
from urllib3.exceptions import InsecureRequestWarning
# All Grafana requests below use verify=False, so silence the
# self-signed-certificate warnings up front.
requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning)
def get_avg_python_gc_time(api_key, interval, data_range, endpoint):
    """Query Grafana for the average Python GC time of each Synapse job.

    Returns a list with one rounded average per series that reported data.
    """
    payload = {
        'queries': [
            {
                'datasource': {
                    'type': 'prometheus',
                    'uid': 'AbuT5CJ4z',
                },
                'expr': 'rate(python_gc_time_sum{instance="10.0.0.34:9000",job=~"(federation-receiver|federation-sender|initialsync|synapse|synchrotron)",index=~".*"}[30s])/rate(python_gc_time_count[30s])',
                'format': 'time_series',
                'intervalFactor': 2,
                'refId': 'A',
                'step': 20,
                'target': '',
                'interval': '',
                # 'key': 'Q-7edaea76-89bd-4b29-8412-a68bf4646712-0',
                'queryType': 'timeSeriesQuery',
                'exemplar': False,
                # 'requestId': 'Q-7edaea76-89bd-4b29-8412-a68bf4646712-0A',
                'utcOffsetSec': -25200,
                'legendFormat': '',
                'datasourceId': 1,
                'intervalMs': interval * 1000,
                # 'maxDataPoints': 1383,
            },
        ],
        'from': f'now-{data_range}m',
        'to': 'now',
    }
    response = requests.post(f'{endpoint}/api/ds/query', headers={'Authorization': f'Bearer {api_key}'}, json=payload, verify=False).json()
    # Each frame's second values column may contain nulls: strip them out,
    # drop any series left empty, then average what remains.
    series = [
        [point for point in frame['data']['values'][1] if point is not None]
        for frame in response['results']['A']['frames']
    ]
    return [np.round(np.average(points), 5) for points in series if points]
def get_outgoing_http_request_rate(api_key, interval, data_range, endpoint):
    """Query Grafana for outgoing HTTP request rates (client + federation).

    Returns a dict mapping a label parsed from each frame's schema name
    to its rounded average rate.
    """
    json_data = {
        'queries': [
            {
                'datasource': {
                    'type': 'prometheus',
                    'uid': 'AbuT5CJ4z',
                },
                'editorMode': 'code',
                'expr': 'rate(synapse_http_client_requests_total{job=~"(federation-receiver|federation-sender|initialsync|synapse|synchrotron)",index=~".*",instance="10.0.0.34:9000"}[2m])',
                'range': True,
                'refId': 'A',
                'interval': '',
                # 'key': 'Q-8b3dabd7-358e-45ed-a9ba-7be3f5fcf274-0',
                'queryType': 'timeSeriesQuery',
                'exemplar': False,
                # 'requestId': 'Q-8b3dabd7-358e-45ed-a9ba-7be3f5fcf274-0Q-c5c08c6b-7591-424c-8eac-53837fa51e89-1A',
                'utcOffsetSec': -25200,
                'legendFormat': '',
                'datasourceId': 1,
                'intervalMs': interval * 1000,
                # 'maxDataPoints': 10,
            },
            {
                'datasource': {
                    'type': 'prometheus',
                    'uid': 'AbuT5CJ4z',
                },
                'editorMode': 'code',
                'expr': 'rate(synapse_http_matrixfederationclient_requests_total{job=~"(federation-receiver|federation-sender|initialsync|synapse|synchrotron)",index=~".*",instance="10.0.0.34:9000"}[2m])',
                'range': True,
                'refId': 'B',
                'interval': '',
                # 'key': 'Q-c5c08c6b-7591-424c-8eac-53837fa51e89-1',
                'queryType': 'timeSeriesQuery',
                'exemplar': False,
                # 'requestId': 'Q-8b3dabd7-358e-45ed-a9ba-7be3f5fcf274-0Q-c5c08c6b-7591-424c-8eac-53837fa51e89-1B',
                'utcOffsetSec': -25200,
                'legendFormat': '',
                'datasourceId': 1,
                'intervalMs': interval * 1000,
                # 'maxDataPoints': 10,
            },
        ],
        'from': f'now-{data_range}m',
        'to': 'now',
    }
    response = requests.post(f'{endpoint}/api/ds/query', headers={'Authorization': f'Bearer {api_key}'}, json=json_data, verify=False).json()
    output = {}
    for letter, result in response['results'].items():
        # The schema name looks like a label selector; take the last "=" part
        # and strip the trailing brace/quotes to get a readable key.
        name = result['frames'][0]['schema']['name'].split('=')[-1].strip('}').strip('"')
        output[name] = np.round(np.average(result['frames'][0]['data']['values'][1]), 2)
    return output
    # return {
    #     'GET': np.round(np.average(response['results']['A']['frames'][0]['data']['values'][1]), 2),
    #     'POST': np.round(np.average(response['results']['A']['frames'][1]['data']['values'][1]), 2),
    #     'PUT': np.round(np.average(response['results']['A']['frames'][2]['data']['values'][1]), 2),
    #     'fedr_GET': np.round(np.average(response['results']['B']['frames'][0]['data']['values'][1]), 2)
    # }
def get_event_send_time(api_key, interval, data_range, endpoint):
    """Query Grafana for event-send timing percentiles and persist rate.

    NOTE(review): despite querying several percentile series (refIds A-H),
    only result 'E' (the persisted-events rate) is averaged and returned —
    confirm whether the other series are intentionally unused.
    """
    json_data = {
        'queries': [
            {
                'datasource': {
                    'type': 'prometheus',
                    'uid': 'AbuT5CJ4z',
                },
                'expr': 'histogram_quantile(0.99, sum(rate(synapse_http_server_response_time_seconds_bucket{servlet=\'RoomSendEventRestServlet\',index=~".*",instance="10.0.0.34:9000",code=~"2.."}[2m])) by (le))',
                'format': 'time_series',
                'intervalFactor': 1,
                'refId': 'D',
                'interval': '',
                # 'key': 'Q-d8eb3572-9aea-4a73-92f2-e08b33c21ecb-0',
                'editorMode': 'builder',
                'range': True,
                'instant': True,
                'queryType': 'timeSeriesQuery',
                'exemplar': False,
                # 'requestId': 'Q-d8eb3572-9aea-4a73-92f2-e08b33c21ecb-0Q-a9222e59-18ff-4b3b-80ae-27bea8f149a9-1Q-0378a458-1ade-410e-a4b3-ae4aaa91d709-2Q-da4c00b6-61c1-49f5-8a0a-9f19990acfb7-3Q-21254889-3cf6-4d97-8dc5-ddf68360847e-4Q-502b8ed5-4050-461c-befc-76f6796dce68-5Q-364dc896-c399-4e58-8930-cba2e3d1d579-6Q-9072e904-da8d-4b00-b454-dac45b7c38f0-7D',
                'utcOffsetSec': -25200,
                'legendFormat': '',
                'datasourceId': 1,
                'intervalMs': interval * 1000,
                # 'maxDataPoints': 1383,
            },
            {
                'datasource': {
                    'type': 'prometheus',
                    'uid': 'AbuT5CJ4z',
                },
                'expr': 'histogram_quantile(0.9, sum(rate(synapse_http_server_response_time_seconds_bucket{servlet=\'RoomSendEventRestServlet\',index=~".*",instance="10.0.0.34:9000",code=~"2.."}[2m])) by (le))',
                'format': 'time_series',
                'interval': '',
                'intervalFactor': 1,
                'refId': 'A',
                # 'key': 'Q-a9222e59-18ff-4b3b-80ae-27bea8f149a9-1',
                'queryType': 'timeSeriesQuery',
                'exemplar': False,
                # 'requestId': 'Q-d8eb3572-9aea-4a73-92f2-e08b33c21ecb-0Q-a9222e59-18ff-4b3b-80ae-27bea8f149a9-1Q-0378a458-1ade-410e-a4b3-ae4aaa91d709-2Q-da4c00b6-61c1-49f5-8a0a-9f19990acfb7-3Q-21254889-3cf6-4d97-8dc5-ddf68360847e-4Q-502b8ed5-4050-461c-befc-76f6796dce68-5Q-364dc896-c399-4e58-8930-cba2e3d1d579-6Q-9072e904-da8d-4b00-b454-dac45b7c38f0-7A',
                'utcOffsetSec': -25200,
                'legendFormat': '',
                'datasourceId': 1,
                'intervalMs': interval * 1000,
                # 'maxDataPoints': 1383,
            },
            {
                'datasource': {
                    'type': 'prometheus',
                    'uid': 'AbuT5CJ4z',
                },
                'expr': 'histogram_quantile(0.75, sum(rate(synapse_http_server_response_time_seconds_bucket{servlet=\'RoomSendEventRestServlet\',index=~".*",instance="10.0.0.34:9000",code=~"2.."}[2m])) by (le))',
                'format': 'time_series',
                'intervalFactor': 1,
                'refId': 'C',
                'interval': '',
                # 'key': 'Q-0378a458-1ade-410e-a4b3-ae4aaa91d709-2',
                'queryType': 'timeSeriesQuery',
                'exemplar': False,
                # 'requestId': 'Q-d8eb3572-9aea-4a73-92f2-e08b33c21ecb-0Q-a9222e59-18ff-4b3b-80ae-27bea8f149a9-1Q-0378a458-1ade-410e-a4b3-ae4aaa91d709-2Q-da4c00b6-61c1-49f5-8a0a-9f19990acfb7-3Q-21254889-3cf6-4d97-8dc5-ddf68360847e-4Q-502b8ed5-4050-461c-befc-76f6796dce68-5Q-364dc896-c399-4e58-8930-cba2e3d1d579-6Q-9072e904-da8d-4b00-b454-dac45b7c38f0-7C',
                'utcOffsetSec': -25200,
                'legendFormat': '',
                'datasourceId': 1,
                'intervalMs': interval * 1000,
                # 'maxDataPoints': 1383,
            },
            {
                'datasource': {
                    'type': 'prometheus',
                    'uid': 'AbuT5CJ4z',
                },
                'expr': 'histogram_quantile(0.5, sum(rate(synapse_http_server_response_time_seconds_bucket{servlet=\'RoomSendEventRestServlet\',index=~".*",instance="10.0.0.34:9000",code=~"2.."}[2m])) by (le))',
                'format': 'time_series',
                'intervalFactor': 1,
                'refId': 'B',
                'interval': '',
                # 'key': 'Q-da4c00b6-61c1-49f5-8a0a-9f19990acfb7-3',
                'queryType': 'timeSeriesQuery',
                'exemplar': False,
                # 'requestId': 'Q-d8eb3572-9aea-4a73-92f2-e08b33c21ecb-0Q-a9222e59-18ff-4b3b-80ae-27bea8f149a9-1Q-0378a458-1ade-410e-a4b3-ae4aaa91d709-2Q-da4c00b6-61c1-49f5-8a0a-9f19990acfb7-3Q-21254889-3cf6-4d97-8dc5-ddf68360847e-4Q-502b8ed5-4050-461c-befc-76f6796dce68-5Q-364dc896-c399-4e58-8930-cba2e3d1d579-6Q-9072e904-da8d-4b00-b454-dac45b7c38f0-7B',
                'utcOffsetSec': -25200,
                'legendFormat': '',
                'datasourceId': 1,
                'intervalMs': interval * 1000,
                # 'maxDataPoints': 1383,
            },
            {
                'datasource': {
                    'type': 'prometheus',
                    'uid': 'AbuT5CJ4z',
                },
                'expr': 'histogram_quantile(0.25, sum(rate(synapse_http_server_response_time_seconds_bucket{servlet=\'RoomSendEventRestServlet\',index=~".*",instance="10.0.0.34:9000",code=~"2.."}[2m])) by (le))',
                'refId': 'F',
                'interval': '',
                # 'key': 'Q-21254889-3cf6-4d97-8dc5-ddf68360847e-4',
                'queryType': 'timeSeriesQuery',
                'exemplar': False,
                # 'requestId': 'Q-d8eb3572-9aea-4a73-92f2-e08b33c21ecb-0Q-a9222e59-18ff-4b3b-80ae-27bea8f149a9-1Q-0378a458-1ade-410e-a4b3-ae4aaa91d709-2Q-da4c00b6-61c1-49f5-8a0a-9f19990acfb7-3Q-21254889-3cf6-4d97-8dc5-ddf68360847e-4Q-502b8ed5-4050-461c-befc-76f6796dce68-5Q-364dc896-c399-4e58-8930-cba2e3d1d579-6Q-9072e904-da8d-4b00-b454-dac45b7c38f0-7F',
                'utcOffsetSec': -25200,
                'legendFormat': '',
                'datasourceId': 1,
                'intervalMs': interval * 1000,
                # 'maxDataPoints': 1383,
            },
            {
                'datasource': {
                    'type': 'prometheus',
                    'uid': 'AbuT5CJ4z',
                },
                'expr': 'histogram_quantile(0.05, sum(rate(synapse_http_server_response_time_seconds_bucket{servlet=\'RoomSendEventRestServlet\',index=~".*",instance="10.0.0.34:9000",code=~"2.."}[2m])) by (le))',
                'refId': 'G',
                'interval': '',
                # 'key': 'Q-502b8ed5-4050-461c-befc-76f6796dce68-5',
                'queryType': 'timeSeriesQuery',
                'exemplar': False,
                # 'requestId': 'Q-d8eb3572-9aea-4a73-92f2-e08b33c21ecb-0Q-a9222e59-18ff-4b3b-80ae-27bea8f149a9-1Q-0378a458-1ade-410e-a4b3-ae4aaa91d709-2Q-da4c00b6-61c1-49f5-8a0a-9f19990acfb7-3Q-21254889-3cf6-4d97-8dc5-ddf68360847e-4Q-502b8ed5-4050-461c-befc-76f6796dce68-5Q-364dc896-c399-4e58-8930-cba2e3d1d579-6Q-9072e904-da8d-4b00-b454-dac45b7c38f0-7G',
                'utcOffsetSec': -25200,
                'legendFormat': '',
                'datasourceId': 1,
                'intervalMs': interval * 1000,
                # 'maxDataPoints': 1383,
            },
            {
                'datasource': {
                    'type': 'prometheus',
                    'uid': 'AbuT5CJ4z',
                },
                'expr': 'sum(rate(synapse_http_server_response_time_seconds_sum{servlet=\'RoomSendEventRestServlet\',index=~".*",instance="10.0.0.34:9000",code=~"2.."}[2m])) / sum(rate(synapse_http_server_response_time_seconds_count{servlet=\'RoomSendEventRestServlet\',index=~".*",instance="10.0.0.34:9000",code=~"2.."}[2m]))',
                'refId': 'H',
                'interval': '',
                # 'key': 'Q-364dc896-c399-4e58-8930-cba2e3d1d579-6',
                'queryType': 'timeSeriesQuery',
                'exemplar': False,
                # 'requestId': 'Q-d8eb3572-9aea-4a73-92f2-e08b33c21ecb-0Q-a9222e59-18ff-4b3b-80ae-27bea8f149a9-1Q-0378a458-1ade-410e-a4b3-ae4aaa91d709-2Q-da4c00b6-61c1-49f5-8a0a-9f19990acfb7-3Q-21254889-3cf6-4d97-8dc5-ddf68360847e-4Q-502b8ed5-4050-461c-befc-76f6796dce68-5Q-364dc896-c399-4e58-8930-cba2e3d1d579-6Q-9072e904-da8d-4b00-b454-dac45b7c38f0-7H',
                'utcOffsetSec': -25200,
                'legendFormat': '',
                'datasourceId': 1,
                'intervalMs': interval * 1000,
                # 'maxDataPoints': 1383,
            },
            {
                'datasource': {
                    'type': 'prometheus',
                    'uid': 'AbuT5CJ4z',
                },
                'expr': 'sum(rate(synapse_storage_events_persisted_events_total{instance="10.0.0.34:9000"}[2m]))',
                'hide': False,
                'instant': False,
                'refId': 'E',
                'interval': '',
                # 'key': 'Q-9072e904-da8d-4b00-b454-dac45b7c38f0-7',
                'editorMode': 'code',
                'queryType': 'timeSeriesQuery',
                'exemplar': False,
                # 'requestId': 'Q-d8eb3572-9aea-4a73-92f2-e08b33c21ecb-0Q-a9222e59-18ff-4b3b-80ae-27bea8f149a9-1Q-0378a458-1ade-410e-a4b3-ae4aaa91d709-2Q-da4c00b6-61c1-49f5-8a0a-9f19990acfb7-3Q-21254889-3cf6-4d97-8dc5-ddf68360847e-4Q-502b8ed5-4050-461c-befc-76f6796dce68-5Q-364dc896-c399-4e58-8930-cba2e3d1d579-6Q-9072e904-da8d-4b00-b454-dac45b7c38f0-7E',
                'utcOffsetSec': -25200,
                'legendFormat': '',
                'datasourceId': 1,
                'intervalMs': interval * 1000,
                # 'maxDataPoints': 1383,
            },
        ],
        'from': f'now-{data_range}m',
        'to': 'now',
    }
    response = requests.post(f'{endpoint}/api/ds/query', headers={'Authorization': f'Bearer {api_key}'}, json=json_data, verify=False).json()
    # Average the second values column of result 'E' (persisted events rate).
    return np.round(np.average(response['results']['E']['frames'][0]['data']['values'][1]), 2)
def get_waiting_for_db(api_key, interval, data_range, endpoint):
    """Query Grafana for the average DB scheduling (wait) time.

    Returns the rounded average of the single returned series.
    """
    json_data = {
        'queries': [
            {
                'datasource': {
                    'type': 'prometheus',
                    'uid': 'AbuT5CJ4z',
                },
                'expr': 'rate(synapse_storage_schedule_time_sum{instance="10.0.0.34:9000",job=~"(federation-receiver|federation-sender|initialsync|synapse|synchrotron)",index=~".*"}[30s])/rate(synapse_storage_schedule_time_count[30s])',
                'format': 'time_series',
                'intervalFactor': 2,
                'refId': 'A',
                'step': 20,
                'interval': '',
                # 'key': 'Q-459af7f4-0427-4832-9353-46086b3f5c27-0',
                'queryType': 'timeSeriesQuery',
                'exemplar': False,
                # 'requestId': 'Q-459af7f4-0427-4832-9353-46086b3f5c27-0A',
                'utcOffsetSec': -25200,
                'legendFormat': '',
                'datasourceId': 1,
                'intervalMs': interval * 1000,
                # 'maxDataPoints': 1383,
            },
        ],
        'from': f'now-{data_range}m',
        'to': 'now',
    }
    response = requests.post(f'{endpoint}/api/ds/query', headers={'Authorization': f'Bearer {api_key}'}, json=json_data, verify=False).json()
    return np.round(np.average(response['results']['A']['frames'][0]['data']['values'][1]), 5)
def get_stateres_worst_case(api_key, interval, data_range, endpoint):
    """
    CPU and DB time spent on most expensive state resolution in a room, summed over all workers.
    This is a very rough proxy for "how fast is state res", but it doesn't accurately represent the system load (e.g. it completely ignores cheap state resolutions).
    """
    # Two queries in a single Grafana request: refId B = DB seconds/sec,
    # refId C = CPU seconds/sec, each summed across workers for the single
    # most expensive room.
    # NOTE(review): unlike the sibling helpers, `interval` is not used here —
    # intervalMs is hard-coded to 15000. Confirm whether that is intentional.
    json_data = {
        'queries': [
            {
                'datasource': {
                    'type': 'prometheus',
                    'uid': 'AbuT5CJ4z',
                },
                'exemplar': False,
                'expr': 'sum(rate(synapse_state_res_db_for_biggest_room_seconds_total{instance="10.0.0.34:9000"}[1m]))',
                'format': 'time_series',
                'hide': False,
                'instant': False,
                'interval': '',
                'refId': 'B',
                'queryType': 'timeSeriesQuery',
                'utcOffsetSec': -25200,
                'legendFormat': '',
                'datasourceId': 1,
                'intervalMs': 15000,
                'maxDataPoints': 1863,
            },
            {
                'datasource': {
                    'type': 'prometheus',
                    'uid': 'AbuT5CJ4z',
                },
                'exemplar': False,
                'expr': 'sum(rate(synapse_state_res_cpu_for_biggest_room_seconds_total{instance="10.0.0.34:9000"}[1m]))',
                'format': 'time_series',
                'hide': False,
                'instant': False,
                'interval': '',
                'refId': 'C',
                'queryType': 'timeSeriesQuery',
                'utcOffsetSec': -25200,
                'legendFormat': '',
                'datasourceId': 1,
                'intervalMs': 15000,
                'maxDataPoints': 1863,
            },
        ],
        # NOTE(review): this hard-coded 'range' block looks like a leftover from
        # a captured browser request; the top-level 'from'/'to' below appear to
        # be the values actually driving the window — confirm against the API.
        'range': {
            'from': '2023-02-23T04:36:12.870Z',
            'to': '2023-02-23T07:36:12.870Z',
            'raw': {
                'from': 'now-3h',
                'to': 'now',
            },
        },
        'from': f'now-{data_range}m',
        'to': 'now',
    }
    response = requests.post(f'{endpoint}/api/ds/query', headers={'Authorization': f'Bearer {api_key}'}, json=json_data, verify=False).json()
    # Average CPU time per block
    # NOTE(review): no return statement is visible here — as written, `response`
    # is discarded and the function returns None. Verify the rest of the body
    # wasn't lost; callers presumably expect a numeric value like the siblings.

View File

@ -0,0 +1,111 @@
#!/usr/bin/env python3
"""Nagios check for Matrix Synapse performance metrics, read via Grafana."""
import argparse
import sys
import time

import numpy as np
import requests

import nagios
from grafana import get_avg_python_gc_time, get_event_send_time, get_outgoing_http_request_rate, get_waiting_for_db

parser = argparse.ArgumentParser(description='Check Matrix Synapse performance metrics via Grafana.')
parser.add_argument('--grafana-server', required=True, help='Grafana server.')
parser.add_argument('--synapse-server', required=True, help='Matrix Synapse server.')
parser.add_argument('--grafana-api-key', required=True)
parser.add_argument('--interval', default=15, type=int, help='Data interval in seconds.')
parser.add_argument('--range', default=2, type=int, help='Data range in minutes. Used for comparison and averaging.')
parser.add_argument('--type', required=True, choices=['gc-time', 'response-time', 'outgoing-http-rate', 'avg-send', 'db-lag'])
parser.add_argument('--warn', type=float, help='Manually set warn level.')
parser.add_argument('--crit', type=float, help='Manually set critical level.')
args = parser.parse_args()

# TODO: add warn support (--warn is parsed but not used yet)

# All threshold fall-backs below compare against None rather than truthiness,
# so an explicit "--crit 0" is honoured instead of being treated as unset.

if args.type == 'gc-time':
    # Threshold in seconds per garbage collection.
    python_gc_time_sum_MAX = args.crit if args.crit is not None else 0.002
    try:
        python_gc_time_sum = np.round(np.average(get_avg_python_gc_time(args.grafana_api_key, args.interval, args.range, args.grafana_server)), 5)
        if python_gc_time_sum >= python_gc_time_sum_MAX:
            print(f'CRITICAL: average GC time per collection is {python_gc_time_sum} sec.')
            sys.exit(nagios.CRITICAL)
        else:
            print(f'OK: average GC time per collection is {python_gc_time_sum} sec.')
            sys.exit(nagios.OK)
    except Exception as e:
        print(f'UNKNOWN: failed to check avg. GC time "{e}"')
        sys.exit(nagios.UNKNOWN)
elif args.type == 'response-time':
    # Threshold in seconds, applied to the average of 10 sequential probes.
    response_time_MAX = args.crit if args.crit is not None else 1
    timeout = 10
    try:
        response_times = []
        for i in range(10):
            start = time.perf_counter()
            try:
                response = requests.post(args.synapse_server, timeout=timeout, verify=False)
            except Exception as e:
                print(f'UNKNOWN: failed to ping endpoint "{e}"')
                sys.exit(nagios.UNKNOWN)
            request_time = time.perf_counter() - start
            response_times.append(np.round(request_time, 2))
            time.sleep(1)
        response_time = np.round(np.average(response_times), 2)
        if response_time > response_time_MAX:
            print(f'CRITICAL: response time is {response_time} sec.')
            sys.exit(nagios.CRITICAL)
        else:
            print(f'OK: response time is {response_time} sec.')
            sys.exit(nagios.OK)
    except Exception as e:
        print(f'UNKNOWN: failed to check response time "{e}"')
        sys.exit(nagios.UNKNOWN)
elif args.type == 'outgoing-http-rate':
    # Threshold in outgoing requests/sec, checked per remote endpoint.
    outgoing_http_request_rate_MAX = args.crit if args.crit is not None else 10
    try:
        outgoing_http_request_rate = get_outgoing_http_request_rate(args.grafana_api_key, args.interval, args.range, args.grafana_server)
        failed = {}
        for k, v in outgoing_http_request_rate.items():
            if v > outgoing_http_request_rate_MAX:
                failed[k] = v
        if len(failed.keys()) > 0:
            print(f'CRITICAL: outgoing HTTP request rate for {failed} req/sec.')
            sys.exit(nagios.CRITICAL)
        print(f'OK: outgoing HTTP request rate is {outgoing_http_request_rate} req/sec.')
        sys.exit(nagios.OK)
    except Exception as e:
        # Restored: this handler was commented out, letting tracebacks escape
        # with a non-Nagios exit code. Now consistent with the other branches.
        print(f'UNKNOWN: failed to check outgoing HTTP request rate "{e}"')
        sys.exit(nagios.UNKNOWN)
elif args.type == 'avg-send':
    # Average event send time threshold, in seconds.
    event_send_time_MAX = args.crit if args.crit is not None else 1
    try:
        event_send_time = get_event_send_time(args.grafana_api_key, args.interval, args.range, args.grafana_server)
        if event_send_time > event_send_time_MAX:
            print(f'CRITICAL: average message send time is {event_send_time} sec.')
            sys.exit(nagios.CRITICAL)
        else:
            print(f'OK: average message send time is {event_send_time} sec.')
            sys.exit(nagios.OK)
    except Exception as e:
        print(f'UNKNOWN: failed to check average message send time "{e}"')
        sys.exit(nagios.UNKNOWN)
elif args.type == 'db-lag':
    # DB scheduling lag threshold, in seconds.
    db_lag_MAX = args.crit if args.crit is not None else 0.01
    try:
        db_lag = get_waiting_for_db(args.grafana_api_key, args.interval, args.range, args.grafana_server)
        if db_lag > db_lag_MAX:
            print(f'CRITICAL: DB lag is {db_lag} sec.')
            sys.exit(nagios.CRITICAL)
        else:
            print(f'OK: DB lag is {db_lag} sec.')
            sys.exit(nagios.OK)
    except Exception as e:
        print(f'UNKNOWN: failed to check DB lag "{e}"')
        sys.exit(nagios.UNKNOWN)
else:
    # Unreachable in practice: argparse `choices` rejects anything else.
    print('Wrong type')
    sys.exit(nagios.UNKNOWN)

View File

@ -0,0 +1,4 @@
# Nagios plugin exit codes used by the checks in this repository.
# NOTE(review): the standard Nagios code for UNKNOWN is 3, not -1; -1 is kept
# here because existing checks import and exit with this value — confirm the
# monitoring system interprets it as intended before changing.
UNKNOWN = -1
OK = 0
WARNING = 1
CRITICAL = 2

View File

@ -0,0 +1,12 @@
def parse_metrics(families):
    """Parse a Prometheus text-format exposition into a nested dict.

    Args:
        families: Raw metrics payload in Prometheus text exposition format.

    Returns:
        dict mapping metric family name -> {sample name -> [Sample, ...]},
        where each list holds one Sample per label set.
    """
    output = {}
    for family in text_string_to_metric_families(families):
        # setdefault instead of unconditional assignment: a repeated family
        # name no longer clobbers previously collected samples.
        samples_by_name = output.setdefault(family.name, {})
        for sample in family.samples:
            samples_by_name.setdefault(sample.name, []).append(sample)
    return output

View File

@ -0,0 +1,9 @@
prometheus_client
requests
numpy
nagiosplugin
matrix-nio
Pillow
python-magic
beautifulsoup4

View File

@ -0,0 +1,110 @@
import sys
import requests
import nagios
def handle_err(func):
    """Decorator for Matrix API helpers that return ``(error, result)`` tuples.

    The wrapped function must return a 2-tuple ``(crit, ret)``. If it raises,
    the plugin prints and exits UNKNOWN; if ``crit`` is truthy, it is printed
    and the plugin exits CRITICAL; otherwise only ``ret`` is returned.
    """
    import functools

    @functools.wraps(func)  # preserve the helper's name/docstring for debugging
    def wrapper(*args, **kwargs):
        try:
            crit, ret = func(*args, **kwargs)
        except Exception as e:
            print(f"UNKNOWN: exception '{e}'")
            sys.exit(nagios.UNKNOWN)
        if crit:
            print(f"CRITICAL: {crit}")
            sys.exit(nagios.CRITICAL)
        return ret

    return wrapper
@handle_err
def login(user_id: str, passwd: str, homeserver: str):
    """Password-login ``user_id`` on ``homeserver``; returns the response JSON."""
    payload = {'type': 'm.login.password', 'user': user_id, 'password': passwd}
    resp = requests.post(f'{homeserver}/_matrix/client/r0/login', json=payload)
    if resp.status_code == 200:
        return None, resp.json()
    return f'Bad status code on login for {user_id}: {resp.status_code}\nBody: {resp.text}', None
@handle_err
def create_room(room_name, homeserver, auth_token):
    """
    Creates an unencrypted room.
    """
    payload = {
        "name": room_name,
        "preset": "private_chat",
        "visibility": "private",
        # "initial_state": [{"type": "m.room.guest_access", "state_key": "", "content": {"guest_access": "can_join"}}]
    }
    resp = requests.post(f'{homeserver}/_matrix/client/r0/createRoom?access_token={auth_token}', json=payload)
    if resp.status_code == 200:
        return None, resp.json()
    return Exception(f'Bad status code on create room for {room_name}: {resp.status_code}\nBody: {resp.text}'), None
@handle_err
def send_invite(room_id, target_user_id, homeserver, auth_token):
    """Invite ``target_user_id`` to ``room_id``; returns the response JSON."""
    url = f'{homeserver}/_matrix/client/r0/rooms/{room_id}/invite?access_token={auth_token}'
    resp = requests.post(url, json={'user_id': target_user_id})
    if resp.status_code == 200:
        return None, resp.json()
    return Exception(f'Bad status code on send invite for {room_id}: {resp.status_code}\nBody: {resp.text}'), None
@handle_err
def join_room(room_id, homeserver, auth_token):
    """Join ``room_id`` via the /join endpoint; returns the response JSON."""
    url = f'{homeserver}/_matrix/client/r0/join/{room_id}?access_token={auth_token}'
    resp = requests.post(url, data='{}')
    if resp.status_code == 200:
        return None, resp.json()
    return Exception(f'Bad status code on join room for {room_id}: {resp.status_code}\nBody: {resp.text}'), None
@handle_err
def join_room_invite(room_id, homeserver, auth_token):
    """Accept an invite by joining via the per-room /join endpoint."""
    url = f'{homeserver}/_matrix/client/r0/rooms/{room_id}/join?access_token={auth_token}'
    resp = requests.post(url, data='{}')
    if resp.status_code == 200:
        return None, resp.json()
    return Exception(f'Bad status code on join room via invite for {room_id}: {resp.status_code}\nBody: {resp.text}'), None
@handle_err
def send_msg(message, room_id, homeserver, auth_token):
    """Send a plain-text ``m.text`` message to ``room_id``."""
    url = f'{homeserver}/_matrix/client/r0/rooms/{room_id}/send/m.room.message?access_token={auth_token}'
    body = {'msgtype': 'm.text', 'body': message}
    resp = requests.post(url, json=body)
    if resp.status_code == 200:
        return None, resp.json()
    return Exception(f'Bad status code on send message for {room_id}: {resp.status_code}\nBody: {resp.text}'), None
# errors will be handled in the other script
def get_event(event_id, room_id, homeserver, auth_token):
    """Fetch one room event; returns the raw Response (no error handling here)."""
    url = f'{homeserver}/_matrix/client/v3/rooms/{room_id}/event/{event_id}?access_token={auth_token}'
    return requests.get(url)
@handle_err
def get_state(homeserver, auth_token, since=None):
    """Call /sync, optionally from a ``since`` batch token; returns the JSON.

    Args:
        since: Optional ``next_batch`` token from a previous sync.
    """
    if since:
        # BUG FIX: the query string was previously "since{since}" (missing '='),
        # so the token was silently ignored and every call did a full sync.
        url = f'{homeserver}/_matrix/client/r0/sync?since={since}&access_token={auth_token}'
    else:
        url = f'{homeserver}/_matrix/client/r0/sync?access_token={auth_token}'
    r = requests.get(url)
    if r.status_code != 200:
        return Exception(f'Bad status code on sync: {r.status_code}\nBody: {r.text}'), None
    return None, r.json()
@handle_err
def forget_room(room_id, homeserver, auth_token):
    """Forget ``room_id`` (removes it from the account's room list)."""
    url = f'{homeserver}/_matrix/client/r0/rooms/{room_id}/forget?access_token={auth_token}'
    resp = requests.post(url, data='{}')
    if resp.status_code == 200:
        return None, resp.json()
    return Exception(f'Bad status code on leave room for {room_id}: {resp.status_code}\nBody: {resp.text}'), None
@handle_err
def leave_room(room_id, homeserver, auth_token, forget=False):
    """Leave ``room_id``, optionally forgetting it afterwards.

    Args:
        forget: When True, also call /forget so the room is removed from the
            account's room list entirely.
    """
    r = requests.post(f'{homeserver}/_matrix/client/r0/rooms/{room_id}/leave?access_token={auth_token}', data='{}')
    if r.status_code != 200:
        return Exception(f'Bad status code on leave room for {room_id}: {r.status_code}\nBody: {r.text}'), None
    if forget:
        # forget_room handles its own errors (exits via handle_err on failure);
        # its return value isn't needed, so don't bind it to an unused name.
        forget_room(room_id, homeserver, auth_token)
    return None, r.json()

0
Checks/__init__.py Normal file
View File

View File

@ -0,0 +1,4 @@
# Example invocation of the monitor-bot federation check.
# --ignore takes a space-separated list of remote homeserver domains to skip.
python3 Matrix\ Synapse/check_monitor_bot.py \
    --metrics-endpoint "https://matrix.your-hs.com/matrix-monitor-bot/" \
    --domain your-hs.com \
    --ignore canarymod.net catgirl.cloud

211
Checks/check_redis.py Normal file
View File

@ -0,0 +1,211 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# pip install redis
import redis
import sys
import argparse
# Nagios plugin exit codes.
EXIT_OK = 0
EXIT_WARNING = 1
EXIT_CRITICAL = 2
# NOTE(review): "UNKNONW" is a typo of "UNKNOWN", but the name is referenced
# throughout this module — renaming would touch every use site.
EXIT_UNKNONW = 3
# Same numeric value as UNKNOWN; kept as a distinct name for readability.
EXIT_INVALID_AUTH = 3
class MonitoringPluginRedis(object):
    """Nagios/Icinga monitoring plugin for a redis-server instance.

    Connects to Redis on construction (exiting CRITICAL on failure).
    ``check()`` prints a single status line — plus perfdata when ``--key`` is
    used — and exits with the matching Nagios exit code.
    """

    def __init__(self):
        """Parse CLI arguments and open the Redis connection."""
        cli_args = self.parse_args()
        self.host = cli_args.host
        self.port = cli_args.port
        self.password = cli_args.password
        self.dbname = cli_args.dbname
        self.timeout = cli_args.timeout
        self.key = cli_args.key_value
        self.warning = cli_args.warning
        self.critical = cli_args.critical
        try:
            self.conn = redis.Redis(
                host=self.host,
                port=self.port,
                password=self.password,
                socket_timeout=self.timeout
            )
            self.info_out = self.conn.info()
            self.conn.ping()
        except Exception as e:
            # Any connection/auth failure is CRITICAL for the monitoring system.
            print(f"CRITICAL REDIS : {e}")
            sys.exit(2)

    def parse_args(self):
        """Build and parse the plugin's command-line arguments."""
        parser = argparse.ArgumentParser(
            description="monitoring plugin for redis-server, version: 1.0"
        )
        parser.add_argument(
            "-H", "--host",
            dest="host",
            help="Redis server to connect to. (default is 127.0.0.1)",
            default="127.0.0.1"
        )
        parser.add_argument(
            "-p", "--port",
            dest="port",
            help="Redis port to connect to. (default is 6379)",
            type=int,
            default=6379
        )
        parser.add_argument(
            "-P", "--password",
            dest="password",
            help="Redis password to connect to.",
            default=''
        )
        parser.add_argument(
            "-d", "--dbname",
            dest="dbname",
            help="Redis database name (default is db0)",
            default='db0'
        )
        parser.add_argument(
            "-t", "--timeout",
            dest="timeout",
            help="Number of seconds to wait before timing out and considering redis down",
            type=int,
            default=2
        )
        parser.add_argument(
            "-w", "--warning",
            dest="warning",
            type=int,
            help="Warning threshold."
        )
        parser.add_argument(
            "-c", "--critical",
            dest="critical",
            type=int,
            help="Critical threshold."
        )
        parser.add_argument(
            "-k", "--key",
            dest="key_value",
            help="Stat to monitor (memory_mb, hit_ratio, or custom)",
            default=None
        )
        return parser.parse_args()

    def get_version(self):
        """Return the server version as reported by INFO."""
        return f"version: {self.info_out.get('redis_version')}"

    def get_client_connection(self):
        """Return the number of connected clients."""
        return f"connected_clients: {self.info_out.get('connected_clients')}"

    def get_number_keys(self):
        """Return the key count for the configured database."""
        return f"{self.dbname}: {self.info_out.get(self.dbname).get('keys')}"

    def get_uptime(self):
        """Return server uptime in days."""
        return f"uptime_in_days: {self.info_out.get('uptime_in_days')}"

    def get_used_memory(self):
        """Return human-readable memory usage."""
        return f"used_memory_human: {self.info_out.get('used_memory_human')}"

    def check(self):
        """Evaluate the selected stat (or report server info) and exit."""
        number_keys = ''
        version = self.get_version()
        client_connected = self.get_client_connection()
        reverse_check = False
        exit_string = "OK"
        if self.dbname in str(self.info_out):
            number_keys = self.get_number_keys()
        memory = self.get_used_memory()
        uptime = self.get_uptime()
        if self.key:
            # Threshold checks make no sense without both levels set.
            if not self.warning or not self.critical:
                if not self.warning:
                    status = "UNKNOWN: Warning level required"
                if not self.critical:
                    # BUG FIX: message previously read "UNKNONW".
                    status = "UNKNOWN: Critical level required"
                print(status)
                sys.exit(EXIT_UNKNONW)
            if self.key == "memory_mb":
                # Lower-is-worse metric: reverse the comparison direction below.
                reverse_check = True
                info_value = int(
                    self.info_out.get("used_memory_rss") or self.info_out.get("used_memory")
                ) / (1024 * 1024)
            elif self.key == "hit_ratio":
                reverse_check = False
                hit = int(self.info_out.get("keyspace_hits"))
                miss = int(self.info_out.get("keyspace_misses"))
                # BUG FIX: previously required miss > 0 too, so a perfect
                # 100% hit rate (miss == 0) was reported as 0.
                if hit + miss > 0:
                    info_value = 100 * hit / (hit + miss)
                else:
                    info_value = 0
            else:
                # Any other key is read verbatim from the INFO output.
                info_value = int(self.info_out.get(self.key))
            if reverse_check:
                if int(info_value) < int(self.critical):
                    exit_string = "CRITICAL"
                elif int(info_value) < int(self.warning):
                    exit_string = "WARNING"
            else:
                if int(info_value) > int(self.critical):
                    exit_string = "CRITICAL"
                elif int(info_value) > int(self.warning):
                    exit_string = "WARNING"
            status = f"{exit_string}: Redis {self.key} is {info_value}"
            perfdata = f"{self.key}={info_value};{self.warning};{self.critical};0;{info_value}"
            print(f"{status} || {perfdata}")
        else:
            if number_keys == '':
                status = f"OK REDIS No keys, {version}, {memory}, {uptime}"
            else:
                status = f"OK REDIS {version}, {client_connected}, {number_keys}, {memory}, {uptime}"
            print(status)
        # The UNKNOWN case exits above, so only OK/WARNING/CRITICAL remain.
        # (The old comparison against the misspelled "UNKNONW" string was
        # unreachable and has been removed.)
        if exit_string == "OK":
            sys.exit(EXIT_OK)
        if exit_string == "WARNING":
            sys.exit(EXIT_WARNING)
        sys.exit(EXIT_CRITICAL)
if __name__ == "__main__":
"""
"""
server = MonitoringPluginRedis()
server.check()

View File

@ -0,0 +1,9 @@
# Example invocation of the two-bot federation check: bot1 on the local
# homeserver, bot2 on a remote one; auth files cache login tokens between runs.
python3 Matrix\ Synapse/check_federation.py \
    --bot1-user '@bot1:your-hs.com' \
    --bot1-pw password1234 \
    --bot1-hs https://matrix.your-hs.com \
    --bot1-auth-file /opt/custom-nagios-checks/auth-fed-test-bot1.json \
    --bot2-user '@bot2:matrix.org' \
    --bot2-pw password1234 \
    --bot2-hs https://matrix-federation.matrix.org \
    --bot2-auth-file /opt/custom-nagios-checks/auth-fed-test-bot2.json

8
Checks/test-media-cdn.sh Normal file
View File

@ -0,0 +1,8 @@
# Example invocation of the media-CDN check: uploads via the homeserver and
# verifies the media is reachable through the CDN domain.
python3 Matrix\ Synapse/check_media_cdn.py \
    --user '@bot1:your-hs.com' \
    --pw password1234 \
    --hs https://matrix.your-hs.com \
    --room '!banana:your-hs.com' \
    --auth-file ./auth-cdn.json \
    --media-cdn-domain matrix-media-cdn.your-hs.com \
    --admin-endpoint https://172.0.2.118

85
Other/icinga-to-kuma.py Normal file
View File

@ -0,0 +1,85 @@
import json
from pathlib import Path

from flask import Flask, Response, request
from icinga2api.client import Client

# NOTE(review): credentials are hard-coded; move them to environment variables
# or a config file before deploying this anywhere shared.
client = Client('https://localhost:8080', 'icingaweb2', 'password1234')

# Icinga/Nagios state codes as they appear in host/service "state" attributes.
OK = 0
WARNING = 1
CRITICAL = 2
UNKNOWN = 3

app = Flask(__name__)
def return_json(json_dict, start_response, status_code=200):
    """Serialize *json_dict* and emit it through a WSGI-style start_response.

    Returns an iterator yielding the UTF-8-encoded JSON body.
    """
    start_response(str(status_code), [('Content-Type', 'application/json')])
    body = json.dumps(json_dict).encode('utf-8')
    return iter([body])
@app.route('/host')
@app.route('/host/')
@app.route("/host/<hostid>")
def get_host_state(hostid=None):
path = Path(request.base_url)
args_service = request.args.getlist('service')
kuma_mode = True if request.args.get('kuma') == 'true' else False
if not hostid:
return Response(json.dumps({'error': 'must specify host'}), status=406, mimetype='application/json')
result = {
'host': {},
'services': {},
'failed_services': []
}
host_status = client.objects.list('Host', filters='match(hpattern, host.name)', filter_vars={'hpattern': hostid})
if not len(host_status):
return Response(json.dumps({'error': 'could not find host'}), status=404, mimetype='application/json')
else:
host_status = host_status[0]
result['host'] = {
'name': host_status['name'],
'state': 0 if (host_status['attrs']['acknowledgement'] or host_status['attrs']['acknowledgement_expiry']) else host_status['attrs']['state'],
'actual_state': host_status['attrs']['state'],
'attrs': {
**host_status['attrs']
}
}
services_status = client.objects.list('Service', filters='match(hpattern, host.name)', filter_vars={'hpattern': hostid})
for attrs in services_status:
name = attrs['name'].split('!')[1]
result['services'][name] = {
'state': 0 if (attrs['attrs']['acknowledgement'] or attrs['attrs']['acknowledgement_expiry']) else attrs['attrs']['state'],
'actual_state': attrs['attrs']['state'],
'attrs': {
**attrs
}
}
if len(args_service):
services = {}
for service in args_service:
if service in result['services'].keys():
services[service] = result['services'][service]
else:
return Response(json.dumps({'error': 'service not found', 'service': service}), status=400, mimetype='application/json')
result['services'] = services
if kuma_mode:
for name, service in result['services'].items():
if service['state'] != OK:
result['failed_services'].append({'name': name, 'state': service['state']})
if result['host']['state'] != OK:
result['failed_services'].append({'name': hostid, 'state': result['host']['state']})
if len(result['failed_services']):
return Response(json.dumps(result), status=410, mimetype='application/json')
else:
return result