143 lines
5.6 KiB
Python
Executable File
143 lines
5.6 KiB
Python
Executable File
#!/usr/bin/env python
|
|
import argparse
|
|
import sys
|
|
import traceback
|
|
|
|
import mysql.connector
|
|
|
|
from checker import nagios
|
|
from checker.result import quit_check
|
|
from checker.types import try_int
|
|
|
|
"""
|
|
## MySQL User Setup ##
|
|
|
|
Attention: The DB user you specify must have the REPLICATION CLIENT privilege on the DB server. Example:
|
|
GRANT REPLICATION CLIENT,SLAVE MONITOR on *.* TO 'nagios'@'monitoringhost' IDENTIFIED BY 'secret';
|
|
|
|
If you use MariaDB 10.5 or newer, the DB user must have REPLICA MONITOR rights:
|
|
GRANT REPLICA MONITOR ON *.* TO 'nagios'@'monitoringhost' IDENTIFIED BY 'secret';
|
|
|
|
|
|
## Usage example ##
|
|
|
|
Target delay = 300 seconds
|
|
Warning deviation = 10%
|
|
Critical deviation = 15%
|
|
|
|
This sets the warning thresholds to 270 and 330 seconds: a delay at or below 270 seconds, or at or above 330 seconds, returns WARNING.
|
|
The critical levels will be 345 and 255 seconds.
|
|
"""
|
|
|
|
|
|
def main(args):
    """Check replication health of a MySQL/MariaDB replica and report it.

    Connects with ``args.username``/``args.password`` to ``args.host``, runs
    ``SHOW SLAVE STATUS`` and evaluates the first returned row:

    * IO/SQL thread not reporting ``'Yes'`` or a non-empty last IO/SQL error
      -> CRITICAL.
    * If everything above is healthy and ``args.target_delay`` is set, the
      replication delay is compared against the target delay widened by
      ``args.warning_deviation`` / ``args.critical_deviation`` percent
      (bounds are inclusive) -> WARNING / CRITICAL.

    The result is handed to ``quit_check`` (message, state, perfdata), which
    presumably prints it and terminates the process -- confirm in
    ``checker.result``. The cursor and connection are closed in ``finally``.

    Args:
        args: Parsed command-line namespace providing ``host``, ``username``,
            ``password``, ``target_delay``, ``warning_deviation`` and
            ``critical_deviation``.
    """
    cursor = cnx = None
    try:
        cnx = mysql.connector.connect(user=args.username, password=args.password, host=args.host)
        # dictionary=True makes fetchone() return the row keyed by column name.
        cursor = cnx.cursor(dictionary=True)
        cursor.execute("SHOW SLAVE STATUS")

        slave_status = cursor.fetchone()
        if slave_status is None:
            # No row at all: the statement returned nothing to evaluate.
            print("UNKNOWN - Could not retrieve slave status")
            sys.exit(nagios.STATE_UNKNOWN)

        # .get() so that a missing column ends in the dedicated UNKNOWN
        # message below instead of a KeyError caught by the generic handler.
        slave_io_running = slave_status.get('Slave_IO_Running')
        if not slave_io_running:
            print("UNKNOWN - Could not retrieve Slave_IO_Running")
            sys.exit(nagios.STATE_UNKNOWN)

        slave_sql_running = slave_status.get('Slave_SQL_Running')
        if not slave_sql_running:
            print("UNKNOWN - Could not retrieve Slave_SQL_Running")
            sys.exit(nagios.STATE_UNKNOWN)

        last_io_error = slave_status['Last_IO_Error']
        last_sql_error = slave_status['Last_SQL_Error']

        # -1 marks "delay not evaluated"; it suppresses the delay message and
        # perfdata further down.
        replication_delay = -1
        perfdata = {}
        exit_code = nagios.STATE_OK
        exit_msg = []

        if slave_io_running != 'Yes':
            exit_code = nagios.STATE_CRIT
            exit_msg.append('Slave IO is not running!')
        if last_io_error:
            exit_code = nagios.STATE_CRIT
            exit_msg.append(f'Last IO Error: {last_io_error.strip(".")}.')
        if slave_sql_running != 'Yes':
            exit_code = nagios.STATE_CRIT
            exit_msg.append('Slave SQL is not running!')
        if last_sql_error:
            exit_code = nagios.STATE_CRIT
            exit_msg.append(f'Last SQL Error: {last_sql_error.strip(".")}.')

        if exit_code == nagios.STATE_OK:
            # Only look at the replication delay if everything else is healthy.
            # NOTE(review): a value of 0 is falsy, so a replica that is fully
            # caught up is reported as CRITICAL here just like a NULL value --
            # confirm this is intended when no target delay is configured.
            if not slave_status['Seconds_Behind_Master']:
                quit_check('Slave is not configured for delay', nagios.STATE_CRIT)

            replication_delay = try_int(slave_status['Seconds_Behind_Master'])

            if args.target_delay:
                warn_deviation_max = args.target_delay * (1 + (args.warning_deviation / 100))
                warn_deviation_min = args.target_delay * (1 - (args.warning_deviation / 100))
                crit_deviation_max = args.target_delay * (1 + (args.critical_deviation / 100))
                crit_deviation_min = args.target_delay * (1 - (args.critical_deviation / 100))
                # Critical bounds are checked first so they win over the
                # narrower warning band. A delay at/below the lower bound is
                # "too short"; a delay at/above the upper bound means the
                # replica lags further behind than intended.
                if replication_delay <= crit_deviation_min:
                    exit_code = nagios.STATE_CRIT
                    exit_msg.append('Replication delay too short!')
                elif replication_delay >= crit_deviation_max:
                    exit_code = nagios.STATE_CRIT
                    exit_msg.append('Replication is delayed!')
                elif replication_delay <= warn_deviation_min:
                    exit_code = nagios.STATE_WARN
                    exit_msg.append('Replication delay too short!')
                elif replication_delay >= warn_deviation_max:
                    exit_code = nagios.STATE_WARN
                    exit_msg.append('Replication is delayed!')

        if exit_code == nagios.STATE_OK:
            exit_msg.append('Slave is healthy! Slave SQL: running. Slave IO: running.')

        if replication_delay > -1:
            exit_msg.append(f'Slave is {replication_delay} seconds behind master.')
            perfdata = {
                "replication_delay": {
                    "value": replication_delay,
                    "min": 0,
                    "unit": "s",
                },
            }

        quit_check(' '.join(exit_msg), exit_code, perfdata)

    except mysql.connector.Error as e:
        # Raised by connect() as well as by the query itself.
        quit_check(f'Could not connect to database: {e}', nagios.STATE_CRIT)
    except Exception as e:
        # SystemExit is not an Exception subclass, so the sys.exit() calls
        # above pass through this handler untouched.
        quit_check(f'Unknown error: {e}\n{traceback.format_exc()}', nagios.STATE_UNKNOWN)
    finally:
        if cursor:
            cursor.close()
        if cnx:
            cnx.close()
|
|
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: parse the options, run the check, and map any
    # exception escaping main() to the UNKNOWN state.
    cli = argparse.ArgumentParser(description='Check a MySQL slave.')

    # Mandatory connection settings, registered in the same order as before.
    for flag, description in (
        ('--host', 'The IP of the slave to connect to.'),
        ('--username', 'Username.'),
        ('--password', 'Password.'),
    ):
        cli.add_argument(flag, required=True, help=description)

    # Optional delay thresholds; the deviations are percentages of the target.
    cli.add_argument(
        '--target-delay',
        type=int,
        default=None,
        help='The target delay in seconds.',
    )
    cli.add_argument(
        '--warning-deviation',
        type=int,
        default=10,
        help='If the delay deviates more than this percentage from the target delay, return warning.',
    )
    cli.add_argument(
        '--critical-deviation',
        type=int,
        default=15,
        help='If the delay deviates more than this percentage from the target delay, return critical.',
    )
    args = cli.parse_args()

    try:
        main(args)
    except Exception as err:
        print(f"UNKNOWN - {err}")
        traceback.print_exc()
        sys.exit(nagios.STATE_UNKNOWN)
|