2024-02-18 12:55:45 -07:00
#!/usr/bin/env python
import argparse
import sys
import traceback
import mysql . connector
2024-04-05 23:51:42 -06:00
from checker import nagios
from checker . result import quit_check
2024-02-18 12:55:45 -07:00
from checker . types import try_int
"""
2024-02-18 14:54:33 -07:00
## MySQL User Setup ##
Attention: The DB user you specify must have REPLICATION CLIENT rights on the DB server. Example:
2024-03-28 20:50:10 -06:00
GRANT REPLICATION CLIENT , SLAVE MONITOR on * . * TO ' nagios ' @ ' monitoringhost ' IDENTIFIED BY ' secret ' ;
2024-02-18 14:54:33 -07:00
If you use MariaDB 10.5 or newer , the DB user must have REPLICA MONITOR rights :
GRANT REPLICA MONITOR ON * . * TO ' nagios ' @ ' monitoringhost ' IDENTIFIED BY ' secret ' ;
2024-02-18 12:55:45 -07:00
## Usage example ##
Target delay = 300 seconds
Warning deviation = 10 %
Critical deviation = 15 %
This sets the warning thresholds to 270 and 330 seconds: if the delay is at or below 270 seconds, or at or above 330 seconds, the check returns WARNING.
2024-03-28 20:50:10 -06:00
The critical levels will be 345 and 255 seconds .
2024-02-18 12:55:45 -07:00
"""
def main(args):
    """Check the replication health of a MySQL/MariaDB replica, Nagios-style.

    Connects to ``args.host``, runs ``SHOW SLAVE STATUS`` and evaluates the IO
    thread state, the SQL thread state and the last IO/SQL errors. Only when
    all of those are healthy is ``Seconds_Behind_Master`` evaluated, optionally
    against a target delay with percentage deviations.

    Args:
        args: Parsed command-line namespace. Reads ``username``, ``password``
            and ``host`` for the connection, ``target_delay`` (seconds; a
            falsy value skips the threshold comparison), and
            ``warning_deviation`` / ``critical_deviation`` (percentages of
            ``target_delay``).

    The outcome is reported through ``quit_check`` or ``print`` followed by
    ``sys.exit``; the function does not return a value explicitly. The cursor
    and connection are closed in the ``finally`` clause.
    """
    cursor = cnx = None
    try:
        cnx = mysql.connector.connect(user=args.username, password=args.password, host=args.host)
        cursor = cnx.cursor(dictionary=True)
        cursor.execute("SHOW SLAVE STATUS")
        slave_status = cursor.fetchone()

        # No row at all: nothing to evaluate.
        if slave_status is None:
            print("UNKNOWN - Could not retrieve slave status")
            sys.exit(nagios.STATE_UNKNOWN)

        slave_io_running = slave_status['Slave_IO_Running']
        if not slave_io_running:
            print("UNKNOWN - Could not retrieve Slave_IO_Running")
            sys.exit(nagios.STATE_UNKNOWN)

        slave_sql_running = slave_status['Slave_SQL_Running']
        if not slave_sql_running:
            print("UNKNOWN - Could not retrieve Slave_SQL_Running")
            sys.exit(nagios.STATE_UNKNOWN)

        last_io_error = slave_status['Last_IO_Error']
        last_sql_error = slave_status['Last_SQL_Error']

        # -1 marks "delay not evaluated"; perfdata stays empty in that case.
        replication_delay = -1
        perfdata = {}

        exit_code = nagios.STATE_OK
        exit_msg = []

        if slave_io_running != 'Yes':
            exit_code = nagios.STATE_CRIT
            exit_msg.append('Slave IO is not running!')

        if last_io_error:
            exit_code = nagios.STATE_CRIT
            # Strip surrounding dots so the message ends with exactly one.
            exit_msg.append(f'Last IO Error: {last_io_error.strip(".")}.')

        if slave_sql_running != 'Yes':
            exit_code = nagios.STATE_CRIT
            exit_msg.append('Slave SQL is not running!')

        if last_sql_error:
            exit_code = nagios.STATE_CRIT
            exit_msg.append(f'Last SQL Error: {last_sql_error.strip(".")}.')

        if exit_code == nagios.STATE_OK:
            # Only evaluate the replication delay if everything else is healthy.
            # NOTE(review): this test is falsy for 0 as well as None, so a
            # fully caught-up replica is also reported here - confirm intended.
            if not slave_status['Seconds_Behind_Master']:
                quit_check('Slave is not configured for delay', nagios.STATE_CRIT)

            replication_delay = try_int(slave_status['Seconds_Behind_Master'])

            if args.target_delay:
                warn_fraction = args.warning_deviation / 100
                crit_fraction = args.critical_deviation / 100
                warn_deviation_max = args.target_delay * (1 + warn_fraction)
                warn_deviation_min = args.target_delay * (1 - warn_fraction)
                crit_deviation_max = args.target_delay * (1 + crit_fraction)
                crit_deviation_min = args.target_delay * (1 - crit_fraction)

                # Critical bounds are tested first so they win over warnings.
                # A delay at/below the lower bound is shorter than the target;
                # a delay at/above the upper bound means the replica lags too far.
                if replication_delay <= crit_deviation_min:
                    exit_code = nagios.STATE_CRIT
                    exit_msg.append('Replication delay too short!')
                elif replication_delay >= crit_deviation_max:
                    exit_code = nagios.STATE_CRIT
                    exit_msg.append('Replication is delayed!')
                elif replication_delay <= warn_deviation_min:
                    exit_code = nagios.STATE_WARN
                    exit_msg.append('Replication delay too short!')
                elif replication_delay >= warn_deviation_max:
                    exit_code = nagios.STATE_WARN
                    exit_msg.append('Replication is delayed!')

        if exit_code == nagios.STATE_OK:
            exit_msg.append('Slave is healthy! Slave SQL: running. Slave IO: running.')

        if replication_delay > -1:
            exit_msg.append(f'Slave is {replication_delay} seconds behind master.')
            perfdata = {
                "replication_delay": {
                    "value": replication_delay,
                    "min": 0,
                    "unit": "s",
                },
            }

        quit_check(' '.join(exit_msg), exit_code, perfdata)
    except mysql.connector.Error as e:
        quit_check(f'Could not connect to database: {e}', nagios.STATE_CRIT)
    except Exception as e:
        # SystemExit raised by sys.exit() is not an Exception subclass, so the
        # deliberate exits above are not caught here.
        quit_check(f'Unknown error: {e}\n{traceback.format_exc()}', nagios.STATE_UNKNOWN)
    finally:
        if cursor:
            cursor.close()
        if cnx:
            cnx.close()
if __name__ == "__main__":
    # Command-line interface: connection settings plus optional delay thresholds.
    cli = argparse.ArgumentParser(description='Check a MySQL slave.')

    # Connection settings are all mandatory.
    for flag, description in (
        ('--host', 'The IP of the slave to connect to.'),
        ('--username', 'Username.'),
        ('--password', 'Password.'),
    ):
        cli.add_argument(flag, required=True, help=description)

    # Delay thresholds are optional integers; without a target delay the
    # deviation percentages are not applied by main().
    for flag, fallback, description in (
        ('--target-delay', None, 'The target delay in seconds.'),
        ('--warning-deviation', 10, 'If the delay deviates more than this percentage from the target delay, return warning.'),
        ('--critical-deviation', 15, 'If the delay deviates more than this percentage from the target delay, return critical.'),
    ):
        cli.add_argument(flag, default=fallback, type=int, help=description)

    args = cli.parse_args()

    # Last-resort guard: anything escaping main() is reported as UNKNOWN
    # together with its traceback.
    try:
        main(args)
    except Exception as err:
        print(f"UNKNOWN - {err}")
        traceback.print_exc()
        sys.exit(nagios.STATE_UNKNOWN)