check_mysql_slave: make it work when the slave is killed

This commit is contained in:
Cyberes 2024-03-03 22:49:16 -07:00
parent 404d33fc81
commit f76ec953b9
1 changed file with 58 additions and 34 deletions

View File

@ -35,18 +35,29 @@ def main(args):
cnx = mysql.connector.connect(user=args.username, password=args.password, host=args.host) cnx = mysql.connector.connect(user=args.username, password=args.password, host=args.host)
cursor = cnx.cursor(dictionary=True) cursor = cnx.cursor(dictionary=True)
cursor.execute("SHOW SLAVE STATUS") cursor.execute("SHOW SLAVE STATUS")
slave_status = cursor.fetchone()
slave_status = cursor.fetchone()
if slave_status is None: if slave_status is None:
print("UNKNOWN - Could not retrieve slave status") print("UNKNOWN - Could not retrieve slave status")
sys.exit(3) sys.exit(nagios.STATE_UNKNOWN)
slave_io_running = slave_status['Slave_IO_Running'] slave_io_running = slave_status['Slave_IO_Running']
slave_sql_running = slave_status['Slave_SQL_Running'] if not slave_io_running:
replication_delay = try_int(slave_status['Seconds_Behind_Master']) print("UNKNOWN - Could not retrieve Slave_IO_Running")
last_io_error = slave_status['Last_IO_Error'] sys.exit(nagios.STATE_UNKNOWN)
last_sql_error = slave_status['Last_SQL_Error']
slave_sql_running = slave_status['Slave_SQL_Running']
if not slave_sql_running:
print("UNKNOWN - Could not retrieve Slave_SQL_Running")
sys.exit(nagios.STATE_UNKNOWN)
last_io_error = slave_status['Last_IO_Error']
if not last_io_error:
print("UNKNOWN - Could not retrieve Last_IO_Error")
sys.exit(nagios.STATE_UNKNOWN)
replication_delay = -1
perfdata = {}
exit_code = nagios.STATE_OK exit_code = nagios.STATE_OK
exit_msg = [] exit_msg = []
@ -56,13 +67,26 @@ def main(args):
if slave_io_running != 'Yes': if slave_io_running != 'Yes':
exit_code = nagios.STATE_CRIT exit_code = nagios.STATE_CRIT
exit_msg.append('Slave IO is not running!') exit_msg.append('Slave IO is not running!')
if last_sql_error:
exit_code = nagios.STATE_CRIT
exit_msg.append(f'Last SQL Error: {last_sql_error.strip(".")}.')
if last_io_error: if last_io_error:
exit_code = nagios.STATE_CRIT exit_code = nagios.STATE_CRIT
exit_msg.append(f'Last IO Error: {last_io_error.strip(".")}.') exit_msg.append(f'Last IO Error: {last_io_error.strip(".")}.')
if exit_code == nagios.STATE_OK:
# Only check these things if everything else is healthy.
last_sql_error = slave_status['Last_SQL_Error']
if not last_sql_error:
print("UNKNOWN - Could not retrieve Last_SQL_Error")
sys.exit(nagios.STATE_UNKNOWN)
if last_sql_error:
exit_code = nagios.STATE_CRIT
exit_msg.append(f'Last SQL Error: {last_sql_error.strip(".")}.')
if not slave_status['Seconds_Behind_Master']:
print("UNKNOWN - Could not retrieve Seconds_Behind_Master")
sys.exit(nagios.STATE_UNKNOWN)
replication_delay = try_int(slave_status['Seconds_Behind_Master'])
if args.target_delay: if args.target_delay:
warn_deviation_max = args.target_delay * (1 + (args.warning_deviation / 100)) warn_deviation_max = args.target_delay * (1 + (args.warning_deviation / 100))
warn_deviation_min = args.target_delay * (1 - (args.warning_deviation / 100)) warn_deviation_min = args.target_delay * (1 - (args.warning_deviation / 100))
@ -84,8 +108,8 @@ def main(args):
if exit_code == nagios.STATE_OK: if exit_code == nagios.STATE_OK:
exit_msg.append('Slave is healthy! Slave SQL: running. Slave IO: running.') exit_msg.append('Slave is healthy! Slave SQL: running. Slave IO: running.')
if replication_delay > -1:
exit_msg.append(f'Slave is {replication_delay} seconds behind master.') exit_msg.append(f'Slave is {replication_delay} seconds behind master.')
perfdata = { perfdata = {
"replication_delay": { "replication_delay": {
"value": replication_delay, "value": replication_delay,