check_mysql_slave: make it work when the slave is killed

This commit is contained in:
Cyberes 2024-03-03 22:49:16 -07:00
parent 404d33fc81
commit f76ec953b9
1 changed file with 58 additions and 34 deletions

View File

@ -35,18 +35,29 @@ def main(args):
cnx = mysql.connector.connect(user=args.username, password=args.password, host=args.host)
cursor = cnx.cursor(dictionary=True)
cursor.execute("SHOW SLAVE STATUS")
slave_status = cursor.fetchone()
slave_status = cursor.fetchone()
if slave_status is None:
print("UNKNOWN - Could not retrieve slave status")
sys.exit(3)
sys.exit(nagios.STATE_UNKNOWN)
slave_io_running = slave_status['Slave_IO_Running']
slave_sql_running = slave_status['Slave_SQL_Running']
replication_delay = try_int(slave_status['Seconds_Behind_Master'])
last_io_error = slave_status['Last_IO_Error']
last_sql_error = slave_status['Last_SQL_Error']
if not slave_io_running:
print("UNKNOWN - Could not retrieve Slave_IO_Running")
sys.exit(nagios.STATE_UNKNOWN)
slave_sql_running = slave_status['Slave_SQL_Running']
if not slave_sql_running:
print("UNKNOWN - Could not retrieve Slave_SQL_Running")
sys.exit(nagios.STATE_UNKNOWN)
last_io_error = slave_status['Last_IO_Error']
if not last_io_error:
print("UNKNOWN - Could not retrieve Last_IO_Error")
sys.exit(nagios.STATE_UNKNOWN)
replication_delay = -1
perfdata = {}
exit_code = nagios.STATE_OK
exit_msg = []
@ -56,43 +67,56 @@ def main(args):
if slave_io_running != 'Yes':
exit_code = nagios.STATE_CRIT
exit_msg.append('Slave IO is not running!')
if last_sql_error:
exit_code = nagios.STATE_CRIT
exit_msg.append(f'Last SQL Error: {last_sql_error.strip(".")}.')
if last_io_error:
exit_code = nagios.STATE_CRIT
exit_msg.append(f'Last IO Error: {last_io_error.strip(".")}.')
if args.target_delay:
warn_deviation_max = args.target_delay * (1 + (args.warning_deviation / 100))
warn_deviation_min = args.target_delay * (1 - (args.warning_deviation / 100))
crit_deviation_max = args.target_delay * (1 + (args.critical_deviation / 100))
crit_deviation_min = args.target_delay * (1 - (args.critical_deviation / 100))
if replication_delay <= crit_deviation_min:
if exit_code == nagios.STATE_OK:
# Only check these things if everything else is healthy.
last_sql_error = slave_status['Last_SQL_Error']
if not last_sql_error:
print("UNKNOWN - Could not retrieve Last_SQL_Error")
sys.exit(nagios.STATE_UNKNOWN)
if last_sql_error:
exit_code = nagios.STATE_CRIT
exit_msg.append('Replication is delayed!')
if replication_delay >= crit_deviation_max:
exit_code = nagios.STATE_CRIT
exit_msg.append('Replication is ahead???')
if replication_delay <= warn_deviation_min:
exit_code = nagios.STATE_WARN
exit_msg.append('Replication is delayed!')
if replication_delay >= warn_deviation_max:
exit_code = nagios.STATE_WARN
exit_msg.append('Replication is ahead???')
exit_msg.append(f'Last SQL Error: {last_sql_error.strip(".")}.')
if not slave_status['Seconds_Behind_Master']:
print("UNKNOWN - Could not retrieve Seconds_Behind_Master")
sys.exit(nagios.STATE_UNKNOWN)
replication_delay = try_int(slave_status['Seconds_Behind_Master'])
if args.target_delay:
warn_deviation_max = args.target_delay * (1 + (args.warning_deviation / 100))
warn_deviation_min = args.target_delay * (1 - (args.warning_deviation / 100))
crit_deviation_max = args.target_delay * (1 + (args.critical_deviation / 100))
crit_deviation_min = args.target_delay * (1 - (args.critical_deviation / 100))
if replication_delay <= crit_deviation_min:
exit_code = nagios.STATE_CRIT
exit_msg.append('Replication is delayed!')
if replication_delay >= crit_deviation_max:
exit_code = nagios.STATE_CRIT
exit_msg.append('Replication is ahead???')
if replication_delay <= warn_deviation_min:
exit_code = nagios.STATE_WARN
exit_msg.append('Replication is delayed!')
if replication_delay >= warn_deviation_max:
exit_code = nagios.STATE_WARN
exit_msg.append('Replication is ahead???')
if exit_code == nagios.STATE_OK:
exit_msg.append('Slave is healthy! Slave SQL: running. Slave IO: running.')
exit_msg.append(f'Slave is {replication_delay} seconds behind master.')
perfdata = {
"replication_delay": {
"value": replication_delay,
"min": 0,
"unit": "s",
},
}
if replication_delay > -1:
exit_msg.append(f'Slave is {replication_delay} seconds behind master.')
perfdata = {
"replication_delay": {
"value": replication_delay,
"min": 0,
"unit": "s",
},
}
text_result = ' '.join(exit_msg)
print_icinga2_check_status(text_result, exit_code, perfdata)