From f76ec953b9bd7e71160c523014621128bf26c606 Mon Sep 17 00:00:00 2001 From: Cyberes Date: Sun, 3 Mar 2024 22:49:16 -0700 Subject: [PATCH] check_mysql_slave: make work when slave is kill --- check_mysql_slave.py | 92 ++++++++++++++++++++++++++++---------------- 1 file changed, 58 insertions(+), 34 deletions(-) diff --git a/check_mysql_slave.py b/check_mysql_slave.py index 52cd635..5b9029c 100755 --- a/check_mysql_slave.py +++ b/check_mysql_slave.py @@ -35,18 +35,29 @@ def main(args): cnx = mysql.connector.connect(user=args.username, password=args.password, host=args.host) cursor = cnx.cursor(dictionary=True) cursor.execute("SHOW SLAVE STATUS") - slave_status = cursor.fetchone() + slave_status = cursor.fetchone() if slave_status is None: print("UNKNOWN - Could not retrieve slave status") - sys.exit(3) + sys.exit(nagios.STATE_UNKNOWN) slave_io_running = slave_status['Slave_IO_Running'] - slave_sql_running = slave_status['Slave_SQL_Running'] - replication_delay = try_int(slave_status['Seconds_Behind_Master']) - last_io_error = slave_status['Last_IO_Error'] - last_sql_error = slave_status['Last_SQL_Error'] + if not slave_io_running: + print("UNKNOWN - Could not retrieve Slave_IO_Running") + sys.exit(nagios.STATE_UNKNOWN) + slave_sql_running = slave_status['Slave_SQL_Running'] + if not slave_sql_running: + print("UNKNOWN - Could not retrieve Slave_SQL_Running") + sys.exit(nagios.STATE_UNKNOWN) + + last_io_error = slave_status['Last_IO_Error'] + if not last_io_error: + print("UNKNOWN - Could not retrieve Last_IO_Error") + sys.exit(nagios.STATE_UNKNOWN) + + replication_delay = -1 + perfdata = {} exit_code = nagios.STATE_OK exit_msg = [] @@ -56,43 +67,56 @@ def main(args): if slave_io_running != 'Yes': exit_code = nagios.STATE_CRIT exit_msg.append('Slave IO is not running!') - if last_sql_error: - exit_code = nagios.STATE_CRIT - exit_msg.append(f'Last SQL Error: {last_sql_error.strip(".")}.') if last_io_error: exit_code = nagios.STATE_CRIT exit_msg.append(f'Last IO Error: {last_io_error.strip(".")}.') - if args.target_delay: - warn_deviation_max = args.target_delay * (1 + (args.warning_deviation / 100)) - warn_deviation_min = args.target_delay * (1 - (args.warning_deviation / 100)) - crit_deviation_max = args.target_delay * (1 + (args.critical_deviation / 100)) - crit_deviation_min = args.target_delay * (1 - (args.critical_deviation / 100)) - if replication_delay <= crit_deviation_min: + if exit_code == nagios.STATE_OK: + # Only check these things if everything else is healthy. + + last_sql_error = slave_status['Last_SQL_Error'] + if not last_sql_error: + print("UNKNOWN - Could not retrieve Last_SQL_Error") + sys.exit(nagios.STATE_UNKNOWN) + if last_sql_error: exit_code = nagios.STATE_CRIT - exit_msg.append('Replication is delayed!') - if replication_delay >= crit_deviation_max: - exit_code = nagios.STATE_CRIT - exit_msg.append('Replication is ahead???') - if replication_delay <= warn_deviation_min: - exit_code = nagios.STATE_WARN - exit_msg.append('Replication is delayed!') - if replication_delay >= warn_deviation_max: - exit_code = nagios.STATE_WARN - exit_msg.append('Replication is ahead???') + exit_msg.append(f'Last SQL Error: {last_sql_error.strip(".")}.') + + if not slave_status['Seconds_Behind_Master']: + print("UNKNOWN - Could not retrieve Seconds_Behind_Master") + sys.exit(nagios.STATE_UNKNOWN) + replication_delay = try_int(slave_status['Seconds_Behind_Master']) + + if args.target_delay: + warn_deviation_max = args.target_delay * (1 + (args.warning_deviation / 100)) + warn_deviation_min = args.target_delay * (1 - (args.warning_deviation / 100)) + crit_deviation_max = args.target_delay * (1 + (args.critical_deviation / 100)) + crit_deviation_min = args.target_delay * (1 - (args.critical_deviation / 100)) + if replication_delay <= crit_deviation_min: + exit_code = nagios.STATE_CRIT + exit_msg.append('Replication is delayed!') + if replication_delay >= crit_deviation_max: + exit_code = nagios.STATE_CRIT + exit_msg.append('Replication is ahead???') + if replication_delay <= warn_deviation_min: + exit_code = nagios.STATE_WARN + exit_msg.append('Replication is delayed!') + if replication_delay >= warn_deviation_max: + exit_code = nagios.STATE_WARN + exit_msg.append('Replication is ahead???') if exit_code == nagios.STATE_OK: exit_msg.append('Slave is healthy! Slave SQL: running. Slave IO: running.') - exit_msg.append(f'Slave is {replication_delay} seconds behind master.') - - perfdata = { - "replication_delay": { - "value": replication_delay, - "min": 0, - "unit": "s", - }, - } + if replication_delay > -1: + exit_msg.append(f'Slave is {replication_delay} seconds behind master.') + perfdata = { + "replication_delay": { + "value": replication_delay, + "min": 0, + "unit": "s", + }, + } text_result = ' '.join(exit_msg) print_icinga2_check_status(text_result, exit_code, perfdata)