From f76ec953b9bd7e71160c523014621128bf26c606 Mon Sep 17 00:00:00 2001
From: Cyberes <cyberes@evulid.cc>
Date: Sun, 3 Mar 2024 22:49:16 -0700
Subject: [PATCH] check_mysql_slave: make it work when the slave is killed

---
 check_mysql_slave.py | 92 ++++++++++++++++++++++++++++----------------
 1 file changed, 58 insertions(+), 34 deletions(-)

diff --git a/check_mysql_slave.py b/check_mysql_slave.py
index 52cd635..5b9029c 100755
--- a/check_mysql_slave.py
+++ b/check_mysql_slave.py
@@ -35,18 +35,29 @@ def main(args):
         cnx = mysql.connector.connect(user=args.username, password=args.password, host=args.host)
         cursor = cnx.cursor(dictionary=True)
         cursor.execute("SHOW SLAVE STATUS")
-        slave_status = cursor.fetchone()
 
+        slave_status = cursor.fetchone()
         if slave_status is None:
             print("UNKNOWN - Could not retrieve slave status")
-            sys.exit(3)
+            sys.exit(nagios.STATE_UNKNOWN)
 
         slave_io_running = slave_status['Slave_IO_Running']
-        slave_sql_running = slave_status['Slave_SQL_Running']
-        replication_delay = try_int(slave_status['Seconds_Behind_Master'])
-        last_io_error = slave_status['Last_IO_Error']
-        last_sql_error = slave_status['Last_SQL_Error']
+        if not slave_io_running:
+            print("UNKNOWN - Could not retrieve Slave_IO_Running")
+            sys.exit(nagios.STATE_UNKNOWN)
 
+        slave_sql_running = slave_status['Slave_SQL_Running']
+        if not slave_sql_running:
+            print("UNKNOWN - Could not retrieve Slave_SQL_Running")
+            sys.exit(nagios.STATE_UNKNOWN)
+
+        last_io_error = slave_status['Last_IO_Error']
+        if not last_io_error:
+            print("UNKNOWN - Could not retrieve Last_IO_Error")
+            sys.exit(nagios.STATE_UNKNOWN)
+
+        replication_delay = -1
+        perfdata = {}
         exit_code = nagios.STATE_OK
         exit_msg = []
 
@@ -56,43 +67,56 @@ def main(args):
         if slave_io_running != 'Yes':
             exit_code = nagios.STATE_CRIT
             exit_msg.append('Slave IO is not running!')
-        if last_sql_error:
-            exit_code = nagios.STATE_CRIT
-            exit_msg.append(f'Last SQL Error: {last_sql_error.strip(".")}.')
         if last_io_error:
             exit_code = nagios.STATE_CRIT
             exit_msg.append(f'Last IO Error: {last_io_error.strip(".")}.')
 
-        if args.target_delay:
-            warn_deviation_max = args.target_delay * (1 + (args.warning_deviation / 100))
-            warn_deviation_min = args.target_delay * (1 - (args.warning_deviation / 100))
-            crit_deviation_max = args.target_delay * (1 + (args.critical_deviation / 100))
-            crit_deviation_min = args.target_delay * (1 - (args.critical_deviation / 100))
-            if replication_delay <= crit_deviation_min:
+        if exit_code == nagios.STATE_OK:
+            # Only check these things if everything else is healthy.
+
+            last_sql_error = slave_status['Last_SQL_Error']
+            if not last_sql_error:
+                print("UNKNOWN - Could not retrieve Last_SQL_Error")
+                sys.exit(nagios.STATE_UNKNOWN)
+            if last_sql_error:
                 exit_code = nagios.STATE_CRIT
-                exit_msg.append('Replication is delayed!')
-            if replication_delay >= crit_deviation_max:
-                exit_code = nagios.STATE_CRIT
-                exit_msg.append('Replication is ahead???')
-            if replication_delay <= warn_deviation_min:
-                exit_code = nagios.STATE_WARN
-                exit_msg.append('Replication is delayed!')
-            if replication_delay >= warn_deviation_max:
-                exit_code = nagios.STATE_WARN
-                exit_msg.append('Replication is ahead???')
+                exit_msg.append(f'Last SQL Error: {last_sql_error.strip(".")}.')
+
+            if not slave_status['Seconds_Behind_Master']:
+                print("UNKNOWN - Could not retrieve Seconds_Behind_Master")
+                sys.exit(nagios.STATE_UNKNOWN)
+            replication_delay = try_int(slave_status['Seconds_Behind_Master'])
+
+            if args.target_delay:
+                warn_deviation_max = args.target_delay * (1 + (args.warning_deviation / 100))
+                warn_deviation_min = args.target_delay * (1 - (args.warning_deviation / 100))
+                crit_deviation_max = args.target_delay * (1 + (args.critical_deviation / 100))
+                crit_deviation_min = args.target_delay * (1 - (args.critical_deviation / 100))
+                if replication_delay <= crit_deviation_min:
+                    exit_code = nagios.STATE_CRIT
+                    exit_msg.append('Replication is delayed!')
+                if replication_delay >= crit_deviation_max:
+                    exit_code = nagios.STATE_CRIT
+                    exit_msg.append('Replication is ahead???')
+                if replication_delay <= warn_deviation_min:
+                    exit_code = nagios.STATE_WARN
+                    exit_msg.append('Replication is delayed!')
+                if replication_delay >= warn_deviation_max:
+                    exit_code = nagios.STATE_WARN
+                    exit_msg.append('Replication is ahead???')
 
         if exit_code == nagios.STATE_OK:
             exit_msg.append('Slave is healthy! Slave SQL: running. Slave IO: running.')
 
-        exit_msg.append(f'Slave is {replication_delay} seconds behind master.')
-
-        perfdata = {
-            "replication_delay": {
-                "value": replication_delay,
-                "min": 0,
-                "unit": "s",
-            },
-        }
+        if replication_delay > -1:
+            exit_msg.append(f'Slave is {replication_delay} seconds behind master.')
+            perfdata = {
+                "replication_delay": {
+                    "value": replication_delay,
+                    "min": 0,
+                    "unit": "s",
+                },
+            }
 
         text_result = ' '.join(exit_msg)
         print_icinga2_check_status(text_result, exit_code, perfdata)