icinga2-checks/check_mysql_slave.py

#!/usr/bin/env python
import argparse
import sys
import traceback

import mysql.connector

from checker import nagios, print_icinga2_check_status
from checker.types import try_int

"""
## MySQL User Setup ##

Attention: The DB user you connect with must have the REPLICATION CLIENT privilege on the DB server. Example:
    GRANT REPLICATION CLIENT on *.* TO 'nagios'@'monitoringhost' IDENTIFIED BY 'secret';

If you use MariaDB 10.5 or newer, the DB user must have REPLICA MONITOR rights:
    GRANT REPLICA MONITOR ON *.* TO 'nagios'@'monitoringhost' IDENTIFIED BY 'secret';


## Usage example ##

Target delay = 300 seconds
Warning deviation = 10%
Critical deviation = 15%

This sets the warning thresholds to 270 and 330 seconds; a delay at or beyond either bound returns WARNING.
The critical thresholds will be 255 and 345 seconds.
"""


def main(args):
    """Check replication health of a MySQL slave and exit with a Nagios state.

    Connects to ``args.host`` as ``args.username`` / ``args.password``, runs
    ``SHOW SLAVE STATUS`` and evaluates the first row returned:

    * CRITICAL when ``Slave_SQL_Running`` or ``Slave_IO_Running`` is not
      ``'Yes'``, or when ``Last_IO_Error`` / ``Last_SQL_Error`` is non-empty.
    * When none of the above triggered and ``args.target_delay`` is set,
      ``Seconds_Behind_Master`` is compared against the target delay:
      a deviation of at least ``args.critical_deviation`` percent is
      CRITICAL, otherwise a deviation of at least ``args.warning_deviation``
      percent is WARNING.
    * UNKNOWN when the status row or a required column is missing, or when
      any exception is raised.

    The result is emitted through ``print_icinga2_check_status`` and the
    process is terminated with ``sys.exit``; this function does not return
    a value.

    :param args: parsed command-line namespace providing ``host``,
        ``username``, ``password``, ``target_delay``, ``warning_deviation``
        and ``critical_deviation``.
    """
    cursor = cnx = None
    try:
        cnx = mysql.connector.connect(user=args.username, password=args.password, host=args.host)
        cursor = cnx.cursor(dictionary=True)
        cursor.execute("SHOW SLAVE STATUS")

        slave_status = cursor.fetchone()
        if slave_status is None:
            print("UNKNOWN - Could not retrieve slave status")
            sys.exit(nagios.STATE_UNKNOWN)

        slave_io_running = slave_status['Slave_IO_Running']
        if not slave_io_running:
            print("UNKNOWN - Could not retrieve Slave_IO_Running")
            sys.exit(nagios.STATE_UNKNOWN)

        slave_sql_running = slave_status['Slave_SQL_Running']
        if not slave_sql_running:
            print("UNKNOWN - Could not retrieve Slave_SQL_Running")
            sys.exit(nagios.STATE_UNKNOWN)

        last_io_error = slave_status['Last_IO_Error']
        last_sql_error = slave_status['Last_SQL_Error']

        # -1 marks "delay not evaluated"; perfdata is only emitted for >= 0.
        replication_delay = -1
        perfdata = {}
        exit_code = nagios.STATE_OK
        exit_msg = []

        if slave_sql_running != 'Yes':
            exit_code = nagios.STATE_CRIT
            exit_msg.append('Slave SQL is not running!')
        if slave_io_running != 'Yes':
            exit_code = nagios.STATE_CRIT
            exit_msg.append('Slave IO is not running!')
        if last_io_error:
            exit_code = nagios.STATE_CRIT
            exit_msg.append(f'Last IO Error: {last_io_error.strip(".")}.')
        if last_sql_error:
            # Previously read but never reported; handled like Last_IO_Error.
            exit_code = nagios.STATE_CRIT
            exit_msg.append(f'Last SQL Error: {last_sql_error.strip(".")}.')

        if exit_code == nagios.STATE_OK:
            # Only evaluate the replication delay if everything else is healthy.
            seconds_behind = slave_status['Seconds_Behind_Master']
            # A lag of 0 is a valid (ideal) reading, so only a missing value
            # is treated as UNKNOWN; a plain truthiness test would reject 0.
            if seconds_behind is None or seconds_behind == '':
                print("UNKNOWN - Could not retrieve Seconds_Behind_Master")
                sys.exit(nagios.STATE_UNKNOWN)
            # NOTE(review): try_int is a project helper; presumably it returns
            # an int for numeric input - confirm its behavior on bad input.
            replication_delay = try_int(seconds_behind)

            if args.target_delay:
                target = args.target_delay
                # Absolute margins in seconds. Computed as target * pct / 100
                # rather than target * (1 +/- pct / 100) to reduce float
                # rounding surprises at the exact boundary values.
                warn_margin = target * args.warning_deviation / 100
                crit_margin = target * args.critical_deviation / 100
                deviation = abs(replication_delay - target)

                # Critical takes precedence: a critical deviation also exceeds
                # the warning margin and must not be downgraded to WARNING.
                if deviation >= crit_margin:
                    exit_code = nagios.STATE_CRIT
                elif deviation >= warn_margin:
                    exit_code = nagios.STATE_WARN

                if exit_code != nagios.STATE_OK:
                    if replication_delay > target:
                        # Further behind the master than the target allows.
                        exit_msg.append('Replication is delayed!')
                    else:
                        # Closer to the master than the intended delay.
                        exit_msg.append('Replication is ahead???')

        if exit_code == nagios.STATE_OK:
            exit_msg.append('Slave is healthy! Slave SQL: running. Slave IO: running.')

        if replication_delay > -1:
            exit_msg.append(f'Slave is {replication_delay} seconds behind master.')
            perfdata = {
                "replication_delay": {
                    "value": replication_delay,
                    "min": 0,
                    "unit": "s",
                },
            }

        text_result = ' '.join(exit_msg)
        print_icinga2_check_status(text_result, exit_code, perfdata)
        sys.exit(exit_code)

    except mysql.connector.Error as e:
        print("UNKNOWN - Could not connect to database!")
        print(e)
        sys.exit(nagios.STATE_UNKNOWN)
    except Exception as e:
        # sys.exit() raises SystemExit, which is not an Exception subclass,
        # so the deliberate exits above pass through this handler untouched.
        print(f"UNKNOWN - {e}")
        traceback.print_exc()
        sys.exit(nagios.STATE_UNKNOWN)

    finally:
        # Release database resources on every exit path.
        if cursor:
            cursor.close()
        if cnx:
            cnx.close()


if __name__ == "__main__":
    # Script entry point: build the command-line interface, parse it and
    # hand the resulting namespace to main().
    parser = argparse.ArgumentParser(description='Check a MySQL slave.')

    # Mandatory connection settings.
    for flag, help_text in (
        ('--host', 'The IP of the slave to connect to.'),
        ('--username', 'Username.'),
        ('--password', 'Password.'),
    ):
        parser.add_argument(flag, required=True, help=help_text)

    # Optional integer thresholds for the delay check.
    for flag, default_value, help_text in (
        ('--target-delay', None, 'The target delay in seconds.'),
        ('--warning-deviation', 10, 'If the delay deviates more than this percentage from the target delay, return warning.'),
        ('--critical-deviation', 15, 'If the delay deviates more than this percentage from the target delay, return critical.'),
    ):
        parser.add_argument(flag, default=default_value, type=int, help=help_text)

    args = parser.parse_args()
    main(args)
add my own mysql slave check 2024-02-18 12:55:45 -07:00			`#!/usr/bin/env python`
			`import argparse`
			`import sys`
			`import traceback`

			`import mysql.connector`

			`from checker import nagios, print_icinga2_check_status`
			`from checker.types import try_int`

			`"""`
fix a few issues with check_media_cdn 2024-02-18 14:54:33 -07:00			`## MySQL User Setup ##`

			`Attention: The DB-user you type in must have CLIENT REPLICATION rights on the DB-server. Example:`
			`GRANT REPLICATION CLIENT on *.* TO 'nagios'@'monitoringhost' IDENTIFIED BY 'secret';`

			`If you use MariaDB 10.5 or newer, the DB user must have REPLICA MONITOR rights:`
			`GRANT REPLICA MONITOR ON *.* TO 'nagios'@'monitoringhost' IDENTIFIED BY 'secret';`


add my own mysql slave check 2024-02-18 12:55:45 -07:00			`## Usage example ##`

			`Target delay = 300 seconds`
			`Warning deviation = 10%`
			`Critical deviation = 15%`

			`This will set the warning levels to 330 and 270 seconds, if the delay is greater or less than these values it will return WARNING.`
			`The critical levels will be 345 and 255 seconds.`
			`"""`


			`def main(args):`
			`cursor = cnx = None`
			`try:`
			`cnx = mysql.connector.connect(user=args.username, password=args.password, host=args.host)`
			`cursor = cnx.cursor(dictionary=True)`
			`cursor.execute("SHOW SLAVE STATUS")`

check_mysql_slave: make work when slave is kill 2024-03-03 22:49:16 -07:00			`slave_status = cursor.fetchone()`
add my own mysql slave check 2024-02-18 12:55:45 -07:00			`if slave_status is None:`
			`print("UNKNOWN - Could not retrieve slave status")`
check_mysql_slave: make work when slave is kill 2024-03-03 22:49:16 -07:00			`sys.exit(nagios.STATE_UNKNOWN)`
add my own mysql slave check 2024-02-18 12:55:45 -07:00
			`slave_io_running = slave_status['Slave_IO_Running']`
check_mysql_slave: make work when slave is kill 2024-03-03 22:49:16 -07:00			`if not slave_io_running:`
			`print("UNKNOWN - Could not retrieve Slave_IO_Running")`
			`sys.exit(nagios.STATE_UNKNOWN)`

add my own mysql slave check 2024-02-18 12:55:45 -07:00			`slave_sql_running = slave_status['Slave_SQL_Running']`
check_mysql_slave: make work when slave is kill 2024-03-03 22:49:16 -07:00			`if not slave_sql_running:`
			`print("UNKNOWN - Could not retrieve Slave_SQL_Running")`
			`sys.exit(nagios.STATE_UNKNOWN)`

add my own mysql slave check 2024-02-18 12:55:45 -07:00			`last_io_error = slave_status['Last_IO_Error']`
check_mysql_slave: fix null attributes 2024-03-03 22:52:53 -07:00			`last_sql_error = slave_status['Last_SQL_Error']`
add my own mysql slave check 2024-02-18 12:55:45 -07:00
check_mysql_slave: make work when slave is kill 2024-03-03 22:49:16 -07:00			`replication_delay = -1`
			`perfdata = {}`
add my own mysql slave check 2024-02-18 12:55:45 -07:00			`exit_code = nagios.STATE_OK`
			`exit_msg = []`

			`if slave_sql_running != 'Yes':`
			`exit_code = nagios.STATE_CRIT`
			`exit_msg.append('Slave SQL is not running!')`
			`if slave_io_running != 'Yes':`
			`exit_code = nagios.STATE_CRIT`
			`exit_msg.append('Slave IO is not running!')`
			`if last_io_error:`
			`exit_code = nagios.STATE_CRIT`
			`exit_msg.append(f'Last IO Error: {last_io_error.strip(".")}.')`

check_mysql_slave: make work when slave is kill 2024-03-03 22:49:16 -07:00			`if exit_code == nagios.STATE_OK:`
check_mysql_slave: fix null attributes 2024-03-03 22:52:53 -07:00			`# Only replication delay if everything else is healthy.`
check_mysql_slave: make work when slave is kill 2024-03-03 22:49:16 -07:00			`if not slave_status['Seconds_Behind_Master']:`
			`print("UNKNOWN - Could not retrieve Seconds_Behind_Master")`
			`sys.exit(nagios.STATE_UNKNOWN)`
			`replication_delay = try_int(slave_status['Seconds_Behind_Master'])`

			`if args.target_delay:`
			`warn_deviation_max = args.target_delay * (1 + (args.warning_deviation / 100))`
			`warn_deviation_min = args.target_delay * (1 - (args.warning_deviation / 100))`
			`crit_deviation_max = args.target_delay * (1 + (args.critical_deviation / 100))`
			`crit_deviation_min = args.target_delay * (1 - (args.critical_deviation / 100))`
			`if replication_delay <= crit_deviation_min:`
			`exit_code = nagios.STATE_CRIT`
			`exit_msg.append('Replication is delayed!')`
			`if replication_delay >= crit_deviation_max:`
			`exit_code = nagios.STATE_CRIT`
			`exit_msg.append('Replication is ahead???')`
			`if replication_delay <= warn_deviation_min:`
			`exit_code = nagios.STATE_WARN`
			`exit_msg.append('Replication is delayed!')`
			`if replication_delay >= warn_deviation_max:`
			`exit_code = nagios.STATE_WARN`
			`exit_msg.append('Replication is ahead???')`
add my own mysql slave check 2024-02-18 12:55:45 -07:00
			`if exit_code == nagios.STATE_OK:`
			`exit_msg.append('Slave is healthy! Slave SQL: running. Slave IO: running.')`

check_mysql_slave: make work when slave is kill 2024-03-03 22:49:16 -07:00			`if replication_delay > -1:`
			`exit_msg.append(f'Slave is {replication_delay} seconds behind master.')`
			`perfdata = {`
			`"replication_delay": {`
			`"value": replication_delay,`
			`"min": 0,`
			`"unit": "s",`
			`},`
			`}`
add my own mysql slave check 2024-02-18 12:55:45 -07:00
			`text_result = ' '.join(exit_msg)`
			`print_icinga2_check_status(text_result, exit_code, perfdata)`
			`sys.exit(exit_code)`

			`except mysql.connector.Error as e:`
			`print("UNKNOWN - Could not connect to database!")`
			`print(e)`
			`sys.exit(nagios.STATE_UNKNOWN)`
			`except Exception as e:`
			`print(f"UNKNOWN - {e}")`
			`traceback.print_exc()`
			`sys.exit(nagios.STATE_UNKNOWN)`

			`finally:`
			`if cursor:`
			`cursor.close()`
			`if cnx:`
			`cnx.close()`


			`if __name__ == "__main__":`
			`parser = argparse.ArgumentParser(description='Check a MySQL slave.')`
			`parser.add_argument('--host', required=True, help='The IP of the slave to connect to.')`
			`parser.add_argument('--username', required=True, help='Username.')`
			`parser.add_argument('--password', required=True, help='Password.')`
			`parser.add_argument('--target-delay', default=None, type=int, help='The target delay in seconds.')`
			`parser.add_argument('--warning-deviation', default=10, type=int, help='If the delay deviates more than this percentage from the target delay, return warning.')`
			`parser.add_argument('--critical-deviation', default=15, type=int, help='If the delay deviates more than this percentage from the target delay, return critical.')`
			`args = parser.parse_args()`
			`main(args)`