icinga2-checks/check_systemd_timer.py

145 lines
5.6 KiB
Python
Executable File

#!/usr/bin/env python3
import argparse
import re
import subprocess
import sys
import traceback
from datetime import datetime, timedelta
from typing import Tuple, Union
import humanfriendly
from dateparser import parse
from dateutil import tz
from pydantic import BaseModel
from checker import nagios
from checker.result import quit_check
# Pattern applied to the complete text printed by `systemctl status <unit>.timer`.
# Capture groups:
#   1 - the text between "<...>.timer; " and the next ";" on the "Loaded:" line
#       (get_next_elapse compares it against "enabled")
#   2 - the text between "Active: " and " since"; get_next_elapse treats its first
#       word as the state and a parenthesised last word as the sub-state
#   3 - the text after "Trigger: " up to and including the first ";", or the
#       literal "n/a"
# It is searched without re.DOTALL, so "." never crosses a line break; the "\s*"
# runs are what allow the match to step from one status line to the next.
# NOTE(review): an "Active:" line that lacks " since" cannot match — confirm that
# every unit state this check should handle prints it.
SYSTEMCTL_STATUS_RE = r'Loaded:\s.*\.timer;\s(.*?);.*?\)\s*Active:\s(.*?) since.*?\s*Trigger:\s(.*?;|n\/a)'
class TimerInfo(BaseModel):
    """State of one systemd timer as collected by get_next_elapse()."""

    # Next trigger time parsed from the "Trigger:" line; None when it reads "n/a".
    next: Union[datetime, None]
    # `next` minus the time the check started; None when `next` is None.
    time_left: Union[timedelta, None]
    # Last trigger time as returned by get_last_trigger(); None when unavailable.
    last: Union[datetime, None]
    # Time the check started minus `last`; None when `last` is None.
    since_last: Union[timedelta, None]
    # Unit name that was queried (including the ".timer" suffix when called via check_timer).
    unit: str
    # True when the enablement text on the "Loaded:" line equals "enabled" (case-insensitive).
    enabled: bool
    # True when the first word of the "Active:" text equals "active" (case-insensitive).
    active: bool
    # Lower-cased last word of the "Active:" text without its parentheses,
    # or None when that word contains no "(".
    status: Union[str, None]
def get_last_trigger(timer_name: str) -> Union[datetime, None]:
    """Return the time ``timer_name`` last fired according to ``systemctl list-timers``.

    The row whose whitespace-separated fields contain ``timer_name`` is located and
    the LAST timestamp is extracted relative to the unit column rather than from
    fixed positions. The LEFT and PASSED columns contain a variable number of words
    ("23h left", "1 day 2h ago") and NEXT/LEFT collapse to single placeholders when
    nothing is scheduled, so fixed indices pick the wrong tokens in those cases.

    Assumes (as the previous fixed-index parsing did) that a LAST timestamp is
    printed as four tokens: weekday, ``YYYY-MM-DD`` date, time and timezone.

    :param timer_name: Unit name to look for, e.g. ``"logrotate.timer"``.
    :return: Whatever ``dateparser.parse`` yields for the LAST timestamp, or ``None``
        when the timer is not listed or has no recorded last trigger.
    :raises subprocess.CalledProcessError: If ``systemctl list-timers`` exits non-zero.
    """
    # A fixed command needs no shell; an argument list avoids spawning one.
    output = subprocess.check_output(["systemctl", "list-timers", "--all"]).decode('utf-8')
    rows = output.strip().split("\n")
    for row in rows[1:]:  # rows[0] is the column header
        fields = row.split()
        if timer_name not in fields:
            continue

        unit_index = fields.index(timer_name)
        # PASSED sits directly left of the unit column. A placeholder there means
        # LAST is a placeholder too: the timer has not been run yet.
        if unit_index == 0 or fields[unit_index - 1] in ('n/a', '-'):
            return None

        # Walk left from the unit column; the first date-shaped token belongs to
        # LAST. Reaching the word that terminates the LEFT column first means no
        # LAST timestamp is present, and NEXT's date must not be used instead.
        for i in range(unit_index - 1, 0, -1):
            token = fields[i]
            if token == 'left':
                break
            if re.fullmatch(r'\d{4}-\d{2}-\d{2}', token):
                # weekday, date, time, timezone
                return parse(' '.join(fields[i - 1:i + 3]))
        return None
    return None
def get_next_elapse(timer_name: str) -> Tuple[TimerInfo | None, None | str]:
    """Query ``systemctl status`` for ``timer_name`` and summarise it as a TimerInfo.

    :param timer_name: Unit name passed verbatim to ``systemctl status``.
    :return: ``(TimerInfo, None)`` on success. ``(None, message)`` when systemctl
        exits non-zero or when the first line of its output does not mention
        ``timer_name``.

    Any exception raised while interpreting the output (including inside
    ``get_last_trigger``) is not propagated: the raw output and the traceback are
    printed and the process exits with ``nagios.STATE_UNKNOWN``.
    """
    now = datetime.now()

    try:
        output = subprocess.check_output(["systemctl", "status", timer_name], universal_newlines=True)
    except subprocess.CalledProcessError as err:
        return None, f'systemctl status failed: {err}'

    header_line = output.split('\n')[0]
    if timer_name not in header_line:
        return None, 'Timer not found'

    try:
        match = re.search(SYSTEMCTL_STATUS_RE, output)

        # Group 3: the "Trigger:" text, or "n/a" when nothing is scheduled.
        trigger_text = match.group(3)
        if trigger_text.lower() == 'n/a':
            next_trigger = None
            time_left = None
        else:
            next_trigger = parse(trigger_text)
            # `now` is naive; stamp it with the parsed value's tzinfo so the two
            # can be subtracted.
            time_left = next_trigger - now.replace(tzinfo=next_trigger.tzinfo)

        last_trigger = get_last_trigger(timer_name)
        if last_trigger is None:
            since_last = None
        else:
            since_last = now.replace(tzinfo=last_trigger.tzinfo) - last_trigger

        # Group 2 is the "Active:" text: first word is the state, and a
        # parenthesised last word is the sub-state.
        active_words = match.group(2).split(' ')
        trailing_word = active_words[-1]
        if '(' in trailing_word:
            status = trailing_word.lower().strip('(').strip(')')
        else:
            status = None

        info = TimerInfo(
            next=next_trigger,
            time_left=time_left,
            last=last_trigger,
            since_last=since_last,
            enabled=match.group(1).lower() == 'enabled',
            active=active_words[0].lower() == 'active',
            status=status,
            unit=timer_name,
        )
        return info, None
    except Exception:
        # Dump what systemctl printed so the failure can be diagnosed, then
        # report UNKNOWN to the monitoring system.
        print(output)
        traceback.print_exc()
        sys.exit(nagios.STATE_UNKNOWN)
def check_timer(timer_name: str, expected_interval: Union[int, None] = None):
    """Check one systemd timer and report the result through ``quit_check``.

    Outcomes, in the order they are tested:

    * UNKNOWN  - ``get_next_elapse`` returned an error message.
    * CRITICAL - the timer is not enabled.
    * CRITICAL - ``expected_interval`` is given, both the next and the last trigger
      time are known, and the gap between them exceeds ``expected_interval`` seconds.
    * OK       - otherwise.

    :param timer_name: Timer unit name; ``.timer`` is appended when missing.
    :param expected_interval: Maximum allowed number of seconds between the last and
        the next trigger, or ``None`` to skip that comparison.

    NOTE(review): the statements following each early ``quit_check`` call rely on
    ``quit_check`` ending the process — confirm in ``checker.result``.
    """
    if not timer_name.endswith('.timer'):
        timer_name = timer_name + '.timer'

    timer_info, timer_error = get_next_elapse(timer_name)
    if timer_error:
        quit_check(str(timer_error), nagios.STATE_UNKNOWN)
    if not timer_info.enabled:
        quit_check(f'{timer_name} is not enabled', nagios.STATE_CRIT)

    time_left = timer_info.time_left
    since_last = timer_info.since_last

    next_elapse_human = timer_info.next.replace(tzinfo=tz.tzlocal()).strftime('%a %Y-%m-%d %H:%M %Z') if timer_info.next else 'N/A'
    # Compare against None explicitly: a zero-length timedelta is falsy but is
    # still a real measurement, not a missing one.
    remaining_time_human = humanfriendly.format_timespan(time_left) if time_left is not None else 'N/A'
    since_last_human = humanfriendly.format_timespan(since_last) if since_last is not None else 'N/A'

    # timedelta.seconds is only the sub-day remainder (0..86399); total_seconds()
    # includes whole days, which matters for timers that fire less than daily.
    perfdata_dict = {
        'remaining_time': {
            'value': int(time_left.total_seconds()) if time_left is not None else 0,
            'unit': 's',
            'min': 0
        },
        'time_since_last': {
            'value': int(since_last.total_seconds()) if since_last is not None else 0,
            'unit': 's',
            'min': 0
        }
    }

    timer_print_info = f'Next trigger time: {next_elapse_human}. Time until next trigger: {remaining_time_human}. Time since last trigger: {since_last_human}.'

    if expected_interval is not None and timer_info.next and timer_info.last:
        actual_interval_seconds = (timer_info.next - timer_info.last).total_seconds()
        if actual_interval_seconds > expected_interval:
            quit_check(f'{timer_name} is active but the last trigger was more than the expected interval ago. {timer_print_info}', nagios.STATE_CRIT, perfdata_dict)

    if timer_info.active:
        # Only mention the sub-state when one was parsed; concatenating None into
        # the message would raise TypeError.
        status_suffix = f' ({timer_info.status})' if timer_info.status else ''
        state_text = f' is active{status_suffix}.'
    else:
        state_text = ' -> '
    quit_check(f'{timer_name}{state_text} {timer_print_info}', nagios.STATE_OK, perfdata_dict)
if __name__ == '__main__':
    # Command-line entry point: parse the options and run the check.
    cli = argparse.ArgumentParser()
    cli.add_argument('-t', '--timer', required=True, help='The name of the timer to check.')
    cli.add_argument('-i', '--interval', type=int, help='The expected interval between timer triggers in seconds.')
    options = cli.parse_args()

    try:
        check_timer(options.timer, options.interval)
    except Exception as err:
        # Any unexpected failure is reported as UNKNOWN together with its traceback.
        print(f'UNKNOWN - exception "{err}"')
        traceback.print_exc()
        sys.exit(nagios.STATE_UNKNOWN)