icinga2-checks/check_systemd_timer.py

145 lines
5.6 KiB
Python
Raw Normal View History

2024-02-28 09:54:57 -07:00
#!/usr/bin/env python3
import argparse
2024-03-07 15:18:46 -07:00
import re
import subprocess
2024-02-28 09:54:57 -07:00
import sys
import traceback
2024-10-10 20:34:47 -06:00
from datetime import datetime, timedelta
from typing import Tuple, Union
2024-02-28 09:54:57 -07:00
2024-03-07 15:18:46 -07:00
import humanfriendly
2024-10-10 20:34:47 -06:00
from dateparser import parse
from dateutil import tz
from pydantic import BaseModel
2024-02-28 09:54:57 -07:00
from checker import nagios
from checker.result import quit_check
# Pattern applied to the text printed by `systemctl status <unit>.timer`.
#   group 1 - text between ".timer; " and the next ";" on the "Loaded:" line
#             (get_next_elapse compares it against "enabled").
#   group 2 - text between "Active: " and " since" (get_next_elapse reads the
#             first word as the state and a trailing "(...)" as the sub-state).
#   group 3 - text after "Trigger: " up to and including the first ";",
#             or the literal "n/a".
# NOTE(review): the pattern requires " since" after "Active:", so status
# output without it will not match - confirm this is intended.
SYSTEMCTL_STATUS_RE = r'Loaded:\s.*\.timer;\s(.*?);.*?\)\s*Active:\s(.*?) since.*?\s*Trigger:\s(.*?;|n\/a)'
class TimerInfo(BaseModel):
    """Snapshot of a systemd timer's schedule and state."""

    # Next scheduled trigger time, or None when there is none.
    next: datetime | None
    # Time remaining until `next`, or None when `next` is None.
    time_left: timedelta | None
    # Most recent trigger time, or None when no last run is known.
    last: datetime | None
    # Time elapsed since `last`, or None when `last` is None.
    since_last: timedelta | None
    # Name of the timer unit this record describes.
    unit: str
    # Whether the unit is reported as enabled.
    enabled: bool
    # Whether the unit is reported as active.
    active: bool
    # Parenthesised sub-state from the "Active:" field, when present.
    status: str | None
def get_last_trigger(timer_name: str):
    """Return the last time ``timer_name`` fired, per ``systemctl list-timers``.

    The row for the timer is found by an exact match of ``timer_name``
    against the whitespace-separated columns of each line. The LAST
    timestamp is then located by anchoring on the ``ago`` token that ends
    the PASSED column and walking back to the nearest ``YYYY-MM-DD`` token.
    Fixed column indices are not used because the relative-time columns
    vary in width ("41min left" vs "1h 41min left"), which shifts every
    field that follows them.

    :param timer_name: Full unit name, e.g. ``foo.timer``.
    :return: Whatever ``dateparser.parse`` returns for the
        "<weekday> <date> <time> <tz>" text of the LAST column, or ``None``
        when the timer is not listed or has no recorded last run.
    :raises subprocess.CalledProcessError: if ``systemctl`` exits non-zero.
    """
    # Fixed command, so pass an argument list and avoid spawning a shell.
    output = subprocess.check_output(["systemctl", "list-timers", "--all"]).decode('utf-8')

    # The first line is skipped (column header row).
    for line in output.strip().split("\n")[1:]:
        fields = line.split()
        if timer_name not in fields:
            continue

        if "ago" not in fields:
            # No "... ago" entry on the row: the timer has not been run yet.
            return None

        ago_index = fields.index("ago")
        # Walk back from "ago" to the date token of the LAST column; stop at
        # index 1 so the weekday token just before the date always exists.
        for i in range(ago_index - 1, 0, -1):
            if re.fullmatch(r'\d{4}-\d{2}-\d{2}', fields[i]):
                # weekday, date, time, timezone
                return parse(' '.join(fields[i - 1:i + 3]))
        return None

    return None
def get_next_elapse(timer_name: str) -> Tuple[TimerInfo | None, None | str]:
    """Gather next/last trigger information for ``timer_name``.

    Runs ``systemctl status`` for the unit, extracts the enabled flag, the
    active state and the next trigger time with ``SYSTEMCTL_STATUS_RE``, and
    combines them with the last trigger time from ``get_last_trigger``.

    :param timer_name: Full unit name, e.g. ``foo.timer``.
    :return: ``(TimerInfo, None)`` on success, or ``(None, message)`` when
        ``systemctl status`` fails, the unit name is missing from the first
        line of its output, or the output / trigger time cannot be parsed.

    Any other unexpected error while building the result prints the raw
    ``systemctl`` output and a traceback, then exits the process with
    ``nagios.STATE_UNKNOWN``.
    """
    now = datetime.now()

    try:
        output = subprocess.check_output(["systemctl", "status", timer_name], universal_newlines=True)
    except subprocess.CalledProcessError as e:
        # NOTE(review): per systemctl(1), `status` exits non-zero for units
        # that are not active, so such timers are reported through this
        # path - confirm that is intended.
        return None, f'systemctl status failed: {e}'

    if timer_name not in output.split('\n')[0]:
        return None, 'Timer not found'

    parts = re.search(SYSTEMCTL_STATUS_RE, output)
    if parts is None:
        # re.search returns None on no match; report it instead of letting
        # the .group() calls below fail with AttributeError.
        return None, f'Could not parse systemctl status output for {timer_name}'

    try:
        next_trigger_str = parts.group(3)
        if next_trigger_str.lower() != 'n/a':
            next_trigger = parse(next_trigger_str)
            if next_trigger is None:
                return None, f'Could not parse next trigger time "{next_trigger_str}"'
            # `now` is naive; stamp it with the parsed value's tzinfo so the
            # two can be subtracted.
            # NOTE(review): this assumes systemctl prints local time - confirm.
            time_left = next_trigger - now.replace(tzinfo=next_trigger.tzinfo)
        else:
            next_trigger = None
            time_left = None

        last_trigger = get_last_trigger(timer_name)
        if last_trigger is not None:
            since_last = now.replace(tzinfo=last_trigger.tzinfo) - last_trigger
        else:
            since_last = None

        # Group 2 holds the "Active:" field: its first word is the state and
        # a trailing "(...)" word, if any, is the sub-state.
        active_words = parts.group(2).split(' ')
        sub_state = active_words[-1]
        status = sub_state.lower().strip('(').strip(')') if '(' in sub_state else None

        return TimerInfo(
            next=next_trigger,
            time_left=time_left,
            last=last_trigger,
            since_last=since_last,
            enabled=parts.group(1).lower() == 'enabled',
            active=active_words[0].lower() == 'active',
            status=status,
            unit=timer_name,
        ), None
    except Exception:
        # Last-resort boundary: show what was being parsed and why it failed.
        print(output)
        traceback.print_exc()
        sys.exit(nagios.STATE_UNKNOWN)
2024-02-28 09:54:57 -07:00
def check_timer(timer_name: str, expected_interval: int | None = None):
    """Check a systemd timer and report the result through ``quit_check``.

    Outcomes, in order of evaluation:

    * UNKNOWN  - ``get_next_elapse`` returned an error message.
    * CRITICAL - the timer is not enabled.
    * CRITICAL - ``expected_interval`` is given, both the next and last
      trigger times are known, and the gap between them exceeds
      ``expected_interval`` seconds.
    * OK       - otherwise.

    :param timer_name: Timer unit name; ``.timer`` is appended if missing.
    :param expected_interval: Maximum allowed number of seconds between the
        last trigger and the next scheduled trigger, or ``None`` to skip
        the interval check.

    NOTE(review): this function relies on ``quit_check`` ending the process;
    confirm against ``checker.result``.
    """
    if not timer_name.endswith('.timer'):
        timer_name = timer_name + '.timer'

    timer_info, timer_error = get_next_elapse(timer_name)
    if timer_error:
        quit_check(str(timer_error), nagios.STATE_UNKNOWN)
    if not timer_info.enabled:
        quit_check(f'{timer_name} is not enabled', nagios.STATE_CRIT)

    time_left = timer_info.time_left
    since_last = timer_info.since_last

    next_elapse_human = timer_info.next.replace(tzinfo=tz.tzlocal()).strftime('%a %Y-%m-%d %H:%M %Z') if timer_info.next else 'N/A'
    # Compare against None explicitly: a zero timedelta is falsy but valid.
    remaining_time_human = humanfriendly.format_timespan(time_left) if time_left is not None else 'N/A'
    since_last_human = humanfriendly.format_timespan(since_last) if since_last is not None else 'N/A'

    perfdata_dict = {
        'remaining_time': {
            # total_seconds() rather than .seconds: .seconds is only the
            # sub-day component and silently drops whole days.
            'value': int(time_left.total_seconds()) if time_left is not None else 0,
            'unit': 's',
            'min': 0
        },
        'time_since_last': {
            'value': int(since_last.total_seconds()) if since_last is not None else 0,
            'unit': 's',
            'min': 0
        }
    }

    timer_print_info = f'Next trigger time: {next_elapse_human}. Time until next trigger: {remaining_time_human}. Time since last trigger: {since_last_human}.'

    if expected_interval is not None and timer_info.next and timer_info.last:
        actual_interval = timer_info.next - timer_info.last
        if actual_interval.total_seconds() > expected_interval:
            quit_check(f'{timer_name} is active but the last trigger was more than the expected interval ago. {timer_print_info}', nagios.STATE_CRIT, perfdata_dict)

    if timer_info.active:
        # Only add the parenthesised sub-state when there is one; status may
        # be None and cannot be concatenated to a string.
        status_suffix = " (" + timer_info.status + ")" if timer_info.status else ""
        state_text = " is active" + status_suffix + "."
    else:
        state_text = " -> "
    quit_check(f'{timer_name}{state_text} {timer_print_info}', nagios.STATE_OK, perfdata_dict)
2024-02-28 09:54:57 -07:00
if __name__ == '__main__':
    # Script entry point: read the timer name and optional interval from the
    # command line, run the check, and map any uncaught exception to UNKNOWN.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        '-t', '--timer',
        required=True,
        help='The name of the timer to check.',
    )
    cli.add_argument(
        '-i', '--interval',
        type=int,
        help='The expected interval between timer triggers in seconds.',
    )
    options = cli.parse_args()

    try:
        check_timer(options.timer, options.interval)
    except Exception as err:
        print(f'UNKNOWN - exception "{err}"')
        traceback.print_exc()
        sys.exit(nagios.STATE_UNKNOWN)