redo check_systemd_timer again

This commit is contained in:
Cyberes 2024-10-10 20:34:47 -06:00
parent c2d36b06db
commit afacf15c50
1 changed files with 64 additions and 84 deletions

View File

@ -4,93 +4,85 @@ import re
import subprocess
import sys
import traceback
from datetime import datetime
from typing import Optional, Tuple, Union
from datetime import datetime, timedelta
from typing import Tuple, Union
import humanfriendly
from dateparser import parse
from dateutil import tz
from pydantic import BaseModel
from checker import nagios
from checker.humanfriendly import parse_systemctl_time_delta
from checker.result import quit_check
sys.path.insert(0, "/usr/lib/python3/dist-packages")
import dbus
# Pattern for one row of `systemctl list-timers --all` output.
# Capture groups as consumed by get_next_elapse():
#   group 2  -> next trigger timestamp ("<weekday> YYYY-MM-DD HH:MM:SS <TZ>")
#   group 4  -> time-left text (optionally ending in "left")
#   group 7  -> last trigger timestamp
#   group 9  -> time-passed text ending in "ago"
#   group 10 -> the "*.timer" unit name
#   group 12 -> the "*.service" unit it activates
# Each timestamp/duration column may instead be the literal "n/a" or "-",
# in which case the inner group (2, 4, 7, 9) is left unmatched.
SYSTEMCTL_TIMERS_RE = re.compile(
r'^(([A-Za-z]*\s[0-9]{4}-[0-9]{2}-[0-9]{2}\s*[0-9]{2}:[0-9]{2}:[0-9]{2}\s[A-Z]*)|n\/a|-)\s*((([0-9]*[a-z]*\s)*(?:left)?)|n\/a|-)\s*(([A-Za-z]*\s[0-9]{4}-[0-9]{2}-[0-9]{2}\s[0-9]{2}:[0-9]{2}:[0-9]{2}\s[A-Z]*)|n\/a|-)\s*(([0-9A-Za-z\s]*\sago)|n\/a|-)\s*(.*?\.timer)\s*((.*?\.service)|\s*)'
)
# Pattern applied to the full text of `systemctl status <timer>`.
#   group 1 -> text after "Loaded: ...timer; " up to the next ';' (compared against 'enabled')
#   group 2 -> text after "Active: " up to " since" (first word compared against 'active')
#   group 3 -> text after "Trigger: " up to the next ';' (parsed as the next trigger time)
# This is a plain string, not a compiled pattern; it is passed to re.search() directly.
SYSTEMCTL_STATUS_RE = r'Loaded:\s.*\.timer;\s(.*?);.*?\)\s*Active:\s(.*?) since.*?\s*Trigger:\s(.*?);'
# Pydantic model holding the parsed state of a single systemd timer, as
# returned (inside a tuple) by get_next_elapse().
# NOTE(review): two field sets appear back to back below. This looks like the
# pre-change and post-change versions of the model shown together with the
# diff markers lost -- confirm which set is live before editing.
class TimerInfo(BaseModel):
# Field set populated with next=/left=/last=/passed=/unit=/activates= keywords.
next: Optional[datetime]
left: Optional[Union[float, None]]
last: Optional[str]
passed: Optional[Union[float, None]]
unit: Optional[str]
activates: Optional[str]
# Field set populated with next=/time_left=/last=/since_last=/enabled=/active=/status=/unit= keywords.
next: datetime
time_left: timedelta
last: Union[datetime, None]
since_last: Union[timedelta, None]
unit: str
enabled: bool
active: bool
status: Union[str, None]
def is_timer_running(timer_name):
def get_last_trigger(timer_name: str):
    """Return the last trigger time of ``timer_name`` from ``systemctl list-timers --all``.

    The first output line (the column header) is skipped. The first remaining
    row that contains ``timer_name`` as a whitespace-separated token is used.

    Args:
        timer_name: Exact unit name to look for, matched as a whole token.

    Returns:
        Whatever ``dateparser.parse`` yields for tokens 7..10 of the matching
        row, or ``None`` when the row has fewer than 14 tokens (treated as
        "timer has not run yet") or when no row mentions ``timer_name``.

    Raises:
        subprocess.CalledProcessError: if ``systemctl`` exits non-zero.
    """
    # The command is constant, so pass an argv list instead of going through a
    # shell; this also matches how the other systemctl calls in this file run.
    output = subprocess.check_output(["systemctl", "list-timers", "--all"]).decode('utf-8')
    for line in output.strip().split("\n")[1:]:
        fields = line.split()
        if timer_name not in fields:
            continue
        if len(fields) < 14:
            # Timer has not been run yet.
            return None
        # At least 14 tokens are present here, so indices 7..10 always exist
        # and no IndexError handling is needed.
        # NOTE(review): presumably tokens 7..10 cover the LAST column for the
        # targeted systemd output layout -- the positions shift if the LEFT
        # column has a different number of words; confirm against real output.
        return parse(' '.join(fields[7:11]))
    # No row mentioned the timer.
    return None
# Query `systemctl status <timer_name>` and derive the next trigger time, the
# time remaining until it, and the time relative to the last trigger.
# Annotated to return a (TimerInfo or None, error message or None) pair.
def get_next_elapse(timer_name: str) -> Tuple[TimerInfo | None, None | str]:
try:
output = subprocess.check_output(["systemctl", "status", timer_name], universal_newlines=True)
# NOTE(review): the next five lines return bare booleans, which does not match
# the Tuple annotation above. They look like the body of the removed
# is_timer_running() helper shown inline by the diff view -- confirm.
if "Active: active (running)" in output:
return True
return False
except subprocess.CalledProcessError as e:
return False
# Only parse when the unit name appears on the first line of the status output.
if timer_name in output.split('\n')[0]:
try:
parts = re.search(SYSTEMCTL_STATUS_RE, output)
# Group 3 is the text following "Trigger:" up to the first ';'.
next_trigger = parse(parts.group(3))
# "now" takes next_trigger's tzinfo so the subtraction below uses matching awareness.
now = datetime.now(tz=next_trigger.tzinfo)
time_left = next_trigger - now
last_trigger = get_last_trigger(timer_name)
if last_trigger is not None:
# NOTE(review): last_trigger - now is negative whenever last_trigger lies in
# the past; an elapsed duration would be now - last_trigger -- confirm intent.
since_last = last_trigger - now
else:
since_last = None
# Scan `systemctl list-timers --all` for the row containing timer_name and
# parse it with SYSTEMCTL_TIMERS_RE.
# Returns (TimerInfo, None) on success, or (None, message) when the timer is
# not listed, a duration cannot be parsed, or systemctl fails.
# NOTE(review): lines from two revisions of this function appear interleaved
# here (the list-timers parser and a systemctl-status based return); confirm
# which lines are live before changing any logic.
def get_next_elapse(timer_name) -> Tuple[TimerInfo | None, None | str]:
try:
output = subprocess.check_output(["systemctl", "list-timers", "--all"], universal_newlines=True)
lines = output.split('\n')
for line in lines:
# Substring match on the raw row, not a whole-token match.
if timer_name in line:
try:
parts = re.search(SYSTEMCTL_TIMERS_RE, line)
# NOTE(review): since_last is never assigned in this function as shown; this
# looks like a leftover debug print, and it would write to the check's stdout.
print(since_last)
datetime_object = None
# Group 2: next trigger timestamp; unmatched when the column is n/a or "-".
if parts.group(2):
datetime_object = datetime.strptime(parts.group(2), '%a %Y-%m-%d %H:%M:%S %Z')
time_left: float | None = None
# Group 4: "time left" text.
if parts.group(4):
try:
# presumably converts systemctl's duration text to seconds -- helper is defined elsewhere
time_left = parse_systemctl_time_delta(parts.group(4))
except Exception as tl_err:
# Only an unparseable timespan is reported as an error result; anything else propagates.
if isinstance(tl_err, humanfriendly.InvalidTimespan):
return None, f'Invalid Timespan: "{parts.group(4)}"'
else:
raise
time_passed: float | None = None
# Group 9: "time passed" text ending in "ago".
if parts.group(9):
try:
time_passed = parse_systemctl_time_delta(parts.group(9))
except Exception as tp_err:
if isinstance(tp_err, humanfriendly.InvalidTimespan):
return None, f'Invalid Timespan: "{parts.group(9)}"'
else:
raise
# Group 7 (last trigger) is stored as the raw matched string, not parsed.
timer_info = TimerInfo(
next=datetime_object,
left=time_left,
last=parts.group(7),
passed=time_passed,
unit=parts.group(10),
activates=parts.group(12)
)
return timer_info, None
except Exception:
# Any other parsing failure: dump the raw output and traceback, then exit UNKNOWN.
print(output)
traceback.print_exc()
sys.exit(nagios.STATE_UNKNOWN)
# NOTE(review): next_trigger, time_left (as a timedelta), last_trigger and
# since_last are not defined by the code above in this function; this return
# appears to belong to the systemctl-status based revision. Here "parts" would
# need to be a SYSTEMCTL_STATUS_RE match: group 1 is compared to 'enabled',
# the first word of group 2 to 'active', and status is the last word of
# group 2 with its parentheses stripped (None when that word has no '(').
return TimerInfo(
next=next_trigger,
time_left=time_left,
last=last_trigger,
since_last=since_last,
enabled=parts.group(1).lower() == 'enabled',
active=parts.group(2).split(' ')[0].lower() == 'active',
status=parts.group(2).split(' ')[-1].lower().strip('(').strip(')') if '(' in parts.group(2).split(' ')[-1] else None,
unit=timer_name,
), None
except Exception:
print(output)
traceback.print_exc()
sys.exit(nagios.STATE_UNKNOWN)
return None, 'Timer not found'
except subprocess.CalledProcessError as e:
# NOTE(review): two consecutive returns; as written the second is unreachable.
return None, f'systemctl list-timers failed: {e}'
return None, f'systemctl status failed: {e}'
def check_timer(timer_name: str, expected_interval: int = None):
@ -105,7 +97,6 @@ def check_timer(timer_name: str, expected_interval: int = None):
timer_unit = system_bus.get_object('org.freedesktop.systemd1', timer_unit_path)
timer_properties = dbus.Interface(timer_unit, 'org.freedesktop.DBus.Properties')
active_state = timer_properties.Get('org.freedesktop.systemd1.Unit', 'ActiveState')
running_state = is_timer_running(timer_name)
if active_state == 'active':
next_elapse, err = get_next_elapse(timer_name)
@ -115,45 +106,34 @@ def check_timer(timer_name: str, expected_interval: int = None):
# if (next_elapse.left is not None and next_elapse.passed is not None) and (next_elapse.left < 0 or next_elapse.passed < 0):
# quit_check(f'Timer is negative??? Left: {next_elapse["left"]}. Passed: {next_elapse["passed"]}', nagios.STATE_UNKNOWN)
if next_elapse.next:
next_elapse_human = next_elapse.next.replace(tzinfo=tz.tzlocal()).strftime('%a %Y-%m-%d %H:%M %Z')
else:
next_elapse_human = 'N/A'
if next_elapse.left is not None:
remaining_time_human = humanfriendly.format_timespan(next_elapse.left)
else:
remaining_time_human = 'N/A'
if next_elapse.passed is not None:
passed_time_human = humanfriendly.format_timespan(next_elapse.passed)
else:
passed_time_human = 'N/A'
next_elapse_human = next_elapse.next.replace(tzinfo=tz.tzlocal()).strftime('%a %Y-%m-%d %H:%M %Z')
remaining_time_human = humanfriendly.format_timespan(next_elapse.time_left)
since_last_human = humanfriendly.format_timespan(next_elapse.since_last) if next_elapse.since_last else 'N/A'
perfdata_dict = {
'remaining_time': {
'value': int(next_elapse.left) if next_elapse.left is not None else 0,
'value': next_elapse.time_left.seconds,
'unit': 's',
'min': 0
},
'time_since_last': {
'value': int(next_elapse.passed) if next_elapse.passed is not None else 0,
'value': next_elapse.since_last.seconds if next_elapse.since_last is not None else 0,
'unit': 's',
'min': 0
}
}
timer_info = f'Next trigger time: {next_elapse_human}. Time until next trigger: {remaining_time_human}. Time since last trigger: {passed_time_human}.'
timer_info = f'Next trigger time: {next_elapse_human}. Time until next trigger: {remaining_time_human}. Time since last trigger: {since_last_human}.'
if expected_interval is not None and next_elapse.next and next_elapse.last:
next_trigger_time = next_elapse.next
last_trigger_time = datetime.strptime(next_elapse.last, '%a %Y-%m-%d %H:%M:%S %Z')
actual_interval = next_trigger_time - last_trigger_time
actual_interval = next_trigger_time - next_elapse.last
actual_interval_seconds = actual_interval.total_seconds()
if actual_interval_seconds > expected_interval:
quit_check(f'{timer_name} is active but the last trigger was more than the expected interval ago. {timer_info}', nagios.STATE_CRIT, perfdata_dict)
quit_check(f'{timer_name} is {"active" if not running_state else "active (running)"}. {timer_info}', nagios.STATE_OK, perfdata_dict)
quit_check(f'{timer_name}{" is active" + (" (" + next_elapse.status + ")" if next_elapse else "") + "." if next_elapse.active else " -> "} {timer_info}', nagios.STATE_OK, perfdata_dict)
else:
quit_check(f'{timer_name} is not enabled', nagios.STATE_CRIT)
except dbus.exceptions.DBusException: