redo check_systemd_timer again
This commit is contained in:
parent
c2d36b06db
commit
afacf15c50
|
@ -4,93 +4,85 @@ import re
|
|||
import subprocess
|
||||
import sys
|
||||
import traceback
|
||||
from datetime import datetime
|
||||
from typing import Optional, Tuple, Union
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Tuple, Union
|
||||
|
||||
import humanfriendly
|
||||
from dateparser import parse
|
||||
from dateutil import tz
|
||||
from pydantic import BaseModel
|
||||
|
||||
from checker import nagios
|
||||
from checker.humanfriendly import parse_systemctl_time_delta
|
||||
from checker.result import quit_check
|
||||
|
||||
sys.path.insert(0, "/usr/lib/python3/dist-packages")
|
||||
import dbus
|
||||
|
||||
SYSTEMCTL_TIMERS_RE = re.compile(
|
||||
r'^(([A-Za-z]*\s[0-9]{4}-[0-9]{2}-[0-9]{2}\s*[0-9]{2}:[0-9]{2}:[0-9]{2}\s[A-Z]*)|n\/a|-)\s*((([0-9]*[a-z]*\s)*(?:left)?)|n\/a|-)\s*(([A-Za-z]*\s[0-9]{4}-[0-9]{2}-[0-9]{2}\s[0-9]{2}:[0-9]{2}:[0-9]{2}\s[A-Z]*)|n\/a|-)\s*(([0-9A-Za-z\s]*\sago)|n\/a|-)\s*(.*?\.timer)\s*((.*?\.service)|\s*)'
|
||||
)
|
||||
SYSTEMCTL_STATUS_RE = r'Loaded:\s.*\.timer;\s(.*?);.*?\)\s*Active:\s(.*?) since.*?\s*Trigger:\s(.*?);'
|
||||
|
||||
|
||||
class TimerInfo(BaseModel):
|
||||
next: Optional[datetime]
|
||||
left: Optional[Union[float, None]]
|
||||
last: Optional[str]
|
||||
passed: Optional[Union[float, None]]
|
||||
unit: Optional[str]
|
||||
activates: Optional[str]
|
||||
next: datetime
|
||||
time_left: timedelta
|
||||
last: Union[datetime, None]
|
||||
since_last: Union[timedelta, None]
|
||||
unit: str
|
||||
enabled: bool
|
||||
active: bool
|
||||
status: Union[str, None]
|
||||
|
||||
|
||||
def is_timer_running(timer_name):
|
||||
def get_last_trigger(timer_name: str):
    """Return when ``timer_name`` last fired, read from ``systemctl list-timers --all``.

    The row whose whitespace-separated tokens contain ``timer_name`` is
    located, and the timestamp that precedes the "... ago" span on that row is
    handed to ``parse``.

    The timestamp is found relative to the ``ago`` token rather than at fixed
    token positions: the relative-time columns are made of a variable number of
    tokens (e.g. "5h left" vs "5h 30min left"), so fixed positions only line up
    for some rows.
    NOTE(review): assumes each timestamp is printed as four tokens,
    "<weekday> <YYYY-MM-DD> <HH:MM:SS> <TZ>" -- confirm against the systemd
    versions in use.

    :param timer_name: unit name exactly as it appears in the listing.
    :return: whatever ``parse`` returns for the last-trigger timestamp, or
        ``None`` when the timer is not listed, has no "ago" span (it has not
        run yet), or no complete timestamp precedes that span.
    :raises subprocess.CalledProcessError: if ``systemctl`` exits non-zero.
    """
    # An argument list is enough for a fixed command line; no shell is needed.
    output = subprocess.check_output(["systemctl", "list-timers", "--all"]).decode('utf-8')

    # The first line of the listing is skipped (column header).
    for line in output.strip().split("\n")[1:]:
        fields = line.split()
        if timer_name not in fields:
            continue

        if "ago" not in fields:
            # Timer has not been run yet.
            return None

        ago_index = fields.index("ago")
        # The last date-shaped token before "ago" belongs to the last-trigger
        # timestamp; an earlier one (if any) belongs to the next-trigger one.
        date_indexes = [
            index
            for index, token in enumerate(fields[:ago_index])
            if re.fullmatch(r"\d{4}-\d{2}-\d{2}", token)
        ]
        if not date_indexes:
            return None

        date_index = date_indexes[-1]
        # Need a weekday token before the date and time + zone tokens after it,
        # all of them ahead of the "ago" span.
        if date_index == 0 or date_index + 2 >= ago_index:
            return None

        start = date_index - 1
        return parse(" ".join(fields[start:start + 4]))

    # No row mentions this timer.
    return None
|
||||
|
||||
|
||||
def get_next_elapse(timer_name: str) -> Tuple[TimerInfo | None, None | str]:
|
||||
try:
|
||||
output = subprocess.check_output(["systemctl", "status", timer_name], universal_newlines=True)
|
||||
if "Active: active (running)" in output:
|
||||
return True
|
||||
return False
|
||||
except subprocess.CalledProcessError as e:
|
||||
return False
|
||||
if timer_name in output.split('\n')[0]:
|
||||
try:
|
||||
parts = re.search(SYSTEMCTL_STATUS_RE, output)
|
||||
|
||||
next_trigger = parse(parts.group(3))
|
||||
now = datetime.now(tz=next_trigger.tzinfo)
|
||||
time_left = next_trigger - now
|
||||
last_trigger = get_last_trigger(timer_name)
|
||||
if last_trigger is not None:
|
||||
since_last = last_trigger - now
|
||||
else:
|
||||
since_last = None
|
||||
|
||||
def get_next_elapse(timer_name) -> Tuple[TimerInfo | None, None | str]:
|
||||
try:
|
||||
output = subprocess.check_output(["systemctl", "list-timers", "--all"], universal_newlines=True)
|
||||
lines = output.split('\n')
|
||||
for line in lines:
|
||||
if timer_name in line:
|
||||
try:
|
||||
parts = re.search(SYSTEMCTL_TIMERS_RE, line)
|
||||
print(since_last)
|
||||
|
||||
datetime_object = None
|
||||
if parts.group(2):
|
||||
datetime_object = datetime.strptime(parts.group(2), '%a %Y-%m-%d %H:%M:%S %Z')
|
||||
|
||||
time_left: float | None = None
|
||||
if parts.group(4):
|
||||
try:
|
||||
time_left = parse_systemctl_time_delta(parts.group(4))
|
||||
except Exception as tl_err:
|
||||
if isinstance(tl_err, humanfriendly.InvalidTimespan):
|
||||
return None, f'Invalid Timespan: "{parts.group(4)}"'
|
||||
else:
|
||||
raise
|
||||
|
||||
time_passed: float | None = None
|
||||
if parts.group(9):
|
||||
try:
|
||||
time_passed = parse_systemctl_time_delta(parts.group(9))
|
||||
except Exception as tp_err:
|
||||
if isinstance(tp_err, humanfriendly.InvalidTimespan):
|
||||
return None, f'Invalid Timespan: "{parts.group(9)}"'
|
||||
else:
|
||||
raise
|
||||
|
||||
timer_info = TimerInfo(
|
||||
next=datetime_object,
|
||||
left=time_left,
|
||||
last=parts.group(7),
|
||||
passed=time_passed,
|
||||
unit=parts.group(10),
|
||||
activates=parts.group(12)
|
||||
)
|
||||
return timer_info, None
|
||||
except Exception:
|
||||
print(output)
|
||||
traceback.print_exc()
|
||||
sys.exit(nagios.STATE_UNKNOWN)
|
||||
return TimerInfo(
|
||||
next=next_trigger,
|
||||
time_left=time_left,
|
||||
last=last_trigger,
|
||||
since_last=since_last,
|
||||
enabled=parts.group(1).lower() == 'enabled',
|
||||
active=parts.group(2).split(' ')[0].lower() == 'active',
|
||||
status=parts.group(2).split(' ')[-1].lower().strip('(').strip(')') if '(' in parts.group(2).split(' ')[-1] else None,
|
||||
unit=timer_name,
|
||||
), None
|
||||
except Exception:
|
||||
print(output)
|
||||
traceback.print_exc()
|
||||
sys.exit(nagios.STATE_UNKNOWN)
|
||||
return None, 'Timer not found'
|
||||
except subprocess.CalledProcessError as e:
|
||||
return None, f'systemctl list-timers failed: {e}'
|
||||
return None, f'systemctl status failed: {e}'
|
||||
|
||||
|
||||
def check_timer(timer_name: str, expected_interval: int = None):
|
||||
|
@ -105,7 +97,6 @@ def check_timer(timer_name: str, expected_interval: int = None):
|
|||
timer_unit = system_bus.get_object('org.freedesktop.systemd1', timer_unit_path)
|
||||
timer_properties = dbus.Interface(timer_unit, 'org.freedesktop.DBus.Properties')
|
||||
active_state = timer_properties.Get('org.freedesktop.systemd1.Unit', 'ActiveState')
|
||||
running_state = is_timer_running(timer_name)
|
||||
|
||||
if active_state == 'active':
|
||||
next_elapse, err = get_next_elapse(timer_name)
|
||||
|
@ -115,45 +106,34 @@ def check_timer(timer_name: str, expected_interval: int = None):
|
|||
# if (next_elapse.left is not None and next_elapse.passed is not None) and (next_elapse.left < 0 or next_elapse.passed < 0):
|
||||
# quit_check(f'Timer is negative??? Left: {next_elapse["left"]}. Passed: {next_elapse["passed"]}', nagios.STATE_UNKNOWN)
|
||||
|
||||
if next_elapse.next:
|
||||
next_elapse_human = next_elapse.next.replace(tzinfo=tz.tzlocal()).strftime('%a %Y-%m-%d %H:%M %Z')
|
||||
else:
|
||||
next_elapse_human = 'N/A'
|
||||
|
||||
if next_elapse.left is not None:
|
||||
remaining_time_human = humanfriendly.format_timespan(next_elapse.left)
|
||||
else:
|
||||
remaining_time_human = 'N/A'
|
||||
if next_elapse.passed is not None:
|
||||
passed_time_human = humanfriendly.format_timespan(next_elapse.passed)
|
||||
else:
|
||||
passed_time_human = 'N/A'
|
||||
next_elapse_human = next_elapse.next.replace(tzinfo=tz.tzlocal()).strftime('%a %Y-%m-%d %H:%M %Z')
|
||||
remaining_time_human = humanfriendly.format_timespan(next_elapse.time_left)
|
||||
since_last_human = humanfriendly.format_timespan(next_elapse.since_last) if next_elapse.since_last else 'N/A'
|
||||
|
||||
perfdata_dict = {
|
||||
'remaining_time': {
|
||||
'value': int(next_elapse.left) if next_elapse.left is not None else 0,
|
||||
'value': next_elapse.time_left.seconds,
|
||||
'unit': 's',
|
||||
'min': 0
|
||||
},
|
||||
'time_since_last': {
|
||||
'value': int(next_elapse.passed) if next_elapse.passed is not None else 0,
|
||||
'value': next_elapse.since_last.seconds if next_elapse.since_last is not None else 0,
|
||||
'unit': 's',
|
||||
'min': 0
|
||||
}
|
||||
}
|
||||
|
||||
timer_info = f'Next trigger time: {next_elapse_human}. Time until next trigger: {remaining_time_human}. Time since last trigger: {passed_time_human}.'
|
||||
timer_info = f'Next trigger time: {next_elapse_human}. Time until next trigger: {remaining_time_human}. Time since last trigger: {since_last_human}.'
|
||||
|
||||
if expected_interval is not None and next_elapse.next and next_elapse.last:
|
||||
next_trigger_time = next_elapse.next
|
||||
last_trigger_time = datetime.strptime(next_elapse.last, '%a %Y-%m-%d %H:%M:%S %Z')
|
||||
actual_interval = next_trigger_time - last_trigger_time
|
||||
actual_interval = next_trigger_time - next_elapse.last
|
||||
actual_interval_seconds = actual_interval.total_seconds()
|
||||
|
||||
if actual_interval_seconds > expected_interval:
|
||||
quit_check(f'{timer_name} is active but the last trigger was more than the expected interval ago. {timer_info}', nagios.STATE_CRIT, perfdata_dict)
|
||||
|
||||
quit_check(f'{timer_name} is {"active" if not running_state else "active (running)"}. {timer_info}', nagios.STATE_OK, perfdata_dict)
|
||||
quit_check(f'{timer_name}{" is active" + (" (" + next_elapse.status + ")" if next_elapse else "") + "." if next_elapse.active else " -> "} {timer_info}', nagios.STATE_OK, perfdata_dict)
|
||||
else:
|
||||
quit_check(f'{timer_name} is not enabled', nagios.STATE_CRIT)
|
||||
except dbus.exceptions.DBusException:
|
||||
|
|
Loading…
Reference in New Issue