mirror of https://github.com/aredn/aredn.git
Simplify the watchdog by fixing various timeouts. (#1038)
Unfortunately, there doesn't appear to be much flexibility in the various hardware watchdogs on radios, so setting the watchdog timeout to more than 60 seconds mostly doesn't work. Rework the settings to allow for this, and to reflect that our watchdog tests must therefore be frequent and quick.
This commit is contained in:
parent
2fe3904f0a
commit
e5bd9bc066
|
@ -54,12 +54,12 @@ function wait_for_ticks(ticks)
|
|||
local when = nixio.sysinfo().uptime + ticks
|
||||
while true
|
||||
do
|
||||
ticks = when - nixio.sysinfo().uptime
|
||||
if ticks > 0 then
|
||||
if ticks >= 0 then
|
||||
coroutine.yield(ticks)
|
||||
else
|
||||
break
|
||||
end
|
||||
ticks = when - nixio.sysinfo().uptime
|
||||
end
|
||||
end
|
||||
|
||||
|
|
|
@ -39,15 +39,9 @@ local REBOOT = "/sbin/reboot"
|
|||
|
||||
local W = {}
|
||||
|
||||
-- Configuration limits and defaults
|
||||
local config_limits = {
|
||||
startup_delay = { 600, 600, 3600 },
|
||||
ping_count = { 1, 3, 10 },
|
||||
ping_timeout = { 1, 5, 10 },
|
||||
tick = { 60, 120, 600 },
|
||||
failures = { 2, 5, 25 },
|
||||
daily = { -1, -1, 23 }
|
||||
}
|
||||
local tick = 20
|
||||
local ping_timeout = 3
|
||||
local startup_delay = 600
|
||||
|
||||
-- Set of daemons to monitor
|
||||
local default_daemons = "olsrd dnsmasq telnetd dropbear uhttpd"
|
||||
|
@ -68,6 +62,7 @@ function W.get_config(verbose)
|
|||
ping_addresses[#ping_addresses + 1] = address
|
||||
end
|
||||
end
|
||||
|
||||
local daemons = {}
|
||||
local mydaemons = c:get("aredn", "@watchdog[0]", "daemons") or default_daemons
|
||||
for daemon in mydaemons:gmatch("(%S+)") do
|
||||
|
@ -76,34 +71,14 @@ function W.get_config(verbose)
|
|||
end
|
||||
daemons[#daemons + 1] = daemon
|
||||
end
|
||||
local config = {
|
||||
|
||||
local daily = tonumber(c:get("aredn", "@watchdog[0]", "daily") or nil) or -1
|
||||
|
||||
return {
|
||||
ping_addresses = ping_addresses,
|
||||
daemons = daemons
|
||||
daemons = daemons,
|
||||
daily = daily
|
||||
}
|
||||
for k, v in pairs(config_limits)
|
||||
do
|
||||
local val = tonumber(c:get("aredn", "@watchdog[0]", k) or nil)
|
||||
if not val then
|
||||
config[k] = v[2]
|
||||
elseif val < v[1] then
|
||||
config[k] = v[1]
|
||||
elseif val > v[3] then
|
||||
config[k] = v[3]
|
||||
else
|
||||
config[k] = val
|
||||
end
|
||||
end
|
||||
|
||||
-- Make sure we have enough tick time for any pings
|
||||
local total_ping_time = 30 + (config.ping_timeout + config.ping_count) * #config.ping_addresses
|
||||
if total_ping_time > config.tick then
|
||||
config.tick = math.ceil(total_ping_time / 60) * 60
|
||||
if verbose then
|
||||
mainlog:write("adjusting tick to " .. config.tick)
|
||||
end
|
||||
end
|
||||
|
||||
return config
|
||||
end
|
||||
|
||||
function W.start()
|
||||
|
@ -112,12 +87,15 @@ function W.start()
|
|||
return
|
||||
end
|
||||
|
||||
local ub = ubus.connect()
|
||||
local config = W.get_config(true)
|
||||
|
||||
-- Dont start monitoring too soon. Let the system settle down.
|
||||
wait_for_ticks(math.max(1, config.startup_delay - nixio.sysinfo().uptime))
|
||||
ub:call("system", "watchdog", { frequency = 1 })
|
||||
ub:call("system", "watchdog", { timeout = 60 })
|
||||
|
||||
-- Dont start monitoring too soon. Let the system settle down.
|
||||
wait_for_ticks(math.max(0, startup_delay - nixio.sysinfo().uptime))
|
||||
|
||||
local ub = ubus.connect()
|
||||
ub:call("system", "watchdog", { magicclose = true })
|
||||
ub:call("system", "watchdog", { stop = true })
|
||||
|
||||
|
@ -129,9 +107,6 @@ function W.start()
|
|||
return
|
||||
end
|
||||
|
||||
-- We make sure it's at least 5 minutes
|
||||
ub:call("system", "watchdog", { timeout = math.ceil(math.max(300, config.tick * config.failures)) })
|
||||
|
||||
local daily_reboot_armed = false
|
||||
|
||||
while true
|
||||
|
@ -146,7 +121,7 @@ function W.start()
|
|||
-- over we must have just seen the previous hour
|
||||
if config.daily ~= -1 then
|
||||
local time = os.date("*t")
|
||||
if time.min >= (60 - config.tick * 3) and (time.hour + 1) % 24 == config.daily then
|
||||
if time.min >= 55 and (time.hour + 1) % 24 == config.daily then
|
||||
daily_reboot_armed = true
|
||||
elseif daily_reboot_armed and time.hour == config.daily then
|
||||
mainlog:write("reboot")
|
||||
|
@ -177,7 +152,7 @@ function W.start()
|
|||
success = false
|
||||
for _, address in ipairs(config.ping_addresses)
|
||||
do
|
||||
if os.execute(PING .. " -c " .. config.ping_count .. " -A -q -W " .. config.ping_timeout .. " " .. address .. " > /dev/null 2>&1") == 0 then
|
||||
if os.execute(PING .. " -c 1 -A -q -W " .. ping_timeout .. " " .. address .. " > /dev/null 2>&1") == 0 then
|
||||
success = true
|
||||
break
|
||||
else
|
||||
|
@ -191,12 +166,13 @@ function W.start()
|
|||
|
||||
end
|
||||
if success then
|
||||
wd:write("V")
|
||||
wd:write("1")
|
||||
wd:flush()
|
||||
else
|
||||
mainlog:write("failed")
|
||||
end
|
||||
|
||||
wait_for_ticks(math.max(1, config.tick - (os.time() - now)))
|
||||
wait_for_ticks(math.max(0, tick - (os.time() - now)))
|
||||
end
|
||||
end
|
||||
|
||||
|
|
|
@ -228,7 +228,7 @@ function lqm()
|
|||
end
|
||||
|
||||
-- Let things startup for a while before we begin
|
||||
wait_for_ticks(math.max(1, 30 - nixio.sysinfo().uptime))
|
||||
wait_for_ticks(math.max(0, 30 - nixio.sysinfo().uptime))
|
||||
|
||||
-- Create filters (cannot create during install as they disappear on reboot)
|
||||
os.execute(NFT .. " flush chain ip fw4 input_lqm 2> /dev/null")
|
||||
|
@ -599,6 +599,7 @@ function lqm()
|
|||
local raw = io.popen("/usr/bin/curl --retry 0 --connect-timeout " .. connect_timeout .. " --speed-time " .. speed_time .. " --speed-limit " .. speed_limit .. " -s \"http://" .. track.ip .. ":8080/cgi-bin/sysinfo.json?link_info=1&lqm=1\" -o - 2> /dev/null")
|
||||
local info = luci.jsonc.parse(raw:read("*a"))
|
||||
raw:close()
|
||||
wait_for_ticks(0)
|
||||
if info then
|
||||
rflinks[track.mac] = nil
|
||||
if tonumber(info.lat) and tonumber(info.lon) then
|
||||
|
@ -716,6 +717,7 @@ function lqm()
|
|||
end
|
||||
ptime = socket.gettime(0) - pstart
|
||||
sigsock:close()
|
||||
wait_for_ticks(0)
|
||||
|
||||
local ping_loss_run_avg = 1 - config.ping_penalty / 100
|
||||
if success > 0 then
|
||||
|
|
Loading…
Reference in New Issue