Watchdog support, initial version. (#1030)

* Watchdog support, initial version.
The watchdog monitors three things:
1. A set of important system daemons.
2. A set of pingable ip addresses.
3. A time at which the node should reboot every day.
This commit is contained in:
Tim Wilkinson 2023-12-18 23:07:52 -08:00 committed by GitHub
parent 8f5e87eb11
commit cd4b99197a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 233 additions and 2 deletions

View File

@ -43,6 +43,7 @@ require("iwinfo")
require("aredn.hardware")
require("aredn.log")
require("luci.jsonc")
require("ubus")
-- aggressive gc on low memory devices
if nixio.sysinfo().totalram < 32 * 1024 * 1024 then

View File

@ -0,0 +1,197 @@
--[[
Part of AREDN -- Used for creating Amateur Radio Emergency Data Networks
Copyright (C) 2023 Tim Wilkinson
See Contributors file for additional contributors
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation version 3 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
Additional Terms:
Additional use restrictions exist on the AREDN(TM) trademark and logo.
See AREDNLicense.txt for more info.
Attributions to the AREDN Project must be retained in the source code.
If importing this code into a new or existing project attribution
to the AREDN project must be added to the source code.
You must not misrepresent the origin of the material contained within.
Modified versions must be modified to attribute to the original source
and be marked in reasonable ways as differentiate it from the original
version
--]]
-- External programs the watchdog shells out to
local PING = "/bin/ping"
local PIDOF = "/bin/pidof"
local REBOOT = "/sbin/reboot"

-- Module table holding the watchdog functions
local W = {}

-- Limits for each numeric watchdog option.
-- Every entry is { minimum, default, maximum }; W.get_config() falls back to
-- the default when the option is unset and clamps it to the other two.
-- A `daily` value of -1 leaves the daily reboot switched off.
local config_limits = {
    startup_delay = { 600, 600, 3600 },
    ping_count = { 1, 3, 10 },
    ping_timeout = { 1, 5, 10 },
    tick = { 60, 60, 600 },
    failures = { 2, 3, 25 },
    daily = { -1, -1, 23 }
}

-- Daemons monitored when the configuration does not name any.
-- vtund is appended only when a tunnel server host or a tunnel client name
-- is present in the vtun configuration.
local default_daemons = "olsrd dnsmasq telnetd dropbear uhttpd"
do
    local vtun = uci.cursor()
    local tunnel_configured = vtun:get("vtun", "server_0", "host") or vtun:get("vtun", "client_0", "name")
    if tunnel_configured then
        default_daemons = default_daemons .. " vtund"
    end
end
--- Load and validate the watchdog settings from the `aredn.@watchdog[0]` UCI section.
--
-- @treturn table|nil nil when the `enable` option is not "1"; otherwise a
--   table with:
--   * `ping_addresses` - array of dotted-quad address strings
--   * `daemons`        - array of daemon names (configured, or the defaults)
--   * one integer field per key of `config_limits`, using the default when
--     the option is unset or not numeric and clamped to [minimum, maximum]
function W.get_config()
    local c = uci.cursor()
    if c:get("aredn", "@watchdog[0]", "enable") ~= "1" then
        return nil
    end

    -- Only dotted-quad addresses are accepted; these strings are later
    -- interpolated into a shell command line, so anything else is rejected.
    local ping_addresses = {}
    local addresses = c:get("aredn", "@watchdog[0]", "ping_addresses") or ""
    for address in addresses:gmatch("(%S+)") do
        if address:match("^%d+%.%d+%.%d+%.%d+$") then
            mainlog:write("pinging " .. address)
            ping_addresses[#ping_addresses + 1] = address
        else
            mainlog:write("ignoring invalid ping address " .. address)
        end
    end

    -- Daemon names also end up on a shell command line, so restrict them to
    -- characters that cannot be interpreted by the shell (and no leading '-',
    -- which would be read as a command option).
    local daemons = {}
    local mydaemons = c:get("aredn", "@watchdog[0]", "daemons") or default_daemons
    for daemon in mydaemons:gmatch("(%S+)") do
        if daemon:match("^[%w_][%w_%.%-]*$") then
            mainlog:write("monitor " .. daemon)
            daemons[#daemons + 1] = daemon
        else
            mainlog:write("ignoring invalid daemon name " .. daemon)
        end
    end

    local config = {
        ping_addresses = ping_addresses,
        daemons = daemons
    }

    for k, v in pairs(config_limits) do
        local lo, default, hi = v[1], v[2], v[3]
        -- `or nil` collapses the lookup to exactly one value, so tonumber()
        -- never receives an extra return value as its `base` argument.
        local val = tonumber(c:get("aredn", "@watchdog[0]", k) or nil)
        if not val or val ~= val then
            -- Unset, not a number, or NaN: use the default
            config[k] = default
        else
            -- These values are used as counts, seconds and an hour of the day,
            -- so drop any fractional part before clamping to the allowed range.
            config[k] = math.max(lo, math.min(hi, math.floor(val)))
        end
    end

    return config
end
-- Returns true when `pidof` exits with status 0 for every name in `daemons`.
-- Stops at, and logs, the first daemon for which it does not.
local function watchdog_daemons_ok(daemons)
    for _, daemon in ipairs(daemons) do
        if os.execute(PIDOF .. " " .. daemon .. " > /dev/null ") ~= 0 then
            mainlog:write("pidof " .. daemon .. " failed")
            return false
        end
    end
    return true
end

-- Returns true when no ping addresses are configured, or when `ping` exits
-- with status 0 for at least one of them. Each address that fails is logged.
local function watchdog_pings_ok(config)
    if #config.ping_addresses == 0 then
        return true
    end
    for _, address in ipairs(config.ping_addresses) do
        if os.execute(PING .. " -c " .. config.ping_count .. " -A -q -W " .. config.ping_timeout .. " " .. address .. " > /dev/null 2>&1") == 0 then
            return true
        end
        mainlog:write("ping " .. address .. " failed")
    end
    return false
end

--- Watchdog task entry point.
--
-- Reads the configuration (exiting the task when the watchdog is disabled),
-- waits for the startup delay, takes over /dev/watchdog and then loops
-- forever. Each pass optionally performs the daily reboot, checks the
-- monitored daemons and ping addresses, and writes to /dev/watchdog only when
-- all checks passed. Does not return while the watchdog is running.
function W.start()
    local config = W.get_config()
    if not config then
        exit_app()
        return
    end

    -- Dont start monitoring too soon. Let the system settle down.
    wait_for_ticks(math.max(1, config.startup_delay - nixio.sysinfo().uptime))

    local ub = ubus.connect()
    if not ub then
        mainlog:write("Watchdog failed to start: Cannot connect to ubus\n")
        exit_app()
        return
    end

    -- Ask the ubus "system" watchdog to stop so this task can open the device
    -- itself (NOTE(review): presumably this makes procd release /dev/watchdog
    -- using the magic close -- confirm against procd).
    ub:call("system", "watchdog", { magicclose = true })
    ub:call("system", "watchdog", { stop = true })

    local wd = io.open("/dev/watchdog", "w")
    if not wd then
        mainlog:write("Watchdog failed to start: Cannot open /dev/watchdog\n")
        -- Hand the watchdog back to the system service
        ub:call("system", "watchdog", { stop = false })
        exit_app()
        return
    end

    -- Make sure we have enough tick time for any pings
    local total_ping_time = 30 + (config.ping_timeout + config.ping_count) * #config.ping_addresses
    if total_ping_time > config.tick then
        config.tick = math.ceil(total_ping_time / 60) * 60
        mainlog:write("adjusted tick to " .. config.tick)
    end

    -- The reboot timeout seem to be 3-5x the timeout value
    -- We make sure it's at least 5 minutes
    ub:call("system", "watchdog", { timeout = math.ceil(math.max(300, config.tick * config.failures) / 3) })

    -- The daily reboot is armed during the last three ticks of the hour before
    -- the configured one. config.tick is in seconds while os.date's `min` is
    -- in minutes, so convert before comparing.
    local arm_window_minutes = math.ceil(config.tick * 3 / 60)

    local daily_reboot_armed = false
    while true do
        local now = os.time()

        -- Reboot a device daily at a given time if configured. To avoid rebooting over and
        -- over we must have just seen the previous hour
        if config.daily ~= -1 then
            local time = os.date("*t")
            if time.min >= (60 - arm_window_minutes) and (time.hour + 1) % 24 == config.daily then
                daily_reboot_armed = true
            elseif daily_reboot_armed and time.hour == config.daily then
                mainlog:write("reboot")
                os.execute(REBOOT .. " >/dev/null 2>&1")
                daily_reboot_armed = false
            else
                daily_reboot_armed = false
            end
        end

        -- Daemons are checked first; pings are only attempted when they all run
        local success = watchdog_daemons_ok(config.daemons) and watchdog_pings_ok(config)

        if success then
            wd:write("V")
            -- Lua file handles are buffered; flush so the keepalive actually
            -- reaches the watchdog device on this pass.
            wd:flush()
        else
            mainlog:write("failed")
        end

        -- Sleep for the rest of the tick, allowing for the time the checks took
        wait_for_ticks(math.max(1, config.tick - (os.time() - now)))
    end
end
-- The module evaluates to the W.start function itself rather than the W table.
return W.start

View File

@ -62,7 +62,7 @@ function olsrd_restart()
end
end
function watchdog()
function olsrd_watchdog()
while true
do
wait_for_ticks(223)
@ -86,4 +86,4 @@ function watchdog()
end
end
return watchdog
return olsrd_watchdog

View File

@ -1063,6 +1063,9 @@ local config_special = {
lqm_enable = c:get("aredn", "@lqm[0]", "enable"),
tunnel_weight = c:get("aredn", "@tunnel[0]", "weight"),
supernode_enable = c:get("aredn", "@supernode[0]", "enable"),
watchdog_enable = c:get("aredn", "@watchdog[0]", "enable"),
watchdog_pings = c:get("aredn", "@watchdog[0]", "ping_addresses"),
watchdog_daily = c:get("aredn", "@watchdog[0]", "daily"),
wifi_mode_0 = c:get("wireless", "@wifi-iface[0]", "mode"),
wifi_mode_1 = c:get("wireless", "@wifi-iface[1]", "mode")
}
@ -1097,6 +1100,15 @@ do
if oc:get("aredn", "@supernode[0]", "enable") ~= config_special.supernode_enable then
changes.reboot = true
end
if oc:get("aredn", "@watchdog[0]", "enable") ~= config_special.watchdog_enable then
changes.reboot = true
end
if oc:get("aredn", "@watchdog[0]", "ping_addresses") ~= config_special.watchdog_pings then
changes.manager = true
end
if oc:get("aredn", "@watchdog[0]", "daily") ~= config_special.watchdog_daily then
changes.manager = true
end
elseif file == "network" then
changes.network = true
elseif file == "dhcp" then

View File

@ -247,6 +247,27 @@ local settings = {
desc = "<b>WAN-Only Tunnel</b> prevents tunnel traffic from being routed over the Mesh network itself<br><br><small>aredn.@tunnel[0].wanonly</small>",
default = "1"
},
{
category = "Watchdog",
key = "aredn.@watchdog[0].enable",
type = "boolean",
desc = "<b>The Watchdog</b> will reboot the node if it stops operating correctly<br><br><small>aredn.@watchdog[0].enable</small>",
default = "0"
},
{
    -- Setting entry for the watchdog ping address list
    category = "Watchdog",
    key = "aredn.@watchdog[0].ping_addresses",
    type = "string",
    desc = "<b>Watchdog IP addresses</b> is a whitespace separated list of IP addresses, one of which should always be pingable<br><br><small>aredn.@watchdog[0].ping_addresses</small>",
    default = ""
},
{
category = "Watchdog",
key = "aredn.@watchdog[0].daily",
type = "string",
desc = "<b>Daily Watchdog hour</b> is the hour every day (0-23) to automatically reboot the node<br><br><small>aredn.@watchdog[0].daily</small>",
default = ""
},
{
category = "Memory Settings",
key = "aredn.@meshstatus[0].lowmem",