Olsrd crash fixes (#1234)

* Simplify the OLSR watchdog

* Don't pull the routing table into LQM for supernodes.
LQM tracks routes on nodes to help keep leaf nodes connected even
when circumstances would probably prevent this. However, on supernodes
the routing table is massive and pulling it into LQM will frequently
crash OLSRD. As we don't need this for supernodes, just don't do it for them.
This commit is contained in:
Tim Wilkinson 2024-05-31 22:44:36 -07:00 committed by GitHub
parent 192e6deaec
commit 0328f0ec7e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 23 additions and 49 deletions

View File

@ -247,6 +247,7 @@ end
local myhostname = canonical_hostname(aredn.info.get_nvram("node") or "localnode")
local myip = uci.cursor():get("network", "wifi", "ipaddr")
local is_supernode = uci.cursor():get("aredn", "@supernode[0]", "enable") == "1"
local wgsupport = nixio.fs.stat("/usr/bin/wg")
@ -803,8 +804,11 @@ function lqm()
--
-- Pull in the routing table to see how many node routes are associated with each tracker.
-- We don't do this if this is a supernode because the routes table is massive and can
-- crash olsrd.
--
total_node_route_count = 0
if not is_supernode then
for _, route in ipairs(aredn.olsr.getOLSRRoutes())
do
-- Count routes to nodes. There are two routes to most nodes, the node's primary address
@ -817,6 +821,7 @@ function lqm()
end
end
end
end
--
-- At this point we have gathered all the data we need to determine which links are best to use and

View File

@ -36,53 +36,22 @@
--]]
local watchdogfile = "/tmp/olsrd.watchdog"
local pidfile = "/var/run/olsrd.pid"
local logfile = "/tmp/olsrd.log"
--
-- Restart olsrd through its init script and, if the log file named by
-- `logfile` already exists, append a timestamped entry to it.
-- The log is rewritten in full each time; once it holds more than 300 lines
-- (counting the new entry) only the most recent 276 are written back.
-- Takes no arguments and returns nothing.
--
function olsrd_restart()
-- print "olsrd_restart"
os.execute("/etc/init.d/olsrd restart")
-- The log is only updated when the stat call on it succeeds; nothing below
-- runs (and no file is created) otherwise.
if nixio.fs.stat(logfile) then
-- NOTE(review): read_all and splitNewLine are helpers defined elsewhere;
-- presumably this yields the log's existing lines as an array - confirm.
local lines = read_all(logfile):splitNewLine()
-- New entry: system uptime passed through secondsToClock, a space, then the
-- current date/time string from os.date().
lines[#lines + 1] = secondsToClock(nixio.sysinfo().uptime) .. " " .. os.date()
-- By default every line is written back.
local start = 1
if #lines > 300 then
-- Trim: writing from this index through #lines keeps 276 lines.
start = #lines - 275
end
-- Mode "w" truncates the file, so the retained lines replace the old content.
local f = io.open(logfile, "w")
-- If the file cannot be opened for writing, the update is skipped silently.
if f then
for i = start, #lines
do
f:write(lines[i] .. "\n")
end
f:close()
end
end
end
local sleeptime = 3 * 60 -- 3 minutes
local timeout = 10 * 60 -- 10 minutes
function olsrd_watchdog()
while true
do
wait_for_ticks(223)
local pid = read_all(pidfile)
if pid and nixio.fs.stat("/proc/" .. pid) then
wait_for_ticks(sleeptime)
if nixio.fs.stat(watchdogfile) then
local watchtime = tonumber(read_all(watchdogfile))
-- If watchtime hasn't updated recently then we restart OLSRD
if watchtime + timeout < os.time() then
nixio.syslog("err", "olsrd watchdog timeout - restarting")
os.remove(watchdogfile)
else
olsrd_restart()
end
else
local pids = capture("pidof olsrd"):splitWhiteSpace()
if #pids == 1 then
write_all(pidfile, pids[1]);
elseif #pids == 0 then
olsrd_restart()
os.execute("/etc/init.d/olsrd restart")
end
end
end
end