Olsrd crash fixes (#1234)

* Simplify the OLSR watchdog

* Don't pull the routing table into LQM for supernodes.
LQM tracks routes on nodes to help keep leaf nodes connected even
when circumstances would probably prevent this. However, on supernodes
the routing table is massive, and pulling it into LQM will frequently
crash OLSRD. As we don't need this for supernodes, just don't do it for them.
This commit is contained in:
Tim Wilkinson 2024-05-31 22:44:36 -07:00 committed by GitHub
parent 192e6deaec
commit 0328f0ec7e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 23 additions and 49 deletions

View File

@ -247,6 +247,7 @@ end
local myhostname = canonical_hostname(aredn.info.get_nvram("node") or "localnode")
local myip = uci.cursor():get("network", "wifi", "ipaddr")
local is_supernode = uci.cursor():get("aredn", "@supernode[0]", "enable") == "1"
local wgsupport = nixio.fs.stat("/usr/bin/wg")
@ -803,17 +804,21 @@ function lqm()
--
-- Pull in the routing table to see how many node routes are associated with each tracker.
-- We don't do this if this is a supernode because the routes table is massive and pulling it
-- in can crash olsrd.
--
total_node_route_count = 0
for _, route in ipairs(aredn.olsr.getOLSRRoutes())
do
-- Count routes to nodes. There are two routes to most nodes, the node's primary address
-- and the node's dtdlink address.
if route.genmask == 32 and route.destination:match("^10%.") then
local track = ip2tracker[route.gateway];
if track then
track.node_route_count = track.node_route_count + 1
total_node_route_count = total_node_route_count + 1
if not is_supernode then
for _, route in ipairs(aredn.olsr.getOLSRRoutes())
do
-- Count routes to nodes. There are two routes to most nodes, the node's primary address
-- and the node's dtdlink address.
if route.genmask == 32 and route.destination:match("^10%.") then
local track = ip2tracker[route.gateway];
if track then
track.node_route_count = track.node_route_count + 1
total_node_route_count = total_node_route_count + 1
end
end
end
end

View File

@ -36,53 +36,22 @@
--]]
local watchdogfile = "/tmp/olsrd.watchdog"
local pidfile = "/var/run/olsrd.pid"
local logfile = "/tmp/olsrd.log"
--- Restart the OLSR daemon and note the restart in the log file.
-- Runs the olsrd init script with "restart". Afterwards, if nixio.fs.stat
-- reports the log file, a line made of the current uptime and date is
-- appended and the file is rewritten; once the log has grown past 300 lines
-- only its tail is written back.
function olsrd_restart()
    -- print "olsrd_restart"
    os.execute("/etc/init.d/olsrd restart")

    -- Nothing to update when the log file is not there.
    if not nixio.fs.stat(logfile) then
        return
    end

    local entries = read_all(logfile):splitNewLine()
    entries[#entries + 1] = secondsToClock(nixio.sysinfo().uptime) .. " " .. os.date()

    -- Past 300 lines, skip the oldest entries and keep only the end of the log.
    local total = #entries
    local first = total > 300 and (total - 275) or 1

    local out = io.open(logfile, "w")
    if not out then
        -- Best effort: if the log cannot be opened for writing, leave it alone.
        return
    end
    for idx = first, total
    do
        out:write(entries[idx] .. "\n")
    end
    out:close()
end
local sleeptime = 3 * 60 -- 3 minutes
local timeout = 10 * 60 -- 10 minutes
function olsrd_watchdog()
while true
do
wait_for_ticks(223)
local pid = read_all(pidfile)
if pid and nixio.fs.stat("/proc/" .. pid) then
if nixio.fs.stat(watchdogfile) then
wait_for_ticks(sleeptime)
if nixio.fs.stat(watchdogfile) then
local watchtime = tonumber(read_all(watchdogfile))
-- If watchtime hasn't been updated recently then we restart OLSRD
if watchtime + timeout < os.time() then
nixio.syslog("err", "olsrd watchdog timeout - restarting")
os.remove(watchdogfile)
else
olsrd_restart()
end
else
local pids = capture("pidof olsrd"):splitWhiteSpace()
if #pids == 1 then
write_all(pidfile, pids[1]);
elseif #pids == 0 then
olsrd_restart()
os.execute("/etc/init.d/olsrd restart")
end
end
end
end