Olsrd crash fixes (#1234)

* Simplify the OLSR watchdog

* Don't pull the routing table into LQM for supernodes.
LQM tracks routes on nodes to help keep leaf nodes connected even
when circumstances would probably prevent this. However, on supernodes
the routing table is massive and pulling this into LQM will frequently
crash OLSRD. As we don't need this for supernodes, just don't do it for them.
This commit is contained in:
Tim Wilkinson 2024-05-31 22:44:36 -07:00 committed by GitHub
parent 192e6deaec
commit 0328f0ec7e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 23 additions and 49 deletions

View File

@ -247,6 +247,7 @@ end
local myhostname = canonical_hostname(aredn.info.get_nvram("node") or "localnode") local myhostname = canonical_hostname(aredn.info.get_nvram("node") or "localnode")
local myip = uci.cursor():get("network", "wifi", "ipaddr") local myip = uci.cursor():get("network", "wifi", "ipaddr")
local is_supernode = uci.cursor():get("aredn", "@supernode[0]", "enable") == "1"
local wgsupport = nixio.fs.stat("/usr/bin/wg") local wgsupport = nixio.fs.stat("/usr/bin/wg")
@ -803,17 +804,21 @@ function lqm()
-- --
-- Pull in the routing table to see how many node routes are associated with each tracker. -- Pull in the routing table to see how many node routes are associated with each tracker.
-- We don't do this if this is a supernode because the routes table is massive and can
-- crash olsrd.
-- --
total_node_route_count = 0 total_node_route_count = 0
for _, route in ipairs(aredn.olsr.getOLSRRoutes()) if not is_supernode then
do for _, route in ipairs(aredn.olsr.getOLSRRoutes())
-- Count routes to nodes. There are two routes to most nodes, the node's primary address do
-- and the node's dtdlink address. -- Count routes to nodes. There are two routes to most nodes, the node's primary address
if route.genmask == 32 and route.destination:match("^10%.") then -- and the node's dtdlink address.
local track = ip2tracker[route.gateway]; if route.genmask == 32 and route.destination:match("^10%.") then
if track then local track = ip2tracker[route.gateway];
track.node_route_count = track.node_route_count + 1 if track then
total_node_route_count = total_node_route_count + 1 track.node_route_count = track.node_route_count + 1
total_node_route_count = total_node_route_count + 1
end
end end
end end
end end

View File

@ -36,53 +36,22 @@
--]] --]]
local watchdogfile = "/tmp/olsrd.watchdog" local watchdogfile = "/tmp/olsrd.watchdog"
local pidfile = "/var/run/olsrd.pid" local sleeptime = 3 * 60 -- 3 minutes
local logfile = "/tmp/olsrd.log" local timeout = 10 * 60 -- 10 minutes
function olsrd_restart()
-- print "olsrd_restart"
os.execute("/etc/init.d/olsrd restart")
if nixio.fs.stat(logfile) then
local lines = read_all(logfile):splitNewLine()
lines[#lines + 1] = secondsToClock(nixio.sysinfo().uptime) .. " " .. os.date()
local start = 1
if #lines > 300 then
start = #lines - 275
end
local f = io.open(logfile, "w")
if f then
for i = start, #lines
do
f:write(lines[i] .. "\n")
end
f:close()
end
end
end
function olsrd_watchdog() function olsrd_watchdog()
while true while true
do do
wait_for_ticks(223) wait_for_ticks(sleeptime)
if nixio.fs.stat(watchdogfile) then
local pid = read_all(pidfile) local watchtime = tonumber(read_all(watchdogfile))
if pid and nixio.fs.stat("/proc/" .. pid) then -- If watchtime hasn't updated recently then we restart OLSRD
if nixio.fs.stat(watchdogfile) then if watchtime + timeout < os.time() then
nixio.syslog("err", "olsrd watchdog timeout - restarting")
os.remove(watchdogfile) os.remove(watchdogfile)
else os.execute("/etc/init.d/olsrd restart")
olsrd_restart()
end
else
local pids = capture("pidof olsrd"):splitWhiteSpace()
if #pids == 1 then
write_all(pidfile, pids[1]);
elseif #pids == 0 then
olsrd_restart()
end end
end end
end end
end end