From 0328f0ec7ef3158bb4658c0c9d241d2a1f3b8dd3 Mon Sep 17 00:00:00 2001 From: Tim Wilkinson Date: Fri, 31 May 2024 22:44:36 -0700 Subject: [PATCH] Olsrd crash fixes (#1234) * Simplify the OLSR watchdog * Don't pull the routing table into LQM for supernodes. LQM tracks routes on nodes to help keep leaf nodes connected even when circumstances would probably prevent this. However, on supernodes the routing table is massive and pulling this into LQM will frequently crash OLSRD. As we don't need this for supernodes, just don't do it for them. --- files/usr/local/bin/mgr/lqm.lua | 23 ++++++---- files/usr/local/bin/mgr/olsrd_watchdog.lua | 49 ++++------------------ 2 files changed, 23 insertions(+), 49 deletions(-) diff --git a/files/usr/local/bin/mgr/lqm.lua b/files/usr/local/bin/mgr/lqm.lua index cd6eed44..3950786e 100755 --- a/files/usr/local/bin/mgr/lqm.lua +++ b/files/usr/local/bin/mgr/lqm.lua @@ -247,6 +247,7 @@ end local myhostname = canonical_hostname(aredn.info.get_nvram("node") or "localnode") local myip = uci.cursor():get("network", "wifi", "ipaddr") +local is_supernode = uci.cursor():get("aredn", "@supernode[0]", "enable") == "1" local wgsupport = nixio.fs.stat("/usr/bin/wg") @@ -803,17 +804,21 @@ function lqm() -- -- Pull in the routing table to see how many node routes are associated with each tracker. + -- We dont do this if this is a supernode because the routes table is massive and can cause + -- crash olsrd. -- total_node_route_count = 0 - for _, route in ipairs(aredn.olsr.getOLSRRoutes()) - do - -- Count routes to nodes. 
There are two routes to most nodes, the node's primary address + -- and the node's dtdlink address. + if route.genmask == 32 and route.destination:match("^10%.") then + local track = ip2tracker[route.gateway]; + if track then + track.node_route_count = track.node_route_count + 1 + total_node_route_count = total_node_route_count + 1 + end end end end diff --git a/files/usr/local/bin/mgr/olsrd_watchdog.lua b/files/usr/local/bin/mgr/olsrd_watchdog.lua index f3ad6a5e..98e46320 100644 --- a/files/usr/local/bin/mgr/olsrd_watchdog.lua +++ b/files/usr/local/bin/mgr/olsrd_watchdog.lua @@ -36,53 +36,22 @@ --]] local watchdogfile = "/tmp/olsrd.watchdog" -local pidfile = "/var/run/olsrd.pid" -local logfile = "/tmp/olsrd.log" - -function olsrd_restart() - -- print "olsrd_restart" - - os.execute("/etc/init.d/olsrd restart") - - if nixio.fs.stat(logfile) then - local lines = read_all(logfile):splitNewLine() - lines[#lines + 1] = secondsToClock(nixio.sysinfo().uptime) .. " " .. os.date() - local start = 1 - if #lines > 300 then - start = #lines - 275 - end - local f = io.open(logfile, "w") - if f then - for i = start, #lines - do - f:write(lines[i] .. "\n") - end - f:close() - end - end -end +local sleeptime = 3 * 60 -- 3 minutes +local timeout = 10 * 60 -- 10 minutes function olsrd_watchdog() while true do - wait_for_ticks(223) - - local pid = read_all(pidfile) - if pid and nixio.fs.stat("/proc/" .. 
pid) then - if nixio.fs.stat(watchdogfile) then + wait_for_ticks(sleeptime) + if nixio.fs.stat(watchdogfile) then + local watchtime = tonumber(read_all(watchdogfile)) + -- If watchtime hasn't update recently then we restart OLSRD + if watchtime + timeout < os.time() then + nixio.syslog("err", "olsrd watchdog timeout - restarting") os.remove(watchdogfile) - else - olsrd_restart() - end - else - local pids = capture("pidof olsrd"):splitWhiteSpace() - if #pids == 1 then - write_all(pidfile, pids[1]); - elseif #pids == 0 then - olsrd_restart() + os.execute("/etc/init.d/olsrd restart") end end - end end