Olsrd crash fixes (#1234)

* Simplify the OLSR watchdog * Don't pull the routing table into LQM for supernodes. LQM tracks routes on nodes to help keep leaf nodes connected even when circumstances would probably prevent this. However, on supernodes the routing table is massive, and pulling it into LQM will frequently crash OLSRD. As we don't need this on supernodes, just don't do it for them.
2024-05-31 22:44:36 -07:00 · 2024-05-31 22:44:36 -07:00 · 0328f0ec7e
parent 192e6deaec
commit 0328f0ec7e
2 changed files with 23 additions and 49 deletions
--- a/files/usr/local/bin/mgr/lqm.lua
+++ b/files/usr/local/bin/mgr/lqm.lua
@ -247,6 +247,7 @@ end

 local myhostname = canonical_hostname(aredn.info.get_nvram("node") or "localnode")
 local myip = uci.cursor():get("network", "wifi", "ipaddr")
+local is_supernode = uci.cursor():get("aredn", "@supernode[0]", "enable") == "1"

 local wgsupport = nixio.fs.stat("/usr/bin/wg")

@ -803,8 +804,11 @@ function lqm()

        --
        -- Pull in the routing table to see how many node routes are associated with each tracker.
+        -- We don't do this if this is a supernode because the routes table is massive and can
+        -- crash olsrd.
        --
        total_node_route_count = 0
+        if not is_supernode then
            for _, route in ipairs(aredn.olsr.getOLSRRoutes())
            do
                -- Count routes to nodes. There are two routes to most nodes, the node's primary address
@ -817,6 +821,7 @@ function lqm()
                    end
                end
            end
+        end

        --
        -- At this point we have gather all the data we need to determine which links are best to use and
--- a/files/usr/local/bin/mgr/olsrd_watchdog.lua
+++ b/files/usr/local/bin/mgr/olsrd_watchdog.lua
@ -36,53 +36,22 @@
 --]]

 local watchdogfile = "/tmp/olsrd.watchdog"
-local pidfile = "/var/run/olsrd.pid"
-local logfile = "/tmp/olsrd.log"
-
-function olsrd_restart()
-    -- print "olsrd_restart"
-
-    os.execute("/etc/init.d/olsrd restart")
-
-    if nixio.fs.stat(logfile) then
-        local lines = read_all(logfile):splitNewLine()
-        lines[#lines + 1] = secondsToClock(nixio.sysinfo().uptime) .. " " .. os.date()
-        local start = 1
-        if #lines > 300 then
-            start = #lines - 275
-        end
-        local f = io.open(logfile, "w")
-        if f then
-            for i = start, #lines
-            do
-                f:write(lines[i] .. "\n")
-            end
-            f:close()
-        end
-    end
-end
+local sleeptime = 3 * 60 -- 3 minutes
+local timeout = 10 * 60 -- 10 minutes

 function olsrd_watchdog()
    while true
    do
-        wait_for_ticks(223)
-
-        local pid = read_all(pidfile)
-        if pid and nixio.fs.stat("/proc/" .. pid) then
+        wait_for_ticks(sleeptime)
        if nixio.fs.stat(watchdogfile) then
+            local watchtime = tonumber(read_all(watchdogfile))
+            -- If watchtime hasn't been updated recently then we restart OLSRD
+            if watchtime + timeout < os.time() then
+                nixio.syslog("err", "olsrd watchdog timeout - restarting")
                os.remove(watchdogfile)
-            else
-                olsrd_restart()
-            end
-        else
-            local pids = capture("pidof olsrd"):splitWhiteSpace()
-            if #pids == 1 then
-                write_all(pidfile, pids[1]);
-            elseif #pids == 0 then
-                olsrd_restart()
+                os.execute("/etc/init.d/olsrd restart")
            end
        end
-
    end
 end