Olsrd crash fixes (#1234)

* Simplify the OLSR watchdog * Don't pull the routing table into LQM for supernodes. LQM tracks routes on nodes to help keep leaf nodes connected even when circumstances would probably prevent this. However, on supernodes the routing table is massive, and pulling it into LQM will frequently crash OLSRD. As we don't need this on supernodes, just don't do it for them.
2024-05-31 22:44:36 -07:00 · 2024-05-31 22:44:36 -07:00 · 0328f0ec7e
parent 192e6deaec
commit 0328f0ec7e
2 changed files with 23 additions and 49 deletions
--- a/files/usr/local/bin/mgr/lqm.lua
+++ b/files/usr/local/bin/mgr/lqm.lua
@ -247,6 +247,7 @@ end
 local myhostname = canonical_hostname(aredn.info.get_nvram("node") or "localnode")
 local myip = uci.cursor():get("network", "wifi", "ipaddr")
 local is_supernode = uci.cursor():get("aredn", "@supernode[0]", "enable") == "1"
 local wgsupport = nixio.fs.stat("/usr/bin/wg")
@ -803,17 +804,21 @@ function lqm()
        --
        -- Pull in the routing table to see how many node routes are associated with each tracker.
        -- We don't do this on a supernode because the routes table is massive and can cause
        -- olsrd to crash.
        --
        total_node_route_count = 0
-        for _, route in ipairs(aredn.olsr.getOLSRRoutes())
+        if not is_supernode then
-        do
+            for _, route in ipairs(aredn.olsr.getOLSRRoutes())
-            -- Count routes to nodes. There are two routes to most nodes, the node's primary address
+            do
-            -- and the node's dtdlink address.
+                -- Count routes to nodes. There are two routes to most nodes, the node's primary address
-            if route.genmask == 32 and route.destination:match("^10%.") then
+                -- and the node's dtdlink address.
-                local track = ip2tracker[route.gateway];
+                if route.genmask == 32 and route.destination:match("^10%.") then
-                if track then
+                    local track = ip2tracker[route.gateway];
-                    track.node_route_count = track.node_route_count + 1
+                    if track then
-                    total_node_route_count = total_node_route_count + 1
+                        track.node_route_count = track.node_route_count + 1
                        total_node_route_count = total_node_route_count + 1
                    end
                end
            end
        end
--- a/files/usr/local/bin/mgr/olsrd_watchdog.lua
+++ b/files/usr/local/bin/mgr/olsrd_watchdog.lua
@ -36,53 +36,22 @@
 --]]
 local watchdogfile = "/tmp/olsrd.watchdog"
-local pidfile = "/var/run/olsrd.pid"
+local sleeptime = 3 * 60 -- 3 minutes
-local logfile = "/tmp/olsrd.log"
+local timeout = 10 * 60 -- 10 minutes
 -- Restart the OLSR daemon through its init script and, when the restart log
 -- file already exists, append a timestamped entry to it.
 --
 -- Takes no arguments and returns nothing. `logfile`, `read_all`,
 -- `secondsToClock` and the string method `splitNewLine` are defined outside
 -- this block.
 function olsrd_restart()
    -- print "olsrd_restart"
    os.execute("/etc/init.d/olsrd restart")
    -- Only log when the file is already present; this function never creates it.
    if nixio.fs.stat(logfile) then
        local lines = read_all(logfile):splitNewLine()
        -- New entry: system uptime (formatted by secondsToClock) followed by the current date.
        lines[#lines + 1] = secondsToClock(nixio.sysinfo().uptime) .. " " .. os.date()
        -- Bound the log size: once there are more than 300 lines, keep only the
        -- most recent 276 (indices #lines - 275 through #lines).
        local start = 1
        if #lines > 300 then
            start = #lines - 275
        end
        -- Rewrite the whole file with the retained lines; if the file cannot be
        -- opened for writing the entry is silently dropped.
        local f = io.open(logfile, "w")
        if f then
            for i = start, #lines
            do
                f:write(lines[i] .. "\n")
            end
            f:close()
        end
    end
 end
 function olsrd_watchdog()
    while true
    do
-        wait_for_ticks(223)
+        wait_for_ticks(sleeptime)
-
+        if nixio.fs.stat(watchdogfile) then
-        local pid = read_all(pidfile)
+            local watchtime = tonumber(read_all(watchdogfile))
-        if pid and nixio.fs.stat("/proc/" .. pid) then
+            -- If watchtime hasn't been updated recently then we restart OLSRD
-            if nixio.fs.stat(watchdogfile) then
+            if watchtime + timeout < os.time() then
                nixio.syslog("err", "olsrd watchdog timeout - restarting")
                os.remove(watchdogfile)
-            else
+                os.execute("/etc/init.d/olsrd restart")
                olsrd_restart()
            end
        else
            local pids = capture("pidof olsrd"):splitWhiteSpace()
            if #pids == 1 then
                write_all(pidfile, pids[1]);
            elseif #pids == 0 then
                olsrd_restart()
            end
        end
    end
 end