mirror of https://github.com/aredn/aredn.git
Olsrd crash fixes (#1234)
* Simplify the OLSR watchdog * Don't pull the routing table into LQM for supernodes. LQM tracks routes on nodes to help keep leaf nodes connected even when circumstances would probably prevent this. However, on supernodes the routing table is massive, and pulling it into LQM will frequently crash OLSRD. As we don't need this for supernodes, just don't do it for them.
This commit is contained in:
parent
192e6deaec
commit
0328f0ec7e
|
@ -247,6 +247,7 @@ end
|
|||
|
||||
local myhostname = canonical_hostname(aredn.info.get_nvram("node") or "localnode")
|
||||
local myip = uci.cursor():get("network", "wifi", "ipaddr")
|
||||
local is_supernode = uci.cursor():get("aredn", "@supernode[0]", "enable") == "1"
|
||||
|
||||
local wgsupport = nixio.fs.stat("/usr/bin/wg")
|
||||
|
||||
|
@ -803,8 +804,11 @@ function lqm()
|
|||
|
||||
--
|
||||
-- Pull in the routing table to see how many node routes are associated with each tracker.
|
||||
-- We don't do this if this is a supernode because the routes table is massive and can cause
|
||||
-- olsrd to crash.
|
||||
--
|
||||
total_node_route_count = 0
|
||||
if not is_supernode then
|
||||
for _, route in ipairs(aredn.olsr.getOLSRRoutes())
|
||||
do
|
||||
-- Count routes to nodes. There are two routes to most nodes, the node's primary address
|
||||
|
@ -817,6 +821,7 @@ function lqm()
|
|||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
--
|
||||
-- At this point we have gathered all the data we need to determine which links are best to use and
|
||||
|
|
|
@ -36,53 +36,22 @@
|
|||
--]]
|
||||
|
||||
local watchdogfile = "/tmp/olsrd.watchdog"
|
||||
local pidfile = "/var/run/olsrd.pid"
|
||||
local logfile = "/tmp/olsrd.log"
|
||||
|
||||
--- Restart the olsrd service and note the restart in the log file.
-- Runs the olsrd init script's restart action. When the log file already
-- exists, a line made of the node uptime and the current date is appended and
-- the file is rewritten; once it holds more than 300 lines only the last 276
-- are kept. Nothing is written when the log file does not exist or cannot be
-- opened for writing.
function olsrd_restart()
    os.execute("/etc/init.d/olsrd restart")

    -- Only an existing log is updated; this function never creates one.
    if not nixio.fs.stat(logfile) then
        return
    end

    local entries = read_all(logfile):splitNewLine()
    entries[#entries + 1] = secondsToClock(nixio.sysinfo().uptime) .. " " .. os.date()

    -- Past 300 lines, drop the oldest entries and keep just the tail.
    local first = 1
    if #entries > 300 then
        first = #entries - 275
    end

    local out = io.open(logfile, "w")
    if not out then
        return
    end

    local idx = first
    while idx <= #entries do
        out:write(entries[idx] .. "\n")
        idx = idx + 1
    end
    out:close()
end
|
||||
local sleeptime = 3 * 60 -- 3 minutes
|
||||
local timeout = 10 * 60 -- 10 minutes
|
||||
|
||||
function olsrd_watchdog()
|
||||
while true
|
||||
do
|
||||
wait_for_ticks(223)
|
||||
|
||||
local pid = read_all(pidfile)
|
||||
if pid and nixio.fs.stat("/proc/" .. pid) then
|
||||
wait_for_ticks(sleeptime)
|
||||
if nixio.fs.stat(watchdogfile) then
|
||||
local watchtime = tonumber(read_all(watchdogfile))
|
||||
-- If watchtime hasn't been updated recently then we restart OLSRD
|
||||
if watchtime + timeout < os.time() then
|
||||
nixio.syslog("err", "olsrd watchdog timeout - restarting")
|
||||
os.remove(watchdogfile)
|
||||
else
|
||||
olsrd_restart()
|
||||
end
|
||||
else
|
||||
local pids = capture("pidof olsrd"):splitWhiteSpace()
|
||||
if #pids == 1 then
|
||||
write_all(pidfile, pids[1]);
|
||||
elseif #pids == 0 then
|
||||
olsrd_restart()
|
||||
os.execute("/etc/init.d/olsrd restart")
|
||||
end
|
||||
end
|
||||
|
||||
end
|
||||
end
|
||||
|
||||
|
|
Loading…
Reference in New Issue