aredn: harden cron maintenance scripts

Ensure the cron maintenance scripts execute one at a time
and that no script runs as a duplicate instance.
This commit is contained in:
Joe AE6XE 2018-09-12 23:11:25 -07:00
parent c5e9342e29
commit 3938f33afe
3 changed files with 94 additions and 23 deletions

View File

@ -1,5 +1,5 @@
#!/bin/sh
<<'LICENSE'
true <<'LICENSE'
Part of AREDN -- Used for creating Amateur Radio Emergency Data Networks
Copyright (C) 2018 Joe Ayers AE6XE
See Contributors file for additional contributors
@ -36,36 +36,43 @@ LICENSE
# Look for hung 'iw' zombie processes prone to hang
# when available memory is low.
# wait for rssi_monitor and snrlog to run
sleep 10
zombiepid="/tmp/clean_zombie.pid"
for pid in `ps 2>/dev/null | egrep "^\s*\d+\s+root\s+\d+\s+Z\s+\[iw\]"| sed -e "s/^\s*//"| cut -f1 -d\ `
# Single-instance guard: exit if the PID recorded by an earlier run still has
# a /proc entry. An empty or unreadable pid file must not count as "running":
# "/proc/" itself is always a directory, which would block every later run.
if [ -e "$zombiepid" ]; then
  oldpid="$(cat "$zombiepid" 2>/dev/null)"
  [ -n "$oldpid" ] && [ -d "/proc/$oldpid" ] && exit
fi
echo "$$" > "$zombiepid"
# wait for rssi_monitor and snrlog to run
sleep 20;
for pid in $(ps | grep -E "^\s*\d+\s+root\s+\d+\s+Z\s+\[iw\]"| sed -e "s/^\s*//"| cut -f1 -d\ )
do
# found an "iw" zombie
sleep 10 # give time in case process is naturally closing and needs more time
if [ -d /proc/$pid ] ; then
sleep 10 # in case process is naturally closing and needs more time
if [ -d "/proc/$pid" ] ; then
date >> /tmp/zombie.log
ps | egrep "^\s*${pid}" | grep -v grep | tail -1 >> /tmp/zombie.log
ppid=`cat /proc/$pid/status | grep -i ppid | cut -f2`
if [ -d /proc/$ppid ] ; then
ps | egrep "\s*${ppid}" | grep -v grep | tail -1 >> /tmp/zombie.log
if ( ! `grep crond /proc/$ppid/status 2>&1 > /dev/null` ) then
if [ $ppid -gt 1 ] ; then
# log the zombie itself, then look up and log its parent
ps | grep -E "^\s*${pid}\s+" | grep -v grep | tail -1 >> /tmp/zombie.log
# the zombie may have been reaped in the meantime; then ppid is left empty
ppid="$(grep -i ppid "/proc/$pid/status" 2>/dev/null | cut -f2)"
# an empty ppid must be rejected explicitly: "/proc/" alone is a directory
if [ -n "$ppid" ] && [ -d "/proc/$ppid" ] ; then
ps | grep -E "^\s*${ppid}\s+" | grep -v grep | tail -1 >> /tmp/zombie.log
# Redirections are processed left to right: stdout has to point at /dev/null
# before stderr is duplicated onto it, otherwise stderr is not silenced.
grep crond "/proc/$ppid/status" > /dev/null 2>&1
# act only when the parent is not crond and is not init (pid 1)
if [ $? -ne 0 -a "$ppid" -gt 1 ]; then
# kill the zombie's parent process to free up resources
kill -9 $ppid 2>&1 >> /tmp/zombie.log
echo "Killed $ppid" >> /tmp/zombie.log
if [ `wc -l /tmp/zombie.log | cut -f1 -d\ ` -gt 100 ] ; then
# kill the zombie's parent process to free up resources
# (stdout is redirected to the log first so that 2>&1 sends kill's error
# output to the log as well)
kill -9 "$ppid" >> /tmp/zombie.log 2>&1
echo "Killed $ppid" >> /tmp/zombie.log
if [ "$(wc -l /tmp/zombie.log | cut -f1 -d\ )" -gt 300 ] ; then
# keep file size in check
cp /tmp/zombie.log /tmp/zombie.tmp
tail -80 /tmp/zombie.tmp > /tmp/zombie.log
rm -f /tmp/zombie.tmp
fi
# keep file size in check
cp /tmp/zombie.log /tmp/zombie.tmp
tail -275 /tmp/zombie.tmp > /tmp/zombie.log
rm -f /tmp/zombie.tmp
fi
fi
fi
echo "" >> /tmp/zombie.log
fi
done
rm $zombiepid

View File

@ -41,9 +41,40 @@
$now=`cat /proc/uptime | cut -f1 -d" "`;
chomp $now;
exit 0 unless $now > 119;
sleep 3; # wait for snrlog to see that we are not running
# Single-instance guard: stop if the PID stored by a previous run still has
# a /proc entry; otherwise record our own PID in the pid file.
$rssipid = "/tmp/rssi_monitor.pid";
if ( -f "$rssipid" )
{
  $rssipidvalue = `cat $rssipid`;
  chomp $rssipidvalue;
  if ( $rssipidvalue > 0 and -d "/proc/$rssipidvalue" )
  {
    exit 0;
  }
}
open(my $mypid, '>', $rssipid) or die("Could not open $rssipid. $!");
print {$mypid} $$;
close $mypid;
# If snrlog left a pid file, give a still-running snrlog up to 4 x 5 seconds
# to finish before continuing.
$snrlogpid="/tmp/snrlog.pid";
if ( -f "$snrlogpid" )
{
  chomp (${snrlogpidvalue}=`cat $snrlogpid`);
  $waitcount=0;
  while ( ${snrlogpidvalue} > 0 and -d "/proc/${snrlogpidvalue}" and $waitcount < 4)
  {
    sleep 5;
    $waitcount+=1;
  }
  # '==' (comparison), not '=': an assignment here is always true and would
  # make every run exit whenever the snrlog pid file exists.
  if ( $waitcount == 4 ) # skip this turn if snrlog still running
  {
    unlink $rssipid;   # release our own lock before giving up
    exit 0;
  }
}
sleep 7;
chomp ($iface=`uci -q get 'network.wifi.ifname'`); # wireless interface
foreach(`iwinfo $iface info`)
{
@ -337,6 +368,7 @@ for (keys %rssiHist)
}
close $dfh;
unlink $rssipid;
# when logfile gets 1k over $MAXSIZE, then chop down
$MAXSIZE = 2**14;

View File

@ -49,7 +49,6 @@ string.print=print_r
-- delay just after rssi_monitor has a chance to run noise floor calibration
sleep(5)
local MAXLINES=2880 -- 2 days worth
local AGETIME=43200
local INACTIVETIMEOUT=10000
@ -66,6 +65,8 @@ local stations={}
local wifiiface=""
local bandwidth=""
local nulledout={}
local pidfile="/tmp/snrlog.pid"
local rssifile="/tmp/rssi_monitor.pid"
-- Neighbor Class
Neighbor={}
@ -255,6 +256,35 @@ end
-- Neighbor Class END
-- MAIN() -------------------------------------------------------------------------------------
-- check to make sure a prior instance is not still running
local f = io.open(pidfile, "r")
if f then
  local oldpid = f:read("*number")
  f:close()
  -- read() yields nil when the file holds no number; only a PID that still
  -- has a /proc entry counts as a running instance
  if oldpid ~= nil and dir_exists("/proc/" .. oldpid) then
    return
  end
end

--- create pid file to communicate I'm running
-- assert() raises when the file cannot be opened, so f is valid afterwards
f = assert(io.open(pidfile, "w"), "Cannot open file (pidfile) to write!")
f:write(posix.unistd.getpid())
f:close()

--- Do not run if prior period rssi_monitor is still running
f = io.open(rssifile, "r")
if f then
  local oldpid = f:read("*number")
  f:close()
  if oldpid ~= nil and dir_exists("/proc/" .. oldpid) then
    -- release our own pid file before skipping this run
    os.remove(pidfile)
    return
  end
end
-- get wifi interface name
wifiiface=get_ifname("wifi")
@ -388,4 +418,6 @@ for k,v in pairs(snrdatcache) do
end
f:close()
os.remove(pidfile)
-- END MAIN