tame{,d}: Reload runner when unresponsive
This tries to be a bit more resilient in case a runner becomes unresponsive, rather than waiting for tamed to kill itself.

* bin/tame (RUNNER_CMD_WAITTIME): New variable.
  (command-runner): Tell runner to reload if it does not respond in RUNNER_CMD_WAITTIME seconds.
  (verify-runner-ack): New function.
* bin/tamed (mkfifos): Only keep stdin open. stdout isn't necessary, and may have actually been causing subtle issues.
  (spawn-runner): Support restarting dslc on SIGHUP.

master v3.3.3
parent
5679be281a
commit
db1c03dfd9
34
bin/tame
34
bin/tame
|
@ -24,6 +24,10 @@ declare -r mypath=$( dirname "$( readlink -f "$0" )" )
|
|||
declare -ri EX_NOTAMED=1 # tried to start tamed but failed
|
||||
declare -ri EX_USAGE=64 # incorrect usage; sysexits.h
|
||||
|
||||
# maximum amount of time in seconds to wait for runner to ack
|
||||
# before forcibly restarting it
|
||||
declare -ri RUNNER_CMD_WAITTIME=3
|
||||
|
||||
|
||||
# Send a single command to a runner and observe the result
|
||||
#
|
||||
|
@ -48,7 +52,18 @@ command-runner()
|
|||
trap 'kill -TERM $pid &>/dev/null' INT TERM
|
||||
|
||||
# all remaining arguments are passed to the runner
|
||||
echo "$@" > "$base/0"
|
||||
echo "$*" > "$base/0"
|
||||
|
||||
# we should immediately get a response from the runner;
|
||||
# if not, then it may have stalled for some reason
|
||||
verify-runner-ack "$*" < "$base/1" || {
|
||||
echo "warning: failed runner $id ack; requesting reload" >&2
|
||||
kill -HUP "$pid"
|
||||
sleep "$RUNNER_CMD_WAITTIME"
|
||||
|
||||
# try once more
|
||||
verify-runner-ack "$*" < "$base/1" || exit
|
||||
}
|
||||
|
||||
# output lines from runner until we reach a line stating "DONE"
|
||||
while read line; do
|
||||
|
@ -85,6 +100,23 @@ verify-runner()
|
|||
}
|
||||
|
||||
|
||||
# Wait for command acknowledgment from runner
|
||||
#
|
||||
# The runner must respond within RUNNER_CMD_WAITTIME seconds
|
||||
# and must echo back the command that was given. Otherwise,
|
||||
# this function returns with a non-zero status.
|
||||
# Wait for command acknowledgment from runner
#
# Reads a single line from stdin (the caller redirects the runner's
# output FIFO into this function) and treats it as the runner's ack.
#
# Arguments:
#   $1 - the command string that was sent to the runner (required)
#
# Globals:
#   RUNNER_CMD_WAITTIME (read) - seconds to wait for the ack line
#
# Returns:
#   0 if a line was read before the timeout; otherwise the non-zero
#   status of `read' (timeout or EOF).  A mismatch between the ack and
#   the expected "COMMAND <cmd>" line is currently NOT treated as a
#   failure (see TODO below).
verify-runner-ack()
{
  local -r cmd="${1?Missing command}"

  # keep the ack line out of the caller's scope; `read' would otherwise
  # create or overwrite a global variable of the same name
  local ack

  # a timeout or EOF means the runner never acknowledged; propagate
  # read's failure status to the caller
  read -t"$RUNNER_CMD_WAITTIME" -r ack || return

  test "COMMAND $cmd" == "$ack" || {
    # TODO check for ack mismatch once output race condition is fixed
    :
  }
}
|
||||
|
||||
|
||||
# Wait somewhat impatiently for tamed
|
||||
#
|
||||
# Assumes that tamed's runner 0 is running once the pidfile becomes
|
||||
|
|
15
bin/tamed
15
bin/tamed
|
@ -53,10 +53,10 @@ mkfifos()
|
|||
echo "fatal: failed to create FIFO at $in"
|
||||
exit $EX_CANTCREAT
|
||||
}
|
||||
|
||||
# keep FIFOs open so we don't get EOF from writers
|
||||
tail -f >"$root/$n" &
|
||||
done
|
||||
|
||||
# keep the stdin FIFO open for writing so that its reader does not get
# EOF when a writer closes it
|
||||
tail -f >"$root/0" &
|
||||
}
|
||||
|
||||
|
||||
|
@ -80,8 +80,13 @@ spawn-runner()
|
|||
|
||||
# loop to restart runner in case of crash
|
||||
while true; do
|
||||
"$mypath/dslc" < "$base/0" &> "$base/1"
|
||||
echo "warning: runner $id exited with code ${PIPESTATUS[0]}; restarting" >&2
|
||||
declare -i job=0
|
||||
trap 'kill -INT $job' HUP
|
||||
"$mypath/dslc" < "$base/0" &> "$base/1" & job=$!
|
||||
|
||||
declare -i status=0
|
||||
wait -n 2>/dev/null || status=$?
|
||||
echo "warning: runner $id exited with code $status; restarting" >&2
|
||||
done &
|
||||
|
||||
echo "$!" > "$base/pid"
|
||||
|
|
Loading…
Reference in New Issue