From db1c03dfd9c26150ad1faa0392c482c2ac682856 Mon Sep 17 00:00:00 2001 From: Mike Gerwitz Date: Tue, 16 Oct 2018 08:53:04 -0400 Subject: [PATCH] tame{,d}: Reload runner when unresponsive This tries to be a bit more resilient in case a runner becomes unresponsive, rather than waiting for tamed to kill itself. * bin/tame (RUNNER_CMD_WAITTIME): New variable. (command-runner): Tell runner to reload if it does not respond in RUNNER_CMD_WAITTIME seconds. (verify-runner-ack): New function. * bin/tamed (mkfifos): Only keep stdin open. stdout isn't necessary, and may have actually been causing subtle issues. (spawn-runner): Support restarting dslc on SIGHUP. --- bin/tame | 34 +++++++++++++++++++++++++++++++++- bin/tamed | 15 ++++++++++----- 2 files changed, 43 insertions(+), 6 deletions(-) diff --git a/bin/tame b/bin/tame index ecb2bed2..e7a5e997 100755 --- a/bin/tame +++ b/bin/tame @@ -24,6 +24,10 @@ declare -r mypath=$( dirname "$( readlink -f "$0" )" ) declare -ri EX_NOTAMED=1 # tried to start tamed but failed declare -ri EX_USAGE=64 # incorrect usage; sysexits.h +# maximum amount of time in seconds to wait for runner to ack +# before forcibly restarting it +declare -ri RUNNER_CMD_WAITTIME=3 + # Send a single command to a runner and observe the result # @@ -48,7 +52,18 @@ command-runner() trap 'kill -TERM $pid &>/dev/null' INT TERM # all remaining arguments are passed to the runner - echo "$@" > "$base/0" + echo "$*" > "$base/0" + + # we should immediately get a response from the runner; + # if not, then it may have stalled for some reason + verify-runner-ack "$*" < "$base/1" || { + echo "warning: failed runner $id ack; requesting reload" >&2 + kill -HUP "$pid" + sleep "$RUNNER_CMD_WAITTIME" + + # try once more + verify-runner-ack "$*" < "$base/1" || exit + } # output lines from runner until we reach a line stating "DONE" while read line; do @@ -85,6 +100,23 @@ verify-runner() } +# Wait for command acknowledgment from runner +# +# The runner must respond within RUNNER_CMD_WAITTIME seconds +# and must echo back the command that was given. Otherwise, +# this function returns with a non-zero status. +verify-runner-ack() +{ + local -r cmd="${1?Missing command}" + + read -t"$RUNNER_CMD_WAITTIME" -r ack || return + test "COMMAND $cmd" == "$ack" || { + # TODO check for ack mismatch once output race condition is fixed + : + } +} + + # Wait somewhat impatiently for tamed # # Assumes that tamed's runner 0 is running once the pidfile becomes diff --git a/bin/tamed b/bin/tamed index 25b20778..3b8b48b2 100755 --- a/bin/tamed +++ b/bin/tamed @@ -53,10 +53,10 @@ mkfifos() echo "fatal: failed to create FIFO at $in" exit $EX_CANTCREAT } - - # keep FIFOs open so we don't get EOF from writers - tail -f >"$root/$n" & done + + # keep FIFOs open so we don't get EOF from writers + tail -f >"$root/0" & } @@ -80,8 +80,13 @@ spawn-runner() # loop to restart runner in case of crash while true; do - "$mypath/dslc" < "$base/0" &> "$base/1" - echo "warning: runner $id exited with code ${PIPESTATUS[0]}; restarting" >&2 + declare -i job=0 + trap 'kill -INT $job' HUP + "$mypath/dslc" < "$base/0" &> "$base/1" & job=$! + + declare -i status=0 + wait -n 2>/dev/null || status=$? + echo "warning: runner $id exited with code $status; restarting" >&2 done & echo "$!" > "$base/pid"