diff --git a/bin/tame b/bin/tame index 2e7bf42c..bd98d979 100755 --- a/bin/tame +++ b/bin/tame @@ -93,7 +93,7 @@ command-runner() # if not, then it may have stalled for some reason verify-runner-ack "$*" < "$base/1" || { echo "warning: failed runner $id ack; requesting reload" >&2 - kill -HUP "$pid" + reload-runner-and-wait "$base" "$id" "$pid" # give some extra time in case the host is under high load sleep "$TAME_CMD_WAITTIME" @@ -213,7 +213,7 @@ spawn-runner-and-wait() # wait on the expected id local -ri nextid=$(( maxid + 1 )) - wait-for-runner "$root" "$nextid" + wait-for-runner "$root/$nextid" "$nextid" echo "$nextid" } @@ -314,6 +314,33 @@ verify-runner() } +# Request a runner reload and wait for the reload to complete. +# +# Before requesting the reload, a `reloading` flag will be set on the +# runner. This flag is expected to be cleared by tamed once the runner has +# been restarted in response to SIGHUP. +# +# See `wait-for-runner` for more information on waiting for the flag. +reload-runner-and-wait() { + local -r base="${1?Missing base}" + local -ri id="${2?Missing id}" + local -ri pid="${3?Missing pid}" + + # mark the runner as unavailable + touch "$base/reloading" + + # issue a reload request to the runner + kill -HUP "$pid" + + # we must not continue before the runner has been reloaded, otherwise we + # may issue new requests to a process that is being killed + wait-for-runner "$base" "$id" || { + echo "error: failed to reload runner $id" >&2 + exit "$EX_STALLED" + } +} + + # Wait for command acknowledgment from runner # # The runner must respond within TAME_CMD_WAITTIME seconds @@ -336,9 +363,12 @@ verify-runner-ack() # Assumes that the runner is ready once the pidfile becomes # available. Polls for a maximum of six seconds before giving up # and exiting with a non-zero status. +# +# This will also wait for the clearing of the `reloading` flag set by +# `reload-runner-and-wait`, as part of that same time. wait-for-runner() { - local -r root=${1?Missing root} + local -r base=${1?Missing base} local -r id=${2?Missing runner id} # we could use inotify, but that is not installed by default @@ -346,7 +376,10 @@ wait-for-runner() # another dependency (give up after 6 seconds) local -i i=12 while test $((i--)); do - test ! -f "$root/$id/pid" || return 0 + if [ -f "$base/pid" -a ! -f "$base/reloading" ]; then + return 0 + fi + sleep 0.5 done @@ -416,7 +449,7 @@ _start-tamed() # wait for tamed even if it was already started (just in # case this script was executed right after tamed started # but before it is done initializing) - wait-for-runner "$root" 0 + wait-for-runner "$root/0" 0 } diff --git a/bin/tamed b/bin/tamed index 05893134..c744e740 100755 --- a/bin/tamed +++ b/bin/tamed @@ -129,16 +129,33 @@ spawn-runner() # monitor runner usage and kill when inactive stall-monitor "$base" & - # loop to restart runner in case of crash - while true; do - declare -i job=0 - trap 'kill -INT $job' HUP - "$mypath/dslc" < "$base/0" &> "$base/1" & job=$! - declare -i status=0 - wait -n 2>/dev/null || status=$? - echo "warning: runner $id exited with code $status; restarting" >&2 - done & + # loop to restart runner in case of crash + ( + declare -i job=0 + trap force-job-reload HUP + + force-job-reload() { + kill -9 $job + } + + while true; do + # store the time that the runner was started so that we can later + # determine if it should be restarted to forcefully reclaim memory + date +%s > "$base/created-ts" + + "$mypath/dslc" < "$base/0" &> "$base/1" & job=$! + + # this flag is set by the `tame` client so that it knows when the + # runner becomes available + rm -f "$base/reloading" + + declare -i status=0 + wait "$job" 2>/dev/null || status=$? + + echo "warning: runner $id exited with code $status (pid $job); restarting" >&2 + done + ) & echo "$!" > "$base/pid"