Improve runner reload stability

The `tame` client has the ability to request a runner reload by issuing
SIGHUP to the runner PID.  This will cause `tamed` to kill the runner and
respawn it.

There were situations where this process hung or did not operate as
expected; the process was not reliable.  This does a number of things to
make sure the process is in a good state before proceeding:

  1. The HUP trap is set a single time, rather than being reset each time
     the signal is received for the runner.
  2. A `reloading` flag is set by the `tame` client when reloading is
     requested, and the flag is cleared by `tamed` when spawning a new
     runner.  This allows the client to wait until the reload is complete,
     and bail out otherwise.  Without this, there is a race, where the
     client may proceed to issue additional requests before the original
     runner terminates.
  3. Kill the runner with SIGKILL (signal 9).  This gives no opportunity for
     the runner to ignore the signal, and gives us confidence that the
     runner is actually going to be killed.

This may have caused errors that look like this (where 129 is the HUP
reload, which is expected):

  warning: failed runner 0 ack; requesting reload
  warning: runner 0 exited with code 129; restarting
  error: runner 0 still unresponsive; giving up

The last line may also be omitted, with instead an empty `xmlo` being
generated.

DEV-10806
main
Mike Gerwitz 2023-10-03 13:31:36 -04:00
parent b8a36ec984
commit 7c9d6837fe
2 changed files with 64 additions and 14 deletions

View File

@ -93,7 +93,7 @@ command-runner()
# if not, then it may have stalled for some reason # if not, then it may have stalled for some reason
verify-runner-ack "$*" < "$base/1" || { verify-runner-ack "$*" < "$base/1" || {
echo "warning: failed runner $id ack; requesting reload" >&2 echo "warning: failed runner $id ack; requesting reload" >&2
kill -HUP "$pid" reload-runner-and-wait "$base" "$id" "$pid"
# give some extra time in case the host is under high load # give some extra time in case the host is under high load
sleep "$TAME_CMD_WAITTIME" sleep "$TAME_CMD_WAITTIME"
@ -213,7 +213,7 @@ spawn-runner-and-wait()
# wait on the expected id # wait on the expected id
local -ri nextid=$(( maxid + 1 )) local -ri nextid=$(( maxid + 1 ))
wait-for-runner "$root" "$nextid" wait-for-runner "$root/$nextid" "$nextid"
echo "$nextid" echo "$nextid"
} }
@ -314,6 +314,33 @@ verify-runner()
} }
# Request a runner reload and wait for the reload to complete.
#
# Before requesting the reload, a `reloading` flag will be set on the
# runner. This flag is expected to be cleared by tamed once the runner has
# been restarted in response to SIGHUP.
#
# See `wait-for-runner` for more information on waiting for the flag.
reload-runner-and-wait() {
local -r base="${1?Missing base}"
local -ri id="${2?Missing id}"
local -ri pid="${3?Missing pid}"
# mark the runner as unavailable
touch "$base/reloading"
# issue a reload request to the runner
kill -HUP "$pid"
# we must not continue before the runner has been reloaded, otherwise we
# may issue new requests to a process that is being killed
wait-for-runner "$base" "$id" || {
echo "error: failed to reload runner $id" >&2
exit "$EX_STALLED"
}
}
# Wait for command acknowledgment from runner # Wait for command acknowledgment from runner
# #
# The runner must respond within TAME_CMD_WAITTIME seconds # The runner must respond within TAME_CMD_WAITTIME seconds
@ -336,9 +363,12 @@ verify-runner-ack()
# Assumes that the runner is ready once the pidfile becomes # Assumes that the runner is ready once the pidfile becomes
# available. Polls for a maximum of six seconds before giving up # available. Polls for a maximum of six seconds before giving up
# and exiting with a non-zero status. # and exiting with a non-zero status.
#
# This will also wait for the clearing of the `reloading` flag set by
# `reload-runner-and-wait`, as part of that same time.
wait-for-runner() wait-for-runner()
{ {
local -r root=${1?Missing root} local -r base=${1?Missing base}
local -r id=${2?Missing runner id} local -r id=${2?Missing runner id}
# we could use inotify, but that is not installed by default # we could use inotify, but that is not installed by default
@ -346,7 +376,10 @@ wait-for-runner()
# another dependency (give up after 6 seconds) # another dependency (give up after 6 seconds)
local -i i=12 local -i i=12
while test $((i--)); do while test $((i--)); do
test ! -f "$root/$id/pid" || return 0 if [ -f "$base/pid" -a ! -f "$base/reloading" ]; then
return 0
fi
sleep 0.5 sleep 0.5
done done
@ -416,7 +449,7 @@ _start-tamed()
# wait for tamed even if it was already started (just in # wait for tamed even if it was already started (just in
# case this script was executed right after tamed started # case this script was executed right after tamed started
# but before it is done initializing) # but before it is done initializing)
wait-for-runner "$root" 0 wait-for-runner "$root/0" 0
} }

View File

@ -129,16 +129,33 @@ spawn-runner()
# monitor runner usage and kill when inactive # monitor runner usage and kill when inactive
stall-monitor "$base" & stall-monitor "$base" &
# loop to restart runner in case of crash
while true; do
declare -i job=0
trap 'kill -INT $job' HUP
"$mypath/dslc" < "$base/0" &> "$base/1" & job=$!
declare -i status=0 # loop to restart runner in case of crash
wait -n 2>/dev/null || status=$? (
echo "warning: runner $id exited with code $status; restarting" >&2 declare -i job=0
done & trap force-job-reload HUP
force-job-reload() {
kill -9 $job
}
while true; do
# store the time that the runner was started so that we can later
# determine if it should be restarted to forcefully reclaim memory
date +%s > "$base/created-ts"
"$mypath/dslc" < "$base/0" &> "$base/1" & job=$!
# this flag is set by the `tame` client so that it knows when the
# runner becomes available
rm -f "$base/reloading"
declare -i status=0
wait "$job" 2>/dev/null || status=$?
echo "warning: runner $id exited with code $status (pid $job); restarting" >&2
done
) &
echo "$!" > "$base/pid" echo "$!" > "$base/pid"