From 7c9d6837fe18e444c33bfa855613d31460ccb051 Mon Sep 17 00:00:00 2001 From: Mike Gerwitz Date: Tue, 3 Oct 2023 13:31:36 -0400 Subject: [PATCH] Improve runner reload stability The `tame` client has the ability to request a runner reload by issuing SIGHUP to the runner PID. This will cause `tamed` to kill the runner and respawn it. There were situations where this process hung or did not operate as expected; the process was not reliable. This does a number of things to make sure the process is in a good state before proceeding: 1. The HUP trap is set a single time, rather than being reset each time the signal is received for the runner. 2. A `reloading` flag is set by the `tame` client when reloading is requested, and the flag is cleared by `tamed` when spawning a new runner. This allows the client to wait until the reload is complete, and bail out otherwise. Without this, there is a race, where the client may proceed to issue additional requests before the original runner terminates. 3. Kill the runner with SIGKILL (signal 9). This gives no opportunity for the runner to ignore the signal, and gives us confidence that the runner is actually going to be killed. This may have caused errors that look like this (where 129 is the HUP reload, which is expected): warning: failed runner 0 ack; requesting reload warning: runner 0 exited with code 129; restarting error: runner 0 still unresponsive; giving up The last line may also be omitted, with instead an empty `xmlo` being generated. DEV-10806 --- bin/tame | 43 ++++++++++++++++++++++++++++++++++++++----- bin/tamed | 35 ++++++++++++++++++++++++++--------- 2 files changed, 64 insertions(+), 14 deletions(-) diff --git a/bin/tame b/bin/tame index 2e7bf42c..bd98d979 100755 --- a/bin/tame +++ b/bin/tame @@ -93,7 +93,7 @@ command-runner() # if not, then it may have stalled for some reason verify-runner-ack "$*" < "$base/1" || { echo "warning: failed runner $id ack; requesting reload" >&2 - kill -HUP "$pid" + reload-runner-and-wait "$base" "$id" "$pid" # give some extra time in case the host is under high load sleep "$TAME_CMD_WAITTIME" @@ -213,7 +213,7 @@ spawn-runner-and-wait() # wait on the expected id local -ri nextid=$(( maxid + 1 )) - wait-for-runner "$root" "$nextid" + wait-for-runner "$root/$nextid" "$nextid" echo "$nextid" } @@ -314,6 +314,33 @@ verify-runner() } +# Request a runner reload and wait for the reload to complete. +# +# Before requesting the reload, a `reloading` flag will be set on the +# runner. This flag is expected to be cleared by tamed once the runner has +# been restarted in response to SIGHUP. +# +# See `wait-for-runner` for more information on waiting for the flag. +reload-runner-and-wait() { + local -r base="${1?Missing base}" + local -ri id="${2?Missing id}" + local -ri pid="${3?Missing pid}" + + # mark the runner as unavailable + touch "$base/reloading" + + # issue a reload request to the runner + kill -HUP "$pid" + + # we must not continue before the runner has been reloaded, otherwise we + # may issue new requests to a process that is being killed + wait-for-runner "$base" "$id" || { + echo "error: failed to reload runner $id" >&2 + exit "$EX_STALLED" + } +} + + # Wait for command acknowledgment from runner # # The runner must respond within TAME_CMD_WAITTIME seconds @@ -336,9 +363,12 @@ verify-runner-ack() # Assumes that the runner is ready once the pidfile becomes # available. Polls for a maximum of six seconds before giving up # and exiting with a non-zero status. +# +# This will also wait for the clearing of the `reloading` flag set by +# `reload-runner-and-wait`, as part of that same time. wait-for-runner() { - local -r root=${1?Missing root} + local -r base=${1?Missing base} local -r id=${2?Missing runner id} # we could use inotify, but that is not installed by default @@ -346,7 +376,10 @@ wait-for-runner() # another dependency (give up after 6 seconds) local -i i=12 while test $((i--)); do - test ! -f "$root/$id/pid" || return 0 + if [ -f "$base/pid" -a ! -f "$base/reloading" ]; then + return 0 + fi + sleep 0.5 done @@ -416,7 +449,7 @@ _start-tamed() # wait for tamed even if it was already started (just in # case this script was executed right after tamed started # but before it is done initializing) - wait-for-runner "$root" 0 + wait-for-runner "$root/0" 0 } diff --git a/bin/tamed b/bin/tamed index 05893134..c744e740 100755 --- a/bin/tamed +++ b/bin/tamed @@ -129,16 +129,33 @@ spawn-runner() # monitor runner usage and kill when inactive stall-monitor "$base" & - # loop to restart runner in case of crash - while true; do - declare -i job=0 - trap 'kill -INT $job' HUP - "$mypath/dslc" < "$base/0" &> "$base/1" & job=$! - declare -i status=0 - wait -n 2>/dev/null || status=$? - echo "warning: runner $id exited with code $status; restarting" >&2 - done & + # loop to restart runner in case of crash + ( + declare -i job=0 + trap force-job-reload HUP + + force-job-reload() { + kill -9 $job + } + + while true; do + # store the time that the runner was started so that we can later + # determine if it should be restarted to forcefully reclaim memory + date +%s > "$base/created-ts" + + "$mypath/dslc" < "$base/0" &> "$base/1" & job=$! + + # this flag is set by the `tame` client so that it knows when the + # runner becomes available + rm -f "$base/reloading" + + declare -i status=0 + wait "$job" 2>/dev/null || status=$? + + echo "warning: runner $id exited with code $status (pid $job); restarting" >&2 + done + ) & echo "$!" > "$base/pid"