Improve runner reload stability

The `tame` client has the ability to request a runner reload by issuing SIGHUP to the runner PID. This will cause `tamed` to kill the runner and respawn it. There were situations where this process hung or did not operate as expected; the process was not reliable. This does a number of things to make sure the process is in a good state before proceeding: 1. The HUP trap is set a single time, rather than being reset each time the signal is received for the runner. 2. A `reloading` flag is set by the `tame` client when reloading is requested, and the flag is cleared by `tamed` when spawning a new runner. This allows the client to wait until the reload is complete, and bail out otherwise. Without this, there is a race, where the client may proceed to issue additional requests before the original runner terminates. 3. Kill the runner with SIGKILL (signal 9). This gives no opportunity for the runner to ignore the signal, and gives us confidence that the runner is actually going to be killed. This may have caused errors that look like this (where 129 is the HUP reload, which is expected): warning: failed runner 0 ack; requesting reload warning: runner 0 exited with code 129; restarting error: runner 0 still unresponsive; giving up The last line may also be omitted, with instead an empty `xmlo` being generated. DEV-10806
2023-10-03 13:31:36 -04:00 · 2023-10-03 13:31:36 -04:00 · 7c9d6837fe
parent b8a36ec984
commit 7c9d6837fe
2 changed files with 64 additions and 14 deletions
--- a/bin/tame
+++ b/bin/tame
@ -93,7 +93,7 @@ command-runner()
  # if not, then it may have stalled for some reason
  verify-runner-ack "$*" < "$base/1" || {
    echo "warning: failed runner $id ack; requesting reload" >&2
-    kill -HUP "$pid"
+    reload-runner-and-wait "$base" "$id" "$pid"

    # give some extra time in case the host is under high load
    sleep "$TAME_CMD_WAITTIME"
@ -213,7 +213,7 @@ spawn-runner-and-wait()

  # wait on the expected id
  local -ri nextid=$(( maxid + 1 ))
-  wait-for-runner "$root" "$nextid"
+  wait-for-runner "$root/$nextid" "$nextid"

  echo "$nextid"
 }
@ -314,6 +314,33 @@ verify-runner()
 }


+# Request a runner reload and wait for the reload to complete.
+#
+# Before requesting the reload, a `reloading` flag will be set on the
+# runner.  This flag is expected to be cleared by tamed once the runner has
+# been restarted in response to SIGHUP.
+#
+# See `wait-for-runner` for more information on waiting for the flag.
+reload-runner-and-wait() {
+  local -r base="${1?Missing base}"
+  local -ri id="${2?Missing id}"
+  local -ri pid="${3?Missing pid}"
+
+  # mark the runner as unavailable
+  touch "$base/reloading"
+
+  # issue a reload request to the runner
+  kill -HUP "$pid"
+
+  # we must not continue before the runner has been reloaded, otherwise we
+  # may issue new requests to a process that is being killed
+  wait-for-runner "$base" "$id" || {
+    echo "error: failed to reload runner $id" >&2
+    exit "$EX_STALLED"
+  }
+}
+
+
 # Wait for command acknowledgment from runner
 #
 # The runner must respond within TAME_CMD_WAITTIME seconds
@ -336,9 +363,12 @@ verify-runner-ack()
 # Assumes that the runner is ready once the pidfile becomes
 # available.  Polls for a maximum of six seconds before giving up
 # and exiting with a non-zero status.
+#
+# This will also wait for the clearing of the `reloading` flag set by
+# `reload-runner-and-wait`, as part of that same time.
 wait-for-runner()
 {
-  local -r root=${1?Missing root}
+  local -r base=${1?Missing base}
  local -r id=${2?Missing runner id}

  # we could use inotify, but that is not installed by default
@ -346,7 +376,10 @@ wait-for-runner()
  # another dependency (give up after 6 seconds)
  local -i i=12
  while test $((i--)); do
-    test ! -f "$root/$id/pid" || return 0
+    if [ -f "$base/pid" -a ! -f "$base/reloading" ]; then
+      return 0
+    fi
+
    sleep 0.5
  done

@ -416,7 +449,7 @@ _start-tamed()
  # wait for tamed even if it was already started (just in
  # case this script was executed right after tamed started
  # but before it is done initializing)
-  wait-for-runner "$root" 0
+  wait-for-runner "$root/0" 0
 }


--- a/bin/tamed
+++ b/bin/tamed
@ -129,16 +129,33 @@ spawn-runner()
  # monitor runner usage and kill when inactive
  stall-monitor "$base" &

+
  # loop to restart runner in case of crash
-  while true; do
+  (
    declare -i job=0
-    trap 'kill -INT $job' HUP
+    trap force-job-reload HUP
+
+    force-job-reload() {
+      kill -9 $job
+    }
+
+    while true; do
+      # store the time that the runner was started so that we can later
+      # determine if it should be restarted to forcefully reclaim memory
+      date +%s > "$base/created-ts"
+
      "$mypath/dslc" < "$base/0" &> "$base/1" & job=$!

+      # this flag is set by the `tame` client so that it knows when the
+      # runner becomes available
+      rm -f "$base/reloading"
+
      declare -i status=0
-    wait -n 2>/dev/null || status=$?
-    echo "warning: runner $id exited with code $status; restarting" >&2
-  done &
+      wait "$job" 2>/dev/null || status=$?
+
+      echo "warning: runner $id exited with code $status (pid $job); restarting" >&2
+    done
+  ) &

  echo "$!" > "$base/pid"