From 7c9d6837fe18e444c33bfa855613d31460ccb051 Mon Sep 17 00:00:00 2001
From: Mike Gerwitz <mike.gerwitz@ryansg.com>
Date: Tue, 3 Oct 2023 13:31:36 -0400
Subject: [PATCH] Improve runner reload stability

The `tame` client has the ability to request a runner reload by issuing
SIGHUP to the runner PID.  This will cause `tamed` to kill the runner and
respawn it.

There were situations where this process hung or did not operate as
expected; the process was not reliable.  This does a number of things to
make sure the process is in a good state before proceeding:

  1. The HUP trap is set a single time, rather than being reset each time
     the signal is received for the runner.
  2. A `reloading` flag is set by the `tame` client when reloading is
     requested, and the flag is cleared by `tamed` when spawning a new
     runner.  This allows the client to wait until the reload is complete,
     and bail out otherwise.  Without this, there is a race, where the
     client may proceed to issue additional requests before the original
     runner terminates.
  3. Kill the runner with SIGKILL (signal 9).  This gives no opportunity for
     the runner to ignore the signal, and gives us confidence that the
     runner is actually going to be killed.

This may have caused errors that look like this (where 129 is the HUP
reload, which is expected):

  warning: failed runner 0 ack; requesting reload
  warning: runner 0 exited with code 129; restarting
  error: runner 0 still unresponsive; giving up

The last line may also be omitted, with instead an empty `xmlo` being
generated.

DEV-10806
---
 bin/tame  | 43 ++++++++++++++++++++++++++++++++++++++-----
 bin/tamed | 35 ++++++++++++++++++++++++++---------
 2 files changed, 64 insertions(+), 14 deletions(-)

diff --git a/bin/tame b/bin/tame
index 2e7bf42c..bd98d979 100755
--- a/bin/tame
+++ b/bin/tame
@@ -93,7 +93,7 @@ command-runner()
   # if not, then it may have stalled for some reason
   verify-runner-ack "$*" < "$base/1" || {
     echo "warning: failed runner $id ack; requesting reload" >&2
-    kill -HUP "$pid"
+    reload-runner-and-wait "$base" "$id" "$pid"
 
     # give some extra time in case the host is under high load
     sleep "$TAME_CMD_WAITTIME"
@@ -213,7 +213,7 @@ spawn-runner-and-wait()
 
   # wait on the expected id
   local -ri nextid=$(( maxid + 1 ))
-  wait-for-runner "$root" "$nextid"
+  wait-for-runner "$root/$nextid" "$nextid"
 
   echo "$nextid"
 }
@@ -314,6 +314,33 @@ verify-runner()
 }
 
 
+# Request a runner reload and wait for the reload to complete.
+#
+# Before requesting the reload, a `reloading` flag will be set on the
+# runner.  This flag is expected to be cleared by tamed once the runner has
+# been restarted in response to SIGHUP.
+#
+# See `wait-for-runner` for more information on waiting for the flag.
+reload-runner-and-wait() {
+  local -r base="${1?Missing base}"
+  local -ri id="${2?Missing id}"
+  local -ri pid="${3?Missing pid}"
+
+  # mark the runner as unavailable
+  touch "$base/reloading"
+
+  # issue a reload request to the runner
+  kill -HUP "$pid"
+
+  # we must not continue before the runner has been reloaded, otherwise we
+  # may issue new requests to a process that is being killed
+  wait-for-runner "$base" "$id" || {
+    echo "error: failed to reload runner $id" >&2
+    exit "$EX_STALLED"
+  }
+}
+
+
 # Wait for command acknowledgment from runner
 #
 # The runner must respond within TAME_CMD_WAITTIME seconds
@@ -336,9 +363,12 @@ verify-runner-ack()
 # Assumes that the runner is ready once the pidfile becomes
 # available.  Polls for a maximum of six seconds before giving up
 # and exiting with a non-zero status.
+#
+# This will also wait for the clearing of the `reloading` flag set by
+# `reload-runner-and-wait`, as part of that same time.
 wait-for-runner()
 {
-  local -r root=${1?Missing root}
+  local -r base=${1?Missing base}
   local -r id=${2?Missing runner id}
 
   # we could use inotify, but that is not installed by default
@@ -346,7 +376,10 @@ wait-for-runner()
   # another dependency (give up after 6 seconds)
   local -i i=12
   while test $((i--)); do
-    test ! -f "$root/$id/pid" || return 0
+    if [ -f "$base/pid" -a ! -f "$base/reloading" ]; then
+      return 0
+    fi
+
     sleep 0.5
   done
 
@@ -416,7 +449,7 @@ _start-tamed()
   # wait for tamed even if it was already started (just in
   # case this script was executed right after tamed started
   # but before it is done initializing)
-  wait-for-runner "$root" 0
+  wait-for-runner "$root/0" 0
 }
 
 
diff --git a/bin/tamed b/bin/tamed
index 05893134..c744e740 100755
--- a/bin/tamed
+++ b/bin/tamed
@@ -129,16 +129,33 @@ spawn-runner()
   # monitor runner usage and kill when inactive
   stall-monitor "$base" &
 
-  # loop to restart runner in case of crash
-  while true; do
-    declare -i job=0
-    trap 'kill -INT $job' HUP
-    "$mypath/dslc" < "$base/0" &> "$base/1" & job=$!
 
-    declare -i status=0
-    wait -n 2>/dev/null || status=$?
-    echo "warning: runner $id exited with code $status; restarting" >&2
-  done &
+  # loop to restart runner in case of crash
+  (
+    declare -i job=0
+    trap force-job-reload HUP
+
+    force-job-reload() {
+      kill -9 $job
+    }
+
+    while true; do
+      # store the time that the runner was started so that we can later
+      # determine if it should be restarted to forcefully reclaim memory
+      date +%s > "$base/created-ts"
+
+      "$mypath/dslc" < "$base/0" &> "$base/1" & job=$!
+
+      # this flag is set by the `tame` client so that it knows when the
+      # runner becomes available
+      rm -f "$base/reloading"
+
+      declare -i status=0
+      wait "$job" 2>/dev/null || status=$?
+
+      echo "warning: runner $id exited with code $status (pid $job); restarting" >&2
+    done
+  ) &
 
   echo "$!" > "$base/pid"