Prevent hanging tame client when tamed runner is killed mid-process

It's a tad embarrassing that this has been eluding me for quite some time. I happened to run into it while testing the previous commit, which in turn only existed because I was trying to optimize runner performance. We'd have situations where, following a runner reload (exit code 129 = SIGHUP), the build would simply hang indefinitely. Apparently, `tame`, in `command-runner`, blocks on a `read` without a timeout, expecting that the FIFO attached to stdin will close if the runner crashes. Maybe that used to be the case, but that is no longer true today. Because of that, the FIFO stays open, and read continues to block, waiting for `DONE`. Now, `tamed`, when seeing that a runner has crashed (which could have been due to a reload), will check to see if that runner is marked busy. If so, that means that the client `tame` did not see `DONE`, because it did not clear the flag via `command-runner`'s `mark-available.` To notify the client that someone went wrong, `tamed` will inject a `DONE` into the output FIFO, which will allow the client to fail properly. `dslc` catches exceptions and should output `DONE` under normal operating conditions. However, since some of our systems require so much memory to build, we may encounter the OOM killer. In that case, the process has no time to recover (it is killed with SIGKILL), and therefore cannot output `DONE`. I suspect this is what has been happening to cause occasional build hangs. One final thing to clean this up: since we're properly handling reloads now, based on this commit and the immediately preceding one, we can suppress the warning when the code is 129 (see comments). DEV-10806
2023-10-03 14:05:46 -04:00 · 2023-10-03 14:05:46 -04:00 · fb5947d59e
parent 7c9d6837fe
commit fb5947d59e
2 changed files with 59 additions and 8 deletions
--- a/bin/tame
+++ b/bin/tame
@ -22,13 +22,14 @@ set -euo pipefail
 declare mypath; mypath=$( dirname "$( readlink -f "$0" )" )
 readonly mypath

-declare -ri EX_NOTAMED=1  # tried to start tamed or runner but failed
-declare -ri EX_STALLED=2  # runner stalled and could not recover
-declare -ri EX_NORUN=3    # no available runners
-declare -ri EX_DLOCK=4    # failed to get a lock to start tamed
-declare -ri EX_BLOCK=5    # failed to get a lock for busy runner check
-declare -ri EX_NODONE=6   # tamed did not provide a DONE with exit code
-declare -ri EX_USAGE=64   # incorrect usage; sysexits.h
+declare -ri EX_NOTAMED=1    # tried to start tamed or runner but failed
+declare -ri EX_STALLED=2    # runner stalled and could not recover
+declare -ri EX_NORUN=3      # no available runners
+declare -ri EX_DLOCK=4      # failed to get a lock to start tamed
+declare -ri EX_BLOCK=5      # failed to get a lock for busy runner check
+declare -ri EX_NODONE=6     # tamed did not provide a DONE with exit code
+declare -ri EX_UNEXPECTED=7 # runner terminated unexpectedly (see `tamed`)
+declare -ri EX_USAGE=64     # incorrect usage; sysexits.h

 # maximum amount of time in seconds to wait for runner to ack
 # before forcibly restarting it
--- a/bin/tamed
+++ b/bin/tamed
@ -140,6 +140,15 @@ spawn-runner()
    }

    while true; do
+      # if this runner is busy, then it must have terminated while
+      # processing (otherwise the client `tame` would have marked it as
+      # available); let's act on its behalf so that the client sees that we
+      # failed (which we'll represent with error code 2).
+      declare -i busy=$(< "$base/busy")
+      if runner-is-busy "$base"; then
+        inject-runner-unexpected-exit "$base" "$id"
+      fi
+
      # store the time that the runner was started so that we can later
      # determine if it should be restarted to forcefully reclaim memory
      date +%s > "$base/created-ts"
@ -153,7 +162,11 @@ spawn-runner()
      declare -i status=0
      wait "$job" 2>/dev/null || status=$?

-      echo "warning: runner $id exited with code $status (pid $job); restarting" >&2
+      # 129 = signal (128) + HUP (1), which is an explicit reload request
+      # that we need not report
+      if [ "$status" -ne 129 ]; then
+        echo "warning: runner $id exited with code $status (pid $job); restarting" >&2
+      fi
    done
  ) &

@ -166,6 +179,43 @@ spawn-runner()
 }


+# Whether the runner at the provided base is busy
+runner-is-busy() {
+  local -r base="$root/$id"
+
+  declare -i busy=$(< "$base/busy")
+  test "$busy" -eq 1
+}
+
+
+# Inject an exit code into the runner's output stream indicating an
+# unexpected exit
+#
+# The string `DONE n` is normally output at the end of a runner's
+# compilation (via `dslc`), where `n` is the exit code.  But if the runner
+# terminates before compilation completes (e.g. is OOM-killed), then it will
+# never have the chance to do so, leaving the client waiting for a
+# response.  If the client is not checking for stalls (due to
+# configuration), it may hang indefinitely.
+#
+# This function will inject a message into the output stream of the runner
+# as if `dslc` itself replied so that the `tame` client can observe a
+# failure and react accordingly.  This uses the `tame` `EX_UNEXPECTED` exit
+# code.
+#
+# This also outputs a warning to stderr.
+inject-runner-unexpected-exit() {
+  local -r base="${1?Missing base}"
+  local -ri id="${2?Missing id}"
+
+  echo "warning: runner $id exited unexpectedly" >&2
+
+  # TODO: Worth a shared file with `tame`?
+  local -ri EX_UNEXPECTED=7
+  echo "DONE $EX_UNEXPECTED" > "$base/1"
+}
+
+
 # Monitor the given runner runtab and append to the aggregate runtab
 #
 # The aggregate runtab is append-only and has a row-level lock to support