tamed: Add runtab and TAMED_RUNTAB_OUT

This provides logging that can be used to analyze jobs. See `tamed --help` for some examples. More to come. You'll notice that one of the examples represents package build time in _minutes_. This is why TAMER is necessary; as of the time of writing, the longest-building package is nearly five and a half minutes, and there are a number of packages that take a minute or more. But, there are potentially other optimizations that can be done. And this is _after_ many rounds of optimizations over the years. (TAME was not originally built for what it is currently being used for.)
2022-01-19 16:47:12 -05:00 · 2022-01-19 16:47:12 -05:00 · 6fd570477a
parent 4a3b86f480
commit 6fd570477a
4 changed files with 148 additions and 6 deletions
--- a/RELEASES.md
+++ b/RELEASES.md
@ -22,6 +22,11 @@ Compiler
 --------
 - Input mappings will no longer emit the destination param as a dependency.
 - `tamed --report` and `TAMED_TUI` for analyzing build performance.
+- Runners now store start time and duration for each command, available in
+  the runpath for reporting.
+- `TAMED_RUNTAB_OUT`, if set, will aggregate all runners' runtabs into a
+  single file as jobs are completed.  See `tamed --help` for more
+  information and examples.

 Documentation
 -------------
--- a/bin/tame
+++ b/bin/tame
@ -83,7 +83,7 @@ command-runner()

  # log the provided command line and starting time so that we can determine
  # what is currently being compiled and how long it is taking
-  date +%s > "$base/cmdstart"
+  millis > "$base/cmdstart"
  echo "$*" > "$base/cmdline"

  # all remaining arguments are passed to the runner
@ -113,7 +113,9 @@ command-runner()
    if [ "${line:0:5}" == "DONE " ]; then
      read -r _ code _ <<< "$line"

+      runtab-append "$base"
      mark-available "$base"
+
      return "$code"
    fi

@ -237,7 +239,56 @@ mark-available()
  echo idle > "$base/cmdline"

  # this can be used to determine how long the worker has been idle
-  date +%s > "$base/cmdstart"
+  millis > "$base/cmdstart"
+}
+
+
+# Output the current time as two space-delimited integers: whole seconds
+# since the Unix epoch, followed by the millisecond portion of the current
+# second
+millis()
+{
+  local secs nanos
+
+  # a single `date' invocation so that both fields describe the same instant
+  read -r secs nanos <<< "$(date '+%s %N')"
+
+  # %N returns nanoseconds and may be 0-prefixed; the explicit base keeps
+  # the shell from interpreting such a value as octal
+  echo "$secs" "$(( 10#$nanos / 1000000 ))"
+}
+
+
+# Record the most recently executed command in the runner table (runtab)
+#
+# The command line and start time are read from `cmdline' and `cmdstart'
+# under the runner base path, the current time is taken as the end of the
+# job, and a single row is appended to `runtab' under that same path.  Call
+# this once a job has finished, but before `mark-available', since the
+# latter overwrites both `cmdline' and `cmdstart'.
+#
+# Each row is tab-delimited and consists of:
+#   1. Start date (Unix timestamp, seconds);
+#   2. Duration (milliseconds); and
+#   3. Runner command line
+runtab-append()
+{
+  local -r base=${1?Missing runner base path}
+
+  local cmd started_ms finished_ms
+  local -a started finished
+
+  cmd=$(< "$base/cmdline")
+  started=( $(< "$base/cmdstart") )
+  finished=( $(millis) )
+
+  # both timestamps are (seconds, milliseconds) pairs as produced by
+  # `millis'; collapse each into a single millisecond count so that the
+  # duration is a plain subtraction
+  started_ms=$(( (started[0] * 1000) + started[1] ))
+  finished_ms=$(( (finished[0] * 1000) + finished[1] ))
+
+  # the start column is seconds only; the duration is in milliseconds
+  printf "%d\t%s\t%s\n" \
+    "${started[0]}" "$(( finished_ms - started_ms ))" "$cmd" \
+    >> "$base/runtab"
 }


--- a/bin/tamed
+++ b/bin/tamed
@ -24,6 +24,8 @@ readonly mypath

 declare -ri EX_RUNNING=1
 declare -ri EX_NOTRUNNING=2  # tamed is not running
+declare -ri EX_RUNTAB_LOCK=3 # failed to acquire aggregate runtab lock
+declare -ri EX_RUNTAB_OUT=4  # failed to write to aggregate runtab
 declare -ri EX_USAGE=64      # incorrect usage; sysexits.h
 declare -ri EX_CANTCREAT=73  # cannot create file; sysexits.h

@ -45,6 +47,9 @@ declare root=
 declare -r TAMED_TUI="${TAMED_TUI:-}"
 declare tui_mode=

+# file into which aggregate runner report will be placed (none if empty)
+declare -r TAMED_RUNTAB_OUT="${TAMED_RUNTAB_OUT:-}"
+

 # Create FIFOs for runner
 #
@ -117,6 +122,10 @@ spawn-runner()
  # flag as available (the client will manipulate these)
  echo 0 > "$base/busy"

+  # runtab is used for reporting, which we will optionally aggregate
+  > "$base/runtab"
+  monitor-runner-runtab "$root" "$base/runtab" &
+
  # monitor runner usage and kill when inactive
  stall-monitor "$base" &

@ -140,6 +149,55 @@ spawn-runner()
 }


+# Monitor the given runner runtab and append to the aggregate runtab
+#
+# Nothing is done unless TAMED_RUNTAB_OUT is non-empty.  Otherwise, every
+# row appended to the runner's runtab is copied verbatim onto the end of
+# the file named by TAMED_RUNTAB_OUT.
+#
+# The aggregate runtab is append-only and has a row-level lock to support
+# concurrent writes without having to rely on kernel buffering.
+monitor-runner-runtab()
+{
+  local -r root="${1?Missing root run path}"
+  local -r runtab="${2?Missing runtab path}"
+
+  # seconds to wait for the aggregate runtab lock before giving up on a row
+  local -ri timeout=3
+  local row
+
+  # no use in aggregating if it was not requested
+  test -n "$TAMED_RUNTAB_OUT" || return 0
+
+  while ! spawner-dead; do
+    # this is a shared file, and while buffering _should_ be sufficient, we
+    # may as well avoid potential headaches entirely by locking during the
+    # operation
+    #
+    # IFS is emptied for the read so that rows are copied byte-for-byte; with
+    # the default IFS a row ending in an empty column would lose its trailing
+    # tab and with it a column of the table
+    tail -f "$runtab" | while IFS= read -r row; do
+      # we want to lock _per row write_, since output will be interleaved
+      # between all the runners
+      (
+        # this only leaves the subshell: the row is dropped and monitoring
+        # continues with the next one
+        flock -w "$timeout" 7 || {
+          echo "error: failed to acquire lock on aggregate runtab" >&2
+          exit $EX_RUNTAB_LOCK
+        }
+
+        printf '%s\n' "$row" >&7
+      ) 7>> "$TAMED_RUNTAB_OUT"
+    done
+  done
+}
+
+
+# Check that we can write to the provided runtab, and clear it
+#
+# Nothing is done unless TAMED_RUNTAB_OUT is non-empty.  If the file cannot
+# be written to, an error is output on stderr and the process exits with
+# EX_RUNTAB_OUT; otherwise the file is left empty and a notice naming it is
+# output on stdout.
+runtab-check-and-clear()
+{
+  test -n "$TAMED_RUNTAB_OUT" || return 0
+
+  # clear the runtab, and see if we can write to it
+  >"$TAMED_RUNTAB_OUT" || {
+    echo "error: unable to write to '$TAMED_RUNTAB_OUT' (TAMED_RUNTAB_OUT)" \
+      >&2
+    exit $EX_RUNTAB_OUT
+  }
+
+  echo "tamed: aggregating runner runtabs into '$TAMED_RUNTAB_OUT'"
+}
+
+
 # Kill runner at BASE when it becomes inactive for TAMED_STALL_SECONDS
 # seconds
 #
@ -248,12 +306,15 @@ abort-if-not-running()
 # runners; the target tamed is expected to clean up itself.
 # Consequently, if a tamed terminated abnormally without
 # cleaning up, this will not solve that problem.
+#
+# Note that this is also called by tame to clean up an old tamed
+# before spawning a new one.
 kill-running()
 {
  local -r root="${1?Missing root}"

-  abort-if-not-running "$root" 2>/dev/null
-  local -r pid=$( cat "$root"/pid 2>/dev/null )
+  test -d "$root" || return 0
+  local -r pid=$( < "$root"/pid 2>/dev/null )

  test -n "$pid" || return 0

@ -301,14 +362,17 @@ runner-report()
  test -f "$path/cmdline" || return 0

  local cmdline=$(< "$path/cmdline" )
-  local cmdstart=$(< "$path/cmdstart" )
+  local -a cmdstart cmdstart_fmt
+
+  cmdstart=( $(< "$path/cmdstart" ) )
+  cmdstart_fmt=$(date --date=@"${cmdstart[0]}" +%Y-%m-%dT%H:%M:%S)

  local -i now=$(date +%s)

  cat <<EOF
 runner:  $id
 command: $cmdline
-start:   $cmdstart ($(date +%Y-%m-%dT%H:%M:%S <<< "$cmdstart"))
+start:   ${cmdstart[0]}.${cmdstart[1]} ($cmdstart_fmt)
 elapsed: $((now - cmdstart)) seconds
 EOF
 }
@ -430,6 +494,23 @@ to the process id of make itself.  It defaults to the actual
 parent process id (PPID), so tamed will not kill itself if
 run manually on a shell (unless the shell exits first).

+TAMED_RUNTAB_OUT can specify a file in which to write job
+start times (as seconds from the Unix epoch); durations
+(in milliseconds); and commands from each of the runners.
+The table is tab-delimited.  Here are some useful examples:
+
+  # format nicely into columns and view in pager
+  $ column runtab | less
+
+  # sort by runtime descending (second column)
+  $ sort -rnk2 runtab
+
+  # take the runtime and command columns
+  $ cut -f2,3 runtab
+
+  # convert milliseconds into minutes (!) and sort desc
+  $ awk '{ $2 = $2 / 1000 / 60; print }' runtab | sort -nrk2
+
 Options:
  --help    show this message
  --kill    kill a runing tamed at path RUNPATH
@ -444,6 +525,9 @@ Environment Variables:
  TAMED_JAVA_OPTS       opts to pass to dslc, and in turn, the JVM
  TAMED_TUI             run in TUI mode (provide UI features like a
                          dynamic runner status line)
+  TAMED_RUNTAB_OUT      file into which aggregate runner report will
+                          be written (otherwise reports are only
+                          available per-runner while tamed is running)
 EOF

  exit $EX_USAGE
@ -550,6 +634,7 @@ main()

  abort-if-running "$root"
  tui-check
+  runtab-check-and-clear

  # clean up background processes before we exit
  trap exit TERM
--- a/build-aux/Makefile.am
+++ b/build-aux/Makefile.am
@ -115,6 +115,7 @@ export TAME_CMD_WAITTIME
 export TAMED_STALL_SECONDS
 export TAMED_JAVA_OPTS
 export TAMED_TUI
+export TAMED_RUNTAB_OUT

 TAMED_SPAWNER_PID=$(shell echo $$PPID)
 export TAMED_SPAWNER_PID