tame/bin/tamed

#!/bin/bash
# Daemon for accepting TAME commands (compilers, linker, etc)
#
#   Copyright (C) 2014-2023 Ryan Specialty, LLC.
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.
#
#   You should have received a copy of the GNU General Public License
#   along with this program.  If not, see <http://www.gnu.org/licenses/>.
##

# strict mode: abort on unhandled command failure, on expansion of an unset
# variable, and when any stage of a pipeline fails
set -euo pipefail

# directory containing this script (symlinks resolved); used to locate the
# `dslc' executable that each runner executes
declare mypath; mypath=$( dirname "$( readlink -f "$0" )" )
readonly mypath

# exit statuses
declare -ri EX_RUNNING=1     # tamed is already running
declare -ri EX_NOTRUNNING=2  # tamed is not running
declare -ri EX_RUNTAB_LOCK=3 # failed to acquire aggregate runtab lock
declare -ri EX_RUNTAB_OUT=4  # failed to write to aggregate runtab
declare -ri EX_USAGE=64      # incorrect usage; sysexits.h
declare -ri EX_CANTCREAT=73  # cannot create file; sysexits.h

# number of seconds of output silence before runners are considered unused
# and are subject to termination (see stall-monitor)
declare -ri TAMED_STALL_SECONDS="${TAMED_STALL_SECONDS:-1}"

# id of process that indirectly spawned tamed (default $PPID)
declare -ri TAMED_SPAWNER_PID="${TAMED_SPAWNER_PID:-$PPID}"

# options to pass to JVM via dslc; handed over through the exported
# JAVA_OPTS environment variable
declare -r TAMED_JAVA_OPTS="${TAMED_JAVA_OPTS:-}"
export JAVA_OPTS="$TAMED_JAVA_OPTS"

# run path; set by `main', global so that `cleanup' and
# `tui-runner-status-line' can read it
declare root=

# TAMED_TUI=1 requests TUI (terminal UI) mode; `tui_mode' is set non-empty
# by `tui-check' once the mode is enabled (test with `in-tui-mode')
declare -r TAMED_TUI="${TAMED_TUI:-}"
declare tui_mode=

# file into which aggregate runner report will be placed (none if empty)
declare -r TAMED_RUNTAB_OUT="${TAMED_RUNTAB_OUT:-}"


# Create FIFOs for runner
#
# Two FIFOs named `0' and `1' are created within the given root path ROOT,
# which is itself created if it does not yet exist.  They are intended to
# be attached to the stdin and stdout of the runner respectively; there is
# no separate FIFO for stderr (see `spawn-runner').
#
# Anything already present at either FIFO path is removed first, so that
# leftovers from a previous run cannot cause `mkfifo' to fail.
#
# If a FIFO cannot be created, exit with EX_CANTCREAT.
mkfifos()
{
  local -r root="${1?Missing root path}"
  local n

  mkdir -p "$root"

  for n in 0 1; do
    # clear out any stale FIFO (or other file) at this exact path
    rm -f "$root/$n"

    mkfifo -m 0600 "$root/$n" || {
      log "fatal: failed to create FIFO at $root/$n" >&2
      exit $EX_CANTCREAT
    }
  done

  # hold a writer open on the input FIFO for as long as tamed lives, so
  # that its reader does not see EOF whenever another writer closes it
  tail -f >"$root/0" &
}


# Write a line of output through `echo'
#
# All arguments are passed to `echo' untouched, so callers may supply echo
# options such as `-n'.  When in TUI mode, an erase-line escape sequence is
# emitted first so that whatever is on the current line (such as the runner
# status line) does not linger around the new output.
log()
{
  # only a TUI has a status line that needs wiping
  ! in-tui-mode || echo -en "\e[2K"

  echo "$@"
}


# Spawn one additional runner under run path ROOT
#
# The new runner's id is one greater than the id currently recorded in
# ROOT/maxid.  See `spawn-runner' for what spawning entails.
spawn-next-runner()
{
  local -r root="${1?Missing root path}"

  # one past the largest id handed out so far
  local -ri next=$(( $( < "$root/maxid" ) + 1 ))

  spawn-runner "$next" "$root"
}


# Spawn a runner
#
# A new runner is created by spawning dslc and attaching
# new FIFOs under the given id ID relative to the given
# run path ROOT.  The PID of the runner will be stored
# alongside the FIFOs in a pidfile `pid'.
#
# Note that the PID recorded is that of the background loop that
# (re)starts dslc, not of dslc itself.  Besides the FIFOs and pidfile, this
# creates `busy' (initialized to 0) and an empty `runtab' under ROOT/ID,
# starts `monitor-runner-runtab' and `stall-monitor' in the background, and
# records ID in ROOT/maxid.
spawn-runner()
{
  local -ri id="${1?Missing id}"
  local -r root="${2?Missing root run path}"

  # everything belonging to this runner lives under ROOT/ID
  local -r base="$root/$id"

  mkfifos "$base"

  # flag as available (the client will manipulate these)
  echo 0 > "$base/busy"

  # runtab is used for reporting, which we will optionally aggregate
  > "$base/runtab"
  monitor-runner-runtab "$root" "$base/runtab" &

  # monitor runner usage and kill when inactive
  stall-monitor "$base" &

  # loop to restart runner in case of crash
  while true; do
    declare -i job=0

    # a HUP delivered to this loop is forwarded to the current dslc as INT
    trap 'kill -INT $job' HUP

    # stdin comes from FIFO `0'; stdout and stderr both go to FIFO `1'
    "$mypath/dslc" < "$base/0" &> "$base/1" & job=$!

    # capture the status via `||' so that errexit does not end the loop
    declare -i status=0
    wait -n 2>/dev/null || status=$?

    # emitted each time the wait returns, whatever the status was
    echo "warning: runner $id exited with code $status; restarting" >&2
  done &

  # $! is the restart loop above, which is what gets recorded and logged
  echo "$!" > "$base/pid"

  # we assume that this is the new largest runner id
  echo "$id" > "$root/maxid"

  log "runner $id ($!): $base"
}


# Monitor the given runner runtab and append to the aggregate runtab
#
# Each row that appears in the runner's runtab RUNTAB is appended, byte for
# byte, to the file named by TAMED_RUNTAB_OUT.  If TAMED_RUNTAB_OUT is
# empty then aggregation was not requested and this returns immediately
# with a zero status.
#
# The aggregate runtab is append-only and has a row-level lock to support
# concurrent writes without having to rely on kernel buffering.  If the
# lock cannot be acquired in time, the row's writer exits with
# EX_RUNTAB_LOCK.
#
# ROOT is required for interface compatibility but is not otherwise used
# here.
monitor-runner-runtab()
{
  local -r root="${1?Missing root run path}"
  local -r runtab="${2?Missing runtab path}"

  # seconds to wait for the aggregate runtab lock before giving up on a row
  local -ri lock_timeout=3

  local row

  # no use in aggregating if it was not requested
  test -n "$TAMED_RUNTAB_OUT" || return 0

  while ! spawner-dead; do
    # this is a shared file, and while buffering _should_ be sufficient, we
    # may as well avoid potential headaches entirely by locking during the
    # operation
    #
    # IFS is emptied for the read so that leading and trailing tabs (empty
    # columns of the tab-delimited table) are preserved
    tail -f "$runtab" | while IFS= read -r row; do
        # we want to lock _per row write_, since output will be interleaved
        # between all the runners
        (
          flock -w "$lock_timeout" 7 || {
              echo "error: failed to acquire lock on aggregate runtab" >&2
              exit $EX_RUNTAB_LOCK
          }

          # printf rather than echo: a row must never be taken as an option
          printf '%s\n' "$row" >&7
        ) 7>> "$TAMED_RUNTAB_OUT"
    done
  done
}


# Check that we can write to the provided runtab, and clear it
#
# Does nothing if TAMED_RUNTAB_OUT is empty.  Otherwise the file is
# truncated (created if missing); if that fails, an error is written to
# stderr and the process exits with EX_RUNTAB_OUT.  On success a notice
# naming the file is written to stdout.
runtab-check-and-clear()
{
  test -n "$TAMED_RUNTAB_OUT" || return 0

  # clear the runtab, and see if we can write to it (the truncation doubles
  # as the writability check)
  >"$TAMED_RUNTAB_OUT" || {
    echo "error: unable to write to '$TAMED_RUNTAB_OUT' (TAMED_RUNTAB_OUT)" >&2
    exit $EX_RUNTAB_OUT
  }

  echo "tamed: aggregating runner runtabs into '$TAMED_RUNTAB_OUT'"
}


# Kill runner at BASE when it becomes inactive for TAMED_STALL_SECONDS
# seconds
#
# This monitors the modification time on the stdout FIFO.  stdin does not
# need to be monitored since dslc immediately echoes back commands it
# receives.
#
# dslc is pretty chatty at the time of writing this, so TAMED_STALL_SECONDS
# can easily be <=30s even for large packages.  This may need to change in
# the future if it becomes too much less chatty.  Increase that environment
# variable if runners stall unexpectedly in the middle of builds.
#
# If the id of the spawning process has been provided then we will never
# consider ourselves to be stalled if that process is still running.  This
# prevents, for example, tamed from killing itself while a parent make
# process is still running.
#
# The process killed is the one whose id is stored in BASE/pid.  This
# function loops until it has killed that process, so it is expected to be
# run in the background.
stall-monitor()
{
  local -r base="${1?Missing base}"

  # monitor output FIFO modification time
  while true; do
    local -i since last

    # timestamp taken before sleeping, mtime read afterwards: an mtime later
    # than $since means something happened during the interval
    since=$( date +%s )
    sleep "$TAMED_STALL_SECONDS"
    last=$( stat -c%Y "$base/1" )

    # keep waiting if there has been activity since $since
    test "$last" -le "$since" || continue

    # never kill while the spawning process is still around
    spawner-dead || continue

    # no activity; kill
    local -r pid=$( cat "$base/pid" )
    kill "$pid"

    # NOTE(review): $pid is read from a pidfile and does not look like a
    # child of this process, so this `wait' may fail rather than block;
    # confirm before relying on its status
    wait "$pid" 2>/dev/null

    # this stall subprocess is no longer needed
    break
  done
}


# Determine whether the process that spawned tamed is gone
#
# Returns zero ("dead") when TAMED_SPAWNER_PID is not a positive process
# id, or when `ps' cannot find a process with that id.  Returns non-zero
# while that process is still running.
spawner-dead()
{
  # without a usable pid there is nothing to wait on
  if ! [ "$TAMED_SPAWNER_PID" -gt 0 ]; then
    return 0
  fi

  if ps "$TAMED_SPAWNER_PID" &>/dev/null; then
    return 1
  fi

  return 0
}


# Refuse to continue if a tamed is already running at path ROOT
#
# The process id is read from ROOT/pid; a missing or unreadable pidfile
# counts as "not running" and nothing is output.  If the recorded process
# is still alive, a fatal message is logged to stderr and the process
# exits with EX_RUNNING.  If it is not, the pidfile is stale and only a
# warning is logged to stderr.
abort-if-running()
{
  local -r root="${1?Missing root rundir}"

  # the integer attribute turns an empty read into 0
  local -ri recorded=$( cat "$root/pid" 2>/dev/null )

  if ! [ "$recorded" -gt 0 ]; then
    return 0
  fi

  if ps "$recorded" &>/dev/null; then
    log "fatal: tamed is already running at $root (pid $recorded)!" >&2
    exit $EX_RUNNING
  fi

  # a pid was recorded, but nothing is running under it anymore
  log "warning: clearing stale tamed (pid $recorded)" >&2
}


# Exit with EX_NOTRUNNING unless tamed is running at path ROOT
#
# ROOT must be an existing directory and ROOT/pid must name a process that
# `ps' can find; otherwise an explanation is logged to stderr before
# exiting.  A missing pidfile is read as process id 0.
abort-if-not-running()
{
  local -r root="${1?Missing root rundir}"

  if [ ! -d "$root" ]; then
    log "tamed is not running at $root: path does not exist" >&2
    exit $EX_NOTRUNNING
  fi

  local -ri recorded=$( cat "$root/pid" 2>/dev/null )

  # this should not happen unless bash crashed
  if ! ps "$recorded" &>/dev/null; then
    log "tamed is not running at $root: process $recorded has terminated" >&2
    exit $EX_NOTRUNNING
  fi
}


# Kill running tamed at path ROOT
#
# If ROOT does not exist, or no pid can be read from ROOT/pid, do nothing
# and return zero.  This sends a signal only to the parent tamed process,
# _not_ individual runners; the target tamed is expected to clean up
# itself.  Consequently, if a tamed terminated abnormally without cleaning
# up, this will not solve that problem.
#
# Note that this is also called by tame to clean up an old tamed
# before spawning a new one.
kill-running()
{
  local -r root="${1?Missing root}"

  test -d "$root" || return 0

  # read with `cat', as the other pidfile readers in this file do.
  # NOTE(review): the `$(< file)' shortcut is documented only for a lone
  # input redirection; combined with `2>/dev/null' it can expand to nothing
  # (apparently depending on the bash version), in which case no tamed
  # would ever be killed.
  local -r pid=$( cat "$root/pid" 2>/dev/null )

  test -n "$pid" || return 0

  log "killing tamed at $root ($pid)..."
  kill "$pid"
}


# Output a report covering every runner of the tamed at path ROOT
#
# `abort-if-not-running' is consulted first, so nothing is reported (and
# the process exits) when no tamed is running at ROOT.  Each runner is then
# reported on by `runner-report' via `for-each-runner'.
runner-report-all()
{
  local -r rundir="${1?Missing root}"

  abort-if-not-running "$rundir"
  for-each-runner "$rundir" runner-report
}


# Invoke a command once for each runner id under run path ROOT
#
# Runner ids are taken to be 0 through the value stored in ROOT/maxid,
# inclusive.  A summary line stating the runner count is output first; then,
# for each id in ascending order, an empty line is output and CMD is
# invoked as
#
#   CMD ROOT [ARGS...] ID
#
# where ARGS are any arguments given after CMD.
for-each-runner()
{
  local -r root="${1?Missing root}"
  local -r cmd="${2?Missing command}"
  shift 2

  local -ri maxid=$(cat "$root/maxid")

  # keep the loop counter out of the caller's (global) scope
  local -i runner

  echo "tamed is running at $root with $((maxid+1)) runner(s)"

  for (( runner = 0; runner <= maxid; runner++ )); do
    echo
    "$cmd" "$root" "$@" "$runner"
  done
}


# Report on the status and current operation of a single runner
#
# The runner is identified by its id ID under run path ROOT.  Its current
# command line is read from ROOT/ID/cmdline and its start time from
# ROOT/ID/cmdstart, whose first two whitespace-separated fields are output
# joined by a `.', the first being treated as seconds since the Unix
# epoch.  Nothing is output if the runner has no `cmdline' file.
#
# This report is generated by tamed rather than delegating to the runners
# themselves to avoid the complexity of mitigating output races.
runner-report()
{
  local -r root="${1?Missing root}"
  local -ri id="${2?Missing runner id}"

  local -r rundir="$root/$id"

  if [ ! -f "$rundir/cmdline" ]; then
    return 0
  fi

  local current=$(< "$rundir/cmdline" )
  local -a started
  local started_fmt

  # deliberately unquoted: the file's fields are split into array elements
  started=( $(< "$rundir/cmdstart" ) )
  started_fmt=$(date --date=@"${started[0]}" +%Y-%m-%dT%H:%M:%S)

  local -i now=$(date +%s)

  printf '%s\n' \
    "runner:  $id" \
    "command: $current" \
    "start:   ${started[0]}.${started[1]} ($started_fmt)" \
    "elapsed: $((now - started[0])) seconds"
}


# Shorten the directory portions of paths in BUFFER to fit within COLS
#
# Progressively more aggressive elisions are applied to the original
# BUFFER, and the first result that is no longer than COLS is output
# (without a trailing newline).  If even the most aggressive elision is
# too long, that result is truncated to COLS-1 characters followed by an
# ellipsis.
elide-paths()
{
  local -r cols="${1?Missing columns}"
  local -r buffer="${2?Missing buffer}"

  local -ra elisions=(
    # keep the first letter and last three of each dir, if doing so would
    # elide three or more characters; for example:
    #   "suppliers/foobarbaz/quux/quuux.xmlo" => "s…ers/f…baz/quux/quuux.xmlo"
    's|\([a-zA-Z0-9_-]\)[a-zA-Z0-9_-]\{3,\}\([a-zA-Z0-9_-]\{3\}\)/|\1…\2/|g'

    # more aggressive: keep only the first letter of each dir that has at
    # least three more characters after it, as in:
    #   "suppliers/foobarbaz/quux/quuux.xmlo" => "s…/f…/q…/quuux.xmlo"
    's|\([a-zA-Z0-9_-]\)[^ /]\{3,\}/|\1…/|g'

    # even more aggressive: elide all but the filename, as in:
    #   "suppliers/foobarbaz/quux/quuux.xmlo" => "…/quuux.xmlo"
    # (the hyphen is placed last in the bracket expression so that it is
    # taken literally rather than as a range operator)
    's|[a-zA-Z0-9_/-]*/|…/|g'
  )

  local elision result=

  for elision in "${elisions[@]}"; do
    result=$( sed "$elision" <<< "$buffer" )

    if [ "${#result}" -le "$cols" ]; then
      printf '%s' "$result"
      return
    fi
  done

  # at this point, it's better to provide _some_ useful information for
  # _some_ runners, so just truncate the previous result (we probably have
  # too many runners for the current terminal width)
  printf '%s' "${result::$((cols-1))}…"
}


# Condense the report of all runners at ROOT into a single status line
#
# Each runner with a command contributes "[LASTWORD Ns] ", where LASTWORD
# is the last word of its command line and N its elapsed seconds.  The
# line is emitted in bold and terminated by a carriage return rather than
# a newline.  Nothing at all is output when there is nothing to report.
#
# Idle runners are not output for now, since that increases the likelihood
# that we will not output something when runners are done doing their jobs
# (including overwriting the PS1).
runner-report-line() {
  local -r root="${1?Missing root}"

  local -r condense='
    /^command: idle/,/^$/ { next }      # skip idle
    /^command:/ { printf "[%s ", $NF }  # e.g. "[foo/bar.xmlo "
    /^elapsed:/ { printf "%ds] ", $2 }  # e.g. "2s] "
  '

  # collect the whole line up front so that our report does not get mixed
  # with normal runner output
  local status_line=$( runner-report-all "$root" | awk "$condense" )

  # ensure proper empty output without formatting if there is no line
  if [ -z "$status_line" ]; then
    return 0
  fi

  # bash has checkwinsize, but that runs after every command; try to use
  # tput, defaulting to 80.  Note that we have to check this every time, in
  # case the terminal has been resized.
  local -ri cols=$(tput cols || echo 80)

  # rather than worrying about line wrapping, fit to one line
  if (( ${#status_line} > cols )); then
    status_line=$(elide-paths $cols "$status_line")
  fi

  # bold on, the line, attributes off, then return the cursor to column
  # zero so that any runner output overwrites the status line
  echo -en "\e[1m$status_line\e[0m\r"
}


# Clean up child processes before exit
#
# This should be called before exit (perhaps by a trap).  Kills
# the entire process group.
#
# Do not attach this to a SIGTERM trap or it will infinitely
# recurse.
#
# The run path named by the global `root' is removed first.  The signal is
# sent last because `kill 0' targets every process in the current process
# group, this shell included.
cleanup()
{
  rm -rf "$root"
  kill 0
}


# Output usage information and exit
#
# The usage text is written to stdout.  This always exits with EX_USAGE,
# including when invoked for `--help'.
usage()
{
  cat <<EOF
Usage: $0 [--help|--kill|--report] [runpath]
Start tamed and runners.  Do not fork into background process.

The default value of RUNPATH is \`/run/user/$UID/tamed'.

Only one runner is currently supported.  tamed exits once all
runners have terminated.  Runners will be killed once they are
inactive for at least TAMED_STALL_SECONDS (default 1), unless
the process identified by TAMED_SPAWNER_PID is still running.
For example, a build script may wish to set TAMED_SPAWNER_PID
to the process id of make itself.  It defaults to the actual
parent process id (PPID), so tamed will not kill itself if
run manually on a shell (unless the shell exits first).

TAMED_RUNTAB_OUT can specify a file in which to write job
start times (as seconds from the Unix epoch); durations
(in milliseconds); and commands from each of the runners.
The table is tab-delimited.  Here are some useful examples:

  # format nicely into columns and view in pager
  $ column runtab | less

  # sort by runtime descending (second column)
  $ sort -rnk2 runtab

  # take the runtime and command columns
  $ cut -f2,3 runtab

  # convert milliseconds into minutes (!) and sort desc
  $ awk '{ \$2 = \$2 / 1000 / 60; print }' runtab | sort -nrk2

  # convert to CSV (assuming no quoting is needed)
  $ tr '\t' , < runtab > runtab.csv

Options:
  --help    show this message
  --kill    kill a running tamed at path RUNPATH
  --report  display runner report (this is subject to change
              in later versions)

Environment Variables:
  TAMED_STALL_SECONDS   number of seconds of runner inactivity before
                          runner is automatically killed (default 1)
  TAMED_SPAWNER_PID     inhibit stalling while this process is running
                          (default PPID)
  TAMED_JAVA_OPTS       opts to pass to dslc, and in turn, the JVM
  TAMED_TUI             run in TUI mode (provide UI features like a
                          dynamic runner status line)
  TAMED_RUNTAB_OUT      file into which aggregate runner report will
                          be written (otherwise reports are only
                          available per-runner while tamed is running)
EOF

  exit $EX_USAGE
}


# Enable TUI mode if it was requested
#
# TUI (terminal UI) mode augments the output with features that only make
# sense when running on a user's terminal, such as the runner status line.
# It is enabled only when TAMED_TUI is exactly `1', in which case `tui_mode'
# is set and a notice is logged; otherwise this does nothing.
tui-check()
{
  if [[ "$TAMED_TUI" != 1 ]]; then
    return 0
  fi

  tui_mode=1
  log "tamed is running in TUI mode (TAMED_TUI=0 to disable)"
}


# Succeed only if TUI mode has been enabled
#
# TUI mode is considered enabled when `tui_mode' is non-empty (see
# `tui-check').
in-tui-mode()
{
  [[ -n "$tui_mode" ]]
}


# If in TUI mode, continuously update the last line of output with runner
# status
#
# This is not an easy undertaking with how our build process currently
# works.  Make is responsible, currently, for echoing lines, and so we must
# frequently re-echo our status line in an attempt to redisplay the line
# after it is overwritten.
#
# Further, most output is unaware that the entire line needs to be
# overwritten; if output is not properly transformed in the Makefile, then
# portions of the status line may remain in the history, partly overwritten
# by build output.
#
# Another concern is that we do not want to keep outputting after the
# process is finished, which would overwrite the PS1.  To try to avoid this,
# we omit idle runner output and only clear the line _once_ when the status
# line is empty, in the hope that all runners will be idle for long enough
# before the build completes, make exits, and the PS1 is output.
#
# The loop runs until the spawning process is gone (see `spawner-dead'),
# re-reading the status line from the global run path `root' roughly once
# per second.
#
# If not in TUI mode, this does nothing.
tui-runner-status-line()
{
  in-tui-mode || return 0

  local cache= cleared=
  local i

  while ! spawner-dead; do
    # this will fail if no runners have been created yet, so just ignore
    # it; if we fail to output the status line, the build will still work
    cache=$(runner-report-line "$root" 2>/dev/null)

    # if the line is empty, clear the output _once_ (to get rid of
    # whatever was there before), but do not do it again, otherwise we
    # risk overwriting lines post-build (like the PS1 or late-stage make
    # targets).  `cleared' stays set for as long as the line stays empty,
    # so that no further output is produced until there is a new line.
    if [[ -z "$cache" ]]; then
      if [[ -z "$cleared" ]]; then
        log -n ""
        cleared=1
      fi

      sleep 1
      continue
    fi

    # there is something to show again, so the next empty line gets cleared
    cleared=

    # output the cache frequently to try to overcome build output
    for i in {0..9}; do
      log -n "$cache"
      sleep 0.1
    done
  done
}


# Run tamed
#
# Only the first argument is inspected for an option (`--kill', `--report'
# or `--help'); the argument that follows, if any, is the run path, which
# is stored in the global `root'.
#
# With `--report' or `--kill' the respective action is performed and the
# process exits.  Otherwise tamed is started in the foreground: the run
# path is recreated, the first runner is spawned, and the process then
# waits on its background jobs until one of them returns a non-zero status
# that is not 138 or 140, at which point it exits with that status.
main()
{
  local kill= report=
  case "${1:-}" in
    --kill) kill=1; shift;;
    --report) report=1; shift;;
    --help) usage;;
  esac

  # assigns the global declared at the top of the file (not a local)
  root="${1:-/run/user/$UID/tamed}"

  # report requested
  test -z "$report" || {
    runner-report-all "$root"
    exit
  }

  # kill if requested
  test -z "$kill" || {
    kill-running "$root"
    exit
  }

  abort-if-running "$root"
  tui-check
  runtab-check-and-clear

  # clean up background processes before we exit; TERM is turned into a
  # plain exit so that `cleanup' runs from the EXIT trap only
  trap exit TERM
  trap cleanup EXIT

  # start fresh
  rm -rf "$root"; mkdir -p "$root"
  local -i pid=$$
  echo $pid > "$root/pid"

  # start with a single runner; we'll spawn more if requested
  spawn-runner 0 "$root"

  # NOTE(review): $root is expanded now and embedded in single quotes, so a
  # run path containing a single quote would break this trap command
  trap "spawn-next-runner '$root'" USR1

  # status line reporting on runners for TUI mode
  tui-runner-status-line &

  # wait for runners to complete or for a signal to be received by this
  # process that terminates `wait'
  while true; do
    wait -n || {
      # NOTE(review): `status' is not declared local, so this assigns a
      # global
      status=$?

      # ignore USR{1,2}; presumably 138 and 140 are 128 plus the signal
      # numbers of USR1 (10) and USR2 (12) on this platform -- TODO confirm
      if [ $status -ne 138 -a $status -ne 140 ]; then
        exit $status
      fi
    }
  done
}

main "$@"