tame/bin/tamed

740 lines
20 KiB
Bash
Executable File

#!/bin/bash
# Daemon for accepting TAME commands (compilers, linker, etc)
#
# Copyright (C) 2014-2023 Ryan Specialty, LLC.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
##
# abort on failing commands, on expansion of unset variables, and when any
# stage of a pipeline fails
set -euo pipefail
# directory containing this script, after resolving symlinks; used to
# locate `dslc'
declare mypath; mypath=$( dirname "$( readlink -f "$0" )" )
readonly mypath
# exit statuses
declare -ri EX_RUNNING=1 # tamed is already running (see `abort-if-running')
declare -ri EX_NOTRUNNING=2 # tamed is not running
declare -ri EX_RUNTAB_LOCK=3 # failed to acquire aggregate runtab lock
declare -ri EX_RUNTAB_OUT=4 # failed to write to aggregate runtab
declare -ri EX_USAGE=64 # incorrect usage; sysexits.h
declare -ri EX_CANTCREAT=73 # cannot create file; sysexits.h
# number of seconds of output silence before runners are considered unused
# and are subject to termination (see stall-monitor)
declare -ri TAMED_STALL_SECONDS="${TAMED_STALL_SECONDS:-1}"
# id of process that indirectly spawned tamed (default $PPID)
declare -ri TAMED_SPAWNER_PID="${TAMED_SPAWNER_PID:-$PPID}"
# options to pass to JVM via dslc
declare -r TAMED_JAVA_OPTS="${TAMED_JAVA_OPTS:-}"
# exported so that child processes (e.g. dslc) see it in their environment
export JAVA_OPTS="$TAMED_JAVA_OPTS"
# set by `main', global for `cleanup' and `runner-report-all'
declare root=
# non-empty if in TUI (terminal UI) mode (use `in-tui-mode')
# TAMED_TUI is the user's request; tui_mode is the effective flag, which is
# set by `tui-check' only when TAMED_TUI is exactly `1'
declare -r TAMED_TUI="${TAMED_TUI:-}"
declare tui_mode=
# file into which aggregate runner report will be placed (none if empty)
declare -r TAMED_RUNTAB_OUT="${TAMED_RUNTAB_OUT:-}"
# Create FIFOs for runner
#
# Two FIFOs named `0' and `1' are created in the directory ROOT (which is
# created if necessary).  `spawn-runner' attaches them to the runner's
# stdin and stdout respectively.  Any stale FIFO already present at those
# paths is replaced.
#
# A background `tail -f' is left running with its stdout redirected to the
# `0' FIFO so that the FIFO always has a writer.
#
# If a FIFO cannot be created, exit with EX_CANTCREAT.
mkfifos()
{
  local -r root="${1?Missing root path}"
  local n

  mkdir -p "$root"

  # note that there's no separate stderr FIFO; `spawn-runner' sends the
  # runner's stderr to the stdout FIFO
  for n in 0 1; do
    # remove whatever a previous run left behind, otherwise mkfifo fails
    # because the path already exists
    rm -f "$root/$n"

    mkfifo -m 0600 "$root/$n" || {
      log "fatal: failed to create FIFO at $root/$n" >&2
      exit "$EX_CANTCREAT"
    }
  done

  # keep FIFOs open so we don't get EOF from writers
  tail -f >"$root/0" &
}
# Output a line, erasing the current terminal line first if in TUI mode
#
# All arguments are handed to `echo' unchanged, so echo options such as
# `-n' given by the caller are honoured.  In TUI mode (see `in-tui-mode')
# the escape sequence `ESC [ 2 K' is written before the message.
log()
{
  in-tui-mode && printf '\e[2K'

  echo "$@"
}
# Spawn a new runner using the next available runner id
#
# The id used is one greater than the value stored in ROOT/maxid, which
# `spawn-runner' maintains.  See `spawn-runner' for more information.
spawn-next-runner()
{
  local -r root="${1?Missing root path}"

  # id most recently handed out, and the one that follows it
  local -ri last_id=$( < "$root/maxid" )
  local -ri next_id=$(( last_id + 1 ))

  spawn-runner "$next_id" "$root"
}
# Spawn a runner
#
# A new runner is created by spawning dslc and attaching
# new FIFOs under the given id ID relative to the given
# run path ROOT.  The following files are maintained under ROOT/ID:
#
#   0, 1        stdin and stdout+stderr FIFOs of dslc (see `mkfifos')
#   busy        `0' initially; manipulated by the client
#   runtab      per-runner report, truncated here (see
#               `monitor-runner-runtab')
#   created-ts  time (seconds since epoch) that dslc was last started
#   pid         PID of the background subshell that (re)starts dslc
#
# ROOT/maxid is overwritten with ID.
#
# Sending SIGHUP to the process recorded in `pid' kills the current dslc
# with SIGKILL and starts a fresh one.
spawn-runner()
{
  local -ri id="${1?Missing id}"
  local -r root="${2?Missing root run path}"
  local -r base="$root/$id"

  mkfifos "$base"

  # flag as available (the client will manipulate these)
  echo 0 > "$base/busy"

  # runtab is used for reporting, which we will optionally aggregate
  > "$base/runtab"
  monitor-runner-runtab "$root" "$base/runtab" &

  # monitor runner usage and kill when inactive
  stall-monitor "$base" &

  # loop to restart runner in case of crash
  (
    declare -i job=0

    force-job-reload() {
      # job is still 0 until the first dslc has been started, and
      # `kill -9 0' would SIGKILL our entire process group
      test "$job" -gt 0 || return 0

      kill -9 "$job"
    }
    trap force-job-reload HUP

    while true; do
      # if this runner is busy, then it must have terminated while
      # processing (otherwise the client `tame` would have marked it as
      # available); let's act on its behalf so that the client sees that we
      # failed (see `inject-runner-unexpected-exit' for the code used).
      if runner-is-busy "$base"; then
        inject-runner-unexpected-exit "$base" "$id"
      fi

      # store the time that the runner was started so that we can later
      # determine if it should be restarted to forcefully reclaim memory
      date +%s > "$base/created-ts"

      "$mypath/dslc" < "$base/0" &> "$base/1" & job=$!

      # this flag is set by the `tame` client so that it knows when the
      # runner becomes available
      rm -f "$base/reloading"

      declare -i status=0
      wait "$job" 2>/dev/null || status=$?

      # 129 = signal (128) + HUP (1), which is an explicit reload request
      # that we need not report
      if [ "$status" -ne 129 ]; then
        echo "warning: runner $id exited with code $status (pid $job); restarting" >&2
      fi
    done
  ) &

  echo "$!" > "$base/pid"

  # we assume that this is the new largest runner id
  echo "$id" > "$root/maxid"

  log "runner $id ($!): $base"
}
# Whether the runner at the provided base is busy
#
# BASE is the runner's directory (ROOT/ID).  For compatibility with callers
# that pass nothing, it defaults to "$root/$id" as found in the caller's
# scope.
#
# Returns zero if BASE/busy contains the integer 1, non-zero otherwise.
runner-is-busy() {
  local -r base="${1:-$root/$id}"

  # declared and assigned in one step on purpose: an unreadable file then
  # simply yields 0 (not busy)
  local -i busy=$(< "$base/busy")

  test "$busy" -eq 1
}
# Inject an exit code into the runner's output stream indicating an
# unexpected exit
#
# A runner normally ends a compilation by printing `DONE n` (via `dslc`),
# `n` being the exit code.  A runner that dies mid-compilation (e.g. is
# OOM-killed) never prints it, which would leave the client waiting; if
# the client is not checking for stalls (due to configuration), it may
# hang indefinitely.
#
# To let the `tame` client observe the failure, this writes `DONE 7` to
# BASE/1 as if `dslc` itself had replied; 7 is the `tame` `EX_UNEXPECTED`
# exit code.
#
# A warning naming runner ID is also written to stderr.
inject-runner-unexpected-exit() {
  local -r base="${1?Missing base}"
  local -ri id="${2?Missing id}"

  # TODO: Worth a shared file with `tame`?
  local -ri EX_UNEXPECTED=7

  printf '%s\n' "warning: runner $id exited unexpectedly" >&2
  printf 'DONE %d\n' "$EX_UNEXPECTED" > "$base/1"
}
# Monitor the given runner runtab and append to the aggregate runtab
#
# Does nothing (returning zero) unless TAMED_RUNTAB_OUT is non-empty.
# Otherwise RUNTAB is followed with `tail -f' for as long as the spawner is
# alive, and each row is copied verbatim to TAMED_RUNTAB_OUT.
#
# The aggregate runtab is append-only and has a row-level lock to support
# concurrent writes without having to rely on kernel buffering.  If the
# lock cannot be acquired within three seconds, an error is written to
# stderr and the per-row subshell exits with EX_RUNTAB_LOCK.
monitor-runner-runtab()
{
  local -r root="${1?Missing root run path}"
  local -r runtab="${2?Missing runtab path}"
  local -ri timeout=3
  local row

  # no use in aggregating if it was not requested
  test -n "$TAMED_RUNTAB_OUT" || return 0

  while ! spawner-dead; do
    # this is a shared file, and while buffering _should_ be sufficient, we
    # may as well avoid potential headaches entirely by locking during the
    # operation
    #
    # IFS is emptied so that leading/trailing tabs of the tab-delimited row
    # are not stripped by `read'
    tail -f "$runtab" | while IFS= read -r row; do
      # we want to lock _per row write_, since output will be interleaved
      # between all the runners
      (
        flock -w "$timeout" 7 || {
          echo "error: failed to acquire lock on aggregate runtab" >&2
          exit "$EX_RUNTAB_LOCK"
        }

        # printf rather than echo so that no row is taken for an option
        printf '%s\n' "$row" >&7
      ) 7>> "$TAMED_RUNTAB_OUT"
    done
  done
}
# Check that we can write to the provided runtab, and clear it
#
# Does nothing if TAMED_RUNTAB_OUT is empty.  Otherwise the file is
# truncated (created if missing); if that fails, an error is written to
# stderr and the shell exits with EX_RUNTAB_OUT.  On success a notice
# naming the file is written to stdout.
runtab-check-and-clear()
{
  test -n "$TAMED_RUNTAB_OUT" || return 0

  # clear the runtab, and see if we can write to it
  >"$TAMED_RUNTAB_OUT" || {
    echo "error: unable to write to '$TAMED_RUNTAB_OUT' (TAMED_RUNTAB_OUT)" >&2
    exit "$EX_RUNTAB_OUT"
  }

  echo "tamed: aggregating runner runtabs into '$TAMED_RUNTAB_OUT'"
}
# Kill runner at BASE when it becomes inactive for TAMED_STALL_SECONDS
# seconds
#
# Activity is judged by the modification time of the stdout FIFO (BASE/1).
# stdin is not watched because dslc immediately echoes back the commands
# it receives.
#
# dslc is pretty chatty at the time of writing this, so TAMED_STALL_SECONDS
# can easily be <=30s even for large packages. This may need to change in
# the future if it becomes too much less chatty. Increase that environment
# variable if runners stall unexpectedly in the middle of builds.
#
# A runner is never treated as stalled while the spawning process is still
# running (see `spawner-dead').  This prevents, for example, tamed from
# killing itself while a parent make process is still running.
#
# Once the process recorded in BASE/pid has been signalled, the monitor
# returns.
stall-monitor()
{
  local -r base="${1?Missing base}"
  local -i since last

  while true; do
    # remember when this polling period began, then wait it out
    since=$( date +%s )
    sleep "$TAMED_STALL_SECONDS"
    last=$( stat -c%Y "$base/1" )

    # the FIFO was modified after $since: still active
    if ! test "$last" -le "$since"; then
      continue
    fi

    # quiet, but whoever spawned us is still around
    if ! spawner-dead; then
      continue
    fi

    # no activity; kill
    local -r pid=$( cat "$base/pid" )
    kill "$pid"
    wait "$pid" 2>/dev/null

    # this stall subprocess is no longer needed
    break
  done
}
# Check to see if the spawning process has died
#
# Returns zero ("dead") when TAMED_SPAWNER_PID is not greater than zero,
# i.e. no spawning process was provided.  Otherwise returns zero only if
# `ps' does not find a process with that id.
spawner-dead()
{
  # no spawner to wait for
  if ! test "$TAMED_SPAWNER_PID" -gt 0; then
    return 0
  fi

  if ps "$TAMED_SPAWNER_PID" &>/dev/null; then
    return 1
  fi

  return 0
}
# Exit if tamed is already running at path ROOT
#
# The pid is read from ROOT/pid.  With no usable pid (file missing or not
# a positive integer) nothing happens.  If `ps' finds the process, a fatal
# message is logged to stderr and the shell exits with EX_RUNNING.
# Otherwise the pidfile is stale and only a warning is logged to stderr.
abort-if-running()
{
  local -r root="${1?Missing root rundir}"

  # a missing pidfile yields 0 because of the integer attribute
  local -ri pid=$( cat "$root/pid" 2>/dev/null )

  if ! test "$pid" -gt 0; then
    return 0
  fi

  if ps "$pid" &>/dev/null; then
    log "fatal: tamed is already running at $root (pid $pid)!" >&2
    exit $EX_RUNNING
  fi

  if test -n "$pid"; then
    log "warning: clearing stale tamed (pid $pid)" >&2
  fi
}
# Exit with EX_NOTRUNNING if tamed is not running at path ROOT
#
# ROOT must be an existing directory, and `ps' must find the process whose
# id is stored in ROOT/pid.  If either check fails, the reason is logged to
# stderr before exiting.
abort-if-not-running()
{
  local -r root="${1?Missing root rundir}"

  if ! test -d "$root"; then
    log "tamed is not running at $root: path does not exist" >&2
    exit $EX_NOTRUNNING
  fi

  # a missing pidfile yields 0 because of the integer attribute
  local -ri pid=$( cat "$root/pid" 2>/dev/null )

  # this should not happen unless bash crashed
  if ! ps "$pid" &>/dev/null; then
    log "tamed is not running at $root: process $pid has terminated" >&2
    exit $EX_NOTRUNNING
  fi
}
# Kill running tamed at path ROOT
#
# If ROOT does not exist, or no pid can be read from ROOT/pid, do nothing.
# This sends a signal only to the parent tamed process, _not_ individual
# runners; the target tamed is expected to clean up itself.
# Consequently, if a tamed terminated abnormally without
# cleaning up, this will not solve that problem.
#
# Note that this is also called by tame to clean up an old tamed
# before spawning a new one.
kill-running()
{
  local -r root="${1?Missing root}"

  test -d "$root" || return 0

  # `cat' is used deliberately: bash only treats `$(< file)' as a file read
  # when that redirection stands alone, so `$(< file 2>/dev/null)' expands
  # to the empty string and no pid would ever be found
  local pid=
  pid=$( cat "$root/pid" 2>/dev/null ) || return 0

  test -n "$pid" || return 0

  log "killing tamed at $root ($pid)..."
  kill "$pid"
}
# Output a report for each runner of the tamed at path ROOT
#
# `abort-if-not-running' is consulted first, so nothing is reported unless
# a tamed is running at ROOT.  The per-runner output comes from
# `runner-report', invoked once per runner by `for-each-runner'.
runner-report-all()
{
  local -r root="${1?Missing root}"

  # bail out before reporting anything if there is no tamed at ROOT
  abort-if-not-running "$root"

  for-each-runner "$root" runner-report
}
# Invoke CMD once for each runner id of the tamed at path ROOT
#
# Runner ids range from 0 to the value stored in ROOT/maxid.  A summary
# line is output first; then, for each id, an empty line followed by the
# output of
#
#   CMD ROOT [ARGS...] ID
#
# where ARGS are any arguments given after CMD.
for-each-runner()
{
  local -r root="${1?Missing root}"
  local -r cmd="${2?Missing command}"
  shift 2

  local -ri maxid=$(cat "$root/maxid")

  # kept local so that the loop counter does not leak into the caller
  local -i runner

  echo "tamed is running at $root with $((maxid+1)) runner(s)"

  for (( runner = 0; runner <= maxid; runner++ )); do
    echo
    "$cmd" "$root" "$@" "$runner"
  done
}
# Report on the status and current operation of a runner
#
# Given the run path ROOT and runner id ID, output four lines describing
# the runner: its id, the contents of ROOT/ID/cmdline, the start time taken
# from ROOT/ID/cmdstart, and the seconds elapsed since then.  Nothing is
# output if ROOT/ID/cmdline does not exist.
#
# This report is generated by tamed rather than delegating to the runners
# themselves to avoid the complexity of mitigating output races.
runner-report()
{
  local -r root="${1?Missing root}"
  local -ri id="${2?Missing runner id}"
  local -r path="$root/$id"

  # nothing recorded for this runner
  if ! test -f "$path/cmdline"; then
    return 0
  fi

  local cmdline=$(< "$path/cmdline" )

  # cmdstart holds two whitespace-separated fields which are displayed as
  # "<first>.<second>"; the first is interpreted as seconds since the epoch
  local -a cmdstart
  cmdstart=( $(< "$path/cmdstart" ) )

  local started_fmt
  started_fmt=$(date --date=@"${cmdstart[0]}" +%Y-%m-%dT%H:%M:%S)

  local -i now=$(date +%s)

  printf '%s\n' \
    "runner: $id" \
    "command: $cmdline" \
    "start: ${cmdstart[0]}.${cmdstart[1]} ($started_fmt)" \
    "elapsed: $(( now - cmdstart[0] )) seconds"
}
# Shorten the paths within BUFFER in an attempt to fit it into COLS columns
#
# Increasingly aggressive elisions are tried in turn, and the first result
# that is no longer than COLS is output (without a trailing newline).  If
# none fits, the most aggressive result is truncated to COLS-1 characters.
elide-paths()
{
  local -r cols="${1?Missing columns}"
  local -r buffer="${2?Missing buffer}"
  local result

  # first, keep the first letter and last three of each dir, if doing so
  # would remove three or more characters; for example:
  #   "suppliers/foobarbaz/quux/quuux.xmlo" => "s…ers/f…baz/quux/quuux.xmlo"
  result=$(
    printf '%s\n' "$buffer" \
      | sed 's|\([a-zA-Z0-9_-]\)[a-zA-Z0-9_-]\{3,\}\([a-zA-Z0-9_-]\{3\}\)/|\1…\2/|g'
  )

  [ "${#result}" -gt "$cols" ] || {
    printf '%s' "$result"
    return
  }

  # more aggressive: remove all but the first letter of each dir that has
  # at least three further characters, as in:
  #   "suppliers/foobarbaz/quux/quuux.xmlo" => "s…/f…/q…/quuux.xmlo"
  result=$(
    printf '%s\n' "$buffer" | sed 's|\([a-zA-Z0-9_-]\)[^ /]\{3,\}/|\1…/|g'
  )

  [ "${#result}" -gt "$cols" ] || {
    printf '%s' "$result"
    return
  }

  # even more aggressive: elide all but the filename, as in:
  #   "suppliers/foobarbaz/quux/quuux.xmlo" => "…/quuux.xmlo"
  #
  # (the hyphen must come last in the bracket expression, otherwise it
  # forms an invalid range with its neighbours)
  result=$(
    printf '%s\n' "$buffer" | sed 's|[a-zA-Z0-9_/-]*/|…/|g'
  )

  [ "${#result}" -gt "$cols" ] || {
    printf '%s' "$result"
    return
  }

  # at this point, it's better to provide _some_ useful information for
  # _some_ runners, so just truncate the previous result (we probably have
  # too many runners for the current terminal width)
  printf '%s' "${result::$((cols-1))}"
}
# Report of all runners' status on a single line
#
# The output of `runner-report-all' for ROOT is condensed into entries of
# the form "[<last word of command> <elapsed>s] ", output in bold and
# terminated by a carriage return rather than a newline.  If there are no
# entries, nothing at all is output.
#
# Idle runners are not output for now, since that increases the likelihood
# that we will not output something when runners are done doing their jobs
# (including overwriting the PS1).
runner-report-line() {
  local -r root="${1?Missing root}"

  local -r condense='
/^command: idle/,/^$/ { next } # skip idle
/^command:/ { printf "[%s ", $NF } # e.g. "[foo/bar.xmlo "
/^elapsed:/ { printf "%ds] ", $2 } # e.g. "2s] "
'

  # buffer output so that our report does not get mixed with normal
  # runner output
  local buffer=$( runner-report-all "$root" | awk "$condense" )

  # ensure proper empty output without formatting if there is no line
  if test -z "$buffer"; then
    return 0
  fi

  # bash has checkwinsize, but that runs after every command; try to use
  # tput, defaulting to 80.  Note that we have to check this every time, in
  # case the terminal has been resized.
  local -ri cols=$(tput cols || echo 80)

  # rather than worrying about line wrapping, fit to one line
  if (( ${#buffer} > cols )); then
    buffer=$(elide-paths $cols "$buffer")
  fi

  # output in bold, overwrite our line that may already be present here, and
  # place cursor at beginning of the line so any runner output will
  # overwrite
  echo -en "\e[1m$buffer\e[0m\r"
}
# Clean up child processes before exit
#
# This should be called before exit (perhaps by a trap). Kills
# the entire process group.
#
# The run directory named by the global `root' (set by `main') is removed
# as well.
#
# Do not attach this to a SIGTERM trap or it will infinitely
# recurse.
cleanup()
{
# remove the run directory first, since the signal sent below is also
# delivered to this very shell
rm -rf "$root"
# pid 0 addresses every process in the current process group
kill 0
}
# Output usage information and exit
#
# The text is written to stdout, after which the shell exits with
# EX_USAGE.
usage()
{
  cat <<EOF
Usage: $0 [--kill] [runpath]
Start tamed and runners. Do not fork into background process.
The default value of RUNPATH is \`/run/user/$UID/tamed'.
Only one runner is currently supported. tamed exits once all
runners have terminated. Runners will be killed once they are
inactive for at least TAMED_STALL_SECONDS (default 1), unless
the process identified by TAMED_SPAWNER_PID is still running.
For example, a build script may wish to set TAMED_SPAWNER_PID
to the process id of make itself. It defaults to the actual
parent process id (PPID), so tamed will not kill itself if
run manually on a shell (unless the shell exits first).
TAMED_RUNTAB_OUT can specify a file in which to write job
start times (as seconds from the Unix epoch); durations
(in milliseconds); and commands from each of the runners.
The table is tab-delimited. Here are some useful examples:
# format nicely into columns and view in pager
$ column runtab | less
# sort by runtime descending (second column)
$ sort -rnk2 runtab
# take the runtime and command columns
$ cut -f2,3 runtab
# convert milliseconds into minutes (!) and sort desc
$ awk '{ \$2 = \$2 / 1000 / 60; print }' runtab | sort -nrk2
# convert to CSV (assuming no quoting is needed)
$ tr '\t' , < runtab > runtab.csv
Options:
--help show this message
--kill kill a running tamed at path RUNPATH
--report display runner report (this is subject to change
in later versions)
Environment Variables:
TAMED_STALL_SECONDS number of seconds of runner inactivity before
runner is automatically killed (default 1)
TAMED_SPAWNER_PID inhibit stalling while this process is running
(default PPID)
TAMED_JAVA_OPTS opts to pass to dslc, and in turn, the JVM
TAMED_TUI run in TUI mode (provide UI features like a
dynamic runner status line)
TAMED_RUNTAB_OUT file into which aggregate runner report will
be written (otherwise reports are only
available per-runner while tamed is running)
EOF

  exit "$EX_USAGE"
}
# Determine whether to enable TUI mode
#
# TUI (terminal UI) mode will augment the output with features that only
# make sense when running on a user's terminal, such as the runner status
# line.
#
# The mode is enabled (by setting the global `tui_mode') only when
# TAMED_TUI is exactly `1', in which case a notice is logged as well.
tui-check()
{
  if [[ "$TAMED_TUI" != 1 ]]; then
    return 0
  fi

  # set before logging so that the notice itself is output in TUI mode
  tui_mode=1
  log "tamed is running in TUI mode (TAMED_TUI=0 to disable)"
}
# Whether we're running in TUI mode
#
# Returns zero if the global `tui_mode' is non-empty (see `tui-check'),
# non-zero otherwise.
in-tui-mode()
{
  [[ -n "$tui_mode" ]]
}
# If in TUI mode, continuously update the last line of output with runner
# status
#
# This is not an easy undertaking with how our build process currently
# works. Make is responsible, currently, for echoing lines, and so we must
# frequently re-echo our status line in an attempt to redisplay the line
# after it is overwritten.
#
# Further, most output is unaware that the entire line needs to be
# overwritten; if output is not properly transformed in the Makefile, then
# portions of the status line may remain in the history, partly overwritten
# by build output.
#
# Another concern is that we do not want to keep outputting after the
# process is finished, which would overwrite the PS1. To try to avoid this,
# we omit idle runner output and only clear the line _once_ when the status
# line is empty, in the hope that all runners will be idle for long enough
# before the build completes, make exits, and the PS1 is output.
#
# The loop ends once the spawning process has died (see `spawner-dead').
#
# If not in TUI mode, this does nothing.
tui-runner-status-line()
{
  in-tui-mode || return 0

  local cache= cleared=
  local i

  while ! spawner-dead; do
    # this will fail if no runners have been created yet, so just ignore
    # it; if we fail to output the status line, the build will still work
    cache=$(runner-report-line "$root" 2>/dev/null) || cache=

    # if the line is empty, clear the output _once_ (to get rid of
    # whatever was there before), but do not do it again, otherwise we
    # risk overwriting lines post-build (like the PS1 or late-stage make
    # targets).
    if [[ -z "$cache" ]]; then
      if [[ -z "$cleared" ]]; then
        log -n ""
        cleared=1
      fi

      # nothing to display; stay silent until a runner has work again
      sleep 1
      continue
    fi

    cleared=

    # output the cache frequently to try to overcome build output
    for i in {0..9}; do
      log -n "$cache"
      sleep 0.1
    done
  done
}
# Run tamed
#
# Usage: main [--kill | --report | --help] [ROOT]
#
# ROOT is the run directory and defaults to /run/user/$UID/tamed; it is
# stored in the global `root' for use by `cleanup'.
#
#   --report  output the runner report for ROOT and exit
#   --kill    signal the tamed running at ROOT and exit
#   --help    output usage information and exit (see `usage')
#
# Without an option, tamed is started in the foreground: ROOT is recreated
# from scratch, this shell's pid is written to ROOT/pid, runner 0 is
# spawned, and the shell then waits on its background jobs.  Each SIGUSR1
# received spawns one more runner.
main()
{
local kill= report=
# only a single leading option is recognized; anything else is left in
# place to be interpreted as ROOT
case "${1:-}" in
--kill) kill=1; shift;;
--report) report=1; shift;;
--help) usage;;
esac
root="${1:-/run/user/$UID/tamed}"
# report requested
test -z "$report" || {
runner-report-all "$root"
exit
}
# kill if requested
test -z "$kill" || {
kill-running "$root"
exit
}
abort-if-running "$root"
tui-check
runtab-check-and-clear
# clean up background processes before we exit
# (TERM is turned into a normal exit so that the EXIT trap performs the
# cleanup; `cleanup' itself must not be attached to TERM)
trap exit TERM
trap cleanup EXIT
# start fresh
rm -rf "$root"; mkdir -p "$root"
local -i pid=$$
echo $pid > "$root/pid"
# start with a single runner; we'll spawn more if requested
spawn-runner 0 "$root"
# double quotes: $root is expanded now, when the trap is installed
trap "spawn-next-runner '$root'" USR1
# status line reporting on runners for TUI mode
tui-runner-status-line &
# wait for runners to complete or for a signal to be received by this
# process that terminates `wait'
while true; do
wait -n || {
status=$?
# ignore USR{1,2}
# (138 and 140 are 128 plus 10 and 12 respectively)
if [ $status -ne 138 -a $status -ne 140 ]; then
exit $status
fi
}
done
}
main "$@"