#!/bin/bash
# Client for TAME daemon (tamed)
#
# Copyright (C) 2014-2023 Ryan Specialty, LLC.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
##
# Strict mode: abort on command failure (-e), on expansion of unset
# variables (-u), and when any stage of a pipeline fails (pipefail)
set -euo pipefail
# Directory containing this script with symlinks resolved; the `tamed'
# executable is invoked from this same directory
declare mypath; mypath=$( dirname "$( readlink -f "$0" )" )
readonly mypath
# Exit codes used by this script (read-only integers)
declare -ri EX_NOTAMED=1 # tried to start tamed or runner but failed
declare -ri EX_STALLED=2 # runner stalled and could not recover
declare -ri EX_NORUN=3 # no available runners
declare -ri EX_DLOCK=4 # failed to get a lock to start tamed
declare -ri EX_BLOCK=5 # failed to get a lock for busy runner check
declare -ri EX_NODONE=6 # tamed did not provide a DONE with exit code
declare -ri EX_USAGE=64 # incorrect usage; sysexits.h
# maximum amount of time in seconds to wait for runner to ack
# before forcibly restarting it (may be overridden through the
# environment; defaults to 3). The same value is used as the delay
# before retrying after a reload has been requested.
declare -ri TAME_CMD_WAITTIME="${TAME_CMD_WAITTIME:-3}"
# propagate to daemon (these are only marked for export here; no value is
# assigned by this script in the lines shown)
export TAMED_STALL_SECONDS
export TAMED_SPAWNER_PID
export TAMED_JAVA_OPTS
# Reserve the next available runner, send it a single command, and
# observe the result
#
# The first argument is the root run path; all remaining arguments make up
# the command and are handed to `command-runner' unchanged. Everything
# that `command-runner' writes to stdout is also appended to `run-ID.log'
# in the current working directory, where ID is the reserved runner id.
#
# If no runner could be reserved, exits with EX_NORUN.
#
# See `command-runner' for more information.
command-available-runner()
{
  local -r root="${1?Missing root run path}"
  shift

  # `reserve-runner' outputs nothing when it was unable to reserve a runner
  local -r id=$( reserve-runner "$root" )

  if [ -z "$id" ]; then
    echo "no available runners at $root" >&2
    exit $EX_NORUN
  fi

  command-runner "$id" "$root" "$@" | tee -a "run-$id.log"
}
# Send a single command to a runner and observe the result
#
# Arguments: runner id (integer), root run path, and then the command,
# which is joined into a single space-delimited line ("$*").
#
# The command line is written to `$root/$id/0' and the runner's output is
# read from `$root/$id/1' (presumably named pipes maintained by tamed;
# confirm against the daemon). Runner output will be echoed until a line
# beginning with "DONE " is found, after which this procedure will return
# with the exit code indicated by the second word of that line.
#
# Exits with EX_STALLED if the runner does not acknowledge the command even
# after a reload has been requested, and returns EX_NODONE if the output
# ends without a DONE line. `verify-runner' may also exit on our behalf.
#
# NOTE(review): `line' and `code' are not declared local, and the INT/TERM
# trap installed here is not removed before returning.
command-runner()
{
local -ri id="${1?Missing id}"
local -r root="${2?Missing root run path}"
shift 2
local -r base="$root/$id"
local -ri pid=$( cat "$base/pid" )
verify-runner "$base" "$id" "$pid"
# forward signals to runner so that build is actually halted
# (rather than continuing in background after we die); the handler is
# single-quoted, so $pid is expanded when the trap fires
trap 'kill -TERM $pid &>/dev/null' INT TERM
# log the provided command line and starting time so that we can determine
# what is currently being compiled and how long it is taking
millis > "$base/cmdstart"
echo "$*" > "$base/cmdline"
# all remaining arguments are passed to the runner
echo "$*" > "$base/0"
# we should immediately get a response from the runner;
# if not, then it may have stalled for some reason
verify-runner-ack "$*" < "$base/1" || {
echo "warning: failed runner $id ack; requesting reload" >&2
kill -HUP "$pid"
# give some extra time in case the host is under high load
sleep "$TAME_CMD_WAITTIME"
# try one last time
echo "$*" > "$base/0"
verify-runner-ack "$*" < "$base/1" || {
echo "error: runner $id still unresponsive; giving up" >&2
exit "$EX_STALLED"
}
}
# output lines from runner until we reach a line stating "DONE"
# NOTE(review): `read' is used without `IFS=', so leading and trailing
# whitespace is trimmed from each line before it is echoed
while read -r line; do
# don't parse words in the initial read because we may be
# dealing with a lot of lines
if [ "${line:0:5}" == "DONE " ]; then
# second word of the DONE line is the runner's exit code
read -r _ code _ <<< "$line"
# record the job before releasing the runner, since `mark-available'
# overwrites cmdline and cmdstart
runtab-append "$base"
mark-available "$base"
return "$code"
fi
echo "$line"
done < "$base/1"
# We should have returned as soon as we received DONE. If this was not
# provided, then something probably went wrong (e.g. JVM crash).
return "$EX_NODONE"
}
# Output the id of the first available runner after marking it as busy
#
# If no runners are available, `spawn-runner-and-wait' is used to have
# tamed spawn a new one, and that runner is reserved instead.
#
# Selection and `mark-busy' both happen while holding an exclusive lock on
# `$root/busy-lock' (waiting up to ten seconds for it), so that a runner is
# acquired in an atomic manner. The caller is responsible for invoking
# `mark-available' after processing is complete.
#
# Nothing is output if the lock cannot be acquired (status EX_BLOCK) or if
# a new runner cannot be spawned (status EX_NORUN).
reserve-runner()
{
  local -r root=${1?Missing root}
  local -r timeout=10

  # the lock is held on file descriptor 7 for the life of this subshell
  (
    if ! flock -w $timeout 7; then
      echo "error: failed to acquire busy lock at $root" >&2
      exit $EX_BLOCK
    fi

    # prefer a runner that already exists
    local id
    id=$( get-available-runner-id "$root" )

    # otherwise ask for a new one
    if [ -z "$id" ]; then
      if ! id=$( spawn-runner-and-wait "$root" ); then
        echo "error: failed to reserve runner at $root" >&2
        exit $EX_NORUN
      fi
    fi

    # reserve it before the lock is released
    mark-busy "$root/$id"
    echo "$id"
  ) 7>"$root/busy-lock"
}
# Output the id of the next available runner
#
# THIS FUNCTION MUST BE GUARDED BY A MUTEX! Otherwise there is a race
# between acquiring the available id and then actually making use of it.
#
# A runner is considered available when its `busy' file contains a 0. If
# multiple runners are available, then the numerically lowest id is
# chosen. This helps to give the same runners more work, since they're
# more likely to have source (and compiled) already parsed in memory. As
# such, runners will have load disproportionately spread, and may exhibit
# large variances in resource consumption.
#
# The ids are sorted numerically because globbing sorts lexically---runner
# 10 would otherwise find itself after "1" in the list rather than after
# runner "9".
#
# If all runners are busy, then nothing will be output.
get-available-runner-id()
{
  local -r root=${1?Missing root}

  # one `busy' file per runner directory
  local -a busy_files=( "$root"/*/busy )

  # the runner id is the directory holding the matching `busy' file
  grep -l 0 "${busy_files[@]}" \
    | awk -F/ '{ print $(NF-1) }' \
    | sort -n \
    | head -n1
}
# Tell tamed to spawn a new runner and output the new runner id
#
# THIS FUNCTION MUST BE GUARDED BY A MUTEX! Otherwise there is a race
# between signaling and reading from `maxid'.
#
# This sends USR1 to the process whose id is stored in `$root/pid',
# requesting that the next runner be spawned. The new runner is expected
# to receive the id following the one recorded in `$root/maxid', and that
# id is waited on and then output. See `wait-for-runner' for more
# information on waiting.
spawn-runner-and-wait()
{
  local -r root=${1?Missing root}
  local -r daemon_pid=$( < "$root/pid" )

  # the highest id must be read before signalling
  local -ri last_id=$( < "$root/maxid" )
  local -ri expected_id=$(( last_id + 1 ))

  # request runner and wait on the expected id
  kill -USR1 "$daemon_pid"
  wait-for-runner "$root" "$expected_id"

  echo "$expected_id"
}
# Flag a runner as busy (unable to accept new commands)
#
# Writes a 1 to the `busy' file under the given runner base path. Once
# work is done, use `mark-available' to undo this operation.
mark-busy()
{
  local -r base=${1?Missing runner base path}
  local -r flag="$base/busy"

  printf '%d\n' 1 > "$flag"
}
# Flag a runner as available (able to accept new commands)
#
# Writes a 0 to the runner's `busy' file, replaces its recorded command
# line with "idle", and resets its command start time to now. Once the
# runner is given work again, use `mark-busy' to undo this operation.
mark-available()
{
  local -r base=${1?Missing runner base path}

  printf '%s\n' 0 > "$base/busy"
  printf '%s\n' idle > "$base/cmdline"

  # this can be used to determine how long the worker has been idle
  millis > "$base/cmdstart"
}
# Output the current time as seconds and milliseconds, space-delimited
#
# The first field is the Unix timestamp in seconds; the second is the
# millisecond portion of the current second.
millis()
{
  local -a stamp

  # seconds and nanoseconds
  stamp=( $(date '+%s %N') )

  # %N may be 0-prefixed, which would be interpreted as octal without the
  # explicit base specification
  printf '%s %s\n' "${stamp[0]}" "$(( 10#${stamp[1]} / 1000000 ))"
}
# Append a row to the runner table (runtab)
#
# This takes the most recently executed command from the runner's
# `cmdline' and `cmdstart' files and appends it to `runtab', a table
# representing the work that the runner has done. This should be done at
# the end of processing a particular job but before marking the runner as
# available using `mark-available', which overwrites both of those files.
#
# The columns of this report are, tab-delimited:
# 1. Start date (Unix timestamp, seconds);
# 2. Duration (milliseconds); and
# 3. Runner command line
runtab-append()
{
  local -r base=${1?Missing runner base path}
  local cmdline elapsed_ms
  local -a started finished

  cmdline=$(< "$base/cmdline")

  # both timestamps are (seconds, milliseconds) pairs as output by `millis'
  started=( $(< "$base/cmdstart") )
  finished=( $(millis) )

  # millisecond precision is plenty for reporting
  elapsed_ms=$((
    (finished[0] - started[0]) * 1000
      + (finished[1] - started[1])
  ))

  printf "%d\t%s\t%s\n" "${started[0]}" "$elapsed_ms" "$cmdline" \
    >> "$base/runtab"
}
# Verify that a runner is online and belongs to us
#
# Arguments are the runner base path, the runner id, and the runner pid.
#
# If no process with the given pid is found, or if `$base/0' is missing or
# not owned by the effective user, then exit with EX_NOTAMED.
verify-runner()
{
  local -r base="${1?Missing base}"
  local -ri id="${2?Missing id}"
  local -ri pid="${3?Missing pid}"

  if ! ps "$pid" &>/dev/null; then
    echo "error: runner $id ($pid) is offline!" >&2
    exit "$EX_NOTAMED"
  fi

  if [ ! -O "$base/0" ]; then
    echo "error: runner $id ($pid) is not owned by $USER!" >&2
    exit "$EX_NOTAMED"
  fi
}
# Wait for command acknowledgment from runner
#
# A line must be readable from stdin within TAME_CMD_WAITTIME seconds;
# otherwise, this function returns with the non-zero status of the failed
# read. The runner is expected to echo back the command that was given as
# "COMMAND <cmd>", but a mismatch is currently tolerated (see TODO).
#
# The line that was read is left in the non-local variable `ack'.
verify-runner-ack()
{
  local -r cmd="${1?Missing command}"

  read -r -t "$TAME_CMD_WAITTIME" ack || return

  if [ "COMMAND $cmd" != "$ack" ]; then
    # TODO check for ack mismatch once output race condition is fixed
    :
  fi
}
# Wait somewhat impatiently for a runner
#
# Assumes that the runner is ready once its pidfile (`$root/$id/pid')
# becomes available. Polls every half second for a maximum of six
# seconds before giving up and exiting with EX_NOTAMED.
#
# Arguments are the root run path and the id of the runner to wait on.
# Returns 0 as soon as the pidfile is found.
wait-for-runner()
{
  local -r root=${1?Missing root}
  local -r id=${2?Missing runner id}
  local -r pidfile="$root/$id/pid"

  # we could use inotify, but that is not installed by default
  # on Debian systems, so let's just poll rather than introduce
  # another dependency (give up after 6 seconds)
  local -i remaining=12

  # this must be an arithmetic comparison: `test' with a single argument
  # only checks that the string is non-empty, which is also the case for
  # "0" and for negative numbers, so `test $((i--))' would never give up
  while (( remaining-- > 0 )); do
    test ! -f "$pidfile" || return 0
    sleep 0.5
  done

  # look one last time so that the final sleep was not for nothing
  test ! -f "$pidfile" || return 0

  # still not available
  echo "error: runner $id still unavailable; giving up" >&2
  exit "$EX_NOTAMED"
}
# Attempt to start tamed if it's not already running
#
# This is designed to be safe for parallel builds by allowing only one
# process at a time to start tamed and hanging the others until spawning
# is complete. This is done using an exclusive lock on a guard file named
# after the root path with a "-guard" suffix; the parent directory of the
# root is created if necessary, and the guard file is removed once the
# lock has been released.
#
# If the lock cannot be acquired within five seconds, the result is
# EX_DLOCK.
#
# See `_start-tamed' for more information.
start-tamed-safe()
{
  local -r root=${1?Missing root}
  local -ri timeout=5
  local -r guard="$root-guard"

  # the guard lives beside the root, so its directory must exist first
  mkdir -p "$( dirname "$root" )"

  # the lock is held on file descriptor 6
  (
    if ! flock -w $timeout 6; then
      echo "error: failed to acquire tamed spawning lock at $root" >&2
      exit $EX_DLOCK
    fi

    _start-tamed "$root"

    flock -u 6
    rm -f "$guard"
  ) 6>"$guard"
}
# Start tamed if it is not already running
#
# tamed is considered to be running if the pid stored in `$root/pid'
# belongs to a live process. If so, it is not started again; otherwise,
# any previous instance is asked to clean up and tamed is started in the
# background.
#
# In either case, wait impatiently for runner 0 to become available; this
# ensures that tamed is initialized even if this script is run after tamed
# is started but before it has fully come online (e.g. parallel make).
_start-tamed()
{
  local -r root="${1?Missing root}"

  # a missing pidfile leaves this as 0
  local -ri pid=$( cat "$root/pid" 2>/dev/null )

  if ! ps "$pid" &>/dev/null; then
    echo "starting tamed at $root..."

    # tell tamed to clean up so that we eliminate race conditions while
    # waiting below (this will also kill any stray processes that a
    # previous tamed may have spawned but didn't get the chance to clean
    # up); failure to do so is not an error
    kill-tamed "$root" || true

    # start tamed and allow it to persist for future commands
    "$mypath/tamed" "$root" & disown
  fi

  # wait even if tamed was already started (just in case this script was
  # executed right after tamed started but before it is done initializing)
  wait-for-runner "$root" 0
}
# Kill tamed
#
# Ask the tamed at the given root to kill itself by invoking the `tamed'
# executable beside this script with `--kill'. The status of that
# invocation is returned.
kill-tamed()
{
  local -r root="${1?Missing root}"
  local -r daemon="$mypath/tamed"

  "$daemon" --kill "$root"
}
# Filter dslc output to essential information
#
# The original output of dslc is quite noisy; this filters stdin down
# to mostly errors and warnings, colors them, and writes the result to
# stdout.
#
# Eventually, dslc ought to be modified to handle filtering its own
# output rather than wasting cycles doing this filtering.
saneout()
{
# The awk rules below are applied in order to each line:
# - everything from a line beginning with "~~~~[begin " through a line
# beginning with "~~~~[end " is dropped, as are lines beginning with
# "rm " or "COMMAND ";
# - of a stack trace, only the line beginning with "Exception" is kept,
# followed by a pointer to the run logs; the tab-indented "at" lines
# are dropped;
# - warnings and notices are colored yellow, fatal and error lines (and
# any line containing a "!") red, "internal:" lines magenta, and
# "internal error:" lines are made bold;
# - a line is output if it matched one of the coloring rules or if it
# begins with any character other than "[", and then attributes are
# reset (note that empty lines therefore are not output).
#
# The `w' and `e' counters are incremented but are not otherwise used by
# this program.
#
# the final line (sed) clears the entire line before outputting in an
# attempt to better accommodate the runner status line from tamed; this
# can be removed once the Makefile properly takes up this task.
awk '
/^~~~~\[begin /,/^~~~~\[end / { next }
/^rm / { next }
/^COMMAND / { next }
/^Exception|^\t+at / {
if ( /^E/ ) {
print;
print "Stack trace written to run-*.log";
}
next;
}
/([Ww]arning|[Nn]otice)[: ]/ { printf "\033[0;33m"; w++; out=1; }
/[Ff]atal:/ { printf "\033[0;31m"; out=1; }
/!|[Ee]rror:/ { printf "\033[0;31m"; e++; out=1; }
/internal:/ { printf "\033[0;35m"; out=1; }
/internal error:/ { printf "\033[1m"; out=1; }
/^[^[]/ || out { print; printf "\033[0;0m"; out=0; }
' | sed 's/^/\x1b[2K\r/'
}
# Output usage information and exit
usage()
{
cat <