537 lines
15 KiB
Bash
Executable File
537 lines
15 KiB
Bash
Executable File
#!/bin/bash
|
|
# Client for TAME daemon (tamed)
|
|
#
|
|
# Copyright (C) 2014-2022 Ryan Specialty Group, LLC.
|
|
#
|
|
# This program is free software: you can redistribute it and/or modify
|
|
# it under the terms of the GNU General Public License as published by
|
|
# the Free Software Foundation, either version 3 of the License, or
|
|
# (at your option) any later version.
|
|
#
|
|
# This program is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU General Public License
|
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
##
|
|
|
|
# abort on unhandled errors, on use of unset variables, and on failure of
# any stage of a pipeline
set -euo pipefail

# directory containing this script with symlinks resolved; `tamed' is
# invoked relative to this path
mypath=$( dirname "$( readlink -f "$0" )" )
declare -r mypath

# exit codes
declare -ir EX_NOTAMED=1  # tried to start tamed or runner but failed
declare -ir EX_STALLED=2  # runner stalled and could not recover
declare -ir EX_NORUN=3    # no available runners
declare -ir EX_DLOCK=4    # failed to get a lock to start tamed
declare -ir EX_BLOCK=5    # failed to get a lock for busy runner check
declare -ir EX_NODONE=6   # tamed did not provide a DONE with exit code
declare -ir EX_USAGE=64   # incorrect usage; sysexits.h

# maximum amount of time in seconds to wait for runner to ack
# before forcibly restarting it (overridable through the environment)
declare -ir TAME_CMD_WAITTIME="${TAME_CMD_WAITTIME:-3}"

# propagate to daemon (these are only marked for export; no value is
# assigned here)
export TAMED_STALL_SECONDS TAMED_SPAWNER_PID TAMED_JAVA_OPTS
|
|
|
|
|
|
# Send a single command to the next available runner and
# observe the result
#
# The first argument is the root run path; all remaining arguments make
# up the command line for the runner.  A runner is reserved using
# `reserve-runner' and the command is then handed to `command-runner',
# whose output is additionally appended to `run-ID.log' in the current
# working directory.
#
# If `reserve-runner' fails, the script exits with the status that it
# reported (e.g. EX_BLOCK), so that a lock failure can be told apart from
# a lack of runners.  If it succeeds but yields no id, the script exits
# with EX_NORUN.
#
# See `command-runner' for more information.
command-available-runner()
{
  local -r root="${1?Missing root run path}"
  shift 1

  # the declaration is kept separate from the assignment because `local'
  # would otherwise mask the exit status of `reserve-runner'
  local id
  local -i status=0
  id=$( reserve-runner "$root" ) || status=$?

  if (( status != 0 )); then
    exit "$status"
  fi

  test -n "$id" || {
    echo "no available runners at $root" >&2
    exit $EX_NORUN
  }

  command-runner "$id" "$root" "$@" \
    | tee -a "run-$id.log"
}
|
|
|
|
|
|
# Send a single command to a runner and observe the result
#
# The arguments are the runner id, the root run path, and then the
# command line for the runner.  The command line is written to the
# runner's `0' file and its response is read from its `1' file (both
# relative to ROOT/ID).  Runner output is echoed verbatim, whitespace
# included, until a line beginning with "DONE " is found, after which
# this procedure will return with the exit code indicated by the runner.
#
# If the runner does not acknowledge the command then it is sent HUP and
# the command is retried once after TAME_CMD_WAITTIME seconds; a second
# failure exits with EX_STALLED.  If the output ends without a DONE line
# then EX_NODONE is returned.
command-runner()
{
  local -ri id="${1?Missing id}"
  local -r root="${2?Missing root run path}"
  shift 2

  local -r base="$root/$id"
  local -ri pid=$( cat "$base/pid" )

  verify-runner "$base" "$id" "$pid"

  # forward signals to runner so that build is actually halted
  # (rather than continuing in background after we die)
  trap 'kill -TERM $pid &>/dev/null' INT TERM

  # log the provided command line and starting time so that we can determine
  # what is currently being compiled and how long it is taking
  millis > "$base/cmdstart"
  echo "$*" > "$base/cmdline"

  # all remaining arguments are passed to the runner
  echo "$*" > "$base/0"

  # we should immediately get a response from the runner;
  # if not, then it may have stalled for some reason
  verify-runner-ack "$*" < "$base/1" || {
    echo "warning: failed runner $id ack; requesting reload" >&2
    kill -HUP "$pid"

    # give some extra time in case the host is under high load
    sleep "$TAME_CMD_WAITTIME"

    # try one last time
    echo "$*" > "$base/0"
    verify-runner-ack "$*" < "$base/1" || {
      echo "error: runner $id still unresponsive; giving up" >&2
      exit "$EX_STALLED"
    }
  }

  # output lines from runner until we reach a line stating "DONE"
  #
  # IFS is emptied so that leading and trailing whitespace survives (the
  # stack trace filter in `saneout' matches on leading tabs), and printf
  # is used because echo would swallow lines such as `-n'
  local line code
  while IFS= read -r line; do
    # don't parse words in the initial read because we may be
    # dealing with a lot of lines
    if [ "${line:0:5}" == "DONE " ]; then
      read -r _ code _ <<< "$line"

      runtab-append "$base"
      mark-available "$base"

      return "$code"
    fi

    printf '%s\n' "$line"
  done < "$base/1"

  # We should have returned as soon as we received DONE.  If this was not
  # provided, then something probably went wrong (e.g. JVM crash).
  return "$EX_NODONE"
}
|
|
|
|
|
|
# Output the id of the first available runner after marking it as busy
#
# The lookup and the call to `mark-busy' both take place while holding an
# exclusive lock on ROOT/busy-lock, which makes acquiring a runner atomic
# with respect to other clients.  If no runner is available then
# `spawn-runner-and-wait' is asked for a new one.
#
# The caller is responsible for invoking `mark-available' after
# processing is complete.
#
# Failure to acquire the lock within ten seconds yields EX_BLOCK, and
# failure to spawn a runner yields EX_NORUN; nothing is output in either
# case.
reserve-runner()
{
  local -r root=${1?Missing root}

  local -r timeout=10

  (
    if ! flock -w $timeout 7; then
      echo "error: failed to acquire busy lock at $root" >&2
      exit $EX_BLOCK
    fi

    # prefer a runner that already exists, requesting a new one only when
    # none of them is available
    local id
    id=$( get-available-runner-id "$root" )
    if [ -z "$id" ]; then
      if ! id=$( spawn-runner-and-wait "$root" ); then
        echo "error: failed to reserve runner at $root" >&2
        exit $EX_NORUN
      fi
    fi

    # the lock is still held at this point, so no other client is able to
    # pick this runner
    mark-busy "$root/$id"

    echo "$id"
  ) 7>"$root/busy-lock"
}
|
|
|
|
|
|
# Output the id of the next available runner
#
# THIS FUNCTION MUST BE GUARDED BY A MUTEX!  Otherwise there is a race
# between acquiring the available id and then actually making use of it.
#
# A runner is considered available when the `busy' file in its directory
# under ROOT contains a `0'; its id is the name of that directory.
#
# When several runners are available, the numerically lowest id is
# chosen.  This helps to give the same runners more work, since they're
# more likely to have source (and compiled) already parsed in memory.  As
# such, runners will have load disproportionately spread, and may exhibit
# large variances in resource consumption.
#
# The sort is numeric because globbing sorts lexically---runner 10 would
# otherwise be listed after runner 1 rather than after runner 9.
#
# If every runner is busy then nothing is output.
get-available-runner-id()
{
  local -r root=${1?Missing root}

  local -a busy_files=( "$root"/*/busy )

  grep -l 0 "${busy_files[@]}" \
    | awk -F/ '{ print $(NF - 1) }' \
    | sort -n \
    | head -n 1
}
|
|
|
|
|
|
# Tell tamed to spawn a new runner and output the new runner id
#
# THIS FUNCTION MUST BE GUARDED BY A MUTEX!  Otherwise there is a race
# between signaling and reading from `maxid'.
#
# The process whose id is stored in ROOT/pid is sent USR1 to request
# another runner.  The id of that runner is expected to be one greater
# than the value that ROOT/maxid held before signaling, and this waits on
# that runner before outputting its id.  See `wait-for-runner' for more
# information on waiting.
spawn-runner-and-wait()
{
  local -r root=${1?Missing root}

  local -r daemon_pid=$( < "$root/pid" )
  local -ri maxid=$( < "$root/maxid" )

  # the expected id is derived from the value read before the request
  local -ri nextid=$(( maxid + 1 ))

  # request runner
  kill -USR1 "$daemon_pid"

  wait-for-runner "$root" "$nextid"

  echo "$nextid"
}
|
|
|
|
|
|
# Flag a runner as busy (unable to accept new commands)
#
# This writes `1' to the `busy' file beneath the given runner base
# path.  Once work is done, use `mark-available' to undo this operation.
mark-busy()
{
  local -r base=${1?Missing runner base path}

  printf '%s\n' 1 > "$base/busy"
}
|
|
|
|
|
|
# Mark a runner as available (able to accept new commands)
#
# The runner's `cmdline' is reset to `idle' and its `cmdstart' is set to
# the current time, which can be used to determine how long the runner
# has been idle.
#
# The `busy' flag is cleared last:  as soon as it reads `0' another
# client may reserve the runner and write its own `cmdline' and
# `cmdstart', which must not then be overwritten by the idle state.
#
# Once work is available, use `mark-busy' to undo this operation.
mark-available()
{
  local -r base=${1?Missing runner base path}

  echo idle > "$base/cmdline"

  # this can be used to determine how long the worker has been idle
  millis > "$base/cmdstart"

  # publish availability only once the idle state has been recorded
  echo 0 > "$base/busy"
}
|
|
|
|
|
|
# Output the current time as two space-delimited integers:  seconds
# since the Unix epoch, followed by the milliseconds elapsed within that
# second
millis()
{
  local stamp
  stamp=$( date '+%s %N' )

  local -r secs=${stamp%% *}
  local -r nanos=${stamp##* }

  # %N yields nanoseconds and may be 0-prefixed, which would be
  # interpreted as octal without the explicit base specification
  echo "$secs" "$(( 10#$nanos / 1000000 ))"
}
|
|
|
|
|
|
# Append a row to the runner table (runtab)
#
# The row describes the most recently executed command, as recorded in
# the `cmdline' and `cmdstart' files beneath the runner base path.  This
# should be done at the end of processing a particular job but before
# marking the runner as available using `mark-available', since that
# overwrites both files.
#
# The columns of this report are, tab-delimited:
#   1. Start date (Unix timestamp, seconds);
#   2. Duration (milliseconds); and
#   3. Runner command line
runtab-append()
{
  local -r base=${1?Missing runner base path}

  # both timestamps are pairs of (seconds, milliseconds); see `millis'
  local -a started finished
  started=( $(< "$base/cmdstart") )
  finished=( $(millis) )

  # collapse each pair into milliseconds so that they can be subtracted
  local -i started_ms finished_ms
  started_ms=$(( (started[0] * 1000) + started[1] ))
  finished_ms=$(( (finished[0] * 1000) + finished[1] ))

  printf "%d\t%s\t%s\n" \
    "${started[0]}" \
    "$(( finished_ms - started_ms ))" \
    "$(< "$base/cmdline")" \
    >> "$base/runtab"
}
|
|
|
|
|
|
# Verify that a runner is usable
#
# The arguments are the runner base path, the runner id, and the runner
# pid.  If `ps' cannot find the pid, or if the `0' file beneath the base
# path is not owned by the effective user, then exit with EX_NOTAMED
# after describing the problem on stderr.
verify-runner()
{
  local -r base="${1?Missing base}"
  local -ri id="${2?Missing id}"
  local -ri pid="${3?Missing pid}"

  ps "$pid" &>/dev/null || {
    echo "error: runner $id ($pid) is offline!" >&2
    exit "$EX_NOTAMED"
  }

  test -O "$base/0" || {
    # USER is not guaranteed to be set (e.g. under cron or in a container)
    # and referencing it unguarded would abort under `set -u' before the
    # message could be output
    echo "error: runner $id ($pid) is not owned by ${USER:-uid $UID}!" >&2
    exit "$EX_NOTAMED"
  }
}
|
|
|
|
|
|
# Wait for command acknowledgment from runner
#
# A single line is read from stdin, waiting at most TAME_CMD_WAITTIME
# seconds for it.  If no line could be read then this function returns
# with the non-zero status of the read.
#
# The runner is expected to echo back the command that it was given,
# prefixed with "COMMAND ", but a mismatch is not yet treated as a
# failure (see the TODO below).
verify-runner-ack()
{
  local -r cmd="${1?Missing command}"

  # kept local so that the acknowledgment does not leak into the caller
  local ack

  read -t"$TAME_CMD_WAITTIME" -r ack || return

  test "COMMAND $cmd" == "$ack" || {
    # TODO check for ack mismatch once output race condition is fixed
    :
  }
}
|
|
|
|
|
|
# Wait somewhat impatiently for a runner
#
# Assumes that the runner is ready once its pidfile (ROOT/ID/pid) becomes
# available.  The pidfile is checked every half second, at most twelve
# times (six seconds in total), before giving up and exiting with
# EX_NOTAMED.
wait-for-runner()
{
  local -r root=${1?Missing root}
  local -r id=${2?Missing runner id}

  # we could use inotify, but that is not installed by default
  # on Debian systems, so let's just poll rather than introduce
  # another dependency (give up after 6 seconds)
  #
  # the counter has to be compared arithmetically:  `test' given only
  # a number is true for any non-empty string, including "0", which would
  # cause this loop to poll forever
  local -i tries=12
  while (( tries-- > 0 )); do
    test ! -f "$root/$id/pid" || return 0
    sleep 0.5
  done

  # still not available
  echo "error: runner $id still unavailable; giving up" >&2
  exit "$EX_NOTAMED"
}
|
|
|
|
|
|
# Attempts to start tamed if it's not already running
#
# This is designed to be safe for parallel builds:  `_start-tamed' is
# only invoked while holding an exclusive lock on the guard file
# `ROOT-guard', so other processes hang until spawning is complete.  If
# the lock cannot be acquired within five seconds then the enclosing
# subshell exits with EX_DLOCK.
#
# The parent directory of ROOT is created if it does not yet exist.
#
# NOTE(review): the guard file is unlinked after the lock is released;
# whether a process that is already waiting on the old file can then
# overlap with one that creates a new guard ought to be confirmed.
#
# See `_start-tamed' for more information.
start-tamed-safe()
{
  local -r root=${1?Missing root}

  local -ri timeout=5
  local -r guard="$root-guard"

  mkdir -p "$( dirname "$root" )"

  (
    if ! flock -w $timeout 6; then
      echo "error: failed to acquire tamed spawning lock at $root" >&2
      exit $EX_DLOCK
    fi

    _start-tamed "$root"

    # release the lock and tidy up after ourselves
    flock -u 6
    rm -f "$guard"
  ) 6>"$guard"
}
|
|
|
|
|
|
# Start tamed if it is not already running
#
# tamed is considered to be running when `ps' can find the process whose
# id is stored in ROOT/pid.  If it is running then it is left alone;
# otherwise, it is started in the background and disowned so that it
# persists for future commands.
#
# In both cases this waits for runner 0 to become available; this ensures
# that tamed is initialized even if this script is run after tamed is
# started but before it has fully come online (e.g. parallel make).
_start-tamed()
{
  local -r root="${1?Missing root}"

  # a missing pidfile leaves this empty, which the integer attribute
  # turns into 0
  local -ri pid=$( cat "$root/pid" 2>/dev/null )

  if ! ps "$pid" &>/dev/null; then
    echo "starting tamed at $root..."

    # tell tamed to clean up so that we eliminate race conditions
    # with wait-for-tamed (this will also kill any stray processes
    # that a previous tamed may have spawned but didn't get the
    # chance to clean up)
    kill-tamed "$root" || true

    # start tamed and allow it to persist for future commands
    "$mypath/tamed" "$root" & disown
  fi

  # wait for tamed even if it was already started (just in
  # case this script was executed right after tamed started
  # but before it is done initializing)
  wait-for-runner "$root" 0
}
|
|
|
|
|
|
# Kill tamed
#
# Ask tamed to kill itself by invoking the `tamed' executable that sits
# alongside this script with `--kill' and the given root run path.  The
# exit status is that of `tamed'.
kill-tamed()
{
  local -r root="${1?Missing root}"
  local -r daemon="$mypath/tamed"

  "$daemon" --kill "$root"
}
|
|
|
|
|
|
# Filter dslc output to essential information
#
# The original output of dslc is quite noisy; this reads it from stdin
# and drops `~~~~[begin'...`~~~~[end' sections, `rm' and `COMMAND' lines,
# stack trace frames, and any line beginning with `['.  Lines that look
# like warnings, notices, or errors are always kept, and are colored.
#
# Eventually, dslc ought to be modified to handle filtering its own
# output rather than wasting cycles doing this filtering.
saneout()
{
  # lines that reach the final rule without having been skipped are output
  # if they either do not begin with `[' or were flagged by a prior rule
  local -r filter='
    /^~~~~\[begin /,/^~~~~\[end / { next }
    /^rm / { next }
    /^COMMAND / { next }
    /^Exception|^\t+at / {
      if ( /^E/ ) {
        print;
        print "Stack trace written to run-*.log";
      }
      next;
    }
    /([Ww]arning|[Nn]otice)[: ]/ { printf "\033[0;33m"; w++; out=1; }
    /[Ff]atal:/ { printf "\033[0;31m"; out=1; }
    /!|[Ee]rror:/ { printf "\033[0;31m"; e++; out=1; }
    /internal:/ { printf "\033[0;35m"; out=1; }
    /internal error:/ { printf "\033[1m"; out=1; }
    /^[^[]/ || out { print; printf "\033[0;0m"; out=0; }
  '

  # the final stage clears the entire line before outputting in an attempt
  # to better accommodate the runner status line from tamed; this can be
  # removed once the Makefile properly takes up this task.
  awk "$filter" | sed 's/^/\x1b[2K\r/'
}
|
|
|
|
|
|
# Output usage information to stdout and exit with EX_USAGE
usage()
{
  cat <<EOF
Usage: $0 [-v|--verbose] cmdline
Or: $0 --kill
Send command line CMDLINE to a tamed runner. Start tamed if
not already running.

If a runner does not acknowledge a request in TAME_CMD_WAITTIME
seconds, it will be reloaded and given TAME_CMD_WAITTIME seconds
to come online. After that time has elapsed, the command will
be re-attempted, timing out again after TAME_CMD_WAITTIME and
at that point giving up.

The first available runner sorted numerically will be
chosen. This helps to give the same runners more work,
since they're more likely to have source (and compiled)
already parsed in memory. As such, runners will have load
disproportionately spread, and may exhibit large variances
in resource consumption.

If all runners are busy, then a new runner will be spawned,
allowing for parallel builds.

Options:
--help show this message
--kill kill tamed
-v, --verbose show runner logs

Environment Variables:
TAME_VERBOSE when greater than zero, show runner logs
(see also --verbose)
TAME_CMD_WAITTIME number of seconds to wait for ack from
runner (default 3)
EOF

  exit $EX_USAGE
}
|
|
|
|
|
|
# Run tame
#
# The arguments are those of the script; see `usage'.  After handling
# `--kill', `--help', and verbosity, tamed is started if necessary and
# the remaining arguments are sent as a command line to an available
# runner.  Runner output is filtered through `saneout' unless verbose
# output was requested, either by option or through TAME_VERBOSE.
#
# Usage information is output if no command line is given, including
# when a verbosity option is the only argument.
main()
{
  local -r root=/run/user/$UID/tamed

  local outcmd=saneout

  test $# -gt 0 || usage

  case "${1:-}" in
    --kill) kill-tamed "$root"; exit;;
    -v|--verbose) outcmd=cat; shift;;
    --help) usage;;
  esac

  # the verbosity option may have been the only argument, in which case
  # there is no command line left to send to a runner
  test $# -gt 0 || usage

  # alternative to --verbose
  if [ "${TAME_VERBOSE:-0}" -ge 1 ]; then
    outcmd=cat
  fi

  start-tamed-safe "$root"

  # the command is sent to whichever runner is reserved for us
  command-available-runner "$root" "$@" \
    | "$outcmd"
}
|
|
|
|
# entry point: hand every command-line argument to `main' unchanged
main "$@"
|
|
|