466 lines
12 KiB
Bash
Executable File
466 lines
12 KiB
Bash
Executable File
#!/bin/bash
|
|
# Client for TAME daemon (tamed)
|
|
#
|
|
# Copyright (C) 2014-2019 Ryan Specialty Group, LLC.
|
|
#
|
|
# This program is free software: you can redistribute it and/or modify
|
|
# it under the terms of the GNU General Public License as published by
|
|
# the Free Software Foundation, either version 3 of the License, or
|
|
# (at your option) any later version.
|
|
#
|
|
# This program is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU General Public License
|
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
##
|
|
|
|
# abort on command failure, on use of unset variables, and on failure of
# any stage of a pipeline
set -euo pipefail

# directory containing this script, with symlinks resolved
declare -r mypath=$( dirname "$( readlink -f "$0" )" )

# exit codes
#   EX_NOTAMED  tried to start tamed or runner but failed
#   EX_STALLED  runner stalled and could not recover
#   EX_NORUN    no available runners
#   EX_DLOCK    failed to get a lock to start tamed
#   EX_BLOCK    failed to get a lock for busy runner check
#   EX_USAGE    incorrect usage; sysexits.h
declare -ri \
  EX_NOTAMED=1 \
  EX_STALLED=2 \
  EX_NORUN=3 \
  EX_DLOCK=4 \
  EX_BLOCK=5 \
  EX_USAGE=64

# maximum amount of time in seconds to wait for runner to ack
# before forcibly restarting it (overridable through the environment)
declare -ri TAME_CMD_WAITTIME="${TAME_CMD_WAITTIME:-3}"

# propagate to daemon: mark for export so that the values, if set, are
# inherited by child processes
export TAMED_STALL_SECONDS TAMED_SPAWNER_PID
|
|
|
|
|
|
# Send a single command to the next available runner and
# observe the result
#
# A runner is reserved using `reserve-runner' and the remaining arguments
# are handed to `command-runner'; see `command-runner' for more
# information.  Runner output is echoed to stdout and is also appended to
# `run-ID.log' in the current working directory, where ID is the id of
# the reserved runner.
#
# If `reserve-runner' fails, this exits with the status that it reported
# (for example EX_BLOCK or EX_NORUN).  If it succeeds but yields no id,
# this exits with EX_NORUN.
command-available-runner()
{
  local -r root="${1?Missing root run path}"
  shift 1

  # the declaration is kept separate from the assignment because `local'
  # would otherwise replace the exit status of `reserve-runner' with its
  # own, hiding the reason for a failed reservation
  local id
  id=$( reserve-runner "$root" ) || exit $?

  test -n "$id" || {
    echo "no available runners at $root" >&2
    exit $EX_NORUN
  }

  command-runner "$id" "$root" "$@" \
    | tee -a "run-$id.log"
}
|
|
|
|
|
|
# Send a single command to a runner and observe the result
#
# Arguments: runner id, root run path, then the command words.  The
# command words are joined with spaces and written as a single line to
# `$root/$id/0'; runner output is read from `$root/$id/1'.
#
# Output of the runner will be echoed verbatim until a line beginning
# with "DONE " is found, after which the runner is marked as available
# and this procedure returns with the exit code indicated by the second
# word of that line.
#
# If the runner does not acknowledge the command, it is sent HUP and the
# command is attempted once more after TAME_CMD_WAITTIME seconds; if that
# fails too, this exits with EX_STALLED.
command-runner()
{
  local -ri id="${1?Missing id}"
  local -r root="${2?Missing root run path}"
  shift 2

  local -r base="$root/$id"
  local -ri pid=$( cat "$base/pid" )

  verify-runner "$base" "$pid"

  # forward signals to runner so that build is actually halted
  # (rather than continuing in background after we die)
  trap 'kill -TERM $pid &>/dev/null' INT TERM

  # all remaining arguments are passed to the runner
  # (printf rather than echo so that a command such as `-n' is not
  # mistaken for an option)
  printf '%s\n' "$*" > "$base/0"

  # we should immediately get a response from the runner;
  # if not, then it may have stalled for some reason
  verify-runner-ack "$*" < "$base/1" || {
    echo "warning: failed runner $id ack; requesting reload" >&2
    kill -HUP "$pid"

    # give some extra time in case the host is under high load
    sleep "$TAME_CMD_WAITTIME"

    # try one last time
    printf '%s\n' "$*" > "$base/0"
    verify-runner-ack "$*" < "$base/1" || {
      echo "error: runner $id still unresponsive; giving up" >&2
      exit "$EX_STALLED"
    }
  }

  # output lines from runner until we reach a line stating "DONE"
  #
  # `IFS= read -r' keeps leading/trailing whitespace and backslashes
  # intact so that output is relayed exactly as the runner wrote it
  #
  # NOTE(review): if input ends without a DONE line then the loop simply
  # finishes and the runner is never marked available again; confirm
  # whether that case ought to be reported as an error
  local line code
  while IFS= read -r line; do
    # don't parse words in the initial read because we may be
    # dealing with a lot of lines
    if [ "${line:0:5}" == "DONE " ]; then
      read -r _ code _ <<< "$line"

      mark-available "$base"
      return "$code"
    fi

    printf '%s\n' "$line"
  done < "$base/1"
}
|
|
|
|
|
|
# Get id of the first available runner and mark it as busy
#
# The lookup and the call to `mark-busy' both happen while holding an
# exclusive lock on `$root/busy-lock', so that a runner is acquired in an
# atomic manner.  The caller is responsible for invoking `mark-available'
# after processing is complete.
#
# If no runner is available, tamed is asked to spawn a new one (see
# `spawn-runner-and-wait'); should that fail, the result is empty and the
# status is EX_NORUN.  If the lock cannot be acquired within ten seconds,
# the result is empty and the status is EX_BLOCK.
reserve-runner()
{
  local -r root=${1?Missing root}

  local -r lock_wait=10

  (
    if ! flock -w "$lock_wait" 7; then
      echo "error: failed to acquire busy lock at $root" >&2
      exit $EX_BLOCK
    fi

    # grab the first available; a failed lookup simply leaves the id
    # empty, in which case a new runner is requested instead
    local id
    id=$( get-available-runner-id "$root" ) || true

    test -n "$id" || {
      id=$( spawn-runner-and-wait "$root" ) || {
        echo "error: failed to reserve runner at $root" >&2
        exit $EX_NORUN
      }
    }

    # mark it as busy while we still have the lock
    mark-busy "$root/$id"

    echo "$id"
  ) 7>"$root/busy-lock"
}
|
|
|
|
|
|
# Get the id of the next available runner
#
# THIS FUNCTION MUST BE GUARDED BY A MUTEX!  Otherwise there is a race
# between acquiring the available id and then actually making use of it.
#
# A runner is considered for selection when its `busy' file under
# `$root/ID/' contains a "0".  If multiple runners are available, then
# the first available runner sorted numerically will be chosen.  This
# helps to give the same runners more work, since they're more likely to
# have source (and compiled) already parsed in memory.  As such, runners
# will have load disproportionately spread, and may exhibit large
# variances in resource consumption.
#
# Sorting numerically is done because globbing sorts lexically---if runner
# 10 is spawned, then it would find itself after "1" in the list rather
# than after runner "9".
#
# If all runners are busy, then nothing will be output.
get-available-runner-id()
{
  local -r root=${1?Missing root}

  # each matching path has the form ROOT/ID/busy; drop the trailing
  # component and then everything up to the last slash, leaving ID
  grep -l 0 "$root"/*/busy \
    | sed -e 's|/busy$||' -e 's|.*/||' \
    | sort -n \
    | head -n1
}
|
|
|
|
|
|
# Tell tamed to spawn a new runner and output the new runner id
#
# THIS FUNCTION MUST BE GUARDED BY A MUTEX!  Otherwise there is a race
# between signaling and reading from `maxid'.
#
# This sends USR1 to the process whose pid is recorded in `$root/pid',
# requesting that the next runner be spawned, and then waits on the
# runner whose id is one greater than the value found in `$root/maxid'.
# See `wait-for-runner' for more information on waiting.
spawn-runner-and-wait()
{
  local -r root=${1?Missing root}

  local -r daemon_pid=$( < "$root/pid" )
  local -ri last_id=$( < "$root/maxid" )

  # the id that the requested runner is expected to receive
  local -ri expected_id=$(( last_id + 1 ))

  # request runner, then wait on the expected id
  kill -USR1 "$daemon_pid"
  wait-for-runner "$root" "$expected_id"

  echo "$expected_id"
}
|
|
|
|
|
|
# Mark a runner as busy (unable to accept new commands)
#
# This records "1" in the `busy' file beneath the given runner base
# path.  Once work is done, use `mark-available' to undo this operation.
mark-busy()
{
  local -r base=${1?Missing runner base path}
  local -r flag="$base/busy"

  printf '%s\n' 1 > "$flag"
}
|
|
|
|
|
|
# Mark a runner as available (able to accept new commands)
#
# This records "0" in the `busy' file beneath the given runner base
# path.  Once the runner is given work, use `mark-busy' to undo this
# operation.
mark-available()
{
  local -r base=${1?Missing runner base path}
  local -r flag="$base/busy"

  printf '%s\n' 0 > "$flag"
}
|
|
|
|
|
|
# Verify that a runner is available
#
# Arguments: the runner base path (a directory whose final component is
# the runner id) and the pid of the runner.
#
# If no process with the given pid is found, or if `$base/0' is not owned
# by the current user, then an error is output and the script exits with
# EX_NOTAMED.
verify-runner()
{
  local -r base="${1?Missing base}"
  local -ri pid="${2?Missing pid}"

  # the id is the last component of the base path; deriving it here
  # avoids depending on a variable that happens to be set by the caller
  local -r runner_id="${base##*/}"

  # USER is not guaranteed to be set (and we run under `set -u')
  local -r owner="${USER:-uid $UID}"

  ps "$pid" &>/dev/null || {
    echo "error: runner $runner_id ($pid) is offline!" >&2
    exit "$EX_NOTAMED"
  }

  test -O "$base/0" || {
    echo "error: runner $runner_id ($pid) is not owned by $owner!" >&2
    exit "$EX_NOTAMED"
  }
}
|
|
|
|
|
|
# Wait for command acknowledgment from runner
#
# A single line is read from stdin, which must arrive within
# TAME_CMD_WAITTIME seconds; otherwise, this function returns with a
# non-zero status.  The runner is expected to echo back the command that
# was given, prefixed with "COMMAND ", but a mismatch is presently
# tolerated (see TODO below).
verify-runner-ack()
{
  local -r cmd="${1?Missing command}"
  local -r expected="COMMAND $cmd"

  # propagate the status of a failed or timed-out read
  read -t"$TAME_CMD_WAITTIME" -r ack || return $?

  if [ "$expected" != "$ack" ]; then
    # TODO check for ack mismatch once output race condition is fixed
    :
  fi
}
|
|
|
|
|
|
# Wait somewhat impatiently for a runner
#
# Arguments: the root run path and the id of the runner to wait on.
#
# Assumes that the runner is ready once the pidfile `$root/$id/pid'
# becomes available.  Polls every half second for a maximum of six
# seconds before giving up, outputting an error, and exiting with
# EX_NOTAMED.
wait-for-runner()
{
  local -r root=${1?Missing root}
  local -r id=${2?Missing runner id}

  # we could use inotify, but that is not installed by default
  # on Debian systems, so let's just poll rather than introduce
  # another dependency (give up after 6 seconds)
  #
  # the counter must be compared arithmetically: `test N' with a single
  # operand only checks for a non-empty string and so is true even
  # for zero, which would poll forever
  local -i remaining=12
  while (( remaining-- > 0 )); do
    test ! -f "$root/$id/pid" || return 0
    sleep 0.5
  done

  # still not available
  echo "error: runner $id still unavailable; giving up" >&2
  exit "$EX_NOTAMED"
}
|
|
|
|
|
|
# Attempts to start tamed if it's not already running
#
# This is designed to be safe for parallel builds: `_start-tamed' is only
# invoked while holding an exclusive lock on the guard file `$root-guard',
# so other processes hang until spawning is complete.  If the lock cannot
# be acquired within five seconds, the status is EX_DLOCK.
#
# See `_start-tamed' for more information.
start-tamed-safe()
{
  local -r root=${1?Missing root}

  local -ri lock_wait=5
  local -r guard="$root-guard"

  # the guard file lives beside the root, so its parent must exist
  mkdir -p "$( dirname "$root" )"

  (
    if ! flock -w "$lock_wait" 6; then
      echo "error: failed to acquire tamed spawning lock at $root" >&2
      exit $EX_DLOCK
    fi

    _start-tamed "$root"

    # release explicitly and then discard the guard file
    flock -u 6
    rm -f "$guard"
  ) 6>"$guard"
}
|
|
|
|
|
|
# Start tamed if it is not already running
#
# tamed is considered to be running if a process exists with the pid
# recorded in `$root/pid'.  If so, nothing is started; otherwise, tamed
# is started in the background and disowned.
#
# In either case, wait for runner 0 to become available; this ensures
# that tamed is initialized even if this script is run after tamed is
# started but before it has fully come online (e.g. parallel make).
_start-tamed()
{
  local -r root="${1?Missing root}"

  # an unreadable pidfile is not an error; it yields a pid for which the
  # lookup below is expected to fail
  local -ri pid=$( cat "$root/pid" 2>/dev/null )

  if ! ps "$pid" &>/dev/null; then
    echo "starting tamed at $root..."

    # tell tamed to clean up so that we eliminate race conditions
    # with wait-for-runner (this will also kill any stray processes
    # that a previous tamed may have spawned but didn't get the
    # chance to clean up)
    kill-tamed "$root" || true

    # start tamed and allow it to persist for future commands
    "$mypath/tamed" "$root" & disown
  fi

  # wait for tamed even if it was already started (just in
  # case this script was executed right after tamed started
  # but before it is done initializing)
  wait-for-runner "$root" 0
}
|
|
|
|
|
|
# Kill tamed
#
# Ask tamed to kill itself by invoking the `tamed' executable that sits
# beside this script with `--kill' and the given root run path.  The
# status is that of the invocation.
kill-tamed()
{
  local -r root="${1?Missing root}"
  local -ra request=( "$mypath/tamed" --kill "$root" )

  "${request[@]}"
}
|
|
|
|
|
|
# Filter dslc output to essential information
#
# The original output of dslc is quite noisy; this reads it from stdin
# and filters it down, dropping delimited "~~~~[begin"/"~~~~[end"
# sections, "rm" and "COMMAND" lines, stack trace frames, and lines that
# begin with "[".  Warnings, notices, and errors are always kept and are
# colored using terminal escape sequences.
#
# Eventually, dslc ought to be modified to handle filtering its own
# output rather than wasting cycles doing this filtering.
saneout()
{
  awk '
    /^~~~~\[begin /,/^~~~~\[end / { next }
    /^rm / { next }
    /^COMMAND / { next }

    /^Exception|^\t+at / {
      if ( /^E/ ) {
        print
        print "Stack trace written to run-*.log"
      }

      next
    }

    /([Ww]arning|[Nn]otice)[: ]/ { printf "\033[0;33m"; w++; out=1 }
    /[Ff]atal:/                  { printf "\033[0;31m"; out=1 }
    /!|[Ee]rror:/                { printf "\033[0;31m"; e++; out=1 }
    /internal:/                  { printf "\033[0;35m"; out=1 }
    /internal error:/            { printf "\033[1m"; out=1 }

    /^[^[]/ || out { print; printf "\033[0;0m"; out=0 }
  '
}
|
|
|
|
|
|
# Output usage information and exit
#
# The usage text is written to stdout, after which the script exits
# with EX_USAGE.
usage()
{
  cat <<EOF
Usage: $0 [-v|--verbose] cmdline
   Or: $0 --kill
Send command line CMDLINE to a tamed runner. Start tamed if
not already running.

If a runner does not acknowledge a request in TAME_CMD_WAITTIME
seconds, it will be reloaded and given TAME_CMD_WAITTIME seconds
to come online. After that time has elapsed, the command will
be re-attempted, timing out again after TAME_CMD_WAITTIME and
at that point giving up.

The first available runner sorted numerically will be
chosen. This helps to give the same runners more work,
since they're more likely to have source (and compiled)
already parsed in memory. As such, runners will have load
disproportionately spread, and may exhibit large variances
in resource consumption.

If all runners are busy, then a new runner will be spawned,
allowing for parallel builds.

Options:
  --help         show this message
  --kill         kill tamed
  -v, --verbose  show runner logs

Environment Variables:
  TAME_VERBOSE       when greater than zero, show runner logs
                     (see also --verbose)
  TAME_CMD_WAITTIME  number of seconds to wait for ack from
                     runner (default 3)
EOF

  exit $EX_USAGE
}
|
|
|
|
|
|
# Run tame
#
# With `--kill', tamed is asked to terminate and the script exits.
# Otherwise, tamed is started if necessary and the command line is sent
# to an available runner, with the runner's output passed through
# `saneout' unless verbose output was requested with `-v', `--verbose',
# or a TAME_VERBOSE value of at least one.
#
# Usage information is shown (see `usage') if no command line is given,
# including when only a verbosity option is present.
main()
{
  local -r root=/run/user/$UID/tamed

  local outcmd=saneout

  test $# -gt 0 || usage

  case "${1:-}" in
    --kill) kill-tamed "$root"; exit;;
    -v|--verbose) outcmd=cat; shift;;
    --help) usage;;
  esac

  # a verbosity option alone leaves nothing to send to a runner
  test $# -gt 0 || usage

  # alternative to --verbose
  if [ "${TAME_VERBOSE:-0}" -ge 1 ]; then
    outcmd=cat
  fi

  start-tamed-safe "$root"

  # send the command line to whichever runner is available
  command-available-runner "$root" "$@" \
    | "$outcmd"
}
|
|
|
|
# entry point: hand all command-line arguments to main
main "$@"
|
|
|