#!/bin/bash
# Client for TAME daemon (tamed)
#
#   Copyright (C) 2014-2022 Ryan Specialty Group, LLC.
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.
#
#   You should have received a copy of the GNU General Public License
#   along with this program.  If not, see <http://www.gnu.org/licenses/>.
##

set -euo pipefail

# directory containing this script (symlinks resolved); used to locate
# the `tamed' executable
declare mypath
mypath=$( dirname "$( readlink -f "$0" )" )
readonly mypath

declare -ri EX_NOTAMED=1 # tried to start tamed or runner but failed
declare -ri EX_STALLED=2 # runner stalled and could not recover
declare -ri EX_NORUN=3   # no available runners
declare -ri EX_DLOCK=4   # failed to get a lock to start tamed
declare -ri EX_BLOCK=5   # failed to get a lock for busy runner check
declare -ri EX_NODONE=6  # tamed did not provide a DONE with exit code
declare -ri EX_USAGE=64  # incorrect usage; sysexits.h

# maximum amount of time in seconds to wait for runner to ack
# before forcibly restarting it
declare -ri TAME_CMD_WAITTIME="${TAME_CMD_WAITTIME:-3}"

# propagate to daemon
export TAMED_STALL_SECONDS
export TAMED_SPAWNER_PID
export TAMED_JAVA_OPTS


# Send a single command to the next available runner and
# observe the result
#
# The first argument is the root run path; all remaining arguments are
# passed through to `command-runner' as the command.  The output of
# `command-runner' is also appended to `run-<id>.log' in the current
# working directory.
#
# If `reserve-runner' fails, this exits with its status (for example
# EX_BLOCK when the busy lock cannot be acquired).  If it succeeds but
# yields no id, this exits with EX_NORUN.
#
# See `command-runner' for more information.
command-available-runner() {
  local -r root="${1?Missing root run path}"
  shift 1

  # declare separately from the assignment:  `local var=$(...)' always
  # succeeds, which would hide the exit status of `reserve-runner' and
  # make every failure look like EX_NORUN
  local id
  id=$( reserve-runner "$root" ) || exit "$?"

  test -n "$id" || {
    echo "no available runners at $root" >&2
    exit $EX_NORUN
  }

  # pipefail (set above) lets the status of `command-runner' survive tee
  command-runner "$id" "$root" "$@" \
    | tee -a "run-$id.log"
}


# Send a single command to a runner and observe the result
#
# stdin will be directed to the runner.
# stdout of the runner will be
# echoed until a line beginning with "DONE" is found, after which this
# procedure will return with the exit code indicated by the runner.
#
# Arguments:
#   $1    runner id
#   $2    root run path (the runner's files live under "$2/$1")
#   $3..  command; joined with spaces and written to the runner
#
# Files used beneath "$root/$id":  `pid' is read; `cmdstart', `cmdline' and
# `0' are written; `1' is read for the runner's response.
#
# Exit/return status:  the code from the DONE line; EX_STALLED (exit) if
# the runner does not acknowledge even after a reload request; EX_NODONE if
# the output ends without a DONE line or the DONE line carries no numeric
# code.
command-runner() {
  local -ri id="${1?Missing id}"
  local -r root="${2?Missing root run path}"
  shift 2

  local -r base="$root/$id"
  local -ri pid=$( cat "$base/pid" )

  verify-runner "$base" "$id" "$pid"

  # forward signals to runner so that build is actually halted
  # (rather than continuing in background after we die)
  trap 'kill -TERM $pid &>/dev/null' INT TERM

  # log the provided command line and starting time so that we can determine
  # what is currently being compiled and how long it is taking
  millis > "$base/cmdstart"
  echo "$*" > "$base/cmdline"

  # all remaining arguments are passed to the runner
  echo "$*" > "$base/0"

  # we should immediately get a response from the runner;
  # if not, then it may have stalled for some reason
  verify-runner-ack "$*" < "$base/1" || {
    echo "warning: failed runner $id ack; requesting reload" >&2
    kill -HUP "$pid"

    # give some extra time in case the host is under high load
    sleep "$TAME_CMD_WAITTIME"

    # try one last time
    echo "$*" > "$base/0"
    verify-runner-ack "$*" < "$base/1" || {
      echo "error: runner $id still unresponsive; giving up" >&2
      exit "$EX_STALLED"
    }
  }

  # output lines from runner until we reach a line stating "DONE"
  #
  # IFS is emptied for the read so that leading and trailing whitespace in
  # the runner's output (e.g. the tab that starts a stack trace line) is
  # echoed unchanged rather than being trimmed.
  local line code
  while IFS= read -r line; do
    # don't parse words in the initial read because we may be
    # dealing with a lot of lines
    if [[ "$line" == "DONE "* ]]; then
      read -r _ code _ <<< "$line"

      runtab-append "$base"
      mark-available "$base"

      # a DONE without a numeric code cannot be handed to `return'
      [[ "$code" =~ ^[0-9]+$ ]] || return "$EX_NODONE"

      return "$code"
    fi

    # printf rather than echo so that a line such as "-n" is output
    # literally instead of being taken as an option
    printf '%s\n' "$line"
  done < "$base/1"

  # We should have returned as soon as we received DONE.  If this was not
  # provided, then something probably went wrong (e.g. JVM crash).
  return "$EX_NODONE"
}


# Get id of the first available runner and mark it as busy
#
# If no runners are available, tamed is signalled to spawn a new one.
#
# This command calls `mark-busy' so that it can acquire a runner in an
# atomic manner.
# The caller is responsible for invoking `mark-available'
# after processing is complete.
#
# The runner id is written to stdout.  The search for an available runner
# and the call to `mark-busy' both happen while an exclusive lock on
# "$root/busy-lock" is held (waiting at most 10 seconds for it).
#
# Exits the locking subshell with EX_BLOCK if the lock cannot be acquired
# and with EX_NORUN if a new runner could not be spawned; in both cases
# nothing is output.
reserve-runner() {
  local -r root=${1?Missing root}
  local -r timeout=10

  (
    flock -w "$timeout" 7 || {
      echo "error: failed to acquire busy lock at $root" >&2
      exit $EX_BLOCK
    }

    # grab the first available or request a new one
    #
    # Finding nothing is an expected outcome, not an error:  grep exits
    # non-zero when no busy file matches, and with pipefail that becomes
    # the status of this assignment.  Ignore it explicitly so that an
    # active errexit cannot abort us before a runner is spawned.
    local id
    id=$( get-available-runner-id "$root" ) || true

    if [ -z "$id" ]; then
      id=$( spawn-runner-and-wait "$root" ) || {
        echo "error: failed to reserve runner at $root" >&2
        exit $EX_NORUN
      }
    fi

    # mark it as busy while we still have the lock
    mark-busy "$root/$id"

    echo "$id"
  ) 7>"$root/busy-lock"
}


# Get the id of the next available runner
#
# THIS FUNCTION MUST BE GUARDED BY A MUTEX!  Otherwise there is a race
# between acquiring the available id and then actually making use of it.
#
# A runner is considered available when its "$root/<id>/busy" file
# contains a 0.
#
# If multiple runners are available, then the first available runner sorted
# numerically will be chosen.  This helps to give the same runners more
# work, since they're more likely to have source (and compiled) already
# parsed in memory.  As such, runners will have load disproportionately
# spread, and may exhibit large variances in resource consumption.
#
# Sorting numerically is done because globbing sorts lexically---if runner
# 10 is spawned, then it would find itself after "1" in the list rather than
# after runner "9".
#
# If no runner is available, then nothing will be output (and, because
# grep matched nothing, the status is non-zero under pipefail).
get-available-runner-id() {
  local -r root=${1?Missing root}

  # the id is the directory component directly above each `busy' file
  grep -l 0 "$root"/*/busy \
    | awk -F/ '{ print $(NF-1) }' \
    | sort -n \
    | head -n1
}


# Tell tamed to spawn a new runner and output the new runner id
#
# THIS FUNCTION MUST BE GUARDED BY A MUTEX!  Otherwise there is a race
# between signaling and reading from `maxid'.
#
# This sends USR1 to tamed indicating that the next available runner should
# be spawned, and then waits on that expected runner.  See `wait-for-runner'
# for more information on waiting.
spawn-runner-and-wait() {
  local -r root=${1?Missing root}

  # pid of tamed and the highest runner id so far, as recorded under $root
  local -r pid=$( < "$root/pid" )
  local -ri maxid=$( < "$root/maxid" )

  # request runner
  kill -USR1 "$pid"

  # wait on the expected id
  local -ri nextid=$(( maxid + 1 ))
  wait-for-runner "$root" "$nextid"

  echo "$nextid"
}


# Mark a runner as busy (unable to accept new commands)
#
# Once work is done, use `mark-available' to undo this operation.
mark-busy() {
  local -r base=${1?Missing runner base path}

  echo 1 > "$base/busy"
}


# Mark a runner as available (able to accept new commands)
#
# Once work is available, use `mark-busy' to undo this operation.
mark-available() {
  local -r base=${1?Missing runner base path}

  echo 0 > "$base/busy"
  echo idle > "$base/cmdline"

  # this can be used to determine how long the worker has been idle
  millis > "$base/cmdstart"
}


# Output seconds and milliseconds, space-delimited
millis() {
  local date
  date=( $(date '+%s %N') )

  # %N returns nanoseconds and it may be 0-prefixed, which would be
  # interpreted as octal without the explicit base specification
  echo "${date[0]}" "$(( 10#"${date[1]}" / 1000000 ))"
}


# Append data to the runner table (runtab)
#
# This takes information about the most recently executed command and
# appends it to a table representing the work that the runner has
# done.  This should be done at the end of processing a particular job but
# before marking the runner as available using `mark-available'.
#
# The columns of this report are, tab-delimited:
#   1. Start date (Unix timestamp, seconds);
#   2. Duration (milliseconds); and
#   3. Runner command line
runtab-append() {
  local -r base=${1?Missing runner base path}

  local cmd duration
  local -a cmdstart now

  cmd=$(< "$base/cmdline")
  cmdstart=( $(< "$base/cmdstart") )
  now=( $(millis) )

  # Both timestamps are "seconds milliseconds" pairs in the format output
  # by `millis'; collapse each into milliseconds and subtract.  The
  # millisecond field is the result of an integer division in `millis', so
  # it carries no leading zeros that could be mistaken for octal here.
  duration=$((
    ((now[0] * 1000) + now[1]) - ((cmdstart[0] * 1000) + cmdstart[1])
  ))

  # the duration is in milliseconds
  printf "%d\t%s\t%s\n" "${cmdstart[0]}" "$duration" "$cmd" >> "$base/runtab"
}


# Verify that a runner is available
#
# If the runner is offline or not owned by $UID, then exit with
# a non-zero status.
verify-runner() {
  local -r base="${1?Missing base}"
  local -ri id="${2?Missing id}"
  local -ri pid="${3?Missing pid}"

  ps "$pid" &>/dev/null || {
    echo "error: runner $id ($pid) is offline!" >&2
    exit "$EX_NOTAMED"
  }

  # USER is not guaranteed to be set; fall back to the numeric uid so that
  # the message is still produced when unset variables are errors (set -u)
  test -O "$base/0" || {
    echo "error: runner $id ($pid) is not owned by ${USER:-$UID}!" >&2
    exit "$EX_NOTAMED"
  }
}


# Wait for command acknowledgment from runner
#
# The runner must respond within TAME_CMD_WAITTIME seconds
# and must echo back the command that was given.  Otherwise,
# this function returns with a non-zero status.
#
# The acknowledgment is read from stdin.  Note that a mismatching
# acknowledgment is currently tolerated (see TODO below); only a missing
# one results in failure.
verify-runner-ack() {
  local -r cmd="${1?Missing command}"
  local ack

  read -t"$TAME_CMD_WAITTIME" -r ack || return

  test "COMMAND $cmd" == "$ack" || {
    # TODO check for ack mismatch once output race condition is fixed
    :
  }
}


# Wait somewhat impatiently for a runner
#
# Assumes that the runner is ready once the pidfile becomes
# available.  Polls for a maximum of six seconds before giving up
# and exiting with a non-zero status.
wait-for-runner() {
  local -r root=${1?Missing root}
  local -r id=${2?Missing runner id}

  # we could use inotify, but that is not installed by default
  # on Debian systems, so let's just poll rather than introduce
  # another dependency (give up after 6 seconds)
  #
  # The countdown must be an arithmetic comparison:  `test' applied to a
  # single word only checks that it is non-empty, which holds for "0" and
  # negative numbers too and would therefore never end the loop.
  local -i i=12
  while (( i-- > 0 )); do
    test ! -f "$root/$id/pid" || return 0
    sleep 0.5
  done

  # still not available
  echo "error: runner $id still unavailable; giving up" >&2
  exit "$EX_NOTAMED"
}


# Attempts to start tamed if it's not already running
#
# This is designed to be safe for parallel builds by allowing only the first
# process to start tamed and hanging the others until spawning is complete.
#
# The lock is taken on "$root-guard" (waiting at most 5 seconds); failing to
# acquire it exits the locking subshell with EX_DLOCK.
#
# See `_start-tamed' for more information.
start-tamed-safe() {
  local -r root=${1?Missing root}
  local -ri timeout=5
  local -r guard="$root-guard"

  mkdir -p "$( dirname "$root" )"

  (
    flock -w "$timeout" 6 || {
      echo "error: failed to acquire tamed spawning lock at $root" >&2
      exit $EX_DLOCK
    }

    _start-tamed "$root"

    flock -u 6
    rm -f "$guard"
  ) 6>"$guard"
}


# Start tamed if it is not already running
#
# If tamed is already running, nothing will happen; otherwise, start
# tamed and wait impatiently for the runner to become available.
#
# Even if tamed is started, wait for runner 0 to become available;
# this ensures that tamed is initialized even if this script is run
# after tamed is started but before it has fully come online (e.g
# parallel make).
_start-tamed() {
  local -r root="${1?Missing root}"

  # a missing pidfile leaves this at 0 (integer attribute), which the
  # `ps' check below then treats as "not running"
  local -ri pid=$( cat "$root/pid" 2>/dev/null )

  ps "$pid" &>/dev/null || {
    echo "starting tamed at $root..."

    # tell tamed to clean up so that we eliminate race conditions
    # with wait-for-tamed (this will also kill any stray processes
    # that a previous tamed may have spawned but didn't get the
    # chance to clean up)
    kill-tamed "$root" || true

    # start tamed and allow it to persist for future commands
    "$mypath/tamed" "$root" &
    disown
  }

  # wait for tamed even if it was already started (just in
  # case this script was executed right after tamed started
  # but before it is done initializing)
  wait-for-runner "$root" 0
}


# Kill tamed
#
# Ask tamed to kill itself.
kill-tamed() {
  local -r rundir="${1?Missing root}"

  # delegate to the daemon executable located alongside this script
  "$mypath/tamed" --kill "$rundir"
}


# Reduce dslc output to the essentials
#
# Reads dslc output on stdin and writes the filtered result to stdout.
# dslc is very chatty; this drops the noise so that mostly errors and
# warnings remain, colorized by severity.
#
# Eventually, dslc ought to be modified to handle filtering its own
# output rather than wasting cycles doing this filtering.
saneout() {
  # awk program:  discard the ~~~~[begin ... ~~~~[end sections, `rm' and
  # COMMAND lines and stack trace frames (an Exception line itself is kept
  # along with a pointer to the log); emit a color sequence ahead of
  # lines that look like warnings, notices or errors; and finally print
  # any line that was flagged or that does not start with `['
  local -r filter='
    /^~~~~\[begin /,/^~~~~\[end / { next }
    /^rm / { next }
    /^COMMAND / { next }
    /^Exception|^\t+at / {
      if ( /^E/ ) {
        print;
        print "Stack trace written to run-*.log";
      }
      next;
    }
    /([Ww]arning|[Nn]otice)[: ]/ { printf "\033[0;33m"; w++; out=1; }
    /[Ff]atal:/ { printf "\033[0;31m"; out=1; }
    /!|[Ee]rror:/ { printf "\033[0;31m"; e++; out=1; }
    /internal:/ { printf "\033[0;35m"; out=1; }
    /internal error:/ { printf "\033[1m"; out=1; }
    /^[^[]/ || out { print; printf "\033[0;0m"; out=0; }
  '

  # the sed stage clears the entire line before outputting in an attempt to
  # better accommodate the runner status line from tamed; this can be
  # removed once the Makefile properly takes up this task.
  awk "$filter" | sed 's/^/\x1b[2K\r/'
}


# Output usage information and exit usage() { cat <