diff --git a/bin/tame b/bin/tame index a1451f06..a022320b 100755 --- a/bin/tame +++ b/bin/tame @@ -21,8 +21,11 @@ set -euo pipefail declare -r mypath=$( dirname "$( readlink -f "$0" )" ) -declare -ri EX_NOTAMED=1 # tried to start tamed but failed +declare -ri EX_NOTAMED=1 # tried to start tamed or runner but failed declare -ri EX_STALLED=2 # runner stalled and could not recover +declare -ri EX_NORUN=3 # no available runners +declare -ri EX_DLOCK=4 # failed to get a lock to start tamed +declare -ri EX_BLOCK=5 # failed to get a lock for busy runner check declare -ri EX_USAGE=64 # incorrect usage; sysexits.h # maximum amount of time in seconds to wait for runner to ack @@ -34,6 +37,27 @@ export TAMED_STALL_SECONDS export TAMED_SPAWNER_PID +# Send a single command to the next available runner and +# observe the result +# +# See `command-runner' for more information. +command-available-runner() +{ + local -r root="${1?Missing root run path}" + shift 1 + + local id; id=$( reserve-runner "$root" ) || exit $? # keep EX_BLOCK/EX_NORUN + + test -n "$id" || { + echo "no available runners at $root" >&2 + exit $EX_NORUN + } + + command-runner "$id" "$root" "$@" \ + | tee -a "run-$id.log" +} + + # Send a single command to a runner and observe the result # # stdin will be directed to the runner. stdout of the runner will be @@ -48,8 +72,6 @@ command-runner() local -r base="$root/$id" local -ri pid=$( cat "$base/pid" ) - # TODO flock - verify-runner "$base" "$pid" # forward signals to runner so that build is actually halted @@ -82,6 +104,8 @@ command-runner() # dealing with a lot of lines if [ "${line:0:5}" == "DONE " ]; then read _ code _ <<< "$line" + + mark-available "$base" return "$code" fi @@ -90,6 +114,117 @@ command-runner() } +# Get id of the first available runner and mark it as busy +# +# If no runners are available, tamed is signalled to spawn a new one. +# +# This command calls `mark-busy' so that it can acquire a runner in an +# atomic manner. 
The caller is responsible for invoking `mark-available' +# after processing is complete. +# +# If no runner is available, then the result will be empty. +reserve-runner() +{ + local -r root=${1?Missing root} + + local -r timeout=10 + + ( + flock -w $timeout 7 || { + echo "error: failed to acquire busy lock at $root" >&2 + exit $EX_BLOCK + } + + # grab the first available or request a new one + local id=$( get-available-runner-id "$root" ) + if [ -z "$id" ]; then + id=$( spawn-runner-and-wait "$root" ) || { + echo "error: failed to reserve runner at $root" >&2 + exit $EX_NORUN + } + fi + + # mark it as busy while we still have the lock + mark-busy "$root/$id" + + echo "$id" + ) 7>"$root/busy-lock" +} + + +# Get the id of the next available runner +# +# THIS FUNCTION MUST BE GUARDED BY A MUTEX! Otherwise there is a race +# between acquiring the available id and then actually making use of it. +# +# If multiple runners are available, then the first available runner sorted +# numerically will be chosen. This helps to give the same runners more +# work, since they're more likely to have source (and compiled) already +# parsed in memory. As such, runners will have load disproportionately +# spread, and may exhibit large variances in resource consumption. +# +# Sorting numerically is done because globbing sorts lexically---if runner +# 10 is spawned, then it would find itself after "1" in the list rather than +# after runner "9". +# +# If all runners are busy, then nothing will be returned. +get-available-runner-id() +{ + local -r root=${1?Missing root} + + grep -l 0 "$root"/*/busy \ + | awk -F/ '{ print $(NF-1) }' \ + | sort -n \ + | head -n1 +} + + +# Tell tamed to spawn a new runner and output the new runner id +# +# THIS FUNCTION MUST BE GUARDED BY A MUTEX! Otherwise there is a race +# between signaling and reading from `maxid'. +# +# This sends USR1 to tamed indicating that the next available runner should +# be spawned, and then waits on that expected runner. 
See `wait-for-runner' +# for more information on waiting. +spawn-runner-and-wait() +{ + local -r root=${1?Missing root} + + local -r pid=$( < "$root/pid" ) + local -ri maxid=$( < "$root/maxid" ) + + # request runner + kill -USR1 "$pid" + + # wait on the expected id + local -ri nextid=$(( maxid + 1 )) + wait-for-runner "$root" "$nextid" + + echo "$nextid" +} + + +# Mark a runner as busy (unable to accept new commands) +# +# Once work is done, use `mark-available' to undo this operation. +mark-busy() +{ + local -r base=${1?Missing runner base path} + echo 1 > "$base/busy" +} + + +# Mark a runner as available (able to accept new commands) +# +# Once work is available, use `mark-busy' to undo this operation. +mark-available() +{ + local -r base=${1?Missing runner base path} + echo 0 > "$base/busy" +} + + # Verify that a runner is available # # If the runner is offline or not owned by $UID, then exit with @@ -128,30 +263,57 @@ verify-runner-ack() } -# Wait somewhat impatiently for tamed +# Wait somewhat impatiently for a runner # -# Assumes that tamed's runner 0 is running once the pidfile becomes +# Assumes that the runner is ready once the pidfile becomes # available. Polls for a maximum of six seconds before giving up # and exiting with a non-zero status. -wait-for-tamed() +wait-for-runner() { - local -r base="${1?Missing base}" + local -r root=${1?Missing root} + local -r id=${2?Missing runner id} # we could use inotify, but that is not installed by default # on Debian systems, so let's just poll rather than introduce # another dependency (give up after 6 seconds) local -i i=12 - while test $((i--)); do + while (( i-- )); do - test ! -f "$base/0/pid" || return 0 + test ! 
-f "$root/$id/pid" || return 0 sleep 0.5 done # still not available - echo 'error: tamed still unavailable; giving up' >&2 + echo "error: runner $id still unavailable; giving up" >&2 exit "$EX_NOTAMED" } +# Attempts to start tamed if it's not already running +# +# This is designed to be safe for parallel builds by allowing only the first +# process to start tamed and hanging the others until spawning is complete. +# +# See `_start-tamed' for more information. +start-tamed-safe() +{ + local -r root=${1?Missing root} + + local -ri timeout=5 + + ( + flock -w $timeout 6 || { + echo "error: failed to acquire tamed spawning lock at $root" >&2 + exit $EX_DLOCK + } + + _start-tamed "$root" + + flock -u 6 + # guard file is kept: unlinking it would let racers lock different inodes + ) 6>"$root-guard" +} + + # Start tamed if it is not already running # # If tamed is already running, nothing will happen; otherwise, start @@ -161,7 +323,7 @@ wait-for-tamed() # this ensures that tamed is initialized even if this script is run # after tamed is started but before it has fully come online (e.g # parallel make). -start-tamed() +_start-tamed() { local -r root="${1?Missing root}" @@ -183,7 +345,7 @@ start-tamed() # wait for tamed even if it was already started (just in # case this script was executed right after tamed started # but before it is done initializing) - wait-for-tamed "$root" + wait-for-runner "$root" 0 } @@ -243,6 +405,16 @@ to come online. After that time has elapsed, the command will be re-attempted, timing out again after TAME_CMD_WAITTIME and and at that point giving up. +The first available runner sorted numerically will be +chosen. This helps to give the same runners more work, +since they're more likely to have source (and compiled) +already parsed in memory. As such, runners will have load +disproportionately spread, and may exhibit large variances +in resource consumption. + +If all runners are busy, then a new runner will be spawned, +allowing for parallel builds. 
+ Options: --help show this message --kill kill tamed @@ -279,11 +451,10 @@ main() outcmd=cat fi - start-tamed "$root" + start-tamed-safe "$root" # for now we only support a single runner - command-runner 0 "$root" "$@" \ - | tee -a "run-0.log" \ + command-available-runner "$root" "$@" \ | "$outcmd" } diff --git a/bin/tamed b/bin/tamed index 36b842f2..2e0afe1f 100755 --- a/bin/tamed +++ b/bin/tamed @@ -54,7 +54,7 @@ mkfifos() rm -f "$root-$n" mkfifo -m 0600 "$root/$n" || { - echo "fatal: failed to create FIFO at $in" + echo "fatal: failed to create FIFO at $root/$n" exit $EX_CANTCREAT } done @@ -64,6 +64,20 @@ mkfifos() } +# Spawn a new runner using the next available runner id +# +# See `spawn-runner' for more information. +spawn-next-runner() +{ + local -r root="${1?Missing root path}" + + # read the current largest id; the new runner is one past it + local -ri id=$( < "$root/maxid" ) + + spawn-runner "$(( id + 1 ))" "$root" +} + + # Spawn a runner # # A new runner is created by spawning dslc and attaching @@ -79,6 +93,9 @@ spawn-runner() mkfifos "$base" + # flag as available (the client will manipulate these) + echo 0 > "$base/busy" + # monitor runner usage and kill when inactive stall-monitor "$base" & @@ -95,6 +112,9 @@ spawn-runner() echo "$!" > "$base/pid" + # we assume that this is the new largest runner id + echo "$id" > "$root/maxid" + echo "runner $id ($!): $base" } @@ -271,10 +291,22 @@ main() rm -rf "$root"; mkdir -p "$root" echo $$ > "$root/pid" - # only a single runner for now + # start with a single runner; we'll spawn more if requested spawn-runner 0 "$root" + trap "spawn-next-runner '$root'" USR1 - wait -n + # wait for runners to complete or for a signal to be received by this + # process that terminates `wait' + while true; do + wait -n || { + status=$? 
+ + # ignore USR1 (`wait' reports 128 + the signal number) + if [ $status -ne $(( 128 + $(kill -l USR1) )) ]; then + exit $status + fi + } + done } main "$@" diff --git a/build-aux/Makefile.am b/build-aux/Makefile.am index c6f536f3..16267c66 100644 --- a/build-aux/Makefile.am +++ b/build-aux/Makefile.am @@ -86,7 +86,6 @@ ant = @ANT@ -e default: program-ui c1map FORCE .DELETE_ON_ERROR: -.NOTPARALLEL: # keep all intermediate files for easy introspection .SECONDARY: @@ -153,7 +152,11 @@ c1map: $(dest_c1map) %.csvo: %.csv cp $< $@ -%.xml: %.csvo +# TODO: This is necessary right now because of the current depgen +# process. Once that is eliminated in favor of individual dependency files +# (e.g. the %.d convention), this can go away since dependency generation +# can properly take place for the various file formats. +%.xml: %.csvo rater/core/vector/table.xmlo rater/tools/csv2xml $< > $@ version: .version.xml @@ -169,7 +172,7 @@ ui/html/index.phtml: ui/program.expanded.xml ui/package-dfns.xmlo: ui/package-dfns.xml ui/package-dfns.xml: ui/program.expanded.xml $(TAME) progui-pkg $< $@ -ui/package-map.xmlo: ui/package-map.xml +ui/package-map.xmlo: ui/package-map.xml ui/package-dfns.xmlo ui/package-map.xml: ui/program.expanded.xml ui/package-dfns.xml $(TAME) progui-pkg-map $< $@ diff --git a/doc/usage.texi b/doc/usage.texi index 6665da48..fd39ecba 100644 --- a/doc/usage.texi +++ b/doc/usage.texi @@ -157,6 +157,7 @@ $ ./configure SAXON_CP=/path/to/saxon9he.jar @menu * Common Targets:: Common Makefile targets. +* Parallel Builds:: Building multiple files concurrently. @end menu @@ -198,3 +199,53 @@ Here are the most common phony targets that may be useful: for example, to rebuild all files using the new version. @end table + + +@node Parallel Builds +@subsection Parallel Builds +@cindex Make, parallel builds +GNU Make offers parallel builds through the @code{-j} flag, + which specifies the maximum number of concurrent jobs. +This is supported by both @command{tame} and @command{tamed}. 
+ +@command{tamed} starts by spawning a single runner, + which is marked as available. +When a command is issued to @command{tame}, + it will reserve the first available runner it finds by marking it as + busy. +Once the runner is finished, + the runner will be marked as available once again. +If all runners are busy, + @command{tame} issues a signal to @command{tamed} to spawn another + runner, + which @command{tame} then reserves and marks as busy. +No runners are ever freed (terminated) until @command{tamed} itself + terminates. + +For example, + to build with up to four concurrent runners, + use @samp{-j4}: + +@float Figure, f:make-j +@example +$ make -j4 +@end example +@caption{Compiling with four concurrent runners} +@end float + +@cindex build, memory +@tip{Compiling and linking large packages can be memory intensive. + While runner memory consumption may vary, + it's wise to profile the memory usage of a single runner and + use that to estimate how many concurrent runners your system + can support.} + +@cindex Saxon +@tip{Saxon is also multi-threaded under certain circumstances, + so you should allocate fewer jobs than you have available CPU + cores. + GNU Make also offers a @code{-l} flag that tells it not to spawn + more jobs if the system is above the indicated load. + But note that, + even if a runner is idle, + it is still using up memory.}