Parallel build support

tamed was originally designed with parallel builds in mind, but I never completed that work because we didn't have hardware that would benefit strongly from it. That has since changed. tamed will now spawn additional runners as needed to fulfill requests, which works around the problem of not knowing how many jobs GNU Make is going to try to run at once. A couple of minor dependency fixes/workarounds were needed in the Makefile for now, but otherwise everything appears to be working great.
2019-04-03 15:26:22 -04:00 · 2019-04-03 15:26:22 -04:00 · 1a35232bd8
parent 7b7cf13607
commit 1a35232bd8
4 changed files with 277 additions and 20 deletions
--- a/bin/tame
+++ b/bin/tame
@ -21,8 +21,11 @@ set -euo pipefail

 declare -r mypath=$( dirname "$( readlink -f "$0" )" )

-declare -ri EX_NOTAMED=1  # tried to start tamed but failed
+declare -ri EX_NOTAMED=1  # tried to start tamed or runner but failed
 declare -ri EX_STALLED=2  # runner stalled and could not recover
+declare -ri EX_NORUN=3    # no available runners
+declare -ri EX_DLOCK=4    # failed to get a lock to start tamed
+declare -ri EX_BLOCK=5    # failed to get a lock for busy runner check
 declare -ri EX_USAGE=64   # incorrect usage; sysexits.h

 # maximum amount of time in seconds to wait for runner to ack
@ -34,6 +37,27 @@ export TAMED_STALL_SECONDS
 export TAMED_SPAWNER_PID


+# Send a single command to the next available runner and
+# observe the result
+#
+# See `command-runner' for more information.
+command-available-runner()
+{
+  local -r root="${1?Missing root run path}"
+  shift 1
+
+  local -r id=$( reserve-runner "$root" )
+
+  test -n "$id" || {
+    echo "no available runners at $root" >&2
+    exit $EX_NORUN
+  }
+
+  command-runner "$id" "$root" "$@" \
+    | tee -a "run-$id.log"
+}
+
+
 # Send a single command to a runner and observe the result
 #
 # stdin will be directed to the runner.  stdout of the runner will be
@ -48,8 +72,6 @@ command-runner()
  local -r base="$root/$id"
  local -ri pid=$( cat "$base/pid" )

-  # TODO flock
-
  verify-runner "$base" "$pid"

  # forward signals to runner so that build is actually halted
@ -82,6 +104,8 @@ command-runner()
    # dealing with a lot of lines
    if [ "${line:0:5}" == "DONE " ]; then
      read _ code _ <<< "$line"
+
+      mark-available "$base"
      return "$code"
    fi

@ -90,6 +114,117 @@ command-runner()
 }


+# Get id of the first available runner and mark it as busy
+#
+# If no runners are available, tamed is signalled to spawn a new one.
+#
+# This command calls `mark-busy' so that it can acquire a runner in an
+# atomic manner.  The caller is responsible for invoking `mark-available'
+# after processing is complete.
+#
+# If no runner is available, then the result will be empty.
+reserve-runner()
+{
+  local -r root=${1?Missing root}
+
+  local -r timeout=10
+
+  (
+    flock -w $timeout 7 || {
+      echo "error: failed to acquire busy lock at $root" >&2
+      exit $EX_BLOCK
+    }
+
+    # grab the first available or request a new one
+    local id=$( get-available-runner-id "$root" )
+    if [ -z "$id" ]; then
+      id=$( spawn-runner-and-wait "$root" ) || {
+        echo "error: failed to reserve runner at $root" >&2
+        exit $EX_NORUN
+      }
+    fi
+
+    # mark it as busy while we still have the lock
+    mark-busy "$root/$id"
+
+    echo "$id"
+  ) 7>"$root/busy-lock"
+}
+
+
+# Output the id of the next available runner
+#
+# THIS FUNCTION MUST BE GUARDED BY A MUTEX!  Otherwise there is a race
+# between acquiring the available id and then actually making use of it.
+#
+# A runner is treated as available when its `busy' file contains a 0.
+#
+# If multiple runners are available, then the first available runner sorted
+# numerically will be chosen.  This helps to give the same runners more
+# work, since they're more likely to have source (and compiled) already
+# parsed in memory.  As such, runners will have load disproportionately
+# spread, and may exhibit large variances in resource consumption.
+#
+# Sorting numerically is done because globbing sorts lexically---if runner
+# 10 is spawned, then it would find itself after "1" in the list rather than
+# after runner "9".
+#
+# If all runners are busy, then nothing will be output.
+get-available-runner-id()
+{
+  local -r root=${1?Missing root}
+
+  # one `busy' flag file per runner directory
+  local -ra flags=( "$root"/*/busy )
+
+  # the runner id is the directory component preceding `busy'
+  grep -l 0 "${flags[@]}" \
+    | awk -F/ '{ print $(NF-1) }' \
+    | sort -n \
+    | head -n1
+}
+
+
+# Ask tamed to spawn a new runner and output the id of that runner
+#
+# THIS FUNCTION MUST BE GUARDED BY A MUTEX!  Otherwise there is a race
+# between signaling and reading from `maxid'.
+#
+# This sends USR1 to tamed (whose pid is read from `pid' beneath the
+# root), indicating that the next available runner should be spawned.
+# The expected id is one greater than the value read from `maxid'.  This
+# then waits on that expected runner; see `wait-for-runner' for more
+# information on waiting.
+spawn-runner-and-wait()
+{
+  local -r root=${1?Missing root}
+
+  local -r tamed_pid=$( < "$root/pid" )
+  local -ri current_max=$( < "$root/maxid" )
+
+  # the id that the new runner is expected to be given
+  local -ri expected_id=$(( current_max + 1 ))
+
+  # request the runner and then wait for it to come online
+  kill -USR1 "$tamed_pid"
+  wait-for-runner "$root" "$expected_id"
+
+  echo "$expected_id"
+}
+
+
+# Flag a runner as busy (unable to accept new commands)
+#
+# This writes a 1 to the `busy' file beneath the given runner base path.
+# Once work is done, use `mark-available' to undo this operation.
+mark-busy()
+{
+  local -r base=${1?Missing runner base path}
+
+  printf '1\n' > "$base/busy"
+}
+
+
+# Flag a runner as available (able to accept new commands)
+#
+# This writes a 0 to the `busy' file beneath the given runner base path.
+# When the runner is given work again, use `mark-busy' to undo this
+# operation.
+mark-available()
+{
+  local -r base=${1?Missing runner base path}
+
+  printf '0\n' > "$base/busy"
+}
+
+
 # Verify that a runner is available
 #
 # If the runner is offline or not owned by $UID, then exit with
@ -128,30 +263,57 @@ verify-runner-ack()
 }


-# Wait somewhat impatiently for tamed
+# Wait somewhat impatiently for a runner
 #
-# Assumes that tamed's runner 0 is running once the pidfile becomes
+# Assumes that the runner is ready once the pidfile becomes
 # available.  Polls for a maximum of six seconds before giving up
 # and exiting with a non-zero status.
-wait-for-tamed()
+wait-for-runner()
 {
-  local -r base="${1?Missing base}"
+  local -r root=${1?Missing root}
+  local -r id=${2?Missing runner id}

  # we could use inotify, but that is not installed by default
  # on Debian systems, so let's just poll rather than introduce
  # another dependency (give up after 6 seconds)
  local -i i=12
  while test $((i--)); do
-    test ! -f "$base/0/pid" || return 0
+    test ! -f "$root/$id/pid" || return 0
    sleep 0.5
  done

  # still not available
-  echo 'error: tamed still unavailable; giving up' >&2
+  echo "error: runner $id still unavailable; giving up" >&2
  exit "$EX_NOTAMED"
 }


+# Attempts to start tamed if it's not already running
+#
+# This is designed to be safe for parallel builds by allowing only the first
+# process to start tamed and hanging the others until spawning is complete.
+#
+# See `_start-tamed' for more information.
+start-tamed-safe()
+{
+  local -r root=${1?Missing root}
+
+  local -ri timeout=5
+
+  (
+    flock -w $timeout 6 || {
+      echo "error: failed to acquire tamed spawning lock at $root" >&2
+      exit $EX_DLOCK
+    }
+
+    _start-tamed "$root"
+
+    flock -u 6
+    rm -f "$root-guard"
+  ) 6>"$root-guard"
+}
+
+
 # Start tamed if it is not already running
 #
 # If tamed is already running, nothing will happen; otherwise, start
@ -161,7 +323,7 @@ wait-for-tamed()
 # this ensures that tamed is initialized even if this script is run
 # after tamed is started but before it has fully come online (e.g
 # parallel make).
-start-tamed()
+_start-tamed()
 {
  local -r root="${1?Missing root}"

@ -183,7 +345,7 @@ start-tamed()
  # wait for tamed even if it was already started (just in
  # case this script was executed right after tamed started
  # but before it is done initializing)
-  wait-for-tamed "$root"
+  wait-for-runner "$root" 0
 }


@ -243,6 +405,16 @@ to come online.  After that time has elapsed, the command will
 be re-attempted, timing out again after TAME_CMD_WAITTIME and
 and at that point giving up.

+The first available runner sorted numerically will be
+chosen.  This helps to give the same runners more work,
+since they're more likely to have source (and compiled)
+already parsed in memory.  As such, runners will have load
+disproportionately spread, and may exhibit large variances
+in resource consumption.
+
+If all runners are busy, then a new runner will be spawned,
+allowing for parallel builds.
+
 Options:
  --help         show this message
  --kill         kill tamed
@ -279,11 +451,10 @@ main()
    outcmd=cat
  fi

-  start-tamed "$root"
+  start-tamed-safe "$root"

  # for now we only support a single runner
-  command-runner 0 "$root" "$@" \
-    | tee -a "run-0.log" \
+  command-available-runner "$root" "$@" \
    | "$outcmd"
 }

--- a/bin/tamed
+++ b/bin/tamed
@ -54,7 +54,7 @@ mkfifos()
    rm -f "$root-$n"

    mkfifo -m 0600 "$root/$n" || {
-      echo "fatal: failed to create FIFO at $in"
+      echo "fatal: failed to create FIFO at $root/n"
      exit $EX_CANTCREAT
    }
  done
@ -64,6 +64,20 @@ mkfifos()
 }


+# Spawn a new runner using the next available runner id
+#
+# See `spawn-runner' for more information.
+spawn-next-runner()
+{
+  local -r root="${1?Missing root path}"
+
+  # get the next available id
+  local -ri id=$( < "$root/maxid" )
+
+  spawn-runner "$(( id + 1 ))" "$root"
+}
+
+
 # Spawn a runner
 #
 # A new runner is created by spawning dslc and attaching
@ -79,6 +93,9 @@ spawn-runner()

  mkfifos "$base"

+  # flag as available (the client will manipulate these)
+  echo 0 > "$base/busy"
+
  # monitor runner usage and kill when inactive
  stall-monitor "$base" &

@ -95,6 +112,9 @@ spawn-runner()

  echo "$!" > "$base/pid"

+  # we assume that this is the new largest runner id
+  echo "$id" > "$root/maxid"
+
  echo "runner $id ($!): $base"
 }

@ -271,10 +291,22 @@ main()
  rm -rf "$root"; mkdir -p "$root"
  echo $$ > "$root/pid"

-  # only a single runner for now
+  # start with a single runner; we'll spawn more if requested
  spawn-runner 0 "$root"
+  trap "spawn-next-runner '$root'" USR1

-  wait -n
+  # wait for runners to complete or for a signal to be received by this
+  # process that terminates `wait'
+  while true; do
+    wait -n || {
+      status=$?
+
+      # ignore USR1
+      if [ $status -ne 138 ]; then
+        exit $status
+      fi
+    }
+  done
 }

 main "$@"
--- a/build-aux/Makefile.am
+++ b/build-aux/Makefile.am
@ -86,7 +86,6 @@ ant = @ANT@ -e
 default: program-ui c1map FORCE

 .DELETE_ON_ERROR:
-.NOTPARALLEL:

 # keep all intermediate files for easy introspection
 .SECONDARY:
@ -153,7 +152,11 @@ c1map: $(dest_c1map)
 %.csvo: %.csv
 	cp $< $@

-%.xml: %.csvo
+# TODO: This is necessary right now because of the current depgen
+# process.  Once that is eliminated in favor of individual dependency files
+# (e.g. the %.d convention), this can go away since dependency generation
+# can properly take place for the various file formats.
+%.xml: %.csvo rater/core/vector/table.xmlo
 	rater/tools/csv2xml $< > $@

 version: .version.xml
@ -169,7 +172,7 @@ ui/html/index.phtml: ui/program.expanded.xml
 ui/package-dfns.xmlo: ui/package-dfns.xml
 ui/package-dfns.xml: ui/program.expanded.xml
 	$(TAME) progui-pkg $< $@
-ui/package-map.xmlo: ui/package-map.xml
+ui/package-map.xmlo: ui/package-map.xml ui/package-dfns.xmlo
 ui/package-map.xml: ui/program.expanded.xml ui/package-dfns.xml
 	$(TAME) progui-pkg-map $< $@

--- a/doc/usage.texi
+++ b/doc/usage.texi
@ -157,6 +157,7 @@ $ ./configure SAXON_CP=/path/to/saxon9he.jar

@menu
 * Common Targets::  Common Makefile targets.
+* Parallel Builds:: Building multiple files concurrently.
@end menu


@ -198,3 +199,53 @@ Here are the most common phony targets that may be useful:
    for example,
    to rebuild all files using the new version.
@end table
+
+
+@node Parallel Builds
+@subsection Parallel Builds
+@cindex Make, parallel builds
+GNU Make offers parallel builds through the @code{-j} flag,
+  which specifies the maximum number of concurrent jobs.
+This is supported by both @command{tame} and @command{tamed}.
+
+@command{tamed} starts by spawning a single runner,
+  which is marked as available.
+When a command is issued to @command{tame},
+  it will reserve the first available runner it finds by marking it as
+    busy.
+Once the runner is finished,
+  it will be marked as available once again.
+If all runners are busy,
+  @command{tame} issues a signal to @command{tamed} to spawn another
+    runner,
+      which @command{tame} then reserves and marks as busy.
+No runners are ever freed (terminated) until @command{tamed} itself
+  terminates.
+
+For example,
+  to build with up to four concurrent runners,
+    use @samp{-j4}:
+
+@float Figure, f:make-j
+@example
+$ make -j4
+@end example
+@caption{Compiling with four concurrent runners}
+@end float
+
+@cindex build, memory
+@tip{Compiling and linking large packages can be memory intensive.
+     While runner memory consumption may vary,
+       it's wise to profile the memory usage of a single runner and
+         use that to estimate how many concurrent runners your system
+           can support.}
+
+@cindex Saxon
+@tip{Saxon is also multi-threaded under certain circumstances,
+       so you should allocate fewer jobs than you have available CPU
+         cores.
+     GNU Make also offers a @code{-l} flag that tells it not to spawn
+       more jobs if the system is above the indicated load.
+     But note that,
+       even if a runner is idle,
+         it is still using up memory.}