[Reproducible-builds] Patch for build nodes pools

Vagrant Cascadian vagrant at debian.org
Fri Dec 18 23:24:38 UTC 2015


I've had this idea that we could make more efficient use of the nodes by
grouping them into pools...

This would hopefully balance out the load on the nodes a little better
(most of the armhf nodes' CPUs are idle roughly 25% of the time).

Perhaps more importantly, it should be much more resilient if one node
is down, as a given build job can use one of several build machines for
the second build.

You can still group pools into categories to ensure diversity in kernel
version, cpu type, operating date, etc.

The load check isn't perfect, but it's better than nothing, and maybe good
enough as is. Adding a check for available RAM should be simple enough.

Another option is to only use pools for the second build, which still
gets most of the benefits, but perhaps is a little simpler
configuration-wise.

Patch below! No idea if it works, given that I don't have a spare
jenkins.debian.net or build network to test on, but hopefully it
demonstrates the idea, and is mostly there.

My biggest concern with the code is not knowing whether setting NODE1,
PORT1, NODE2 and PORT2 within the function will work correctly: will they
be available outside of the function for the remainder of the process, and
to other functions that run outside of reproducible_build.sh and need to
know those variables?

live well,
  vagrant

commit 200c45bbb5768dce5649b05ad599c85c6bb14b50
Author: Vagrant Cascadian <vagrant at debian.org>
Date:   Fri Dec 18 15:02:32 2015 -0800

    Implement support for build pools, and add an example pool.
---
 bin/reproducible_build.sh | 55 +++++++++++++++++++++++++++++++++++++++++++++--
 job-cfg/reproducible.yaml |  3 ++-
 2 files changed, 55 insertions(+), 3 deletions(-)

diff --git a/bin/reproducible_build.sh b/bin/reproducible_build.sh
index 338c207..c9d725f 100755
--- a/bin/reproducible_build.sh
+++ b/bin/reproducible_build.sh
@@ -688,9 +688,53 @@ check_buildinfo() {
 	rm -f $TMPFILE1 $TMPFILE2
 }
 
+select_least_loaded_node() {
+    local pool_nodes
+    local node
+    local port
+    local load
+    local best_load
+    local selected
+    # default to the first node
+    selected="$1"
+    pool_nodes="$@"
+    if [ "$selected" = "$pool_nodes" ]; then
+	echo $selected
+	return 0
+    fi
+    load = 0
+    best_load = 0
+    for this_node in $pool_nodes ; do
+	node=$(echo $this_node | cut -d : -f 1)
+	port=$(echo $this_node | cut -d : -f 2)
+	# Compare the number of processors against the load, and add
+	# 1000 so we don't need to bother comparing negative numbers.
+	# 
+	# TODO: account for available memory.
+	#
+	# TODO: this could be improved upon and simplified by calling
+	# a shell script on the remote end.
+	load=$(echo \
+		   $(ssh $node -p $port \
+			 "grep ^processor /proc/cpuinfo | wc -l ; echo ' \* X 100 + 1000 - 100 X ' ; cut -d ' ' -f 1 /proc/loadavg") | \
+		      tr 'X' '*' | \
+		      bc | \
+		      cut -d . -f 1)
+	if [ "$load" -gt "$best_load" ]; then
+	    selected="$this_node"
+	    best_load="$load"
+	fi
+    done
+    echo $selected
+}
+
 build_rebuild() {
 	FTBFS=1
 	mkdir b1 b2
+	local selected_node
+	selected_node=$(select_least_loaded_node $NODE1_POOL)
+	NODE1=$(echo $selected_node | cut -d : -f 1)
+	PORT1=$(echo $selected_node | cut -d : -f 2)
 	remote_build 1 $NODE1 $PORT1
 	if [ ! -f b1/${SRCPACKAGE}_${EVERSION}_${ARCH}.changes ] && [ -f b1/${SRCPACKAGE}_*_${ARCH}.changes ] ; then
 			echo "Version mismatch between main node (${SRCPACKAGE}_${EVERSION}_${ARCH}.dsc expected) and first build node ($(ls b1/*dsc)) for $SUITE/$ARCH, aborting. Please upgrade the schroots..." | tee -a ${RBUILDLOG}
@@ -700,6 +744,9 @@ build_rebuild() {
 			exit 0
 	elif [ -f b1/${SRCPACKAGE}_${EVERSION}_${ARCH}.changes ] ; then
 		# the first build did not FTBFS, try rebuild it.
+		selected_node=$(select_least_loaded_node $NODE2_POOL)
+		NODE2=$(echo $selected_node | cut -d : -f 1)
+		PORT2=$(echo $selected_node | cut -d : -f 2)
 		remote_build 2 $NODE2 $PORT2
 		if [ -f b2/${SRCPACKAGE}_${EVERSION}_${ARCH}.changes ] ; then
 			# both builds were fine, i.e., they did not FTBFS.
@@ -750,10 +797,14 @@ elif [ "$1" = "1" ] || [ "$1" = "2" ] ; then
 	exit 0
 elif [ "$2" != "" ] ; then
 	MODE="master"
+	NODE1_POOL="$1"
+	NODE2_POOL="$2"
+	# FIXME: postpone setting NODE1/PORT1 and NODE2/PORT2 until the builds
+	# run
 	NODE1="$(echo $1 | cut -d ':' -f1).debian.net"
 	NODE2="$(echo $2 | cut -d ':' -f1).debian.net"
-	PORT1="$(echo $1 | cut -d ':' -f2)"
-	PORT2="$(echo $2 | cut -d ':' -f2)"
+	PORT1="$(echo $1 | cut -d ':' -f2 | cut -d ' ' -f 1)"
+	PORT2="$(echo $2 | cut -d ':' -f2 | cut -d ' ' -f 1)"
 	# if no port is given, assume 22
 	if [ "$NODE1" = "${PORT1}.debian.net" ] ; then PORT1=22 ; fi
 	if [ "$NODE2" = "${PORT2}.debian.net" ] ; then PORT2=22 ; fi
diff --git a/job-cfg/reproducible.yaml b/job-cfg/reproducible.yaml
index 97df606..5963206 100644
--- a/job-cfg/reproducible.yaml
+++ b/job-cfg/reproducible.yaml
@@ -481,7 +481,8 @@
                 - '13': { my_node1: 'cbxi4pro0-armhf-rb:2226', my_node2: 'rpi2b-armhf-rb:2230'     }
                 - '14': { my_node1: 'rpi2b-armhf-rb:2230',     my_node2: 'wbq0-armhf-rb:2225'      }
                 - '15': { my_node1: 'wbd0-armhf-rb:2223',      my_node2: 'hb0-armhf-rb:2224'       }
-            my_shell: '/srv/jenkins/bin/reproducible_build.sh {my_node1} {my_node2}'
+                - '16': { my_node1: 'wbd0-armhf-rb:2223 wbq0-armhf-r:2225', my_node2: 'bpi0-armhf-rb:2222 odxu4-armhf-rb:2229' }
+            my_shell: '/srv/jenkins/bin/reproducible_build.sh "{my_node1}" "{my_node2}"'
             my_timed: '* * * * *'
             my_hname: ''
 
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 818 bytes
Desc: not available
URL: <http://lists.alioth.debian.org/pipermail/reproducible-builds/attachments/20151218/ea08bd37/attachment.sig>


More information about the Reproducible-builds mailing list