torture: Parallelize kvm-series.sh guest-OS execution

Currently, kvm-series.sh builds and runs serially, which makes for long execution times. This commit changes its logic to build all of the needed kernels serially, but then run the corresponding guest OSes concurrently in batches using the entire machine. On large systems, this results in order-of-magnitude speedups of the guest-OS execution portion of the runtime.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
2026-05-26 16:12:59 +02:00 · 2025-12-29 11:13:54 -08:00 · 2025-12-29 11:13:54 -08:00 · 3ce40539cc
commit 3ce40539cc
parent e8a534a671
1 changed files with 153 additions and 21 deletions
--- a/tools/testing/selftests/rcutorture/bin/kvm-series.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-series.sh
@ -15,7 +15,7 @@
 # This script is intended to replace kvm-check-branches.sh by providing
 # ease of use and faster execution.

-T="`mktemp -d ${TMPDIR-/tmp}/kvm-series.sh.XXXXXX`"
+T="`mktemp -d ${TMPDIR-/tmp}/kvm-series.sh.XXXXXX`"; export T
 trap 'rm -rf $T' 0

 scriptname=$0
@ -53,40 +53,62 @@ shift

 RCUTORTURE="`pwd`/tools/testing/selftests/rcutorture"; export RCUTORTURE
 PATH=${RCUTORTURE}/bin:$PATH; export PATH
+RES="${RCUTORTURE}/res"; export RES
 . functions.sh

 ret=0
-nfail=0
+nbuildfail=0
+nrunfail=0
 nsuccess=0
-faillist=
+ncpus=0
+buildfaillist=
+runfaillist=
 successlist=
 cursha1="`git rev-parse --abbrev-ref HEAD`"
 ds="`date +%Y.%m.%d-%H.%M.%S`-series"
+DS="${RES}/${ds}"; export DS
 startdate="`date`"
 starttime="`get_starttime`"

 echo " --- " $scriptname $args | tee -a $T/log
 echo " --- Results directory: " $ds | tee -a $T/log

+# Do all builds.  Iterate through commits within a given scenario
+# because builds normally go faster from one commit to the next within a
+# given scenario.  In contrast, switching scenarios on each rebuild will
+# often force a full rebuild due to Kconfig differences, for example,
+# turning preemption on and off.  Defer actual runs in order to run
+# lots of them concurrently on large systems.
+touch $T/torunlist
 for config in ${config_list}
 do
 	sha_n=0
 	for sha in ${sha1_list}
 	do
 		sha1=${sha_n}.${sha} # Enable "sort -k1nr" to list commits in order.
+		echo
 		echo Starting ${config}/${sha1} at `date` | tee -a $T/log
-		git checkout "${sha}"
-		time tools/testing/selftests/rcutorture/bin/kvm.sh --configs "$config" --datestamp "$ds/${config}/${sha1}" --duration 1 "$@"
+		git checkout --detach "${sha}"
+		tools/testing/selftests/rcutorture/bin/kvm.sh --configs "$config" --datestamp "$ds/${config}/${sha1}" --duration 1 --build-only --trust-make "$@"
 		curret=$?
 		if test "${curret}" -ne 0
 		then
-			nfail=$((nfail+1))
-			faillist="$faillist ${config}/${sha1}(${curret})"
+			nbuildfail=$((nbuildfail+1))
+			buildfaillist="$buildfaillist ${config}/${sha1}(${curret})"
 		else
-			nsuccess=$((nsuccess+1))
-			successlist="$successlist ${config}/${sha1}"
-			# Successful run, so remove large files.
-			rm -f ${RCUTORTURE}/$ds/${config}/${sha1}/{vmlinux,bzImage,System.map,Module.symvers}
+			batchncpus="`grep -v "^# cpus=" "${DS}/${config}/${sha1}/batches" | awk '{ sum += $3 } END { print sum }'`"
+			echo run_one_qemu ${sha_n} ${config}/${sha1} ${batchncpus} >> $T/torunlist
+			# Pick up the CPU count from the first successful build's
+			# batches file, if it records one on a "# cpus=" line.
+			if test "${ncpus}" -eq 0
+			then
+				ncpus="`grep "^# cpus=" "${DS}/${config}/${sha1}/batches" | sed -e 's/^# cpus=//'`"
+				# case matches glob patterns, not regular expressions,
+				# so reject the empty string and anything containing a
+				# non-digit; a plain decimal number is kept as is.
+				case "${ncpus}" in
+				''|*[!0-9]*)
+					ncpus=0
+					;;
+				esac
+			fi
 		fi
 		if test "${ret}" -eq 0
 		then
@ -95,22 +117,132 @@ do
 		sha_n=$((sha_n+1))
 	done
 done
+
+# If the user did not specify the number of CPUs, use them all.
+if test "${ncpus}" -eq 0
+then
+	ncpus="`identify_qemu_vcpus`"
+fi
+
+cpusused=0
+touch $T/successlistfile
+touch $T/faillistfile
+
+# do_run_one_qemu ds resultsdir qemu_curout
+#
+# Run kvm-again.sh on the results directory ${DS}/resultsdir, sending
+# both its stdout and stderr to the file qemu_curout.  On a zero exit
+# status, append resultsdir to $T/successlistfile and remove the large
+# build products from the results directory.  Otherwise, append
+# "resultsdir(status)" to $T/faillistfile.  The ds argument is accepted
+# but not used: the results tree is found via ${DS}.
+do_run_one_qemu () {
+	local ret
+	local resultsdir="$2"
+	local qemu_curout="$3"
+
+	tools/testing/selftests/rcutorture/bin/kvm-again.sh "${DS}/${resultsdir}" --link inplace-force > "${qemu_curout}" 2>&1
+	ret=$?
+	if test "${ret}" -eq 0
+	then
+		echo "${resultsdir}" >> "$T/successlistfile"
+		# Successful run, so remove large files.
+		rm -f "${DS}/${resultsdir}"/{vmlinux,bzImage,System.map,Module.symvers}
+	else
+		echo "${resultsdir}(${ret})" >> "$T/faillistfile"
+	fi
+}
+
+# cleanup_qemu_batch batchncpus
+#
+# Wait for all background jobs, then update the success and failure
+# lists and counts (nsuccess/successlist and nrunfail/runfaillist) from
+# $T/successlistfile and $T/faillistfile, emptying both files for the
+# next batch.  Also sets cpusused to batchncpus; when called from
+# run_one_qemu, that is the CPU count of the job about to be launched.
+cleanup_qemu_batch () {
+	local batchncpus="$1"
+	local nsuccessbatch
+	local nfailbatch
+
+	echo Waiting, cpusused=${cpusused}, ncpus=${ncpus} `date` | tee -a "$T/log"
+	wait
+	cpusused="${batchncpus}"
+	# Each completed run appended exactly one line to one of the two files.
+	nsuccessbatch="$(wc -l < "$T/successlistfile")"
+	nsuccess=$((nsuccess+nsuccessbatch))
+	successlist="$successlist $(cat "$T/successlistfile")"
+	rm "$T/successlistfile"
+	touch "$T/successlistfile"
+	nfailbatch="$(wc -l < "$T/faillistfile")"
+	nrunfail=$((nrunfail+nfailbatch))
+	runfaillist="$runfaillist $(cat "$T/faillistfile")"
+	rm "$T/faillistfile"
+	touch "$T/faillistfile"
+}
+
+# run_one_qemu sha_n config/sha1 batchncpus
+#
+# Launch into the background the qemu job whose results directory is
+# config/sha1 and which uses batchncpus CPUs.  Once we reach a job that
+# would overflow the number of available CPUs (cpusused exceeding ncpus),
+# wait for the previous jobs to complete and record their results before
+# launching this one.  The job's output goes to the qemu-series file in
+# its results directory.  The sha_n argument is accepted but not used.
+run_one_qemu () {
+	local config_sha1="$2"
+	local batchncpus="$3"
+	local qemu_curout
+
+	cpusused=$((cpusused+batchncpus))
+	if test "${cpusused}" -gt "${ncpus}"
+	then
+		# cleanup_qemu_batch resets cpusused to this job's CPU count.
+		cleanup_qemu_batch "${batchncpus}"
+	fi
+	echo Starting ${config_sha1} using ${batchncpus} CPUs `date`
+	qemu_curout="${DS}/${config_sha1}/qemu-series"
+	do_run_one_qemu "$ds" "${config_sha1}" "${qemu_curout}" &
+}
+
+# Re-ordering the runs will mess up the affinity chosen at build time
+# (among other things, over-using CPU 0), so suppress it.
+TORTURE_NO_AFFINITY="no-affinity"; export TORTURE_NO_AFFINITY
+
+# Run the kernels (if any) that built correctly.
+echo | tee -a $T/log # Put a blank line between build and run messages.
+. $T/torunlist
+cleanup_qemu_batch "${batchncpus}"
+
+# Get back to initial checkout/SHA-1.
 git checkout "${cursha1}"

-echo ${nsuccess} SUCCESSES: | tee -a $T/log
-echo ${successlist} | fmt | tee -a $T/log
-echo | tee -a $T/log
-echo ${nfail} FAILURES: | tee -a $T/log
-echo ${faillist} | fmt | tee -a $T/log
-if test -n "${faillist}"
+# Throw away leading and trailing space characters for fmt.
+successlist="`echo ${successlist} | sed -e 's/^ *//' -e 's/ *$//'`"
+buildfaillist="`echo ${buildfaillist} | sed -e 's/^ *//' -e 's/ *$//'`"
+runfaillist="`echo ${runfaillist} | sed -e 's/^ *//' -e 's/ *$//'`"
+
+# Print lists of successes, build failures, and run failures, if any.
+if test "${nsuccess}" -gt 0
 then
 	echo | tee -a $T/log
-	echo Failures across commits: | tee -a $T/log
-	echo ${faillist} | tr ' ' '\012' | sed -e 's,^[^/]*/,,' -e 's/([0-9]*)//' |
+	echo ${nsuccess} SUCCESSES: | tee -a $T/log
+	echo ${successlist} | fmt | tee -a $T/log
+fi
+if test "${nbuildfail}" -gt 0
+then
+	echo | tee -a $T/log
+	echo ${nbuildfail} BUILD FAILURES: | tee -a $T/log
+	echo ${buildfaillist} | fmt | tee -a $T/log
+fi
+if test "${nrunfail}" -gt 0
+then
+	echo | tee -a $T/log
+	echo ${nrunfail} RUN FAILURES: | tee -a $T/log
+	echo ${runfaillist} | fmt | tee -a $T/log
+fi
+
+# If there were build failures, map them to commits: strip the leading
+# scenario name and the trailing "(status)" from each entry, then count
+# how many entries remain for each commit.
+if test "${nbuildfail}" -gt 0
+then
+	echo | tee -a $T/log
+	echo Build failures across commits: | tee -a $T/log
+	echo ${buildfaillist} | tr ' ' '\012' | sed -e 's,^[^/]*/,,' -e 's/([0-9]*)//' |
+		sort | uniq -c | sort -k2n | tee -a $T/log
+fi
+
+# Likewise for run failures, whose entries have the same format.
+if test "${nrunfail}" -gt 0
+then
+	echo | tee -a $T/log
+	echo Run failures across commits: | tee -a $T/log
+	echo ${runfaillist} | tr ' ' '\012' | sed -e 's,^[^/]*/,,' -e 's/([0-9]*)//' |
 		sort | uniq -c | sort -k2n | tee -a $T/log
 fi
+
+# Print run summary.
+echo | tee -a $T/log
 echo Started at $startdate, ended at `date`, duration `get_starttime_duration $starttime`. | tee -a $T/log
-echo Summary: Successes: ${nsuccess} Failures: ${nfail} | tee -a $T/log
-cp $T/log tools/testing/selftests/rcutorture/res/${ds}
+echo Summary: Successes: ${nsuccess} " "Build Failures: ${nbuildfail} " "Runtime Failures: ${nrunfail}| tee -a $T/log
+cp $T/log ${DS}

 exit "${ret}"