buildman: Add a way to limit the number of buildmans

Buildman uses all available CPUs by default, so running more than one or
two concurrent processes is not normally useful.

However in some CI cases we want to be able to run several jobs at once
to save time. For example, in a lab situation we may want to run a test
on 20 boards at a time, since only the build step actually takes much
CPU.

Add an option which allows such a limit. When buildman starts up, it
waits until the number of running processes goes below the limit, then
claims a spot in the list. The list is maintained with a temporary file.

Note that the temp file is user-specific, since it is hard to create a
locked temporary file which can be accessed by any user. In most cases,
only one user is running jobs on a machine, so this should not matter.

Signed-off-by: Simon Glass <sjg@chromium.org>
This commit is contained in:
Simon Glass
2024-06-23 11:55:15 -06:00
parent 8941477e02
commit 5d679f801d
6 changed files with 277 additions and 4 deletions

View File

@@ -1286,6 +1286,11 @@ then buildman hangs. Failing to handle any eventuality is a bug in buildman and
should be reported. But you can use -T0 to disable threading and hopefully should be reported. But you can use -T0 to disable threading and hopefully
figure out the root cause of the build failure. figure out the root cause of the build failure.
For situations where buildman is invoked from multiple running processes, it is
sometimes useful to have buildman wait until the others have finished. Use the
--process-limit option for this: --process-limit 1 will allow only one buildman
to process jobs at a time.
Build summary Build summary
------------- -------------

View File

@@ -129,6 +129,8 @@ def add_after_m(parser):
default=False, help="Use an O= (output) directory per board rather than per thread") default=False, help="Use an O= (output) directory per board rather than per thread")
parser.add_argument('--print-arch', action='store_true', parser.add_argument('--print-arch', action='store_true',
default=False, help="Print the architecture for a board (ARCH=)") default=False, help="Print the architecture for a board (ARCH=)")
parser.add_argument('--process-limit', type=int,
default=0, help='Limit to number of buildmans running at once')
parser.add_argument('-r', '--reproducible-builds', action='store_true', parser.add_argument('-r', '--reproducible-builds', action='store_true',
help='Set SOURCE_DATE_EPOCH=0 to suuport a reproducible build') help='Set SOURCE_DATE_EPOCH=0 to suuport a reproducible build')
parser.add_argument('-R', '--regen-board-list', type=str, parser.add_argument('-R', '--regen-board-list', type=str,

View File

@@ -7,10 +7,13 @@
This holds the main control logic for buildman, when not running tests. This holds the main control logic for buildman, when not running tests.
""" """
import getpass
import multiprocessing import multiprocessing
import os import os
import shutil import shutil
import sys import sys
import tempfile
import time
from buildman import boards from buildman import boards
from buildman import bsettings from buildman import bsettings
@@ -21,10 +24,23 @@ from patman import gitutil
from patman import patchstream from patman import patchstream
from u_boot_pylib import command from u_boot_pylib import command
from u_boot_pylib import terminal from u_boot_pylib import terminal
from u_boot_pylib.terminal import tprint from u_boot_pylib import tools
from u_boot_pylib.terminal import print_clear, tprint
TEST_BUILDER = None TEST_BUILDER = None
# Name of the file holding a space-separated list of buildman process IDs
# currently running jobs; the name includes the current user's login name
RUNNING_FNAME = f'buildmanq.{getpass.getuser()}'
# Name of the lock file protecting access to the RUNNING_FNAME file
LOCK_FNAME = f'{RUNNING_FNAME}.lock'
# Wait time for access to lock (seconds)
LOCK_WAIT_S = 10
# Wait time to be allowed to start running (seconds)
RUN_WAIT_S = 300
def get_plural(count): def get_plural(count):
"""Returns a plural 's' if count is not 1""" """Returns a plural 's' if count is not 1"""
return 's' if count != 1 else '' return 's' if count != 1 else ''
@@ -578,6 +594,125 @@ def calc_adjust_cfg(adjust_cfg, reproducible_builds):
return adjust_cfg return adjust_cfg
def read_procs(tmpdir=tempfile.gettempdir()):
    """Load the PIDs recorded in the 'running' file

    A missing file yields an empty list, as does a file containing any item
    which cannot be parsed as an integer

    Args:
        tmpdir (str): Temporary directory to use (for testing only)

    Returns:
        list of int: PIDs read from the file, in file order
    """
    path = os.path.join(tmpdir, RUNNING_FNAME)
    if not os.path.exists(path):
        return []

    fields = tools.read_file(path, binary=False).split()
    try:
        return list(map(int, fields))
    except ValueError:
        # Invalid format: discard everything rather than keep a partial list
        return []
def check_pid(pid):
    """Report whether a process with the given unix PID can be signalled

    Signal 0 is passed to os.kill(); any OSError raised by that call
    (including a permission error) is reported as False

    https://stackoverflow.com/questions/568271/how-to-check-if-there-exists-a-process-with-a-given-pid-in-python

    Args:
        pid (int): PID to check

    Returns:
        bool: True if os.kill() succeeded, False if it raised OSError
    """
    alive = True
    try:
        os.kill(pid, 0)
    except OSError:
        alive = False
    return alive
def write_procs(procs, tmpdir=tempfile.gettempdir()):
    """Store the list of running buildman processes in the 'running' file

    Args:
        procs (list of int): PIDs to record; written separated by spaces
        tmpdir (str): Temporary directory to use (for testing only)
    """
    path = os.path.join(tmpdir, RUNNING_FNAME)
    text = ' '.join(map(str, procs))
    tools.write_file(path, text, binary=False)

    # Allow another user to access the file
    os.chmod(path, 0o666)
def wait_for_process_limit(limit, tmpdir=tempfile.gettempdir(),
                           pid=os.getpid()):
    """Wait until the number of buildman processes drops to the limit

    This uses FileLock to protect a 'running' file, which contains a list of
    PIDs of running buildman processes. The number of PIDs in the file indicates
    the number of running processes.

    When buildman starts up, it calls this function to wait until it is OK to
    start the build.

    On exit, no attempt is made to remove the PID from the file, since other
    buildman processes will notice that the PID is no longer valid, and ignore
    it.

    Two timeouts are provided:
        LOCK_WAIT_S: length of time to wait for the lock; if this occurs, the
            lock is busted / removed before trying again
        RUN_WAIT_S: length of time to wait to be allowed to run; if this occurs,
            the build starts, with the PID being added to the file.

    Note that the default values of tmpdir and pid are evaluated once, when
    this function is defined, not on each call.

    Args:
        limit (int): Maximum number of buildman processes, including this one;
            must be > 0
        tmpdir (str): Temporary directory to use (for testing only)
        pid (int): Current process ID (for testing only)
    """
    from filelock import Timeout, FileLock

    lock_fname = os.path.join(tmpdir, LOCK_FNAME)
    lock = FileLock(lock_fname)

    col = terminal.Color()
    tprint('Waiting for other buildman processes...', newline=False,
           colour=col.RED)

    deadline = time.time() + RUN_WAIT_S
    while True:
        try:
            with lock.acquire(timeout=LOCK_WAIT_S):
                # Allow another user to access the lock file
                os.chmod(lock_fname, 0o666)

                # Drop PIDs which are not running
                procs = [proc for proc in read_procs(tmpdir)
                         if check_pid(proc)]

                # Decide whether to claim a spot: either there is room under
                # the limit, or we have waited long enough and start anyway.
                # Only one reason is reported, with a free spot taking
                # priority over the timeout
                if len(procs) < limit:
                    reason = 'done...'
                elif time.time() >= deadline:
                    reason = 'timeout...'
                else:
                    reason = None

                if reason:
                    tprint(reason, newline=False)
                    write_procs(procs + [pid], tmpdir)
                    break

        except Timeout:
            tprint('failed to get lock: busting...', newline=False)
            try:
                os.remove(lock_fname)
            except FileNotFoundError:
                # Another waiter busted the lock first; nothing left to do
                pass

        time.sleep(1)
    tprint('starting build', newline=False)
    print_clear()
def do_buildman(args, toolchains=None, make_func=None, brds=None, def do_buildman(args, toolchains=None, make_func=None, brds=None,
clean_dir=False, test_thread_exceptions=False): clean_dir=False, test_thread_exceptions=False):
"""The main control code for buildman """The main control code for buildman
@@ -677,5 +812,8 @@ def do_buildman(args, toolchains=None, make_func=None, brds=None,
TEST_BUILDER = builder TEST_BUILDER = builder
if args.process_limit:
wait_for_process_limit(args.process_limit)
return run_builder(builder, series.commits if series else None, return run_builder(builder, series.commits if series else None,
brds.get_selected_dict(), args) brds.get_selected_dict(), args)

View File

@@ -8,7 +8,11 @@ version = "0.0.6"
authors = [ authors = [
{ name="Simon Glass", email="sjg@chromium.org" }, { name="Simon Glass", email="sjg@chromium.org" },
] ]
dependencies = ["u_boot_pylib >= 0.0.6", "patch-manager >= 0.0.6"] dependencies = [
"filelock >= 3.0.12",
"u_boot_pylib >= 0.0.6",
"patch-manager >= 0.0.6"
]
description = "Buildman build tool for U-Boot" description = "Buildman build tool for U-Boot"
readme = "README.rst" readme = "README.rst"
requires-python = ">=3.7" requires-python = ">=3.7"

View File

@@ -2,12 +2,14 @@
# Copyright (c) 2012 The Chromium OS Authors. # Copyright (c) 2012 The Chromium OS Authors.
# #
from filelock import FileLock
import os import os
import shutil import shutil
import sys import sys
import tempfile import tempfile
import time import time
import unittest import unittest
from unittest.mock import patch
from buildman import board from buildman import board
from buildman import boards from buildman import boards
@@ -156,6 +158,11 @@ class TestBuild(unittest.TestCase):
if not os.path.isdir(self.base_dir): if not os.path.isdir(self.base_dir):
os.mkdir(self.base_dir) os.mkdir(self.base_dir)
self.cur_time = 0
self.valid_pids = []
self.finish_time = None
self.finish_pid = None
def tearDown(self): def tearDown(self):
shutil.rmtree(self.base_dir) shutil.rmtree(self.base_dir)
@@ -747,6 +754,120 @@ class TestBuild(unittest.TestCase):
self.assertEqual([ self.assertEqual([
['MARY="mary"', 'Missing expected line: CONFIG_MARY="mary"']], result) ['MARY="mary"', 'Missing expected line: CONFIG_MARY="mary"']], result)
def get_procs(self):
    """Read the PIDs currently stored in the 'running' file in base_dir

    Returns:
        list of int: PIDs in the order they appear in the file
    """
    path = os.path.join(self.base_dir, control.RUNNING_FNAME)
    text = tools.read_file(path, binary=False)
    return list(map(int, text.split()))
def get_time(self):
    """Stand-in for time.time() which reports the simulated clock

    Returns:
        Current value of self.cur_time
    """
    now = self.cur_time
    return now
def inc_time(self, amount):
    """Stand-in for time.sleep() which advances the simulated clock

    If the clock lands exactly on finish_time, finish_pid is dropped from
    valid_pids, simulating that process exiting

    Args:
        amount: Amount to add to the simulated time
    """
    self.cur_time += amount
    if self.cur_time != self.finish_time:
        return

    # Handle a process exiting
    survivors = []
    for pid in self.valid_pids:
        if pid != self.finish_pid:
            survivors.append(pid)
    self.valid_pids = survivors
def kill(self, pid, signal):
    """Stand-in for os.kill() which consults the simulated process list

    Args:
        pid (int): PID being signalled
        signal: Signal number (not used)

    Raises:
        OSError: pid is not in valid_pids
    """
    if pid in self.valid_pids:
        return
    raise OSError('Invalid PID')
def test_process_limit(self):
    """Test wait_for_process_limit() function

    time.time(), time.sleep() and os.kill() are replaced with the simulated
    versions in this class, so that no real waiting happens and the set of
    'running' PIDs is controlled through self.valid_pids
    """
    tmpdir = self.base_dir

    # Backslash continuations are used rather than a parenthesised 'with',
    # since the package declares support for Python versions (>=3.7) which
    # do not accept the parenthesised form
    with patch('time.time', side_effect=self.get_time), \
            patch('time.sleep', side_effect=self.inc_time), \
            patch('os.kill', side_effect=self.kill):
        # Grab the process. Since there is no other process, this should
        # immediately succeed
        control.wait_for_process_limit(1, tmpdir=tmpdir, pid=1)
        lines = terminal.get_print_test_lines()
        self.assertEqual(0, self.cur_time)
        self.assertEqual('Waiting for other buildman processes...',
                         lines[0].text)
        self.assertEqual(self._col.RED, lines[0].colour)
        self.assertEqual(False, lines[0].newline)
        self.assertEqual(True, lines[0].bright)

        self.assertEqual('done...', lines[1].text)
        self.assertEqual(None, lines[1].colour)
        self.assertEqual(False, lines[1].newline)
        self.assertEqual(True, lines[1].bright)

        self.assertEqual('starting build', lines[2].text)
        self.assertEqual([1], control.read_procs(tmpdir))
        self.assertEqual(None, lines[2].colour)
        self.assertEqual(False, lines[2].newline)
        self.assertEqual(True, lines[2].bright)

        # Try again, with a different PID...this should eventually timeout
        # and start the build anyway
        self.cur_time = 0
        self.valid_pids = [1]
        control.wait_for_process_limit(1, tmpdir=tmpdir, pid=2)
        lines = terminal.get_print_test_lines()
        self.assertEqual('Waiting for other buildman processes...',
                         lines[0].text)
        self.assertEqual('timeout...', lines[1].text)
        self.assertEqual(None, lines[1].colour)
        self.assertEqual(False, lines[1].newline)
        self.assertEqual(True, lines[1].bright)
        self.assertEqual('starting build', lines[2].text)
        self.assertEqual([1, 2], control.read_procs(tmpdir))
        self.assertEqual(control.RUN_WAIT_S, self.cur_time)

        # Check lock-busting
        self.cur_time = 0
        self.valid_pids = [1, 2]
        lock_fname = os.path.join(tmpdir, control.LOCK_FNAME)
        lock = FileLock(lock_fname)
        lock.acquire(timeout=1)
        try:
            control.wait_for_process_limit(1, tmpdir=tmpdir, pid=3)
            lines = terminal.get_print_test_lines()
            self.assertEqual('Waiting for other buildman processes...',
                             lines[0].text)
            self.assertEqual('failed to get lock: busting...',
                             lines[1].text)
            self.assertEqual(None, lines[1].colour)
            self.assertEqual(False, lines[1].newline)
            self.assertEqual(True, lines[1].bright)
            self.assertEqual('timeout...', lines[2].text)
            self.assertEqual('starting build', lines[3].text)
            self.assertEqual([1, 2, 3], control.read_procs(tmpdir))
            self.assertEqual(control.RUN_WAIT_S, self.cur_time)
        finally:
            # Release the lock even if an assertion above fails
            lock.release()

        # Check handling of dead processes. Here we have PID 2 as a running
        # process, even though the PID file contains 1, 2 and 3. So we can
        # add one more PID, to make 2 and 4
        self.cur_time = 0
        self.valid_pids = [2]
        control.wait_for_process_limit(2, tmpdir=tmpdir, pid=4)
        lines = terminal.get_print_test_lines()
        self.assertEqual('Waiting for other buildman processes...',
                         lines[0].text)
        self.assertEqual('done...', lines[1].text)
        self.assertEqual('starting build', lines[2].text)
        self.assertEqual([2, 4], control.read_procs(tmpdir))
        self.assertEqual(0, self.cur_time)

        # Try again, with PID 2 quitting at time 50. This allows the new
        # build to start
        self.cur_time = 0
        self.valid_pids = [2, 4]
        self.finish_pid = 2
        self.finish_time = 50
        control.wait_for_process_limit(2, tmpdir=tmpdir, pid=5)
        lines = terminal.get_print_test_lines()
        self.assertEqual('Waiting for other buildman processes...',
                         lines[0].text)
        self.assertEqual('done...', lines[1].text)
        self.assertEqual('starting build', lines[2].text)
        self.assertEqual([4, 5], control.read_procs(tmpdir))
        self.assertEqual(self.finish_time, self.cur_time)
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()

View File

@@ -164,8 +164,11 @@ def print_clear():
global last_print_len global last_print_len
if last_print_len: if last_print_len:
print('\r%s\r' % (' '* last_print_len), end='', flush=True) if print_test_mode:
last_print_len = None print_test_list.append(PrintLine(None, None, None, None))
else:
print('\r%s\r' % (' '* last_print_len), end='', flush=True)
last_print_len = None
def set_print_test_mode(enable=True): def set_print_test_mode(enable=True):
"""Go into test mode, where all printing is recorded""" """Go into test mode, where all printing is recorded"""