diff --git a/nixos/lib/test-driver/test_driver/__init__.py b/nixos/lib/test-driver/test_driver/__init__.py index 371719d7a988..9daae1e941a6 100755 --- a/nixos/lib/test-driver/test_driver/__init__.py +++ b/nixos/lib/test-driver/test_driver/__init__.py @@ -76,6 +76,14 @@ def main() -> None: nargs="*", help="vlans to span by the driver", ) + arg_parser.add_argument( + "--global-timeout", + type=int, + metavar="GLOBAL_TIMEOUT", + action=EnvDefault, + envvar="globalTimeout", + help="Timeout in seconds for the whole test", + ) arg_parser.add_argument( "-o", "--output_directory", @@ -103,6 +111,7 @@ def main() -> None: args.testscript.read_text(), args.output_directory.resolve(), args.keep_vm_state, + args.global_timeout, ) as driver: if args.interactive: history_dir = os.getcwd() diff --git a/nixos/lib/test-driver/test_driver/driver.py b/nixos/lib/test-driver/test_driver/driver.py index 723c80717860..786821b0cc0d 100644 --- a/nixos/lib/test-driver/test_driver/driver.py +++ b/nixos/lib/test-driver/test_driver/driver.py @@ -1,6 +1,8 @@ import os import re +import signal import tempfile +import threading from contextlib import contextmanager from pathlib import Path from typing import Any, Callable, ContextManager, Dict, Iterator, List, Optional, Union @@ -41,6 +43,8 @@ class Driver: vlans: List[VLan] machines: List[Machine] polling_conditions: List[PollingCondition] + global_timeout: int + race_timer: threading.Timer def __init__( self, @@ -49,9 +53,12 @@ class Driver: tests: str, out_dir: Path, keep_vm_state: bool = False, + global_timeout: int = 24 * 60 * 60 * 7, ): self.tests = tests self.out_dir = out_dir + self.global_timeout = global_timeout + self.race_timer = threading.Timer(global_timeout, self.terminate_test) tmp_dir = get_tmp_dir() @@ -82,6 +89,7 @@ class Driver: def __exit__(self, *_: Any) -> None: with rootlog.nested("cleanup"): + self.race_timer.cancel() for machine in self.machines: machine.release() @@ -144,6 +152,10 @@ class Driver: def run_tests(self) -> 
None: """Run the test script (for non-interactive test runs)""" + rootlog.info( + f"Test will time out and terminate in {self.global_timeout} seconds" + ) + self.race_timer.start() self.test_script() # TODO: Collect coverage data for machine in self.machines: @@ -161,6 +173,19 @@ class Driver: with rootlog.nested("wait for all VMs to finish"): for machine in self.machines: machine.wait_for_shutdown() + self.race_timer.cancel() + + def terminate_test(self) -> None: + # This usually runs in a different thread than + # the one actually executing the test script. + with rootlog.nested("timeout reached; test terminating..."): + for machine in self.machines: + machine.release() + # As we cannot `sys.exit` from another thread, + # we can at least force the main thread to get SIGTERM'ed. + # This prevents a test script that catches all exceptions + # from swallowing the timeout and never terminating. + os.kill(os.getpid(), signal.SIGTERM) def create_machine(self, args: Dict[str, Any]) -> Machine: tmp_dir = get_tmp_dir() diff --git a/nixos/lib/testing-python.nix b/nixos/lib/testing-python.nix index 4904ad6e3591..f5222351518b 100644 --- a/nixos/lib/testing-python.nix +++ b/nixos/lib/testing-python.nix @@ -42,6 +42,7 @@ rec { , nodes ? {} , testScript , enableOCR ? false + , globalTimeout ? (60 * 60) , name ? "unnamed" , skipTypeCheck ? 
false # Skip linting (mainly intended for faster dev cycles) diff --git a/nixos/lib/testing/driver.nix b/nixos/lib/testing/driver.nix index cc97ca72083f..b6f01c38191d 100644 --- a/nixos/lib/testing/driver.nix +++ b/nixos/lib/testing/driver.nix @@ -94,6 +94,7 @@ let wrapProgram $out/bin/nixos-test-driver \ --set startScripts "''${vmStartScripts[*]}" \ --set testScript "$out/test-script" \ + --set globalTimeout "${toString config.globalTimeout}" \ --set vlans '${toString vlans}' \ ${lib.escapeShellArgs (lib.concatMap (arg: ["--add-flags" arg]) config.extraDriverArgs)} ''; @@ -123,6 +124,18 @@ in defaultText = "hostPkgs.qemu_test"; }; + globalTimeout = mkOption { + description = mdDoc '' + A global timeout for the complete test, expressed in seconds. + Beyond that timeout, every resource will be killed and released and the test will fail. + + By default, we use a 1 hour timeout. + ''; + type = types.int; + default = 60 * 60; + example = 10 * 60; + }; + enableOCR = mkOption { description = mdDoc '' Whether to enable Optical Character Recognition functionality for