From c9a5bf4a3897f61956bf8a25b652fcf535e5e821 Mon Sep 17 00:00:00 2001 From: Lucas Savva Date: Fri, 16 Dec 2022 11:18:54 +0000 Subject: [PATCH] nixos/acme: Increase number of retries in testing Helps to avoid failures in Hydra when the host server starts the web server too slowly. --- nixos/tests/acme.nix | 62 +++++++++++++++++++++++++++++--------------- 1 file changed, 41 insertions(+), 21 deletions(-) diff --git a/nixos/tests/acme.nix b/nixos/tests/acme.nix index d540bc6ec31b..6ac2400ca92c 100644 --- a/nixos/tests/acme.nix +++ b/nixos/tests/acme.nix @@ -144,7 +144,11 @@ in { name = "acme"; - meta.maintainers = lib.teams.acme.members; + meta = { + maintainers = lib.teams.acme.members; + # Hard timeout in seconds. Average run time is about 7 minutes. + timeout = 1800; + }; nodes = { # The fake ACME server which will respond to client requests @@ -357,6 +361,30 @@ in { import time + TOTAL_RETRIES = 20 + + + class BackoffTracker(object): + delay = 1 + increment = 1 + + def handle_fail(self, retries, message) -> int: + assert retries < TOTAL_RETRIES, message + + print(f"Retrying in {self.delay}s, {retries + 1}/{TOTAL_RETRIES}") + time.sleep(self.delay) + + # Only increment after the first try + if retries == 0: + self.delay += self.increment + self.increment *= 2 + + return retries + 1 + + + backoff = BackoffTracker() + + def switch_to(node, name): # On first switch, this will create a symlink to the current system so that we can # quickly switch between derivations @@ -404,9 +432,7 @@ in { assert False - def check_connection(node, domain, retries=3): - assert retries >= 0, f"Failed to connect to https://{domain}" - + def check_connection(node, domain, retries=0): result = node.succeed( "openssl s_client -brief -verify 2 -CAfile /tmp/ca.crt" f" -servername {domain} -connect {domain}:443 < /dev/null 2>&1" @@ -414,13 +440,11 @@ in { for line in result.lower().split("\n"): if "verification" in line and "error" in line: - time.sleep(3) - return check_connection(node, domain, retries - 1) + retries = backoff.handle_fail(retries, f"Failed to connect to https://{domain}") + return check_connection(node, domain, retries) - def check_connection_key_bits(node, domain, bits, retries=3): - assert retries >= 0, f"Did not find expected number of bits ({bits}) in key" - + def check_connection_key_bits(node, domain, bits, retries=0): result = node.succeed( "openssl s_client -CAfile /tmp/ca.crt" f" -servername {domain} -connect {domain}:443 < /dev/null" @@ -429,13 +453,11 @@ in { print("Key type:", result) if bits not in result: - time.sleep(3) - return check_connection_key_bits(node, domain, bits, retries - 1) + retries = backoff.handle_fail(retries, f"Did not find expected number of bits ({bits}) in key") + return check_connection_key_bits(node, domain, bits, retries) - def check_stapling(node, domain, retries=3): - assert retries >= 0, "OCSP Stapling check failed" - + def check_stapling(node, domain, retries=0): # Pebble doesn't provide a full OCSP responder, so just check the URL result = node.succeed( "openssl s_client -CAfile /tmp/ca.crt" @@ -445,21 +467,19 @@ in { print("OCSP Responder URL:", result) if "${caDomain}:4002" not in result.lower(): - time.sleep(3) - return check_stapling(node, domain, retries - 1) + retries = backoff.handle_fail(retries, "OCSP Stapling check failed") + return check_stapling(node, domain, retries) - def download_ca_certs(node, retries=5): - assert retries >= 0, "Failed to connect to pebble to download root CA certs" - + def download_ca_certs(node, retries=0): exit_code, _ = node.execute("curl https://${caDomain}:15000/roots/0 > /tmp/ca.crt") exit_code_2, _ = node.execute( "curl https://${caDomain}:15000/intermediate-keys/0 >> /tmp/ca.crt" ) if exit_code + exit_code_2 > 0: - time.sleep(3) - return download_ca_certs(node, retries - 1) + retries = backoff.handle_fail(retries, "Failed to connect to pebble to download root CA certs") + return download_ca_certs(node, retries) start_all()