doc: common/net/dns/unbound: cleanup the explanations for why i set what i do

2024-12-04 09:24:07 +00:00
parent 3fcf3bca8a
commit 716aa4be33
1 changed files with 66 additions and 74 deletions
--- a/hosts/common/net/dns/unbound.nix
+++ b/hosts/common/net/dns/unbound.nix
@@ -5,80 +5,6 @@
  config = lib.mkIf (!config.sane.services.hickory-dns.asSystemResolver) {
    services.resolved.enable = lib.mkForce false;

-    # resolve DNS recursively with Unbound.
-    services.unbound.enable = lib.mkDefault true;
-    services.unbound.resolveLocalQueries = false;  #< disable, so that i can manage networking.nameservers manually
-    services.unbound.settings.server.interface = [ "127.0.0.1" ];
-    services.unbound.settings.server.access-control = [ "127.0.0.0/8 allow" ];
-
-    # allow control via `unbound-control`. user must be a member of the `unbound` Unix group.
-    services.unbound.localControlSocketPath = "/run/unbound/unbound.ctl";
-
-    # exempt `pool.ntp.org` from DNSSEC checks to avoid a circular dependency between DNS resolution and NTP.
-    # without this, if the RTC fails, then both time and DNS are unrecoverable.
-    # services.unbound.settings.server.domain-insecure = "pool.ntp.org";
-    services.unbound.settings.server.domain-insecure = config.networking.timeServers;
-
-    # harden-dnssec-stripped: XXX(2024-12-03): DNSSEC doesn't play nice with the infra-cache, so set false to not enforce DNSSEC.
-    # i think the infra-cache is to blame: unbound gets a partial response from a server when network is coming online, part of what's dropped is DNSSEC,
-    # and so it thinks the server is misbehaving and kicks it out.
-    # services.unbound.settings.server.harden-dnssec-stripped = false;
-
-    # replace auto root-trust anchor calculations with hardcoded root key.
-    # this *might* cause issues when booting a very old config.
-    # if you remove this and use the auto key updating, consider persisting /var/lib/unbound.
-    # services.unbound.enableRootTrustAnchor = false;
-    # services.unbound.settings.server.trust-anchor-file = "${pkgs.dns-root-data}/root.key";
-    # services.unbound.settings.server.disable-dnssec-lame-check = true;
-
-    # root hints: are compiled-in (iterator/iter_hints.c), but can alternatively use more up-to-date ones should the nixos unbound package become outdated.
-    # services.unbound.settings.server.root-hints = "${pkgs.dns-root-data}/root.hints";
-
-    # scenario: net blip; unbound caches that a bunch of NS are unreachable; future queries fail
-    # - <https://forum.opnsense.org/index.php?topic=32852.0>
-    # infra settings described here:
-    # - <https://unbound.docs.nlnetlabs.nl/en/latest/reference/history/info-timeout-server-selection.html>
-    # - unbound keeps RTT estimates for each server.
-    # - if it sends a query and doesn't hear back based on when it expected, then it *resends* the query, with exponential backoff (i.e. doubling on each attempt)
-    #   - at this point, responses from the old query are *ignored*
-    # - for new/unknown hosts, a RTT timeout of 376ms is assumed
-    #
-    # services.unbound.settings.server.infra-keep-probing = true;  #< if unbound fails to reach a host (NS), it by default *does not try again* for 900s. keep-probing tells it to keep trying, with a backoff.
-    # services.unbound.settings.server.infra-cache-min-rtt = 1000;
-    # services.unbound.settings.server.infra-cache-max-rtt = 1000;
-    services.unbound.settings.server.infra-host-ttl = 30;  #< cache each NS's liveness for a max of 30s
-
-    # perf tuning; see: <https://unbound.docs.nlnetlabs.nl/en/latest/topics/core/performance.html>
-    # resource usage:
-    # - defaults (num-threads = 1; so-{rcvbuf,sndbuf} = 0, prefetch = false): 12.7M memory usage
-    # - num-threads = 2: 17.2M memory usage
-    # - num-threads = 4: 26.2M memory usage
-    # - num-threads = 4; so-{rcvbuf,sndbuf}=4m: 26.7M memory usage
-    # - prefetch = true: no increased memory; supposed 10% increase in traffic
-    #
-    # # i suspect most operations are async; the only serialized bits are either CPU or possibly local IO (i.e. syscalls to write sockets).
-    # # threading is probably only rarely helpful
-    # services.unbound.settings.server.num-threads = 4;
-    #
-    # services.unbound.settings.server.so-rcvbuf = "4m";  #< higher value means less likely to drop client queries
-    # services.unbound.settings.server.so-sndbuf = "4m";
-    #
-    # `prefetch`: prefetch RRs which are about to expire from the cache, to keep them primed.
-    # this seems to work fine when DNSSEC is disabled, but causes resolution failures straight out of boot when paired with DNSSEC (host-infra faiure?)
-    # services.unbound.settings.server.prefetch = true;
-
-    # if a resolution fails, or takes excessively long, reply with expired cache entries
-    # see: <https://unbound.docs.nlnetlabs.nl/en/latest/topics/core/serve-stale.html#rfc-8767>
-    # services.unbound.settings.server.serve-expired = true;
-    # services.unbound.settings.server.serve-expired-ttl = 86400;  #< don't serve any records more outdated than this
-    # services.unbound.settings.server.serve-expired-client-timeout = 2800;  #< only serve expired records if the client has been waiting this long, ms
-
-    # `cache-max-negative-ttl`: intended to limit damage during networking flakes, but instead seems to cause unbound to cache error responses it *wouldn't* otherwise cache
-    # services.unbound.settings.server.cache-max-negative-ttl = 60;
-
-    # `user-caps-for-id`: randomizes casing to avoid spoofing, but causes unbound to reply with no results to queries after boot (likely a infra-cache issue)
-    # services.unbound.settings.server.use-caps-for-id = true;
-
    networking.nameservers = [
      # be compatible with systemd-resolved
      # "127.0.0.53"
@@ -92,6 +18,72 @@
      # DNS serviced by `unbound` recursive resolver
      name_servers='127.0.0.1'
    '';
+
+    # resolve DNS recursively with Unbound.
+    services.unbound.enable = lib.mkDefault true;
+    services.unbound.resolveLocalQueries = false;  #< disable, so that i can manage networking.nameservers manually
+    services.unbound.settings.server.interface = [ "127.0.0.1" ];
+    services.unbound.settings.server.access-control = [ "127.0.0.0/8 allow" ];
+
+    # allow control via `unbound-control`. user must be a member of the `unbound` Unix group.
+    services.unbound.localControlSocketPath = "/run/unbound/unbound.ctl";
+
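+    # exempt the configured NTP servers (`networking.timeServers`, e.g. names under `pool.ntp.org`) from DNSSEC checks to avoid a circular dependency between DNS resolution and NTP:
+    # DNSSEC validation needs a roughly-correct clock, and setting the clock via NTP needs DNS. without this, if the RTC fails, then both time and DNS are unrecoverable.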
+    services.unbound.settings.server.domain-insecure = config.networking.timeServers;
+
+    # XXX(2024-12-03): BUG: during boot (before network is up), or during network blips, Unbound will
+    # receive a query, fail to evaluate it, and then resolve future identical queries with a no-answers response for the next ~15m.
+    # this *appears* to be some bug in Unbound's "infra-cache", as evidenced by `unbound-control flush_infra all`.
+    #
+    # the infra cache is a per-nameserver liveness and latency cache which Unbound uses to decide which of N applicable nameservers to route a given query to.
+    #
+    # there is apparently NO simple solution.
+    # the closest fix is to reduce the TTL of the infra-cache (`infra-host-ttl`) so as to limit the duration of this error.
+    # tried, but failed fixes:
+    # - server.harden-dnssec-stripped = false
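+    # - services.unbound.enableRootTrustAnchor = false;  #< disable the auto-updated root trust anchor (i.e. no DNSSEC validation, unless a trust anchor is supplied some other way)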
+    #   - server.trust-anchor-file = "${pkgs.dns-root-data}/root.key";  #< hardcode root keys instead of dynamically probing them
+    # - server.disable-dnssec-lame-check = true;
+    # - server.infra-keep-probing = true;  #< if unbound fails to reach a host (NS), it by default *does not try again* for 900s. keep-probing tells it to keep trying, with a backoff.
+    # - server.infra-cache-min-rtt = 1000;
+    # - server.infra-cache-max-rtt = 1000;
+    #
+    # see also:
+    # - <https://forum.opnsense.org/index.php?topic=32852.0>
+    # - <https://unbound.docs.nlnetlabs.nl/en/latest/reference/history/info-timeout-server-selection.html>
+    #
+    services.unbound.settings.server.infra-host-ttl = 30;  #< cache each NS's liveness for a max of 30s
+
+    # perf tuning; see: <https://unbound.docs.nlnetlabs.nl/en/latest/topics/core/performance.html>
+    # resource usage:
+    # - defaults (num-threads = 1; so-{rcvbuf,sndbuf} = 0, prefetch = false): 12.7M memory usage
+    # - num-threads = 2: 17.2M memory usage
+    # - num-threads = 4: 26.2M memory usage
+    # - num-threads = 4; so-{rcvbuf,sndbuf}=4m: 26.7M memory usage
+    # - prefetch = true: no increased memory; supposed 10% increase in traffic
+    #
+    # # i suspect most operations are async; the only serialized bits are either CPU or possibly local IO (i.e. syscalls to write sockets).
+    # # threading is probably only rarely helpful
+    # services.unbound.settings.server.num-threads = 4;
+    #
+    # services.unbound.settings.server.so-rcvbuf = "1m";  #< higher value means less likely to drop client queries
+    # services.unbound.settings.server.so-sndbuf = "1m";
+    #
+    # `prefetch`: prefetch RRs which are about to expire from the cache, to keep them primed.
+    # services.unbound.settings.server.prefetch = true;
+
+    # if a resolution fails, or takes excessively long, reply with expired cache entries
+    # see: <https://unbound.docs.nlnetlabs.nl/en/latest/topics/core/serve-stale.html#rfc-8767>
+    # services.unbound.settings.server.serve-expired = true;
+    # services.unbound.settings.server.serve-expired-ttl = 86400;  #< don't serve any records more outdated than this
+    # services.unbound.settings.server.serve-expired-client-timeout = 2800;  #< only serve expired records if the client has been waiting this long, ms
+
+    # `cache-max-negative-ttl`: intended to limit damage during networking flakes, but instead seems to cause unbound to cache error responses it *wouldn't* otherwise cache
+    # services.unbound.settings.server.cache-max-negative-ttl = 60;
+
+    # `use-caps-for-id`: randomizes casing to avoid spoofing, but causes unbound to reply with no results to queries after boot (likely an infra-cache issue)
+    # services.unbound.settings.server.use-caps-for-id = true;
  };
 }