nvidia,nixos/nvidia: add datacenter drivers compatible with default cudaPkgs

For NVLink topology systems we need fabricmanager. Fabricmanager itself is
dependent on the datacenter driver set and not the regular x11 ones, it is also
tightly tied to the driver version. Furhtermore the current cudaPackages
defaults to version 11.8, which corresponds to the 520 datacenter drivers.

Future improvement should be to switch the main nvidia datacenter driver version
on the `config.cudaVersion` since these are well known from:

> https://docs.nvidia.com/deploy/cuda-compatibility/index.html#use-the-right-compat-package

This adds nixos configuration options `hardware.nvidia.datacenter.enable` and
`hardware.nvidia.datacenter.settings` (the settings configure fabricmanager)

Other interesting external links related to this commit are:

* Fabricmanager download site:
    - https://developer.download.nvidia.com/compute/cuda/redist/fabricmanager/linux-x86_64/
* Data Center drivers:
    - https://www.nvidia.com/Download/driverResults.aspx/193711/en-us/

Implementation specific details:

* Fabricmanager is added as a passthru package, similar to settings and
  presistenced.
* Adds `use{Settings,Persistenced,Fabricmanager}` with defaults to preserve x11
  expressions.
* Utilizes mkMerge to split the `hardware.nvidia` module into three comment
  delimited sections:
    1. Common
    2. X11/xorg
    3. Data Center
* Uses asserts to make the configurations mutualy exclusive.

Notes:

* Data Center Drivers are `x86_64` only.
* Reuses the `nvidia_x11` attribute in nixpkgs on enable, e.g. doesn't change it
  to `nvidia_driver` and sets that to either `nvidia_x11` or `nvidia_dc`.
* Should have a helper function which is switched on `config.cudaVersion` like
  `selectHighestVersion` but rather `selectCudaCompatibleVersion`.
This commit is contained in:
Edward Tjörnhammar 2023-09-11 16:37:26 +02:00
parent 23e69f92c8
commit 9b95f21cdb
6 changed files with 434 additions and 226 deletions

View File

@ -265,6 +265,8 @@ The module update takes care of the new config syntax and the data itself (user
- The `cawbird` package is dropped from nixpkgs, as it got broken by the Twitter API closing down and has been abandoned upstream.
- `hardware.nvidia` gained `datacenter` options for enabling NVIDIA Data Center drivers and configuration of NVLink/NVSwitch topologies through `nv-fabricmanager`.
- Certificate generation via the `security.acme` now limits the concurrent number of running certificate renewals and generation jobs, to avoid spiking resource usage when processing many certificates at once. The limit defaults to *5* and can be adjusted via `maxConcurrentRenewals`. Setting it to *0* disables the limits altogether.
- New `boot.bcache.enable` (default enabled) allows completely removing `bcache` mount support.

View File

@ -4,8 +4,10 @@
pkgs,
...
}: let
x11Enabled = config.services.xserver.enable
&& (lib.elem "nvidia" config.services.xserver.videoDrivers);
nvidia_x11 =
if (lib.elem "nvidia" config.services.xserver.videoDrivers)
if x11Enabled || cfg.datacenter.enable
then cfg.package
else null;
@ -18,9 +20,64 @@
primeEnabled = syncCfg.enable || reverseSyncCfg.enable || offloadCfg.enable;
busIDType = lib.types.strMatching "([[:print:]]+[\:\@][0-9]{1,3}\:[0-9]{1,2}\:[0-9])?";
ibtSupport = cfg.open || (nvidia_x11.ibtSupport or false);
settingsFormat = pkgs.formats.keyValue {};
in {
options = {
hardware.nvidia = {
datacenter.enable = lib.mkEnableOption (lib.mdDoc ''
Data Center drivers for NVIDIA cards on a NVLink topology.
'');
datacenter.settings = lib.mkOption {
type = settingsFormat.type;
default = {
LOG_LEVEL=4;
LOG_FILE_NAME="/var/log/fabricmanager.log";
LOG_APPEND_TO_LOG=1;
LOG_FILE_MAX_SIZE=1024;
LOG_USE_SYSLOG=0;
DAEMONIZE=1;
BIND_INTERFACE_IP="127.0.0.1";
STARTING_TCP_PORT=16000;
FABRIC_MODE=0;
FABRIC_MODE_RESTART=0;
STATE_FILE_NAME="/var/tmp/fabricmanager.state";
FM_CMD_BIND_INTERFACE="127.0.0.1";
FM_CMD_PORT_NUMBER=6666;
FM_STAY_RESIDENT_ON_FAILURES=0;
ACCESS_LINK_FAILURE_MODE=0;
TRUNK_LINK_FAILURE_MODE=0;
NVSWITCH_FAILURE_MODE=0;
ABORT_CUDA_JOBS_ON_FM_EXIT=1;
TOPOLOGY_FILE_PATH=nvidia_x11.fabricmanager + "/share/nvidia-fabricmanager/nvidia/nvswitch";
};
defaultText = lib.literalExpression ''
{
LOG_LEVEL=4;
LOG_FILE_NAME="/var/log/fabricmanager.log";
LOG_APPEND_TO_LOG=1;
LOG_FILE_MAX_SIZE=1024;
LOG_USE_SYSLOG=0;
DAEMONIZE=1;
BIND_INTERFACE_IP="127.0.0.1";
STARTING_TCP_PORT=16000;
FABRIC_MODE=0;
FABRIC_MODE_RESTART=0;
STATE_FILE_NAME="/var/tmp/fabricmanager.state";
FM_CMD_BIND_INTERFACE="127.0.0.1";
FM_CMD_PORT_NUMBER=6666;
FM_STAY_RESIDENT_ON_FAILURES=0;
ACCESS_LINK_FAILURE_MODE=0;
TRUNK_LINK_FAILURE_MODE=0;
NVSWITCH_FAILURE_MODE=0;
ABORT_CUDA_JOBS_ON_FM_EXIT=1;
TOPOLOGY_FILE_PATH=nvidia_x11.fabricmanager + "/share/nvidia-fabricmanager/nvidia/nvswitch";
}
'';
description = lib.mdDoc ''
Additional configuration options for fabricmanager.
'';
};
powerManagement.enable = lib.mkEnableOption (lib.mdDoc ''
experimental power management through systemd. For more information, see
the NVIDIA docs, on Chapter 21. Configuring Power Management Support.
@ -167,9 +224,15 @@ in {
It also drastically increases the time the driver needs to clock down after load.
'');
package = lib.mkPackageOptionMD config.boot.kernelPackages.nvidiaPackages "nvidia_x11" {
default = "stable";
package = lib.mkOption {
default = config.boot.kernelPackages.nvidiaPackages."${if cfg.datacenter.enable then "dc" else "stable"}";
defaultText = lib.literalExpression ''
config.boot.kernelPackages.nvidiaPackages."\$\{if cfg.datacenter.enable then "dc" else "stable"}"
'';
example = lib.mdDoc "config.boot.kernelPackages.nvidiaPackages.legacy_470";
description = lib.mdDoc ''
The NVIDIA driver package to use.
'';
};
open = lib.mkEnableOption (lib.mdDoc ''
@ -188,8 +251,46 @@ in {
then pCfg.intelBusId
else pCfg.amdgpuBusId;
in
lib.mkIf (nvidia_x11 != null) {
assertions = [
lib.mkIf (nvidia_x11 != null) (lib.mkMerge [
# Common
({
assertions = [
{
assertion = !(x11Enabled && cfg.datacenter.enable);
message = "You cannot configure both X11 and Data Center drivers at the same time.";
}
];
boot = {
blacklistedKernelModules = ["nouveau" "nvidiafb"];
kernelModules = [ "nvidia-uvm" ];
};
systemd.tmpfiles.rules =
lib.optional config.virtualisation.docker.enableNvidia
"L+ /run/nvidia-docker/bin - - - - ${nvidia_x11.bin}/origBin";
services.udev.extraRules =
''
# Create /dev/nvidia-uvm when the nvidia-uvm module is loaded.
KERNEL=="nvidia", RUN+="${pkgs.runtimeShell} -c 'mknod -m 666 /dev/nvidiactl c $$(grep nvidia-frontend /proc/devices | cut -d \ -f 1) 255'"
KERNEL=="nvidia", RUN+="${pkgs.runtimeShell} -c 'for i in $$(cat /proc/driver/nvidia/gpus/*/information | grep Minor | cut -d \ -f 4); do mknod -m 666 /dev/nvidia$${i} c $$(grep nvidia-frontend /proc/devices | cut -d \ -f 1) $${i}; done'"
KERNEL=="nvidia_modeset", RUN+="${pkgs.runtimeShell} -c 'mknod -m 666 /dev/nvidia-modeset c $$(grep nvidia-frontend /proc/devices | cut -d \ -f 1) 254'"
KERNEL=="nvidia_uvm", RUN+="${pkgs.runtimeShell} -c 'mknod -m 666 /dev/nvidia-uvm c $$(grep nvidia-uvm /proc/devices | cut -d \ -f 1) 0'"
KERNEL=="nvidia_uvm", RUN+="${pkgs.runtimeShell} -c 'mknod -m 666 /dev/nvidia-uvm-tools c $$(grep nvidia-uvm /proc/devices | cut -d \ -f 1) 1'"
'';
hardware.opengl = {
extraPackages = [
nvidia_x11.out
];
extraPackages32 = [
nvidia_x11.lib32
];
};
environment.systemPackages = [
nvidia_x11.bin
];
})
# X11
(lib.mkIf x11Enabled {
assertions = [
{
assertion = primeEnabled -> pCfg.intelBusId == "" || pCfg.amdgpuBusId == "";
message = "You cannot configure both an Intel iGPU and an AMD APU. Pick the one corresponding to your processor.";
@ -248,227 +349,207 @@ in {
{
assertion = cfg.dynamicBoost.enable -> lib.versionAtLeast nvidia_x11.version "510.39.01";
message = "NVIDIA's Dynamic Boost feature only exists on versions >= 510.39.01";
}
];
}];
# If Optimus/PRIME is enabled, we:
# - Specify the configured NVIDIA GPU bus ID in the Device section for the
# "nvidia" driver.
# - Add the AllowEmptyInitialConfiguration option to the Screen section for the
# "nvidia" driver, in order to allow the X server to start without any outputs.
# - Add a separate Device section for the Intel GPU, using the "modesetting"
# driver and with the configured BusID.
# - OR add a separate Device section for the AMD APU, using the "amdgpu"
# driver and with the configures BusID.
# - Reference that Device section from the ServerLayout section as an inactive
# device.
# - Configure the display manager to run specific `xrandr` commands which will
# configure/enable displays connected to the Intel iGPU / AMD APU.
# If Optimus/PRIME is enabled, we:
# - Specify the configured NVIDIA GPU bus ID in the Device section for the
# "nvidia" driver.
# - Add the AllowEmptyInitialConfiguration option to the Screen section for the
# "nvidia" driver, in order to allow the X server to start without any outputs.
# - Add a separate Device section for the Intel GPU, using the "modesetting"
# driver and with the configured BusID.
# - OR add a separate Device section for the AMD APU, using the "amdgpu"
# driver and with the configures BusID.
# - Reference that Device section from the ServerLayout section as an inactive
# device.
# - Configure the display manager to run specific `xrandr` commands which will
# configure/enable displays connected to the Intel iGPU / AMD APU.
# reverse sync implies offloading
hardware.nvidia.prime.offload.enable = lib.mkDefault reverseSyncCfg.enable;
# reverse sync implies offloading
hardware.nvidia.prime.offload.enable = lib.mkDefault reverseSyncCfg.enable;
services.xserver.drivers =
lib.optional primeEnabled {
name = igpuDriver;
display = offloadCfg.enable;
modules = lib.optional (igpuDriver == "amdgpu") pkgs.xorg.xf86videoamdgpu;
deviceSection =
''
BusID "${igpuBusId}"
''
+ lib.optionalString (syncCfg.enable && igpuDriver != "amdgpu") ''
Option "AccelMethod" "none"
'';
}
++ lib.singleton {
name = "nvidia";
modules = [nvidia_x11.bin];
display = !offloadCfg.enable;
deviceSection =
lib.optionalString primeEnabled
''
BusID "${pCfg.nvidiaBusId}"
''
+ lib.optionalString pCfg.allowExternalGpu ''
Option "AllowExternalGpus"
'';
screenSection =
''
Option "RandRRotation" "on"
''
+ lib.optionalString syncCfg.enable ''
Option "AllowEmptyInitialConfiguration"
''
+ lib.optionalString cfg.forceFullCompositionPipeline ''
Option "metamodes" "nvidia-auto-select +0+0 {ForceFullCompositionPipeline=On}"
Option "AllowIndirectGLXProtocol" "off"
Option "TripleBuffer" "on"
'';
};
services.xserver.serverLayoutSection =
lib.optionalString syncCfg.enable ''
Inactive "Device-${igpuDriver}[0]"
''
+ lib.optionalString reverseSyncCfg.enable ''
Inactive "Device-nvidia[0]"
''
+ lib.optionalString offloadCfg.enable ''
Option "AllowNVIDIAGPUScreens"
'';
services.xserver.displayManager.setupCommands = let
gpuProviderName =
if igpuDriver == "amdgpu"
then
# find the name of the provider if amdgpu
"`${lib.getExe pkgs.xorg.xrandr} --listproviders | ${lib.getExe pkgs.gnugrep} -i AMD | ${lib.getExe pkgs.gnused} -n 's/^.*name://p'`"
else igpuDriver;
providerCmdParams =
if syncCfg.enable
then "\"${gpuProviderName}\" NVIDIA-0"
else "NVIDIA-G0 \"${gpuProviderName}\"";
in
lib.optionalString (syncCfg.enable || reverseSyncCfg.enable) ''
# Added by nvidia configuration module for Optimus/PRIME.
${lib.getExe pkgs.xorg.xrandr} --setprovideroutputsource ${providerCmdParams}
${lib.getExe pkgs.xorg.xrandr} --auto
'';
environment.etc = {
"nvidia/nvidia-application-profiles-rc" = lib.mkIf nvidia_x11.useProfiles {source = "${nvidia_x11.bin}/share/nvidia/nvidia-application-profiles-rc";};
# 'nvidia_x11' installs it's files to /run/opengl-driver/...
"egl/egl_external_platform.d".source = "/run/opengl-driver/share/egl/egl_external_platform.d/";
};
hardware.opengl = {
extraPackages = [
nvidia_x11.out
pkgs.nvidia-vaapi-driver
];
extraPackages32 = [
nvidia_x11.lib32
pkgs.pkgsi686Linux.nvidia-vaapi-driver
];
};
environment.systemPackages =
[nvidia_x11.bin]
++ lib.optional cfg.nvidiaSettings nvidia_x11.settings
++ lib.optional cfg.nvidiaPersistenced nvidia_x11.persistenced
++ lib.optional offloadCfg.enableOffloadCmd
(pkgs.writeShellScriptBin "nvidia-offload" ''
export __NV_PRIME_RENDER_OFFLOAD=1
export __NV_PRIME_RENDER_OFFLOAD_PROVIDER=NVIDIA-G0
export __GLX_VENDOR_LIBRARY_NAME=nvidia
export __VK_LAYER_NV_optimus=NVIDIA_only
exec "$@"
'');
systemd.packages = lib.optional cfg.powerManagement.enable nvidia_x11.out;
systemd.services = let
nvidiaService = state: {
description = "NVIDIA system ${state} actions";
path = [pkgs.kbd];
serviceConfig = {
Type = "oneshot";
ExecStart = "${nvidia_x11.out}/bin/nvidia-sleep.sh '${state}'";
services.xserver.drivers =
lib.optional primeEnabled {
name = igpuDriver;
display = offloadCfg.enable;
modules = lib.optional (igpuDriver == "amdgpu") pkgs.xorg.xf86videoamdgpu;
deviceSection =
''
BusID "${igpuBusId}"
''
+ lib.optionalString (syncCfg.enable && igpuDriver != "amdgpu") ''
Option "AccelMethod" "none"
'';
}
++ lib.singleton {
name = "nvidia";
modules = [nvidia_x11.bin];
display = !offloadCfg.enable;
deviceSection =
lib.optionalString primeEnabled
''
BusID "${pCfg.nvidiaBusId}"
''
+ lib.optionalString pCfg.allowExternalGpu ''
Option "AllowExternalGpus"
'';
screenSection =
''
Option "RandRRotation" "on"
''
+ lib.optionalString syncCfg.enable ''
Option "AllowEmptyInitialConfiguration"
''
+ lib.optionalString cfg.forceFullCompositionPipeline ''
Option "metamodes" "nvidia-auto-select +0+0 {ForceFullCompositionPipeline=On}"
Option "AllowIndirectGLXProtocol" "off"
Option "TripleBuffer" "on"
'';
};
before = ["systemd-${state}.service"];
requiredBy = ["systemd-${state}.service"];
services.xserver.serverLayoutSection =
lib.optionalString syncCfg.enable ''
Inactive "Device-${igpuDriver}[0]"
''
+ lib.optionalString reverseSyncCfg.enable ''
Inactive "Device-nvidia[0]"
''
+ lib.optionalString offloadCfg.enable ''
Option "AllowNVIDIAGPUScreens"
'';
services.xserver.displayManager.setupCommands = let
gpuProviderName =
if igpuDriver == "amdgpu"
then
# find the name of the provider if amdgpu
"`${lib.getExe pkgs.xorg.xrandr} --listproviders | ${lib.getExe pkgs.gnugrep} -i AMD | ${lib.getExe pkgs.gnused} -n 's/^.*name://p'`"
else igpuDriver;
providerCmdParams =
if syncCfg.enable
then "\"${gpuProviderName}\" NVIDIA-0"
else "NVIDIA-G0 \"${gpuProviderName}\"";
in
lib.optionalString (syncCfg.enable || reverseSyncCfg.enable) ''
# Added by nvidia configuration module for Optimus/PRIME.
${lib.getExe pkgs.xorg.xrandr} --setprovideroutputsource ${providerCmdParams}
${lib.getExe pkgs.xorg.xrandr} --auto
'';
environment.etc = {
"nvidia/nvidia-application-profiles-rc" = lib.mkIf nvidia_x11.useProfiles {source = "${nvidia_x11.bin}/share/nvidia/nvidia-application-profiles-rc";};
# 'nvidia_x11' installs it's files to /run/opengl-driver/...
"egl/egl_external_platform.d".source = "/run/opengl-driver/share/egl/egl_external_platform.d/";
};
in
lib.mkMerge [
(lib.mkIf cfg.powerManagement.enable {
nvidia-suspend = nvidiaService "suspend";
nvidia-hibernate = nvidiaService "hibernate";
nvidia-resume =
(nvidiaService "resume")
// {
before = [];
after = ["systemd-suspend.service" "systemd-hibernate.service"];
requiredBy = ["systemd-suspend.service" "systemd-hibernate.service"];
};
})
(lib.mkIf cfg.nvidiaPersistenced {
"nvidia-persistenced" = {
description = "NVIDIA Persistence Daemon";
wantedBy = ["multi-user.target"];
serviceConfig = {
Type = "forking";
Restart = "always";
PIDFile = "/var/run/nvidia-persistenced/nvidia-persistenced.pid";
ExecStart = "${lib.getExe nvidia_x11.persistenced} --verbose";
ExecStopPost = "${pkgs.coreutils}/bin/rm -rf /var/run/nvidia-persistenced";
};
hardware.opengl = {
extraPackages = [
pkgs.nvidia-vaapi-driver
];
extraPackages32 = [
pkgs.pkgsi686Linux.nvidia-vaapi-driver
];
};
environment.systemPackages =
lib.optional cfg.nvidiaSettings nvidia_x11.settings
++ lib.optional cfg.nvidiaPersistenced nvidia_x11.persistenced
++ lib.optional offloadCfg.enableOffloadCmd
(pkgs.writeShellScriptBin "nvidia-offload" ''
export __NV_PRIME_RENDER_OFFLOAD=1
export __NV_PRIME_RENDER_OFFLOAD_PROVIDER=NVIDIA-G0
export __GLX_VENDOR_LIBRARY_NAME=nvidia
export __VK_LAYER_NV_optimus=NVIDIA_only
exec "$@"
'');
systemd.packages = lib.optional cfg.powerManagement.enable nvidia_x11.out;
systemd.services = let
nvidiaService = state: {
description = "NVIDIA system ${state} actions";
path = [pkgs.kbd];
serviceConfig = {
Type = "oneshot";
ExecStart = "${nvidia_x11.out}/bin/nvidia-sleep.sh '${state}'";
};
})
(lib.mkIf cfg.dynamicBoost.enable {
"nvidia-powerd" = {
description = "nvidia-powerd service";
path = [
pkgs.util-linux # nvidia-powerd wants lscpu
];
wantedBy = ["multi-user.target"];
serviceConfig = {
Type = "dbus";
BusName = "nvidia.powerd.server";
ExecStart = "${nvidia_x11.bin}/bin/nvidia-powerd";
before = ["systemd-${state}.service"];
requiredBy = ["systemd-${state}.service"];
};
in
lib.mkMerge [
(lib.mkIf cfg.powerManagement.enable {
nvidia-suspend = nvidiaService "suspend";
nvidia-hibernate = nvidiaService "hibernate";
nvidia-resume =
(nvidiaService "resume")
// {
before = [];
after = ["systemd-suspend.service" "systemd-hibernate.service"];
requiredBy = ["systemd-suspend.service" "systemd-hibernate.service"];
};
})
(lib.mkIf cfg.nvidiaPersistenced {
"nvidia-persistenced" = {
description = "NVIDIA Persistence Daemon";
wantedBy = ["multi-user.target"];
serviceConfig = {
Type = "forking";
Restart = "always";
PIDFile = "/var/run/nvidia-persistenced/nvidia-persistenced.pid";
ExecStart = "${lib.getExe nvidia_x11.persistenced} --verbose";
ExecStopPost = "${pkgs.coreutils}/bin/rm -rf /var/run/nvidia-persistenced";
};
};
};
})
];
})
(lib.mkIf cfg.dynamicBoost.enable {
"nvidia-powerd" = {
description = "nvidia-powerd service";
path = [
pkgs.util-linux # nvidia-powerd wants lscpu
];
wantedBy = ["multi-user.target"];
serviceConfig = {
Type = "dbus";
BusName = "nvidia.powerd.server";
ExecStart = "${nvidia_x11.bin}/bin/nvidia-powerd";
};
};
})
];
services.acpid.enable = true;
services.acpid.enable = true;
services.dbus.packages = lib.optional cfg.dynamicBoost.enable nvidia_x11.bin;
services.dbus.packages = lib.optional cfg.dynamicBoost.enable nvidia_x11.bin;
hardware.firmware = lib.optional cfg.open nvidia_x11.firmware;
hardware.firmware = lib.optional cfg.open nvidia_x11.firmware;
systemd.tmpfiles.rules =
lib.optional (nvidia_x11.persistenced != null && config.virtualisation.docker.enableNvidia)
"L+ /run/nvidia-docker/extras/bin/nvidia-persistenced - - - - ${nvidia_x11.persistenced}/origBin/nvidia-persistenced";
systemd.tmpfiles.rules =
lib.optional config.virtualisation.docker.enableNvidia
"L+ /run/nvidia-docker/bin - - - - ${nvidia_x11.bin}/origBin"
++ lib.optional (nvidia_x11.persistenced != null && config.virtualisation.docker.enableNvidia)
"L+ /run/nvidia-docker/extras/bin/nvidia-persistenced - - - - ${nvidia_x11.persistenced}/origBin/nvidia-persistenced";
boot = {
extraModulePackages =
if cfg.open
then [nvidia_x11.open]
else [nvidia_x11.bin];
# nvidia-uvm is required by CUDA applications.
kernelModules =
lib.optionals config.services.xserver.enable ["nvidia" "nvidia_modeset" "nvidia_drm"];
boot = {
blacklistedKernelModules = ["nouveau" "nvidiafb"];
# If requested enable modesetting via kernel parameter.
kernelParams =
lib.optional (offloadCfg.enable || cfg.modesetting.enable) "nvidia-drm.modeset=1"
++ lib.optional cfg.powerManagement.enable "nvidia.NVreg_PreserveVideoMemoryAllocations=1"
++ lib.optional cfg.open "nvidia.NVreg_OpenRmEnableUnsupportedGpus=1"
++ lib.optional (config.boot.kernelPackages.kernel.kernelAtLeast "6.2" && !ibtSupport) "ibt=off";
extraModulePackages =
if cfg.open
then [nvidia_x11.open]
else [nvidia_x11.bin];
# nvidia-uvm is required by CUDA applications.
kernelModules =
["nvidia-uvm"]
++ lib.optionals config.services.xserver.enable ["nvidia" "nvidia_modeset" "nvidia_drm"];
# If requested enable modesetting via kernel parameter.
kernelParams =
lib.optional (offloadCfg.enable || cfg.modesetting.enable) "nvidia-drm.modeset=1"
++ lib.optional cfg.powerManagement.enable "nvidia.NVreg_PreserveVideoMemoryAllocations=1"
++ lib.optional cfg.open "nvidia.NVreg_OpenRmEnableUnsupportedGpus=1"
++ lib.optional (config.boot.kernelPackages.kernel.kernelAtLeast "6.2" && !ibtSupport) "ibt=off";
# enable finegrained power management
extraModprobeConfig = lib.optionalString cfg.powerManagement.finegrained ''
options nvidia "NVreg_DynamicPowerManagement=0x02"
'';
};
services.udev.extraRules =
''
# Create /dev/nvidia-uvm when the nvidia-uvm module is loaded.
KERNEL=="nvidia", RUN+="${pkgs.runtimeShell} -c 'mknod -m 666 /dev/nvidiactl c $$(grep nvidia-frontend /proc/devices | cut -d \ -f 1) 255'"
KERNEL=="nvidia", RUN+="${pkgs.runtimeShell} -c 'for i in $$(cat /proc/driver/nvidia/gpus/*/information | grep Minor | cut -d \ -f 4); do mknod -m 666 /dev/nvidia$${i} c $$(grep nvidia-frontend /proc/devices | cut -d \ -f 1) $${i}; done'"
KERNEL=="nvidia_modeset", RUN+="${pkgs.runtimeShell} -c 'mknod -m 666 /dev/nvidia-modeset c $$(grep nvidia-frontend /proc/devices | cut -d \ -f 1) 254'"
KERNEL=="nvidia_uvm", RUN+="${pkgs.runtimeShell} -c 'mknod -m 666 /dev/nvidia-uvm c $$(grep nvidia-uvm /proc/devices | cut -d \ -f 1) 0'"
KERNEL=="nvidia_uvm", RUN+="${pkgs.runtimeShell} -c 'mknod -m 666 /dev/nvidia-uvm-tools c $$(grep nvidia-uvm /proc/devices | cut -d \ -f 1) 1'"
''
+ lib.optionalString cfg.powerManagement.finegrained (
# enable finegrained power management
extraModprobeConfig = lib.optionalString cfg.powerManagement.finegrained ''
options nvidia "NVreg_DynamicPowerManagement=0x02"
'';
};
services.udev.extraRules =
lib.optionalString cfg.powerManagement.finegrained (
lib.optionalString (lib.versionOlder config.boot.kernelPackages.kernel.version "5.5") ''
# Remove NVIDIA USB xHCI Host Controller devices, if present
ACTION=="add", SUBSYSTEM=="pci", ATTR{vendor}=="0x10de", ATTR{class}=="0x0c0330", ATTR{remove}="1"
@ -489,5 +570,30 @@ in {
ACTION=="unbind", SUBSYSTEM=="pci", ATTR{vendor}=="0x10de", ATTR{class}=="0x030200", TEST=="power/control", ATTR{power/control}="on"
''
);
};
})
# Data Center
(lib.mkIf (cfg.datacenter.enable) {
boot.extraModulePackages = [
nvidia_x11.bin
];
systemd.services.nvidia-fabricmanager = {
enable = true;
description = "Start NVIDIA NVLink Management";
wantedBy = [ "multi-user.target" ];
unitConfig.After = [ "network-online.target" ];
unitConfig.Requires = [ "network-online.target" ];
serviceConfig = {
Type = "forking";
TimeoutStartSec = 240;
ExecStart = let
nv-fab-conf = settingsFormat.generate "fabricmanager.conf" cfg.datacenter.settings;
in
nvidia_x11.fabricmanager + "/bin/nv-fabricmanager -c " + nv-fab-conf;
LimitCORE="infinity";
};
};
environment.systemPackages =
lib.optional cfg.datacenter.enable nvidia_x11.fabricmanager;
})
]);
}

View File

@ -75,6 +75,18 @@ rec {
url = "https://developer.nvidia.com/downloads/vulkan-beta-${lib.concatStrings (lib.splitString "." version)}-linux";
};
# data center driver compatible with current default cudaPackages
dc = dc_520;
dc_520 = generic rec {
version = "520.61.05";
url = "https://us.download.nvidia.com/tesla/${version}/NVIDIA-Linux-x86_64-${version}.run";
sha256_64bit = "sha256-EPYWZwOur/6iN/otDMrNDpNXr1mzu8cIqQl8lXhQlzU==";
fabricmanagerSha256 = "sha256-o8Kbmkg7qczKQclaGvEyXNzEOWq9ZpQZn9syeffnEiE==";
useSettings = false;
usePersistenced = false;
useFabricmanager = true;
};
# Update note:
# If you add a legacy driver here, also update `top-level/linux-kernels.nix`,
# adding to the `nvidia_x11_legacy*` entries.

View File

@ -0,0 +1,47 @@
nvidia_x11: sha256:
{ stdenv, lib, fetchurl, patchelf }:
let
sys = with lib; concatStringsSep "-" (reverseList (splitString "-" stdenv.system));
bsys = builtins.replaceStrings ["_"] ["-"] sys;
fmver = nvidia_x11.version;
in
stdenv.mkDerivation rec {
pname = "fabricmanager";
version = fmver;
src = fetchurl {
url = "https://developer.download.nvidia.com/compute/cuda/redist/fabricmanager/" +
"${sys}/${pname}-${sys}-${fmver}-archive.tar.xz";
inherit sha256;
};
phases = [ "unpackPhase" "installPhase" ];
installPhase = ''
find .
mkdir -p $out/{bin,share/nvidia-fabricmanager}
for bin in nv{-fabricmanager,switch-audit};do
${patchelf}/bin/patchelf \
--set-interpreter ${stdenv.cc.libc}/lib/ld-${bsys}.so.2 \
--set-rpath ${lib.makeLibraryPath [ stdenv.cc.libc ]} \
bin/$bin
done
mv bin/nv{-fabricmanager,switch-audit} $out/bin/.
for d in etc systemd share/nvidia;do
mv $d $out/share/nvidia-fabricmanager/.
done
for d in include lib;do
mv $d $out/.
done
'';
meta = with lib; {
homepage = "https://www.nvidia.com/object/unix.html";
description = "Fabricmanager daemon for NVLink intialization and control";
license = licenses.unfreeRedistributable;
platforms = nvidia_x11.meta.platforms;
mainProgram = "nv-fabricmanager";
maintainers = with maintainers; [ edwtjo ];
};
}

View File

@ -4,14 +4,19 @@
, sha256_64bit
, sha256_aarch64 ? null
, openSha256 ? null
, settingsSha256
, settingsSha256 ? null
, settingsVersion ? version
, persistencedSha256
, persistencedSha256 ? null
, persistencedVersion ? version
, fabricmanagerSha256 ? null
, fabricmanagerVersion ? version
, useGLVND ? true
, useProfiles ? true
, preferGtk2 ? false
, settings32Bit ? false
, useSettings ? true
, usePersistenced ? true
, useFabricmanager ? false
, ibtSupport ? false
, prePatch ? ""
@ -33,14 +38,21 @@
disable32Bit ? stdenv.hostPlatform.system == "aarch64-linux"
# 32 bit libs only version of this package
, lib32 ? null
# Whether to extract the GSP firmware
, firmware ? openSha256 != null
# Whether to extract the GSP firmware, datacenter drivers needs to extract the
# firmware
, firmware ? openSha256 != null || useFabricmanager
# Whether the user accepts the NVIDIA Software License
, config, acceptLicense ? config.nvidia.acceptLicense or false
}:
with lib;
assert !libsOnly -> kernel != null;
assert versionOlder version "391" -> sha256_32bit != null;
assert useSettings -> settingsSha256 != null;
assert usePersistenced -> persistencedSha256 != null;
assert useFabricmanager -> fabricmanagerSha256 != null;
assert useFabricmanager -> !(useSettings || usePersistenced);
let
nameSuffix = optionalString (!libsOnly) "-${kernel.version}";
@ -54,12 +66,33 @@ let
dbus # for nvidia-powerd
]);
# maybe silly since we've ignored this previously and just unfree..
throwLicense = throw ''
Use of NVIDIA Software requires license acceptance of the license:
- License For Customer Use of NVIDIA Software [1]
You can express acceptance by setting acceptLicense to true your nixpkgs.config.
Example:
configuration.nix:
nixpkgs.config.allowUnfree = true;
nixpkgs.config.nvidia.acceptLicense = true;
config.nix:
allowUnfree = true;
nvidia.acceptLicense = true;
[1]: https://www.nvidia.com/content/DriverDownloads/licence.php?lang=us
'';
self = stdenv.mkDerivation {
name = "nvidia-x11-${version}${nameSuffix}";
name = "nvidia-${if useFabricmanager then "dc" else "x11"}-${version}${nameSuffix}";
builder = ./builder.sh;
src =
if !acceptLicense && (openSha256 == null) then throwLicense else
if stdenv.hostPlatform.system == "x86_64-linux" then
fetchurl {
urls = if args ? url then [ args.url ] else [
@ -127,11 +160,17 @@ let
nvidia_x11 = self;
broken = brokenOpen;
}) openSha256;
settings = (if settings32Bit then pkgsi686Linux.callPackage else callPackage) (import ./settings.nix self settingsSha256) {
withGtk2 = preferGtk2;
withGtk3 = !preferGtk2;
};
persistenced = mapNullable (hash: callPackage (import ./persistenced.nix self hash) { }) persistencedSha256;
settings = if useSettings then
(if settings32Bit then pkgsi686Linux.callPackage else callPackage) (import ./settings.nix self settingsSha256) {
withGtk2 = preferGtk2;
withGtk3 = !preferGtk2;
} else {};
persistenced = if usePersistenced then
mapNullable (hash: callPackage (import ./persistenced.nix self hash) { }) persistencedSha256
else {};
fabricmanager = if useFabricmanager then
mapNullable (hash: callPackage (import ./fabricmanager.nix self hash) { }) fabricmanagerSha256
else {};
inherit persistencedVersion settingsVersion;
compressFirmware = false;
ibtSupport = ibtSupport || (lib.versionAtLeast version "530");
@ -141,12 +180,12 @@ let
meta = with lib; {
homepage = "https://www.nvidia.com/object/unix.html";
description = "X.org driver and kernel module for NVIDIA graphics cards";
description = "${if useFabricmanager then "Data Center" else "X.org"} driver and kernel module for NVIDIA cards";
license = licenses.unfreeRedistributable;
platforms = [ "x86_64-linux" ]
++ optionals (sha256_32bit != null) [ "i686-linux" ]
++ optionals (sha256_aarch64 != null) [ "aarch64-linux" ];
maintainers = with maintainers; [ jonringer kiskae ];
maintainers = with maintainers; [ jonringer kiskae edwtjo ];
priority = 4; # resolves collision with xorg-server's "lib/xorg/modules/extensions/libglx.so"
inherit broken;
};

View File

@ -410,6 +410,8 @@ in {
nvidia_x11_legacy470 = nvidiaPackages.legacy_470;
nvidia_x11_production = nvidiaPackages.production;
nvidia_x11_vulkan_beta = nvidiaPackages.vulkan_beta;
nvidia_dc = nvidiaPackages.dc;
nvidia_dc_520 = nvidiaPackages.dc_520;
# this is not a replacement for nvidia_x11*
# only the opensource kernel driver exposed for hydra to build