cudaPackages: add cudaFlags (#205351)

* cudaPackages: add cudaFlags

* cudaNames -> cudaMicroarchitectureNames

* update documentation, remove config for static library removal

* doc link added to flags

* fix whitespace in assignment
Jason Miller 2022-12-17 01:00:36 -05:00 committed by GitHub
parent 08b5fc6d8c
commit 8392158289
9 changed files with 114 additions and 107 deletions

View File

@@ -32,3 +32,22 @@ mypkg = let
}});
in callPackage { inherit cudaPackages; };
```
The CUDA NVCC compiler requires flags to determine which hardware you
want to target, in terms of SASS (real hardware) or PTX (JIT kernels).
Nixpkgs tries to pick sensible real-architecture defaults based on the
CUDA toolkit version, with PTX support for future hardware. Experienced
users may tune this configuration for a variety of reasons, such as
reducing binary size and compile time, supporting legacy hardware, or
optimizing for specific hardware.
You may provide capabilities to add support or reduce binary size through
`config`, using `cudaCapabilities = [ "6.0" "7.0" ];`, and set
`cudaForwardCompat = true;` if you want PTX support for future hardware
(see the sketch below).
Please consult [GPUs supported](https://en.wikipedia.org/wiki/CUDA#GPUs_supported)
for your specific card(s).
Library maintainers should consult [NVCC Docs](https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/)
and release notes for their software package.
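A minimal sketch of setting these options when importing Nixpkgs (the
capability values are illustrative, not a recommendation):

```nix
# Illustration only: pick the capabilities that match your GPUs.
let
  pkgs = import <nixpkgs> {
    config = {
      cudaSupport = true;                  # enable CUDA in packages that honor it
      cudaCapabilities = [ "6.0" "7.0" ];  # real architectures to emit SASS for
      cudaForwardCompat = true;            # also emit PTX for the newest capability
    };
  };
in
pkgs.cudaPackages.cudaFlags.cudaGencode   # e.g. inspect the resulting flags
```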

View File

@@ -2,11 +2,10 @@
, opencv3, gtest, blas, gomp, llvmPackages, perl
, cudaSupport ? config.cudaSupport or false, cudaPackages ? {}, nvidia_x11
, cudnnSupport ? cudaSupport
, cudaCapabilities ? [ "3.7" "5.0" "6.0" "7.0" "7.5" "8.0" "8.6" ]
}:
let
inherit (cudaPackages) cudatoolkit cudnn;
inherit (cudaPackages) cudatoolkit cudaFlags cudnn;
in
assert cudnnSupport -> cudaSupport;
@@ -51,7 +50,7 @@ stdenv.mkDerivation rec {
"-DUSE_OLDCMAKECUDA=ON" # see https://github.com/apache/incubator-mxnet/issues/10743
"-DCUDA_ARCH_NAME=All"
"-DCUDA_HOST_COMPILER=${cudatoolkit.cc}/bin/cc"
"-DMXNET_CUDA_ARCH=${lib.concatStringsSep ";" cudaCapabilities}"
"-DMXNET_CUDA_ARCH=${cudaFlags.cudaCapabilitiesSemiColonString}"
] else [ "-DUSE_CUDA=OFF" ])
++ lib.optional (!cudnnSupport) "-DUSE_CUDNN=OFF";

View File

@@ -10,6 +10,8 @@ final: prev: let
### Add classic cudatoolkit package
cudatoolkit = buildCudaToolkitPackage ((attrs: attrs // { gcc = prev.pkgs.${attrs.gcc}; }) cudatoolkitVersions.${final.cudaVersion});
cudaFlags = final.callPackage ./flags.nix {};
in {
inherit cudatoolkit;
inherit cudatoolkit cudaFlags;
}

View File

@@ -0,0 +1,78 @@
{ config
, lib
, cudatoolkit
}:
let
# Flags are determined based on your CUDA toolkit by default. You may benefit
# from improved performance, reduced file size, or greater hardware support by
# passing a configuration based on your specific GPU environment.
#
# config.cudaCapabilities: list of hardware generations to support (e.g., "8.0")
# config.cudaForwardCompat: bool for compatibility with future GPU generations
#
# Please see the accompanying documentation or https://github.com/NixOS/nixpkgs/pull/205351
defaultCudaCapabilities = rec {
cuda9 = [
"3.0"
"3.5"
"5.0"
"5.2"
"6.0"
"6.1"
"7.0"
];
cuda10 = cuda9 ++ [
"7.5"
];
cuda11 = [
"3.5"
"5.0"
"5.2"
"6.0"
"6.1"
"7.0"
"7.5"
"8.0"
"8.6"
];
};
cudaMicroarchitectureNames = {
"3" = "Kepler";
"5" = "Maxwell";
"6" = "Pascal";
"7" = "Volta";
"8" = "Ampere";
"9" = "Hopper";
};
defaultCudaArchList = defaultCudaCapabilities."cuda${lib.versions.major cudatoolkit.version}";
cudaRealCapabilities = config.cudaCapabilities or defaultCudaArchList;
capabilitiesForward = "${lib.last cudaRealCapabilities}+PTX";
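# Helpers: dropDot turns "8.6" into "86"; archMapper prefixes a feature name,
# e.g. archMapper "sm" [ "8.6" ] => [ "sm_86" ]; gencodeMapper builds NVCC
# -gencode flags, e.g. gencodeMapper "sm" [ "8.6" ] => [ "-gencode=arch=compute_86,code=sm_86" ].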
dropDot = ver: builtins.replaceStrings ["."] [""] ver;
archMapper = feat: map (ver: "${feat}_${dropDot ver}");
gencodeMapper = feat: map (ver: "-gencode=arch=compute_${dropDot ver},code=${feat}_${dropDot ver}");
cudaRealArchs = archMapper "sm" cudaRealCapabilities;
cudaPTXArchs = archMapper "compute" cudaRealCapabilities;
cudaArchs = cudaRealArchs ++ [ (lib.last cudaPTXArchs) ];
cudaArchNames = lib.unique (map (v: cudaMicroarchitectureNames.${lib.versions.major v}) cudaRealCapabilities);
cudaCapabilities = cudaRealCapabilities ++ lib.optional (config.cudaForwardCompat or true) capabilitiesForward;
# Forward compatibility embeds PTX for the newest real capability. Pass the
# dotted capability (e.g. "8.6") so dropDot yields "compute_86"; passing
# "compute_86" itself would produce the invalid "compute_compute_86".
cudaGencode = gencodeMapper "sm" cudaRealCapabilities ++ lib.optionals (config.cudaForwardCompat or true) (gencodeMapper "compute" [ (lib.last cudaRealCapabilities) ]);
cudaCapabilitiesCommaString = lib.strings.concatStringsSep "," cudaCapabilities;
cudaCapabilitiesSemiColonString = lib.strings.concatStringsSep ";" cudaCapabilities;
cudaRealCapabilitiesCommaString = lib.strings.concatStringsSep "," cudaRealCapabilities;
in
{
inherit cudaArchs cudaArchNames cudaCapabilities cudaCapabilitiesCommaString cudaCapabilitiesSemiColonString
cudaRealCapabilities cudaRealCapabilitiesCommaString cudaGencode cudaRealArchs cudaPTXArchs;
}
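To make the mapping concrete, here is an illustrative evaluation of the
definitions above under an assumed configuration (not part of the commit):

```nix
# Assumed config (illustration only): cudaCapabilities = [ "8.0" "8.6" ],
# cudaForwardCompat = true. The definitions above then evaluate to:
#   cudaRealArchs = [ "sm_80" "sm_86" ]
#   cudaPTXArchs  = [ "compute_80" "compute_86" ]
#   cudaArchs     = [ "sm_80" "sm_86" "compute_86" ]
#   cudaGencode   = [
#     "-gencode=arch=compute_80,code=sm_80"
#     "-gencode=arch=compute_86,code=sm_86"
#     "-gencode=arch=compute_86,code=compute_86"
#   ]
#   cudaArchNames = [ "Ampere" ]
#   cudaCapabilitiesSemiColonString = "8.0;8.6;8.6+PTX"
```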

View File

@@ -1,7 +1,7 @@
{ lib, stdenv, fetchurl, cmake, gfortran, ninja, cudaPackages, libpthreadstubs, lapack, blas }:
let
inherit (cudaPackages) cudatoolkit;
inherit (cudaPackages) cudatoolkit cudaFlags;
in
assert let majorIs = lib.versions.major cudatoolkit.version;
@@ -10,36 +10,6 @@ assert let majorIs = lib.versions.major cudatoolkit.version;
let
version = "2.6.2";
# We define a specific set of CUDA compute capabilities here,
# because CUDA 11 does not support compute capability 3.0. Also,
# we use it to enable newer capabilities that are not enabled
# by magma by default. The list of supported architectures
# can be found in magma's top-level CMakeLists.txt.
cudaCapabilities = rec {
cuda9 = [
"Kepler" # 3.0, 3.5
"Maxwell" # 5.0
"Pascal" # 6.0
"Volta" # 7.0
];
cuda10 = [
"Turing" # 7.5
] ++ cuda9;
cuda11 = [
"sm_35" # sm_30 is not supported by CUDA 11
"Maxwell" # 5.0
"Pascal" # 6.0
"Volta" # 7.0
"Turing" # 7.5
"Ampere" # 8.0
];
};
capabilityString = lib.strings.concatStringsSep ","
cudaCapabilities."cuda${lib.versions.major cudatoolkit.version}";
in stdenv.mkDerivation {
pname = "magma";
inherit version;
@@ -53,7 +23,9 @@ in stdenv.mkDerivation {
buildInputs = [ cudatoolkit libpthreadstubs lapack blas ];
cmakeFlags = [ "-DGPU_TARGET=${capabilityString}" ];
cmakeFlags = [
"-DGPU_TARGET=${builtins.concatStringsSep "," cudaFlags.cudaRealArchs}"
];
doCheck = false;

View File

@@ -41,7 +41,6 @@
, zlib
# CUDA flags:
, cudaCapabilities ? [ "sm_35" "sm_50" "sm_60" "sm_70" "sm_75" "compute_80" ]
, cudaSupport ? false
, cudaPackages ? {}
@@ -50,7 +49,7 @@
}:
let
inherit (cudaPackages) cudatoolkit cudnn nccl;
inherit (cudaPackages) cudatoolkit cudaFlags cudnn nccl;
pname = "jaxlib";
version = "0.3.22";
@@ -165,7 +164,7 @@ let
build --action_env TF_CUDA_PATHS="${cudatoolkit_joined},${cudnn},${nccl}"
build --action_env TF_CUDA_VERSION="${lib.versions.majorMinor cudatoolkit.version}"
build --action_env TF_CUDNN_VERSION="${lib.versions.major cudnn.version}"
build:cuda --action_env TF_CUDA_COMPUTE_CAPABILITIES="${lib.concatStringsSep "," cudaCapabilities}"
build:cuda --action_env TF_CUDA_COMPUTE_CAPABILITIES="${cudaFlags.cudaRealCapabilitiesCommaString}"
'' + ''
CFG
'';

View File

@@ -22,8 +22,6 @@
, tensorboardSupport ? true
# XLA without CUDA is broken
, xlaSupport ? cudaSupport
# Default from ./configure script
, cudaCapabilities ? [ "sm_35" "sm_50" "sm_60" "sm_70" "sm_75" "compute_80" ]
, sse42Support ? stdenv.hostPlatform.sse4_2Support
, avx2Support ? stdenv.hostPlatform.avx2Support
, fmaSupport ? stdenv.hostPlatform.fmaSupport
@@ -32,7 +30,7 @@
}:
let
inherit (cudaPackages) cudatoolkit cudnn nccl;
inherit (cudaPackages) cudatoolkit cudaFlags cudnn nccl;
in
assert cudaSupport -> cudatoolkit != null
@@ -305,7 +303,7 @@ let
TF_CUDA_PATHS = lib.optionalString cudaSupport "${cudatoolkit_joined},${cudnn},${nccl}";
GCC_HOST_COMPILER_PREFIX = lib.optionalString cudaSupport "${cudatoolkit_cc_joined}/bin";
GCC_HOST_COMPILER_PATH = lib.optionalString cudaSupport "${cudatoolkit_cc_joined}/bin/gcc";
TF_CUDA_COMPUTE_CAPABILITIES = lib.concatStringsSep "," cudaCapabilities;
TF_CUDA_COMPUTE_CAPABILITIES = builtins.concatStringsSep "," cudaFlags.cudaRealArchs;
postPatch = ''
# bazel 3.3 should work just as well as bazel 3.1

View File

@@ -3,7 +3,6 @@
mklDnnSupport ? true, useSystemNccl ? true,
MPISupport ? false, mpi,
buildDocs ? false,
cudaArchList ? null,
# Native build inputs
cmake, util-linux, linkFarm, symlinkJoin, which, pybind11, removeReferencesTo,
@@ -33,7 +32,7 @@
isPy3k, pythonOlder }:
let
inherit (cudaPackages) cudatoolkit cudnn nccl;
inherit (cudaPackages) cudatoolkit cudaFlags cudnn nccl;
in
# assert that everything needed for cuda is present and that the correct cuda versions are used
@@ -52,64 +51,6 @@ let
paths = [ cudatoolkit.out cudatoolkit.lib nccl.dev nccl.out ];
};
# Give an explicit list of supported architectures for the build. See:
# - pytorch bug report: https://github.com/pytorch/pytorch/issues/23573
# - pytorch-1.2.0 build on nixpkgs: https://github.com/NixOS/nixpkgs/pull/65041
#
# This list was selected by omitting the TORCH_CUDA_ARCH_LIST parameter,
# observing the fallback option (which selected all architectures known
# from cudatoolkit_10_0, pytorch-1.2, and python-3.6), and doing a binary
# searching to find offending architectures.
#
# NOTE: Because of sandboxing, this derivation can't auto-detect the hardware's
# cuda architecture, so there is also now a problem around new architectures
# not being supported until explicitly added to this derivation.
#
# FIXME: CMake is throwing the following warning on pytorch-1.2:
#
# ```
# CMake Warning at cmake/public/utils.cmake:172 (message):
# In the future we will require one to explicitly pass TORCH_CUDA_ARCH_LIST
# to cmake instead of implicitly setting it as an env variable. This will
# become a FATAL_ERROR in future version of pytorch.
# ```
# If this is causing problems for your build, this derivation may have to strip
# away the standard `buildPythonPackage` and use the
# [*Adjust Build Options*](https://github.com/pytorch/pytorch/tree/v1.2.0#adjust-build-options-optional)
# instructions. This will also add more flexibility around configurations
# (allowing FBGEMM to be built in pytorch-1.1), and may future proof this
# derivation.
brokenArchs = [ "3.0" ]; # this variable is only used as documentation.
cudaCapabilities = rec {
cuda9 = [
"3.5"
"5.0"
"5.2"
"6.0"
"6.1"
"7.0"
"7.0+PTX" # I am getting a "undefined architecture compute_75" on cuda 9
# which leads me to believe this is the final cuda-9-compatible architecture.
];
cuda10 = cuda9 ++ [
"7.5"
"7.5+PTX" # < most recent architecture as of cudatoolkit_10_0 and pytorch-1.2.0
];
cuda11 = cuda10 ++ [
"8.0"
"8.0+PTX" # < CUDA toolkit 11.0
"8.6"
"8.6+PTX" # < CUDA toolkit 11.1
];
};
final_cudaArchList =
if !cudaSupport || cudaArchList != null
then cudaArchList
else cudaCapabilities."cuda${lib.versions.major cudatoolkit.version}";
# Normally libcuda.so.1 is provided at runtime by nvidia-x11 via
# LD_LIBRARY_PATH=/run/opengl-driver/lib. We only use the stub
# libcuda.so from cudatoolkit for running tests, so that we don't have
@@ -153,7 +94,7 @@ in buildPythonPackage rec {
];
preConfigure = lib.optionalString cudaSupport ''
export TORCH_CUDA_ARCH_LIST="${lib.strings.concatStringsSep ";" final_cudaArchList}"
export TORCH_CUDA_ARCH_LIST="${cudaFlags.cudaCapabilitiesSemiColonString}"
export CC=${cudatoolkit.cc}/bin/gcc CXX=${cudatoolkit.cc}/bin/g++
'' + lib.optionalString (cudaSupport && cudnn != null) ''
export CUDNN_INCLUDE_DIR=${cudnn}/include
@@ -308,7 +249,6 @@ in buildPythonPackage rec {
passthru = {
inherit cudaSupport cudaPackages;
cudaArchList = final_cudaArchList;
# At least for 1.10.2 `torch.fft` is unavailable unless BLAS provider is MKL. This attribute allows for easy detection of its availability.
blasProvider = blas.provider;
};

View File

@@ -15,7 +15,7 @@
}:
let
inherit (torch.cudaPackages) cudatoolkit cudnn;
inherit (torch.cudaPackages) cudatoolkit cudaFlags cudnn;
cudatoolkit_joined = symlinkJoin {
name = "${cudatoolkit.name}-unsplit";
@@ -45,7 +45,7 @@ in buildPythonPackage rec {
propagatedBuildInputs = [ numpy pillow torch scipy ];
preBuild = lib.optionalString cudaSupport ''
export TORCH_CUDA_ARCH_LIST="${cudaArchStr}"
export TORCH_CUDA_ARCH_LIST="${cudaFlags.cudaCapabilitiesSemiColonString}"
export FORCE_CUDA=1
'';