[clang] [llvm] [CUDA] Add a pseudo GPU sm_next which allows overriding for SM/PTX version. (PR #100247)

Tue Jul 23 15:21:20 PDT 2024

llvmbot wrote:



@llvm/pr-subscribers-clang-driver

@llvm/pr-subscribers-clang

Author: Artem Belevich (Artem-B)

<details>
<summary>Changes</summary>

Sometimes users need to use an older clang with newer SM/PTX versions that clang does not know about yet.

--offload-arch=sm_next, combined with --cuda-next-sm=X and --cuda-next-ptx=Y, passes the specified SM and PTX versions through to ptxas, which may or may not be able to make sense of them. It is up to the user to figure out which values they need to use to make it work.

The feature is intended as a stop-gap workaround for situations where clang has not yet caught up with newer CUDA SDK releases. There are no guarantees that it will work with any given combination of clang/CUDA/SM/PTX versions. YMMV.

---
Full diff: https://github.com/llvm/llvm-project/pull/100247.diff


11 Files Affected:

- (modified) clang/include/clang/Basic/Cuda.h (+1) 
- (modified) clang/include/clang/Basic/LangOptions.def (+2) 
- (modified) clang/include/clang/Driver/Options.td (+11) 
- (modified) clang/lib/Basic/Cuda.cpp (+7-2) 
- (modified) clang/lib/Basic/Targets/NVPTX.cpp (+5-1) 
- (modified) clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp (+1) 
- (modified) clang/lib/Driver/ToolChains/Cuda.cpp (+58-26) 
- (added) clang/test/Driver/cuda-sm_next.cu (+66) 
- (modified) clang/test/Misc/target-invalid-cpu-note.c (+1-1) 
- (modified) llvm/lib/Target/NVPTX/NVPTX.td (+4) 
- (modified) llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp (+25-10) 


``````````diff

diff --git a/clang/include/clang/Basic/Cuda.h b/clang/include/clang/Basic/Cuda.h
index 83699f8897f66..a81d185d78cbe 100644
--- a/clang/include/clang/Basic/Cuda.h
+++ b/clang/include/clang/Basic/Cuda.h
@@ -78,6 +78,7 @@ enum class OffloadArch {
   SM_89,
   SM_90,
   SM_90a,
+  SM_next,
   GFX600,
   GFX601,
   GFX602,
diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def
index 834a6f6cd43e3..c1901429e11fc 100644
--- a/clang/include/clang/Basic/LangOptions.def
+++ b/clang/include/clang/Basic/LangOptions.def
@@ -287,6 +287,8 @@ LANGOPT(HLSLStrictAvailability, 1, 0,
 LANGOPT(CUDAIsDevice      , 1, 0, "compiling for CUDA device")
 LANGOPT(CUDAAllowVariadicFunctions, 1, 0, "allowing variadic functions in CUDA device code")
 LANGOPT(CUDAHostDeviceConstexpr, 1, 1, "treating unattributed constexpr functions as __host__ __device__")
+LANGOPT(CUDANextSM, 32, 0, "SM version for sm_next target")
+LANGOPT(CUDANextPTX, 32, 0, "PTX version for sm_next target")
 LANGOPT(GPUDeviceApproxTranscendentals, 1, 0, "using approximate transcendental functions")
 LANGOPT(GPURelocatableDeviceCode, 1, 0, "generate relocatable device code")
 LANGOPT(OffloadImplicitHostDeviceTemplates, 1, 0, "assume template functions to be implicitly host device by default for CUDA/HIP")
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index fa36405ec1bdd..9bba185b218cb 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -1458,6 +1458,17 @@ def fno_hip_emit_relocatable : Flag<["-"], "fno-hip-emit-relocatable">,
   HelpText<"Do not override toolchain to compile HIP source to relocatable">;
 }
 
+def cuda_next_sm_EQ : Joined<["--"], "cuda-next-sm=">,
+  Visibility<[ClangOption, CC1Option]>,
+  HelpText<"SM version to use for sm_next GPU">,
+  MarshallingInfoInt<LangOpts<"CUDANextSM">, "0">,
+  ShouldParseIf<cuda.KeyPath>,Flags<[HelpHidden]>;
+def cuda_next_ptx_EQ : Joined<["--"], "cuda-next-ptx=">,
+  Visibility<[ClangOption, CC1Option]>,
+  HelpText<"PTX version to use for sm_next GPU">,
+  MarshallingInfoInt<LangOpts<"CUDANextPTX">, "0">,
+  ShouldParseIf<cuda.KeyPath>,Flags<[HelpHidden]>;
+
 // Clang specific/exclusive options for OpenACC.
 def openacc_macro_override
     : Separate<["-"], "fexperimental-openacc-macro-override">,
diff --git a/clang/lib/Basic/Cuda.cpp b/clang/lib/Basic/Cuda.cpp
index faf3878f064d2..7f50b58aeca83 100644
--- a/clang/lib/Basic/Cuda.cpp
+++ b/clang/lib/Basic/Cuda.cpp
@@ -79,9 +79,11 @@ struct OffloadArchToStringMap {
 };
 } // namespace
 
-#define SM2(sm, ca) {OffloadArch::SM_##sm, "sm_" #sm, ca}
+#define SM2(sm, ca)                                                            \
+  { OffloadArch::SM_##sm, "sm_" #sm, ca }
 #define SM(sm) SM2(sm, "compute_" #sm)
-#define GFX(gpu) {OffloadArch::GFX##gpu, "gfx" #gpu, "compute_amdgcn"}
+#define GFX(gpu)                                                               \
+  { OffloadArch::GFX##gpu, "gfx" #gpu, "compute_amdgcn" }
 static const OffloadArchToStringMap arch_names[] = {
     // clang-format off
     {OffloadArch::UNUSED, "", ""},
@@ -96,6 +98,7 @@ static const OffloadArchToStringMap arch_names[] = {
     SM(89),                          // Ada Lovelace
     SM(90),                          // Hopper
     SM(90a),                         // Hopper
+    SM(next),                        // Placeholder for a new arch.
     GFX(600),  // gfx600
     GFX(601),  // gfx601
     GFX(602),  // gfx602
@@ -221,6 +224,8 @@ CudaVersion MinVersionForOffloadArch(OffloadArch A) {
     return CudaVersion::CUDA_118;
   case OffloadArch::SM_90a:
     return CudaVersion::CUDA_120;
+  case clang::OffloadArch::SM_next:
+    return CudaVersion::UNKNOWN;
   default:
     llvm_unreachable("invalid enum");
   }
diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp
index 43b653dc52ce0..f4096a19af38e 100644
--- a/clang/lib/Basic/Targets/NVPTX.cpp
+++ b/clang/lib/Basic/Targets/NVPTX.cpp
@@ -13,8 +13,10 @@
 #include "NVPTX.h"
 #include "Targets.h"
 #include "clang/Basic/Builtins.h"
+#include "clang/Basic/Cuda.h"
 #include "clang/Basic/MacroBuilder.h"
 #include "clang/Basic/TargetBuiltins.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringSwitch.h"
 
 using namespace clang;
@@ -180,7 +182,7 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts,
 
   if (Opts.CUDAIsDevice || Opts.OpenMPIsTargetDevice || !HostTarget) {
     // Set __CUDA_ARCH__ for the GPU specified.
-    std::string CUDAArchCode = [this] {
+    std::string CUDAArchCode = [&]() -> std::string {
       switch (GPU) {
       case OffloadArch::GFX600:
       case OffloadArch::GFX601:
@@ -281,6 +283,8 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts,
       case OffloadArch::SM_90:
       case OffloadArch::SM_90a:
         return "900";
+      case OffloadArch::SM_next:
+        return llvm::itostr(Opts.CUDANextSM * 10);
       }
       llvm_unreachable("unhandled OffloadArch");
     }();
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index f5bd4a141cc2d..a9a2e0bd3c758 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -2276,6 +2276,7 @@ void CGOpenMPRuntimeGPU::processRequiresDirective(const OMPRequiresDecl *D) {
       case OffloadArch::SM_89:
       case OffloadArch::SM_90:
       case OffloadArch::SM_90a:
+      case OffloadArch::SM_next:
       case OffloadArch::GFX600:
       case OffloadArch::GFX601:
       case OffloadArch::GFX602:
diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp
index 61d12b10dfb62..cdf7c14fa407c 100644
--- a/clang/lib/Driver/ToolChains/Cuda.cpp
+++ b/clang/lib/Driver/ToolChains/Cuda.cpp
@@ -116,6 +116,14 @@ CudaVersion parseCudaHFile(llvm::StringRef Input) {
   }
   return CudaVersion::UNKNOWN;
 }
+
+std::string getSMNext(const llvm::opt::ArgList &DriverArgs) {
+  return DriverArgs
+      .getLastArgValue(
+          options::OPT_cuda_next_sm_EQ,
+          StringRef(OffloadArchToString(OffloadArch::CudaDefault)).substr(3))
+      .str(); // Strip leading "sm_" from the GPU variant name.
+}
 } // namespace
 
 void CudaInstallationDetector::WarnIfUnsupportedVersion() {
@@ -457,7 +465,9 @@ void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA,
     CmdArgs.push_back("-v");
 
   CmdArgs.push_back("--gpu-name");
-  CmdArgs.push_back(Args.MakeArgString(OffloadArchToString(gpu_arch)));
+  CmdArgs.push_back(Args.MakeArgString(gpu_arch == OffloadArch::SM_next
+                                           ? "sm_" + getSMNext(Args)
+                                           : OffloadArchToString(gpu_arch)));
   CmdArgs.push_back("--output-file");
   std::string OutputFileName = TC.getInputFilename(Output);
 
@@ -648,6 +658,13 @@ void NVPTX::getNVPTXTargetFeatures(const Driver &D, const llvm::Triple &Triple,
     Features.push_back(Args.MakeArgString(PtxFeature));
     return;
   }
+  // Add --cuda-next-ptx to the list of features, but carry on to add the
+  // default PTX feature for the detected CUDA SDK. NVPTX back-end will use the
+  // higher version.
+  StringRef NextPtx = Args.getLastArgValue(options::OPT_cuda_next_ptx_EQ);
+  if (!NextPtx.empty())
+    Features.push_back(Args.MakeArgString("+ptx" + NextPtx));
+
   CudaInstallationDetector CudaInstallation(D, Triple, Args);
 
   // New CUDA versions often introduce new instructions that are only supported
@@ -840,47 +857,62 @@ void CudaToolChain::addClangTargetOptions(
       CC1Args.push_back("-fcuda-allow-variadic-functions");
   }
 
-  if (DriverArgs.hasArg(options::OPT_nogpulib))
-    return;
-
   if (DeviceOffloadingKind == Action::OFK_OpenMP &&
       DriverArgs.hasArg(options::OPT_S))
     return;
 
-  std::string LibDeviceFile = CudaInstallation.getLibDeviceFile(GpuArch);
-  if (LibDeviceFile.empty()) {
-    getDriver().Diag(diag::err_drv_no_cuda_libdevice) << GpuArch;
-    return;
-  }
+  clang::CudaVersion CudaInstallationVersion = CudaInstallation.version();
+
+  if (!DriverArgs.hasArg(options::OPT_nogpulib)) {
+    std::string LibDeviceFile = CudaInstallation.getLibDeviceFile(GpuArch);
+    if (LibDeviceFile.empty()) {
+      getDriver().Diag(diag::err_drv_no_cuda_libdevice) << GpuArch;
+      return;
+    }
 
-  CC1Args.push_back("-mlink-builtin-bitcode");
-  CC1Args.push_back(DriverArgs.MakeArgString(LibDeviceFile));
+    CC1Args.push_back("-mlink-builtin-bitcode");
+    CC1Args.push_back(DriverArgs.MakeArgString(LibDeviceFile));
 
-  clang::CudaVersion CudaInstallationVersion = CudaInstallation.version();
+    if (DeviceOffloadingKind == Action::OFK_OpenMP) {
+      if (CudaInstallationVersion < CudaVersion::CUDA_92) {
+        getDriver().Diag(
+            diag::err_drv_omp_offload_target_cuda_version_not_support)
+            << CudaVersionToString(CudaInstallationVersion);
+        return;
+      }
+
+      // Link the bitcode library late if we're using device LTO.
+      if (getDriver().isUsingLTO(/* IsOffload */ true))
+        return;
+
+      addOpenMPDeviceRTL(getDriver(), DriverArgs, CC1Args, GpuArch.str(),
+                         getTriple(), HostTC);
+    }
+  }
 
   if (DriverArgs.hasFlag(options::OPT_fcuda_short_ptr,
                          options::OPT_fno_cuda_short_ptr, false))
     CC1Args.append({"-mllvm", "--nvptx-short-ptr"});
 
-  if (CudaInstallationVersion >= CudaVersion::UNKNOWN)
+  if (CudaInstallation.isValid() &&
+      CudaInstallationVersion > CudaVersion::UNKNOWN)
     CC1Args.push_back(
         DriverArgs.MakeArgString(Twine("-target-sdk-version=") +
                                  CudaVersionToString(CudaInstallationVersion)));
 
-  if (DeviceOffloadingKind == Action::OFK_OpenMP) {
-    if (CudaInstallationVersion < CudaVersion::CUDA_92) {
-      getDriver().Diag(
-          diag::err_drv_omp_offload_target_cuda_version_not_support)
-          << CudaVersionToString(CudaInstallationVersion);
-      return;
-    }
-
-    // Link the bitcode library late if we're using device LTO.
-    if (getDriver().isUsingLTO(/* IsOffload */ true))
-      return;
+  std::string NextSM = getSMNext(DriverArgs);
+  if (!NextSM.empty()) {
+    CC1Args.push_back(DriverArgs.MakeArgStringRef("--cuda-next-sm=" + NextSM));
+    CC1Args.append(
+        {"-mllvm", DriverArgs.MakeArgString(("--nvptx-next-sm=" + NextSM))});
+  }
 
-    addOpenMPDeviceRTL(getDriver(), DriverArgs, CC1Args, GpuArch.str(),
-                       getTriple(), HostTC);
+  StringRef NextPTX = DriverArgs.getLastArgValue(options::OPT_cuda_next_ptx_EQ);
+  if (!NextPTX.empty()) {
+    CC1Args.push_back(
+        DriverArgs.MakeArgStringRef(("--cuda-next-ptx=" + NextPTX).str()));
+    CC1Args.append({"-mllvm", DriverArgs.MakeArgString(
+                                  ("--nvptx-next-ptx=" + NextPTX).str())});
   }
 }
 
diff --git a/clang/test/Driver/cuda-sm_next.cu b/clang/test/Driver/cuda-sm_next.cu
new file mode 100644
index 0000000000000..379dcb297ae1a
--- /dev/null
+++ b/clang/test/Driver/cuda-sm_next.cu
@@ -0,0 +1,66 @@
+// Tests CUDA compilation targeting sm_next
+
+// CC1 options level check.
+// Check that by default we only supply sm_next CPU info without explicitly 
+// overriding SM/PTX versions, and letting LLVM pick the defaults.
+// RUN: %clang -### -c --target=x86_64-linux-gnu --cuda-device-only  \
+// RUN:    --cuda-path=%S/Inputs/CUDA_111/usr/local/cuda %s \
+// RUN:    --cuda-gpu-arch=sm_next  2>&1 \
+// RUN:   | FileCheck -check-prefixes=ARGS-COMMON,ARGS-ARCH %s
+//
+// Same, with explicitly set sm and PTX versions.
+// RUN: %clang -### -c --target=x86_64-linux-gnu --cuda-device-only \
+// RUN:    --cuda-path=%S/Inputs/CUDA_111/usr/local/cuda %s \
+// RUN:    --cuda-gpu-arch=sm_next --cuda-next-sm=111 --cuda-next-ptx=222  2>&1 \
+// RUN:   | FileCheck -check-prefixes=ARGS-COMMON,ARGS-OVERRIDE %s
+
+// Preprocessor level checks.
+// RUN: %clang -dD -E --target=x86_64-linux-gnu --cuda-device-only -nocudainc \
+// RUN:    --cuda-path=%S/Inputs/CUDA_111/usr/local/cuda %s \
+// RUN:    --cuda-gpu-arch=sm_next  2>&1 \
+// RUN:   | FileCheck -check-prefixes=PP-COMMON,PP-ARCH %s
+//
+// Same, with explicitly set sm and PTX versions.
+// RUN: %clang -dD -E --target=x86_64-linux-gnu --cuda-device-only -nocudainc \
+// RUN:    --cuda-path=%S/Inputs/CUDA_111/usr/local/cuda %s \
+// RUN:    --cuda-gpu-arch=sm_next --cuda-next-sm=111 --cuda-next-ptx=222  2>&1 \
+// RUN:   | FileCheck -check-prefixes=PP-COMMON,PP-OVERRIDE %s
+
+// PTX level checks. 
+// RUN: %clang -S --target=x86_64-linux-gnu --cuda-device-only -nocudainc -nocudalib \
+// RUN:    --cuda-path=%S/Inputs/CUDA_111/usr/local/cuda -o - %s \
+// RUN:    --cuda-gpu-arch=sm_next  2>&1 \
+// RUN:   | FileCheck -check-prefixes=PTX-ARCH %s
+//
+// Same, with explicitly set sm and PTX versions.
+// RUN: %clang -S --target=x86_64-linux-gnu --cuda-device-only -nocudainc -nocudalib \
+// RUN:      --cuda-path=%S/Inputs/CUDA_111/usr/local/cuda -o - %s \
+// RUN:      --cuda-gpu-arch=sm_next --cuda-next-sm=111 --cuda-next-ptx=222  2>&1 \
+// RUN:   | FileCheck -check-prefixes=PTX-OVERRIDE %s
+
+
+// ARGS-COMMON: "-cc1" "-triple" "nvptx64-nvidia-cuda"
+// We need to pass specific SM version to CC1, so that preprocessor can set __CUDA_ARCH__ macro
+// ARGS-ARCH-SAME: "--cuda-next-sm=52" "-mllvm" "--nvptx-next-sm=52"
+// .. but we do not explicitly set PTX version, and carry on with the default for the detected CUDA SDK.
+// ARGS-ARCH-NOT: --cuda-next-ptx=
+// ARGS-ARCH-NOT: --nvptx-next-ptx=
+// When we override SM and PTX versions, we explicitly set them for both clang and LLVM.
+// ARGS-OVERRIDE-SAME: "--cuda-next-sm=111" "-mllvm" "--nvptx-next-sm=111" "--cuda-next-ptx=222" "-mllvm" "--nvptx-next-ptx=222"
+// ARGS-COMMON-SAME: "-target-cpu" "sm_next"
+// ARGS-COMMON-SAME: "-target-feature" "+ptx71"
+// ARGS-COMMON-NEXT: ptxas
+// ARGS-ARCH-SAME: "--gpu-name" "sm_52"
+// ARGS-OVERRIDE-SAME: "--gpu-name" "sm_111"
+//
+//
+// PP-COMMON:  #define __NVPTX__ 1
+// PP-ARCH: #define __CUDA_ARCH__ 520
+// PP-OVERRIDE: #define __CUDA_ARCH__  1110
+//
+//
+// PTX-ARCH:  .version 8.5
+// PTX-ARCH:  .target sm_52
+// PTX-OVERRIDE:  .version 22.2
+// PTX-OVERRIDE:  .target sm_111
+
diff --git a/clang/test/Misc/target-invalid-cpu-note.c b/clang/test/Misc/target-invalid-cpu-note.c
index 4d6759dd81537..9c96153877d1c 100644
--- a/clang/test/Misc/target-invalid-cpu-note.c
+++ b/clang/test/Misc/target-invalid-cpu-note.c
@@ -29,7 +29,7 @@
 
 // RUN: not %clang_cc1 -triple nvptx--- -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix NVPTX
 // NVPTX: error: unknown target CPU 'not-a-cpu'
-// NVPTX-NEXT: note: valid target CPU values are: sm_20, sm_21, sm_30, sm_32, sm_35, sm_37, sm_50, sm_52, sm_53, sm_60, sm_61, sm_62, sm_70, sm_72, sm_75, sm_80, sm_86, sm_87, sm_89, sm_90, sm_90a, gfx600, gfx601, gfx602, gfx700, gfx701, gfx702, gfx703, gfx704, gfx705, gfx801, gfx802, gfx803, gfx805, gfx810, gfx9-generic, gfx900, gfx902, gfx904, gfx906, gfx908, gfx909, gfx90a, gfx90c, gfx940, gfx941, gfx942, gfx10-1-generic, gfx1010, gfx1011, gfx1012, gfx1013, gfx10-3-generic, gfx1030, gfx1031, gfx1032, gfx1033, gfx1034, gfx1035, gfx1036, gfx11-generic, gfx1100, gfx1101, gfx1102, gfx1103, gfx1150, gfx1151, gfx1152, gfx12-generic, gfx1200, gfx1201, amdgcnspirv{{$}}
+// NVPTX-NEXT: note: valid target CPU values are: sm_20, sm_21, sm_30, sm_32, sm_35, sm_37, sm_50, sm_52, sm_53, sm_60, sm_61, sm_62, sm_70, sm_72, sm_75, sm_80, sm_86, sm_87, sm_89, sm_90, sm_90a, sm_next, gfx600, gfx601, gfx602, gfx700, gfx701, gfx702, gfx703, gfx704, gfx705, gfx801, gfx802, gfx803, gfx805, gfx810, gfx9-generic, gfx900, gfx902, gfx904, gfx906, gfx908, gfx909, gfx90a, gfx90c, gfx940, gfx941, gfx942, gfx10-1-generic, gfx1010, gfx1011, gfx1012, gfx1013, gfx10-3-generic, gfx1030, gfx1031, gfx1032, gfx1033, gfx1034, gfx1035, gfx1036, gfx11-generic, gfx1100, gfx1101, gfx1102, gfx1103, gfx1150, gfx1151, gfx1152, gfx12-generic, gfx1200, gfx1201, amdgcnspirv{{$}}
 
 // RUN: not %clang_cc1 -triple r600--- -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix R600
 // R600: error: unknown target CPU 'not-a-cpu'
diff --git a/llvm/lib/Target/NVPTX/NVPTX.td b/llvm/lib/Target/NVPTX/NVPTX.td
index bb4549a5e6078..d55dc0194c6fc 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.td
+++ b/llvm/lib/Target/NVPTX/NVPTX.td
@@ -39,12 +39,15 @@ foreach sm = [20, 21, 30, 32, 35, 37, 50, 52, 53,
   def SM#sm: FeatureSM<""#sm, !mul(sm, 10)>;
 
 def SM90a: FeatureSM<"90a", 901>;
+def SMnext: FeatureSM<"next", 9999>; // Placeholder for an unknown future version
 
 foreach version = [32, 40, 41, 42, 43, 50, 60, 61, 62, 63, 64, 65,
                    70, 71, 72, 73, 74, 75, 76, 77, 78,
                    80, 81, 82, 83, 84, 85] in
   def PTX#version: FeaturePTX<version>;
 
+def PTXnext: FeaturePTX<9999>; // Placeholder for an unknown future version.
+
 //===----------------------------------------------------------------------===//
 // NVPTX supported processors.
 //===----------------------------------------------------------------------===//
@@ -73,6 +76,7 @@ def : Proc<"sm_87", [SM87, PTX74]>;
 def : Proc<"sm_89", [SM89, PTX78]>;
 def : Proc<"sm_90", [SM90, PTX78]>;
 def : Proc<"sm_90a", [SM90a, PTX80]>;
+def : Proc<"sm_next", [SMnext, PTXnext]>;
 
 def NVPTXInstrInfo : InstrInfo {
 }
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
index 420065585b384..594cb094e13ae 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
@@ -12,6 +12,7 @@
 
 #include "NVPTXSubtarget.h"
 #include "NVPTXTargetMachine.h"
+#include "llvm/ADT/StringExtras.h"
 
 using namespace llvm;
 
@@ -26,24 +27,38 @@ static cl::opt<bool>
     NoF16Math("nvptx-no-f16-math", cl::Hidden,
               cl::desc("NVPTX Specific: Disable generation of f16 math ops."),
               cl::init(false));
+static cl::opt<unsigned>
+    NextSM("nvptx-next-sm", cl::Hidden,
+           cl::desc("NVPTX Specific: Override SM ID for sm_next."),
+           cl::init(90));
+static cl::opt<unsigned>
+    NextPTX("nvptx-next-ptx", cl::Hidden,
+            cl::desc("NVPTX Specific: Override PTX version for sm_next."),
+            cl::init(85));
+
 // Pin the vtable to this file.
 void NVPTXSubtarget::anchor() {}
 
 NVPTXSubtarget &NVPTXSubtarget::initializeSubtargetDependencies(StringRef CPU,
                                                                 StringRef FS) {
-    // Provide the default CPU if we don't have one.
-    TargetName = std::string(CPU.empty() ? "sm_30" : CPU);
+  // Provide the default CPU if we don't have one.
+  TargetName = std::string(CPU.empty() ? "sm_30" : CPU);
 
-    ParseSubtargetFeatures(TargetName, /*TuneCPU*/ TargetName, FS);
+  ParseSubtargetFeatures(TargetName, /*TuneCPU*/ TargetName, FS);
+  if (TargetName == "sm_next") {
+    TargetName = "sm_" + itostr(NextSM);
+    FullSmVersion = NextSM * 10;
+    PTXVersion = NextPTX;
+  }
 
-    // Re-map SM version numbers, SmVersion carries the regular SMs which do
-    // have relative order, while FullSmVersion allows distinguishing sm_90 from
-    // sm_90a, which would *not* be a subset of sm_91.
-    SmVersion = getSmVersion();
+  // Re-map SM version numbers, SmVersion carries the regular SMs which do
+  // have relative order, while FullSmVersion allows distinguishing sm_90 from
+  // sm_90a, which would *not* be a subset of sm_91.
+  SmVersion = getSmVersion();
 
-    // Set default to PTX 6.0 (CUDA 9.0)
-    if (PTXVersion == 0) {
-      PTXVersion = 60;
+  // Set default to PTX 6.0 (CUDA 9.0)
+  if (PTXVersion == 0) {
+    PTXVersion = 60;
   }
 
   return *this;

``````````

</details>


https://github.com/llvm/llvm-project/pull/100247