[clang] [llvm] [CUDA] Add a pseudo GPU sm_next which allows overriding the SM/PTX version. (PR #100247)

Artem Belevich via llvm-commits llvm-commits at lists.llvm.org
Mon Aug 12 13:37:49 PDT 2024


https://github.com/Artem-B updated https://github.com/llvm/llvm-project/pull/100247

>From 44a1045eee71777fa916e2a8043b2f99afc96a96 Mon Sep 17 00:00:00 2001
From: Artem Belevich <tra at google.com>
Date: Thu, 18 Jul 2024 15:05:01 -0700
Subject: [PATCH 1/4] [CUDA] Add a pseudo GPU sm_next which allows overrides
 for SM/PTX versions.

Sometimes users need to use an older clang with newer SM/PTX versions
that clang does not know anything about yet.

--offload-arch=sm_next, combined with --cuda-next-sm=X and --cuda-next-ptx=Y,
allows passing the specified SM and PTX versions through to ptxas,
which may be able to make sense of them. Or not, but it is up to the user
to figure out which values they need to make things work.

The feature is intended as a stop-gap workaround for situations where
clang has not yet caught up to newer CUDA SDK releases.
There are no guarantees that it will work with any given combination of
clang/CUDA/SM/PTX versions. YMMV.
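
A hypothetical invocation would look like this (the SM/PTX values and the
kernel.cu file name are made up for illustration; whether ptxas accepts the
numbers depends on the installed CUDA SDK):

  clang++ --offload-arch=sm_next --cuda-next-sm=100 --cuda-next-ptx=86 -c kernel.cu

This compiles for the pseudo GPU sm_next, defines __CUDA_ARCH__ from the
overridden SM value (1000 in this case), and passes "--gpu-name sm_100"
to ptxas.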
---
 clang/include/clang/Basic/Cuda.h          |  1 +
 clang/include/clang/Basic/LangOptions.def |  2 +
 clang/include/clang/Driver/Options.td     | 11 +++
 clang/lib/Basic/Cuda.cpp                  |  9 ++-
 clang/lib/Basic/Targets/NVPTX.cpp         |  6 +-
 clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp  |  1 +
 clang/lib/Driver/ToolChains/Cuda.cpp      | 84 ++++++++++++++++-------
 clang/test/Driver/cuda-sm_next.cu         | 66 ++++++++++++++++++
 clang/test/Misc/target-invalid-cpu-note.c |  2 +-
 llvm/lib/Target/NVPTX/NVPTX.td            |  4 ++
 llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp  | 35 +++++++---
 11 files changed, 181 insertions(+), 40 deletions(-)
 create mode 100644 clang/test/Driver/cuda-sm_next.cu

diff --git a/clang/include/clang/Basic/Cuda.h b/clang/include/clang/Basic/Cuda.h
index 83699f8897f663..a81d185d78cbee 100644
--- a/clang/include/clang/Basic/Cuda.h
+++ b/clang/include/clang/Basic/Cuda.h
@@ -78,6 +78,7 @@ enum class OffloadArch {
   SM_89,
   SM_90,
   SM_90a,
+  SM_next,
   GFX600,
   GFX601,
   GFX602,
diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def
index 834a6f6cd43e32..c1901429e11fc8 100644
--- a/clang/include/clang/Basic/LangOptions.def
+++ b/clang/include/clang/Basic/LangOptions.def
@@ -287,6 +287,8 @@ LANGOPT(HLSLStrictAvailability, 1, 0,
 LANGOPT(CUDAIsDevice      , 1, 0, "compiling for CUDA device")
 LANGOPT(CUDAAllowVariadicFunctions, 1, 0, "allowing variadic functions in CUDA device code")
 LANGOPT(CUDAHostDeviceConstexpr, 1, 1, "treating unattributed constexpr functions as __host__ __device__")
+LANGOPT(CUDANextSM, 32, 0, "SM version for sm_next target")
+LANGOPT(CUDANextPTX, 32, 0, "PTX version for sm_next target")
 LANGOPT(GPUDeviceApproxTranscendentals, 1, 0, "using approximate transcendental functions")
 LANGOPT(GPURelocatableDeviceCode, 1, 0, "generate relocatable device code")
 LANGOPT(OffloadImplicitHostDeviceTemplates, 1, 0, "assume template functions to be implicitly host device by default for CUDA/HIP")
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index fa36405ec1bddf..9bba185b218cb1 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -1458,6 +1458,17 @@ def fno_hip_emit_relocatable : Flag<["-"], "fno-hip-emit-relocatable">,
   HelpText<"Do not override toolchain to compile HIP source to relocatable">;
 }
 
+def cuda_next_sm_EQ : Joined<["--"], "cuda-next-sm=">,
+  Visibility<[ClangOption, CC1Option]>,
+  HelpText<"SM version to use for sm_next GPU">,
+  MarshallingInfoInt<LangOpts<"CUDANextSM">, "0">,
+  ShouldParseIf<cuda.KeyPath>,Flags<[HelpHidden]>;
+def cuda_next_ptx_EQ : Joined<["--"], "cuda-next-ptx=">,
+  Visibility<[ClangOption, CC1Option]>,
+  HelpText<"SM version to use for sm_next GPU">,
+  MarshallingInfoInt<LangOpts<"CUDANextPTX">, "0">,
+  ShouldParseIf<cuda.KeyPath>,Flags<[HelpHidden]>;
+
 // Clang specific/exclusive options for OpenACC.
 def openacc_macro_override
     : Separate<["-"], "fexperimental-openacc-macro-override">,
diff --git a/clang/lib/Basic/Cuda.cpp b/clang/lib/Basic/Cuda.cpp
index faf3878f064d20..7f50b58aeca833 100644
--- a/clang/lib/Basic/Cuda.cpp
+++ b/clang/lib/Basic/Cuda.cpp
@@ -79,9 +79,11 @@ struct OffloadArchToStringMap {
 };
 } // namespace
 
-#define SM2(sm, ca) {OffloadArch::SM_##sm, "sm_" #sm, ca}
+#define SM2(sm, ca)                                                            \
+  { OffloadArch::SM_##sm, "sm_" #sm, ca }
 #define SM(sm) SM2(sm, "compute_" #sm)
-#define GFX(gpu) {OffloadArch::GFX##gpu, "gfx" #gpu, "compute_amdgcn"}
+#define GFX(gpu)                                                               \
+  { OffloadArch::GFX##gpu, "gfx" #gpu, "compute_amdgcn" }
 static const OffloadArchToStringMap arch_names[] = {
     // clang-format off
     {OffloadArch::UNUSED, "", ""},
@@ -96,6 +98,7 @@ static const OffloadArchToStringMap arch_names[] = {
     SM(89),                          // Ada Lovelace
     SM(90),                          // Hopper
     SM(90a),                         // Hopper
+    SM(next),                        // Placeholder for a new arch.
     GFX(600),  // gfx600
     GFX(601),  // gfx601
     GFX(602),  // gfx602
@@ -221,6 +224,8 @@ CudaVersion MinVersionForOffloadArch(OffloadArch A) {
     return CudaVersion::CUDA_118;
   case OffloadArch::SM_90a:
     return CudaVersion::CUDA_120;
+  case clang::OffloadArch::SM_next:
+    return CudaVersion::UNKNOWN;
   default:
     llvm_unreachable("invalid enum");
   }
diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp
index 43b653dc52ce0d..f4096a19af38ea 100644
--- a/clang/lib/Basic/Targets/NVPTX.cpp
+++ b/clang/lib/Basic/Targets/NVPTX.cpp
@@ -13,8 +13,10 @@
 #include "NVPTX.h"
 #include "Targets.h"
 #include "clang/Basic/Builtins.h"
+#include "clang/Basic/Cuda.h"
 #include "clang/Basic/MacroBuilder.h"
 #include "clang/Basic/TargetBuiltins.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringSwitch.h"
 
 using namespace clang;
@@ -180,7 +182,7 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts,
 
   if (Opts.CUDAIsDevice || Opts.OpenMPIsTargetDevice || !HostTarget) {
     // Set __CUDA_ARCH__ for the GPU specified.
-    std::string CUDAArchCode = [this] {
+    std::string CUDAArchCode = [&]() -> std::string {
       switch (GPU) {
       case OffloadArch::GFX600:
       case OffloadArch::GFX601:
@@ -281,6 +283,8 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts,
       case OffloadArch::SM_90:
       case OffloadArch::SM_90a:
         return "900";
+      case OffloadArch::SM_next:
+        return llvm::itostr(Opts.CUDANextSM * 10);
       }
       llvm_unreachable("unhandled OffloadArch");
     }();
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index f5bd4a141cc2d7..a9a2e0bd3c7587 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -2276,6 +2276,7 @@ void CGOpenMPRuntimeGPU::processRequiresDirective(const OMPRequiresDecl *D) {
       case OffloadArch::SM_89:
       case OffloadArch::SM_90:
       case OffloadArch::SM_90a:
+      case OffloadArch::SM_next:
       case OffloadArch::GFX600:
       case OffloadArch::GFX601:
       case OffloadArch::GFX602:
diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp
index 61d12b10dfb62b..cdf7c14fa407cf 100644
--- a/clang/lib/Driver/ToolChains/Cuda.cpp
+++ b/clang/lib/Driver/ToolChains/Cuda.cpp
@@ -116,6 +116,14 @@ CudaVersion parseCudaHFile(llvm::StringRef Input) {
   }
   return CudaVersion::UNKNOWN;
 }
+
+std::string getSMNext(const llvm::opt::ArgList &DriverArgs) {
+  return DriverArgs
+      .getLastArgValue(
+          options::OPT_cuda_next_sm_EQ,
+          StringRef(OffloadArchToString(OffloadArch::CudaDefault)).substr(3))
+      .str(); // Strip leading "sm_" from the GPU variant name.
+}
 } // namespace
 
 void CudaInstallationDetector::WarnIfUnsupportedVersion() {
@@ -457,7 +465,9 @@ void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA,
     CmdArgs.push_back("-v");
 
   CmdArgs.push_back("--gpu-name");
-  CmdArgs.push_back(Args.MakeArgString(OffloadArchToString(gpu_arch)));
+  CmdArgs.push_back(Args.MakeArgString(gpu_arch == OffloadArch::SM_next
+                                           ? "sm_" + getSMNext(Args)
+                                           : OffloadArchToString(gpu_arch)));
   CmdArgs.push_back("--output-file");
   std::string OutputFileName = TC.getInputFilename(Output);
 
@@ -648,6 +658,13 @@ void NVPTX::getNVPTXTargetFeatures(const Driver &D, const llvm::Triple &Triple,
     Features.push_back(Args.MakeArgString(PtxFeature));
     return;
   }
+  // Add --cuda-next-ptx to the list of features, but carry on to add the
+  // default PTX feature for the detected CUDA SDK. NVPTX back-end will use the
+  // higher version.
+  StringRef NextPtx = Args.getLastArgValue(options::OPT_cuda_next_ptx_EQ);
+  if (!NextPtx.empty())
+    Features.push_back(Args.MakeArgString("+ptx" + NextPtx));
+
   CudaInstallationDetector CudaInstallation(D, Triple, Args);
 
   // New CUDA versions often introduce new instructions that are only supported
@@ -840,47 +857,62 @@ void CudaToolChain::addClangTargetOptions(
       CC1Args.push_back("-fcuda-allow-variadic-functions");
   }
 
-  if (DriverArgs.hasArg(options::OPT_nogpulib))
-    return;
-
   if (DeviceOffloadingKind == Action::OFK_OpenMP &&
       DriverArgs.hasArg(options::OPT_S))
     return;
 
-  std::string LibDeviceFile = CudaInstallation.getLibDeviceFile(GpuArch);
-  if (LibDeviceFile.empty()) {
-    getDriver().Diag(diag::err_drv_no_cuda_libdevice) << GpuArch;
-    return;
-  }
+  clang::CudaVersion CudaInstallationVersion = CudaInstallation.version();
+
+  if (!DriverArgs.hasArg(options::OPT_nogpulib)) {
+    std::string LibDeviceFile = CudaInstallation.getLibDeviceFile(GpuArch);
+    if (LibDeviceFile.empty()) {
+      getDriver().Diag(diag::err_drv_no_cuda_libdevice) << GpuArch;
+      return;
+    }
 
-  CC1Args.push_back("-mlink-builtin-bitcode");
-  CC1Args.push_back(DriverArgs.MakeArgString(LibDeviceFile));
+    CC1Args.push_back("-mlink-builtin-bitcode");
+    CC1Args.push_back(DriverArgs.MakeArgString(LibDeviceFile));
 
-  clang::CudaVersion CudaInstallationVersion = CudaInstallation.version();
+    if (DeviceOffloadingKind == Action::OFK_OpenMP) {
+      if (CudaInstallationVersion < CudaVersion::CUDA_92) {
+        getDriver().Diag(
+            diag::err_drv_omp_offload_target_cuda_version_not_support)
+            << CudaVersionToString(CudaInstallationVersion);
+        return;
+      }
+
+      // Link the bitcode library late if we're using device LTO.
+      if (getDriver().isUsingLTO(/* IsOffload */ true))
+        return;
+
+      addOpenMPDeviceRTL(getDriver(), DriverArgs, CC1Args, GpuArch.str(),
+                         getTriple(), HostTC);
+    }
+  }
 
   if (DriverArgs.hasFlag(options::OPT_fcuda_short_ptr,
                          options::OPT_fno_cuda_short_ptr, false))
     CC1Args.append({"-mllvm", "--nvptx-short-ptr"});
 
-  if (CudaInstallationVersion >= CudaVersion::UNKNOWN)
+  if (CudaInstallation.isValid() &&
+      CudaInstallationVersion > CudaVersion::UNKNOWN)
     CC1Args.push_back(
         DriverArgs.MakeArgString(Twine("-target-sdk-version=") +
                                  CudaVersionToString(CudaInstallationVersion)));
 
-  if (DeviceOffloadingKind == Action::OFK_OpenMP) {
-    if (CudaInstallationVersion < CudaVersion::CUDA_92) {
-      getDriver().Diag(
-          diag::err_drv_omp_offload_target_cuda_version_not_support)
-          << CudaVersionToString(CudaInstallationVersion);
-      return;
-    }
-
-    // Link the bitcode library late if we're using device LTO.
-    if (getDriver().isUsingLTO(/* IsOffload */ true))
-      return;
+  std::string NextSM = getSMNext(DriverArgs);
+  if (!NextSM.empty()) {
+    CC1Args.push_back(DriverArgs.MakeArgStringRef("--cuda-next-sm=" + NextSM));
+    CC1Args.append(
+        {"-mllvm", DriverArgs.MakeArgString(("--nvptx-next-sm=" + NextSM))});
+  }
 
-    addOpenMPDeviceRTL(getDriver(), DriverArgs, CC1Args, GpuArch.str(),
-                       getTriple(), HostTC);
+  StringRef NextPTX = DriverArgs.getLastArgValue(options::OPT_cuda_next_ptx_EQ);
+  if (!NextPTX.empty()) {
+    CC1Args.push_back(
+        DriverArgs.MakeArgStringRef(("--cuda-next-ptx=" + NextPTX).str()));
+    CC1Args.append({"-mllvm", DriverArgs.MakeArgString(
+                                  ("--nvptx-next-ptx=" + NextPTX).str())});
   }
 }
 
diff --git a/clang/test/Driver/cuda-sm_next.cu b/clang/test/Driver/cuda-sm_next.cu
new file mode 100644
index 00000000000000..379dcb297ae1a7
--- /dev/null
+++ b/clang/test/Driver/cuda-sm_next.cu
@@ -0,0 +1,66 @@
+// Tests CUDA compilation targeting sm_next
+
+// CC1 options level check.
+// Check that by default we only supply sm_next CPU info without explicitly 
+// overriding SM/PTX versions, and letting LLVM pick the defaults.
+// RUN: %clang -### -c --target=x86_64-linux-gnu --cuda-device-only  \
+// RUN:    --cuda-path=%S/Inputs/CUDA_111/usr/local/cuda %s \
+// RUN:    --cuda-gpu-arch=sm_next  2>&1 \
+// RUN:   | FileCheck -check-prefixes=ARGS-COMMON,ARGS-ARCH %s
+//
+// Same, with explicitly set sm and PTX versions.
+// RUN: %clang -### -c --target=x86_64-linux-gnu --cuda-device-only \
+// RUN:    --cuda-path=%S/Inputs/CUDA_111/usr/local/cuda %s \
+// RUN:    --cuda-gpu-arch=sm_next --cuda-next-sm=111 --cuda-next-ptx=222  2>&1 \
+// RUN:   | FileCheck -check-prefixes=ARGS-COMMON,ARGS-OVERRIDE %s
+
+// Preprocessor level checks.
+// RUN: %clang -dD -E --target=x86_64-linux-gnu --cuda-device-only -nocudainc \
+// RUN:    --cuda-path=%S/Inputs/CUDA_111/usr/local/cuda %s \
+// RUN:    --cuda-gpu-arch=sm_next  2>&1 \
+// RUN:   | FileCheck -check-prefixes=PP-COMMON,PP-ARCH %s
+//
+// Same, with explicitly set sm and PTX versions.
+// RUN: %clang -dD -E --target=x86_64-linux-gnu --cuda-device-only -nocudainc \
+// RUN:    --cuda-path=%S/Inputs/CUDA_111/usr/local/cuda %s \
+// RUN:    --cuda-gpu-arch=sm_next --cuda-next-sm=111 --cuda-next-ptx=222  2>&1 \
+// RUN:   | FileCheck -check-prefixes=PP-COMMON,PP-OVERRIDE %s
+
+// PTX level checks. 
+// RUN: %clang -S --target=x86_64-linux-gnu --cuda-device-only -nocudainc -nocudalib \
+// RUN:    --cuda-path=%S/Inputs/CUDA_111/usr/local/cuda -o - %s \
+// RUN:    --cuda-gpu-arch=sm_next  2>&1 \
+// RUN:   | FileCheck -check-prefixes=PTX-ARCH %s
+//
+// Same, with explicitly set sm and PTX versions.
+// RUN: %clang -S --target=x86_64-linux-gnu --cuda-device-only -nocudainc -nocudalib \
+// RUN:      --cuda-path=%S/Inputs/CUDA_111/usr/local/cuda -o - %s \
+// RUN:      --cuda-gpu-arch=sm_next --cuda-next-sm=111 --cuda-next-ptx=222  2>&1 \
+// RUN:   | FileCheck -check-prefixes=PTX-OVERRIDE %s
+
+
+// ARGS-COMMON: "-cc1" "-triple" "nvptx64-nvidia-cuda"
+// We need to pass specific SM version to CC1, so that preprocessor can set __CUDA_ARCH__ macro
+// ARGS-ARCH-SAME: "--cuda-next-sm=52" "-mllvm" "--nvptx-next-sm=52"
+// .. but we do not explicitly set PTX version, and carry on with the default for the detected CUDA SDK.
+// ARGS-ARCH-NOT: --cuda-next-ptx=
+// ARGS-ARCH-NOT: --nvptx-next-ptx=
+// When we override SM and PTX versions, we explicitly set them for both clang and LLVM.
+// ARGS-OVERRIDE-SAME: "--cuda-next-sm=111" "-mllvm" "--nvptx-next-sm=111" "--cuda-next-ptx=222" "-mllvm" "--nvptx-next-ptx=222"
+// ARGS-COMMON-SAME: "-target-cpu" "sm_next"
+// ARGS-COMMON-SAME: "-target-feature" "+ptx71"
+// ARGS-COMMON-NEXT: ptxas
+// ARGS-ARCH-SAME: "--gpu-name" "sm_52"
+// ARGS-OVERRIDE-SAME: "--gpu-name" "sm_111"
+//
+//
+// PP-COMMON:  #define __NVPTX__ 1
+// PP-ARCH: #define __CUDA_ARCH__ 520
+// PP-OVERRIDE: #define __CUDA_ARCH__  1110
+//
+//
+// PTX-ARCH:  .version 8.5
+// PTX-ARCH:  .target sm_52
+// PTX-OVERRIDE:  .version 22.2
+// PTX-OVERRIDE:  .target sm_111
+
diff --git a/clang/test/Misc/target-invalid-cpu-note.c b/clang/test/Misc/target-invalid-cpu-note.c
index 4d6759dd81537a..9c96153877d1c6 100644
--- a/clang/test/Misc/target-invalid-cpu-note.c
+++ b/clang/test/Misc/target-invalid-cpu-note.c
@@ -29,7 +29,7 @@
 
 // RUN: not %clang_cc1 -triple nvptx--- -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix NVPTX
 // NVPTX: error: unknown target CPU 'not-a-cpu'
-// NVPTX-NEXT: note: valid target CPU values are: sm_20, sm_21, sm_30, sm_32, sm_35, sm_37, sm_50, sm_52, sm_53, sm_60, sm_61, sm_62, sm_70, sm_72, sm_75, sm_80, sm_86, sm_87, sm_89, sm_90, sm_90a, gfx600, gfx601, gfx602, gfx700, gfx701, gfx702, gfx703, gfx704, gfx705, gfx801, gfx802, gfx803, gfx805, gfx810, gfx9-generic, gfx900, gfx902, gfx904, gfx906, gfx908, gfx909, gfx90a, gfx90c, gfx940, gfx941, gfx942, gfx10-1-generic, gfx1010, gfx1011, gfx1012, gfx1013, gfx10-3-generic, gfx1030, gfx1031, gfx1032, gfx1033, gfx1034, gfx1035, gfx1036, gfx11-generic, gfx1100, gfx1101, gfx1102, gfx1103, gfx1150, gfx1151, gfx1152, gfx12-generic, gfx1200, gfx1201, amdgcnspirv{{$}}
+// NVPTX-NEXT: note: valid target CPU values are: sm_20, sm_21, sm_30, sm_32, sm_35, sm_37, sm_50, sm_52, sm_53, sm_60, sm_61, sm_62, sm_70, sm_72, sm_75, sm_80, sm_86, sm_87, sm_89, sm_90, sm_90a, sm_next, gfx600, gfx601, gfx602, gfx700, gfx701, gfx702, gfx703, gfx704, gfx705, gfx801, gfx802, gfx803, gfx805, gfx810, gfx9-generic, gfx900, gfx902, gfx904, gfx906, gfx908, gfx909, gfx90a, gfx90c, gfx940, gfx941, gfx942, gfx10-1-generic, gfx1010, gfx1011, gfx1012, gfx1013, gfx10-3-generic, gfx1030, gfx1031, gfx1032, gfx1033, gfx1034, gfx1035, gfx1036, gfx11-generic, gfx1100, gfx1101, gfx1102, gfx1103, gfx1150, gfx1151, gfx1152, gfx12-generic, gfx1200, gfx1201, amdgcnspirv{{$}}
 
 // RUN: not %clang_cc1 -triple r600--- -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix R600
 // R600: error: unknown target CPU 'not-a-cpu'
diff --git a/llvm/lib/Target/NVPTX/NVPTX.td b/llvm/lib/Target/NVPTX/NVPTX.td
index bb4549a5e60782..d55dc0194c6fc5 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.td
+++ b/llvm/lib/Target/NVPTX/NVPTX.td
@@ -39,12 +39,15 @@ foreach sm = [20, 21, 30, 32, 35, 37, 50, 52, 53,
   def SM#sm: FeatureSM<""#sm, !mul(sm, 10)>;
 
 def SM90a: FeatureSM<"90a", 901>;
+def SMnext: FeatureSM<"next", 9999>; // Placeholder for an unknown future version
 
 foreach version = [32, 40, 41, 42, 43, 50, 60, 61, 62, 63, 64, 65,
                    70, 71, 72, 73, 74, 75, 76, 77, 78,
                    80, 81, 82, 83, 84, 85] in
   def PTX#version: FeaturePTX<version>;
 
+def PTXnext: FeaturePTX<9999>; // Placeholder for an unknown future version.
+
 //===----------------------------------------------------------------------===//
 // NVPTX supported processors.
 //===----------------------------------------------------------------------===//
@@ -73,6 +76,7 @@ def : Proc<"sm_87", [SM87, PTX74]>;
 def : Proc<"sm_89", [SM89, PTX78]>;
 def : Proc<"sm_90", [SM90, PTX78]>;
 def : Proc<"sm_90a", [SM90a, PTX80]>;
+def : Proc<"sm_next", [SMnext, PTXnext]>;
 
 def NVPTXInstrInfo : InstrInfo {
 }
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
index 420065585b3849..594cb094e13aec 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
@@ -12,6 +12,7 @@
 
 #include "NVPTXSubtarget.h"
 #include "NVPTXTargetMachine.h"
+#include "llvm/ADT/StringExtras.h"
 
 using namespace llvm;
 
@@ -26,24 +27,38 @@ static cl::opt<bool>
     NoF16Math("nvptx-no-f16-math", cl::Hidden,
               cl::desc("NVPTX Specific: Disable generation of f16 math ops."),
               cl::init(false));
+static cl::opt<unsigned>
+    NextSM("nvptx-next-sm", cl::Hidden,
+           cl::desc("NVPTX Specific: Override SM ID for sm_next."),
+           cl::init(90));
+static cl::opt<unsigned>
+    NextPTX("nvptx-next-ptx", cl::Hidden,
+            cl::desc("NVPTX Specific: Override PTX version for sm_next."),
+            cl::init(85));
+
 // Pin the vtable to this file.
 void NVPTXSubtarget::anchor() {}
 
 NVPTXSubtarget &NVPTXSubtarget::initializeSubtargetDependencies(StringRef CPU,
                                                                 StringRef FS) {
-    // Provide the default CPU if we don't have one.
-    TargetName = std::string(CPU.empty() ? "sm_30" : CPU);
+  // Provide the default CPU if we don't have one.
+  TargetName = std::string(CPU.empty() ? "sm_30" : CPU);
 
-    ParseSubtargetFeatures(TargetName, /*TuneCPU*/ TargetName, FS);
+  ParseSubtargetFeatures(TargetName, /*TuneCPU*/ TargetName, FS);
+  if (TargetName == "sm_next") {
+    TargetName = "sm_" + itostr(NextSM);
+    FullSmVersion = NextSM * 10;
+    PTXVersion = NextPTX;
+  }
 
-    // Re-map SM version numbers, SmVersion carries the regular SMs which do
-    // have relative order, while FullSmVersion allows distinguishing sm_90 from
-    // sm_90a, which would *not* be a subset of sm_91.
-    SmVersion = getSmVersion();
+  // Re-map SM version numbers, SmVersion carries the regular SMs which do
+  // have relative order, while FullSmVersion allows distinguishing sm_90 from
+  // sm_90a, which would *not* be a subset of sm_91.
+  SmVersion = getSmVersion();
 
-    // Set default to PTX 6.0 (CUDA 9.0)
-    if (PTXVersion == 0) {
-      PTXVersion = 60;
+  // Set default to PTX 6.0 (CUDA 9.0)
+  if (PTXVersion == 0) {
+    PTXVersion = 60;
   }
 
   return *this;

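As a rough sketch, the backend overrides added above could also be exercised
directly with llc (the file names and version numbers are invented; the flag
names are taken from the cl::opt definitions in NVPTXSubtarget.cpp):

  llc -mtriple=nvptx64-nvidia-cuda -mcpu=sm_next \
      -nvptx-next-sm=100 -nvptx-next-ptx=86 -o kernel.ptx kernel.ll

With those overrides the subtarget reports itself as sm_100 with PTX version
86, so the emitted PTX should carry ".version 8.6" and ".target sm_100",
mirroring what the driver test above checks with its 111/222 placeholder
values.
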
>From 87111300cc3fd7ec582fa06f8e8621038a56d10a Mon Sep 17 00:00:00 2001
From: Artem Belevich <tra at google.com>
Date: Tue, 23 Jul 2024 15:49:06 -0700
Subject: [PATCH 2/4] Undo unintentional formatting change.

---
 clang/lib/Basic/Cuda.cpp | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/clang/lib/Basic/Cuda.cpp b/clang/lib/Basic/Cuda.cpp
index 7f50b58aeca833..f714723db50fce 100644
--- a/clang/lib/Basic/Cuda.cpp
+++ b/clang/lib/Basic/Cuda.cpp
@@ -79,11 +79,9 @@ struct OffloadArchToStringMap {
 };
 } // namespace
 
-#define SM2(sm, ca)                                                            \
-  { OffloadArch::SM_##sm, "sm_" #sm, ca }
+#define SM2(sm, ca) {OffloadArch::SM_##sm, "sm_" #sm, ca}
 #define SM(sm) SM2(sm, "compute_" #sm)
-#define GFX(gpu)                                                               \
-  { OffloadArch::GFX##gpu, "gfx" #gpu, "compute_amdgcn" }
+#define GFX(gpu) {OffloadArch::GFX##gpu, "gfx" #gpu, "compute_amdgcn"}
 static const OffloadArchToStringMap arch_names[] = {
     // clang-format off
     {OffloadArch::UNUSED, "", ""},

>From 3e21acc856d941297d4b7e743f676d55bcda353b Mon Sep 17 00:00:00 2001
From: Artem Belevich <tra at google.com>
Date: Thu, 25 Jul 2024 10:54:38 -0700
Subject: [PATCH 3/4] Renamed options to sm_custom, improved implementation.

---
 clang/include/clang/Basic/Cuda.h              |  45 +++++++-
 .../clang/Basic/DiagnosticDriverKinds.td      |   2 +
 clang/include/clang/Basic/LangOptions.def     |   2 -
 clang/include/clang/Basic/LangOptions.h       |   4 +
 clang/include/clang/Driver/Options.td         |  12 +-
 clang/include/clang/Driver/ToolChain.h        |   7 ++
 clang/lib/Basic/Cuda.cpp                      |  96 ++++++++++------
 clang/lib/Basic/Targets/NVPTX.cpp             |   4 +-
 clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp      |   2 +-
 clang/lib/Driver/ToolChains/Clang.cpp         |   9 +-
 clang/lib/Driver/ToolChains/Clang.h           |   9 +-
 clang/lib/Driver/ToolChains/Cuda.cpp          | 108 ++++++++----------
 clang/lib/Driver/ToolChains/Cuda.h            |   3 +
 clang/test/Driver/cuda-detect.cu              |  15 +--
 clang/test/Driver/cuda-sm_next.cu             |  77 +++++--------
 clang/test/Misc/target-invalid-cpu-note.c     |   2 +-
 llvm/lib/Target/NVPTX/NVPTX.td                |   6 +-
 llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp      |  36 ++++--
 18 files changed, 258 insertions(+), 181 deletions(-)

diff --git a/clang/include/clang/Basic/Cuda.h b/clang/include/clang/Basic/Cuda.h
index a81d185d78cbee..fa025d32b67ae1 100644
--- a/clang/include/clang/Basic/Cuda.h
+++ b/clang/include/clang/Basic/Cuda.h
@@ -9,6 +9,7 @@
 #ifndef LLVM_CLANG_BASIC_CUDA_H
 #define LLVM_CLANG_BASIC_CUDA_H
 
+#include "llvm/ADT/StringRef.h"
 namespace llvm {
 class StringRef;
 class Twine;
@@ -52,6 +53,42 @@ const char *CudaVersionToString(CudaVersion V);
 // Input is "Major.Minor"
 CudaVersion CudaStringToVersion(const llvm::Twine &S);
 
+enum class PTXVersion {
+  PTX_UNKNOWN = 0,
+  PTX_32 = 32,
+  PTX_40 = 40,
+  PTX_41,
+  PTX_42,
+  PTX_43,
+  PTX_50 = 50,
+  PTX_60 = 60,
+  PTX_61,
+  PTX_62,
+  PTX_63,
+  PTX_64,
+  PTX_65,
+  PTX_70 = 70,
+  PTX_71,
+  PTX_72,
+  PTX_73,
+  PTX_74,
+  PTX_75,
+  PTX_76,
+  PTX_77,
+  PTX_78,
+  PTX_80 = 80,
+  PTX_81,
+  PTX_82,
+  PTX_83,
+  PTX_84,
+  PTX_85,
+  PTX_LAST = PTX_85,
+  PTX_custom = 9999, // placeholder for an unknown future version.
+};
+
+const std::string PTXVersionToFeature(PTXVersion V);
+PTXVersion GetRequiredPTXVersion(CudaVersion V);
+
 enum class OffloadArch {
   UNUSED,
   UNKNOWN,
@@ -78,7 +115,7 @@ enum class OffloadArch {
   SM_89,
   SM_90,
   SM_90a,
-  SM_next,
+  SM_custom,
   GFX600,
   GFX601,
   GFX602,
@@ -161,6 +198,12 @@ const char *OffloadArchToVirtualArchString(OffloadArch A);
 // The input should have the form "sm_20".
 OffloadArch StringToOffloadArch(llvm::StringRef S);
 
+// Converts custom SM name to its numeric value to be used in __CUDA_ARCH__
+// Custom SM name format: `sm_[ID][suffix]`.
+// The function returns `ID`*10 or zero on error.
+// `suffix` is expected to be empty or `a` and is ignored otherwise.
+unsigned CUDACustomSMToArchID(llvm::StringRef S);
+
 /// Get the earliest CudaVersion that supports the given OffloadArch.
 CudaVersion MinVersionForOffloadArch(OffloadArch A);
 
diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td
index 3d8240f8357b40..5c5d97b7a85a65 100644
--- a/clang/include/clang/Basic/DiagnosticDriverKinds.td
+++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td
@@ -741,6 +741,8 @@ def err_drv_invalid_or_unsupported_offload_target : Error<
   "invalid or unsupported offload target: '%0'">;
 def err_drv_cuda_offload_only_emit_bc : Error<
   "CUDA offload target is supported only along with --emit-llvm">;
+def err_drv_sm_custom_args : Error<
+  "offload target sm_custom requires both --cuda-custom_sm and --cuda_custom_ptx to be specified">;
 
 def warn_drv_jmc_requires_debuginfo : Warning<
   "%0 requires debug info. Use %1 or debug options that enable debugger's "
diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def
index c1901429e11fc8..834a6f6cd43e32 100644
--- a/clang/include/clang/Basic/LangOptions.def
+++ b/clang/include/clang/Basic/LangOptions.def
@@ -287,8 +287,6 @@ LANGOPT(HLSLStrictAvailability, 1, 0,
 LANGOPT(CUDAIsDevice      , 1, 0, "compiling for CUDA device")
 LANGOPT(CUDAAllowVariadicFunctions, 1, 0, "allowing variadic functions in CUDA device code")
 LANGOPT(CUDAHostDeviceConstexpr, 1, 1, "treating unattributed constexpr functions as __host__ __device__")
-LANGOPT(CUDANextSM, 32, 0, "SM version for sm_next target")
-LANGOPT(CUDANextPTX, 32, 0, "PTX version for sm_next target")
 LANGOPT(GPUDeviceApproxTranscendentals, 1, 0, "using approximate transcendental functions")
 LANGOPT(GPURelocatableDeviceCode, 1, 0, "generate relocatable device code")
 LANGOPT(OffloadImplicitHostDeviceTemplates, 1, 0, "assume template functions to be implicitly host device by default for CUDA/HIP")
diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h
index 91f1c2f2e6239e..927fd8fde5f771 100644
--- a/clang/include/clang/Basic/LangOptions.h
+++ b/clang/include/clang/Basic/LangOptions.h
@@ -579,6 +579,10 @@ class LangOptions : public LangOptionsBase {
   // WebAssembly target.
   bool NoWasmOpt = false;
 
+  // Overrides for the custom SM/PTX variants for CUDA's sm_custom target.
+  std::string CUDACustomSM;
+  unsigned CUDACustomPTX = 0;
+
   LangOptions();
 
   /// Set language defaults for the given input language and
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 9bba185b218cb1..54ddc9da45919f 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -1458,15 +1458,15 @@ def fno_hip_emit_relocatable : Flag<["-"], "fno-hip-emit-relocatable">,
   HelpText<"Do not override toolchain to compile HIP source to relocatable">;
 }
 
-def cuda_next_sm_EQ : Joined<["--"], "cuda-next-sm=">,
+def cuda_custom_sm_EQ : Joined<["--"], "cuda-custom-sm=">,
   Visibility<[ClangOption, CC1Option]>,
-  HelpText<"SM version to use for sm_next GPU">,
-  MarshallingInfoInt<LangOpts<"CUDANextSM">, "0">,
+  HelpText<"SM version to use for sm_custom GPU">,
+  MarshallingInfoString<LangOpts<"CUDACustomSM">>,
   ShouldParseIf<cuda.KeyPath>,Flags<[HelpHidden]>;
-def cuda_next_ptx_EQ : Joined<["--"], "cuda-next-ptx=">,
+def cuda_custom_ptx_EQ : Joined<["--"], "cuda-custom-ptx=">,
   Visibility<[ClangOption, CC1Option]>,
-  HelpText<"SM version to use for sm_next GPU">,
-  MarshallingInfoInt<LangOpts<"CUDANextPTX">, "0">,
+  HelpText<"SM version to use for sm_custom GPU">,
+  MarshallingInfoInt<LangOpts<"CUDACustomPTX">, "0">,
   ShouldParseIf<cuda.KeyPath>,Flags<[HelpHidden]>;
 
 // Clang specific/exclusive options for OpenACC.
diff --git a/clang/include/clang/Driver/ToolChain.h b/clang/include/clang/Driver/ToolChain.h
index ece1384d5d3c02..c39e4a0dc9ce2e 100644
--- a/clang/include/clang/Driver/ToolChain.h
+++ b/clang/include/clang/Driver/ToolChain.h
@@ -678,6 +678,13 @@ class ToolChain {
   virtual void addClangTargetOptions(const llvm::opt::ArgList &DriverArgs,
                                      llvm::opt::ArgStringList &CC1Args,
                                      Action::OffloadKind DeviceOffloadKind) const;
+  /// [optional] Some toolchains may need more info and need to pass JobAction.
+  /// This is only intended to augment the function above.
+  virtual void addClangTargetOptions(const llvm::opt::ArgList &DriverArgs,
+                                     llvm::opt::ArgStringList &CC1Args,
+                                     const JobAction &JC) const {
+    addClangTargetOptions(DriverArgs, CC1Args, JC.getOffloadingDeviceKind());
+  }
 
   /// Add options that need to be passed to cc1as for this target.
   virtual void
diff --git a/clang/lib/Basic/Cuda.cpp b/clang/lib/Basic/Cuda.cpp
index f714723db50fce..832740ae1a4963 100644
--- a/clang/lib/Basic/Cuda.cpp
+++ b/clang/lib/Basic/Cuda.cpp
@@ -3,6 +3,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormatVariadic.h"
 #include "llvm/Support/VersionTuple.h"
 
 namespace clang {
@@ -11,40 +12,43 @@ struct CudaVersionMapEntry {
   const char *Name;
   CudaVersion Version;
   llvm::VersionTuple TVersion;
+  PTXVersion PTX;
 };
-#define CUDA_ENTRY(major, minor)                                               \
+#define CUDA_ENTRY(major, minor, ptx)                                          \
   {                                                                            \
     #major "." #minor, CudaVersion::CUDA_##major##minor,                       \
-        llvm::VersionTuple(major, minor)                                       \
+        llvm::VersionTuple(major, minor), PTXVersion::ptx                      \
   }
 
 static const CudaVersionMapEntry CudaNameVersionMap[] = {
-    CUDA_ENTRY(7, 0),
-    CUDA_ENTRY(7, 5),
-    CUDA_ENTRY(8, 0),
-    CUDA_ENTRY(9, 0),
-    CUDA_ENTRY(9, 1),
-    CUDA_ENTRY(9, 2),
-    CUDA_ENTRY(10, 0),
-    CUDA_ENTRY(10, 1),
-    CUDA_ENTRY(10, 2),
-    CUDA_ENTRY(11, 0),
-    CUDA_ENTRY(11, 1),
-    CUDA_ENTRY(11, 2),
-    CUDA_ENTRY(11, 3),
-    CUDA_ENTRY(11, 4),
-    CUDA_ENTRY(11, 5),
-    CUDA_ENTRY(11, 6),
-    CUDA_ENTRY(11, 7),
-    CUDA_ENTRY(11, 8),
-    CUDA_ENTRY(12, 0),
-    CUDA_ENTRY(12, 1),
-    CUDA_ENTRY(12, 2),
-    CUDA_ENTRY(12, 3),
-    CUDA_ENTRY(12, 4),
-    CUDA_ENTRY(12, 5),
-    {"", CudaVersion::NEW, llvm::VersionTuple(std::numeric_limits<int>::max())},
-    {"unknown", CudaVersion::UNKNOWN, {}} // End of list tombstone.
+    CUDA_ENTRY(7, 0, PTX_42),
+    CUDA_ENTRY(7, 5, PTX_43),
+    CUDA_ENTRY(8, 0, PTX_50),
+    CUDA_ENTRY(9, 0, PTX_60),
+    CUDA_ENTRY(9, 1, PTX_61),
+    CUDA_ENTRY(9, 2, PTX_62),
+    CUDA_ENTRY(10, 0, PTX_63),
+    CUDA_ENTRY(10, 1, PTX_64),
+    CUDA_ENTRY(10, 2, PTX_65),
+    CUDA_ENTRY(11, 0, PTX_70),
+    CUDA_ENTRY(11, 1, PTX_71),
+    CUDA_ENTRY(11, 2, PTX_72),
+    CUDA_ENTRY(11, 3, PTX_73),
+    CUDA_ENTRY(11, 4, PTX_74),
+    CUDA_ENTRY(11, 5, PTX_75),
+    CUDA_ENTRY(11, 6, PTX_76),
+    CUDA_ENTRY(11, 7, PTX_77),
+    CUDA_ENTRY(11, 8, PTX_78),
+    CUDA_ENTRY(12, 0, PTX_80),
+    CUDA_ENTRY(12, 1, PTX_81),
+    CUDA_ENTRY(12, 2, PTX_82),
+    CUDA_ENTRY(12, 3, PTX_83),
+    CUDA_ENTRY(12, 4, PTX_84),
+    CUDA_ENTRY(12, 5, PTX_85),
+    {"", CudaVersion::NEW, llvm::VersionTuple(std::numeric_limits<int>::max()),
+     PTXVersion::PTX_LAST},
+    // End of list tombstone
+    {"unknown", CudaVersion::UNKNOWN, {}, PTXVersion::PTX_42}
 };
 #undef CUDA_ENTRY
 
@@ -71,6 +75,20 @@ CudaVersion ToCudaVersion(llvm::VersionTuple Version) {
   return CudaVersion::UNKNOWN;
 }
 
+const std::string PTXVersionToFeature(PTXVersion V) {
+  if (V > PTXVersion::PTX_UNKNOWN && V <= PTXVersion::PTX_LAST)
+    return llvm::formatv("+ptx{0}", static_cast<unsigned>(V));
+  return {};
+}
+
+PTXVersion GetRequiredPTXVersion(CudaVersion V) {
+  for (auto &I : CudaNameVersionMap)
+    if (V == I.Version)
+      return I.PTX;
+
+  return PTXVersion::PTX_UNKNOWN;
+}
+
 namespace {
 struct OffloadArchToStringMap {
   OffloadArch arch;
@@ -79,9 +97,11 @@ struct OffloadArchToStringMap {
 };
 } // namespace
 
-#define SM2(sm, ca) {OffloadArch::SM_##sm, "sm_" #sm, ca}
+#define SM2(sm, ca)                                                            \
+  { OffloadArch::SM_##sm, "sm_" #sm, ca }
 #define SM(sm) SM2(sm, "compute_" #sm)
-#define GFX(gpu) {OffloadArch::GFX##gpu, "gfx" #gpu, "compute_amdgcn"}
+#define GFX(gpu)                                                               \
+  { OffloadArch::GFX##gpu, "gfx" #gpu, "compute_amdgcn" }
 static const OffloadArchToStringMap arch_names[] = {
     // clang-format off
     {OffloadArch::UNUSED, "", ""},
@@ -96,7 +116,7 @@ static const OffloadArchToStringMap arch_names[] = {
     SM(89),                          // Ada Lovelace
     SM(90),                          // Hopper
     SM(90a),                         // Hopper
-    SM(next),                        // Placeholder for a new arch.
+    SM(custom),                        // Placeholder for a new arch.
     GFX(600),  // gfx600
     GFX(601),  // gfx601
     GFX(602),  // gfx602
@@ -182,6 +202,18 @@ OffloadArch StringToOffloadArch(llvm::StringRef S) {
   return result->arch;
 }
 
+unsigned CUDACustomSMToArchID(llvm::StringRef S) {
+  if (!S.starts_with("sm_"))
+    return 0;
+  S = S.drop_front(3); // skip `sm_`
+  if (S.ends_with("a"))
+    S = S.drop_back(1);
+  unsigned ID;
+  if (S.getAsInteger(10, ID))
+    return 0; // We've failed to parse the SM name
+  return ID * 10;
+}
+
 CudaVersion MinVersionForOffloadArch(OffloadArch A) {
   if (A == OffloadArch::UNKNOWN)
     return CudaVersion::UNKNOWN;
@@ -222,7 +254,7 @@ CudaVersion MinVersionForOffloadArch(OffloadArch A) {
     return CudaVersion::CUDA_118;
   case OffloadArch::SM_90a:
     return CudaVersion::CUDA_120;
-  case clang::OffloadArch::SM_next:
+  case clang::OffloadArch::SM_custom:
     return CudaVersion::UNKNOWN;
   default:
     llvm_unreachable("invalid enum");
diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp
index f4096a19af38ea..a245ef6dcc6cad 100644
--- a/clang/lib/Basic/Targets/NVPTX.cpp
+++ b/clang/lib/Basic/Targets/NVPTX.cpp
@@ -283,8 +283,8 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts,
       case OffloadArch::SM_90:
       case OffloadArch::SM_90a:
         return "900";
-      case OffloadArch::SM_next:
-        return llvm::itostr(Opts.CUDANextSM * 10);
+      case OffloadArch::SM_custom:
+        return llvm::itostr(CUDACustomSMToArchID(Opts.CUDACustomSM));
       }
       llvm_unreachable("unhandled OffloadArch");
     }();
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index a9a2e0bd3c7587..423dd22bfa7723 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -2276,7 +2276,7 @@ void CGOpenMPRuntimeGPU::processRequiresDirective(const OMPRequiresDecl *D) {
       case OffloadArch::SM_89:
       case OffloadArch::SM_90:
       case OffloadArch::SM_90a:
-      case OffloadArch::SM_next:
+      case OffloadArch::SM_custom:
       case OffloadArch::GFX600:
       case OffloadArch::GFX601:
       case OffloadArch::GFX602:
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index bc77b98c25645a..d305586d5ee4bb 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -1657,7 +1657,8 @@ void Clang::AddARMTargetArgs(const llvm::Triple &Triple, const ArgList &Args,
   AddUnalignedAccessWarning(CmdArgs);
 }
 
-void Clang::RenderTargetOptions(const llvm::Triple &EffectiveTriple,
+void Clang::RenderTargetOptions(const JobAction &JA,
+                                const llvm::Triple &EffectiveTriple,
                                 const ArgList &Args, bool KernelOrKext,
                                 ArgStringList &CmdArgs) const {
   const ToolChain &TC = getToolChain();
@@ -5314,7 +5315,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
     CmdArgs.push_back("-disable-llvm-passes");
 
     // Render target options.
-    TC.addClangTargetOptions(Args, CmdArgs, JA.getOffloadingDeviceKind());
+    TC.addClangTargetOptions(Args, CmdArgs, JA);
 
     // reject options that shouldn't be supported in bitcode
     // also reject kernel/kext
@@ -6005,7 +6006,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
                       /*ForAS*/ false, /*IsAux*/ true);
   }
 
-  TC.addClangTargetOptions(Args, CmdArgs, JA.getOffloadingDeviceKind());
+  TC.addClangTargetOptions(Args, CmdArgs, JA);
 
   addMCModel(D, Args, Triple, RelocationModel, CmdArgs);
 
@@ -6032,7 +6033,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
     CmdArgs.push_back(Args.MakeArgString(CPU));
   }
 
-  RenderTargetOptions(Triple, Args, KernelOrKext, CmdArgs);
+  RenderTargetOptions(JA, Triple, Args, KernelOrKext, CmdArgs);
 
   // Add clang-cl arguments.
   types::ID InputType = Input.getType();
diff --git a/clang/lib/Driver/ToolChains/Clang.h b/clang/lib/Driver/ToolChains/Clang.h
index 18f6c5ed06a59a..7ff15d1a601993 100644
--- a/clang/lib/Driver/ToolChains/Clang.h
+++ b/clang/lib/Driver/ToolChains/Clang.h
@@ -45,7 +45,8 @@ class LLVM_LIBRARY_VISIBILITY Clang : public Tool {
                                const InputInfo &Output,
                                const InputInfoList &Inputs) const;
 
-  void RenderTargetOptions(const llvm::Triple &EffectiveTriple,
+  void RenderTargetOptions(const JobAction &JA,
+                           const llvm::Triple &EffectiveTriple,
                            const llvm::opt::ArgList &Args, bool KernelOrKext,
                            llvm::opt::ArgStringList &CmdArgs) const;
 
@@ -61,6 +62,8 @@ class LLVM_LIBRARY_VISIBILITY Clang : public Tool {
                               llvm::opt::ArgStringList &CmdArgs) const;
   void AddMIPSTargetArgs(const llvm::opt::ArgList &Args,
                          llvm::opt::ArgStringList &CmdArgs) const;
+  void AddNVPTXTargetArgs(const JobAction &JA, const llvm::opt::ArgList &Args,
+                          llvm::opt::ArgStringList &CmdArgs) const;
   void AddPPCTargetArgs(const llvm::opt::ArgList &Args,
                         llvm::opt::ArgStringList &CmdArgs) const;
   void AddR600TargetArgs(const llvm::opt::ArgList &Args,
@@ -94,8 +97,8 @@ class LLVM_LIBRARY_VISIBILITY Clang : public Tool {
 
   mutable std::unique_ptr<llvm::raw_fd_ostream> CompilationDatabase = nullptr;
   void DumpCompilationDatabase(Compilation &C, StringRef Filename,
-                               StringRef Target,
-                               const InputInfo &Output, const InputInfo &Input,
+                               StringRef Target, const InputInfo &Output,
+                               const InputInfo &Input,
                                const llvm::opt::ArgList &Args) const;
 
   void DumpCompilationDatabaseFragmentToDir(
diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp
index cdf7c14fa407cf..b8ba780686c357 100644
--- a/clang/lib/Driver/ToolChains/Cuda.cpp
+++ b/clang/lib/Driver/ToolChains/Cuda.cpp
@@ -117,12 +117,11 @@ CudaVersion parseCudaHFile(llvm::StringRef Input) {
   return CudaVersion::UNKNOWN;
 }
 
-std::string getSMNext(const llvm::opt::ArgList &DriverArgs) {
+std::string getCustomSM(const llvm::opt::ArgList &DriverArgs) {
   return DriverArgs
-      .getLastArgValue(
-          options::OPT_cuda_next_sm_EQ,
-          StringRef(OffloadArchToString(OffloadArch::CudaDefault)).substr(3))
-      .str(); // Strip leading "sm_" from the GPU variant name.
+      .getLastArgValue(options::OPT_cuda_custom_sm_EQ,
+                       StringRef(OffloadArchToString(OffloadArch::CudaDefault)))
+      .str();
 }
 } // namespace
 
@@ -465,8 +464,8 @@ void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA,
     CmdArgs.push_back("-v");
 
   CmdArgs.push_back("--gpu-name");
-  CmdArgs.push_back(Args.MakeArgString(gpu_arch == OffloadArch::SM_next
-                                           ? "sm_" + getSMNext(Args)
+  CmdArgs.push_back(Args.MakeArgString(gpu_arch == OffloadArch::SM_custom
+                                           ? getCustomSM(Args)
                                            : OffloadArchToString(gpu_arch)));
   CmdArgs.push_back("--output-file");
   std::string OutputFileName = TC.getInputFilename(Output);
@@ -563,9 +562,23 @@ void NVPTX::FatBinary::ConstructJob(Compilation &C, const JobAction &JA,
       continue;
     // We need to pass an Arch of the form "sm_XX" for cubin files and
     // "compute_XX" for ptx.
-    const char *Arch = (II.getType() == types::TY_PP_Asm)
-                           ? OffloadArchToVirtualArchString(gpu_arch)
-                           : gpu_arch_str;
+    std::string Arch = [&]() -> std::string {
+      bool IsAsm = II.getType() == types::TY_PP_Asm;
+      if (gpu_arch != OffloadArch::SM_custom)
+        return (IsAsm) ? OffloadArchToVirtualArchString(gpu_arch)
+                       : gpu_arch_str;
+      std::string CustomSM = getCustomSM(Args);
+      if (IsAsm) {
+        StringRef SMID = CustomSM;
+        if (SMID.starts_with("sm_")) // Should always be true
+          SMID = SMID.drop_front(3);
+        else
+          C.getDriver().Diag(diag::err_drv_invalid_value_with_suggestion)
+              << "--cuda-custom-sm" << CustomSM << "sm_*";
+        return formatv("compute_{0}", SMID);
+      }
+      return CustomSM;
+    }();
     CmdArgs.push_back(
         Args.MakeArgString(llvm::Twine("--image=profile=") + Arch +
                            ",file=" + getToolChain().getInputFilename(II)));
@@ -658,50 +671,14 @@ void NVPTX::getNVPTXTargetFeatures(const Driver &D, const llvm::Triple &Triple,
     Features.push_back(Args.MakeArgString(PtxFeature));
     return;
   }
-  // Add --cuda-next-ptx to the list of features, but carry on to add the
-  // default PTX feature for the detected CUDA SDK. NVPTX back-end will use the
-  // higher version.
-  StringRef NextPtx = Args.getLastArgValue(options::OPT_cuda_next_ptx_EQ);
-  if (!NextPtx.empty())
-    Features.push_back(Args.MakeArgString("+ptx" + NextPtx));
-
   CudaInstallationDetector CudaInstallation(D, Triple, Args);
 
   // New CUDA versions often introduce new instructions that are only supported
   // by new PTX version, so we need to raise PTX level to enable them in NVPTX
   // back-end.
-  const char *PtxFeature = nullptr;
-  switch (CudaInstallation.version()) {
-#define CASE_CUDA_VERSION(CUDA_VER, PTX_VER)                                   \
-  case CudaVersion::CUDA_##CUDA_VER:                                           \
-    PtxFeature = "+ptx" #PTX_VER;                                              \
-    break;
-    CASE_CUDA_VERSION(125, 85);
-    CASE_CUDA_VERSION(124, 84);
-    CASE_CUDA_VERSION(123, 83);
-    CASE_CUDA_VERSION(122, 82);
-    CASE_CUDA_VERSION(121, 81);
-    CASE_CUDA_VERSION(120, 80);
-    CASE_CUDA_VERSION(118, 78);
-    CASE_CUDA_VERSION(117, 77);
-    CASE_CUDA_VERSION(116, 76);
-    CASE_CUDA_VERSION(115, 75);
-    CASE_CUDA_VERSION(114, 74);
-    CASE_CUDA_VERSION(113, 73);
-    CASE_CUDA_VERSION(112, 72);
-    CASE_CUDA_VERSION(111, 71);
-    CASE_CUDA_VERSION(110, 70);
-    CASE_CUDA_VERSION(102, 65);
-    CASE_CUDA_VERSION(101, 64);
-    CASE_CUDA_VERSION(100, 63);
-    CASE_CUDA_VERSION(92, 61);
-    CASE_CUDA_VERSION(91, 61);
-    CASE_CUDA_VERSION(90, 60);
-#undef CASE_CUDA_VERSION
-  default:
-    PtxFeature = "+ptx42";
-  }
-  Features.push_back(PtxFeature);
+  std::string PtxFeature =
+      PTXVersionToFeature(GetRequiredPTXVersion(CudaInstallation.version()));
+  Features.push_back(Args.MakeArgString(PtxFeature));
 }
 
 /// NVPTX toolchain. Our assembler is ptxas, and our linker is nvlink. This
@@ -899,20 +876,29 @@ void CudaToolChain::addClangTargetOptions(
     CC1Args.push_back(
         DriverArgs.MakeArgString(Twine("-target-sdk-version=") +
                                  CudaVersionToString(CudaInstallationVersion)));
+}
 
-  std::string NextSM = getSMNext(DriverArgs);
-  if (!NextSM.empty()) {
-    CC1Args.push_back(DriverArgs.MakeArgStringRef("--cuda-next-sm=" + NextSM));
-    CC1Args.append(
-        {"-mllvm", DriverArgs.MakeArgString(("--nvptx-next-sm=" + NextSM))});
-  }
+void CudaToolChain::addClangTargetOptions(const llvm::opt::ArgList &DriverArgs,
+                                          llvm::opt::ArgStringList &CC1Args,
+                                          const JobAction &JA) const {
+  addClangTargetOptions(DriverArgs, CC1Args, JA.getOffloadingDeviceKind());
 
-  StringRef NextPTX = DriverArgs.getLastArgValue(options::OPT_cuda_next_ptx_EQ);
-  if (!NextPTX.empty()) {
-    CC1Args.push_back(
-        DriverArgs.MakeArgStringRef(("--cuda-next-ptx=" + NextPTX).str()));
-    CC1Args.append({"-mllvm", DriverArgs.MakeArgString(
-                                  ("--nvptx-next-ptx=" + NextPTX).str())});
+  if (StringRef(JA.getOffloadingArch()) == "sm_custom") {
+
+    std::string CustomSM = getCustomSM(DriverArgs);
+    StringRef CustomPTX =
+        DriverArgs.getLastArgValue(options::OPT_cuda_custom_ptx_EQ);
+    if (CustomSM.empty() || CustomPTX.empty()) {
+      JA.getOffloadingToolChain()->getDriver().Diag(
+          diag::err_drv_sm_custom_args);
+    }
+    CC1Args.append(
+        {// Needed by preprocessor for __CUDA_ARCH__
+         DriverArgs.MakeArgStringRef("--cuda-custom-sm=" + CustomSM),
+         // Overrides target SM in LLVM
+         "-mllvm", DriverArgs.MakeArgString(("--nvptx-custom-sm=" + CustomSM)),
+         "-mllvm",
+         DriverArgs.MakeArgString(("--nvptx-custom-ptx=" + CustomPTX))});
   }
 }
 
diff --git a/clang/lib/Driver/ToolChains/Cuda.h b/clang/lib/Driver/ToolChains/Cuda.h
index 7a6a6fb209012e..679de56693a072 100644
--- a/clang/lib/Driver/ToolChains/Cuda.h
+++ b/clang/lib/Driver/ToolChains/Cuda.h
@@ -204,6 +204,9 @@ class LLVM_LIBRARY_VISIBILITY CudaToolChain : public NVPTXToolChain {
   addClangTargetOptions(const llvm::opt::ArgList &DriverArgs,
                         llvm::opt::ArgStringList &CC1Args,
                         Action::OffloadKind DeviceOffloadKind) const override;
+  void addClangTargetOptions(const llvm::opt::ArgList &DriverArgs,
+                             llvm::opt::ArgStringList &CC1Args,
+                             const JobAction &JA) const override;
 
   llvm::DenormalMode getDefaultDenormalModeForType(
       const llvm::opt::ArgList &DriverArgs, const JobAction &JA,
diff --git a/clang/test/Driver/cuda-detect.cu b/clang/test/Driver/cuda-detect.cu
index 23b6ba2fcc09d9..4adc6cb15734b0 100644
--- a/clang/test/Driver/cuda-detect.cu
+++ b/clang/test/Driver/cuda-detect.cu
@@ -46,16 +46,16 @@
 // RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_21 \
 // RUN:   --cuda-path=%S/Inputs/CUDA_80/usr/local/cuda %s 2>&1 \
 // RUN:   | FileCheck %s -check-prefix COMMON \
-// RUN:     -check-prefixes PTX42,LIBDEVICE,LIBDEVICE20
+// RUN:     -check-prefixes PTX50,LIBDEVICE,LIBDEVICE20
 // RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_32 \
 // RUN:   --cuda-path=%S/Inputs/CUDA_80/usr/local/cuda %s 2>&1 \
 // RUN:   | FileCheck %s -check-prefix COMMON \
-// RUN:     -check-prefixes PTX42,LIBDEVICE,LIBDEVICE20
+// RUN:     -check-prefixes PTX50,LIBDEVICE,LIBDEVICE20
 // sm_30, sm_6x map to compute_30.
 // RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_30 \
 // RUN:   --cuda-path=%S/Inputs/CUDA_80/usr/local/cuda %s 2>&1 \
 // RUN:   | FileCheck %s -check-prefix COMMON \
-// RUN:     -check-prefixes PTX42,LIBDEVICE,LIBDEVICE30
+// RUN:     -check-prefixes PTX50,LIBDEVICE,LIBDEVICE30
 // sm_5x is a special case. Maps to compute_30 for cuda-7.x only.
 // RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_50 \
 // RUN:   --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 \
@@ -64,21 +64,21 @@
 // RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_60 \
 // RUN:   --cuda-path=%S/Inputs/CUDA_80/usr/local/cuda %s 2>&1 \
 // RUN:   | FileCheck %s -check-prefix COMMON \
-// RUN:     -check-prefixes PTX42,LIBDEVICE,LIBDEVICE30
+// RUN:     -check-prefixes PTX50,LIBDEVICE,LIBDEVICE30
 // sm_35 and sm_37 -> compute_35
 // RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_35 \
 // RUN:   --cuda-path=%S/Inputs/CUDA_80/usr/local/cuda %s 2>&1 \
 // RUN:   | FileCheck %s -check-prefix COMMON -check-prefix CUDAINC \
-// RUN:     -check-prefixes PTX42,LIBDEVICE,LIBDEVICE35
+// RUN:     -check-prefixes PTX50,LIBDEVICE,LIBDEVICE35
 // RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_37 \
 // RUN:   --cuda-path=%S/Inputs/CUDA_80/usr/local/cuda %s 2>&1 \
 // RUN:   | FileCheck %s -check-prefix COMMON -check-prefix CUDAINC \
-// RUN:     -check-prefixes PTX42,LIBDEVICE,LIBDEVICE35
+// RUN:     -check-prefixes PTX50,LIBDEVICE,LIBDEVICE35
 // sm_5x -> compute_50 for CUDA-8.0 and newer.
 // RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_50 \
 // RUN:   --cuda-path=%S/Inputs/CUDA_80/usr/local/cuda %s 2>&1 \
 // RUN:   | FileCheck %s -check-prefix COMMON \
-// RUN:     -check-prefixes PTX42,LIBDEVICE,LIBDEVICE50
+// RUN:     -check-prefixes PTX50,LIBDEVICE,LIBDEVICE50
 
 // CUDA-9+ uses the same libdevice for all GPU variants:
 // RUN: %clang -### -v --target=x86_64-unknown-linux --cuda-gpu-arch=sm_30 \
@@ -173,6 +173,7 @@
 // LIBDEVICE35-SAME: libdevice.compute_35.10.bc
 // LIBDEVICE50-SAME: libdevice.compute_50.10.bc
 // PTX42-SAME: "-target-feature" "+ptx42"
+// PTX50-SAME: "-target-feature" "+ptx50"
 // PTX60-SAME: "-target-feature" "+ptx60"
 // CUDAINC-SAME: "-include" "__clang_cuda_runtime_wrapper.h"
 // NOCUDAINC-NOT: "-include" "__clang_cuda_runtime_wrapper.h"
diff --git a/clang/test/Driver/cuda-sm_next.cu b/clang/test/Driver/cuda-sm_next.cu
index 379dcb297ae1a7..fcd561492125fd 100644
--- a/clang/test/Driver/cuda-sm_next.cu
+++ b/clang/test/Driver/cuda-sm_next.cu
@@ -1,66 +1,49 @@
-// Tests CUDA compilation targeting sm_next
+// Tests CUDA compilation targeting sm_custom
 
 // CC1 options level check.
-// Check that by default we only supply sm_next CPU info without explicitly 
-// overriding SM/PTX versions, and letting LLVM pick the defaults.
-// RUN: %clang -### -c --target=x86_64-linux-gnu --cuda-device-only  \
+// Check that sm_custom requires explicitly
+// overriding SM/PTX versions.
+// RUN: not %clang -### -c --target=x86_64-linux-gnu --cuda-device-only  \
 // RUN:    --cuda-path=%S/Inputs/CUDA_111/usr/local/cuda %s \
-// RUN:    --cuda-gpu-arch=sm_next  2>&1 \
-// RUN:   | FileCheck -check-prefixes=ARGS-COMMON,ARGS-ARCH %s
+// RUN:    --cuda-gpu-arch=sm_custom  2>&1 \
+// RUN:   | FileCheck -check-prefixes=ERROR %s
 //
-// Same, with explicitly set sm and PTX versions.
-// RUN: %clang -### -c --target=x86_64-linux-gnu --cuda-device-only \
+// Check propagation of explicitly set sm and PTX versions to the tools.
+// RUN: %clang -### -c --target=x86_64-linux-gnu \
 // RUN:    --cuda-path=%S/Inputs/CUDA_111/usr/local/cuda %s \
-// RUN:    --cuda-gpu-arch=sm_next --cuda-next-sm=111 --cuda-next-ptx=222  2>&1 \
-// RUN:   | FileCheck -check-prefixes=ARGS-COMMON,ARGS-OVERRIDE %s
+// RUN:    --cuda-gpu-arch=sm_custom --cuda-custom-sm=sm_111 --cuda-custom-ptx=222  2>&1 \
+// RUN:   | FileCheck -check-prefixes=ARGS %s
 
 // Preprocessor level checks.
 // RUN: %clang -dD -E --target=x86_64-linux-gnu --cuda-device-only -nocudainc \
 // RUN:    --cuda-path=%S/Inputs/CUDA_111/usr/local/cuda %s \
-// RUN:    --cuda-gpu-arch=sm_next  2>&1 \
-// RUN:   | FileCheck -check-prefixes=PP-COMMON,PP-ARCH %s
-//
-// Same, with explicitly set sm and PTX versions.
-// RUN: %clang -dD -E --target=x86_64-linux-gnu --cuda-device-only -nocudainc \
-// RUN:    --cuda-path=%S/Inputs/CUDA_111/usr/local/cuda %s \
-// RUN:    --cuda-gpu-arch=sm_next --cuda-next-sm=111 --cuda-next-ptx=222  2>&1 \
-// RUN:   | FileCheck -check-prefixes=PP-COMMON,PP-OVERRIDE %s
+// RUN:    --cuda-gpu-arch=sm_custom --cuda-custom-sm=sm_111 --cuda-custom-ptx=222  2>&1 \
+// RUN:   | FileCheck -check-prefixes=PP %s
 
 // PTX level checks. 
 // RUN: %clang -S --target=x86_64-linux-gnu --cuda-device-only -nocudainc -nocudalib \
-// RUN:    --cuda-path=%S/Inputs/CUDA_111/usr/local/cuda -o - %s \
-// RUN:    --cuda-gpu-arch=sm_next  2>&1 \
-// RUN:   | FileCheck -check-prefixes=PTX-ARCH %s
-//
-// Same, with explicitly set sm and PTX versions.
-// RUN: %clang -S --target=x86_64-linux-gnu --cuda-device-only -nocudainc -nocudalib \
 // RUN:      --cuda-path=%S/Inputs/CUDA_111/usr/local/cuda -o - %s \
-// RUN:      --cuda-gpu-arch=sm_next --cuda-next-sm=111 --cuda-next-ptx=222  2>&1 \
-// RUN:   | FileCheck -check-prefixes=PTX-OVERRIDE %s
+// RUN:      --cuda-gpu-arch=sm_custom --cuda-custom-sm=sm_111 --cuda-custom-ptx=222  2>&1 \
+// RUN:   | FileCheck -check-prefixes=PTX %s
+
 
+// ERROR: clang: error: offload target sm_custom requires both --cuda-custom-sm and --cuda-custom-ptx to be specified
 
-// ARGS-COMMON: "-cc1" "-triple" "nvptx64-nvidia-cuda"
+// ARGS: "-cc1" "-triple" "nvptx64-nvidia-cuda"
 // We need to pass specific SM version to CC1, so that preprocessor can set __CUDA_ARCH__ macro
-// ARGS-ARCH-SAME: "--cuda-next-sm=52" "-mllvm" "--nvptx-next-sm=52"
-// .. but we do not explicitly set PTX version, and carry on with the default for the detected CUDA SDK.
-// ARGS-ARCH-NOT: --cuda-next-ptx=
-// ARGS-ARCH-NOT: --nvptx-next-ptx=
-// When we override SM and PTX versions, we explicitly set them for both clang and LLVM.
-// ARGS-OVERRIDE-SAME: "--cuda-next-sm=111" "-mllvm" "--nvptx-next-sm=111" "--cuda-next-ptx=222" "-mllvm" "--nvptx-next-ptx=222"
-// ARGS-COMMON-SAME: "-target-cpu" "sm_next"
-// ARGS-COMMON-SAME: "-target-feature" "+ptx71"
-// ARGS-COMMON-NEXT: ptxas
-// ARGS-ARCH-SAME: "--gpu-name" "sm_52"
-// ARGS-OVERRIDE-SAME: "--gpu-name" "sm_111"
+// and both SM and PTX to LLVM so it would generate correct PTX directives.
+// ARGS-SAME: "--cuda-custom-sm=sm_111" "-mllvm" "--nvptx-custom-sm=sm_111" "-mllvm" "--nvptx-custom-ptx=222"
+// ARGS-SAME: "-target-cpu" "sm_custom"
+// ARGS-SAME: "-target-feature" "+ptx71"
+// ARGS-NEXT: ptxas
+// ARGS-SAME: "--gpu-name" "sm_111"
+// ARGS-NEXT: fatbinary
+// ARGS-SAME: "--image=profile=sm_111,file= 
+// ARGS-SAME: "--image=profile=compute_111,file
 //
 //
-// PP-COMMON:  #define __NVPTX__ 1
-// PP-ARCH: #define __CUDA_ARCH__ 520
-// PP-OVERRIDE: #define __CUDA_ARCH__  1110
+// PP:  #define __NVPTX__ 1
+// PP: #define __CUDA_ARCH__  1110
 //
-//
-// PTX-ARCH:  .version 8.5
-// PTX-ARCH:  .target sm_52
-// PTX-OVERRIDE:  .version 22.2
-// PTX-OVERRIDE:  .target sm_111
-
+// PTX:  .version 22.2
+// PTX:  .target sm_111
diff --git a/clang/test/Misc/target-invalid-cpu-note.c b/clang/test/Misc/target-invalid-cpu-note.c
index 9c96153877d1c6..7239b8a1744c5a 100644
--- a/clang/test/Misc/target-invalid-cpu-note.c
+++ b/clang/test/Misc/target-invalid-cpu-note.c
@@ -29,7 +29,7 @@
 
 // RUN: not %clang_cc1 -triple nvptx--- -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix NVPTX
 // NVPTX: error: unknown target CPU 'not-a-cpu'
-// NVPTX-NEXT: note: valid target CPU values are: sm_20, sm_21, sm_30, sm_32, sm_35, sm_37, sm_50, sm_52, sm_53, sm_60, sm_61, sm_62, sm_70, sm_72, sm_75, sm_80, sm_86, sm_87, sm_89, sm_90, sm_90a, sm_next, gfx600, gfx601, gfx602, gfx700, gfx701, gfx702, gfx703, gfx704, gfx705, gfx801, gfx802, gfx803, gfx805, gfx810, gfx9-generic, gfx900, gfx902, gfx904, gfx906, gfx908, gfx909, gfx90a, gfx90c, gfx940, gfx941, gfx942, gfx10-1-generic, gfx1010, gfx1011, gfx1012, gfx1013, gfx10-3-generic, gfx1030, gfx1031, gfx1032, gfx1033, gfx1034, gfx1035, gfx1036, gfx11-generic, gfx1100, gfx1101, gfx1102, gfx1103, gfx1150, gfx1151, gfx1152, gfx12-generic, gfx1200, gfx1201, amdgcnspirv{{$}}
+// NVPTX-NEXT: note: valid target CPU values are: sm_20, sm_21, sm_30, sm_32, sm_35, sm_37, sm_50, sm_52, sm_53, sm_60, sm_61, sm_62, sm_70, sm_72, sm_75, sm_80, sm_86, sm_87, sm_89, sm_90, sm_90a, sm_custom, gfx600, gfx601, gfx602, gfx700, gfx701, gfx702, gfx703, gfx704, gfx705, gfx801, gfx802, gfx803, gfx805, gfx810, gfx9-generic, gfx900, gfx902, gfx904, gfx906, gfx908, gfx909, gfx90a, gfx90c, gfx940, gfx941, gfx942, gfx10-1-generic, gfx1010, gfx1011, gfx1012, gfx1013, gfx10-3-generic, gfx1030, gfx1031, gfx1032, gfx1033, gfx1034, gfx1035, gfx1036, gfx11-generic, gfx1100, gfx1101, gfx1102, gfx1103, gfx1150, gfx1151, gfx1152, gfx12-generic, gfx1200, gfx1201, amdgcnspirv{{$}}
 
 // RUN: not %clang_cc1 -triple r600--- -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix R600
 // R600: error: unknown target CPU 'not-a-cpu'
diff --git a/llvm/lib/Target/NVPTX/NVPTX.td b/llvm/lib/Target/NVPTX/NVPTX.td
index d55dc0194c6fc5..92c460798c2839 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.td
+++ b/llvm/lib/Target/NVPTX/NVPTX.td
@@ -39,14 +39,14 @@ foreach sm = [20, 21, 30, 32, 35, 37, 50, 52, 53,
   def SM#sm: FeatureSM<""#sm, !mul(sm, 10)>;
 
 def SM90a: FeatureSM<"90a", 901>;
-def SMnext: FeatureSM<"next", 9999>; // Placeholder for an unknown future version
+def SMcustom: FeatureSM<"custom", 9999>; // Placeholder for an unknown future version
 
 foreach version = [32, 40, 41, 42, 43, 50, 60, 61, 62, 63, 64, 65,
                    70, 71, 72, 73, 74, 75, 76, 77, 78,
                    80, 81, 82, 83, 84, 85] in
   def PTX#version: FeaturePTX<version>;
 
-def PTXnext: FeaturePTX<9999>; // Placeholder for an unknown future version.
+def PTXcustom: FeaturePTX<9999>; // Placeholder for an unknown future version.
 
 //===----------------------------------------------------------------------===//
 // NVPTX supported processors.
@@ -76,7 +76,7 @@ def : Proc<"sm_87", [SM87, PTX74]>;
 def : Proc<"sm_89", [SM89, PTX78]>;
 def : Proc<"sm_90", [SM90, PTX78]>;
 def : Proc<"sm_90a", [SM90a, PTX80]>;
-def : Proc<"sm_next", [SMnext, PTXnext]>;
+def : Proc<"sm_custom", [SMcustom, PTXcustom]>;
 
 def NVPTXInstrInfo : InstrInfo {
 }
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
index 594cb094e13aec..38c622cf73afdd 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
@@ -13,6 +13,7 @@
 #include "NVPTXSubtarget.h"
 #include "NVPTXTargetMachine.h"
 #include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/Error.h"
 
 using namespace llvm;
 
@@ -27,14 +28,14 @@ static cl::opt<bool>
     NoF16Math("nvptx-no-f16-math", cl::Hidden,
               cl::desc("NVPTX Specific: Disable generation of f16 math ops."),
               cl::init(false));
+static cl::opt<std::string>
+    CustomSM("nvptx-custom-sm", cl::Hidden,
+             cl::desc("NVPTX Specific: Override SM ID for sm_custom."),
+             cl::init(""));
 static cl::opt<unsigned>
-    NextSM("nvptx-next-sm", cl::Hidden,
-           cl::desc("NVPTX Specific: Override SM ID for sm_next."),
-           cl::init(90));
-static cl::opt<unsigned>
-    NextPTX("nvptx-next-ptx", cl::Hidden,
-            cl::desc("NVPTX Specific: Override PTX version for sm_next."),
-            cl::init(85));
+    CustomPTX("nvptx-custom-ptx", cl::Hidden,
+              cl::desc("NVPTX Specific: Override PTX version for sm_custom."),
+              cl::init(0));
 
 // Pin the vtable to this file.
 void NVPTXSubtarget::anchor() {}
@@ -45,10 +46,23 @@ NVPTXSubtarget &NVPTXSubtarget::initializeSubtargetDependencies(StringRef CPU,
   TargetName = std::string(CPU.empty() ? "sm_30" : CPU);
 
   ParseSubtargetFeatures(TargetName, /*TuneCPU*/ TargetName, FS);
-  if (TargetName == "sm_next") {
-    TargetName = "sm_" + itostr(NextSM);
-    FullSmVersion = NextSM * 10;
-    PTXVersion = NextPTX;
+  if (TargetName == "sm_custom") {
+    if (CustomSM.empty() || CustomPTX == 0)
+      llvm::report_fatal_error("Target sm_custom requires --nvptx-custom-sm "
+                               "and --nvptx-custom-ptx to be specified.",
+                               false);
+    StringRef IDS(CustomSM);
+    if (!IDS.starts_with("sm_"))
+      llvm::report_fatal_error("Custom SM name must begin with 'sm_'");
+    IDS = IDS.drop_front(3);
+    if (IDS.ends_with("a"))
+      IDS = IDS.drop_back(1);
+    unsigned SMID;
+    if (IDS.getAsInteger(10, SMID))
+      llvm::report_fatal_error("Invalid custom SM format. Must be sm_NNN[a]");
+    TargetName = CustomSM;
+    FullSmVersion = SMID * 10;
+    PTXVersion = CustomPTX;
   }
 
   // Re-map SM version numbers, SmVersion carries the regular SMs which do

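With the rename to sm_custom in this revision, a hypothetical end-to-end
invocation (values and file name again invented for illustration) becomes:

  clang++ --offload-arch=sm_custom --cuda-custom-sm=sm_100a --cuda-custom-ptx=86 -c kernel.cu

Unlike the earlier sm_next version of the patch, --cuda-custom-sm now takes
the full "sm_NNN[a]" name rather than a bare number, and compiling for
sm_custom without the overrides is rejected with the new
err_drv_sm_custom_args diagnostic. A trailing "a" suffix is accepted and
ignored when computing __CUDA_ARCH__, so sm_100a would yield 1000 here.
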
>From ac23908742b1c48dfcbda921f077fb2a8941fbc2 Mon Sep 17 00:00:00 2001
From: Artem Belevich <artemb at gmail.com>
Date: Mon, 12 Aug 2024 13:37:40 -0700
Subject: [PATCH 4/4] Discard redundant ().

Co-authored-by: Joseph Huber <huberjn at outlook.com>
---
 clang/lib/Driver/ToolChains/Cuda.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp
index b8ba780686c357..c1e99ffb9be135 100644
--- a/clang/lib/Driver/ToolChains/Cuda.cpp
+++ b/clang/lib/Driver/ToolChains/Cuda.cpp
@@ -565,7 +565,7 @@ void NVPTX::FatBinary::ConstructJob(Compilation &C, const JobAction &JA,
     std::string Arch = [&]() -> std::string {
       bool IsAsm = II.getType() == types::TY_PP_Asm;
       if (gpu_arch != OffloadArch::SM_custom)
-        return (IsAsm) ? OffloadArchToVirtualArchString(gpu_arch)
+        return IsAsm ? OffloadArchToVirtualArchString(gpu_arch)
                        : gpu_arch_str;
       std::string CustomSM = getCustomSM(Args);
       if (IsAsm) {


