[clang] [llvm] [CUDA] Add a pseudo GPU sm_next which allows overriding for SM/PTX version. (PR #100247)

Fri Aug 16 11:36:03 PDT 2024

https://github.com/Artem-B updated https://github.com/llvm/llvm-project/pull/100247

>From da1ac9d36bd284dc607b7366ff83ba556fb64fb5 Mon Sep 17 00:00:00 2001
From: Artem Belevich <tra at google.com>
Date: Thu, 18 Jul 2024 15:05:01 -0700
Subject: [PATCH 1/2] [CUDA] Add a pseudo GPU sm_next which allows overrides
 for SM/PTX versions.

Sometimes users may need to use older clang with newer SM/PTX versions
which clang does not know anything about, yet.

--offload-arch=sm_custom (the pseudo GPU the subject line calls sm_next),
combined with --cuda-custom-sm=X and --cuda-custom-ptx=Y, passes the
specified SM and PTX versions down to ptxas, which may be able to make
sense of them. Or not; it is up to the user to figure out which values
they need to use to make it work.

The feature is intended to be a stop-gap workaround for situations when
clang didn't catch up to the newer CUDA SDK releases yet.
No guarantees that it will work with any given combination of
clang/CUDA/SM/PTX versions. YMMV.
---
 clang/include/clang/Basic/Cuda.h              |  44 ++++++
 .../clang/Basic/DiagnosticDriverKinds.td      |   2 +
 clang/include/clang/Basic/LangOptions.h       |   4 +
 clang/include/clang/Driver/Options.td         |  11 ++
 clang/include/clang/Driver/ToolChain.h        |   7 +
 clang/lib/Basic/Cuda.cpp                      |  95 ++++++++----
 clang/lib/Basic/Targets/NVPTX.cpp             |   6 +-
 clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp      |   1 +
 clang/lib/Driver/ToolChains/Clang.cpp         |   9 +-
 clang/lib/Driver/ToolChains/Clang.h           |   9 +-
 clang/lib/Driver/ToolChains/Cuda.cpp          | 138 ++++++++++--------
 clang/lib/Driver/ToolChains/Cuda.h            |   3 +
 clang/test/Driver/cuda-detect.cu              |  15 +-
 clang/test/Driver/cuda-sm_next.cu             |  49 +++++++
 clang/test/Misc/target-invalid-cpu-note.c     |   2 +-
 llvm/lib/Target/NVPTX/NVPTX.td                |   4 +
 llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp      |  50 +++++--
 17 files changed, 332 insertions(+), 117 deletions(-)
 create mode 100644 clang/test/Driver/cuda-sm_next.cu

diff --git a/clang/include/clang/Basic/Cuda.h b/clang/include/clang/Basic/Cuda.h
index 83699f8897f663..fa025d32b67ae1 100644
--- a/clang/include/clang/Basic/Cuda.h
+++ b/clang/include/clang/Basic/Cuda.h
@@ -9,6 +9,7 @@
 #ifndef LLVM_CLANG_BASIC_CUDA_H
 #define LLVM_CLANG_BASIC_CUDA_H
 
+#include "llvm/ADT/StringRef.h"
 namespace llvm {
 class StringRef;
 class Twine;
@@ -52,6 +53,42 @@ const char *CudaVersionToString(CudaVersion V);
 // Input is "Major.Minor"
 CudaVersion CudaStringToVersion(const llvm::Twine &S);
 
+enum class PTXVersion {
+  PTX_UNKNOWN = 0,
+  PTX_32 = 32,
+  PTX_40 = 40,
+  PTX_41,
+  PTX_42,
+  PTX_43,
+  PTX_50 = 50,
+  PTX_60 = 60,
+  PTX_61,
+  PTX_62,
+  PTX_63,
+  PTX_64,
+  PTX_65,
+  PTX_70 = 70,
+  PTX_71,
+  PTX_72,
+  PTX_73,
+  PTX_74,
+  PTX_75,
+  PTX_76,
+  PTX_77,
+  PTX_78,
+  PTX_80 = 80,
+  PTX_81,
+  PTX_82,
+  PTX_83,
+  PTX_84,
+  PTX_85,
+  PTX_LAST = PTX_85,
+  PTX_custom = 9999, // placeholder for an unknown future version.
+};
+
+const std::string PTXVersionToFeature(PTXVersion V);
+PTXVersion GetRequiredPTXVersion(CudaVersion V);
+
 enum class OffloadArch {
   UNUSED,
   UNKNOWN,
@@ -78,6 +115,7 @@ enum class OffloadArch {
   SM_89,
   SM_90,
   SM_90a,
+  SM_custom,
   GFX600,
   GFX601,
   GFX602,
@@ -160,6 +198,12 @@ const char *OffloadArchToVirtualArchString(OffloadArch A);
 // The input should have the form "sm_20".
 OffloadArch StringToOffloadArch(llvm::StringRef S);
 
+// Converts custom SM name to its numeric value to be used in __CUDA_ARCH__
+// Custom SM name format: `sm_[ID][suffix]`.
+// The function returns `ID`*10 or zero on error.
+// `suffix` must be empty or `a`; any other suffix is an error (returns 0).
+unsigned CUDACustomSMToArchID(llvm::StringRef S);
+
 /// Get the earliest CudaVersion that supports the given OffloadArch.
 CudaVersion MinVersionForOffloadArch(OffloadArch A);
 
diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td
index 92a602829933ce..57a407209253f8 100644
--- a/clang/include/clang/Basic/DiagnosticDriverKinds.td
+++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td
@@ -743,6 +743,8 @@ def err_drv_invalid_or_unsupported_offload_target : Error<
   "invalid or unsupported offload target: '%0'">;
 def err_drv_cuda_offload_only_emit_bc : Error<
   "CUDA offload target is supported only along with --emit-llvm">;
+def err_drv_sm_custom_args : Error<
+  "offload target sm_custom requires both --cuda-custom_sm and --cuda_custom_ptx to be specified">;
 
 def warn_drv_jmc_requires_debuginfo : Warning<
   "%0 requires debug info. Use %1 or debug options that enable debugger's "
diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h
index 91f1c2f2e6239e..927fd8fde5f771 100644
--- a/clang/include/clang/Basic/LangOptions.h
+++ b/clang/include/clang/Basic/LangOptions.h
@@ -579,6 +579,10 @@ class LangOptions : public LangOptionsBase {
   // WebAssembly target.
   bool NoWasmOpt = false;
 
+  // Overrides for the custom SM/PTX variants for CUDA's sm_custom target.
+  std::string CUDACustomSM;
+  unsigned CUDACustomPTX = 0;
+
   LangOptions();
 
   /// Set language defaults for the given input language and
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 0b38139bd27972..649e3055e005b4 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -1458,6 +1458,17 @@ def fno_hip_emit_relocatable : Flag<["-"], "fno-hip-emit-relocatable">,
   HelpText<"Do not override toolchain to compile HIP source to relocatable">;
 }
 
+def cuda_custom_sm_EQ : Joined<["--"], "cuda-custom-sm=">,
+  Visibility<[ClangOption, CC1Option]>,
+  HelpText<"SM version to use for sm_custom GPU">,
+  MarshallingInfoString<LangOpts<"CUDACustomSM">>,
+  ShouldParseIf<cuda.KeyPath>,Flags<[HelpHidden]>;
+def cuda_custom_ptx_EQ : Joined<["--"], "cuda-custom-ptx=">,
+  Visibility<[ClangOption, CC1Option]>,
+  HelpText<"PTX version to use for sm_custom GPU">,
+  MarshallingInfoInt<LangOpts<"CUDACustomPTX">, "0">,
+  ShouldParseIf<cuda.KeyPath>,Flags<[HelpHidden]>;
+
 // Clang specific/exclusive options for OpenACC.
 def openacc_macro_override
     : Separate<["-"], "fexperimental-openacc-macro-override">,
diff --git a/clang/include/clang/Driver/ToolChain.h b/clang/include/clang/Driver/ToolChain.h
index 5347e29be91439..508114ff51ffb1 100644
--- a/clang/include/clang/Driver/ToolChain.h
+++ b/clang/include/clang/Driver/ToolChain.h
@@ -677,6 +677,13 @@ class ToolChain {
   virtual void addClangTargetOptions(const llvm::opt::ArgList &DriverArgs,
                                      llvm::opt::ArgStringList &CC1Args,
                                      Action::OffloadKind DeviceOffloadKind) const;
+  /// [optional] Some toolchains may need more info and need to pass JobAction.
+  /// This is only intended to augment the function above.
+  virtual void addClangTargetOptions(const llvm::opt::ArgList &DriverArgs,
+                                     llvm::opt::ArgStringList &CC1Args,
+                                     const JobAction &JC) const {
+    addClangTargetOptions(DriverArgs, CC1Args, JC.getOffloadingDeviceKind());
+  }
 
   /// Add options that need to be passed to cc1as for this target.
   virtual void
diff --git a/clang/lib/Basic/Cuda.cpp b/clang/lib/Basic/Cuda.cpp
index faf3878f064d20..832740ae1a4963 100644
--- a/clang/lib/Basic/Cuda.cpp
+++ b/clang/lib/Basic/Cuda.cpp
@@ -3,6 +3,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormatVariadic.h"
 #include "llvm/Support/VersionTuple.h"
 
 namespace clang {
@@ -11,40 +12,43 @@ struct CudaVersionMapEntry {
   const char *Name;
   CudaVersion Version;
   llvm::VersionTuple TVersion;
+  PTXVersion PTX;
 };
-#define CUDA_ENTRY(major, minor)                                               \
+#define CUDA_ENTRY(major, minor, ptx)                                          \
   {                                                                            \
     #major "." #minor, CudaVersion::CUDA_##major##minor,                       \
-        llvm::VersionTuple(major, minor)                                       \
+        llvm::VersionTuple(major, minor), PTXVersion::ptx                      \
   }
 
 static const CudaVersionMapEntry CudaNameVersionMap[] = {
-    CUDA_ENTRY(7, 0),
-    CUDA_ENTRY(7, 5),
-    CUDA_ENTRY(8, 0),
-    CUDA_ENTRY(9, 0),
-    CUDA_ENTRY(9, 1),
-    CUDA_ENTRY(9, 2),
-    CUDA_ENTRY(10, 0),
-    CUDA_ENTRY(10, 1),
-    CUDA_ENTRY(10, 2),
-    CUDA_ENTRY(11, 0),
-    CUDA_ENTRY(11, 1),
-    CUDA_ENTRY(11, 2),
-    CUDA_ENTRY(11, 3),
-    CUDA_ENTRY(11, 4),
-    CUDA_ENTRY(11, 5),
-    CUDA_ENTRY(11, 6),
-    CUDA_ENTRY(11, 7),
-    CUDA_ENTRY(11, 8),
-    CUDA_ENTRY(12, 0),
-    CUDA_ENTRY(12, 1),
-    CUDA_ENTRY(12, 2),
-    CUDA_ENTRY(12, 3),
-    CUDA_ENTRY(12, 4),
-    CUDA_ENTRY(12, 5),
-    {"", CudaVersion::NEW, llvm::VersionTuple(std::numeric_limits<int>::max())},
-    {"unknown", CudaVersion::UNKNOWN, {}} // End of list tombstone.
+    CUDA_ENTRY(7, 0, PTX_42),
+    CUDA_ENTRY(7, 5, PTX_43),
+    CUDA_ENTRY(8, 0, PTX_50),
+    CUDA_ENTRY(9, 0, PTX_60),
+    CUDA_ENTRY(9, 1, PTX_61),
+    CUDA_ENTRY(9, 2, PTX_62),
+    CUDA_ENTRY(10, 0, PTX_63),
+    CUDA_ENTRY(10, 1, PTX_64),
+    CUDA_ENTRY(10, 2, PTX_65),
+    CUDA_ENTRY(11, 0, PTX_70),
+    CUDA_ENTRY(11, 1, PTX_71),
+    CUDA_ENTRY(11, 2, PTX_72),
+    CUDA_ENTRY(11, 3, PTX_73),
+    CUDA_ENTRY(11, 4, PTX_74),
+    CUDA_ENTRY(11, 5, PTX_75),
+    CUDA_ENTRY(11, 6, PTX_76),
+    CUDA_ENTRY(11, 7, PTX_77),
+    CUDA_ENTRY(11, 8, PTX_78),
+    CUDA_ENTRY(12, 0, PTX_80),
+    CUDA_ENTRY(12, 1, PTX_81),
+    CUDA_ENTRY(12, 2, PTX_82),
+    CUDA_ENTRY(12, 3, PTX_83),
+    CUDA_ENTRY(12, 4, PTX_84),
+    CUDA_ENTRY(12, 5, PTX_85),
+    {"", CudaVersion::NEW, llvm::VersionTuple(std::numeric_limits<int>::max()),
+     PTXVersion::PTX_LAST},
+    // End of list tombstone
+    {"unknown", CudaVersion::UNKNOWN, {}, PTXVersion::PTX_42}
 };
 #undef CUDA_ENTRY
 
@@ -71,6 +75,20 @@ CudaVersion ToCudaVersion(llvm::VersionTuple Version) {
   return CudaVersion::UNKNOWN;
 }
 
+const std::string PTXVersionToFeature(PTXVersion V) {
+  if (V > PTXVersion::PTX_UNKNOWN && V <= PTXVersion::PTX_LAST)
+    return llvm::formatv("+ptx{0}", static_cast<unsigned>(V));
+  return {};
+}
+
+PTXVersion GetRequiredPTXVersion(CudaVersion V) {
+  for (auto &I : CudaNameVersionMap)
+    if (V == I.Version)
+      return I.PTX;
+
+  return PTXVersion::PTX_UNKNOWN;
+}
+
 namespace {
 struct OffloadArchToStringMap {
   OffloadArch arch;
@@ -79,9 +97,11 @@ struct OffloadArchToStringMap {
 };
 } // namespace
 
-#define SM2(sm, ca) {OffloadArch::SM_##sm, "sm_" #sm, ca}
+#define SM2(sm, ca)                                                            \
+  { OffloadArch::SM_##sm, "sm_" #sm, ca }
 #define SM(sm) SM2(sm, "compute_" #sm)
-#define GFX(gpu) {OffloadArch::GFX##gpu, "gfx" #gpu, "compute_amdgcn"}
+#define GFX(gpu)                                                               \
+  { OffloadArch::GFX##gpu, "gfx" #gpu, "compute_amdgcn" }
 static const OffloadArchToStringMap arch_names[] = {
     // clang-format off
     {OffloadArch::UNUSED, "", ""},
@@ -96,6 +116,7 @@ static const OffloadArchToStringMap arch_names[] = {
     SM(89),                          // Ada Lovelace
     SM(90),                          // Hopper
     SM(90a),                         // Hopper
+    SM(custom),                        // Placeholder for a new arch.
     GFX(600),  // gfx600
     GFX(601),  // gfx601
     GFX(602),  // gfx602
@@ -181,6 +202,18 @@ OffloadArch StringToOffloadArch(llvm::StringRef S) {
   return result->arch;
 }
 
+unsigned CUDACustomSMToArchID(llvm::StringRef S) {
+  if (!S.starts_with("sm_"))
+    return 0;
+  S = S.drop_front(3); // skip `sm_`
+  if (S.ends_with("a"))
+    S = S.drop_back(1);
+  unsigned ID;
+  if (S.getAsInteger(10, ID))
+    return 0; // We've failed to parse the SM name
+  return ID * 10;
+}
+
 CudaVersion MinVersionForOffloadArch(OffloadArch A) {
   if (A == OffloadArch::UNKNOWN)
     return CudaVersion::UNKNOWN;
@@ -221,6 +254,8 @@ CudaVersion MinVersionForOffloadArch(OffloadArch A) {
     return CudaVersion::CUDA_118;
   case OffloadArch::SM_90a:
     return CudaVersion::CUDA_120;
+  case clang::OffloadArch::SM_custom:
+    return CudaVersion::UNKNOWN;
   default:
     llvm_unreachable("invalid enum");
   }
diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp
index 43b653dc52ce0d..a245ef6dcc6cad 100644
--- a/clang/lib/Basic/Targets/NVPTX.cpp
+++ b/clang/lib/Basic/Targets/NVPTX.cpp
@@ -13,8 +13,10 @@
 #include "NVPTX.h"
 #include "Targets.h"
 #include "clang/Basic/Builtins.h"
+#include "clang/Basic/Cuda.h"
 #include "clang/Basic/MacroBuilder.h"
 #include "clang/Basic/TargetBuiltins.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringSwitch.h"
 
 using namespace clang;
@@ -180,7 +182,7 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts,
 
   if (Opts.CUDAIsDevice || Opts.OpenMPIsTargetDevice || !HostTarget) {
     // Set __CUDA_ARCH__ for the GPU specified.
-    std::string CUDAArchCode = [this] {
+    std::string CUDAArchCode = [&]() -> std::string {
       switch (GPU) {
       case OffloadArch::GFX600:
       case OffloadArch::GFX601:
@@ -281,6 +283,8 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts,
       case OffloadArch::SM_90:
       case OffloadArch::SM_90a:
         return "900";
+      case OffloadArch::SM_custom:
+        return llvm::itostr(CUDACustomSMToArchID(Opts.CUDACustomSM));
       }
       llvm_unreachable("unhandled OffloadArch");
     }();
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index 8965a14d88a6fb..cbf1de6958e056 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -2277,6 +2277,7 @@ void CGOpenMPRuntimeGPU::processRequiresDirective(const OMPRequiresDecl *D) {
       case OffloadArch::SM_89:
       case OffloadArch::SM_90:
       case OffloadArch::SM_90a:
+      case OffloadArch::SM_custom:
       case OffloadArch::GFX600:
       case OffloadArch::GFX601:
       case OffloadArch::GFX602:
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index c698d38b80e578..0e3b5342c49574 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -1658,7 +1658,8 @@ void Clang::AddARMTargetArgs(const llvm::Triple &Triple, const ArgList &Args,
   AddUnalignedAccessWarning(CmdArgs);
 }
 
-void Clang::RenderTargetOptions(const llvm::Triple &EffectiveTriple,
+void Clang::RenderTargetOptions(const JobAction &JA,
+                                const llvm::Triple &EffectiveTriple,
                                 const ArgList &Args, bool KernelOrKext,
                                 ArgStringList &CmdArgs) const {
   const ToolChain &TC = getToolChain();
@@ -5353,7 +5354,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
     CmdArgs.push_back("-disable-llvm-passes");
 
     // Render target options.
-    TC.addClangTargetOptions(Args, CmdArgs, JA.getOffloadingDeviceKind());
+    TC.addClangTargetOptions(Args, CmdArgs, JA);
 
     // reject options that shouldn't be supported in bitcode
     // also reject kernel/kext
@@ -6044,7 +6045,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
                       /*ForAS*/ false, /*IsAux*/ true);
   }
 
-  TC.addClangTargetOptions(Args, CmdArgs, JA.getOffloadingDeviceKind());
+  TC.addClangTargetOptions(Args, CmdArgs, JA);
 
   addMCModel(D, Args, Triple, RelocationModel, CmdArgs);
 
@@ -6071,7 +6072,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
     CmdArgs.push_back(Args.MakeArgString(CPU));
   }
 
-  RenderTargetOptions(Triple, Args, KernelOrKext, CmdArgs);
+  RenderTargetOptions(JA, Triple, Args, KernelOrKext, CmdArgs);
 
   // Add clang-cl arguments.
   types::ID InputType = Input.getType();
diff --git a/clang/lib/Driver/ToolChains/Clang.h b/clang/lib/Driver/ToolChains/Clang.h
index 18f6c5ed06a59a..7ff15d1a601993 100644
--- a/clang/lib/Driver/ToolChains/Clang.h
+++ b/clang/lib/Driver/ToolChains/Clang.h
@@ -45,7 +45,8 @@ class LLVM_LIBRARY_VISIBILITY Clang : public Tool {
                                const InputInfo &Output,
                                const InputInfoList &Inputs) const;
 
-  void RenderTargetOptions(const llvm::Triple &EffectiveTriple,
+  void RenderTargetOptions(const JobAction &JA,
+                           const llvm::Triple &EffectiveTriple,
                            const llvm::opt::ArgList &Args, bool KernelOrKext,
                            llvm::opt::ArgStringList &CmdArgs) const;
 
@@ -61,6 +62,8 @@ class LLVM_LIBRARY_VISIBILITY Clang : public Tool {
                               llvm::opt::ArgStringList &CmdArgs) const;
   void AddMIPSTargetArgs(const llvm::opt::ArgList &Args,
                          llvm::opt::ArgStringList &CmdArgs) const;
+  void AddNVPTXTargetArgs(const JobAction &JA, const llvm::opt::ArgList &Args,
+                          llvm::opt::ArgStringList &CmdArgs) const;
   void AddPPCTargetArgs(const llvm::opt::ArgList &Args,
                         llvm::opt::ArgStringList &CmdArgs) const;
   void AddR600TargetArgs(const llvm::opt::ArgList &Args,
@@ -94,8 +97,8 @@ class LLVM_LIBRARY_VISIBILITY Clang : public Tool {
 
   mutable std::unique_ptr<llvm::raw_fd_ostream> CompilationDatabase = nullptr;
   void DumpCompilationDatabase(Compilation &C, StringRef Filename,
-                               StringRef Target,
-                               const InputInfo &Output, const InputInfo &Input,
+                               StringRef Target, const InputInfo &Output,
+                               const InputInfo &Input,
                                const llvm::opt::ArgList &Args) const;
 
   void DumpCompilationDatabaseFragmentToDir(
diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp
index 67a427b9d5ceee..a628b521224bd6 100644
--- a/clang/lib/Driver/ToolChains/Cuda.cpp
+++ b/clang/lib/Driver/ToolChains/Cuda.cpp
@@ -116,6 +116,13 @@ CudaVersion parseCudaHFile(llvm::StringRef Input) {
   }
   return CudaVersion::UNKNOWN;
 }
+
+std::string getCustomSM(const llvm::opt::ArgList &DriverArgs) {
+  return DriverArgs
+      .getLastArgValue(options::OPT_cuda_custom_sm_EQ,
+                       StringRef(OffloadArchToString(OffloadArch::CudaDefault)))
+      .str();
+}
 } // namespace
 
 void CudaInstallationDetector::WarnIfUnsupportedVersion() {
@@ -457,7 +464,9 @@ void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA,
     CmdArgs.push_back("-v");
 
   CmdArgs.push_back("--gpu-name");
-  CmdArgs.push_back(Args.MakeArgString(OffloadArchToString(gpu_arch)));
+  CmdArgs.push_back(Args.MakeArgString(gpu_arch == OffloadArch::SM_custom
+                                           ? getCustomSM(Args)
+                                           : OffloadArchToString(gpu_arch)));
   CmdArgs.push_back("--output-file");
   std::string OutputFileName = TC.getInputFilename(Output);
 
@@ -553,9 +562,23 @@ void NVPTX::FatBinary::ConstructJob(Compilation &C, const JobAction &JA,
       continue;
     // We need to pass an Arch of the form "sm_XX" for cubin files and
     // "compute_XX" for ptx.
-    const char *Arch = (II.getType() == types::TY_PP_Asm)
-                           ? OffloadArchToVirtualArchString(gpu_arch)
-                           : gpu_arch_str;
+    std::string Arch = [&]() -> std::string {
+      bool IsAsm = II.getType() == types::TY_PP_Asm;
+      if (gpu_arch != OffloadArch::SM_custom)
+        return IsAsm ? OffloadArchToVirtualArchString(gpu_arch)
+                       : gpu_arch_str;
+      std::string CustomSM = getCustomSM(Args);
+      if (IsAsm) {
+        StringRef SMID = CustomSM;
+        if (SMID.starts_with("sm_")) // Should always be true
+          SMID = SMID.drop_front(3);
+        else
+          C.getDriver().Diag(diag::err_drv_invalid_value_with_suggestion)
+              << "--cuda-custom-sm" << CustomSM << "sm_*";
+        return formatv("compute_{0}", SMID);
+      }
+      return CustomSM;
+    }();
     CmdArgs.push_back(
         Args.MakeArgString(llvm::Twine("--image=profile=") + Arch +
                            ",file=" + getToolChain().getInputFilename(II)));
@@ -664,38 +687,9 @@ void NVPTX::getNVPTXTargetFeatures(const Driver &D, const llvm::Triple &Triple,
   // New CUDA versions often introduce new instructions that are only supported
   // by new PTX version, so we need to raise PTX level to enable them in NVPTX
   // back-end.
-  const char *PtxFeature = nullptr;
-  switch (CudaInstallation.version()) {
-#define CASE_CUDA_VERSION(CUDA_VER, PTX_VER)                                   \
-  case CudaVersion::CUDA_##CUDA_VER:                                           \
-    PtxFeature = "+ptx" #PTX_VER;                                              \
-    break;
-    CASE_CUDA_VERSION(125, 85);
-    CASE_CUDA_VERSION(124, 84);
-    CASE_CUDA_VERSION(123, 83);
-    CASE_CUDA_VERSION(122, 82);
-    CASE_CUDA_VERSION(121, 81);
-    CASE_CUDA_VERSION(120, 80);
-    CASE_CUDA_VERSION(118, 78);
-    CASE_CUDA_VERSION(117, 77);
-    CASE_CUDA_VERSION(116, 76);
-    CASE_CUDA_VERSION(115, 75);
-    CASE_CUDA_VERSION(114, 74);
-    CASE_CUDA_VERSION(113, 73);
-    CASE_CUDA_VERSION(112, 72);
-    CASE_CUDA_VERSION(111, 71);
-    CASE_CUDA_VERSION(110, 70);
-    CASE_CUDA_VERSION(102, 65);
-    CASE_CUDA_VERSION(101, 64);
-    CASE_CUDA_VERSION(100, 63);
-    CASE_CUDA_VERSION(92, 61);
-    CASE_CUDA_VERSION(91, 61);
-    CASE_CUDA_VERSION(90, 60);
-#undef CASE_CUDA_VERSION
-  default:
-    PtxFeature = "+ptx42";
-  }
-  Features.push_back(PtxFeature);
+  std::string PtxFeature =
+      PTXVersionToFeature(GetRequiredPTXVersion(CudaInstallation.version()));
+  Features.push_back(Args.MakeArgString(PtxFeature));
 }
 
 /// NVPTX toolchain. Our assembler is ptxas, and our linker is nvlink. This
@@ -851,47 +845,71 @@ void CudaToolChain::addClangTargetOptions(
       CC1Args.push_back("-fcuda-allow-variadic-functions");
   }
 
-  if (DriverArgs.hasArg(options::OPT_nogpulib))
-    return;
-
   if (DeviceOffloadingKind == Action::OFK_OpenMP &&
       DriverArgs.hasArg(options::OPT_S))
     return;
 
-  std::string LibDeviceFile = CudaInstallation.getLibDeviceFile(GpuArch);
-  if (LibDeviceFile.empty()) {
-    getDriver().Diag(diag::err_drv_no_cuda_libdevice) << GpuArch;
-    return;
-  }
+  clang::CudaVersion CudaInstallationVersion = CudaInstallation.version();
 
-  CC1Args.push_back("-mlink-builtin-bitcode");
-  CC1Args.push_back(DriverArgs.MakeArgString(LibDeviceFile));
+  if (!DriverArgs.hasArg(options::OPT_nogpulib)) {
+    std::string LibDeviceFile = CudaInstallation.getLibDeviceFile(GpuArch);
+    if (LibDeviceFile.empty()) {
+      getDriver().Diag(diag::err_drv_no_cuda_libdevice) << GpuArch;
+      return;
+    }
 
-  clang::CudaVersion CudaInstallationVersion = CudaInstallation.version();
+    CC1Args.push_back("-mlink-builtin-bitcode");
+    CC1Args.push_back(DriverArgs.MakeArgString(LibDeviceFile));
+
+    if (DeviceOffloadingKind == Action::OFK_OpenMP) {
+      if (CudaInstallationVersion < CudaVersion::CUDA_92) {
+        getDriver().Diag(
+            diag::err_drv_omp_offload_target_cuda_version_not_support)
+            << CudaVersionToString(CudaInstallationVersion);
+        return;
+      }
+
+      // Link the bitcode library late if we're using device LTO.
+      if (getDriver().isUsingOffloadLTO())
+        return;
+
+      addOpenMPDeviceRTL(getDriver(), DriverArgs, CC1Args, GpuArch.str(),
+                         getTriple(), HostTC);
+    }
+  }
 
   if (DriverArgs.hasFlag(options::OPT_fcuda_short_ptr,
                          options::OPT_fno_cuda_short_ptr, false))
     CC1Args.append({"-mllvm", "--nvptx-short-ptr"});
 
-  if (CudaInstallationVersion >= CudaVersion::UNKNOWN)
+  if (CudaInstallation.isValid() &&
+      CudaInstallationVersion > CudaVersion::UNKNOWN)
     CC1Args.push_back(
         DriverArgs.MakeArgString(Twine("-target-sdk-version=") +
                                  CudaVersionToString(CudaInstallationVersion)));
+}
 
-  if (DeviceOffloadingKind == Action::OFK_OpenMP) {
-    if (CudaInstallationVersion < CudaVersion::CUDA_92) {
-      getDriver().Diag(
-          diag::err_drv_omp_offload_target_cuda_version_not_support)
-          << CudaVersionToString(CudaInstallationVersion);
-      return;
-    }
+void CudaToolChain::addClangTargetOptions(const llvm::opt::ArgList &DriverArgs,
+                                          llvm::opt::ArgStringList &CC1Args,
+                                          const JobAction &JA) const {
+  addClangTargetOptions(DriverArgs, CC1Args, JA.getOffloadingDeviceKind());
 
-    // Link the bitcode library late if we're using device LTO.
-    if (getDriver().isUsingOffloadLTO())
-      return;
+  if (StringRef(JA.getOffloadingArch()) == "sm_custom") {
 
-    addOpenMPDeviceRTL(getDriver(), DriverArgs, CC1Args, GpuArch.str(),
-                       getTriple(), HostTC);
+    std::string CustomSM = getCustomSM(DriverArgs);
+    StringRef CustomPTX =
+        DriverArgs.getLastArgValue(options::OPT_cuda_custom_ptx_EQ);
+    if (CustomSM.empty() || CustomPTX.empty()) {
+      JA.getOffloadingToolChain()->getDriver().Diag(
+          diag::err_drv_sm_custom_args);
+    }
+    CC1Args.append(
+        {// Needed by preprocessor for __CUDA_ARCH__
+         DriverArgs.MakeArgStringRef("--cuda-custom-sm=" + CustomSM),
+         // Overrides target SM in LLVM
+         "-mllvm", DriverArgs.MakeArgString(("--nvptx-custom-sm=" + CustomSM)),
+         "-mllvm",
+         DriverArgs.MakeArgString(("--nvptx-custom-ptx=" + CustomPTX))});
   }
 }
 
diff --git a/clang/lib/Driver/ToolChains/Cuda.h b/clang/lib/Driver/ToolChains/Cuda.h
index 7a6a6fb209012e..679de56693a072 100644
--- a/clang/lib/Driver/ToolChains/Cuda.h
+++ b/clang/lib/Driver/ToolChains/Cuda.h
@@ -204,6 +204,9 @@ class LLVM_LIBRARY_VISIBILITY CudaToolChain : public NVPTXToolChain {
   addClangTargetOptions(const llvm::opt::ArgList &DriverArgs,
                         llvm::opt::ArgStringList &CC1Args,
                         Action::OffloadKind DeviceOffloadKind) const override;
+  void addClangTargetOptions(const llvm::opt::ArgList &DriverArgs,
+                             llvm::opt::ArgStringList &CC1Args,
+                             const JobAction &JA) const override;
 
   llvm::DenormalMode getDefaultDenormalModeForType(
       const llvm::opt::ArgList &DriverArgs, const JobAction &JA,
diff --git a/clang/test/Driver/cuda-detect.cu b/clang/test/Driver/cuda-detect.cu
index 23b6ba2fcc09d9..4adc6cb15734b0 100644
--- a/clang/test/Driver/cuda-detect.cu
+++ b/clang/test/Driver/cuda-detect.cu
@@ -46,16 +46,16 @@
 // RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_21 \
 // RUN:   --cuda-path=%S/Inputs/CUDA_80/usr/local/cuda %s 2>&1 \
 // RUN:   | FileCheck %s -check-prefix COMMON \
-// RUN:     -check-prefixes PTX42,LIBDEVICE,LIBDEVICE20
+// RUN:     -check-prefixes PTX50,LIBDEVICE,LIBDEVICE20
 // RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_32 \
 // RUN:   --cuda-path=%S/Inputs/CUDA_80/usr/local/cuda %s 2>&1 \
 // RUN:   | FileCheck %s -check-prefix COMMON \
-// RUN:     -check-prefixes PTX42,LIBDEVICE,LIBDEVICE20
+// RUN:     -check-prefixes PTX50,LIBDEVICE,LIBDEVICE20
 // sm_30, sm_6x map to compute_30.
 // RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_30 \
 // RUN:   --cuda-path=%S/Inputs/CUDA_80/usr/local/cuda %s 2>&1 \
 // RUN:   | FileCheck %s -check-prefix COMMON \
-// RUN:     -check-prefixes PTX42,LIBDEVICE,LIBDEVICE30
+// RUN:     -check-prefixes PTX50,LIBDEVICE,LIBDEVICE30
 // sm_5x is a special case. Maps to compute_30 for cuda-7.x only.
 // RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_50 \
 // RUN:   --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 \
@@ -64,21 +64,21 @@
 // RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_60 \
 // RUN:   --cuda-path=%S/Inputs/CUDA_80/usr/local/cuda %s 2>&1 \
 // RUN:   | FileCheck %s -check-prefix COMMON \
-// RUN:     -check-prefixes PTX42,LIBDEVICE,LIBDEVICE30
+// RUN:     -check-prefixes PTX50,LIBDEVICE,LIBDEVICE30
 // sm_35 and sm_37 -> compute_35
 // RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_35 \
 // RUN:   --cuda-path=%S/Inputs/CUDA_80/usr/local/cuda %s 2>&1 \
 // RUN:   | FileCheck %s -check-prefix COMMON -check-prefix CUDAINC \
-// RUN:     -check-prefixes PTX42,LIBDEVICE,LIBDEVICE35
+// RUN:     -check-prefixes PTX50,LIBDEVICE,LIBDEVICE35
 // RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_37 \
 // RUN:   --cuda-path=%S/Inputs/CUDA_80/usr/local/cuda %s 2>&1 \
 // RUN:   | FileCheck %s -check-prefix COMMON -check-prefix CUDAINC \
-// RUN:     -check-prefixes PTX42,LIBDEVICE,LIBDEVICE35
+// RUN:     -check-prefixes PTX50,LIBDEVICE,LIBDEVICE35
 // sm_5x -> compute_50 for CUDA-8.0 and newer.
 // RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_50 \
 // RUN:   --cuda-path=%S/Inputs/CUDA_80/usr/local/cuda %s 2>&1 \
 // RUN:   | FileCheck %s -check-prefix COMMON \
-// RUN:     -check-prefixes PTX42,LIBDEVICE,LIBDEVICE50
+// RUN:     -check-prefixes PTX50,LIBDEVICE,LIBDEVICE50
 
 // CUDA-9+ uses the same libdevice for all GPU variants:
 // RUN: %clang -### -v --target=x86_64-unknown-linux --cuda-gpu-arch=sm_30 \
@@ -173,6 +173,7 @@
 // LIBDEVICE35-SAME: libdevice.compute_35.10.bc
 // LIBDEVICE50-SAME: libdevice.compute_50.10.bc
 // PTX42-SAME: "-target-feature" "+ptx42"
+// PTX50-SAME: "-target-feature" "+ptx50"
 // PTX60-SAME: "-target-feature" "+ptx60"
 // CUDAINC-SAME: "-include" "__clang_cuda_runtime_wrapper.h"
 // NOCUDAINC-NOT: "-include" "__clang_cuda_runtime_wrapper.h"
diff --git a/clang/test/Driver/cuda-sm_next.cu b/clang/test/Driver/cuda-sm_next.cu
new file mode 100644
index 00000000000000..fcd561492125fd
--- /dev/null
+++ b/clang/test/Driver/cuda-sm_next.cu
@@ -0,0 +1,49 @@
+// Tests CUDA compilation targeting sm_custom
+
+// CC1 options level check.
+// Check that sm_custom on its own is rejected: it requires explicitly
+// overriding SM/PTX versions.
+// RUN: not %clang -### -c --target=x86_64-linux-gnu --cuda-device-only  \
+// RUN:    --cuda-path=%S/Inputs/CUDA_111/usr/local/cuda %s \
+// RUN:    --cuda-gpu-arch=sm_custom  2>&1 \
+// RUN:   | FileCheck -check-prefixes=ERROR %s
+//
+// Check propagation of explicitly set sm and PTX versions to the tools.
+// RUN: %clang -### -c --target=x86_64-linux-gnu \
+// RUN:    --cuda-path=%S/Inputs/CUDA_111/usr/local/cuda %s \
+// RUN:    --cuda-gpu-arch=sm_custom --cuda-custom-sm=sm_111 --cuda-custom-ptx=222  2>&1 \
+// RUN:   | FileCheck -check-prefixes=ARGS %s
+
+// Preprocessor level checks.
+// RUN: %clang -dD -E --target=x86_64-linux-gnu --cuda-device-only -nocudainc \
+// RUN:    --cuda-path=%S/Inputs/CUDA_111/usr/local/cuda %s \
+// RUN:    --cuda-gpu-arch=sm_custom --cuda-custom-sm=sm_111 --cuda-custom-ptx=222  2>&1 \
+// RUN:   | FileCheck -check-prefixes=PP %s
+
+// PTX level checks. 
+// RUN: %clang -S --target=x86_64-linux-gnu --cuda-device-only -nocudainc -nocudalib \
+// RUN:      --cuda-path=%S/Inputs/CUDA_111/usr/local/cuda -o - %s \
+// RUN:      --cuda-gpu-arch=sm_custom --cuda-custom-sm=sm_111 --cuda-custom-ptx=222  2>&1 \
+// RUN:   | FileCheck -check-prefixes=PTX %s
+
+
+// ERROR: clang: error: offload target sm_custom requires both --cuda-custom_sm and --cuda_custom_ptx to be specified
+
+// ARGS: "-cc1" "-triple" "nvptx64-nvidia-cuda"
+// We need to pass specific SM version to CC1, so that preprocessor can set __CUDA_ARCH__ macro
+// and both SM and PTX to LLVM so it would generate correct PTX directives.
+// ARGS-SAME: "--cuda-custom-sm=sm_111" "-mllvm" "--nvptx-custom-sm=sm_111" "-mllvm" "--nvptx-custom-ptx=222"
+// ARGS-SAME: "-target-cpu" "sm_custom"
+// ARGS-SAME: "-target-feature" "+ptx71"
+// ARGS-NEXT: ptxas
+// ARGS-SAME: "--gpu-name" "sm_111"
+// ARGS-NEXT: fatbinary
+// ARGS-SAME: "--image=profile=sm_111,file= 
+// ARGS-SAME: "--image=profile=compute_111,file
+//
+//
+// PP:  #define __NVPTX__ 1
+// PP: #define __CUDA_ARCH__  1110
+//
+// PTX:  .version 22.2
+// PTX:  .target sm_111
diff --git a/clang/test/Misc/target-invalid-cpu-note.c b/clang/test/Misc/target-invalid-cpu-note.c
index b1783f3917a350..b0dc76ee43c524 100644
--- a/clang/test/Misc/target-invalid-cpu-note.c
+++ b/clang/test/Misc/target-invalid-cpu-note.c
@@ -29,7 +29,7 @@
 
 // RUN: not %clang_cc1 -triple nvptx--- -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix NVPTX
 // NVPTX: error: unknown target CPU 'not-a-cpu'
-// NVPTX-NEXT: note: valid target CPU values are: sm_20, sm_21, sm_30, sm_32, sm_35, sm_37, sm_50, sm_52, sm_53, sm_60, sm_61, sm_62, sm_70, sm_72, sm_75, sm_80, sm_86, sm_87, sm_89, sm_90, sm_90a, gfx600, gfx601, gfx602, gfx700, gfx701, gfx702, gfx703, gfx704, gfx705, gfx801, gfx802, gfx803, gfx805, gfx810, gfx9-generic, gfx900, gfx902, gfx904, gfx906, gfx908, gfx909, gfx90a, gfx90c, gfx940, gfx941, gfx942, gfx10-1-generic, gfx1010, gfx1011, gfx1012, gfx1013, gfx10-3-generic, gfx1030, gfx1031, gfx1032, gfx1033, gfx1034, gfx1035, gfx1036, gfx11-generic, gfx1100, gfx1101, gfx1102, gfx1103, gfx1150, gfx1151, gfx1152, gfx12-generic, gfx1200, gfx1201, amdgcnspirv{{$}}
+// NVPTX-NEXT: note: valid target CPU values are: sm_20, sm_21, sm_30, sm_32, sm_35, sm_37, sm_50, sm_52, sm_53, sm_60, sm_61, sm_62, sm_70, sm_72, sm_75, sm_80, sm_86, sm_87, sm_89, sm_90, sm_90a, sm_custom, gfx600, gfx601, gfx602, gfx700, gfx701, gfx702, gfx703, gfx704, gfx705, gfx801, gfx802, gfx803, gfx805, gfx810, gfx9-generic, gfx900, gfx902, gfx904, gfx906, gfx908, gfx909, gfx90a, gfx90c, gfx940, gfx941, gfx942, gfx10-1-generic, gfx1010, gfx1011, gfx1012, gfx1013, gfx10-3-generic, gfx1030, gfx1031, gfx1032, gfx1033, gfx1034, gfx1035, gfx1036, gfx11-generic, gfx1100, gfx1101, gfx1102, gfx1103, gfx1150, gfx1151, gfx1152, gfx12-generic, gfx1200, gfx1201, amdgcnspirv{{$}}
 
 // RUN: not %clang_cc1 -triple r600--- -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix R600
 // R600: error: unknown target CPU 'not-a-cpu'
diff --git a/llvm/lib/Target/NVPTX/NVPTX.td b/llvm/lib/Target/NVPTX/NVPTX.td
index bb4549a5e60782..92c460798c2839 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.td
+++ b/llvm/lib/Target/NVPTX/NVPTX.td
@@ -39,12 +39,15 @@ foreach sm = [20, 21, 30, 32, 35, 37, 50, 52, 53,
   def SM#sm: FeatureSM<""#sm, !mul(sm, 10)>;
 
 def SM90a: FeatureSM<"90a", 901>;
+def SMcustom: FeatureSM<"custom", 9999>; // Placeholder for an unknown future version
 
 foreach version = [32, 40, 41, 42, 43, 50, 60, 61, 62, 63, 64, 65,
                    70, 71, 72, 73, 74, 75, 76, 77, 78,
                    80, 81, 82, 83, 84, 85] in
   def PTX#version: FeaturePTX<version>;
 
+def PTXcustom: FeaturePTX<9999>; // Placeholder for an unknown future version.
+
 //===----------------------------------------------------------------------===//
 // NVPTX supported processors.
 //===----------------------------------------------------------------------===//
@@ -73,6 +76,7 @@ def : Proc<"sm_87", [SM87, PTX74]>;
 def : Proc<"sm_89", [SM89, PTX78]>;
 def : Proc<"sm_90", [SM90, PTX78]>;
 def : Proc<"sm_90a", [SM90a, PTX80]>;
+def : Proc<"sm_custom", [SMcustom, PTXcustom]>;
 
 def NVPTXInstrInfo : InstrInfo {
 }
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
index 420065585b3849..c2626f2f5e7717 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
@@ -12,6 +12,8 @@
 
 #include "NVPTXSubtarget.h"
 #include "NVPTXTargetMachine.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/Error.h"
 
 using namespace llvm;
 
@@ -26,25 +28,51 @@ static cl::opt<bool>
     NoF16Math("nvptx-no-f16-math", cl::Hidden,
               cl::desc("NVPTX Specific: Disable generation of f16 math ops."),
               cl::init(false));
+static cl::opt<std::string>
+    CustomSM("nvptx-custom-sm", cl::Hidden,
+             cl::desc("NVPTX Specific: Override SM ID for sm_custom."),
+             cl::init(""));
+static cl::opt<unsigned>
+    CustomPTX("nvptx-custom-ptx", cl::Hidden,
+              cl::desc("NVPTX Specific: Override PTX version for sm_custom."),
+              cl::init(0));
+
 // Pin the vtable to this file.
 void NVPTXSubtarget::anchor() {}
 
 NVPTXSubtarget &NVPTXSubtarget::initializeSubtargetDependencies(StringRef CPU,
                                                                 StringRef FS) {
-    // Provide the default CPU if we don't have one.
-    TargetName = std::string(CPU.empty() ? "sm_30" : CPU);
+  // Provide the default CPU if we don't have one.
+  TargetName = std::string(CPU.empty() ? "sm_30" : CPU);
 
-    ParseSubtargetFeatures(TargetName, /*TuneCPU*/ TargetName, FS);
+  ParseSubtargetFeatures(TargetName, /*TuneCPU*/ TargetName, FS);
+  if (TargetName == "sm_custom") {
+    if (CustomSM.empty() || CustomPTX == 0)
+      llvm::report_fatal_error("Target sm_custom requires --nvptx-custom-sm "
+                               "and --nvptx-custom-ptx to be specified.",
+                               false);
+    StringRef IDS(CustomSM);
+    if (!IDS.starts_with("sm_"))
+      llvm::report_fatal_error("Custom SM name must begin with 'sm_'");
+    IDS = IDS.drop_front(3);
+    // As with SM90a (901), an 'a' suffix is kept in FullSmVersion's low digit.
+    const bool IsArchSpecific = IDS.consume_back("a");
+    unsigned SMID;
+    if (IDS.getAsInteger(10, SMID))
+      llvm::report_fatal_error("Invalid custom SM format. Must be sm_NNN[a]");
+    TargetName = CustomSM;
+    FullSmVersion = SMID * 10 + (IsArchSpecific ? 1 : 0);
+    PTXVersion = CustomPTX;
+  }
 
-    // Re-map SM version numbers, SmVersion carries the regular SMs which do
-    // have relative order, while FullSmVersion allows distinguishing sm_90 from
-    // sm_90a, which would *not* be a subset of sm_91.
-    SmVersion = getSmVersion();
+  // Re-map SM version numbers, SmVersion carries the regular SMs which do
+  // have relative order, while FullSmVersion allows distinguishing sm_90 from
+  // sm_90a, which would *not* be a subset of sm_91.
+  SmVersion = getSmVersion();
 
-    // Set default to PTX 6.0 (CUDA 9.0)
-    if (PTXVersion == 0) {
-      PTXVersion = 60;
-  }
+  // Set default to PTX 6.0 (CUDA 9.0)
+  if (PTXVersion == 0)
+    PTXVersion = 60;
 
   return *this;
 }

>From dfdbc01f5aae9c89665384d651e1510c63df1dfa Mon Sep 17 00:00:00 2001
From: Artem Belevich <tra at google.com>
Date: Mon, 12 Aug 2024 15:53:57 -0700
Subject: [PATCH 2/2] clang-format

---
 clang/lib/Basic/Cuda.cpp             | 15 +++++----------
 clang/lib/Driver/ToolChains/Cuda.cpp |  3 +--
 2 files changed, 6 insertions(+), 12 deletions(-)

diff --git a/clang/lib/Basic/Cuda.cpp b/clang/lib/Basic/Cuda.cpp
index 832740ae1a4963..d3e6b93ee58a63 100644
--- a/clang/lib/Basic/Cuda.cpp
+++ b/clang/lib/Basic/Cuda.cpp
@@ -15,10 +15,8 @@ struct CudaVersionMapEntry {
   PTXVersion PTX;
 };
 #define CUDA_ENTRY(major, minor, ptx)                                          \
-  {                                                                            \
-    #major "." #minor, CudaVersion::CUDA_##major##minor,                       \
-        llvm::VersionTuple(major, minor), PTXVersion::ptx                      \
-  }
+  {#major "." #minor, CudaVersion::CUDA_##major##minor,                        \
+   llvm::VersionTuple(major, minor), PTXVersion::ptx}
 
 static const CudaVersionMapEntry CudaNameVersionMap[] = {
     CUDA_ENTRY(7, 0, PTX_42),
@@ -48,8 +46,7 @@ static const CudaVersionMapEntry CudaNameVersionMap[] = {
     {"", CudaVersion::NEW, llvm::VersionTuple(std::numeric_limits<int>::max()),
      PTXVersion::PTX_LAST},
     // End of list tombstone
-    {"unknown", CudaVersion::UNKNOWN, {}, PTXVersion::PTX_42}
-};
+    {"unknown", CudaVersion::UNKNOWN, {}, PTXVersion::PTX_42}};
 #undef CUDA_ENTRY
 
 const char *CudaVersionToString(CudaVersion V) {
@@ -97,11 +94,9 @@ struct OffloadArchToStringMap {
 };
 } // namespace
 
-#define SM2(sm, ca)                                                            \
-  { OffloadArch::SM_##sm, "sm_" #sm, ca }
+#define SM2(sm, ca) {OffloadArch::SM_##sm, "sm_" #sm, ca}
 #define SM(sm) SM2(sm, "compute_" #sm)
-#define GFX(gpu)                                                               \
-  { OffloadArch::GFX##gpu, "gfx" #gpu, "compute_amdgcn" }
+#define GFX(gpu) {OffloadArch::GFX##gpu, "gfx" #gpu, "compute_amdgcn"}
 static const OffloadArchToStringMap arch_names[] = {
     // clang-format off
     {OffloadArch::UNUSED, "", ""},
diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp
index a628b521224bd6..518b7d929c10a5 100644
--- a/clang/lib/Driver/ToolChains/Cuda.cpp
+++ b/clang/lib/Driver/ToolChains/Cuda.cpp
@@ -565,8 +565,7 @@ void NVPTX::FatBinary::ConstructJob(Compilation &C, const JobAction &JA,
     std::string Arch = [&]() -> std::string {
       bool IsAsm = II.getType() == types::TY_PP_Asm;
       if (gpu_arch != OffloadArch::SM_custom)
-        return IsAsm ? OffloadArchToVirtualArchString(gpu_arch)
-                       : gpu_arch_str;
+        return IsAsm ? OffloadArchToVirtualArchString(gpu_arch) : gpu_arch_str;
       std::string CustomSM = getCustomSM(Args);
       if (IsAsm) {
         StringRef SMID = CustomSM;