[llvm] [clang] [CUDA] Add support for CUDA-12.3 and sm_90a (PR #74895)

Fri Dec 8 15:57:37 PST 2023

https://github.com/Artem-B updated https://github.com/llvm/llvm-project/pull/74895

>From 3ce8e08b94e33480139e13ca9f0fd7b719ff2c3d Mon Sep 17 00:00:00 2001
From: Artem Belevich <tra at google.com>
Date: Wed, 6 Dec 2023 12:11:38 -0800
Subject: [PATCH 1/3] [CUDA] Add support for CUDA-12.3 and sm_90a

---
 clang/docs/ReleaseNotes.rst                 |  3 +++
 clang/include/clang/Basic/BuiltinsNVPTX.def | 13 +++++++++++--
 clang/include/clang/Basic/Cuda.h            |  7 +++++--
 clang/lib/Basic/Cuda.cpp                    |  5 +++++
 clang/lib/Basic/Targets/NVPTX.cpp           |  3 +++
 clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp    |  1 +
 clang/lib/Driver/ToolChains/Cuda.cpp        |  6 ++++++
 clang/test/Misc/target-invalid-cpu-note.c   |  2 +-
 llvm/lib/Target/NVPTX/NVPTX.td              | 19 ++++++++++---------
 llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp    |  7 ++++++-
 llvm/lib/Target/NVPTX/NVPTXSubtarget.h      | 11 +++++++++--
 11 files changed, 60 insertions(+), 17 deletions(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 89ea2f0930ceca..1bf68a46a64dac 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -937,6 +937,9 @@ CUDA/HIP Language Changes
 CUDA Support
 ^^^^^^^^^^^^
 
+- Clang now supports CUDA SDK up to 12.3
+- Added support for sm_90a
+
 AIX Support
 ^^^^^^^^^^^
 
diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def
index d74a7d1e55dd28..0f2e8260143be7 100644
--- a/clang/include/clang/Basic/BuiltinsNVPTX.def
+++ b/clang/include/clang/Basic/BuiltinsNVPTX.def
@@ -26,7 +26,9 @@
 #pragma push_macro("SM_87")
 #pragma push_macro("SM_89")
 #pragma push_macro("SM_90")
-#define SM_90 "sm_90"
+#pragma push_macro("SM_90a")
+#define SM_90a "sm_90a"
+#define SM_90 "sm_90|" SM_90a
 #define SM_89 "sm_89|" SM_90
 #define SM_87 "sm_87|" SM_89
 #define SM_86 "sm_86|" SM_87
@@ -56,7 +58,11 @@
 #pragma push_macro("PTX78")
 #pragma push_macro("PTX80")
 #pragma push_macro("PTX81")
-#define PTX81 "ptx81"
+#pragma push_macro("PTX82")
+#pragma push_macro("PTX83")
+#define PTX83 "ptx83"
+#define PTX82 "ptx82|" PTX83
+#define PTX81 "ptx81|" PTX82
 #define PTX80 "ptx80|" PTX81
 #define PTX78 "ptx78|" PTX80
 #define PTX77 "ptx77|" PTX78
@@ -1055,6 +1061,7 @@ TARGET_BUILTIN(__nvvm_getctarank_shared_cluster, "iv*3", "", AND(SM_90,PTX78))
 #pragma pop_macro("SM_87")
 #pragma pop_macro("SM_89")
 #pragma pop_macro("SM_90")
+#pragma pop_macro("SM_90a")
 #pragma pop_macro("PTX42")
 #pragma pop_macro("PTX60")
 #pragma pop_macro("PTX61")
@@ -1072,3 +1079,5 @@ TARGET_BUILTIN(__nvvm_getctarank_shared_cluster, "iv*3", "", AND(SM_90,PTX78))
 #pragma pop_macro("PTX78")
 #pragma pop_macro("PTX80")
 #pragma pop_macro("PTX81")
+#pragma pop_macro("PTX82")
+#pragma pop_macro("PTX83")
diff --git a/clang/include/clang/Basic/Cuda.h b/clang/include/clang/Basic/Cuda.h
index 2d912bdbbd1bc5..916cb4b7ef34a7 100644
--- a/clang/include/clang/Basic/Cuda.h
+++ b/clang/include/clang/Basic/Cuda.h
@@ -39,9 +39,11 @@ enum class CudaVersion {
   CUDA_118,
   CUDA_120,
   CUDA_121,
-  FULLY_SUPPORTED = CUDA_118,
+  CUDA_122,
+  CUDA_123,
+  FULLY_SUPPORTED = CUDA_123,
   PARTIALLY_SUPPORTED =
-      CUDA_121, // Partially supported. Proceed with a warning.
+      CUDA_123, // Partially supported. Proceed with a warning.
   NEW = 10000,  // Too new. Issue a warning, but allow using it.
 };
 const char *CudaVersionToString(CudaVersion V);
@@ -71,6 +73,7 @@ enum class CudaArch {
   SM_87,
   SM_89,
   SM_90,
+  SM_90a,
   GFX600,
   GFX601,
   GFX602,
diff --git a/clang/lib/Basic/Cuda.cpp b/clang/lib/Basic/Cuda.cpp
index 65840b9f20252b..1b1da6a1356f2c 100644
--- a/clang/lib/Basic/Cuda.cpp
+++ b/clang/lib/Basic/Cuda.cpp
@@ -39,6 +39,8 @@ static const CudaVersionMapEntry CudaNameVersionMap[] = {
     CUDA_ENTRY(11, 8),
     CUDA_ENTRY(12, 0),
     CUDA_ENTRY(12, 1),
+    CUDA_ENTRY(12, 2),
+    CUDA_ENTRY(12, 3),
     {"", CudaVersion::NEW, llvm::VersionTuple(std::numeric_limits<int>::max())},
     {"unknown", CudaVersion::UNKNOWN, {}} // End of list tombstone.
 };
@@ -93,6 +95,7 @@ static const CudaArchToStringMap arch_names[] = {
     SM(87),                          // Jetson/Drive AGX Orin
     SM(89),                          // Ada Lovelace
     SM(90),                          // Hopper
+    SM(90a),                         // Hopper
     GFX(600),  // gfx600
     GFX(601),  // gfx601
     GFX(602),  // gfx602
@@ -209,6 +212,8 @@ CudaVersion MinVersionForCudaArch(CudaArch A) {
   case CudaArch::SM_89:
   case CudaArch::SM_90:
     return CudaVersion::CUDA_118;
+  case CudaArch::SM_90a:
+    return CudaVersion::CUDA_120;
   default:
     llvm_unreachable("invalid enum");
   }
diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp
index 3a4a75b0348f20..5c601812f61759 100644
--- a/clang/lib/Basic/Targets/NVPTX.cpp
+++ b/clang/lib/Basic/Targets/NVPTX.cpp
@@ -262,11 +262,14 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts,
       case CudaArch::SM_89:
         return "890";
       case CudaArch::SM_90:
+      case CudaArch::SM_90a:
         return "900";
       }
       llvm_unreachable("unhandled CudaArch");
     }();
     Builder.defineMacro("__CUDA_ARCH__", CUDAArchCode);
+    if (GPU == CudaArch::SM_90a)
+      Builder.defineMacro("__CUDA_ARCH_FEAT_SM90_ALL", "1");
   }
 }
 
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index 293ccaa3413cdf..299ee1460b3db0 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -3483,6 +3483,7 @@ void CGOpenMPRuntimeGPU::processRequiresDirective(
       case CudaArch::SM_87:
       case CudaArch::SM_89:
       case CudaArch::SM_90:
+      case CudaArch::SM_90a:
       case CudaArch::GFX600:
       case CudaArch::GFX601:
       case CudaArch::GFX602:
diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp
index e95ff98e6c940f..ef1e77974c1eaa 100644
--- a/clang/lib/Driver/ToolChains/Cuda.cpp
+++ b/clang/lib/Driver/ToolChains/Cuda.cpp
@@ -78,6 +78,10 @@ CudaVersion getCudaVersion(uint32_t raw_version) {
     return CudaVersion::CUDA_120;
   if (raw_version < 12020)
     return CudaVersion::CUDA_121;
+  if (raw_version < 12030)
+    return CudaVersion::CUDA_122;
+  if (raw_version < 12040)
+    return CudaVersion::CUDA_123;
   return CudaVersion::NEW;
 }
 
@@ -671,6 +675,8 @@ void NVPTX::getNVPTXTargetFeatures(const Driver &D, const llvm::Triple &Triple,
   case CudaVersion::CUDA_##CUDA_VER:                                           \
     PtxFeature = "+ptx" #PTX_VER;                                              \
     break;
+    CASE_CUDA_VERSION(123, 83);
+    CASE_CUDA_VERSION(122, 82);
     CASE_CUDA_VERSION(121, 81);
     CASE_CUDA_VERSION(120, 80);
     CASE_CUDA_VERSION(118, 78);
diff --git a/clang/test/Misc/target-invalid-cpu-note.c b/clang/test/Misc/target-invalid-cpu-note.c
index c7146e63add5f2..5475b1d8bd052d 100644
--- a/clang/test/Misc/target-invalid-cpu-note.c
+++ b/clang/test/Misc/target-invalid-cpu-note.c
@@ -29,7 +29,7 @@
 
 // RUN: not %clang_cc1 -triple nvptx--- -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix NVPTX
 // NVPTX: error: unknown target CPU 'not-a-cpu'
-// NVPTX-NEXT: note: valid target CPU values are: sm_20, sm_21, sm_30, sm_32, sm_35, sm_37, sm_50, sm_52, sm_53, sm_60, sm_61, sm_62, sm_70, sm_72, sm_75, sm_80, sm_86, sm_87, sm_89, sm_90, gfx600, gfx601, gfx602, gfx700, gfx701, gfx702, gfx703, gfx704, gfx705, gfx801, gfx802, gfx803, gfx805, gfx810, gfx900, gfx902, gfx904, gfx906, gfx908, gfx909, gfx90a, gfx90c, gfx940, gfx941, gfx942, gfx1010, gfx1011, gfx1012, gfx1013, gfx1030, gfx1031, gfx1032, gfx1033, gfx1034, gfx1035, gfx1036, gfx1100, gfx1101, gfx1102, gfx1103, gfx1150, gfx1151, gfx1200, gfx1201{{$}}
+// NVPTX-NEXT: note: valid target CPU values are: sm_20, sm_21, sm_30, sm_32, sm_35, sm_37, sm_50, sm_52, sm_53, sm_60, sm_61, sm_62, sm_70, sm_72, sm_75, sm_80, sm_86, sm_87, sm_89, sm_90, sm_90a, gfx600, gfx601, gfx602, gfx700, gfx701, gfx702, gfx703, gfx704, gfx705, gfx801, gfx802, gfx803, gfx805, gfx810, gfx900, gfx902, gfx904, gfx906, gfx908, gfx909, gfx90a, gfx90c, gfx940, gfx941, gfx942, gfx1010, gfx1011, gfx1012, gfx1013, gfx1030, gfx1031, gfx1032, gfx1033, gfx1034, gfx1035, gfx1036, gfx1100, gfx1101, gfx1102, gfx1103, gfx1150, gfx1151, gfx1200, gfx1201{{$}}
 
 // RUN: not %clang_cc1 -triple r600--- -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix R600
 // R600: error: unknown target CPU 'not-a-cpu'
diff --git a/llvm/lib/Target/NVPTX/NVPTX.td b/llvm/lib/Target/NVPTX/NVPTX.td
index 02fa2a4ee81ec5..f2a4ce381b40b4 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.td
+++ b/llvm/lib/Target/NVPTX/NVPTX.td
@@ -24,23 +24,24 @@ include "NVPTXInstrInfo.td"
 //   TableGen in NVPTXGenSubtarget.inc.
 //===----------------------------------------------------------------------===//
 
-class FeatureSM<int version>:
-   SubtargetFeature<"sm_"# version, "SmVersion",
-                    "" # version,
-                    "Target SM " # version>;
-def SM90a: FeatureSM<90>;
+class FeatureSM<string sm, int value>:
+   SubtargetFeature<"sm_"# sm, "FullSmVersion",
+                    "" # value,
+                    "Target SM " # sm>;
 
 class FeaturePTX<int version>:
    SubtargetFeature<"ptx"# version, "PTXVersion",
                     "" # version,
                     "Use PTX version " # version>;
 
-foreach version = [20, 21, 30, 32, 35, 37, 50, 52, 53,
-                   60, 61, 62, 70, 72, 75, 80, 86, 87, 89, 90] in
-  def SM#version: FeatureSM<version>;
+foreach sm = [20, 21, 30, 32, 35, 37, 50, 52, 53,
+              60, 61, 62, 70, 72, 75, 80, 86, 87, 89, 90] in
+  def SM#sm: FeatureSM<""#sm, !mul(sm, 10)>;
+
+def SM90a: FeatureSM<"90a", 901>;
 
 foreach version = [32, 40, 41, 42, 43, 50, 60, 61, 63, 64, 65,
-                   70, 71, 72, 73, 74, 75, 76, 77, 78, 80, 81] in
+                   70, 71, 72, 73, 74, 75, 76, 77, 78, 80, 81, 82, 83] in
   def PTX#version: FeaturePTX<version>;
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
index 7fa64af196b936..420065585b3849 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
@@ -36,6 +36,11 @@ NVPTXSubtarget &NVPTXSubtarget::initializeSubtargetDependencies(StringRef CPU,
 
     ParseSubtargetFeatures(TargetName, /*TuneCPU*/ TargetName, FS);
 
+    // Re-map SM version numbers, SmVersion carries the regular SMs which do
+    // have relative order, while FullSmVersion allows distinguishing sm_90 from
+    // sm_90a, which would *not* be a subset of sm_91.
+    SmVersion = getSmVersion();
+
     // Set default to PTX 6.0 (CUDA 9.0)
     if (PTXVersion == 0) {
       PTXVersion = 60;
@@ -48,7 +53,7 @@ NVPTXSubtarget::NVPTXSubtarget(const Triple &TT, const std::string &CPU,
                                const std::string &FS,
                                const NVPTXTargetMachine &TM)
     : NVPTXGenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS), PTXVersion(0),
-      SmVersion(20), TM(TM),
+      FullSmVersion(200), SmVersion(getSmVersion()), TM(TM),
       TLInfo(TM, initializeSubtargetDependencies(CPU, FS)) {}
 
 bool NVPTXSubtarget::hasImageHandles() const {
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index 93af11c258b480..951962d1e68be8 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -35,7 +35,12 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
   // PTX version x.y is represented as 10*x+y, e.g. 3.1 == 31
   unsigned PTXVersion;
 
-  // SM version x.y is represented as 10*x+y, e.g. 3.1 == 31
+  // Full SM version x.y is represented as 100*x+10*y+feature, e.g. 3.1 == 310
+  // sm_90a == 901
+  unsigned int FullSmVersion;
+
+  // SM version x.y is represented as 10*x+y, e.g. 3.1 == 31. Derived from
+  // FullSmVersion.
   unsigned int SmVersion;
 
   const NVPTXTargetMachine &TM;
@@ -80,8 +85,10 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
   bool allowFP16Math() const;
   bool hasMaskOperator() const { return PTXVersion >= 71; }
   bool hasNoReturn() const { return SmVersion >= 30 && PTXVersion >= 64; }
-  unsigned int getSmVersion() const { return SmVersion; }
+  unsigned int getSmVersion() const { return FullSmVersion / 10; }
+  unsigned int getFullSmVersion() const { return FullSmVersion; }
   std::string getTargetName() const { return TargetName; }
+  bool isSm90a() const { return getFullSmVersion() == 901; }
 
   // Get maximum value of required alignments among the supported data types.
   // From the PTX ISA doc, section 8.2.3:

>From 6bd838c0f60e050cad79bc8b198808eb5e7c1586 Mon Sep 17 00:00:00 2001
From: Artem Belevich <tra at google.com>
Date: Fri, 8 Dec 2023 15:49:23 -0800
Subject: [PATCH 2/3] use hasAAFeatures()

---
 llvm/lib/Target/NVPTX/NVPTXSubtarget.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index 951962d1e68be8..63e3b50a09fa05 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -88,7 +88,11 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
   unsigned int getSmVersion() const { return FullSmVersion / 10; }
   unsigned int getFullSmVersion() const { return FullSmVersion; }
   std::string getTargetName() const { return TargetName; }
-  bool isSm90a() const { return getFullSmVersion() == 901; }
+
+  // GPUs with "a" suffix have include architecture-accelerated features that
+  // are supported on the specified architecture only, hence such targets do not
+  // follow the onion layer model.
+  bool hasAAFeatures() const { return getFullSmVersion() % 10; }
 
   // Get maximum value of required alignments among the supported data types.
   // From the PTX ISA doc, section 8.2.3:

>From c4e89c6fbb421ae511d555efddb7befe63b30d05 Mon Sep 17 00:00:00 2001
From: Artem Belevich <tra at google.com>
Date: Fri, 8 Dec 2023 15:56:39 -0800
Subject: [PATCH 3/3] Rearrange functions in sensible order and add more
 details to the comment.

---
 llvm/lib/Target/NVPTX/NVPTXSubtarget.h | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index 63e3b50a09fa05..a5bbf032bd4ce5 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -85,14 +85,17 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
   bool allowFP16Math() const;
   bool hasMaskOperator() const { return PTXVersion >= 71; }
   bool hasNoReturn() const { return SmVersion >= 30 && PTXVersion >= 64; }
-  unsigned int getSmVersion() const { return FullSmVersion / 10; }
   unsigned int getFullSmVersion() const { return FullSmVersion; }
-  std::string getTargetName() const { return TargetName; }
-
+  unsigned int getSmVersion() const { return getFullSmVersion() / 10; }
   // GPUs with "a" suffix have include architecture-accelerated features that
   // are supported on the specified architecture only, hence such targets do not
-  // follow the onion layer model.
+  // follow the onion layer model. hasAAFeatures() allows distinguishing such
+  // GPU variants from the base GPU architecture.
+  // - 0 represents base GPU model,
+  // - non-zero value identifies particular architecture-accelerated variant.
   bool hasAAFeatures() const { return getFullSmVersion() % 10; }
+  std::string getTargetName() const { return TargetName; }
+
 
   // Get maximum value of required alignments among the supported data types.
   // From the PTX ISA doc, section 8.2.3: