[clang] [llvm] [CUDA] Add support for CUDA-12.3 and sm_90a (PR #74895)
Artem Belevich via llvm-commits
llvm-commits at lists.llvm.org
Fri Dec 8 16:01:47 PST 2023
https://github.com/Artem-B updated https://github.com/llvm/llvm-project/pull/74895
>From 3ce8e08b94e33480139e13ca9f0fd7b719ff2c3d Mon Sep 17 00:00:00 2001
From: Artem Belevich <tra at google.com>
Date: Wed, 6 Dec 2023 12:11:38 -0800
Subject: [PATCH 1/3] [CUDA] Add support for CUDA-12.3 and sm_90a
---
clang/docs/ReleaseNotes.rst | 3 +++
clang/include/clang/Basic/BuiltinsNVPTX.def | 13 +++++++++++--
clang/include/clang/Basic/Cuda.h | 7 +++++--
clang/lib/Basic/Cuda.cpp | 5 +++++
clang/lib/Basic/Targets/NVPTX.cpp | 3 +++
clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp | 1 +
clang/lib/Driver/ToolChains/Cuda.cpp | 6 ++++++
clang/test/Misc/target-invalid-cpu-note.c | 2 +-
llvm/lib/Target/NVPTX/NVPTX.td | 19 ++++++++++---------
llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp | 7 ++++++-
llvm/lib/Target/NVPTX/NVPTXSubtarget.h | 11 +++++++++--
11 files changed, 60 insertions(+), 17 deletions(-)
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 89ea2f0930ceca..1bf68a46a64dac 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -937,6 +937,9 @@ CUDA/HIP Language Changes
CUDA Support
^^^^^^^^^^^^
+- Clang now supports CUDA SDK up to 12.3
+- Added support for sm_90a
+
AIX Support
^^^^^^^^^^^
diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def
index d74a7d1e55dd28..0f2e8260143be7 100644
--- a/clang/include/clang/Basic/BuiltinsNVPTX.def
+++ b/clang/include/clang/Basic/BuiltinsNVPTX.def
@@ -26,7 +26,9 @@
#pragma push_macro("SM_87")
#pragma push_macro("SM_89")
#pragma push_macro("SM_90")
-#define SM_90 "sm_90"
+#pragma push_macro("SM_90a")
+#define SM_90a "sm_90a"
+#define SM_90 "sm_90|" SM_90a
#define SM_89 "sm_89|" SM_90
#define SM_87 "sm_87|" SM_89
#define SM_86 "sm_86|" SM_87
@@ -56,7 +58,11 @@
#pragma push_macro("PTX78")
#pragma push_macro("PTX80")
#pragma push_macro("PTX81")
-#define PTX81 "ptx81"
+#pragma push_macro("PTX82")
+#pragma push_macro("PTX83")
+#define PTX83 "ptx83"
+#define PTX82 "ptx82|" PTX83
+#define PTX81 "ptx81|" PTX82
#define PTX80 "ptx80|" PTX81
#define PTX78 "ptx78|" PTX80
#define PTX77 "ptx77|" PTX78
@@ -1055,6 +1061,7 @@ TARGET_BUILTIN(__nvvm_getctarank_shared_cluster, "iv*3", "", AND(SM_90,PTX78))
#pragma pop_macro("SM_87")
#pragma pop_macro("SM_89")
#pragma pop_macro("SM_90")
+#pragma pop_macro("SM_90a")
#pragma pop_macro("PTX42")
#pragma pop_macro("PTX60")
#pragma pop_macro("PTX61")
@@ -1072,3 +1079,5 @@ TARGET_BUILTIN(__nvvm_getctarank_shared_cluster, "iv*3", "", AND(SM_90,PTX78))
#pragma pop_macro("PTX78")
#pragma pop_macro("PTX80")
#pragma pop_macro("PTX81")
+#pragma pop_macro("PTX82")
+#pragma pop_macro("PTX83")
diff --git a/clang/include/clang/Basic/Cuda.h b/clang/include/clang/Basic/Cuda.h
index 2d912bdbbd1bc5..916cb4b7ef34a7 100644
--- a/clang/include/clang/Basic/Cuda.h
+++ b/clang/include/clang/Basic/Cuda.h
@@ -39,9 +39,11 @@ enum class CudaVersion {
CUDA_118,
CUDA_120,
CUDA_121,
- FULLY_SUPPORTED = CUDA_118,
+ CUDA_122,
+ CUDA_123,
+ FULLY_SUPPORTED = CUDA_123,
PARTIALLY_SUPPORTED =
- CUDA_121, // Partially supported. Proceed with a warning.
+ CUDA_123, // Partially supported. Proceed with a warning.
NEW = 10000, // Too new. Issue a warning, but allow using it.
};
const char *CudaVersionToString(CudaVersion V);
@@ -71,6 +73,7 @@ enum class CudaArch {
SM_87,
SM_89,
SM_90,
+ SM_90a,
GFX600,
GFX601,
GFX602,
diff --git a/clang/lib/Basic/Cuda.cpp b/clang/lib/Basic/Cuda.cpp
index 65840b9f20252b..1b1da6a1356f2c 100644
--- a/clang/lib/Basic/Cuda.cpp
+++ b/clang/lib/Basic/Cuda.cpp
@@ -39,6 +39,8 @@ static const CudaVersionMapEntry CudaNameVersionMap[] = {
CUDA_ENTRY(11, 8),
CUDA_ENTRY(12, 0),
CUDA_ENTRY(12, 1),
+ CUDA_ENTRY(12, 2),
+ CUDA_ENTRY(12, 3),
{"", CudaVersion::NEW, llvm::VersionTuple(std::numeric_limits<int>::max())},
{"unknown", CudaVersion::UNKNOWN, {}} // End of list tombstone.
};
@@ -93,6 +95,7 @@ static const CudaArchToStringMap arch_names[] = {
SM(87), // Jetson/Drive AGX Orin
SM(89), // Ada Lovelace
SM(90), // Hopper
+ SM(90a), // Hopper
GFX(600), // gfx600
GFX(601), // gfx601
GFX(602), // gfx602
@@ -209,6 +212,8 @@ CudaVersion MinVersionForCudaArch(CudaArch A) {
case CudaArch::SM_89:
case CudaArch::SM_90:
return CudaVersion::CUDA_118;
+ case CudaArch::SM_90a:
+ return CudaVersion::CUDA_120;
default:
llvm_unreachable("invalid enum");
}
diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp
index 3a4a75b0348f20..5c601812f61759 100644
--- a/clang/lib/Basic/Targets/NVPTX.cpp
+++ b/clang/lib/Basic/Targets/NVPTX.cpp
@@ -262,11 +262,14 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts,
case CudaArch::SM_89:
return "890";
case CudaArch::SM_90:
+ case CudaArch::SM_90a:
return "900";
}
llvm_unreachable("unhandled CudaArch");
}();
Builder.defineMacro("__CUDA_ARCH__", CUDAArchCode);
+ if (GPU == CudaArch::SM_90a)
+ Builder.defineMacro("__CUDA_ARCH_FEAT_SM90_ALL", "1");
}
}
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index 293ccaa3413cdf..299ee1460b3db0 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -3483,6 +3483,7 @@ void CGOpenMPRuntimeGPU::processRequiresDirective(
case CudaArch::SM_87:
case CudaArch::SM_89:
case CudaArch::SM_90:
+ case CudaArch::SM_90a:
case CudaArch::GFX600:
case CudaArch::GFX601:
case CudaArch::GFX602:
diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp
index e95ff98e6c940f..ef1e77974c1eaa 100644
--- a/clang/lib/Driver/ToolChains/Cuda.cpp
+++ b/clang/lib/Driver/ToolChains/Cuda.cpp
@@ -78,6 +78,10 @@ CudaVersion getCudaVersion(uint32_t raw_version) {
return CudaVersion::CUDA_120;
if (raw_version < 12020)
return CudaVersion::CUDA_121;
+ if (raw_version < 12030)
+ return CudaVersion::CUDA_122;
+ if (raw_version < 12040)
+ return CudaVersion::CUDA_123;
return CudaVersion::NEW;
}
@@ -671,6 +675,8 @@ void NVPTX::getNVPTXTargetFeatures(const Driver &D, const llvm::Triple &Triple,
case CudaVersion::CUDA_##CUDA_VER: \
PtxFeature = "+ptx" #PTX_VER; \
break;
+ CASE_CUDA_VERSION(123, 83);
+ CASE_CUDA_VERSION(122, 82);
CASE_CUDA_VERSION(121, 81);
CASE_CUDA_VERSION(120, 80);
CASE_CUDA_VERSION(118, 78);
diff --git a/clang/test/Misc/target-invalid-cpu-note.c b/clang/test/Misc/target-invalid-cpu-note.c
index c7146e63add5f2..5475b1d8bd052d 100644
--- a/clang/test/Misc/target-invalid-cpu-note.c
+++ b/clang/test/Misc/target-invalid-cpu-note.c
@@ -29,7 +29,7 @@
// RUN: not %clang_cc1 -triple nvptx--- -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix NVPTX
// NVPTX: error: unknown target CPU 'not-a-cpu'
-// NVPTX-NEXT: note: valid target CPU values are: sm_20, sm_21, sm_30, sm_32, sm_35, sm_37, sm_50, sm_52, sm_53, sm_60, sm_61, sm_62, sm_70, sm_72, sm_75, sm_80, sm_86, sm_87, sm_89, sm_90, gfx600, gfx601, gfx602, gfx700, gfx701, gfx702, gfx703, gfx704, gfx705, gfx801, gfx802, gfx803, gfx805, gfx810, gfx900, gfx902, gfx904, gfx906, gfx908, gfx909, gfx90a, gfx90c, gfx940, gfx941, gfx942, gfx1010, gfx1011, gfx1012, gfx1013, gfx1030, gfx1031, gfx1032, gfx1033, gfx1034, gfx1035, gfx1036, gfx1100, gfx1101, gfx1102, gfx1103, gfx1150, gfx1151, gfx1200, gfx1201{{$}}
+// NVPTX-NEXT: note: valid target CPU values are: sm_20, sm_21, sm_30, sm_32, sm_35, sm_37, sm_50, sm_52, sm_53, sm_60, sm_61, sm_62, sm_70, sm_72, sm_75, sm_80, sm_86, sm_87, sm_89, sm_90, sm_90a, gfx600, gfx601, gfx602, gfx700, gfx701, gfx702, gfx703, gfx704, gfx705, gfx801, gfx802, gfx803, gfx805, gfx810, gfx900, gfx902, gfx904, gfx906, gfx908, gfx909, gfx90a, gfx90c, gfx940, gfx941, gfx942, gfx1010, gfx1011, gfx1012, gfx1013, gfx1030, gfx1031, gfx1032, gfx1033, gfx1034, gfx1035, gfx1036, gfx1100, gfx1101, gfx1102, gfx1103, gfx1150, gfx1151, gfx1200, gfx1201{{$}}
// RUN: not %clang_cc1 -triple r600--- -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix R600
// R600: error: unknown target CPU 'not-a-cpu'
diff --git a/llvm/lib/Target/NVPTX/NVPTX.td b/llvm/lib/Target/NVPTX/NVPTX.td
index 02fa2a4ee81ec5..f2a4ce381b40b4 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.td
+++ b/llvm/lib/Target/NVPTX/NVPTX.td
@@ -24,23 +24,24 @@ include "NVPTXInstrInfo.td"
// TableGen in NVPTXGenSubtarget.inc.
//===----------------------------------------------------------------------===//
-class FeatureSM<int version>:
- SubtargetFeature<"sm_"# version, "SmVersion",
- "" # version,
- "Target SM " # version>;
-def SM90a: FeatureSM<90>;
+class FeatureSM<string sm, int value>:
+ SubtargetFeature<"sm_"# sm, "FullSmVersion",
+ "" # value,
+ "Target SM " # sm>;
class FeaturePTX<int version>:
SubtargetFeature<"ptx"# version, "PTXVersion",
"" # version,
"Use PTX version " # version>;
-foreach version = [20, 21, 30, 32, 35, 37, 50, 52, 53,
- 60, 61, 62, 70, 72, 75, 80, 86, 87, 89, 90] in
- def SM#version: FeatureSM<version>;
+foreach sm = [20, 21, 30, 32, 35, 37, 50, 52, 53,
+ 60, 61, 62, 70, 72, 75, 80, 86, 87, 89, 90] in
+ def SM#sm: FeatureSM<""#sm, !mul(sm, 10)>;
+
+def SM90a: FeatureSM<"90a", 901>;
foreach version = [32, 40, 41, 42, 43, 50, 60, 61, 63, 64, 65,
- 70, 71, 72, 73, 74, 75, 76, 77, 78, 80, 81] in
+ 70, 71, 72, 73, 74, 75, 76, 77, 78, 80, 81, 82, 83] in
def PTX#version: FeaturePTX<version>;
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
index 7fa64af196b936..420065585b3849 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
@@ -36,6 +36,11 @@ NVPTXSubtarget &NVPTXSubtarget::initializeSubtargetDependencies(StringRef CPU,
ParseSubtargetFeatures(TargetName, /*TuneCPU*/ TargetName, FS);
+ // Re-map SM version numbers, SmVersion carries the regular SMs which do
+ // have relative order, while FullSmVersion allows distinguishing sm_90 from
+ // sm_90a, which would *not* be a subset of sm_91.
+ SmVersion = getSmVersion();
+
// Set default to PTX 6.0 (CUDA 9.0)
if (PTXVersion == 0) {
PTXVersion = 60;
@@ -48,7 +53,7 @@ NVPTXSubtarget::NVPTXSubtarget(const Triple &TT, const std::string &CPU,
const std::string &FS,
const NVPTXTargetMachine &TM)
: NVPTXGenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS), PTXVersion(0),
- SmVersion(20), TM(TM),
+ FullSmVersion(200), SmVersion(getSmVersion()), TM(TM),
TLInfo(TM, initializeSubtargetDependencies(CPU, FS)) {}
bool NVPTXSubtarget::hasImageHandles() const {
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index 93af11c258b480..951962d1e68be8 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -35,7 +35,12 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
// PTX version x.y is represented as 10*x+y, e.g. 3.1 == 31
unsigned PTXVersion;
- // SM version x.y is represented as 10*x+y, e.g. 3.1 == 31
+ // Full SM version x.y is represented as 100*x+10*y+feature, e.g. 3.1 == 310
+ // sm_90a == 901
+ unsigned int FullSmVersion;
+
+ // SM version x.y is represented as 10*x+y, e.g. 3.1 == 31. Derived from
+ // FullSmVersion.
unsigned int SmVersion;
const NVPTXTargetMachine &TM;
@@ -80,8 +85,10 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
bool allowFP16Math() const;
bool hasMaskOperator() const { return PTXVersion >= 71; }
bool hasNoReturn() const { return SmVersion >= 30 && PTXVersion >= 64; }
- unsigned int getSmVersion() const { return SmVersion; }
+ unsigned int getSmVersion() const { return FullSmVersion / 10; }
+ unsigned int getFullSmVersion() const { return FullSmVersion; }
std::string getTargetName() const { return TargetName; }
+ bool isSm90a() const { return getFullSmVersion() == 901; }
// Get maximum value of required alignments among the supported data types.
// From the PTX ISA doc, section 8.2.3:
>From 6bd838c0f60e050cad79bc8b198808eb5e7c1586 Mon Sep 17 00:00:00 2001
From: Artem Belevich <tra at google.com>
Date: Fri, 8 Dec 2023 15:49:23 -0800
Subject: [PATCH 2/3] use hasAAFeatures()
---
llvm/lib/Target/NVPTX/NVPTXSubtarget.h | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index 951962d1e68be8..63e3b50a09fa05 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -88,7 +88,11 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
unsigned int getSmVersion() const { return FullSmVersion / 10; }
unsigned int getFullSmVersion() const { return FullSmVersion; }
std::string getTargetName() const { return TargetName; }
- bool isSm90a() const { return getFullSmVersion() == 901; }
+
+ // GPUs with "a" suffix include architecture-accelerated features that
+ // are supported on the specified architecture only, hence such targets do not
+ // follow the onion layer model.
+ bool hasAAFeatures() const { return getFullSmVersion() % 10; }
// Get maximum value of required alignments among the supported data types.
// From the PTX ISA doc, section 8.2.3:
>From 23362844e45ff0ca0925dd37ee18657a58a60b07 Mon Sep 17 00:00:00 2001
From: Artem Belevich <tra at google.com>
Date: Fri, 8 Dec 2023 15:56:39 -0800
Subject: [PATCH 3/3] Rearrange functions in sensible order and add more
details to the comment.
---
llvm/lib/Target/NVPTX/NVPTXSubtarget.h | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index 63e3b50a09fa05..3ca4c1a24c79a1 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -85,14 +85,16 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
bool allowFP16Math() const;
bool hasMaskOperator() const { return PTXVersion >= 71; }
bool hasNoReturn() const { return SmVersion >= 30 && PTXVersion >= 64; }
- unsigned int getSmVersion() const { return FullSmVersion / 10; }
unsigned int getFullSmVersion() const { return FullSmVersion; }
- std::string getTargetName() const { return TargetName; }
-
+ unsigned int getSmVersion() const { return getFullSmVersion() / 10; }
// GPUs with "a" suffix include architecture-accelerated features that
// are supported on the specified architecture only, hence such targets do not
- // follow the onion layer model.
+ // follow the onion layer model. hasAAFeatures() allows distinguishing such
+ // GPU variants from the base GPU architecture.
+ // - 0 represents base GPU model,
+ // - non-zero value identifies particular architecture-accelerated variant.
bool hasAAFeatures() const { return getFullSmVersion() % 10; }
+ std::string getTargetName() const { return TargetName; }
// Get maximum value of required alignments among the supported data types.
// From the PTX ISA doc, section 8.2.3:
More information about the llvm-commits
mailing list