[clang] [CUDA] Implement __CUDA_ARCH_LIST__ macro and refactor architecture helpers (PR #175260)
Artem Belevich via cfe-commits
cfe-commits at lists.llvm.org
Fri Jan 9 14:49:24 PST 2026
https://github.com/Artem-B updated https://github.com/llvm/llvm-project/pull/175260
From 0ef90c26493bf01a5899ac7a0d28a79317ea51f5 Mon Sep 17 00:00:00 2001
From: Artem Belevich <tra at google.com>
Date: Thu, 8 Jan 2026 17:01:38 -0800
Subject: [PATCH 1/2] [CUDA] Implement __CUDA_ARCH_LIST__ macro and refactor
architecture helpers.
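- Move CudaArchToID and the accelerated-architecture check
  (IsNVIDIAAcceleratedOffloadArch) to Basic/Cuda.
- Simplify NVPTXTargetInfo::getTargetDefines using the new helpers.
- Implement the __CUDA_ARCH_LIST__ macro, which contains a comma-separated
  list of numeric IDs (sorted in ascending order, without duplicates) for
  all enabled CUDA architectures. The driver defines it for both the host
  and the device subcompilations.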
---
clang/include/clang/Basic/Cuda.h | 6 +
clang/lib/Basic/Cuda.cpp | 84 ++++++++++++++
clang/lib/Basic/Targets/NVPTX.cpp | 153 ++------------------------
clang/lib/Driver/ToolChains/Clang.cpp | 22 ++++
clang/test/Driver/cuda-arch-list.cu | 56 ++++++++++
5 files changed, 176 insertions(+), 145 deletions(-)
create mode 100644 clang/test/Driver/cuda-arch-list.cu
diff --git a/clang/include/clang/Basic/Cuda.h b/clang/include/clang/Basic/Cuda.h
index 5639710f43aa5..78fc32295c88c 100644
--- a/clang/include/clang/Basic/Cuda.h
+++ b/clang/include/clang/Basic/Cuda.h
@@ -84,6 +84,12 @@ CudaVersion ToCudaVersion(llvm::VersionTuple);
bool CudaFeatureEnabled(llvm::VersionTuple, CudaFeature);
bool CudaFeatureEnabled(CudaVersion, CudaFeature);
+/// Get the numeric ID (e.g. 700) of a CUDA architecture.
+unsigned CudaArchToID(OffloadArch Arch);
+
+/// Check if the CUDA architecture is an accelerated variant (e.g. sm_90a).
+bool IsNVIDIAAcceleratedOffloadArch(OffloadArch Arch);
+
} // namespace clang
#endif
diff --git a/clang/lib/Basic/Cuda.cpp b/clang/lib/Basic/Cuda.cpp
index 51769eb425923..514fa2f2a4ca7 100644
--- a/clang/lib/Basic/Cuda.cpp
+++ b/clang/lib/Basic/Cuda.cpp
@@ -173,4 +173,88 @@ bool CudaFeatureEnabled(CudaVersion Version, CudaFeature Feature) {
}
llvm_unreachable("Unknown CUDA feature.");
}
+
+unsigned CudaArchToID(OffloadArch Arch) {
+ switch (Arch) {
+ case OffloadArch::SM_20:
+ return 200;
+ case OffloadArch::SM_21:
+ return 210;
+ case OffloadArch::SM_30:
+ return 300;
+ case OffloadArch::SM_32_:
+ return 320;
+ case OffloadArch::SM_35:
+ return 350;
+ case OffloadArch::SM_37:
+ return 370;
+ case OffloadArch::SM_50:
+ return 500;
+ case OffloadArch::SM_52:
+ return 520;
+ case OffloadArch::SM_53:
+ return 530;
+ case OffloadArch::SM_60:
+ return 600;
+ case OffloadArch::SM_61:
+ return 610;
+ case OffloadArch::SM_62:
+ return 620;
+ case OffloadArch::SM_70:
+ return 700;
+ case OffloadArch::SM_72:
+ return 720;
+ case OffloadArch::SM_75:
+ return 750;
+ case OffloadArch::SM_80:
+ return 800;
+ case OffloadArch::SM_86:
+ return 860;
+ case OffloadArch::SM_87:
+ return 870;
+ case OffloadArch::SM_88:
+ return 880;
+ case OffloadArch::SM_89:
+ return 890;
+ case OffloadArch::SM_90:
+ case OffloadArch::SM_90a:
+ return 900;
+ case OffloadArch::SM_100:
+ case OffloadArch::SM_100a:
+ return 1000;
+ case OffloadArch::SM_101:
+ case OffloadArch::SM_101a:
+ return 1010;
+ case OffloadArch::SM_103:
+ case OffloadArch::SM_103a:
+ return 1030;
+ case OffloadArch::SM_110:
+ case OffloadArch::SM_110a:
+ return 1100;
+ case OffloadArch::SM_120:
+ case OffloadArch::SM_120a:
+ return 1200;
+ case OffloadArch::SM_121:
+ case OffloadArch::SM_121a:
+ return 1210;
+ default:
+ break;
+ }
+ llvm_unreachable("invalid NVIDIA GPU architecture");
+}
+
+bool IsNVIDIAAcceleratedOffloadArch(OffloadArch Arch) {
+ switch (Arch) {
+ case OffloadArch::SM_90a:
+ case OffloadArch::SM_100a:
+ case OffloadArch::SM_101a:
+ case OffloadArch::SM_103a:
+ case OffloadArch::SM_110a:
+ case OffloadArch::SM_120a:
+ case OffloadArch::SM_121a:
+ return true;
+ default:
+ return false;
+ }
+}
} // namespace clang
diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp
index 06db3aae0c755..5b399b2a5a080 100644
--- a/clang/lib/Basic/Targets/NVPTX.cpp
+++ b/clang/lib/Basic/Targets/NVPTX.cpp
@@ -174,155 +174,18 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts,
Builder.defineMacro("__NVPTX__");
// Skip setting architecture dependent macros if undefined.
- if (GPU == OffloadArch::UNUSED && !HostTarget)
+ if (!IsNVIDIAOffloadArch(GPU))
return;
if (Opts.CUDAIsDevice || Opts.OpenMPIsTargetDevice || !HostTarget) {
// Set __CUDA_ARCH__ for the GPU specified.
- llvm::StringRef CUDAArchCode = [this] {
- switch (GPU) {
- case OffloadArch::GFX600:
- case OffloadArch::GFX601:
- case OffloadArch::GFX602:
- case OffloadArch::GFX700:
- case OffloadArch::GFX701:
- case OffloadArch::GFX702:
- case OffloadArch::GFX703:
- case OffloadArch::GFX704:
- case OffloadArch::GFX705:
- case OffloadArch::GFX801:
- case OffloadArch::GFX802:
- case OffloadArch::GFX803:
- case OffloadArch::GFX805:
- case OffloadArch::GFX810:
- case OffloadArch::GFX9_GENERIC:
- case OffloadArch::GFX900:
- case OffloadArch::GFX902:
- case OffloadArch::GFX904:
- case OffloadArch::GFX906:
- case OffloadArch::GFX908:
- case OffloadArch::GFX909:
- case OffloadArch::GFX90a:
- case OffloadArch::GFX90c:
- case OffloadArch::GFX9_4_GENERIC:
- case OffloadArch::GFX942:
- case OffloadArch::GFX950:
- case OffloadArch::GFX10_1_GENERIC:
- case OffloadArch::GFX1010:
- case OffloadArch::GFX1011:
- case OffloadArch::GFX1012:
- case OffloadArch::GFX1013:
- case OffloadArch::GFX10_3_GENERIC:
- case OffloadArch::GFX1030:
- case OffloadArch::GFX1031:
- case OffloadArch::GFX1032:
- case OffloadArch::GFX1033:
- case OffloadArch::GFX1034:
- case OffloadArch::GFX1035:
- case OffloadArch::GFX1036:
- case OffloadArch::GFX11_GENERIC:
- case OffloadArch::GFX1100:
- case OffloadArch::GFX1101:
- case OffloadArch::GFX1102:
- case OffloadArch::GFX1103:
- case OffloadArch::GFX1150:
- case OffloadArch::GFX1151:
- case OffloadArch::GFX1152:
- case OffloadArch::GFX1153:
- case OffloadArch::GFX12_GENERIC:
- case OffloadArch::GFX1200:
- case OffloadArch::GFX1201:
- case OffloadArch::GFX1250:
- case OffloadArch::GFX1251:
- case OffloadArch::AMDGCNSPIRV:
- case OffloadArch::Generic:
- case OffloadArch::GRANITERAPIDS:
- case OffloadArch::BMG_G21:
- case OffloadArch::LAST:
- break;
- case OffloadArch::UNKNOWN:
- assert(false && "No GPU arch when compiling CUDA device code.");
- return "";
- case OffloadArch::UNUSED:
- case OffloadArch::SM_20:
- return "200";
- case OffloadArch::SM_21:
- return "210";
- case OffloadArch::SM_30:
- return "300";
- case OffloadArch::SM_32_:
- return "320";
- case OffloadArch::SM_35:
- return "350";
- case OffloadArch::SM_37:
- return "370";
- case OffloadArch::SM_50:
- return "500";
- case OffloadArch::SM_52:
- return "520";
- case OffloadArch::SM_53:
- return "530";
- case OffloadArch::SM_60:
- return "600";
- case OffloadArch::SM_61:
- return "610";
- case OffloadArch::SM_62:
- return "620";
- case OffloadArch::SM_70:
- return "700";
- case OffloadArch::SM_72:
- return "720";
- case OffloadArch::SM_75:
- return "750";
- case OffloadArch::SM_80:
- return "800";
- case OffloadArch::SM_86:
- return "860";
- case OffloadArch::SM_87:
- return "870";
- case OffloadArch::SM_88:
- return "880";
- case OffloadArch::SM_89:
- return "890";
- case OffloadArch::SM_90:
- case OffloadArch::SM_90a:
- return "900";
- case OffloadArch::SM_100:
- case OffloadArch::SM_100a:
- return "1000";
- case OffloadArch::SM_101:
- case OffloadArch::SM_101a:
- return "1010";
- case OffloadArch::SM_103:
- case OffloadArch::SM_103a:
- return "1030";
- case OffloadArch::SM_110:
- case OffloadArch::SM_110a:
- return "1100";
- case OffloadArch::SM_120:
- case OffloadArch::SM_120a:
- return "1200";
- case OffloadArch::SM_121:
- case OffloadArch::SM_121a:
- return "1210";
- }
- llvm_unreachable("unhandled OffloadArch");
- }();
- Builder.defineMacro("__CUDA_ARCH__", CUDAArchCode);
- switch(GPU) {
- case OffloadArch::SM_90a:
- case OffloadArch::SM_100a:
- case OffloadArch::SM_101a:
- case OffloadArch::SM_103a:
- case OffloadArch::SM_110a:
- case OffloadArch::SM_120a:
- case OffloadArch::SM_121a:
- Builder.defineMacro("__CUDA_ARCH_FEAT_SM" + CUDAArchCode.drop_back() + "_ALL", "1");
- break;
- default:
- // Do nothing if this is not an enhanced architecture.
- break;
- }
+ unsigned ArchID = CudaArchToID(GPU);
+ Builder.defineMacro("__CUDA_ARCH__", llvm::Twine(ArchID));
+
+ if (IsNVIDIAAcceleratedOffloadArch(GPU))
+ Builder.defineMacro("__CUDA_ARCH_FEAT_SM" + llvm::Twine(ArchID / 10) +
+ "_ALL",
+ "1");
}
}
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 4399eb475be75..4ca98600d6e93 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -1069,6 +1069,28 @@ void Clang::AddPreprocessingOptions(Compilation &C, const JobAction &JA,
{options::OPT_D, options::OPT_U, options::OPT_I_Group,
options::OPT_F, options::OPT_embed_dir_EQ});
+ if (C.isOffloadingHostKind(Action::OFK_Cuda) ||
+ JA.isDeviceOffloading(Action::OFK_Cuda)) {
+ // Collect all enabled NVPTX architectures.
+ std::set<unsigned> ArchIDs;
+ for (auto &I : llvm::make_range(C.getOffloadToolChains(Action::OFK_Cuda))) {
+ const ToolChain *TC = I.second;
+ for (StringRef Arch :
+ D.getOffloadArchs(C, C.getArgs(), Action::OFK_Cuda, *TC)) {
+ OffloadArch OA = StringToOffloadArch(Arch);
+ if (IsNVIDIAOffloadArch(OA))
+ ArchIDs.insert(CudaArchToID(OA));
+ }
+ }
+
+ if (!ArchIDs.empty()) {
+ SmallString<128> List;
+ llvm::raw_svector_ostream OS(List);
+ llvm::interleave(ArchIDs, OS, ",");
+ CmdArgs.push_back(Args.MakeArgString("-D__CUDA_ARCH_LIST__=" + List));
+ }
+ }
+
// Add -Wp, and -Xpreprocessor if using the preprocessor.
// FIXME: There is a very unfortunate problem here, some troubled
diff --git a/clang/test/Driver/cuda-arch-list.cu b/clang/test/Driver/cuda-arch-list.cu
new file mode 100644
index 0000000000000..84efeb1499708
--- /dev/null
+++ b/clang/test/Driver/cuda-arch-list.cu
@@ -0,0 +1,56 @@
+// Checks that __CUDA_ARCH_LIST__ is defined correctly for both host and device
+// subcompilations.
+
+// RUN: %clang -### --target=x86_64-unknown-linux-gnu -nocudainc -nocudalib \
+// RUN: --offload-arch=sm_60 %s 2>&1 \
+// RUN: | FileCheck -check-prefixes=DEVICE60,HOST %s
+
+// RUN: %clang -### --target=x86_64-unknown-linux-gnu -nocudainc -nocudalib \
+// RUN: --offload-arch=sm_60 --offload-arch=sm_70 %s 2>&1 \
+// RUN: | FileCheck -check-prefixes=DEVICE60-60-70,DEVICE70-60-70,HOST-60-70 %s
+
+// RUN: %clang -### --target=x86_64-unknown-linux-gnu -nocudainc -nocudalib \
+// RUN: --offload-arch=sm_70 --offload-arch=sm_60 %s 2>&1 \
+// RUN: | FileCheck -check-prefixes=DEVICE60-60-70,DEVICE70-60-70,HOST-60-70 %s
+
+// Verify that it works with no explicit arch (defaults to sm_52)
+// RUN: %clang -### --target=x86_64-unknown-linux-gnu -nocudainc -nocudalib \
+// RUN: --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 \
+// RUN: | FileCheck -check-prefixes=DEVICE52,HOST52 %s
+
+// Verify that --no-offload-arch negates preceding --offload-arch
+// RUN: %clang -### --target=x86_64-unknown-linux-gnu -nocudainc -nocudalib \
+// RUN: --offload-arch=sm_60 --offload-arch=sm_70 --no-offload-arch=sm_60 %s 2>&1 \
+// RUN: | FileCheck -check-prefixes=DEVICE70-ONLY,HOST70-ONLY %s
+
+// DEVICE60: "-cc1" "-triple" "nvptx64-nvidia-cuda"
+// DEVICE60-SAME: "-target-cpu" "sm_60"
+// DEVICE60-SAME: "-D__CUDA_ARCH_LIST__=600"
+
+// HOST: "-cc1" "-triple" "x86_64-unknown-linux-gnu"
+// HOST-SAME: "-D__CUDA_ARCH_LIST__=600"
+
+// DEVICE60-60-70: "-cc1" "-triple" "nvptx64-nvidia-cuda"
+// DEVICE60-60-70-SAME: "-target-cpu" "sm_60"
+// DEVICE60-60-70-SAME: "-D__CUDA_ARCH_LIST__=600,700"
+
+// DEVICE70-60-70: "-cc1" "-triple" "nvptx64-nvidia-cuda"
+// DEVICE70-60-70-SAME: "-target-cpu" "sm_70"
+// DEVICE70-60-70-SAME: "-D__CUDA_ARCH_LIST__=600,700"
+
+// HOST-60-70: "-cc1" "-triple" "x86_64-unknown-linux-gnu"
+// HOST-60-70-SAME: "-D__CUDA_ARCH_LIST__=600,700"
+
+// DEVICE52: "-cc1" "-triple" "nvptx64-nvidia-cuda"
+// DEVICE52-SAME: "-target-cpu" "sm_52"
+// DEVICE52-SAME: "-D__CUDA_ARCH_LIST__=520"
+
+// HOST52: "-cc1" "-triple" "x86_64-unknown-linux-gnu"
+// HOST52-SAME: "-D__CUDA_ARCH_LIST__=520"
+
+// DEVICE70-ONLY: "-cc1" "-triple" "nvptx64-nvidia-cuda"
+// DEVICE70-ONLY-SAME: "-target-cpu" "sm_70"
+// DEVICE70-ONLY-SAME: "-D__CUDA_ARCH_LIST__=700"
+
+// HOST70-ONLY: "-cc1" "-triple" "x86_64-unknown-linux-gnu"
+// HOST70-ONLY-SAME: "-D__CUDA_ARCH_LIST__=700"
From 8140af52cd72769a2349ae3ee2e22d8c7e6255f3 Mon Sep 17 00:00:00 2001
From: Artem Belevich <tra at google.com>
Date: Fri, 9 Jan 2026 14:48:59 -0800
Subject: [PATCH 2/2] clang-format
---
clang/lib/Basic/Targets/NVPTX.cpp | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp
index 5b399b2a5a080..dec076ac54f41 100644
--- a/clang/lib/Basic/Targets/NVPTX.cpp
+++ b/clang/lib/Basic/Targets/NVPTX.cpp
@@ -183,9 +183,8 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts,
Builder.defineMacro("__CUDA_ARCH__", llvm::Twine(ArchID));
if (IsNVIDIAAcceleratedOffloadArch(GPU))
- Builder.defineMacro("__CUDA_ARCH_FEAT_SM" + llvm::Twine(ArchID / 10) +
- "_ALL",
- "1");
+ Builder.defineMacro(
+ "__CUDA_ARCH_FEAT_SM" + llvm::Twine(ArchID / 10) + "_ALL", "1");
}
}
More information about the cfe-commits
mailing list