[clang] [Clang][HIP][CUDA] Add `__cluster_dims__` and `__no_cluster__` attribute (PR #156686)
Shilei Tian via cfe-commits
cfe-commits at lists.llvm.org
Wed Sep 24 10:26:58 PDT 2025
https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/156686
>From eb859be868d6ab795479419c80ca9b3a04977374 Mon Sep 17 00:00:00 2001
From: Shilei Tian <i at tianshilei.me>
Date: Wed, 3 Sep 2025 10:47:52 -0400
Subject: [PATCH 1/5] [Clang][HIP][CUDA] Add `__cluster_dims__` and
`__no_cluster__` attribute
This PR adds basic frontend support for `__cluster_dims__` and `__no_cluster__` attribute.
Co-authored-by: Yaxun (Sam) Liu <yaxun.liu at amd.com>
Co-authored-by: Jay Foad <jay.foad at amd.com>
---
clang/include/clang/Basic/Attr.td | 17 +++
.../clang/Basic/DiagnosticSemaKinds.td | 8 ++
clang/include/clang/Sema/Sema.h | 8 ++
clang/lib/CodeGen/Targets/AMDGPU.cpp | 26 ++++
.../lib/Headers/__clang_hip_runtime_wrapper.h | 2 +
clang/lib/Sema/SemaDeclAttr.cpp | 130 ++++++++++++++++++
.../lib/Sema/SemaTemplateInstantiateDecl.cpp | 37 +++++
clang/test/CodeGenCUDA/Inputs/cuda.h | 4 +
clang/test/CodeGenCUDA/cluster_dims.cu | 38 +++++
...a-attribute-supported-attributes-list.test | 2 +
clang/test/SemaCUDA/Inputs/cuda.h | 2 +
clang/test/SemaCUDA/cluster_dims.cu | 64 +++++++++
12 files changed, 338 insertions(+)
create mode 100644 clang/test/CodeGenCUDA/cluster_dims.cu
create mode 100644 clang/test/SemaCUDA/cluster_dims.cu
diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index 2623f9ff6972f..b018d2cc1831f 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -1572,6 +1572,23 @@ def HIPManaged : InheritableAttr {
let Documentation = [HIPManagedAttrDocs];
}
+def CUDAClusterDims : InheritableAttr {
+ let Spellings = [GNU<"cluster_dims">, Declspec<"__cluster_dims__">];
+ let Args = [ExprArgument<"X">, ExprArgument<"Y", 1>, ExprArgument<"Z", 1>];
+ let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
+ let LangOpts = [CUDA];
+ let Documentation = [Undocumented];
+}
+
+def CUDANoCluster : InheritableAttr {
+ let Spellings = [GNU<"no_cluster">, Declspec<"__no_cluster__">];
+ let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
+ let LangOpts = [CUDA];
+ let Documentation = [Undocumented];
+}
+
+def : MutualExclusions<[CUDAClusterDims, CUDANoCluster]>;
+
def CUDAInvalidTarget : InheritableAttr {
let Spellings = [];
let Subjects = SubjectList<[Function]>;
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index bd0e53d3086b0..f5f1b1a7c98d2 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -13055,6 +13055,14 @@ def warn_cuda_maxclusterrank_sm_90 : Warning<
"maxclusterrank requires sm_90 or higher, CUDA arch provided: %0, ignoring "
"%1 attribute">, InGroup<IgnoredAttributes>;
+def err_cuda_cluster_attr_not_supported : Error<
+ "%select{__cluster_dims__|__no_cluster__}0 is not supported for this GPU architecture"
+>;
+
+def err_cuda_cluster_dims_too_large : Error<
+ "only a maximum of %0 thread blocks in a cluster is supported"
+>;
+
// VTable pointer authentication errors
def err_non_polymorphic_vtable_pointer_auth : Error<
"cannot set vtable pointer authentication on monomorphic type %0">;
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index d017d1f829015..c8b2732c07b53 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -5010,6 +5010,14 @@ class Sema final : public SemaBase {
void AddLaunchBoundsAttr(Decl *D, const AttributeCommonInfo &CI,
Expr *MaxThreads, Expr *MinBlocks, Expr *MaxBlocks);
+ /// Add a cluster_dims attribute to a particular declaration.
+ CUDAClusterDimsAttr *createClusterDimsAttr(const AttributeCommonInfo &CI,
+ Expr *X, Expr *Y, Expr *Z);
+ void addClusterDimsAttr(Decl *D, const AttributeCommonInfo &CI, Expr *X,
+ Expr *Y, Expr *Z);
+ /// Add a no_cluster attribute to a particular declaration.
+ void addNoClusterAttr(Decl *D, const AttributeCommonInfo &CI);
+
enum class RetainOwnershipKind { NS, CF, OS };
UuidAttr *mergeUuidAttr(Decl *D, const AttributeCommonInfo &CI,
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index 0fcbf7e458a34..48855ce485f91 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -342,6 +342,9 @@ static bool requiresAMDGPUProtectedVisibility(const Decl *D,
void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
const FunctionDecl *FD, llvm::Function *F, CodeGenModule &M) const {
+ llvm::StringMap<bool> TargetFetureMap;
+ M.getContext().getFunctionFeatureMap(TargetFetureMap, FD);
+
const auto *ReqdWGS =
M.getLangOpts().OpenCL ? FD->getAttr<ReqdWorkGroupSizeAttr>() : nullptr;
const bool IsOpenCLKernel =
@@ -402,6 +405,29 @@ void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
F->addFnAttr("amdgpu-max-num-workgroups", AttrVal.str());
}
+
+ if (auto *Attr = FD->getAttr<CUDAClusterDimsAttr>()) {
+ uint32_t X =
+ Attr->getX()->EvaluateKnownConstInt(M.getContext()).getExtValue();
+ uint32_t Y =
+ Attr->getY()
+ ? Attr->getY()->EvaluateKnownConstInt(M.getContext()).getExtValue()
+ : 1;
+ uint32_t Z =
+ Attr->getZ()
+ ? Attr->getZ()->EvaluateKnownConstInt(M.getContext()).getExtValue()
+ : 1;
+
+ llvm::SmallString<32> AttrVal;
+ llvm::raw_svector_ostream OS(AttrVal);
+ OS << X << ',' << Y << ',' << Z;
+ F->addFnAttr("amdgpu-cluster-dims", AttrVal.str());
+ }
+
+ // OpenCL doesn't support cluster feature.
+ if ((IsOpenCLKernel && TargetFetureMap.lookup("gfx1250-insts")) ||
+ FD->getAttr<CUDANoClusterAttr>())
+ F->addFnAttr("amdgpu-cluster-dims", "0,0,0");
}
void AMDGPUTargetCodeGenInfo::setTargetAttributes(
diff --git a/clang/lib/Headers/__clang_hip_runtime_wrapper.h b/clang/lib/Headers/__clang_hip_runtime_wrapper.h
index da1e39ac7270e..fb0ece96e1418 100644
--- a/clang/lib/Headers/__clang_hip_runtime_wrapper.h
+++ b/clang/lib/Headers/__clang_hip_runtime_wrapper.h
@@ -25,6 +25,8 @@
#define __constant__ __attribute__((constant))
#define __managed__ __attribute__((managed))
+#define __cluster_dims__(...) __attribute__((cluster_dims(__VA_ARGS__)))
+
#if !defined(__cplusplus) || __cplusplus < 201103L
#define nullptr NULL;
#endif
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index b876911384f6f..ba40babfcea48 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -5682,6 +5682,130 @@ static void handleLaunchBoundsAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
AL.getNumArgs() > 2 ? AL.getArgAsExpr(2) : nullptr);
}
+static std::pair<Expr *, int>
+makeClusterDimsArgExpr(Sema &S, Expr *E, const CUDAClusterDimsAttr &AL,
+ const unsigned Idx) {
+ if (S.DiagnoseUnexpandedParameterPack(E))
+ return {nullptr, 0};
+
+ // Accept template arguments for now as they depend on something else.
+ // We'll get to check them when they eventually get instantiated.
+ if (E->isValueDependent())
+ return {E, 1};
+
+ std::optional<llvm::APSInt> I = llvm::APSInt(64);
+ if (!(I = E->getIntegerConstantExpr(S.Context))) {
+ S.Diag(E->getExprLoc(), diag::err_attribute_argument_n_type)
+ << &AL << Idx << AANT_ArgumentIntegerConstant << E->getSourceRange();
+ return {nullptr, 0};
+ }
+ // Make sure we can fit it in 4 bits.
+ if (!I->isIntN(4)) {
+ S.Diag(E->getExprLoc(), diag::err_ice_too_large)
+ << toString(*I, 10, false) << 4 << /* Unsigned */ 1;
+ return {nullptr, 0};
+ }
+ if (*I < 0)
+ S.Diag(E->getExprLoc(), diag::warn_attribute_argument_n_negative)
+ << &AL << Idx << E->getSourceRange();
+
+ // We may need to perform implicit conversion of the argument.
+ InitializedEntity Entity = InitializedEntity::InitializeParameter(
+ S.Context, S.Context.getConstType(S.Context.IntTy), /*consume*/ false);
+ ExprResult ValArg = S.PerformCopyInitialization(Entity, SourceLocation(), E);
+ assert(!ValArg.isInvalid() &&
+ "Unexpected PerformCopyInitialization() failure.");
+
+ return {ValArg.getAs<Expr>(), I->getZExtValue()};
+}
+
+CUDAClusterDimsAttr *Sema::createClusterDimsAttr(const AttributeCommonInfo &CI,
+ Expr *X, Expr *Y, Expr *Z) {
+ CUDAClusterDimsAttr TmpAttr(Context, CI, X, Y, Z);
+
+ int ValX = 1;
+ int ValY = 1;
+ int ValZ = 1;
+
+ std::tie(X, ValX) = makeClusterDimsArgExpr(*this, X, TmpAttr, /*Idx=*/0);
+ if (!X)
+ return nullptr;
+
+ if (Y) {
+ std::tie(Y, ValY) = makeClusterDimsArgExpr(*this, Y, TmpAttr, /*Idx=*/1);
+ if (!Y)
+ return nullptr;
+ }
+
+ if (Z) {
+ std::tie(Z, ValZ) = makeClusterDimsArgExpr(*this, Z, TmpAttr, /*Idx=*/2);
+ if (!Z)
+ return nullptr;
+ }
+
+ int FlatDim = ValX * ValY * ValZ;
+ auto TT = (!Context.getLangOpts().CUDAIsDevice && Context.getAuxTargetInfo())
+ ? Context.getAuxTargetInfo()->getTriple()
+ : Context.getTargetInfo().getTriple();
+ int MaxDim = 1;
+ if (TT.isNVPTX())
+ MaxDim = 8;
+ else if (TT.isAMDGPU())
+ MaxDim = 16;
+ else
+ return nullptr;
+
+ // A maximum of 8 thread blocks in a cluster is supported as a portable
+ // cluster size in CUDA. The number is 16 for AMDGPU.
+ if (FlatDim > MaxDim) {
+ Diag(CI.getLoc(), diag::err_cuda_cluster_dims_too_large) << MaxDim;
+ return nullptr;
+ }
+
+ return ::new (Context) CUDAClusterDimsAttr(Context, CI, X, Y, Z);
+}
+
+void Sema::addClusterDimsAttr(Decl *D, const AttributeCommonInfo &CI, Expr *X,
+ Expr *Y, Expr *Z) {
+ if (auto *Attr = createClusterDimsAttr(CI, X, Y, Z))
+ D->addAttr(Attr);
+}
+
+void Sema::addNoClusterAttr(Decl *D, const AttributeCommonInfo &CI) {
+ if (CUDANoClusterAttr *Attr = ::new (Context) CUDANoClusterAttr(Context, CI))
+ D->addAttr(Attr);
+}
+
+static void handleClusterDimsAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
+ auto &TTI = S.Context.getTargetInfo();
+ auto Arch = StringToOffloadArch(TTI.getTargetOpts().CPU);
+ if ((TTI.getTriple().isNVPTX() && Arch < clang::OffloadArch::SM_90) ||
+ (TTI.getTriple().isAMDGPU() && Arch < clang::OffloadArch::GFX1250)) {
+ S.Diag(AL.getLoc(), diag::err_cuda_cluster_attr_not_supported) << 0;
+ return;
+ }
+
+ if (!AL.checkAtLeastNumArgs(S, /*Num=*/1) ||
+ !AL.checkAtMostNumArgs(S, /*Num=*/3))
+ return;
+
+ S.addClusterDimsAttr(D, AL, AL.getArgAsExpr(0),
+ AL.getNumArgs() > 1 ? AL.getArgAsExpr(1) : nullptr,
+ AL.getNumArgs() > 2 ? AL.getArgAsExpr(2) : nullptr);
+}
+
+static void handleNoClusterAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
+ auto &TTI = S.Context.getTargetInfo();
+ auto Arch = StringToOffloadArch(TTI.getTargetOpts().CPU);
+ if ((TTI.getTriple().isNVPTX() && Arch < clang::OffloadArch::SM_90) ||
+ (TTI.getTriple().isAMDGPU() && Arch < clang::OffloadArch::GFX1250)) {
+ S.Diag(AL.getLoc(), diag::err_cuda_cluster_attr_not_supported) << 1;
+ return;
+ }
+
+ S.addNoClusterAttr(D, AL);
+}
+
static void handleArgumentWithTypeTagAttr(Sema &S, Decl *D,
const ParsedAttr &AL) {
if (!AL.isArgIdent(0)) {
@@ -7134,6 +7258,12 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL,
case ParsedAttr::AT_CUDALaunchBounds:
handleLaunchBoundsAttr(S, D, AL);
break;
+ case ParsedAttr::AT_CUDAClusterDims:
+ handleClusterDimsAttr(S, D, AL);
+ break;
+ case ParsedAttr::AT_CUDANoCluster:
+ handleNoClusterAttr(S, D, AL);
+ break;
case ParsedAttr::AT_Restrict:
handleRestrictAttr(S, D, AL);
break;
diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
index adac3dff5b2b4..faaf4e18e4ce9 100644
--- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
@@ -707,6 +707,38 @@ static void instantiateDependentAMDGPUMaxNumWorkGroupsAttr(
S.AMDGPU().addAMDGPUMaxNumWorkGroupsAttr(New, Attr, XExpr, YExpr, ZExpr);
}
+static void instantiateDependentCUDAClusterDimsAttr(
+ Sema &S, const MultiLevelTemplateArgumentList &TemplateArgs,
+ const CUDAClusterDimsAttr &Attr, Decl *New) {
+ EnterExpressionEvaluationContext Unevaluated(
+ S, Sema::ExpressionEvaluationContext::ConstantEvaluated);
+
+ Expr *XExpr = nullptr;
+ Expr *YExpr = nullptr;
+ Expr *ZExpr = nullptr;
+
+ if (Attr.getX()) {
+ ExprResult ResultX = S.SubstExpr(Attr.getX(), TemplateArgs);
+ if (ResultX.isUsable())
+ XExpr = ResultX.getAs<Expr>();
+ }
+
+ if (Attr.getY()) {
+ ExprResult ResultY = S.SubstExpr(Attr.getY(), TemplateArgs);
+ if (ResultY.isUsable())
+ YExpr = ResultY.getAs<Expr>();
+ }
+
+ if (Attr.getZ()) {
+ ExprResult ResultZ = S.SubstExpr(Attr.getZ(), TemplateArgs);
+ if (ResultZ.isUsable())
+ ZExpr = ResultZ.getAs<Expr>();
+ }
+
+ if (XExpr)
+ S.addClusterDimsAttr(New, Attr, XExpr, YExpr, ZExpr);
+}
+
// This doesn't take any template parameters, but we have a custom action that
// needs to happen when the kernel itself is instantiated. We need to run the
// ItaniumMangler to mark the names required to name this kernel.
@@ -921,6 +953,11 @@ void Sema::InstantiateAttrs(const MultiLevelTemplateArgumentList &TemplateArgs,
*this, TemplateArgs, *AMDGPUMaxNumWorkGroups, New);
}
+ if (const auto *CUDAClusterDims = dyn_cast<CUDAClusterDimsAttr>(TmplAttr)) {
+ instantiateDependentCUDAClusterDimsAttr(*this, TemplateArgs,
+ *CUDAClusterDims, New);
+ }
+
if (const auto *ParamAttr = dyn_cast<HLSLParamModifierAttr>(TmplAttr)) {
instantiateDependentHLSLParamModifierAttr(*this, TemplateArgs, ParamAttr,
New);
diff --git a/clang/test/CodeGenCUDA/Inputs/cuda.h b/clang/test/CodeGenCUDA/Inputs/cuda.h
index dc85eae0c5178..e7ad784335027 100644
--- a/clang/test/CodeGenCUDA/Inputs/cuda.h
+++ b/clang/test/CodeGenCUDA/Inputs/cuda.h
@@ -13,6 +13,8 @@
#endif
#define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__)))
#define __grid_constant__ __attribute__((grid_constant))
+#define __cluster_dims__(...) __attribute__((cluster_dims(__VA_ARGS__)))
+#define __no_cluster__ __attribute__((no_cluster))
#else
#define __constant__
#define __device__
@@ -22,6 +24,8 @@
#define __managed__
#define __launch_bounds__(...)
#define __grid_constant__
+#define __cluster_dims__(...)
+#define __no_cluster__
#endif
struct dim3 {
diff --git a/clang/test/CodeGenCUDA/cluster_dims.cu b/clang/test/CodeGenCUDA/cluster_dims.cu
new file mode 100644
index 0000000000000..00635e3572a7f
--- /dev/null
+++ b/clang/test/CodeGenCUDA/cluster_dims.cu
@@ -0,0 +1,38 @@
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1250 -fcuda-is-device -emit-llvm -x hip -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -aux-triple amdgcn-amd-amdhsa -emit-llvm -x hip -o - %s | FileCheck --check-prefix=HOST %s
+
+#include "Inputs/cuda.h"
+
+const int constint = 4;
+
+// HOST-NOT: "amdgpu-cluster-dims"
+
+// CHECK: "amdgpu-cluster-dims"="2,2,2"
+__global__ void __cluster_dims__(2, 2, 2) test_literal_3d() {}
+
+// CHECK: "amdgpu-cluster-dims"="2,2,1"
+__global__ void __cluster_dims__(2, 2) test_literal_2d() {}
+
+// CHECK: "amdgpu-cluster-dims"="4,1,1"
+__global__ void __cluster_dims__(4) test_literal_1d() {}
+
+// CHECK: "amdgpu-cluster-dims"="4,2,1"
+__global__ void __cluster_dims__(constint, constint / 2, 1) test_constant() {}
+
+// CHECK: "amdgpu-cluster-dims"="0,0,0"
+__global__ void __no_cluster__ test_no_cluster() {}
+
+// CHECK: "amdgpu-cluster-dims"="7,1,1"
+template<unsigned a>
+__global__ void __cluster_dims__(a) test_template_1d() {}
+template __global__ void test_template_1d<7>();
+
+// CHECK: "amdgpu-cluster-dims"="2,6,1"
+template<unsigned a, unsigned b>
+__global__ void __cluster_dims__(a, b) test_template_2d() {}
+template __global__ void test_template_2d<2, 6>();
+
+// CHECK: "amdgpu-cluster-dims"="1,2,3"
+template<unsigned a, unsigned b, unsigned c>
+__global__ void __cluster_dims__(a, b, c) test_template_3d() {}
+template __global__ void test_template_3d<1, 2, 3>();
diff --git a/clang/test/Misc/pragma-attribute-supported-attributes-list.test b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
index 37ff33e5a1523..c8c913448d968 100644
--- a/clang/test/Misc/pragma-attribute-supported-attributes-list.test
+++ b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
@@ -35,6 +35,7 @@
// CHECK-NEXT: CFUnknownTransfer (SubjectMatchRule_function)
// CHECK-NEXT: CPUDispatch (SubjectMatchRule_function)
// CHECK-NEXT: CPUSpecific (SubjectMatchRule_function)
+// CHECK-NEXT: CUDAClusterDims (SubjectMatchRule_function)
// CHECK-NEXT: CUDAConstant (SubjectMatchRule_variable)
// CHECK-NEXT: CUDADevice (SubjectMatchRule_function, SubjectMatchRule_variable)
// CHECK-NEXT: CUDADeviceBuiltinSurfaceType (SubjectMatchRule_record)
@@ -43,6 +44,7 @@
// CHECK-NEXT: CUDAGridConstant (SubjectMatchRule_variable_is_parameter)
// CHECK-NEXT: CUDAHost (SubjectMatchRule_function)
// CHECK-NEXT: CUDALaunchBounds (SubjectMatchRule_objc_method, SubjectMatchRule_hasType_functionType)
+// CHECK-NEXT: CUDANoCluster (SubjectMatchRule_function)
// CHECK-NEXT: CUDAShared (SubjectMatchRule_variable)
// CHECK-NEXT: CXX11NoReturn (SubjectMatchRule_function)
// CHECK-NEXT: CallableWhen (SubjectMatchRule_function_is_member)
diff --git a/clang/test/SemaCUDA/Inputs/cuda.h b/clang/test/SemaCUDA/Inputs/cuda.h
index 10db947d8246c..2bf45e03d91c7 100644
--- a/clang/test/SemaCUDA/Inputs/cuda.h
+++ b/clang/test/SemaCUDA/Inputs/cuda.h
@@ -13,6 +13,8 @@
#define __managed__ __attribute__((managed))
#define __grid_constant__ __attribute__((grid_constant))
#define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__)))
+#define __cluster_dims__(...) __attribute__((cluster_dims(__VA_ARGS__)))
+#define __no_cluster__ __attribute__((no_cluster))
struct dim3 {
unsigned x, y, z;
diff --git a/clang/test/SemaCUDA/cluster_dims.cu b/clang/test/SemaCUDA/cluster_dims.cu
new file mode 100644
index 0000000000000..3cd0e0197c29b
--- /dev/null
+++ b/clang/test/SemaCUDA/cluster_dims.cu
@@ -0,0 +1,64 @@
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -ast-print -x hip -verify=NS,all %s
+// RUN: %clang_cc1 -triple nvptx-nvidia-cuda -fcuda-is-device -ast-print -x hip -verify=NS,all %s
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1250 -fcuda-is-device -ast-print -x hip -verify=amd,common,all %s | FileCheck -check-prefixes=CHECK %s
+// RUN: %clang_cc1 -triple nvptx-nvidia-cuda -target-cpu sm_90 -fcuda-is-device -ast-print -x hip -verify=cuda,common,all %s | FileCheck -check-prefixes=CHECK %s
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -aux-triple amdgcn-amd-amdhsa -ast-print -x hip -verify=amd,common,all %s | FileCheck -check-prefixes=CHECK %s
+
+#include "Inputs/cuda.h"
+
+const int constint = 4;
+
+// CHECK: __attribute__((global)) __attribute__((cluster_dims(2, 2, 2))) void test_literal_3d()
+__global__ void __cluster_dims__(2, 2, 2) test_literal_3d() {} //NS-error {{__cluster_dims__ is not supported for this GPU architecture}}
+
+// CHECK: __attribute__((global)) __attribute__((cluster_dims(2, 2))) void test_literal_2d()
+__global__ void __cluster_dims__(2, 2) test_literal_2d() {} //NS-error {{__cluster_dims__ is not supported for this GPU architecture}}
+
+// CHECK: __attribute__((global)) __attribute__((cluster_dims(4))) void test_literal_1d()
+__global__ void __cluster_dims__(4) test_literal_1d() {} //NS-error {{__cluster_dims__ is not supported for this GPU architecture}}
+
+// CHECK: __attribute__((global)) __attribute__((cluster_dims(constint, constint / 4, 1))) void test_constant()
+__global__ void __cluster_dims__(constint, constint / 4, 1) test_constant() {} //NS-error {{__cluster_dims__ is not supported for this GPU architecture}}
+
+// CHECK: template <int x, int y, int z> void test_template() __attribute__((cluster_dims(x, y, z)))
+template <int x, int y, int z> void test_template(void) __cluster_dims__(x, y, z){} //NS-error {{__cluster_dims__ is not supported for this GPU architecture}}
+
+// CHECK: template <int x, int y, int z> void test_template_expr() __attribute__((cluster_dims(x + constint, y, z)))
+template <int x, int y, int z> void test_template_expr(void) __cluster_dims__(x + constint, y, z) {} //NS-error {{__cluster_dims__ is not supported for this GPU architecture}}
+
+//NS-error at +1 {{__cluster_dims__ is not supported for this GPU architecture}}
+__global__ void __cluster_dims__(32, 2, 4) test_too_large_dim_0() {} // common-error {{integer constant expression evaluates to value 32 that cannot be represented in a 4-bit unsigned integer type}}
+
+// cuda-error at +2 {{only a maximum of 8 thread blocks in a cluster is supported}}
+// amd-error at +1 {{only a maximum of 16 thread blocks in a cluster is supported}}
+__global__ void __cluster_dims__(4, 4, 4) test_too_large_dim_1() {} // NS-error {{__cluster_dims__ is not supported for this GPU architecture}}
+
+// cuda-error at +3 {{only a maximum of 8 thread blocks in a cluster is supported}}
+// amd-error at +2 {{only a maximum of 16 thread blocks in a cluster is supported}}
+template<unsigned a, unsigned b, unsigned c>
+__global__ void __cluster_dims__(a, b, c) test_too_large_dim_template() {} // NS-error {{__cluster_dims__ is not supported for this GPU architecture}}
+template __global__ void test_too_large_dim_template<4, 4, 4>(); // common-note {{in instantiation of function template specialization 'test_too_large_dim_template<4U, 4U, 4U>' requested here}}
+
+int none_const_int = 4;
+
+//NS-error at +1 {{__cluster_dims__ is not supported for this GPU architecture}}
+__global__ void __cluster_dims__(none_const_int, 2, 4) test_non_constant_0() {} // common-error {{'cluster_dims' attribute requires parameter 0 to be an integer constant}}
+
+//NS-error at +1 {{__cluster_dims__ is not supported for this GPU architecture}}
+__global__ void __cluster_dims__(8, none_const_int / 2, 4) test_non_constant_1() {} // common-error {{'cluster_dims' attribute requires parameter 1 to be an integer constant}}
+
+//NS-error at +1 {{__cluster_dims__ is not supported for this GPU architecture}}
+__global__ void __cluster_dims__(8, 2, none_const_int / 4) test_non_constant_2() {} // common-error {{'cluster_dims' attribute requires parameter 2 to be an integer constant}}
+
+//NS-error at +1 {{__no_cluster__ is not supported for this GPU architecture}}
+__global__ void __no_cluster__ test_no_cluster() {}
+
+//NS-error at +2 {{__no_cluster__ is not supported for this GPU architecture}}
+//NS-error at +1 {{__cluster_dims__ is not supported for this GPU architecture}}
+__global__ void __no_cluster__ __cluster_dims__(2,2,2) test_have_both() {} // common-error {{'cluster_dims' and 'no_cluster' attributes are not compatible}} common-note {{conflicting attribute is here}}
+
+template <int... args>
+__cluster_dims__(args) void test_template_variadic_args(void) {} // all-error {{expression contains unexpanded parameter pack 'args'}}
+
+template <int... args>
+__cluster_dims__(1, args) void test_template_variadic_args_2(void) {} // all-error {{expression contains unexpanded parameter pack 'args'}}
>From b71c60cd676f1d08d065a16b22b0b70fadaf1af9 Mon Sep 17 00:00:00 2001
From: Shilei Tian <i at tianshilei.me>
Date: Wed, 3 Sep 2025 18:44:16 -0400
Subject: [PATCH 2/5] add documentation
---
clang/include/clang/Basic/Attr.td | 4 ++--
clang/include/clang/Basic/AttrDocs.td | 19 +++++++++++++++++++
2 files changed, 21 insertions(+), 2 deletions(-)
diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index b018d2cc1831f..e5f2ca5231204 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -1577,14 +1577,14 @@ def CUDAClusterDims : InheritableAttr {
let Args = [ExprArgument<"X">, ExprArgument<"Y", 1>, ExprArgument<"Z", 1>];
let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
let LangOpts = [CUDA];
- let Documentation = [Undocumented];
+ let Documentation = [CUDAClusterDimsAttrDoc];
}
def CUDANoCluster : InheritableAttr {
let Spellings = [GNU<"no_cluster">, Declspec<"__no_cluster__">];
let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
let LangOpts = [CUDA];
- let Documentation = [Undocumented];
+ let Documentation = [CUDANoClusterAttrDoc];
}
def : MutualExclusions<[CUDAClusterDims, CUDANoCluster]>;
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index ee212a9b50f36..31a4b2796571a 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -7532,6 +7532,25 @@ A managed variable can be accessed in both device and host code.
}];
}
+def CUDAClusterDimsAttrDoc : Documentation {
+ let Category = DocCatDecl;
+ let Content = [{
+In CUDA/HIP programming, the ``__cluster_dims__`` attribute can be applied to a kernel function
+to set the dimensions of a thread block cluster. This allows to group multiple thread blocks into
+a larger unit called a "cluster". `__cluster_dims__` defines the cluster size as ``(X, Y, Z)``,
+where each value is the number of thread blocks in that dimension.
+ }];
+}
+
+def CUDANoClusterAttrDoc : Documentation {
+ let Category = DocCatDecl;
+ let Content = [{
+In CUDA/HIP programming, the ``__no_cluster__`` attribute can be applied to a kernel function to
+indicate that the thread block cluster feature will not be enabled at both compile time and kernel
+launch time. Note: this is a LLVM/Clang only attribute.
+ }];
+}
+
def LifetimeOwnerDocs : Documentation {
let Category = DocCatDecl;
let Content = [{
>From 46e7ef50b48f07a340225d565dbcf418c2bc876e Mon Sep 17 00:00:00 2001
From: Shilei Tian <i at tianshilei.me>
Date: Wed, 10 Sep 2025 12:03:42 -0400
Subject: [PATCH 3/5] fix comments
---
clang/include/clang/Basic/AttrDocs.td | 26 ++++++++++++++-----
.../clang/Basic/DiagnosticSemaKinds.td | 2 +-
clang/lib/Sema/SemaDeclAttr.cpp | 17 ++++++------
3 files changed, 30 insertions(+), 15 deletions(-)
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index 31a4b2796571a..e5174e27d064f 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -7536,18 +7536,32 @@ def CUDAClusterDimsAttrDoc : Documentation {
let Category = DocCatDecl;
let Content = [{
In CUDA/HIP programming, the ``__cluster_dims__`` attribute can be applied to a kernel function
-to set the dimensions of a thread block cluster. This allows to group multiple thread blocks into
-a larger unit called a "cluster". `__cluster_dims__` defines the cluster size as ``(X, Y, Z)``,
-where each value is the number of thread blocks in that dimension.
+to set the dimensions of a thread block cluster. ``__cluster_dims__`` defines the cluster size
+as ``(X, Y, Z)``, where each value is the number of thread blocks in that dimension.
+The ``__cluster_dims__`` and `__no_cluster__`` attributes are mutually exclusive.
+
+.. code::
+
+ __global__ __cluster_dims__(2, 1, 1) void kernel(...) {
+ ...
+ }
+
}];
}
def CUDANoClusterAttrDoc : Documentation {
let Category = DocCatDecl;
let Content = [{
-In CUDA/HIP programming, the ``__no_cluster__`` attribute can be applied to a kernel function to
-indicate that the thread block cluster feature will not be enabled at both compile time and kernel
-launch time. Note: this is a LLVM/Clang only attribute.
+In CUDA/HIP programming, the LLVM/Clang-exclusive ``__no_cluster__`` attribute can be applied to
+a kernel function to indicate that the thread block cluster feature will not be enabled at both
+compile time and kernel launch time. The ``__cluster_dims__`` and `__no_cluster__`` attributes
+are mutually exclusive.
+
+.. code::
+
+ __global__ __no_cluster__ void kernel(...) {
+ ...
+ }
}];
}
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index f5f1b1a7c98d2..86b5aa148a002 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -13056,7 +13056,7 @@ def warn_cuda_maxclusterrank_sm_90 : Warning<
"%1 attribute">, InGroup<IgnoredAttributes>;
def err_cuda_cluster_attr_not_supported : Error<
- "%select{__cluster_dims__|__no_cluster__}0 is not supported for this GPU architecture"
+ "%0 is not supported for this GPU architecture"
>;
def err_cuda_cluster_dims_too_large : Error<
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index ba40babfcea48..d697da5998d09 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -5690,11 +5690,11 @@ makeClusterDimsArgExpr(Sema &S, Expr *E, const CUDAClusterDimsAttr &AL,
// Accept template arguments for now as they depend on something else.
// We'll get to check them when they eventually get instantiated.
- if (E->isValueDependent())
+ if (E->isInstantiationDependent())
return {E, 1};
- std::optional<llvm::APSInt> I = llvm::APSInt(64);
- if (!(I = E->getIntegerConstantExpr(S.Context))) {
+ std::optional<llvm::APSInt> I = E->getIntegerConstantExpr(S.Context);
+ if (!I) {
S.Diag(E->getExprLoc(), diag::err_attribute_argument_n_type)
<< &AL << Idx << AANT_ArgumentIntegerConstant << E->getSourceRange();
return {nullptr, 0};
@@ -5762,7 +5762,7 @@ CUDAClusterDimsAttr *Sema::createClusterDimsAttr(const AttributeCommonInfo &CI,
return nullptr;
}
- return ::new (Context) CUDAClusterDimsAttr(Context, CI, X, Y, Z);
+ return CUDAClusterDimsAttr::Create(Context, X, Y, Z, CI);
}
void Sema::addClusterDimsAttr(Decl *D, const AttributeCommonInfo &CI, Expr *X,
@@ -5772,8 +5772,7 @@ void Sema::addClusterDimsAttr(Decl *D, const AttributeCommonInfo &CI, Expr *X,
}
void Sema::addNoClusterAttr(Decl *D, const AttributeCommonInfo &CI) {
- if (CUDANoClusterAttr *Attr = ::new (Context) CUDANoClusterAttr(Context, CI))
- D->addAttr(Attr);
+ D->addAttr(CUDANoClusterAttr::Create(Context, CI));
}
static void handleClusterDimsAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
@@ -5781,7 +5780,8 @@ static void handleClusterDimsAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
auto Arch = StringToOffloadArch(TTI.getTargetOpts().CPU);
if ((TTI.getTriple().isNVPTX() && Arch < clang::OffloadArch::SM_90) ||
(TTI.getTriple().isAMDGPU() && Arch < clang::OffloadArch::GFX1250)) {
- S.Diag(AL.getLoc(), diag::err_cuda_cluster_attr_not_supported) << 0;
+ S.Diag(AL.getLoc(), diag::err_cuda_cluster_attr_not_supported)
+ << "__cluster_dims__";
return;
}
@@ -5799,7 +5799,8 @@ static void handleNoClusterAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
auto Arch = StringToOffloadArch(TTI.getTargetOpts().CPU);
if ((TTI.getTriple().isNVPTX() && Arch < clang::OffloadArch::SM_90) ||
(TTI.getTriple().isAMDGPU() && Arch < clang::OffloadArch::GFX1250)) {
- S.Diag(AL.getLoc(), diag::err_cuda_cluster_attr_not_supported) << 1;
+ S.Diag(AL.getLoc(), diag::err_cuda_cluster_attr_not_supported)
+ << "__no_cluster__";
return;
}
>From 6080eadfe62cde09c46e49ea7f3857d6c242c74f Mon Sep 17 00:00:00 2001
From: Shilei Tian <i at tianshilei.me>
Date: Wed, 24 Sep 2025 13:00:38 -0400
Subject: [PATCH 4/5] minor improvements
---
clang/lib/Sema/SemaDeclAttr.cpp | 15 ++++++++-------
1 file changed, 8 insertions(+), 7 deletions(-)
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index d697da5998d09..6042e8cf867d4 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -5686,32 +5686,33 @@ static std::pair<Expr *, int>
makeClusterDimsArgExpr(Sema &S, Expr *E, const CUDAClusterDimsAttr &AL,
const unsigned Idx) {
if (S.DiagnoseUnexpandedParameterPack(E))
- return {nullptr, 0};
+ return {};
// Accept template arguments for now as they depend on something else.
// We'll get to check them when they eventually get instantiated.
if (E->isInstantiationDependent())
- return {E, 1};
+ return {};
std::optional<llvm::APSInt> I = E->getIntegerConstantExpr(S.Context);
if (!I) {
S.Diag(E->getExprLoc(), diag::err_attribute_argument_n_type)
<< &AL << Idx << AANT_ArgumentIntegerConstant << E->getSourceRange();
- return {nullptr, 0};
+ return {};
}
// Make sure we can fit it in 4 bits.
if (!I->isIntN(4)) {
S.Diag(E->getExprLoc(), diag::err_ice_too_large)
- << toString(*I, 10, false) << 4 << /* Unsigned */ 1;
- return {nullptr, 0};
+ << toString(*I, 10, false) << 4 << /*Unsigned=*/1;
+ return {};
}
- if (*I < 0)
+ if (*I < 0) {
S.Diag(E->getExprLoc(), diag::warn_attribute_argument_n_negative)
<< &AL << Idx << E->getSourceRange();
+ }
// We may need to perform implicit conversion of the argument.
InitializedEntity Entity = InitializedEntity::InitializeParameter(
- S.Context, S.Context.getConstType(S.Context.IntTy), /*consume*/ false);
+ S.Context, S.Context.getConstType(S.Context.IntTy), /*consume=*/false);
ExprResult ValArg = S.PerformCopyInitialization(Entity, SourceLocation(), E);
assert(!ValArg.isInvalid() &&
"Unexpected PerformCopyInitialization() failure.");
>From 17201e755c13503a44a54da5b051823f7e22c9be Mon Sep 17 00:00:00 2001
From: Shilei Tian <i at tianshilei.me>
Date: Wed, 24 Sep 2025 13:26:26 -0400
Subject: [PATCH 5/5] fix an error
---
clang/lib/Sema/SemaDeclAttr.cpp | 12 +++---------
1 file changed, 3 insertions(+), 9 deletions(-)
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 6042e8cf867d4..20465bf8651cc 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -5691,7 +5691,7 @@ makeClusterDimsArgExpr(Sema &S, Expr *E, const CUDAClusterDimsAttr &AL,
// Accept template arguments for now as they depend on something else.
// We'll get to check them when they eventually get instantiated.
if (E->isInstantiationDependent())
- return {};
+ return {E, 1};
std::optional<llvm::APSInt> I = E->getIntegerConstantExpr(S.Context);
if (!I) {
@@ -5710,14 +5710,8 @@ makeClusterDimsArgExpr(Sema &S, Expr *E, const CUDAClusterDimsAttr &AL,
<< &AL << Idx << E->getSourceRange();
}
- // We may need to perform implicit conversion of the argument.
- InitializedEntity Entity = InitializedEntity::InitializeParameter(
- S.Context, S.Context.getConstType(S.Context.IntTy), /*consume=*/false);
- ExprResult ValArg = S.PerformCopyInitialization(Entity, SourceLocation(), E);
- assert(!ValArg.isInvalid() &&
- "Unexpected PerformCopyInitialization() failure.");
-
- return {ValArg.getAs<Expr>(), I->getZExtValue()};
+ return {ConstantExpr::Create(S.getASTContext(), E, APValue(*I)),
+ I->getZExtValue()};
}
CUDAClusterDimsAttr *Sema::createClusterDimsAttr(const AttributeCommonInfo &CI,
More information about the cfe-commits
mailing list