[clang] [Clang][HIP][CUDA] Add `__cluster_dims__` and `__no_cluster__` attribute (PR #156686)
Shilei Tian via cfe-commits
cfe-commits at lists.llvm.org
Thu Oct 16 15:23:42 PDT 2025
https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/156686
>From 367a570ec538e79f339929c1b0c39b2d11b47cf4 Mon Sep 17 00:00:00 2001
From: Shilei Tian <i at tianshilei.me>
Date: Wed, 3 Sep 2025 10:47:52 -0400
Subject: [PATCH 01/14] [Clang][HIP][CUDA] Add `__cluster_dims__` and
`__no_cluster__` attribute
This PR adds basic frontend support for the `__cluster_dims__` and `__no_cluster__` attributes.
Co-authored-by: Yaxun (Sam) Liu <yaxun.liu at amd.com>
Co-authored-by: Jay Foad <jay.foad at amd.com>
---
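Usage sketch: the new attributes are exercised by the tests below; the snippet here assumes the file is compiled as HIP device code (e.g. `-x hip --offload-arch=gfx1250`) with `__global__` coming from the usual HIP/CUDA headers, and the kernel names are purely illustrative. The GNU spellings are the ones added to Attr.td; the `__cluster_dims__`/`__no_cluster__` macros in the test headers expand to them.

  // Launch this kernel with a 2x2x1 thread block cluster (Y and Z default to 1).
  __global__ void __attribute__((cluster_dims(2, 2))) kernel_with_cluster() {}

  // Promise that the cluster feature is never enabled for this kernel, neither
  // at compile time nor at launch time, so the backend need not assume it.
  __global__ void __attribute__((no_cluster)) kernel_without_cluster() {}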
clang/include/clang/Basic/Attr.td | 17 +++
.../clang/Basic/DiagnosticSemaKinds.td | 8 ++
clang/include/clang/Sema/Sema.h | 8 ++
clang/lib/CodeGen/Targets/AMDGPU.cpp | 26 ++++
.../lib/Headers/__clang_hip_runtime_wrapper.h | 2 +
clang/lib/Sema/SemaDeclAttr.cpp | 130 ++++++++++++++++++
.../lib/Sema/SemaTemplateInstantiateDecl.cpp | 37 +++++
clang/test/CodeGenCUDA/Inputs/cuda.h | 4 +
clang/test/CodeGenCUDA/cluster_dims.cu | 38 +++++
...a-attribute-supported-attributes-list.test | 2 +
clang/test/SemaCUDA/Inputs/cuda.h | 2 +
clang/test/SemaCUDA/cluster_dims.cu | 64 +++++++++
12 files changed, 338 insertions(+)
create mode 100644 clang/test/CodeGenCUDA/cluster_dims.cu
create mode 100644 clang/test/SemaCUDA/cluster_dims.cu
diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index 22e60aa9fe312..73461eb318b44 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -1572,6 +1572,23 @@ def HIPManaged : InheritableAttr {
let Documentation = [HIPManagedAttrDocs];
}
+def CUDAClusterDims : InheritableAttr {
+ let Spellings = [GNU<"cluster_dims">, Declspec<"__cluster_dims__">];
+ let Args = [ExprArgument<"X">, ExprArgument<"Y", 1>, ExprArgument<"Z", 1>];
+ let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
+ let LangOpts = [CUDA];
+ let Documentation = [Undocumented];
+}
+
+def CUDANoCluster : InheritableAttr {
+ let Spellings = [GNU<"no_cluster">, Declspec<"__no_cluster__">];
+ let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
+ let LangOpts = [CUDA];
+ let Documentation = [Undocumented];
+}
+
+def : MutualExclusions<[CUDAClusterDims, CUDANoCluster]>;
+
def CUDAInvalidTarget : InheritableAttr {
let Spellings = [];
let Subjects = SubjectList<[Function]>;
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 12fd7b08683e1..dc33fd49d87b3 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -13070,6 +13070,14 @@ def warn_cuda_maxclusterrank_sm_90 : Warning<
"maxclusterrank requires sm_90 or higher, CUDA arch provided: %0, ignoring "
"%1 attribute">, InGroup<IgnoredAttributes>;
+def err_cuda_cluster_attr_not_supported : Error<
+ "%select{__cluster_dims__|__no_cluster__}0 is not supported for this GPU architecture"
+>;
+
+def err_cuda_cluster_dims_too_large : Error<
+ "only a maximum of %0 thread blocks in a cluster is supported"
+>;
+
// VTable pointer authentication errors
def err_non_polymorphic_vtable_pointer_auth : Error<
"cannot set vtable pointer authentication on monomorphic type %0">;
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index 37598f8530c09..10a356c92d9bd 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -5010,6 +5010,14 @@ class Sema final : public SemaBase {
void AddLaunchBoundsAttr(Decl *D, const AttributeCommonInfo &CI,
Expr *MaxThreads, Expr *MinBlocks, Expr *MaxBlocks);
+ /// Add a cluster_dims attribute to a particular declaration.
+ CUDAClusterDimsAttr *createClusterDimsAttr(const AttributeCommonInfo &CI,
+ Expr *X, Expr *Y, Expr *Z);
+ void addClusterDimsAttr(Decl *D, const AttributeCommonInfo &CI, Expr *X,
+ Expr *Y, Expr *Z);
+ /// Add a no_cluster attribute to a particular declaration.
+ void addNoClusterAttr(Decl *D, const AttributeCommonInfo &CI);
+
enum class RetainOwnershipKind { NS, CF, OS };
UuidAttr *mergeUuidAttr(Decl *D, const AttributeCommonInfo &CI,
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index 0fcbf7e458a34..48855ce485f91 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -342,6 +342,9 @@ static bool requiresAMDGPUProtectedVisibility(const Decl *D,
void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
const FunctionDecl *FD, llvm::Function *F, CodeGenModule &M) const {
+ llvm::StringMap<bool> TargetFetureMap;
+ M.getContext().getFunctionFeatureMap(TargetFetureMap, FD);
+
const auto *ReqdWGS =
M.getLangOpts().OpenCL ? FD->getAttr<ReqdWorkGroupSizeAttr>() : nullptr;
const bool IsOpenCLKernel =
@@ -402,6 +405,29 @@ void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
F->addFnAttr("amdgpu-max-num-workgroups", AttrVal.str());
}
+
+ if (auto *Attr = FD->getAttr<CUDAClusterDimsAttr>()) {
+ uint32_t X =
+ Attr->getX()->EvaluateKnownConstInt(M.getContext()).getExtValue();
+ uint32_t Y =
+ Attr->getY()
+ ? Attr->getY()->EvaluateKnownConstInt(M.getContext()).getExtValue()
+ : 1;
+ uint32_t Z =
+ Attr->getZ()
+ ? Attr->getZ()->EvaluateKnownConstInt(M.getContext()).getExtValue()
+ : 1;
+
+ llvm::SmallString<32> AttrVal;
+ llvm::raw_svector_ostream OS(AttrVal);
+ OS << X << ',' << Y << ',' << Z;
+ F->addFnAttr("amdgpu-cluster-dims", AttrVal.str());
+ }
+
+ // OpenCL doesn't support cluster feature.
+ if ((IsOpenCLKernel && TargetFetureMap.lookup("gfx1250-insts")) ||
+ FD->getAttr<CUDANoClusterAttr>())
+ F->addFnAttr("amdgpu-cluster-dims", "0,0,0");
}
void AMDGPUTargetCodeGenInfo::setTargetAttributes(
diff --git a/clang/lib/Headers/__clang_hip_runtime_wrapper.h b/clang/lib/Headers/__clang_hip_runtime_wrapper.h
index da1e39ac7270e..fb0ece96e1418 100644
--- a/clang/lib/Headers/__clang_hip_runtime_wrapper.h
+++ b/clang/lib/Headers/__clang_hip_runtime_wrapper.h
@@ -25,6 +25,8 @@
#define __constant__ __attribute__((constant))
#define __managed__ __attribute__((managed))
+#define __cluster_dims__(...) __attribute__((cluster_dims(__VA_ARGS__)))
+
#if !defined(__cplusplus) || __cplusplus < 201103L
#define nullptr NULL;
#endif
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index e6f8748db7644..b58570ac0975e 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -5676,6 +5676,130 @@ static void handleLaunchBoundsAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
AL.getNumArgs() > 2 ? AL.getArgAsExpr(2) : nullptr);
}
+static std::pair<Expr *, int>
+makeClusterDimsArgExpr(Sema &S, Expr *E, const CUDAClusterDimsAttr &AL,
+ const unsigned Idx) {
+ if (S.DiagnoseUnexpandedParameterPack(E))
+ return {nullptr, 0};
+
+ // Accept template arguments for now as they depend on something else.
+ // We'll get to check them when they eventually get instantiated.
+ if (E->isValueDependent())
+ return {E, 1};
+
+ std::optional<llvm::APSInt> I = llvm::APSInt(64);
+ if (!(I = E->getIntegerConstantExpr(S.Context))) {
+ S.Diag(E->getExprLoc(), diag::err_attribute_argument_n_type)
+ << &AL << Idx << AANT_ArgumentIntegerConstant << E->getSourceRange();
+ return {nullptr, 0};
+ }
+ // Make sure we can fit it in 4 bits.
+ if (!I->isIntN(4)) {
+ S.Diag(E->getExprLoc(), diag::err_ice_too_large)
+ << toString(*I, 10, false) << 4 << /* Unsigned */ 1;
+ return {nullptr, 0};
+ }
+ if (*I < 0)
+ S.Diag(E->getExprLoc(), diag::warn_attribute_argument_n_negative)
+ << &AL << Idx << E->getSourceRange();
+
+ // We may need to perform implicit conversion of the argument.
+ InitializedEntity Entity = InitializedEntity::InitializeParameter(
+ S.Context, S.Context.getConstType(S.Context.IntTy), /*consume*/ false);
+ ExprResult ValArg = S.PerformCopyInitialization(Entity, SourceLocation(), E);
+ assert(!ValArg.isInvalid() &&
+ "Unexpected PerformCopyInitialization() failure.");
+
+ return {ValArg.getAs<Expr>(), I->getZExtValue()};
+}
+
+CUDAClusterDimsAttr *Sema::createClusterDimsAttr(const AttributeCommonInfo &CI,
+ Expr *X, Expr *Y, Expr *Z) {
+ CUDAClusterDimsAttr TmpAttr(Context, CI, X, Y, Z);
+
+ int ValX = 1;
+ int ValY = 1;
+ int ValZ = 1;
+
+ std::tie(X, ValX) = makeClusterDimsArgExpr(*this, X, TmpAttr, /*Idx=*/0);
+ if (!X)
+ return nullptr;
+
+ if (Y) {
+ std::tie(Y, ValY) = makeClusterDimsArgExpr(*this, Y, TmpAttr, /*Idx=*/1);
+ if (!Y)
+ return nullptr;
+ }
+
+ if (Z) {
+ std::tie(Z, ValZ) = makeClusterDimsArgExpr(*this, Z, TmpAttr, /*Idx=*/2);
+ if (!Z)
+ return nullptr;
+ }
+
+ int FlatDim = ValX * ValY * ValZ;
+ auto TT = (!Context.getLangOpts().CUDAIsDevice && Context.getAuxTargetInfo())
+ ? Context.getAuxTargetInfo()->getTriple()
+ : Context.getTargetInfo().getTriple();
+ int MaxDim = 1;
+ if (TT.isNVPTX())
+ MaxDim = 8;
+ else if (TT.isAMDGPU())
+ MaxDim = 16;
+ else
+ return nullptr;
+
+ // A maximum of 8 thread blocks in a cluster is supported as a portable
+ // cluster size in CUDA. The number is 16 for AMDGPU.
+ if (FlatDim > MaxDim) {
+ Diag(CI.getLoc(), diag::err_cuda_cluster_dims_too_large) << MaxDim;
+ return nullptr;
+ }
+
+ return ::new (Context) CUDAClusterDimsAttr(Context, CI, X, Y, Z);
+}
+
+void Sema::addClusterDimsAttr(Decl *D, const AttributeCommonInfo &CI, Expr *X,
+ Expr *Y, Expr *Z) {
+ if (auto *Attr = createClusterDimsAttr(CI, X, Y, Z))
+ D->addAttr(Attr);
+}
+
+void Sema::addNoClusterAttr(Decl *D, const AttributeCommonInfo &CI) {
+ if (CUDANoClusterAttr *Attr = ::new (Context) CUDANoClusterAttr(Context, CI))
+ D->addAttr(Attr);
+}
+
+static void handleClusterDimsAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
+ auto &TTI = S.Context.getTargetInfo();
+ auto Arch = StringToOffloadArch(TTI.getTargetOpts().CPU);
+ if ((TTI.getTriple().isNVPTX() && Arch < clang::OffloadArch::SM_90) ||
+ (TTI.getTriple().isAMDGPU() && Arch < clang::OffloadArch::GFX1250)) {
+ S.Diag(AL.getLoc(), diag::err_cuda_cluster_attr_not_supported) << 0;
+ return;
+ }
+
+ if (!AL.checkAtLeastNumArgs(S, /*Num=*/1) ||
+ !AL.checkAtMostNumArgs(S, /*Num=*/3))
+ return;
+
+ S.addClusterDimsAttr(D, AL, AL.getArgAsExpr(0),
+ AL.getNumArgs() > 1 ? AL.getArgAsExpr(1) : nullptr,
+ AL.getNumArgs() > 2 ? AL.getArgAsExpr(2) : nullptr);
+}
+
+static void handleNoClusterAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
+ auto &TTI = S.Context.getTargetInfo();
+ auto Arch = StringToOffloadArch(TTI.getTargetOpts().CPU);
+ if ((TTI.getTriple().isNVPTX() && Arch < clang::OffloadArch::SM_90) ||
+ (TTI.getTriple().isAMDGPU() && Arch < clang::OffloadArch::GFX1250)) {
+ S.Diag(AL.getLoc(), diag::err_cuda_cluster_attr_not_supported) << 1;
+ return;
+ }
+
+ S.addNoClusterAttr(D, AL);
+}
+
static void handleArgumentWithTypeTagAttr(Sema &S, Decl *D,
const ParsedAttr &AL) {
if (!AL.isArgIdent(0)) {
@@ -7141,6 +7265,12 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL,
case ParsedAttr::AT_CUDALaunchBounds:
handleLaunchBoundsAttr(S, D, AL);
break;
+ case ParsedAttr::AT_CUDAClusterDims:
+ handleClusterDimsAttr(S, D, AL);
+ break;
+ case ParsedAttr::AT_CUDANoCluster:
+ handleNoClusterAttr(S, D, AL);
+ break;
case ParsedAttr::AT_Restrict:
handleRestrictAttr(S, D, AL);
break;
diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
index 468bc1d677ac2..c1bb2ec1800f8 100644
--- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
@@ -707,6 +707,38 @@ static void instantiateDependentAMDGPUMaxNumWorkGroupsAttr(
S.AMDGPU().addAMDGPUMaxNumWorkGroupsAttr(New, Attr, XExpr, YExpr, ZExpr);
}
+static void instantiateDependentCUDAClusterDimsAttr(
+ Sema &S, const MultiLevelTemplateArgumentList &TemplateArgs,
+ const CUDAClusterDimsAttr &Attr, Decl *New) {
+ EnterExpressionEvaluationContext Unevaluated(
+ S, Sema::ExpressionEvaluationContext::ConstantEvaluated);
+
+ Expr *XExpr = nullptr;
+ Expr *YExpr = nullptr;
+ Expr *ZExpr = nullptr;
+
+ if (Attr.getX()) {
+ ExprResult ResultX = S.SubstExpr(Attr.getX(), TemplateArgs);
+ if (ResultX.isUsable())
+ XExpr = ResultX.getAs<Expr>();
+ }
+
+ if (Attr.getY()) {
+ ExprResult ResultY = S.SubstExpr(Attr.getY(), TemplateArgs);
+ if (ResultY.isUsable())
+ YExpr = ResultY.getAs<Expr>();
+ }
+
+ if (Attr.getZ()) {
+ ExprResult ResultZ = S.SubstExpr(Attr.getZ(), TemplateArgs);
+ if (ResultZ.isUsable())
+ ZExpr = ResultZ.getAs<Expr>();
+ }
+
+ if (XExpr)
+ S.addClusterDimsAttr(New, Attr, XExpr, YExpr, ZExpr);
+}
+
// This doesn't take any template parameters, but we have a custom action that
// needs to happen when the kernel itself is instantiated. We need to run the
// ItaniumMangler to mark the names required to name this kernel.
@@ -921,6 +953,11 @@ void Sema::InstantiateAttrs(const MultiLevelTemplateArgumentList &TemplateArgs,
*this, TemplateArgs, *AMDGPUMaxNumWorkGroups, New);
}
+ if (const auto *CUDAClusterDims = dyn_cast<CUDAClusterDimsAttr>(TmplAttr)) {
+ instantiateDependentCUDAClusterDimsAttr(*this, TemplateArgs,
+ *CUDAClusterDims, New);
+ }
+
if (const auto *ParamAttr = dyn_cast<HLSLParamModifierAttr>(TmplAttr)) {
instantiateDependentHLSLParamModifierAttr(*this, TemplateArgs, ParamAttr,
New);
diff --git a/clang/test/CodeGenCUDA/Inputs/cuda.h b/clang/test/CodeGenCUDA/Inputs/cuda.h
index dc85eae0c5178..e7ad784335027 100644
--- a/clang/test/CodeGenCUDA/Inputs/cuda.h
+++ b/clang/test/CodeGenCUDA/Inputs/cuda.h
@@ -13,6 +13,8 @@
#endif
#define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__)))
#define __grid_constant__ __attribute__((grid_constant))
+#define __cluster_dims__(...) __attribute__((cluster_dims(__VA_ARGS__)))
+#define __no_cluster__ __attribute__((no_cluster))
#else
#define __constant__
#define __device__
@@ -22,6 +24,8 @@
#define __managed__
#define __launch_bounds__(...)
#define __grid_constant__
+#define __cluster_dims__(...)
+#define __no_cluster__
#endif
struct dim3 {
diff --git a/clang/test/CodeGenCUDA/cluster_dims.cu b/clang/test/CodeGenCUDA/cluster_dims.cu
new file mode 100644
index 0000000000000..00635e3572a7f
--- /dev/null
+++ b/clang/test/CodeGenCUDA/cluster_dims.cu
@@ -0,0 +1,38 @@
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1250 -fcuda-is-device -emit-llvm -x hip -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -aux-triple amdgcn-amd-amdhsa -emit-llvm -x hip -o - %s | FileCheck --check-prefix=HOST %s
+
+#include "Inputs/cuda.h"
+
+const int constint = 4;
+
+// HOST-NOT: "amdgpu-cluster-dims"
+
+// CHECK: "amdgpu-cluster-dims"="2,2,2"
+__global__ void __cluster_dims__(2, 2, 2) test_literal_3d() {}
+
+// CHECK: "amdgpu-cluster-dims"="2,2,1"
+__global__ void __cluster_dims__(2, 2) test_literal_2d() {}
+
+// CHECK: "amdgpu-cluster-dims"="4,1,1"
+__global__ void __cluster_dims__(4) test_literal_1d() {}
+
+// CHECK: "amdgpu-cluster-dims"="4,2,1"
+__global__ void __cluster_dims__(constint, constint / 2, 1) test_constant() {}
+
+// CHECK: "amdgpu-cluster-dims"="0,0,0"
+__global__ void __no_cluster__ test_no_cluster() {}
+
+// CHECK: "amdgpu-cluster-dims"="7,1,1"
+template<unsigned a>
+__global__ void __cluster_dims__(a) test_template_1d() {}
+template __global__ void test_template_1d<7>();
+
+// CHECK: "amdgpu-cluster-dims"="2,6,1"
+template<unsigned a, unsigned b>
+__global__ void __cluster_dims__(a, b) test_template_2d() {}
+template __global__ void test_template_2d<2, 6>();
+
+// CHECK: "amdgpu-cluster-dims"="1,2,3"
+template<unsigned a, unsigned b, unsigned c>
+__global__ void __cluster_dims__(a, b, c) test_template_3d() {}
+template __global__ void test_template_3d<1, 2, 3>();
diff --git a/clang/test/Misc/pragma-attribute-supported-attributes-list.test b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
index 73d4cb1769ed5..693d54159804e 100644
--- a/clang/test/Misc/pragma-attribute-supported-attributes-list.test
+++ b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
@@ -35,6 +35,7 @@
// CHECK-NEXT: CFUnknownTransfer (SubjectMatchRule_function)
// CHECK-NEXT: CPUDispatch (SubjectMatchRule_function)
// CHECK-NEXT: CPUSpecific (SubjectMatchRule_function)
+// CHECK-NEXT: CUDAClusterDims (SubjectMatchRule_function)
// CHECK-NEXT: CUDAConstant (SubjectMatchRule_variable)
// CHECK-NEXT: CUDADevice (SubjectMatchRule_function, SubjectMatchRule_variable)
// CHECK-NEXT: CUDADeviceBuiltinSurfaceType (SubjectMatchRule_record)
@@ -43,6 +44,7 @@
// CHECK-NEXT: CUDAGridConstant (SubjectMatchRule_variable_is_parameter)
// CHECK-NEXT: CUDAHost (SubjectMatchRule_function)
// CHECK-NEXT: CUDALaunchBounds (SubjectMatchRule_objc_method, SubjectMatchRule_hasType_functionType)
+// CHECK-NEXT: CUDANoCluster (SubjectMatchRule_function)
// CHECK-NEXT: CUDAShared (SubjectMatchRule_variable)
// CHECK-NEXT: CXX11NoReturn (SubjectMatchRule_function)
// CHECK-NEXT: CallableWhen (SubjectMatchRule_function_is_member)
diff --git a/clang/test/SemaCUDA/Inputs/cuda.h b/clang/test/SemaCUDA/Inputs/cuda.h
index 10db947d8246c..2bf45e03d91c7 100644
--- a/clang/test/SemaCUDA/Inputs/cuda.h
+++ b/clang/test/SemaCUDA/Inputs/cuda.h
@@ -13,6 +13,8 @@
#define __managed__ __attribute__((managed))
#define __grid_constant__ __attribute__((grid_constant))
#define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__)))
+#define __cluster_dims__(...) __attribute__((cluster_dims(__VA_ARGS__)))
+#define __no_cluster__ __attribute__((no_cluster))
struct dim3 {
unsigned x, y, z;
diff --git a/clang/test/SemaCUDA/cluster_dims.cu b/clang/test/SemaCUDA/cluster_dims.cu
new file mode 100644
index 0000000000000..3cd0e0197c29b
--- /dev/null
+++ b/clang/test/SemaCUDA/cluster_dims.cu
@@ -0,0 +1,64 @@
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -ast-print -x hip -verify=NS,all %s
+// RUN: %clang_cc1 -triple nvptx-nvidia-cuda -fcuda-is-device -ast-print -x hip -verify=NS,all %s
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1250 -fcuda-is-device -ast-print -x hip -verify=amd,common,all %s | FileCheck -check-prefixes=CHECK %s
+// RUN: %clang_cc1 -triple nvptx-nvidia-cuda -target-cpu sm_90 -fcuda-is-device -ast-print -x hip -verify=cuda,common,all %s | FileCheck -check-prefixes=CHECK %s
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -aux-triple amdgcn-amd-amdhsa -ast-print -x hip -verify=amd,common,all %s | FileCheck -check-prefixes=CHECK %s
+
+#include "Inputs/cuda.h"
+
+const int constint = 4;
+
+// CHECK: __attribute__((global)) __attribute__((cluster_dims(2, 2, 2))) void test_literal_3d()
+__global__ void __cluster_dims__(2, 2, 2) test_literal_3d() {} //NS-error {{__cluster_dims__ is not supported for this GPU architecture}}
+
+// CHECK: __attribute__((global)) __attribute__((cluster_dims(2, 2))) void test_literal_2d()
+__global__ void __cluster_dims__(2, 2) test_literal_2d() {} //NS-error {{__cluster_dims__ is not supported for this GPU architecture}}
+
+// CHECK: __attribute__((global)) __attribute__((cluster_dims(4))) void test_literal_1d()
+__global__ void __cluster_dims__(4) test_literal_1d() {} //NS-error {{__cluster_dims__ is not supported for this GPU architecture}}
+
+// CHECK: __attribute__((global)) __attribute__((cluster_dims(constint, constint / 4, 1))) void test_constant()
+__global__ void __cluster_dims__(constint, constint / 4, 1) test_constant() {} //NS-error {{__cluster_dims__ is not supported for this GPU architecture}}
+
+// CHECK: template <int x, int y, int z> void test_template() __attribute__((cluster_dims(x, y, z)))
+template <int x, int y, int z> void test_template(void) __cluster_dims__(x, y, z){} //NS-error {{__cluster_dims__ is not supported for this GPU architecture}}
+
+// CHECK: template <int x, int y, int z> void test_template_expr() __attribute__((cluster_dims(x + constint, y, z)))
+template <int x, int y, int z> void test_template_expr(void) __cluster_dims__(x + constint, y, z) {} //NS-error {{__cluster_dims__ is not supported for this GPU architecture}}
+
+//NS-error@+1 {{__cluster_dims__ is not supported for this GPU architecture}}
+__global__ void __cluster_dims__(32, 2, 4) test_too_large_dim_0() {} // common-error {{integer constant expression evaluates to value 32 that cannot be represented in a 4-bit unsigned integer type}}
+
+// cuda-error@+2 {{only a maximum of 8 thread blocks in a cluster is supported}}
+// amd-error@+1 {{only a maximum of 16 thread blocks in a cluster is supported}}
+__global__ void __cluster_dims__(4, 4, 4) test_too_large_dim_1() {} // NS-error {{__cluster_dims__ is not supported for this GPU architecture}}
+
+// cuda-error@+3 {{only a maximum of 8 thread blocks in a cluster is supported}}
+// amd-error@+2 {{only a maximum of 16 thread blocks in a cluster is supported}}
+template<unsigned a, unsigned b, unsigned c>
+__global__ void __cluster_dims__(a, b, c) test_too_large_dim_template() {} // NS-error {{__cluster_dims__ is not supported for this GPU architecture}}
+template __global__ void test_too_large_dim_template<4, 4, 4>(); // common-note {{in instantiation of function template specialization 'test_too_large_dim_template<4U, 4U, 4U>' requested here}}
+
+int none_const_int = 4;
+
+//NS-error@+1 {{__cluster_dims__ is not supported for this GPU architecture}}
+__global__ void __cluster_dims__(none_const_int, 2, 4) test_non_constant_0() {} // common-error {{'cluster_dims' attribute requires parameter 0 to be an integer constant}}
+
+//NS-error@+1 {{__cluster_dims__ is not supported for this GPU architecture}}
+__global__ void __cluster_dims__(8, none_const_int / 2, 4) test_non_constant_1() {} // common-error {{'cluster_dims' attribute requires parameter 1 to be an integer constant}}
+
+//NS-error@+1 {{__cluster_dims__ is not supported for this GPU architecture}}
+__global__ void __cluster_dims__(8, 2, none_const_int / 4) test_non_constant_2() {} // common-error {{'cluster_dims' attribute requires parameter 2 to be an integer constant}}
+
+//NS-error@+1 {{__no_cluster__ is not supported for this GPU architecture}}
+__global__ void __no_cluster__ test_no_cluster() {}
+
+//NS-error@+2 {{__no_cluster__ is not supported for this GPU architecture}}
+//NS-error@+1 {{__cluster_dims__ is not supported for this GPU architecture}}
+__global__ void __no_cluster__ __cluster_dims__(2,2,2) test_have_both() {} // common-error {{'cluster_dims' and 'no_cluster' attributes are not compatible}} common-note {{conflicting attribute is here}}
+
+template <int... args>
+__cluster_dims__(args) void test_template_variadic_args(void) {} // all-error {{expression contains unexpanded parameter pack 'args'}}
+
+template <int... args>
+__cluster_dims__(1, args) void test_template_variadic_args_2(void) {} // all-error {{expression contains unexpanded parameter pack 'args'}}
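A note on the template path tested above: a dependent `cluster_dims` argument is carried through instantiation and only constant-checked and lowered once the kernel is instantiated. A hedged sketch (kernel name invented for illustration; the expected IR attribute value is taken from the CodeGenCUDA test):

  // Dependent arguments are substituted by instantiateDependentCUDAClusterDimsAttr
  // during template instantiation and validated at that point.
  template <unsigned X, unsigned Y>
  __global__ void __attribute__((cluster_dims(X, Y))) tiled_kernel() {}

  // Per the CodeGen test, this instantiation is expected to produce a function
  // carrying "amdgpu-cluster-dims"="2,6,1" when targeting gfx1250.
  template __global__ void tiled_kernel<2, 6>();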
>From c72d2dba9c2591da7431d950661b90b8b801a109 Mon Sep 17 00:00:00 2001
From: Shilei Tian <i at tianshilei.me>
Date: Wed, 3 Sep 2025 18:44:16 -0400
Subject: [PATCH 02/14] add documentation
---
clang/include/clang/Basic/Attr.td | 4 ++--
clang/include/clang/Basic/AttrDocs.td | 19 +++++++++++++++++++
2 files changed, 21 insertions(+), 2 deletions(-)
diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index 73461eb318b44..8353e0359565b 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -1577,14 +1577,14 @@ def CUDAClusterDims : InheritableAttr {
let Args = [ExprArgument<"X">, ExprArgument<"Y", 1>, ExprArgument<"Z", 1>];
let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
let LangOpts = [CUDA];
- let Documentation = [Undocumented];
+ let Documentation = [CUDAClusterDimsAttrDoc];
}
def CUDANoCluster : InheritableAttr {
let Spellings = [GNU<"no_cluster">, Declspec<"__no_cluster__">];
let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
let LangOpts = [CUDA];
- let Documentation = [Undocumented];
+ let Documentation = [CUDANoClusterAttrDoc];
}
def : MutualExclusions<[CUDAClusterDims, CUDANoCluster]>;
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index e0bbda083b5cf..07e1053ddac30 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -7545,6 +7545,25 @@ A managed variable can be accessed in both device and host code.
}];
}
+def CUDAClusterDimsAttrDoc : Documentation {
+ let Category = DocCatDecl;
+ let Content = [{
+In CUDA/HIP programming, the ``__cluster_dims__`` attribute can be applied to a kernel function
+to set the dimensions of a thread block cluster. This allows to group multiple thread blocks into
+a larger unit called a "cluster". `__cluster_dims__` defines the cluster size as ``(X, Y, Z)``,
+where each value is the number of thread blocks in that dimension.
+ }];
+}
+
+def CUDANoClusterAttrDoc : Documentation {
+ let Category = DocCatDecl;
+ let Content = [{
+In CUDA/HIP programming, the ``__no_cluster__`` attribute can be applied to a kernel function to
+indicate that the thread block cluster feature will not be enabled at both compile time and kernel
+launch time. Note: this is a LLVM/Clang only attribute.
+ }];
+}
+
def LifetimeOwnerDocs : Documentation {
let Category = DocCatDecl;
let Content = [{
>From 522803cf7ef59f990bce6b4aa95e60649811153f Mon Sep 17 00:00:00 2001
From: Shilei Tian <i at tianshilei.me>
Date: Wed, 10 Sep 2025 12:03:42 -0400
Subject: [PATCH 03/14] fix comments
---
clang/include/clang/Basic/AttrDocs.td | 26 ++++++++++++++-----
.../clang/Basic/DiagnosticSemaKinds.td | 2 +-
clang/lib/Sema/SemaDeclAttr.cpp | 17 ++++++------
3 files changed, 30 insertions(+), 15 deletions(-)
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index 07e1053ddac30..6a33f08eb2a05 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -7549,18 +7549,32 @@ def CUDAClusterDimsAttrDoc : Documentation {
let Category = DocCatDecl;
let Content = [{
In CUDA/HIP programming, the ``__cluster_dims__`` attribute can be applied to a kernel function
-to set the dimensions of a thread block cluster. This allows to group multiple thread blocks into
-a larger unit called a "cluster". `__cluster_dims__` defines the cluster size as ``(X, Y, Z)``,
-where each value is the number of thread blocks in that dimension.
+to set the dimensions of a thread block cluster. ``__cluster_dims__`` defines the cluster size
+as ``(X, Y, Z)``, where each value is the number of thread blocks in that dimension.
+The ``__cluster_dims__`` and `__no_cluster__`` attributes are mutually exclusive.
+
+.. code::
+
+ __global__ __cluster_dims__(2, 1, 1) void kernel(...) {
+ ...
+ }
+
}];
}
def CUDANoClusterAttrDoc : Documentation {
let Category = DocCatDecl;
let Content = [{
-In CUDA/HIP programming, the ``__no_cluster__`` attribute can be applied to a kernel function to
-indicate that the thread block cluster feature will not be enabled at both compile time and kernel
-launch time. Note: this is a LLVM/Clang only attribute.
+In CUDA/HIP programming, the LLVM/Clang-exclusive ``__no_cluster__`` attribute can be applied to
+a kernel function to indicate that the thread block cluster feature will not be enabled at both
+compile time and kernel launch time. The ``__cluster_dims__`` and `__no_cluster__`` attributes
+are mutually exclusive.
+
+.. code::
+
+ __global__ __no_cluster__ void kernel(...) {
+ ...
+ }
}];
}
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index dc33fd49d87b3..f36cc6055a403 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -13071,7 +13071,7 @@ def warn_cuda_maxclusterrank_sm_90 : Warning<
"%1 attribute">, InGroup<IgnoredAttributes>;
def err_cuda_cluster_attr_not_supported : Error<
- "%select{__cluster_dims__|__no_cluster__}0 is not supported for this GPU architecture"
+ "%0 is not supported for this GPU architecture"
>;
def err_cuda_cluster_dims_too_large : Error<
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index b58570ac0975e..76c3d414bfc15 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -5684,11 +5684,11 @@ makeClusterDimsArgExpr(Sema &S, Expr *E, const CUDAClusterDimsAttr &AL,
// Accept template arguments for now as they depend on something else.
// We'll get to check them when they eventually get instantiated.
- if (E->isValueDependent())
+ if (E->isInstantiationDependent())
return {E, 1};
- std::optional<llvm::APSInt> I = llvm::APSInt(64);
- if (!(I = E->getIntegerConstantExpr(S.Context))) {
+ std::optional<llvm::APSInt> I = E->getIntegerConstantExpr(S.Context);
+ if (!I) {
S.Diag(E->getExprLoc(), diag::err_attribute_argument_n_type)
<< &AL << Idx << AANT_ArgumentIntegerConstant << E->getSourceRange();
return {nullptr, 0};
@@ -5756,7 +5756,7 @@ CUDAClusterDimsAttr *Sema::createClusterDimsAttr(const AttributeCommonInfo &CI,
return nullptr;
}
- return ::new (Context) CUDAClusterDimsAttr(Context, CI, X, Y, Z);
+ return CUDAClusterDimsAttr::Create(Context, X, Y, Z, CI);
}
void Sema::addClusterDimsAttr(Decl *D, const AttributeCommonInfo &CI, Expr *X,
@@ -5766,8 +5766,7 @@ void Sema::addClusterDimsAttr(Decl *D, const AttributeCommonInfo &CI, Expr *X,
}
void Sema::addNoClusterAttr(Decl *D, const AttributeCommonInfo &CI) {
- if (CUDANoClusterAttr *Attr = ::new (Context) CUDANoClusterAttr(Context, CI))
- D->addAttr(Attr);
+ D->addAttr(CUDANoClusterAttr::Create(Context, CI));
}
static void handleClusterDimsAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
@@ -5775,7 +5774,8 @@ static void handleClusterDimsAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
auto Arch = StringToOffloadArch(TTI.getTargetOpts().CPU);
if ((TTI.getTriple().isNVPTX() && Arch < clang::OffloadArch::SM_90) ||
(TTI.getTriple().isAMDGPU() && Arch < clang::OffloadArch::GFX1250)) {
- S.Diag(AL.getLoc(), diag::err_cuda_cluster_attr_not_supported) << 0;
+ S.Diag(AL.getLoc(), diag::err_cuda_cluster_attr_not_supported)
+ << "__cluster_dims__";
return;
}
@@ -5793,7 +5793,8 @@ static void handleNoClusterAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
auto Arch = StringToOffloadArch(TTI.getTargetOpts().CPU);
if ((TTI.getTriple().isNVPTX() && Arch < clang::OffloadArch::SM_90) ||
(TTI.getTriple().isAMDGPU() && Arch < clang::OffloadArch::GFX1250)) {
- S.Diag(AL.getLoc(), diag::err_cuda_cluster_attr_not_supported) << 1;
+ S.Diag(AL.getLoc(), diag::err_cuda_cluster_attr_not_supported)
+ << "__no_cluster__";
return;
}
>From f7c495c743101c747309452b30e1ff45e1ecb80a Mon Sep 17 00:00:00 2001
From: Shilei Tian <i at tianshilei.me>
Date: Wed, 24 Sep 2025 13:00:38 -0400
Subject: [PATCH 04/14] minor improvements
---
clang/lib/Sema/SemaDeclAttr.cpp | 15 ++++++++-------
1 file changed, 8 insertions(+), 7 deletions(-)
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 76c3d414bfc15..1f487fcbe0410 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -5680,32 +5680,33 @@ static std::pair<Expr *, int>
makeClusterDimsArgExpr(Sema &S, Expr *E, const CUDAClusterDimsAttr &AL,
const unsigned Idx) {
if (S.DiagnoseUnexpandedParameterPack(E))
- return {nullptr, 0};
+ return {};
// Accept template arguments for now as they depend on something else.
// We'll get to check them when they eventually get instantiated.
if (E->isInstantiationDependent())
- return {E, 1};
+ return {};
std::optional<llvm::APSInt> I = E->getIntegerConstantExpr(S.Context);
if (!I) {
S.Diag(E->getExprLoc(), diag::err_attribute_argument_n_type)
<< &AL << Idx << AANT_ArgumentIntegerConstant << E->getSourceRange();
- return {nullptr, 0};
+ return {};
}
// Make sure we can fit it in 4 bits.
if (!I->isIntN(4)) {
S.Diag(E->getExprLoc(), diag::err_ice_too_large)
- << toString(*I, 10, false) << 4 << /* Unsigned */ 1;
- return {nullptr, 0};
+ << toString(*I, 10, false) << 4 << /*Unsigned=*/1;
+ return {};
}
- if (*I < 0)
+ if (*I < 0) {
S.Diag(E->getExprLoc(), diag::warn_attribute_argument_n_negative)
<< &AL << Idx << E->getSourceRange();
+ }
// We may need to perform implicit conversion of the argument.
InitializedEntity Entity = InitializedEntity::InitializeParameter(
- S.Context, S.Context.getConstType(S.Context.IntTy), /*consume*/ false);
+ S.Context, S.Context.getConstType(S.Context.IntTy), /*consume=*/false);
ExprResult ValArg = S.PerformCopyInitialization(Entity, SourceLocation(), E);
assert(!ValArg.isInvalid() &&
"Unexpected PerformCopyInitialization() failure.");
>From bb3ad330f3327dc40fdee6d8fc182d6ff07e6635 Mon Sep 17 00:00:00 2001
From: Shilei Tian <i at tianshilei.me>
Date: Wed, 24 Sep 2025 13:26:26 -0400
Subject: [PATCH 05/14] fix an error
---
clang/lib/Sema/SemaDeclAttr.cpp | 12 +++---------
1 file changed, 3 insertions(+), 9 deletions(-)
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 1f487fcbe0410..e2eae49f219ce 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -5685,7 +5685,7 @@ makeClusterDimsArgExpr(Sema &S, Expr *E, const CUDAClusterDimsAttr &AL,
// Accept template arguments for now as they depend on something else.
// We'll get to check them when they eventually get instantiated.
if (E->isInstantiationDependent())
- return {};
+ return {E, 1};
std::optional<llvm::APSInt> I = E->getIntegerConstantExpr(S.Context);
if (!I) {
@@ -5704,14 +5704,8 @@ makeClusterDimsArgExpr(Sema &S, Expr *E, const CUDAClusterDimsAttr &AL,
<< &AL << Idx << E->getSourceRange();
}
- // We may need to perform implicit conversion of the argument.
- InitializedEntity Entity = InitializedEntity::InitializeParameter(
- S.Context, S.Context.getConstType(S.Context.IntTy), /*consume=*/false);
- ExprResult ValArg = S.PerformCopyInitialization(Entity, SourceLocation(), E);
- assert(!ValArg.isInvalid() &&
- "Unexpected PerformCopyInitialization() failure.");
-
- return {ValArg.getAs<Expr>(), I->getZExtValue()};
+ return {ConstantExpr::Create(S.getASTContext(), E, APValue(*I)),
+ I->getZExtValue()};
}
CUDAClusterDimsAttr *Sema::createClusterDimsAttr(const AttributeCommonInfo &CI,
>From ad3ab69f45f4d6470d008db6f1fc463b8401c7b7 Mon Sep 17 00:00:00 2001
From: Shilei Tian <i at tianshilei.me>
Date: Thu, 25 Sep 2025 19:29:02 -0400
Subject: [PATCH 06/14] fix another comment
---
clang/include/clang/Basic/Attr.td | 4 ++--
.../test/Misc/pragma-attribute-supported-attributes-list.test | 4 ++--
2 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index 8353e0359565b..cff5df703ce1c 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -1575,14 +1575,14 @@ def HIPManaged : InheritableAttr {
def CUDAClusterDims : InheritableAttr {
let Spellings = [GNU<"cluster_dims">, Declspec<"__cluster_dims__">];
let Args = [ExprArgument<"X">, ExprArgument<"Y", 1>, ExprArgument<"Z", 1>];
- let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
+ let Subjects = SubjectList<[ObjCMethod, FunctionLike]>;
let LangOpts = [CUDA];
let Documentation = [CUDAClusterDimsAttrDoc];
}
def CUDANoCluster : InheritableAttr {
let Spellings = [GNU<"no_cluster">, Declspec<"__no_cluster__">];
- let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
+ let Subjects = SubjectList<[ObjCMethod, FunctionLike]>;
let LangOpts = [CUDA];
let Documentation = [CUDANoClusterAttrDoc];
}
diff --git a/clang/test/Misc/pragma-attribute-supported-attributes-list.test b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
index 693d54159804e..ab4153a64f028 100644
--- a/clang/test/Misc/pragma-attribute-supported-attributes-list.test
+++ b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
@@ -35,7 +35,7 @@
// CHECK-NEXT: CFUnknownTransfer (SubjectMatchRule_function)
// CHECK-NEXT: CPUDispatch (SubjectMatchRule_function)
// CHECK-NEXT: CPUSpecific (SubjectMatchRule_function)
-// CHECK-NEXT: CUDAClusterDims (SubjectMatchRule_function)
+// CHECK-NEXT: CUDAClusterDims (SubjectMatchRule_objc_method, SubjectMatchRule_hasType_functionType)
// CHECK-NEXT: CUDAConstant (SubjectMatchRule_variable)
// CHECK-NEXT: CUDADevice (SubjectMatchRule_function, SubjectMatchRule_variable)
// CHECK-NEXT: CUDADeviceBuiltinSurfaceType (SubjectMatchRule_record)
@@ -44,7 +44,7 @@
// CHECK-NEXT: CUDAGridConstant (SubjectMatchRule_variable_is_parameter)
// CHECK-NEXT: CUDAHost (SubjectMatchRule_function)
// CHECK-NEXT: CUDALaunchBounds (SubjectMatchRule_objc_method, SubjectMatchRule_hasType_functionType)
-// CHECK-NEXT: CUDANoCluster (SubjectMatchRule_function)
+// CHECK-NEXT: CUDANoCluster (SubjectMatchRule_objc_method, SubjectMatchRule_hasType_functionType)
// CHECK-NEXT: CUDAShared (SubjectMatchRule_variable)
// CHECK-NEXT: CXX11NoReturn (SubjectMatchRule_function)
// CHECK-NEXT: CallableWhen (SubjectMatchRule_function_is_member)
>From a90372b39ff6e42405ef1d786c61b89ee75293b9 Mon Sep 17 00:00:00 2001
From: Shilei Tian <i at tianshilei.me>
Date: Fri, 26 Sep 2025 00:41:39 -0400
Subject: [PATCH 07/14] more information on no_dims
---
clang/include/clang/Basic/AttrDocs.td | 12 ++++++++----
1 file changed, 8 insertions(+), 4 deletions(-)
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index 6a33f08eb2a05..7b050a9fad6ff 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -7565,10 +7565,14 @@ The ``__cluster_dims__`` and `__no_cluster__`` attributes are mutually exclusive
def CUDANoClusterAttrDoc : Documentation {
let Category = DocCatDecl;
let Content = [{
-In CUDA/HIP programming, the LLVM/Clang-exclusive ``__no_cluster__`` attribute can be applied to
-a kernel function to indicate that the thread block cluster feature will not be enabled at both
-compile time and kernel launch time. The ``__cluster_dims__`` and `__no_cluster__`` attributes
-are mutually exclusive.
+In CUDA/HIP programming, a kernel function can still be launched with the cluster feature
+enabled at runtime, even without the ``__cluster_dims__`` attribute. The LLVM/Clang-exclusive
+``__no_cluster__`` attribute can be applied to a kernel function to explicitly indicate that
+the cluster feature will not be enabled either at compile time or at kernel launch time. This
+allows the compiler to apply certain optimizations without assuming that clustering could be
+enabled at runtime. It is undefined behavior to launch a kernel with the ``__no_cluster__``
+attribute if the cluster feature is enabled at runtime. The ``__cluster_dims__`` and
+``__no_cluster__`` attributes are mutually exclusive.
.. code::
>From 3512e3ad3ea9cb1945a1c8f19a5dd5348df72274 Mon Sep 17 00:00:00 2001
From: Shilei Tian <i at tianshilei.me>
Date: Tue, 30 Sep 2025 14:37:50 -0400
Subject: [PATCH 08/14] drop `__` in `Declspec`
---
clang/include/clang/Basic/Attr.td | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index cff5df703ce1c..22b80bcf352ae 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -1573,7 +1573,7 @@ def HIPManaged : InheritableAttr {
}
def CUDAClusterDims : InheritableAttr {
- let Spellings = [GNU<"cluster_dims">, Declspec<"__cluster_dims__">];
+ let Spellings = [GNU<"cluster_dims">, Declspec<"cluster_dims">];
let Args = [ExprArgument<"X">, ExprArgument<"Y", 1>, ExprArgument<"Z", 1>];
let Subjects = SubjectList<[ObjCMethod, FunctionLike]>;
let LangOpts = [CUDA];
@@ -1581,7 +1581,7 @@ def CUDAClusterDims : InheritableAttr {
}
def CUDANoCluster : InheritableAttr {
- let Spellings = [GNU<"no_cluster">, Declspec<"__no_cluster__">];
+ let Spellings = [GNU<"no_cluster">, Declspec<"no_cluster">];
let Subjects = SubjectList<[ObjCMethod, FunctionLike]>;
let LangOpts = [CUDA];
let Documentation = [CUDANoClusterAttrDoc];
>From 452e14cdbb2b0ed7372f9a99ad395c67933c78bf Mon Sep 17 00:00:00 2001
From: Shilei Tian <i at tianshilei.me>
Date: Tue, 30 Sep 2025 17:59:55 -0400
Subject: [PATCH 09/14] fix comments
---
clang/include/clang/Basic/Attr.td | 2 +-
clang/lib/CodeGen/Targets/AMDGPU.cpp | 19 +++++++------------
.../lib/Sema/SemaTemplateInstantiateDecl.cpp | 6 +++---
3 files changed, 11 insertions(+), 16 deletions(-)
diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index 22b80bcf352ae..b0f97f745bcd9 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -1574,7 +1574,7 @@ def HIPManaged : InheritableAttr {
def CUDAClusterDims : InheritableAttr {
let Spellings = [GNU<"cluster_dims">, Declspec<"cluster_dims">];
- let Args = [ExprArgument<"X">, ExprArgument<"Y", 1>, ExprArgument<"Z", 1>];
+ let Args = [ExprArgument<"X">, ExprArgument<"Y", /*opt=*/1>, ExprArgument<"Z", /*opt=*/1>];
let Subjects = SubjectList<[ObjCMethod, FunctionLike]>;
let LangOpts = [CUDA];
let Documentation = [CUDAClusterDimsAttrDoc];
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index 48855ce485f91..ee4d2aa660269 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -407,20 +407,15 @@ void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
}
if (auto *Attr = FD->getAttr<CUDAClusterDimsAttr>()) {
- uint32_t X =
- Attr->getX()->EvaluateKnownConstInt(M.getContext()).getExtValue();
- uint32_t Y =
- Attr->getY()
- ? Attr->getY()->EvaluateKnownConstInt(M.getContext()).getExtValue()
- : 1;
- uint32_t Z =
- Attr->getZ()
- ? Attr->getZ()->EvaluateKnownConstInt(M.getContext()).getExtValue()
- : 1;
-
+ auto GetExprVal = [&](const auto &E) {
+ return E ? E->EvaluateKnownConstInt(M.getContext()).getExtValue() : 1;
+ };
+ unsigned X = GetExprVal(Attr->getX());
+ unsigned Y = GetExprVal(Attr->getY());
+ unsigned Z = GetExprVal(Attr->getZ());
llvm::SmallString<32> AttrVal;
llvm::raw_svector_ostream OS(AttrVal);
- OS << X << ',' << Y << ',' << Z;
+ OS << X << ", " << Y << ", " << Z;
F->addFnAttr("amdgpu-cluster-dims", AttrVal.str());
}
diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
index c1bb2ec1800f8..b9e7a991cd6e6 100644
--- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
@@ -720,19 +720,19 @@ static void instantiateDependentCUDAClusterDimsAttr(
if (Attr.getX()) {
ExprResult ResultX = S.SubstExpr(Attr.getX(), TemplateArgs);
if (ResultX.isUsable())
- XExpr = ResultX.getAs<Expr>();
+ XExpr = ResultX.get();
}
if (Attr.getY()) {
ExprResult ResultY = S.SubstExpr(Attr.getY(), TemplateArgs);
if (ResultY.isUsable())
- YExpr = ResultY.getAs<Expr>();
+ YExpr = ResultY.get();
}
if (Attr.getZ()) {
ExprResult ResultZ = S.SubstExpr(Attr.getZ(), TemplateArgs);
if (ResultZ.isUsable())
- ZExpr = ResultZ.getAs<Expr>();
+ ZExpr = ResultZ.get();
}
if (XExpr)
>From 07f62e6dec4be19ea21f3a90babe02b2a4984980 Mon Sep 17 00:00:00 2001
From: Shilei Tian <i at tianshilei.me>
Date: Mon, 6 Oct 2025 13:53:51 -0400
Subject: [PATCH 10/14] refine target feature lookup; fix comments
---
clang/include/clang/Basic/Attr.td | 4 ++--
clang/include/clang/Basic/AttrDocs.td | 7 +++---
.../clang/Basic/DiagnosticSemaKinds.td | 2 +-
clang/lib/CodeGen/Targets/AMDGPU.cpp | 8 ++++---
clang/lib/Sema/SemaDeclAttr.cpp | 24 +++++++++++--------
.../lib/Sema/SemaTemplateInstantiateDecl.cpp | 3 +--
clang/test/SemaCUDA/cluster_dims.cu | 8 +++----
7 files changed, 31 insertions(+), 25 deletions(-)
diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index b0f97f745bcd9..eb48a0c01fd1e 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -1573,7 +1573,7 @@ def HIPManaged : InheritableAttr {
}
def CUDAClusterDims : InheritableAttr {
- let Spellings = [GNU<"cluster_dims">, Declspec<"cluster_dims">];
+ let Spellings = [GNU<"cluster_dims">];
let Args = [ExprArgument<"X">, ExprArgument<"Y", /*opt=*/1>, ExprArgument<"Z", /*opt=*/1>];
let Subjects = SubjectList<[ObjCMethod, FunctionLike]>;
let LangOpts = [CUDA];
@@ -1581,7 +1581,7 @@ def CUDAClusterDims : InheritableAttr {
}
def CUDANoCluster : InheritableAttr {
- let Spellings = [GNU<"no_cluster">, Declspec<"no_cluster">];
+ let Spellings = [GNU<"no_cluster">];
let Subjects = SubjectList<[ObjCMethod, FunctionLike]>;
let LangOpts = [CUDA];
let Documentation = [CUDANoClusterAttrDoc];
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index 7b050a9fad6ff..2ab53208f4642 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -7549,9 +7549,10 @@ def CUDAClusterDimsAttrDoc : Documentation {
let Category = DocCatDecl;
let Content = [{
In CUDA/HIP programming, the ``__cluster_dims__`` attribute can be applied to a kernel function
-to set the dimensions of a thread block cluster. ``__cluster_dims__`` defines the cluster size
-as ``(X, Y, Z)``, where each value is the number of thread blocks in that dimension.
-The ``__cluster_dims__`` and `__no_cluster__`` attributes are mutually exclusive.
+to set the dimensions of a thread block cluster, which is an optional level of hierarchy and made
+up of thread blocks. ``__cluster_dims__`` defines the cluster size as ``(X, Y, Z)``, where each value
+is the number of thread blocks in that dimension. The ``__cluster_dims__`` and `__no_cluster__``
+attributes are mutually exclusive.
.. code::
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index f36cc6055a403..c13cfe593c24e 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -13075,7 +13075,7 @@ def err_cuda_cluster_attr_not_supported : Error<
>;
def err_cuda_cluster_dims_too_large : Error<
- "only a maximum of %0 thread blocks in a cluster is supported"
+ "cluster does not support more than %0 thread blocks; %1 provided"
>;
// VTable pointer authentication errors
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index ee4d2aa660269..bb14d4602894b 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -415,13 +415,15 @@ void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
unsigned Z = GetExprVal(Attr->getZ());
llvm::SmallString<32> AttrVal;
llvm::raw_svector_ostream OS(AttrVal);
- OS << X << ", " << Y << ", " << Z;
+ OS << X << ',' << Y << ',' << Z;
F->addFnAttr("amdgpu-cluster-dims", AttrVal.str());
}
// OpenCL doesn't support cluster feature.
- if ((IsOpenCLKernel && TargetFetureMap.lookup("gfx1250-insts")) ||
- FD->getAttr<CUDANoClusterAttr>())
+ const TargetInfo &TTI = M.getContext().getTargetInfo();
+ if ((IsOpenCLKernel &&
+ TTI.hasFeatureEnabled(TTI.getTargetOpts().FeatureMap, "clusters")) ||
+ FD->hasAttr<CUDANoClusterAttr>())
F->addFnAttr("amdgpu-cluster-dims", "0,0,0");
}
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index e2eae49f219ce..6da09491bbd9a 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -5733,9 +5733,10 @@ CUDAClusterDimsAttr *Sema::createClusterDimsAttr(const AttributeCommonInfo &CI,
}
int FlatDim = ValX * ValY * ValZ;
- auto TT = (!Context.getLangOpts().CUDAIsDevice && Context.getAuxTargetInfo())
- ? Context.getAuxTargetInfo()->getTriple()
- : Context.getTargetInfo().getTriple();
+ const llvm::Triple TT =
+ (!Context.getLangOpts().CUDAIsDevice && Context.getAuxTargetInfo())
+ ? Context.getAuxTargetInfo()->getTriple()
+ : Context.getTargetInfo().getTriple();
int MaxDim = 1;
if (TT.isNVPTX())
MaxDim = 8;
@@ -5747,7 +5748,8 @@ CUDAClusterDimsAttr *Sema::createClusterDimsAttr(const AttributeCommonInfo &CI,
// A maximum of 8 thread blocks in a cluster is supported as a portable
// cluster size in CUDA. The number is 16 for AMDGPU.
if (FlatDim > MaxDim) {
- Diag(CI.getLoc(), diag::err_cuda_cluster_dims_too_large) << MaxDim;
+ Diag(CI.getLoc(), diag::err_cuda_cluster_dims_too_large)
+ << MaxDim << FlatDim;
return nullptr;
}
@@ -5765,10 +5767,11 @@ void Sema::addNoClusterAttr(Decl *D, const AttributeCommonInfo &CI) {
}
static void handleClusterDimsAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
- auto &TTI = S.Context.getTargetInfo();
- auto Arch = StringToOffloadArch(TTI.getTargetOpts().CPU);
+ const TargetInfo &TTI = S.Context.getTargetInfo();
+ OffloadArch Arch = StringToOffloadArch(TTI.getTargetOpts().CPU);
if ((TTI.getTriple().isNVPTX() && Arch < clang::OffloadArch::SM_90) ||
- (TTI.getTriple().isAMDGPU() && Arch < clang::OffloadArch::GFX1250)) {
+ (TTI.getTriple().isAMDGPU() &&
+ !TTI.hasFeatureEnabled(TTI.getTargetOpts().FeatureMap, "clusters"))) {
S.Diag(AL.getLoc(), diag::err_cuda_cluster_attr_not_supported)
<< "__cluster_dims__";
return;
@@ -5784,10 +5787,11 @@ static void handleClusterDimsAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
}
static void handleNoClusterAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
- auto &TTI = S.Context.getTargetInfo();
- auto Arch = StringToOffloadArch(TTI.getTargetOpts().CPU);
+ const TargetInfo &TTI = S.Context.getTargetInfo();
+ OffloadArch Arch = StringToOffloadArch(TTI.getTargetOpts().CPU);
if ((TTI.getTriple().isNVPTX() && Arch < clang::OffloadArch::SM_90) ||
- (TTI.getTriple().isAMDGPU() && Arch < clang::OffloadArch::GFX1250)) {
+ (TTI.getTriple().isAMDGPU() &&
+ !TTI.hasFeatureEnabled(TTI.getTargetOpts().FeatureMap, "clusters"))) {
S.Diag(AL.getLoc(), diag::err_cuda_cluster_attr_not_supported)
<< "__no_cluster__";
return;
diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
index b9e7a991cd6e6..4717b4a44adb7 100644
--- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
@@ -735,8 +735,7 @@ static void instantiateDependentCUDAClusterDimsAttr(
ZExpr = ResultZ.get();
}
- if (XExpr)
- S.addClusterDimsAttr(New, Attr, XExpr, YExpr, ZExpr);
+ S.addClusterDimsAttr(New, Attr, XExpr, YExpr, ZExpr);
}
// This doesn't take any template parameters, but we have a custom action that
diff --git a/clang/test/SemaCUDA/cluster_dims.cu b/clang/test/SemaCUDA/cluster_dims.cu
index 3cd0e0197c29b..a4797ef007eaa 100644
--- a/clang/test/SemaCUDA/cluster_dims.cu
+++ b/clang/test/SemaCUDA/cluster_dims.cu
@@ -29,12 +29,12 @@ template <int x, int y, int z> void test_template_expr(void) __cluster_dims__(x
//NS-error@+1 {{__cluster_dims__ is not supported for this GPU architecture}}
__global__ void __cluster_dims__(32, 2, 4) test_too_large_dim_0() {} // common-error {{integer constant expression evaluates to value 32 that cannot be represented in a 4-bit unsigned integer type}}
-// cuda-error@+2 {{only a maximum of 8 thread blocks in a cluster is supported}}
-// amd-error@+1 {{only a maximum of 16 thread blocks in a cluster is supported}}
+// cuda-error@+2 {{cluster does not support more than 8 thread blocks; 64 provided}}
+// amd-error@+1 {{cluster does not support more than 16 thread blocks; 64 provided}}
__global__ void __cluster_dims__(4, 4, 4) test_too_large_dim_1() {} // NS-error {{__cluster_dims__ is not supported for this GPU architecture}}
-// cuda-error@+3 {{only a maximum of 8 thread blocks in a cluster is supported}}
-// amd-error@+2 {{only a maximum of 16 thread blocks in a cluster is supported}}
+// cuda-error@+3 {{cluster does not support more than 8 thread blocks; 64 provided}}
+// amd-error@+2 {{cluster does not support more than 16 thread blocks; 64 provided}}
template<unsigned a, unsigned b, unsigned c>
__global__ void __cluster_dims__(a, b, c) test_too_large_dim_template() {} // NS-error {{__cluster_dims__ is not supported for this GPU architecture}}
template __global__ void test_too_large_dim_template<4, 4, 4>(); // common-note {{in instantiation of function template specialization 'test_too_large_dim_template<4U, 4U, 4U>' requested here}}
>From 56b35cd61c7126deec85437600f8ace0b22284be Mon Sep 17 00:00:00 2001
From: Shilei Tian <i at tianshilei.me>
Date: Thu, 16 Oct 2025 14:39:46 -0400
Subject: [PATCH 11/14] fix more comments
---
.../clang/Basic/DiagnosticSemaKinds.td | 10 +++---
clang/lib/Sema/SemaDeclAttr.cpp | 32 ++++++-------------
.../lib/Sema/SemaTemplateInstantiateDecl.cpp | 26 ++++-----------
3 files changed, 19 insertions(+), 49 deletions(-)
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index c13cfe593c24e..22de85d90a3cf 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -13070,13 +13070,11 @@ def warn_cuda_maxclusterrank_sm_90 : Warning<
"maxclusterrank requires sm_90 or higher, CUDA arch provided: %0, ignoring "
"%1 attribute">, InGroup<IgnoredAttributes>;
-def err_cuda_cluster_attr_not_supported : Error<
- "%0 is not supported for this GPU architecture"
->;
+def err_cluster_attr_not_supported : Error<
+ "%0 is not supported for this GPU architecture">;
-def err_cuda_cluster_dims_too_large : Error<
- "cluster does not support more than %0 thread blocks; %1 provided"
->;
+def err_cluster_dims_too_large : Error<
+ "cluster does not support more than %0 thread blocks; %1 provided">;
// VTable pointer authentication errors
def err_non_polymorphic_vtable_pointer_auth : Error<
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 6da09491bbd9a..d6f2c47676bc0 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -5679,7 +5679,7 @@ static void handleLaunchBoundsAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
static std::pair<Expr *, int>
makeClusterDimsArgExpr(Sema &S, Expr *E, const CUDAClusterDimsAttr &AL,
const unsigned Idx) {
- if (S.DiagnoseUnexpandedParameterPack(E))
+ if (!E || S.DiagnoseUnexpandedParameterPack(E))
return {};
// Accept template arguments for now as they depend on something else.
@@ -5712,26 +5712,13 @@ CUDAClusterDimsAttr *Sema::createClusterDimsAttr(const AttributeCommonInfo &CI,
Expr *X, Expr *Y, Expr *Z) {
CUDAClusterDimsAttr TmpAttr(Context, CI, X, Y, Z);
- int ValX = 1;
- int ValY = 1;
- int ValZ = 1;
+ auto [NewX, ValX] = makeClusterDimsArgExpr(*this, X, TmpAttr, /*Idx=*/0);
+ auto [NewY, ValY] = makeClusterDimsArgExpr(*this, Y, TmpAttr, /*Idx=*/1);
+ auto [NewZ, ValZ] = makeClusterDimsArgExpr(*this, Z, TmpAttr, /*Idx=*/2);
- std::tie(X, ValX) = makeClusterDimsArgExpr(*this, X, TmpAttr, /*Idx=*/0);
- if (!X)
+ if (!NewX || (Y && !NewY) || (Z && !NewZ))
return nullptr;
- if (Y) {
- std::tie(Y, ValY) = makeClusterDimsArgExpr(*this, Y, TmpAttr, /*Idx=*/1);
- if (!Y)
- return nullptr;
- }
-
- if (Z) {
- std::tie(Z, ValZ) = makeClusterDimsArgExpr(*this, Z, TmpAttr, /*Idx=*/2);
- if (!Z)
- return nullptr;
- }
-
int FlatDim = ValX * ValY * ValZ;
const llvm::Triple TT =
(!Context.getLangOpts().CUDAIsDevice && Context.getAuxTargetInfo())
@@ -5748,12 +5735,11 @@ CUDAClusterDimsAttr *Sema::createClusterDimsAttr(const AttributeCommonInfo &CI,
// A maximum of 8 thread blocks in a cluster is supported as a portable
// cluster size in CUDA. The number is 16 for AMDGPU.
if (FlatDim > MaxDim) {
- Diag(CI.getLoc(), diag::err_cuda_cluster_dims_too_large)
- << MaxDim << FlatDim;
+ Diag(CI.getLoc(), diag::err_cluster_dims_too_large) << MaxDim << FlatDim;
return nullptr;
}
- return CUDAClusterDimsAttr::Create(Context, X, Y, Z, CI);
+ return CUDAClusterDimsAttr::Create(Context, NewX, NewY, NewZ, CI);
}
void Sema::addClusterDimsAttr(Decl *D, const AttributeCommonInfo &CI, Expr *X,
@@ -5772,7 +5758,7 @@ static void handleClusterDimsAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
if ((TTI.getTriple().isNVPTX() && Arch < clang::OffloadArch::SM_90) ||
(TTI.getTriple().isAMDGPU() &&
!TTI.hasFeatureEnabled(TTI.getTargetOpts().FeatureMap, "clusters"))) {
- S.Diag(AL.getLoc(), diag::err_cuda_cluster_attr_not_supported)
+ S.Diag(AL.getLoc(), diag::err_cluster_attr_not_supported)
<< "__cluster_dims__";
return;
}
@@ -5792,7 +5778,7 @@ static void handleNoClusterAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
if ((TTI.getTriple().isNVPTX() && Arch < clang::OffloadArch::SM_90) ||
(TTI.getTriple().isAMDGPU() &&
!TTI.hasFeatureEnabled(TTI.getTargetOpts().FeatureMap, "clusters"))) {
- S.Diag(AL.getLoc(), diag::err_cuda_cluster_attr_not_supported)
+ S.Diag(AL.getLoc(), diag::err_cluster_attr_not_supported)
<< "__no_cluster__";
return;
}
diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
index 4717b4a44adb7..fe6d3328acbeb 100644
--- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
@@ -713,27 +713,13 @@ static void instantiateDependentCUDAClusterDimsAttr(
EnterExpressionEvaluationContext Unevaluated(
S, Sema::ExpressionEvaluationContext::ConstantEvaluated);
- Expr *XExpr = nullptr;
- Expr *YExpr = nullptr;
- Expr *ZExpr = nullptr;
-
- if (Attr.getX()) {
- ExprResult ResultX = S.SubstExpr(Attr.getX(), TemplateArgs);
- if (ResultX.isUsable())
- XExpr = ResultX.get();
- }
-
- if (Attr.getY()) {
- ExprResult ResultY = S.SubstExpr(Attr.getY(), TemplateArgs);
- if (ResultY.isUsable())
- YExpr = ResultY.get();
- }
+ auto SubstElt = [&S, &TemplateArgs](Expr *E) {
+ return E ? S.SubstExpr(E, TemplateArgs).get() : nullptr;
+ };
- if (Attr.getZ()) {
- ExprResult ResultZ = S.SubstExpr(Attr.getZ(), TemplateArgs);
- if (ResultZ.isUsable())
- ZExpr = ResultZ.get();
- }
+ Expr *XExpr = SubstElt(Attr.getX());
+ Expr *YExpr = SubstElt(Attr.getY());
+ Expr *ZExpr = SubstElt(Attr.getZ());
S.addClusterDimsAttr(New, Attr, XExpr, YExpr, ZExpr);
}
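For illustration only, not part of the patch: createClusterDimsAttr above multiplies the three cluster extents into FlatDim and compares the product against a target-dependent cap, 8 thread blocks per cluster as the portable CUDA limit and 16 on AMDGPU. A minimal CUDA/HIP sketch of what that means at the source level, assuming the usual CUDA/HIP kernel declarations and the __cluster_dims__ macro this patch adds; the kernel names are made up:

// 2 * 2 * 2 = 8 thread blocks per cluster: within both the portable CUDA cap
// and the AMDGPU cap, so the attribute is accepted.
__global__ void __cluster_dims__(2, 2, 2) cluster_ok() {}

// 4 * 4 * 4 = 64 thread blocks per cluster exceeds both caps, so Sema rejects
// it, e.g. on NVPTX:
//   error: cluster does not support more than 8 thread blocks; 64 provided
//__global__ void __cluster_dims__(4, 4, 4) cluster_too_large() {}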
>From 644a4d4a994d19b54093fc95c43468386bf30932 Mon Sep 17 00:00:00 2001
From: Shilei Tian <i at tianshilei.me>
Date: Thu, 16 Oct 2025 14:47:01 -0400
Subject: [PATCH 12/14] remove unused code
---
clang/lib/CodeGen/Targets/AMDGPU.cpp | 3 ---
1 file changed, 3 deletions(-)
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index bb14d4602894b..16d5919d62cbb 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -342,9 +342,6 @@ static bool requiresAMDGPUProtectedVisibility(const Decl *D,
void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
const FunctionDecl *FD, llvm::Function *F, CodeGenModule &M) const {
- llvm::StringMap<bool> TargetFetureMap;
- M.getContext().getFunctionFeatureMap(TargetFetureMap, FD);
-
const auto *ReqdWGS =
M.getLangOpts().OpenCL ? FD->getAttr<ReqdWorkGroupSizeAttr>() : nullptr;
const bool IsOpenCLKernel =
>From 39cc69ac8b508399ed330ed8eea39c71e769743b Mon Sep 17 00:00:00 2001
From: Shilei Tian <i at tianshilei.me>
Date: Thu, 16 Oct 2025 16:48:09 -0400
Subject: [PATCH 13/14] use attribute for diag msg
---
clang/lib/Sema/SemaDeclAttr.cpp | 6 ++----
clang/test/SemaCUDA/cluster_dims.cu | 30 ++++++++++++++---------------
2 files changed, 17 insertions(+), 19 deletions(-)
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index d6f2c47676bc0..9475b8a684082 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -5758,8 +5758,7 @@ static void handleClusterDimsAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
if ((TTI.getTriple().isNVPTX() && Arch < clang::OffloadArch::SM_90) ||
(TTI.getTriple().isAMDGPU() &&
!TTI.hasFeatureEnabled(TTI.getTargetOpts().FeatureMap, "clusters"))) {
- S.Diag(AL.getLoc(), diag::err_cluster_attr_not_supported)
- << "__cluster_dims__";
+ S.Diag(AL.getLoc(), diag::err_cluster_attr_not_supported) << AL;
return;
}
@@ -5778,8 +5777,7 @@ static void handleNoClusterAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
if ((TTI.getTriple().isNVPTX() && Arch < clang::OffloadArch::SM_90) ||
(TTI.getTriple().isAMDGPU() &&
!TTI.hasFeatureEnabled(TTI.getTargetOpts().FeatureMap, "clusters"))) {
- S.Diag(AL.getLoc(), diag::err_cluster_attr_not_supported)
- << "__no_cluster__";
+ S.Diag(AL.getLoc(), diag::err_cluster_attr_not_supported) << AL;
return;
}
diff --git a/clang/test/SemaCUDA/cluster_dims.cu b/clang/test/SemaCUDA/cluster_dims.cu
index a4797ef007eaa..dcb8737a51006 100644
--- a/clang/test/SemaCUDA/cluster_dims.cu
+++ b/clang/test/SemaCUDA/cluster_dims.cu
@@ -9,52 +9,52 @@
const int constint = 4;
// CHECK: __attribute__((global)) __attribute__((cluster_dims(2, 2, 2))) void test_literal_3d()
-__global__ void __cluster_dims__(2, 2, 2) test_literal_3d() {} //NS-error {{__cluster_dims__ is not supported for this GPU architecture}}
+__global__ void __cluster_dims__(2, 2, 2) test_literal_3d() {} //NS-error {{'cluster_dims' is not supported for this GPU architecture}}
// CHECK: __attribute__((global)) __attribute__((cluster_dims(2, 2))) void test_literal_2d()
-__global__ void __cluster_dims__(2, 2) test_literal_2d() {} //NS-error {{__cluster_dims__ is not supported for this GPU architecture}}
+__global__ void __cluster_dims__(2, 2) test_literal_2d() {} //NS-error {{'cluster_dims' is not supported for this GPU architecture}}
// CHECK: __attribute__((global)) __attribute__((cluster_dims(4))) void test_literal_1d()
-__global__ void __cluster_dims__(4) test_literal_1d() {} //NS-error {{__cluster_dims__ is not supported for this GPU architecture}}
+__global__ void __cluster_dims__(4) test_literal_1d() {} //NS-error {{'cluster_dims' is not supported for this GPU architecture}}
// CHECK: __attribute__((global)) __attribute__((cluster_dims(constint, constint / 4, 1))) void test_constant()
-__global__ void __cluster_dims__(constint, constint / 4, 1) test_constant() {} //NS-error {{__cluster_dims__ is not supported for this GPU architecture}}
+__global__ void __cluster_dims__(constint, constint / 4, 1) test_constant() {} //NS-error {{'cluster_dims' is not supported for this GPU architecture}}
// CHECK: template <int x, int y, int z> void test_template() __attribute__((cluster_dims(x, y, z)))
-template <int x, int y, int z> void test_template(void) __cluster_dims__(x, y, z){} //NS-error {{__cluster_dims__ is not supported for this GPU architecture}}
+template <int x, int y, int z> void test_template(void) __cluster_dims__(x, y, z){} //NS-error {{'cluster_dims' is not supported for this GPU architecture}}
// CHECK: template <int x, int y, int z> void test_template_expr() __attribute__((cluster_dims(x + constint, y, z)))
-template <int x, int y, int z> void test_template_expr(void) __cluster_dims__(x + constint, y, z) {} //NS-error {{__cluster_dims__ is not supported for this GPU architecture}}
+template <int x, int y, int z> void test_template_expr(void) __cluster_dims__(x + constint, y, z) {} //NS-error {{'cluster_dims' is not supported for this GPU architecture}}
-//NS-error@+1 {{__cluster_dims__ is not supported for this GPU architecture}}
+//NS-error@+1 {{'cluster_dims' is not supported for this GPU architecture}}
__global__ void __cluster_dims__(32, 2, 4) test_too_large_dim_0() {} // common-error {{integer constant expression evaluates to value 32 that cannot be represented in a 4-bit unsigned integer type}}
// cuda-error@+2 {{cluster does not support more than 8 thread blocks; 64 provided}}
// amd-error@+1 {{cluster does not support more than 16 thread blocks; 64 provided}}
-__global__ void __cluster_dims__(4, 4, 4) test_too_large_dim_1() {} // NS-error {{__cluster_dims__ is not supported for this GPU architecture}}
+__global__ void __cluster_dims__(4, 4, 4) test_too_large_dim_1() {} // NS-error {{'cluster_dims' is not supported for this GPU architecture}}
// cuda-error@+3 {{cluster does not support more than 8 thread blocks; 64 provided}}
// amd-error@+2 {{cluster does not support more than 16 thread blocks; 64 provided}}
template<unsigned a, unsigned b, unsigned c>
-__global__ void __cluster_dims__(a, b, c) test_too_large_dim_template() {} // NS-error {{__cluster_dims__ is not supported for this GPU architecture}}
+__global__ void __cluster_dims__(a, b, c) test_too_large_dim_template() {} // NS-error {{'cluster_dims' is not supported for this GPU architecture}}
template __global__ void test_too_large_dim_template<4, 4, 4>(); // common-note {{in instantiation of function template specialization 'test_too_large_dim_template<4U, 4U, 4U>' requested here}}
int none_const_int = 4;
-//NS-error@+1 {{__cluster_dims__ is not supported for this GPU architecture}}
+//NS-error@+1 {{'cluster_dims' is not supported for this GPU architecture}}
__global__ void __cluster_dims__(none_const_int, 2, 4) test_non_constant_0() {} // common-error {{'cluster_dims' attribute requires parameter 0 to be an integer constant}}
-//NS-error@+1 {{__cluster_dims__ is not supported for this GPU architecture}}
+//NS-error@+1 {{'cluster_dims' is not supported for this GPU architecture}}
__global__ void __cluster_dims__(8, none_const_int / 2, 4) test_non_constant_1() {} // common-error {{'cluster_dims' attribute requires parameter 1 to be an integer constant}}
-//NS-error@+1 {{__cluster_dims__ is not supported for this GPU architecture}}
+//NS-error@+1 {{'cluster_dims' is not supported for this GPU architecture}}
__global__ void __cluster_dims__(8, 2, none_const_int / 4) test_non_constant_2() {} // common-error {{'cluster_dims' attribute requires parameter 2 to be an integer constant}}
-//NS-error@+1 {{__no_cluster__ is not supported for this GPU architecture}}
+//NS-error@+1 {{'no_cluster' is not supported for this GPU architecture}}
__global__ void __no_cluster__ test_no_cluster() {}
-//NS-error@+2 {{__no_cluster__ is not supported for this GPU architecture}}
-//NS-error@+1 {{__cluster_dims__ is not supported for this GPU architecture}}
+//NS-error@+2 {{'no_cluster' is not supported for this GPU architecture}}
+//NS-error@+1 {{'cluster_dims' is not supported for this GPU architecture}}
__global__ void __no_cluster__ __cluster_dims__(2,2,2) test_have_both() {} // common-error {{'cluster_dims' and 'no_cluster' attributes are not compatible}} common-note {{conflicting attribute is here}}
template <int... args>
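An aside, not part of the patch: streaming the parsed attribute into the diagnostic with << AL prints the attribute's canonical spelling in quotes, which is why the expected messages above change from the macro spellings to 'cluster_dims' and 'no_cluster'. A small sketch of what a user now sees on an unsupported target (pre-SM_90 NVPTX, or AMDGPU without the "clusters" feature); the kernel names are hypothetical:

// error: 'cluster_dims' is not supported for this GPU architecture
__global__ void __cluster_dims__(2, 2) sketch_kernel_a() {}

// error: 'no_cluster' is not supported for this GPU architecture
__global__ void __no_cluster__ sketch_kernel_b() {}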
>From fb807b7bd55c959979209b0f1a699ddec415f495 Mon Sep 17 00:00:00 2001
From: Shilei Tian <i at tianshilei.me>
Date: Thu, 16 Oct 2025 18:22:59 -0400
Subject: [PATCH 14/14] refine docs
---
clang/include/clang/Basic/AttrDocs.td | 23 ++++++++++++-----------
1 file changed, 12 insertions(+), 11 deletions(-)
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index 2ab53208f4642..dbdee88b777b0 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -7548,11 +7548,12 @@ A managed variable can be accessed in both device and host code.
def CUDAClusterDimsAttrDoc : Documentation {
let Category = DocCatDecl;
let Content = [{
-In CUDA/HIP programming, the ``__cluster_dims__`` attribute can be applied to a kernel function
-to set the dimensions of a thread block cluster, which is an optional level of hierarchy and made
-up of thread blocks. ``__cluster_dims__`` defines the cluster size as ``(X, Y, Z)``, where each value
-is the number of thread blocks in that dimension. The ``__cluster_dims__`` and `__no_cluster__``
-attributes are mutually exclusive.
+In CUDA/HIP programming, the ``cluster_dims`` attribute, conventionally exposed as the
+``__cluster_dims__`` macro, can be applied to a kernel function to set the dimensions of a
+thread block cluster, an optional level of the hierarchy made up of thread blocks.
+``__cluster_dims__`` defines the cluster size as ``(X, Y, Z)``, where each value is the number
+of thread blocks in that dimension. The ``__cluster_dims__`` and ``__no_cluster__`` attributes
+are mutually exclusive.
.. code::
@@ -7568,12 +7569,12 @@ def CUDANoClusterAttrDoc : Documentation {
let Content = [{
In CUDA/HIP programming, a kernel function can still be launched with the cluster feature
enabled at runtime, even without the ``__cluster_dims__`` attribute. The LLVM/Clang-exclusive
-``__no_cluster__`` attribute can be applied to a kernel function to explicitly indicate that
-the cluster feature will not be enabled either at compile time or at kernel launch time. This
-allows the compiler to apply certain optimizations without assuming that clustering could be
-enabled at runtime. It is undefined behavior to launch a kernel with the ``__no_cluster__``
-attribute if the cluster feature is enabled at runtime. The ``__cluster_dims__`` and
-``__no_cluster__`` attributes are mutually exclusive.
+``no_cluster`` attribute, conventionally exposed as the ``__no_cluster__`` macro, can be applied
+to a kernel function to explicitly indicate that the cluster feature will not be enabled either
+at compile time or at kernel launch time. This allows the compiler to apply certain optimizations
+without assuming that clustering could be enabled at runtime. It is undefined behavior to launch
+a kernel with the ``__no_cluster__`` attribute if the cluster feature is enabled at runtime.
+The ``__cluster_dims__`` and ``__no_cluster__`` attributes are mutually exclusive.
.. code::
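For readers of the docs, a minimal usage sketch of the two attributes as described above; this is not taken from the patch and the kernel names are hypothetical:

// Request a 2 x 2 x 1 thread block cluster: each argument is the number of
// thread blocks in that dimension, so four blocks form one cluster.
__global__ void __cluster_dims__(2, 2, 1) kernel_with_clusters() {}

// Promise that this kernel is never launched with the cluster feature
// enabled, which lets the compiler optimize under that assumption; combining
// it with __cluster_dims__ on the same kernel is rejected because the two
// attributes are mutually exclusive.
__global__ void __no_cluster__ kernel_without_clusters() {}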