[llvm] [NVPTX] Add support for "blocksareclusters" kernel attr (PR #152265)

Tue Aug 19 21:52:42 PDT 2025

https://github.com/rajatbajpai updated https://github.com/llvm/llvm-project/pull/152265

>From abfa8cf48516c99f00d51ba57588a6e740fb49b7 Mon Sep 17 00:00:00 2001
From: rbajpai <rbajpai at nvidia.com>
Date: Fri, 1 Aug 2025 18:17:03 +0530
Subject: [PATCH] [NVPTX] Add support for "blocksareclusters" kernel attr

This change introduces a new kernel attribute that allows
thread blocks to be mapped to clusters.

In addition to the "blocksareclusters" kernel attribute, this change also adds
"ptx90" support to the NVPTX backend.
---
 llvm/docs/NVPTXUsage.rst                      |  6 ++
 llvm/lib/Target/NVPTX/NVPTX.td                |  8 +--
 llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp     | 21 ++++++-
 llvm/lib/Target/NVPTX/NVPTXUtilities.cpp      |  4 ++
 llvm/lib/Target/NVPTX/NVPTXUtilities.h        |  2 +
 .../NVPTX/blocksareclusters-kernel-attr.ll    | 60 +++++++++++++++++++
 6 files changed, 96 insertions(+), 5 deletions(-)
 create mode 100644 llvm/test/CodeGen/NVPTX/blocksareclusters-kernel-attr.ll

diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index 2dc8f9ff6a57f..629bf2ea5afb4 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -92,6 +92,12 @@ Function Attributes
     dimension. Specifying a different cluster dimension at launch will result in
     a runtime error or kernel launch failure. Only supported for Hopper+.
 
+``"nvvm.blocksareclusters"``
+    This attribute indicates that the grid launch configuration for the kernel
+    specifies the number of clusters rather than the number of thread blocks.
+    It is only allowed on kernel functions, requires the ``nvvm.reqntid`` and
+    ``nvvm.cluster_dim`` attributes, and needs PTX version 9.0 or later.
+
 .. _address_spaces:
 
 Address Spaces
diff --git a/llvm/lib/Target/NVPTX/NVPTX.td b/llvm/lib/Target/NVPTX/NVPTX.td
index 83992606bc419..8a445f82e7001 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.td
+++ b/llvm/lib/Target/NVPTX/NVPTX.td
@@ -97,10 +97,10 @@ foreach sm = [20, 21, 30, 32, 35, 37, 50, 52, 53,
     def SM#sm#a : FeatureSM<""#sm#"a", !add(!mul(sm, 10), 3)>;
 }
 
-foreach version = [32, 40, 41, 42, 43, 50, 60, 61, 62, 63, 64, 65,
-                   70, 71, 72, 73, 74, 75, 76, 77, 78,
-                   80, 81, 82, 83, 84, 85, 86, 87, 88] in
-  def PTX#version: FeaturePTX<version>;
+foreach version = [32, 40, 41, 42, 43, 50, 60, 61, 62, 63, 64, 65, 70, 71, 72,
+                   73, 74, 75, 76, 77, 78, 80, 81, 82, 83, 84, 85, 86, 87, 88,
+                   90] in
+  def PTX#version : FeaturePTX<version>;
 
 //===----------------------------------------------------------------------===//
 // NVPTX supported processors.
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 0c581dccbbd75..7391c2d488b57 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -436,9 +436,13 @@ void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F,
 
   if (STI->getSmVersion() >= 90) {
     const auto ClusterDim = getClusterDim(F);
+    const bool BlocksAreClusters = hasBlocksAreClusters(F);
 
     if (!ClusterDim.empty()) {
-      O << ".explicitcluster\n";
+
+      if (!BlocksAreClusters)
+        O << ".explicitcluster\n";
+
       if (ClusterDim[0] != 0) {
         assert(llvm::all_of(ClusterDim, [](unsigned D) { return D != 0; }) &&
                "cluster_dim_x != 0 implies cluster_dim_y and cluster_dim_z "
@@ -452,6 +456,21 @@ void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F,
                "should be 0 as well");
       }
     }
+
+    if (BlocksAreClusters) {
+      LLVMContext &Ctx = F.getContext();
+      if (ReqNTID.empty() || ClusterDim.empty())
+        Ctx.diagnose(DiagnosticInfoUnsupported(
+            F, "blocksareclusters requires reqntid and cluster_dim attributes",
+            F.getSubprogram()));
+      else if (STI->getPTXVersion() < 90)
+        Ctx.diagnose(DiagnosticInfoUnsupported(
+            F, "blocksareclusters requires PTX version >= 9.0",
+            F.getSubprogram()));
+      else
+        O << ".blocksareclusters\n";
+    }
+
     if (const auto Maxclusterrank = getMaxClusterRank(F))
       O << ".maxclusterrank " << *Maxclusterrank << "\n";
   }
diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
index 6586f925504f1..274b04fdd30b5 100644
--- a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
@@ -352,6 +352,10 @@ std::optional<unsigned> getMaxNReg(const Function &F) {
   return getFnAttrParsedInt(F, "nvvm.maxnreg");
 }
 
+bool hasBlocksAreClusters(const Function &F) {
+  return F.hasFnAttribute("nvvm.blocksareclusters");
+}
+
 MaybeAlign getAlign(const CallInst &I, unsigned Index) {
   // First check the alignstack metadata
   if (MaybeAlign StackAlign =
diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.h b/llvm/lib/Target/NVPTX/NVPTXUtilities.h
index 4eb452f398220..9421f9f54d0a6 100644
--- a/llvm/lib/Target/NVPTX/NVPTXUtilities.h
+++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.h
@@ -60,6 +60,8 @@ std::optional<unsigned> getMaxClusterRank(const Function &);
 std::optional<unsigned> getMinCTASm(const Function &);
 std::optional<unsigned> getMaxNReg(const Function &);
 
+bool hasBlocksAreClusters(const Function &);
+
 inline bool isKernelFunction(const Function &F) {
   return F.getCallingConv() == CallingConv::PTX_Kernel;
 }
diff --git a/llvm/test/CodeGen/NVPTX/blocksareclusters-kernel-attr.ll b/llvm/test/CodeGen/NVPTX/blocksareclusters-kernel-attr.ll
new file mode 100644
index 0000000000000..a0a99fe55654f
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/blocksareclusters-kernel-attr.ll
@@ -0,0 +1,60 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mcpu=sm_90 -mattr=+ptx90 | FileCheck %s
+
+target triple = "nvptx64-nvidia-cuda"
+
+; Test "blocksareclusters" attribute with full "reqntid" and "cluster_dim"
+; attributes.
+define ptx_kernel void @kernel1(ptr %input, ptr %output) #0 #1 #2 {
+; CHECK-LABEL: kernel1(
+; CHECK:       .reqntid 1024, 1, 1
+; CHECK-NEXT:  .reqnctapercluster 2, 2, 2
+; CHECK-NEXT:  .blocksareclusters
+; CHECK-NEXT:  {
+; CHECK-EMPTY:
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ret;
+  ret void
+}
+
+; Test "blocksareclusters" attribute with single dimension "reqntid" and
+; "cluster_dim" attributes.
+define ptx_kernel void @kernel2(ptr %input, ptr %output) #0 #3 #4 {
+; CHECK-LABEL: kernel2(
+; CHECK:       .reqntid 1024
+; CHECK-NEXT:  .reqnctapercluster 2
+; CHECK-NEXT:  .blocksareclusters // @kernel2
+; CHECK-NEXT:  {
+; CHECK-EMPTY:
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ret;
+  ret void
+}
+
+; Test "blocksareclusters" attribute with two-dimensional (no z dimension)
+; "reqntid" and "cluster_dim" attributes.
+define ptx_kernel void @kernel3(ptr %input, ptr %output) #0 #5 #6 {
+; CHECK-LABEL: kernel3(
+; CHECK:       .reqntid 512, 2
+; CHECK-NEXT:  .reqnctapercluster 2, 2
+; CHECK-NEXT:  .blocksareclusters // @kernel3
+; CHECK-NEXT:  {
+; CHECK-EMPTY:
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ret;
+  ret void
+}
+
+attributes #0 = { "nvvm.blocksareclusters" }
+
+attributes #1 = { "nvvm.reqntid"="1024,1,1" }
+attributes #2 = { "nvvm.cluster_dim"="2,2,2" }
+
+attributes #3 = { "nvvm.reqntid"="1024" }
+attributes #4 = { "nvvm.cluster_dim"="2" }
+
+attributes #5 = { "nvvm.reqntid"="512,2" }
+attributes #6 = { "nvvm.cluster_dim"="2,2" }