[llvm] [NVPTX] Add support for "blocksareclusters" kernel attr (PR #152265)
Rajat Bajpai via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 14 02:32:42 PDT 2025
https://github.com/rajatbajpai updated https://github.com/llvm/llvm-project/pull/152265
>From a8b53a4f2a7df5c647f87ff0e2a04c7834836bb0 Mon Sep 17 00:00:00 2001
From: rbajpai <rbajpai at nvidia.com>
Date: Fri, 1 Aug 2025 18:17:03 +0530
Subject: [PATCH 1/2] [NVPTX] Add support for "blocksareclusters" kernel attr
This change introduces a new kernel attribute that allows
thread blocks to be mapped to clusters.
---
llvm/docs/NVPTXUsage.rst | 6 ++
llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 19 ++++-
.../NVPTX/blocksareclusters-kernel-attr.ll | 78 +++++++++++++++++++
3 files changed, 99 insertions(+), 4 deletions(-)
create mode 100644 llvm/test/CodeGen/NVPTX/blocksareclusters-kernel-attr.ll
diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index 2dc8f9ff6a57f..629bf2ea5afb4 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -92,6 +92,12 @@ Function Attributes
dimension. Specifying a different cluster dimension at launch will result in
a runtime error or kernel launch failure. Only supported for Hopper+.
+``"nvvm.blocksareclusters"``
+ This attribute implies that the grid launch configuration for the corresponding
+ kernel function is specifying the number of clusters instead of the number of thread
+ blocks. This attribute is only allowed for kernel functions and requires
+ ``nvvm.reqntid`` and ``nvvm.cluster_dim`` attributes.
+
.. _address_spaces:
Address Spaces
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 38912a7f09e30..385bf334ba338 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -414,6 +414,17 @@ void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F,
// the reqntid directive, and set the unspecified ones to 1.
// If none of Reqntid* is specified, don't output reqntid directive.
const auto ReqNTID = getReqNTID(F);
+
+ const NVPTXTargetMachine &NTM = static_cast<const NVPTXTargetMachine &>(TM);
+ const auto *STI = static_cast<const NVPTXSubtarget *>(NTM.getSubtargetImpl());
+
+ const bool BlocksAreClusters = F.hasFnAttribute("nvvm.blocksareclusters");
+ if (BlocksAreClusters && STI->getSmVersion() >= 90) {
+ if (ReqNTID.empty() || getClusterDim(F).empty())
+ report_fatal_error("blocksareclusters requires reqntid and cluster_dim");
+ O << ".blocksareclusters\n";
+ }
+
if (!ReqNTID.empty())
O << formatv(".reqntid {0:$[, ]}\n",
make_range(ReqNTID.begin(), ReqNTID.end()));
@@ -431,14 +442,14 @@ void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F,
// .maxclusterrank directive requires SM_90 or higher, make sure that we
// filter it out for lower SM versions, as it causes a hard ptxas crash.
- const NVPTXTargetMachine &NTM = static_cast<const NVPTXTargetMachine &>(TM);
- const auto *STI = static_cast<const NVPTXSubtarget *>(NTM.getSubtargetImpl());
-
if (STI->getSmVersion() >= 90) {
const auto ClusterDim = getClusterDim(F);
if (!ClusterDim.empty()) {
- O << ".explicitcluster\n";
+
+ if (!BlocksAreClusters)
+ O << ".explicitcluster\n";
+
if (ClusterDim[0] != 0) {
assert(llvm::all_of(ClusterDim, [](unsigned D) { return D != 0; }) &&
"cluster_dim_x != 0 implies cluster_dim_y and cluster_dim_z "
diff --git a/llvm/test/CodeGen/NVPTX/blocksareclusters-kernel-attr.ll b/llvm/test/CodeGen/NVPTX/blocksareclusters-kernel-attr.ll
new file mode 100644
index 0000000000000..13357f015a176
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/blocksareclusters-kernel-attr.ll
@@ -0,0 +1,78 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 | FileCheck %s
+
+target triple = "nvptx64-nvidia-cuda"
+
+; Test "blocksareclusters" attribute with full "reqntid" and "cluster_dim"
+; attributes.
+define ptx_kernel void @kernel1(i32* %input, i32* %output) #0 #1 #2 {
+; CHECK-LABEL: kernel1(
+; CHECK: .blocksareclusters
+; CHECK-NEXT: .reqntid 1024, 1, 1
+; CHECK-NEXT: .reqnctapercluster 2, 2, 2
+; CHECK-NEXT: {
+; CHECK-EMPTY:
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ret;
+ ret void
+}
+
+; Test "blocksareclusters" attribute with single dimension "reqntid" and
+; "cluster_dim" attributes.
+define ptx_kernel void @kernel2(i32* %input, i32* %output) #0 #3 #4 {
+; CHECK-LABEL: kernel2(
+; CHECK: .blocksareclusters
+; CHECK-NEXT: .reqntid 1024
+; CHECK-NEXT: .reqnctapercluster 2 // @kernel2
+; CHECK-NEXT: {
+; CHECK-EMPTY:
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ret;
+ ret void
+}
+
+; Test "blocksareclusters" attribute with two dimensions(not z dimension)
+; "reqntid" and "cluster_dim" attributes.
+define ptx_kernel void @kernel3(i32* %input, i32* %output) #0 #5 #6 {
+; CHECK-LABEL: kernel3(
+; CHECK: .blocksareclusters
+; CHECK-NEXT: .reqntid 512, 2
+; CHECK-NEXT: .reqnctapercluster 2, 2 // @kernel3
+; CHECK-NEXT: {
+; CHECK-EMPTY:
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ret;
+ ret void
+}
+
+; Test "blocksareclusters" attribute with full "reqntid" and "cluster_dim"
+; attributes where kernel attribute is provided through metadata.
+define void @kernel4(i32* %input, i32* %output) #0 #1 #2 {
+; CHECK-LABEL: kernel4(
+; CHECK: .blocksareclusters
+; CHECK-NEXT: .reqntid 1024, 1, 1
+; CHECK-NEXT: .reqnctapercluster 2, 2, 2 // @kernel4
+; CHECK-NEXT: {
+; CHECK-EMPTY:
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ret;
+ ret void
+}
+
+attributes #0 = { "nvvm.blocksareclusters" }
+
+attributes #1 = { "nvvm.reqntid"="1024,1,1" }
+attributes #2 = { "nvvm.cluster_dim"="2,2,2" }
+
+attributes #3 = { "nvvm.reqntid"="1024" }
+attributes #4 = { "nvvm.cluster_dim"="2" }
+
+attributes #5 = { "nvvm.reqntid"="512,2" }
+attributes #6 = { "nvvm.cluster_dim"="2,2" }
+
+!0 = !{void (i32*, i32*)* @kernel4, !"kernel", i32 1 }
+!nvvm.annotations = !{!0}
>From 85cd17f364ddd356251131c58e6c0ee690667ffd Mon Sep 17 00:00:00 2001
From: rbajpai <rbajpai at nvidia.com>
Date: Thu, 14 Aug 2025 14:50:54 +0530
Subject: [PATCH 2/2] Addressed review comments
In addition to the "blocksareclusters" kernel attr, this change also adds
"ptx90" support to the NVPTX backend.
---
llvm/lib/Target/NVPTX/NVPTX.td | 8 ++--
llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 22 +++++-----
llvm/lib/Target/NVPTX/NVPTXUtilities.cpp | 4 ++
llvm/lib/Target/NVPTX/NVPTXUtilities.h | 2 +
.../NVPTX/blocksareclusters-kernel-attr.ll | 42 ++++++-------------
5 files changed, 33 insertions(+), 45 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTX.td b/llvm/lib/Target/NVPTX/NVPTX.td
index 83992606bc419..8a445f82e7001 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.td
+++ b/llvm/lib/Target/NVPTX/NVPTX.td
@@ -97,10 +97,10 @@ foreach sm = [20, 21, 30, 32, 35, 37, 50, 52, 53,
def SM#sm#a : FeatureSM<""#sm#"a", !add(!mul(sm, 10), 3)>;
}
-foreach version = [32, 40, 41, 42, 43, 50, 60, 61, 62, 63, 64, 65,
- 70, 71, 72, 73, 74, 75, 76, 77, 78,
- 80, 81, 82, 83, 84, 85, 86, 87, 88] in
- def PTX#version: FeaturePTX<version>;
+foreach version = [32, 40, 41, 42, 43, 50, 60, 61, 62, 63, 64, 65, 70, 71, 72,
+ 73, 74, 75, 76, 77, 78, 80, 81, 82, 83, 84, 85, 86, 87, 88,
+ 90] in
+ def PTX#version : FeaturePTX<version>;
//===----------------------------------------------------------------------===//
// NVPTX supported processors.
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 385bf334ba338..cb8882b3a9d83 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -414,17 +414,6 @@ void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F,
// the reqntid directive, and set the unspecified ones to 1.
// If none of Reqntid* is specified, don't output reqntid directive.
const auto ReqNTID = getReqNTID(F);
-
- const NVPTXTargetMachine &NTM = static_cast<const NVPTXTargetMachine &>(TM);
- const auto *STI = static_cast<const NVPTXSubtarget *>(NTM.getSubtargetImpl());
-
- const bool BlocksAreClusters = F.hasFnAttribute("nvvm.blocksareclusters");
- if (BlocksAreClusters && STI->getSmVersion() >= 90) {
- if (ReqNTID.empty() || getClusterDim(F).empty())
- report_fatal_error("blocksareclusters requires reqntid and cluster_dim");
- O << ".blocksareclusters\n";
- }
-
if (!ReqNTID.empty())
O << formatv(".reqntid {0:$[, ]}\n",
make_range(ReqNTID.begin(), ReqNTID.end()));
@@ -442,8 +431,12 @@ void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F,
// .maxclusterrank directive requires SM_90 or higher, make sure that we
// filter it out for lower SM versions, as it causes a hard ptxas crash.
+ const NVPTXTargetMachine &NTM = static_cast<const NVPTXTargetMachine &>(TM);
+ const auto *STI = static_cast<const NVPTXSubtarget *>(NTM.getSubtargetImpl());
+
if (STI->getSmVersion() >= 90) {
const auto ClusterDim = getClusterDim(F);
+ const bool BlocksAreClusters = hasBlocksAreClusters(F);
if (!ClusterDim.empty()) {
@@ -463,6 +456,13 @@ void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F,
"should be 0 as well");
}
}
+
+ if (BlocksAreClusters && STI->getPTXVersion() >= 90) {
+ assert(!(ReqNTID.empty() || getClusterDim(F).empty()) &&
+ "blocksareclusters requires reqntid and cluster_dim");
+ O << ".blocksareclusters\n";
+ }
+
if (const auto Maxclusterrank = getMaxClusterRank(F))
O << ".maxclusterrank " << *Maxclusterrank << "\n";
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
index 6586f925504f1..274b04fdd30b5 100644
--- a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
@@ -352,6 +352,10 @@ std::optional<unsigned> getMaxNReg(const Function &F) {
return getFnAttrParsedInt(F, "nvvm.maxnreg");
}
+bool hasBlocksAreClusters(const Function &F) {
+ return F.hasFnAttribute("nvvm.blocksareclusters");
+}
+
MaybeAlign getAlign(const CallInst &I, unsigned Index) {
// First check the alignstack metadata
if (MaybeAlign StackAlign =
diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.h b/llvm/lib/Target/NVPTX/NVPTXUtilities.h
index 4eb452f398220..9421f9f54d0a6 100644
--- a/llvm/lib/Target/NVPTX/NVPTXUtilities.h
+++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.h
@@ -60,6 +60,8 @@ std::optional<unsigned> getMaxClusterRank(const Function &);
std::optional<unsigned> getMinCTASm(const Function &);
std::optional<unsigned> getMaxNReg(const Function &);
+bool hasBlocksAreClusters(const Function &);
+
inline bool isKernelFunction(const Function &F) {
return F.getCallingConv() == CallingConv::PTX_Kernel;
}
diff --git a/llvm/test/CodeGen/NVPTX/blocksareclusters-kernel-attr.ll b/llvm/test/CodeGen/NVPTX/blocksareclusters-kernel-attr.ll
index 13357f015a176..a0a99fe55654f 100644
--- a/llvm/test/CodeGen/NVPTX/blocksareclusters-kernel-attr.ll
+++ b/llvm/test/CodeGen/NVPTX/blocksareclusters-kernel-attr.ll
@@ -1,15 +1,15 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 | FileCheck %s
+; RUN: llc < %s -mcpu=sm_90 -mattr=+ptx90 | FileCheck %s
target triple = "nvptx64-nvidia-cuda"
; Test "blocksareclusters" attribute with full "reqntid" and "cluster_dim"
; attributes.
-define ptx_kernel void @kernel1(i32* %input, i32* %output) #0 #1 #2 {
+define ptx_kernel void @kernel1(ptr %input, ptr %output) #0 #1 #2 {
; CHECK-LABEL: kernel1(
-; CHECK: .blocksareclusters
-; CHECK-NEXT: .reqntid 1024, 1, 1
+; CHECK: .reqntid 1024, 1, 1
; CHECK-NEXT: .reqnctapercluster 2, 2, 2
+; CHECK-NEXT: .blocksareclusters
; CHECK-NEXT: {
; CHECK-EMPTY:
; CHECK-EMPTY:
@@ -20,11 +20,11 @@ define ptx_kernel void @kernel1(i32* %input, i32* %output) #0 #1 #2 {
; Test "blocksareclusters" attribute with single dimension "reqntid" and
; "cluster_dim" attributes.
-define ptx_kernel void @kernel2(i32* %input, i32* %output) #0 #3 #4 {
+define ptx_kernel void @kernel2(ptr %input, ptr %output) #0 #3 #4 {
; CHECK-LABEL: kernel2(
-; CHECK: .blocksareclusters
-; CHECK-NEXT: .reqntid 1024
-; CHECK-NEXT: .reqnctapercluster 2 // @kernel2
+; CHECK: .reqntid 1024
+; CHECK-NEXT: .reqnctapercluster 2
+; CHECK-NEXT: .blocksareclusters // @kernel2
; CHECK-NEXT: {
; CHECK-EMPTY:
; CHECK-EMPTY:
@@ -35,26 +35,11 @@ define ptx_kernel void @kernel2(i32* %input, i32* %output) #0 #3 #4 {
; Test "blocksareclusters" attribute with two dimensions(not z dimension)
; "reqntid" and "cluster_dim" attributes.
-define ptx_kernel void @kernel3(i32* %input, i32* %output) #0 #5 #6 {
+define ptx_kernel void @kernel3(ptr %input, ptr %output) #0 #5 #6 {
; CHECK-LABEL: kernel3(
-; CHECK: .blocksareclusters
-; CHECK-NEXT: .reqntid 512, 2
-; CHECK-NEXT: .reqnctapercluster 2, 2 // @kernel3
-; CHECK-NEXT: {
-; CHECK-EMPTY:
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ret;
- ret void
-}
-
-; Test "blocksareclusters" attribute with full "reqntid" and "cluster_dim"
-; attributes where kernel attribute is provided through metadata.
-define void @kernel4(i32* %input, i32* %output) #0 #1 #2 {
-; CHECK-LABEL: kernel4(
-; CHECK: .blocksareclusters
-; CHECK-NEXT: .reqntid 1024, 1, 1
-; CHECK-NEXT: .reqnctapercluster 2, 2, 2 // @kernel4
+; CHECK: .reqntid 512, 2
+; CHECK-NEXT: .reqnctapercluster 2, 2
+; CHECK-NEXT: .blocksareclusters // @kernel3
; CHECK-NEXT: {
; CHECK-EMPTY:
; CHECK-EMPTY:
@@ -73,6 +58,3 @@ attributes #4 = { "nvvm.cluster_dim"="2" }
attributes #5 = { "nvvm.reqntid"="512,2" }
attributes #6 = { "nvvm.cluster_dim"="2,2" }
-
-!0 = !{void (i32*, i32*)* @kernel4, !"kernel", i32 1 }
-!nvvm.annotations = !{!0}
More information about the llvm-commits
mailing list