[Mlir-commits] [mlir] [MLIR] Propagate known cluster sizes from gpu.launch to gpu.func (PR #174404)
Adam Paszke
llvmlistbot at llvm.org
Mon Jan 5 05:47:10 PST 2026
https://github.com/apaszke updated https://github.com/llvm/llvm-project/pull/174404
>From 4a77a2ad0c78b8f034f39c9e63c8d0c3d6003684 Mon Sep 17 00:00:00 2001
From: Adam Paszke <adam.paszke at gmail.com>
Date: Mon, 5 Jan 2026 13:31:42 +0000
Subject: [PATCH] Propagate known cluster sizes from gpu.launch to gpu.func
This lets us properly annotate ranges for gpu.cluster_block_id and gpu.cluster_dim_blocks.
It also allows us to fill in the nvvm.cluster_dim attribute for use in the NVVM backend.
---
mlir/include/mlir/Dialect/GPU/IR/GPUBase.td | 4 +-
mlir/include/mlir/Dialect/GPU/IR/GPUOps.td | 3 +-
.../Conversion/GPUCommon/GPUOpsLowering.cpp | 12 +++++-
.../lib/Conversion/GPUCommon/GPUOpsLowering.h | 7 ++++
.../GPUCommon/IndexIntrinsicsOpLowering.h | 12 +++++-
.../Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp | 3 +-
.../GPUToNVVM/LowerGpuOpsToNVVMOps.cpp | 8 ++--
.../GPUToROCDL/LowerGpuOpsToROCDLOps.cpp | 3 +-
mlir/lib/Dialect/GPU/IR/GPUDialect.cpp | 2 +
.../GPU/IR/InferIntRangeInterfaceImpls.cpp | 20 ++++++++-
.../Conversion/GPUToNVVM/gpu-to-nvvm.mlir | 42 +++++++++++++++++++
.../GPU/int-range-interface-cluster.mlir | 27 ++++++++++++
12 files changed, 132 insertions(+), 11 deletions(-)
create mode 100644 mlir/test/Dialect/GPU/int-range-interface-cluster.mlir
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td b/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td
index 2c29bb8a01a41..f0086158fb9b6 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td
@@ -61,10 +61,10 @@ def GPU_Dialect : Dialect {
/// attribute with value 'workgroup`.
static bool isWorkgroupMemoryAddressSpace(Attribute memorySpace);
}];
-
let discardableAttrs = (ins
"::mlir::DenseI32ArrayAttr":$known_block_size,
- "::mlir::DenseI32ArrayAttr":$known_grid_size
+ "::mlir::DenseI32ArrayAttr":$known_grid_size,
+ "::mlir::DenseI32ArrayAttr":$known_cluster_size
);
let dependentDialects = ["arith::ArithDialect"];
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index 4884541a60535..e8c23200547d6 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -431,7 +431,8 @@ def GPU_GPUFuncOp : GPU_Op<"func", [
OptionalAttr<DictArrayAttr>:$workgroup_attrib_attrs,
OptionalAttr<DictArrayAttr>:$private_attrib_attrs,
GPU_OptionalDimSizeHintAttr:$known_block_size,
- GPU_OptionalDimSizeHintAttr:$known_grid_size);
+ GPU_OptionalDimSizeHintAttr:$known_grid_size,
+ GPU_OptionalDimSizeHintAttr:$known_cluster_size);
let regions = (region AnyRegion:$body);
let skipDefaultBuilders = 1;
diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
index eb662a1b056de..498bea0fd17b4 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
@@ -186,7 +186,8 @@ GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
attr.getName() == gpuFuncOp.getWorkgroupAttribAttrsAttrName() ||
attr.getName() == gpuFuncOp.getPrivateAttribAttrsAttrName() ||
attr.getName() == gpuFuncOp.getKnownBlockSizeAttrName() ||
- attr.getName() == gpuFuncOp.getKnownGridSizeAttrName())
+ attr.getName() == gpuFuncOp.getKnownGridSizeAttrName() ||
+ attr.getName() == gpuFuncOp.getKnownClusterSizeAttrName())
continue;
if (attr.getName() == gpuFuncOp.getArgAttrsAttrName()) {
argAttrs = gpuFuncOp.getArgAttrsAttr();
@@ -197,6 +198,7 @@ GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
DenseI32ArrayAttr knownBlockSize = gpuFuncOp.getKnownBlockSizeAttr();
DenseI32ArrayAttr knownGridSize = gpuFuncOp.getKnownGridSizeAttr();
+ DenseI32ArrayAttr knownClusterSize = gpuFuncOp.getKnownClusterSizeAttr();
// Ensure we don't lose information if the function is lowered before its
// surrounding context.
auto *gpuDialect = cast<gpu::GPUDialect>(gpuFuncOp->getDialect());
@@ -206,6 +208,10 @@ GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
if (knownGridSize)
attributes.emplace_back(gpuDialect->getKnownGridSizeAttrHelper().getName(),
knownGridSize);
+ if (knownClusterSize)
+ attributes.emplace_back(
+ gpuDialect->getKnownClusterSizeAttrHelper().getName(),
+ knownClusterSize);
// Add a dialect specific kernel attribute in addition to GPU kernel
// attribute. The former is necessary for further translation while the
@@ -217,6 +223,10 @@ GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
if (kernelBlockSizeAttributeName && knownBlockSize) {
attributes.emplace_back(kernelBlockSizeAttributeName, knownBlockSize);
}
+ // Set the dialect-specific cluster size attribute if there is one.
+ if (kernelClusterSizeAttributeName && knownClusterSize) {
+ attributes.emplace_back(kernelClusterSizeAttributeName, knownClusterSize);
+ }
}
LLVM::CConv callingConvention = gpuFuncOp.isKernel()
? kernelCallingConvention
diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
index ec74787b2a8ed..a3b2e04c35313 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
+++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
@@ -73,6 +73,9 @@ struct GPUFuncOpLoweringOptions {
/// The attribute name to to set block size. Null if no attribute should be
/// used.
StringAttr kernelBlockSizeAttributeName;
+ /// The attribute name to set cluster size. Null if no attribute should be
+ /// used.
+ StringAttr kernelClusterSizeAttributeName;
/// The calling convention to use for kernel functions.
LLVM::CConv kernelCallingConvention = LLVM::CConv::C;
@@ -93,6 +96,7 @@ struct GPUFuncOpLowering : ConvertOpToLLVMPattern<gpu::GPUFuncOp> {
workgroupAddrSpace(options.workgroupAddrSpace),
kernelAttributeName(options.kernelAttributeName),
kernelBlockSizeAttributeName(options.kernelBlockSizeAttributeName),
+ kernelClusterSizeAttributeName(options.kernelClusterSizeAttributeName),
kernelCallingConvention(options.kernelCallingConvention),
nonKernelCallingConvention(options.nonKernelCallingConvention),
encodeWorkgroupAttributionsAsArguments(
@@ -114,6 +118,9 @@ struct GPUFuncOpLowering : ConvertOpToLLVMPattern<gpu::GPUFuncOp> {
/// The attribute name to to set block size. Null if no attribute should be
/// used.
StringAttr kernelBlockSizeAttributeName;
+ /// The attribute name to set cluster size. Null if no attribute should be
+ /// used.
+ StringAttr kernelClusterSizeAttributeName;
/// The calling convention to use for kernel functions
LLVM::CConv kernelCallingConvention;
diff --git a/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h b/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h
index 91c43e8bd1117..ae0239132e7d0 100644
--- a/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h
+++ b/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h
@@ -17,7 +17,7 @@
namespace mlir {
namespace gpu {
namespace index_lowering {
-enum class IndexKind : uint32_t { Other = 0, Block = 1, Grid = 2 };
+enum class IndexKind : uint32_t { Other = 0, Block = 1, Grid = 2, Cluster = 3 };
enum class IntrType : uint32_t {
None = 0,
Id = 1,
@@ -92,6 +92,13 @@ struct OpLowering : public ConvertOpToLLVMPattern<Op> {
funcBounds = gridHelper.getAttr(funcOp);
break;
}
+ case IndexKind::Cluster: {
+ auto clusterHelper =
+ gpu::GPUDialect::KnownClusterSizeAttrHelper(op.getContext());
+ if (clusterHelper.isAttrPresent(funcOp))
+ funcBounds = clusterHelper.getAttr(funcOp);
+ break;
+ }
case IndexKind::Other:
break;
}
@@ -104,6 +111,9 @@ struct OpLowering : public ConvertOpToLLVMPattern<Op> {
case IndexKind::Grid:
funcBounds = gpuFunc.getKnownGridSizeAttr();
break;
+ case IndexKind::Cluster:
+ funcBounds = gpuFunc.getKnownClusterSizeAttr();
+ break;
case IndexKind::Other:
break;
}
diff --git a/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp b/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp
index c0480a1dfb512..01b34337647ec 100644
--- a/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp
+++ b/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp
@@ -533,7 +533,8 @@ void populateGpuToLLVMSPVConversionPatterns(
GPUFuncOpLoweringOptions{
privateAddressSpace, localAddressSpace,
/*kernelAttributeName=*/{}, kernelBlockSizeAttributeName,
- LLVM::CConv::SPIR_KERNEL, LLVM::CConv::SPIR_FUNC,
+ /*kernelClusterSizeAttributeName=*/{}, LLVM::CConv::SPIR_KERNEL,
+ LLVM::CConv::SPIR_FUNC,
/*encodeWorkgroupAttributionsAsArguments=*/true});
}
diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
index 2561ca00d4b4f..6394296e99b9e 100644
--- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
+++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
@@ -708,11 +708,11 @@ void mlir::populateGpuToNVVMConversionPatterns(
patterns.add<gpu::index_lowering::OpLowering<
gpu::ClusterBlockIdOp, NVVM::BlockInClusterIdXOp,
NVVM::BlockInClusterIdYOp, NVVM::BlockInClusterIdZOp>>(
- converter, IndexKind::Other, IntrType::Id, benefit);
+ converter, IndexKind::Cluster, IntrType::Id, benefit);
patterns.add<gpu::index_lowering::OpLowering<
gpu::ClusterDimBlocksOp, NVVM::ClusterDimBlocksXOp,
NVVM::ClusterDimBlocksYOp, NVVM::ClusterDimBlocksZOp>>(
- converter, IndexKind::Other, IntrType::Dim, benefit);
+ converter, IndexKind::Cluster, IntrType::Dim, benefit);
patterns.add<gpu::index_lowering::OpLowering<
gpu::BlockIdOp, NVVM::BlockIdXOp, NVVM::BlockIdYOp, NVVM::BlockIdZOp>>(
converter, IndexKind::Grid, IntrType::Id, benefit);
@@ -737,7 +737,9 @@ void mlir::populateGpuToNVVMConversionPatterns(
StringAttr::get(&converter.getContext(),
NVVM::NVVMDialect::getKernelFuncAttrName()),
StringAttr::get(&converter.getContext(),
- NVVM::NVVMDialect::getMaxntidAttrName())},
+ NVVM::NVVMDialect::getMaxntidAttrName()),
+ StringAttr::get(&converter.getContext(),
+ NVVM::NVVMDialect::getClusterDimAttrName())},
benefit);
populateLibDeviceConversionPatterns(converter, patterns, benefit);
diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index 51741414d2060..b8eb6d7facc6d 100644
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -458,7 +458,8 @@ void mlir::populateGpuToROCDLConversionPatterns(
/*allocaAddrSpace=*/ROCDL::ROCDLDialect::kPrivateMemoryAddressSpace,
/*workgroupAddrSpace=*/ROCDL::ROCDLDialect::kSharedMemoryAddressSpace,
rocdlDialect->getKernelAttrHelper().getName(),
- rocdlDialect->getReqdWorkGroupSizeAttrHelper().getName()});
+ rocdlDialect->getReqdWorkGroupSizeAttrHelper().getName(),
+ /*kernelClusterSizeAttributeName=*/{}});
if (Runtime::HIP == runtime) {
patterns.add<GPUPrintfOpToHIPLowering>(converter);
} else if (Runtime::OpenCL == runtime) {
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index 21c0d369b8d1c..36db6e82baaea 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -396,6 +396,8 @@ LogicalResult GPUDialect::verifyOperationAttribute(Operation *op,
return verifyKnownLaunchSizeAttr(op, attr);
if (attr.getName() == getKnownGridSizeAttrHelper().getName())
return verifyKnownLaunchSizeAttr(op, attr);
+ if (attr.getName() == getKnownClusterSizeAttrHelper().getName())
+ return verifyKnownLaunchSizeAttr(op, attr);
if (!llvm::isa<UnitAttr>(attr.getValue()) ||
attr.getName() != getContainerModuleAttrName())
return success();
diff --git a/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp b/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp
index bee3f392c91b5..263fcb96c17db 100644
--- a/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp
+++ b/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp
@@ -30,7 +30,7 @@ static ConstantIntRanges getIndexRange(uint64_t umin, uint64_t umax) {
}
namespace {
-enum class LaunchDims : uint32_t { Block = 0, Grid = 1 };
+enum class LaunchDims : uint32_t { Block = 0, Grid = 1, Cluster = 2 };
} // end namespace
/// If the operation `op` is in a context that is annotated with maximum
@@ -63,6 +63,9 @@ getKnownLaunchAttr(GPUFuncOp func, LaunchDims dims, Dimension dim) {
case LaunchDims::Grid:
bounds = func.getKnownGridSizeAttr();
break;
+ case LaunchDims::Cluster:
+ bounds = func.getKnownClusterSizeAttr();
+ break;
}
if (!bounds)
return std::nullopt;
@@ -94,6 +97,13 @@ static std::optional<uint64_t> getKnownLaunchDim(Op op, LaunchDims type) {
case LaunchDims::Grid:
bounds = launch.getGridSizeOperandValues();
break;
+ case LaunchDims::Cluster:
+ if (launch.hasClusterSize()) {
+ auto clusterBounds = launch.getClusterSizeOperandValues();
+ if (clusterBounds)
+ bounds = *clusterBounds;
+ }
+ break;
}
Value maybeBound = valueByDim(bounds, dim);
APInt value;
@@ -115,6 +125,9 @@ static std::optional<uint64_t> getKnownLaunchDim(Op op, LaunchDims type) {
case LaunchDims::Grid:
attrName = GPUDialect::KnownGridSizeAttrHelper::getNameStr();
break;
+ case LaunchDims::Cluster:
+ attrName = GPUDialect::KnownClusterSizeAttrHelper::getNameStr();
+ break;
}
auto discardableAttr = getKnownLaunchAttr(func, attrName, dim);
if (discardableAttr)
@@ -133,6 +146,9 @@ void ClusterDimOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
void ClusterDimBlocksOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
SetIntRangeFn setResultRange) {
+ if (auto known = getKnownLaunchDim(*this, LaunchDims::Cluster))
+ return setResultRange(getResult(), getIndexRange(*known, *known));
+
uint64_t max = kMaxClusterDim;
if (auto specified = getUpperBound())
max = specified->getZExtValue();
@@ -150,6 +166,8 @@ void ClusterIdOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
void ClusterBlockIdOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
SetIntRangeFn setResultRange) {
uint64_t max = kMaxClusterDim;
+ if (auto known = getKnownLaunchDim(*this, LaunchDims::Cluster))
+ max = *known;
if (auto specified = getUpperBound())
max = specified->getZExtValue();
setResultRange(getResult(), getIndexRange(0, max - 1ULL));
diff --git a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
index f1cc1eb983267..55ee508aa9f55 100644
--- a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
@@ -1149,3 +1149,45 @@ gpu.module @test_module_56 {
func.return %sin16, %cos16, %sin32, %cos32, %sin64, %cos64 : f16, f16, f32, f32, f64, f64
}
}
+
+// -----
+
+gpu.module @test_module_cluster_size {
+ // CHECK-LABEL: llvm.func @kernel_with_cluster_size()
+ // CHECK-SAME: nvvm.cluster_dim = array<i32: 8, 2, 4>
+ gpu.func @kernel_with_cluster_size() kernel attributes {known_cluster_size = array<i32: 8, 2, 4>} {
+ gpu.return
+ }
+}
+
+// -----
+
+gpu.module @test_module_cluster_block_ops {
+// CHECK-LABEL: llvm.func @kernel_with_cluster_size(
+// CHECK-SAME: %[[ARG0:.*]]: !llvm.ptr)
+// CHECK-SAME: gpu.known_cluster_size = array<i32: 8, 4, 2>
+ gpu.func @kernel_with_cluster_size(%arg0: !llvm.ptr) kernel attributes {known_cluster_size = array<i32: 8, 4, 2>} {
+ // CHECK: nvvm.read.ptx.sreg.cluster.ctaid.x range <i32, 0, 8> : i32
+ %0 = gpu.cluster_block_id x
+ // CHECK: nvvm.read.ptx.sreg.cluster.ctaid.y range <i32, 0, 4> : i32
+ %1 = gpu.cluster_block_id y
+ // CHECK: nvvm.read.ptx.sreg.cluster.ctaid.z range <i32, 0, 2> : i32
+ %2 = gpu.cluster_block_id z
+ // CHECK: nvvm.read.ptx.sreg.cluster.nctaid.x range <i32, 1, 9> : i32
+ %3 = gpu.cluster_dim_blocks x
+ // CHECK: nvvm.read.ptx.sreg.cluster.nctaid.y range <i32, 1, 5> : i32
+ %4 = gpu.cluster_dim_blocks y
+ // CHECK: nvvm.read.ptx.sreg.cluster.nctaid.z range <i32, 1, 3> : i32
+ %5 = gpu.cluster_dim_blocks z
+
+ %6 = arith.addi %0, %1 : index
+ %7 = arith.addi %6, %2 : index
+ %8 = arith.addi %7, %3 : index
+ %9 = arith.addi %8, %4 : index
+ %10 = arith.addi %9, %5 : index
+ %11 = arith.index_cast %10 : index to i64
+ llvm.store %11, %arg0 : i64, !llvm.ptr
+ gpu.return
+ }
+}
+
diff --git a/mlir/test/Dialect/GPU/int-range-interface-cluster.mlir b/mlir/test/Dialect/GPU/int-range-interface-cluster.mlir
new file mode 100644
index 0000000000000..a7dd0df2e2c13
--- /dev/null
+++ b/mlir/test/Dialect/GPU/int-range-interface-cluster.mlir
@@ -0,0 +1,27 @@
+// RUN: mlir-opt -int-range-optimizations %s | FileCheck %s
+
+gpu.module @test_module {
+ gpu.func @test_cluster_ranges() kernel attributes {known_cluster_size = array<i32: 8, 4, 1>} {
+ %c0 = gpu.cluster_block_id x
+ // CHECK: test.reflect_bounds {smax = 7 : index, smin = 0 : index, umax = 7 : index, umin = 0 : index}
+ %c0_0 = test.reflect_bounds %c0 : index
+ %c1 = gpu.cluster_block_id y
+ // CHECK: test.reflect_bounds {smax = 3 : index, smin = 0 : index, umax = 3 : index, umin = 0 : index}
+ %c1_0 = test.reflect_bounds %c1 : index
+ %c2 = gpu.cluster_block_id z
+ // CHECK: test.reflect_bounds {smax = 0 : index, smin = 0 : index, umax = 0 : index, umin = 0 : index}
+ %c2_0 = test.reflect_bounds %c2 : index
+
+ %d0 = gpu.cluster_dim_blocks x
+ // CHECK: test.reflect_bounds {smax = 8 : index, smin = 8 : index, umax = 8 : index, umin = 8 : index}
+ %d0_0 = test.reflect_bounds %d0 : index
+ %d1 = gpu.cluster_dim_blocks y
+ // CHECK: test.reflect_bounds {smax = 4 : index, smin = 4 : index, umax = 4 : index, umin = 4 : index}
+ %d1_0 = test.reflect_bounds %d1 : index
+ %d2 = gpu.cluster_dim_blocks z
+ // CHECK: test.reflect_bounds {smax = 1 : index, smin = 1 : index, umax = 1 : index, umin = 1 : index}
+ %d2_0 = test.reflect_bounds %d2 : index
+
+ gpu.return
+ }
+}
More information about the Mlir-commits
mailing list