[Mlir-commits] [mlir] 803828f - [mlir][GPU] Refactor, improve constant size information handling (#186907)
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Tue Mar 17 12:03:54 PDT 2026
Author: Krzysztof Drewniak
Date: 2026-03-17T12:03:49-07:00
New Revision: 803828f4aa6285bc9c7f27f7d33de5713085ff3b
URL: https://github.com/llvm/llvm-project/commit/803828f4aa6285bc9c7f27f7d33de5713085ff3b
DIFF: https://github.com/llvm/llvm-project/commit/803828f4aa6285bc9c7f27f7d33de5713085ff3b.diff
LOG: [mlir][GPU] Refactor, improve constant size information handling (#186907)
1. There was duplicate code between the integer range analysis's
handling of static dimension size information (ex. gpu.known_block_dim
attributes) and the handling during the lowering of those operations.
The code from integer range analysis was given a dialect-wide entry
point (and had its types fixed to be more accurate), which the lowering
templates now call.
2. The templated lowering for block/grid/cluster_dim now produces
precise ranges (indicating the constant value) where one is known, and
the lowerings in rocdl (including those for subgroup_id) have been fixed
appropriately.
3. While I was here, the gpu.dimension enum has been moved to GPUBase so
it lives next to the other enums.
4. The pattern that expands subgroup_id operations now adds any thread
dimension bounds it finds in context.
(Claude was used for an initial round of review, I did the main coding
myself.)
---------
Co-authored-by: Claude Opus 4.6 <noreply at anthropic.com>
Added:
Modified:
mlir/include/mlir/Dialect/GPU/IR/GPUBase.td
mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h
mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.cpp
mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h
mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp
mlir/lib/Dialect/GPU/Transforms/SubgroupIdRewriter.cpp
mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-subgroup-id.mlir
mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
mlir/test/Dialect/GPU/subgroupId-rewrite.mlir
Removed:
################################################################################
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td b/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td
index 1a07d506b56d7..55326f044147b 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td
@@ -101,6 +101,27 @@ def GPU_AddressSpaceAttr :
def GPU_AddressSpaceAttrArray : TypedArrayAttrBase<GPU_AddressSpaceAttr, "GPU Address Space array">;
+def GPU_Dimension : GPU_I32Enum<"Dimension",
+ "a dimension, either 'x', 'y', or 'z'",
+ [
+ I32EnumAttrCase<"x", 0>,
+ I32EnumAttrCase<"y", 1>,
+ I32EnumAttrCase<"z", 2>
+ ]>;
+
+def GPU_DimensionAttr : EnumAttr<GPU_Dialect, GPU_Dimension, "dim">;
+
+def GPU_DimensionKind : I32Enum<"DimensionKind",
+ "the possible kinds of launch dimension",
+ [
+ I32EnumCase<"Other", 0, "other">,
+ I32EnumCase<"Block", 1, "block">,
+ I32EnumCase<"Grid", 2, "grid">,
+ I32EnumCase<"Cluster", 3, "cluster">
+ ]> {
+ let cppNamespace = "::mlir::gpu";
+}
+
//===----------------------------------------------------------------------===//
// GPU Types.
//===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h b/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h
index a9886d1f21ca0..6e9cf709c7585 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h
@@ -221,4 +221,12 @@ class SparseSpGEMMOpHandleType
#define GET_OP_CLASSES
#include "mlir/Dialect/GPU/IR/GPUOps.h.inc"
+namespace mlir::gpu {
+/// Retrieve the constant bounds for a given dimension and dimension kind
+/// from the context surrounding `op`, if known, and return them. This will
+/// check the bounds on an enclosing `gpu.launch`, an enclosing `gpu.func`, and
+/// any `gpu.known_*_size` on other function-like operations, in that order.
+std::optional<uint32_t>
+getKnownDimensionSizeAround(Operation *op, DimensionKind kind, Dimension dim);
+} // namespace mlir::gpu
#endif // MLIR_DIALECT_GPU_IR_GPUDIALECT_H
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index b5a9e3413ddfd..36e0875f53b0a 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -38,18 +38,6 @@ include "mlir/Interfaces/SideEffectInterfaces.td"
class GPU_Op<string mnemonic, list<Trait> traits = []> :
Op<GPU_Dialect, mnemonic, traits>;
-def GPU_Dimension : I32EnumAttr<"Dimension",
- "a dimension, either 'x', 'y', or 'z'",
- [
- I32EnumAttrCase<"x", 0>,
- I32EnumAttrCase<"y", 1>,
- I32EnumAttrCase<"z", 2>
- ]>{
- let genSpecializedAttr = 0;
- let cppNamespace = "::mlir::gpu";
-}
-def GPU_DimensionAttr : EnumAttr<GPU_Dialect, GPU_Dimension, "dim">;
-
class GPU_IndexOp<string mnemonic, list<Trait> traits = []> :
GPU_Op<mnemonic, !listconcat(traits, [
Pure,
diff --git a/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.cpp b/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.cpp
index ad3ae74d0c683..c27236415abb1 100644
--- a/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.cpp
+++ b/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.cpp
@@ -22,53 +22,17 @@ LLVM::ConstantRangeAttr mlir::gpu::index_lowering::getIndexOpRange(
// 2. Inherent attributes on a surrounding gpu.func
// 3. Discardable attributes on a surrounding function of any kind
// The below code handles these in reverse order so that more important
- // sources overwrite less important ones.
- DenseI32ArrayAttr funcBounds = nullptr;
- if (auto funcOp = op->getParentOfType<FunctionOpInterface>()) {
- switch (indexKind) {
- case IndexKind::Block: {
- auto blockHelper =
- gpu::GPUDialect::KnownBlockSizeAttrHelper(op->getContext());
- if (blockHelper.isAttrPresent(funcOp))
- funcBounds = blockHelper.getAttr(funcOp);
- break;
- }
- case IndexKind::Grid: {
- auto gridHelper =
- gpu::GPUDialect::KnownGridSizeAttrHelper(op->getContext());
- if (gridHelper.isAttrPresent(funcOp))
- funcBounds = gridHelper.getAttr(funcOp);
- break;
- }
- case IndexKind::Cluster: {
- auto clusterHelper =
- gpu::GPUDialect::KnownClusterSizeAttrHelper(op->getContext());
- if (clusterHelper.isAttrPresent(funcOp))
- funcBounds = clusterHelper.getAttr(funcOp);
- break;
- }
- case IndexKind::Other:
- break;
- }
- }
- if (auto gpuFunc = op->getParentOfType<gpu::GPUFuncOp>()) {
- switch (indexKind) {
- case IndexKind::Block:
- funcBounds = gpuFunc.getKnownBlockSizeAttr();
- break;
- case IndexKind::Grid:
- funcBounds = gpuFunc.getKnownGridSizeAttr();
- break;
- case IndexKind::Cluster:
- funcBounds = gpuFunc.getKnownClusterSizeAttr();
- break;
- case IndexKind::Other:
- break;
- }
- }
- std::optional<uint32_t> upperBound;
- if (funcBounds)
- upperBound = funcBounds.asArrayRef()[static_cast<uint32_t>(dim)];
+ // sources overwrite less important ones. As an exception, dimension-size
+ // getters will return exact bounds if known.
+ std::optional<uint32_t> upperBound =
+ getKnownDimensionSizeAround(op, indexKind, dim);
+ // If our upper bound is the maximum possible value, we can't easily construct
+ // the constant range for it.
+ if (upperBound && intrType == IntrType::Dim &&
+ *upperBound < std::numeric_limits<uint32_t>::max())
+ return LLVM::ConstantRangeAttr::get(op->getContext(), bitWidth, *upperBound,
+ *upperBound + 1);
+
if (opUpperBound)
upperBound = *opUpperBound;
diff --git a/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h b/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h
index 186823e1b40ca..777f3b92b941c 100644
--- a/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h
+++ b/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h
@@ -16,7 +16,9 @@
namespace mlir {
namespace gpu {
namespace index_lowering {
-enum class IndexKind : uint32_t { Other = 0, Block = 1, Grid = 2, Cluster = 3 };
+// Alias so existing call sites don't need updating.
+using IndexKind = gpu::DimensionKind;
+
enum class IntrType : uint32_t {
None = 0,
Id = 1,
@@ -27,7 +29,9 @@ enum class IntrType : uint32_t {
/// are found. `bitWidth` controls the width of the returned range.
/// Checks the provided upper_bound from the op (highest priority), inherent
/// attrs on enclosing `gpu.func`s, and discardable attributes on other
-/// enclosing function ops (lowest priority).
+/// enclosing function ops (lowest priority). However, in the case where
+/// a dimension is known to have a constant value, returns a range indicating
+/// that value.
LLVM::ConstantRangeAttr getIndexOpRange(Operation *op, gpu::Dimension dim,
std::optional<uint32_t> opUpperBound,
IndexKind indexKind, IntrType intrType,
diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index b1a4627977f80..e08ec138c853a 100644
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -106,19 +106,24 @@ static Value getLaneId(RewriterBase &rewriter, Location loc) {
static constexpr int64_t kMaxThreadsPerBlockDim = 1024;
/// Emits a call to an OCKL block/grid size function corresponding to
-/// `indexKind` with argument `dim`, querying for upper bounds in the context
-/// surrounding `contextOp` as a fallback for an unknown/unavailable
-/// `opUpperBound`.
-static Value getOcklDim(RewriterBase &rewriter,
- gpu::index_lowering::IndexKind indexKind,
- gpu::Dimension dim, Operation *contextOp,
- std::optional<uint32_t> opUpperBound) {
+/// `indexKind` with argument `dim`, except that if the context around
+/// `contextOp` gives an exact size for that dimension, return that as
+/// an `i64` constant instead.
+static Value getKnownOrOcklDim(RewriterBase &rewriter,
+ gpu::index_lowering::IndexKind indexKind,
+ gpu::Dimension dim, Operation *contextOp,
+ std::optional<uint32_t> opUpperBound) {
Location loc = contextOp->getLoc();
MLIRContext *context = contextOp->getContext();
auto i32Ty = IntegerType::get(context, 32);
auto i64Ty = IntegerType::get(context, 64);
+ if (std::optional<uint32_t> knownDim =
+ gpu::getKnownDimensionSizeAround(contextOp, indexKind, dim))
+ return LLVM::ConstantOp::create(rewriter, loc,
+ rewriter.getI64IntegerAttr(*knownDim));
+
int32_t dimParam = static_cast<int32_t>(dim);
StringRef functionName;
@@ -145,14 +150,16 @@ static Value getOcklDim(RewriterBase &rewriter,
auto callOp =
LLVM::CallOp::create(rewriter, loc, funcOp, ValueRange{dimConst});
- // Set range attribute on the call result if bounds are available.
- auto range = gpu::index_lowering::getIndexOpRange(
- contextOp, dim, opUpperBound, indexKind,
- gpu::index_lowering::IntrType::Dim, /*bitWidth=*/64);
- // Fall back to the hardware limit for block dimensions.
- if (!range && indexKind == gpu::index_lowering::IndexKind::Block)
+ LLVM::ConstantRangeAttr range;
+ if (opUpperBound) {
+ range = LLVM::ConstantRangeAttr::get(
+ context, APInt(64, 1),
+ APInt(64, static_cast<uint64_t>(*opUpperBound) + 1));
+ } else if (indexKind == gpu::index_lowering::IndexKind::Block) {
+ // Set the hardware limit for block ranges as the bounds on block dim calls.
range = LLVM::ConstantRangeAttr::get(context, APInt(64, 1),
APInt(64, kMaxThreadsPerBlockDim + 1));
+ }
if (range) {
callOp.setResAttrsAttr(rewriter.getArrayAttr(rewriter.getDictionaryAttr(
rewriter.getNamedAttr(LLVM::LLVMDialect::getRangeAttrName(), range))));
@@ -186,8 +193,8 @@ struct GPUDimOpToOcklCall final : ConvertOpToLLVMPattern<OpTy> {
if (auto bound = op.getUpperBound())
opUpperBound = static_cast<uint32_t>(bound->getZExtValue());
- Value ocklCall =
- getOcklDim(rewriter, indexKind, op.getDimension(), op, opUpperBound);
+ Value ocklCall = getKnownOrOcklDim(rewriter, indexKind, op.getDimension(),
+ op, opUpperBound);
Value result = truncOrExtToLLVMType(rewriter, loc, ocklCall,
*this->getTypeConverter());
rewriter.replaceOp(op, result);
@@ -305,8 +312,8 @@ struct GPUSubgroupIdOpToROCDL : ConvertOpToLLVMPattern<gpu::SubgroupIdOp> {
auto getBlockDim = [&](gpu::Dimension dim) {
Value dim64 =
- getOcklDim(rewriter, gpu::index_lowering::IndexKind::Block, dim, op,
- std::nullopt);
+ getKnownOrOcklDim(rewriter, gpu::index_lowering::IndexKind::Block,
+ dim, op, std::nullopt);
Value dimTrunc =
LLVM::TruncOp::create(rewriter, loc, int32Type, dim64, flags);
return dimTrunc;
diff --git a/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp b/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp
index 263fcb96c17db..5eb8aa2fe0923 100644
--- a/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp
+++ b/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp
@@ -10,7 +10,6 @@
#include "mlir/IR/Matchers.h"
#include "mlir/Interfaces/FunctionInterfaces.h"
#include "mlir/Interfaces/InferIntRangeInterface.h"
-#include "llvm/Support/ErrorHandling.h"
#include <optional>
using namespace mlir;
@@ -18,7 +17,7 @@ using namespace mlir::gpu;
// Maximum grid and block dimensions of all known GPUs are less than 2^32.
static constexpr uint64_t kMaxDim = std::numeric_limits<uint32_t>::max();
-// Maximum cluster size
+// Maximum cluster size.
static constexpr uint64_t kMaxClusterDim = 8;
// Maximum subgroups are no larger than 128.
static constexpr uint64_t kMaxSubgroupSize = 128;
@@ -29,15 +28,7 @@ static ConstantIntRanges getIndexRange(uint64_t umin, uint64_t umax) {
APInt(width, umax));
}
-namespace {
-enum class LaunchDims : uint32_t { Block = 0, Grid = 1, Cluster = 2 };
-} // end namespace
-
-/// If the operation `op` is in a context that is annotated with maximum
-/// launch dimensions (a launch op with constant block or grid
-/// sizes or a launch_func op with the appropriate dimensions), return
-/// the bound on the maximum size of the dimension that the op is querying.
-/// IDs will be one less than this bound.
+static uint64_t zext(uint32_t arg) { return static_cast<uint64_t>(arg); }
static Value valueByDim(KernelDim3 dims, Dimension dim) {
switch (dim) {
@@ -51,53 +42,55 @@ static Value valueByDim(KernelDim3 dims, Dimension dim) {
llvm_unreachable("All dimension enum cases handled above");
}
-static uint64_t zext(uint32_t arg) { return static_cast<uint64_t>(arg); }
-
-static std::optional<uint64_t>
-getKnownLaunchAttr(GPUFuncOp func, LaunchDims dims, Dimension dim) {
+static std::optional<uint32_t>
+getKnownLaunchAttr(GPUFuncOp func, DimensionKind dims, Dimension dim) {
DenseI32ArrayAttr bounds;
switch (dims) {
- case LaunchDims::Block:
+ case DimensionKind::Other:
+ return std::nullopt;
+ case DimensionKind::Block:
bounds = func.getKnownBlockSizeAttr();
break;
- case LaunchDims::Grid:
+ case DimensionKind::Grid:
bounds = func.getKnownGridSizeAttr();
break;
- case LaunchDims::Cluster:
+ case DimensionKind::Cluster:
bounds = func.getKnownClusterSizeAttr();
break;
}
if (!bounds)
return std::nullopt;
- if (bounds.size() < static_cast<uint32_t>(dim))
+ if (bounds.size() <= static_cast<uint32_t>(dim))
return std::nullopt;
- return zext(bounds[static_cast<uint32_t>(dim)]);
+ return bounds[static_cast<uint32_t>(dim)];
}
-static std::optional<uint64_t> getKnownLaunchAttr(FunctionOpInterface func,
+static std::optional<uint32_t> getKnownLaunchAttr(FunctionOpInterface func,
StringRef attrName,
Dimension dim) {
auto bounds = func.getOperation()->getAttrOfType<DenseI32ArrayAttr>(attrName);
if (!bounds)
return std::nullopt;
- if (bounds.size() < static_cast<uint32_t>(dim))
+ if (bounds.size() <= static_cast<uint32_t>(dim))
return std::nullopt;
- return zext(bounds[static_cast<uint32_t>(dim)]);
+ return bounds[static_cast<uint32_t>(dim)];
}
-template <typename Op>
-static std::optional<uint64_t> getKnownLaunchDim(Op op, LaunchDims type) {
- Dimension dim = op.getDimension();
- if (auto launch = op->template getParentOfType<LaunchOp>()) {
+std::optional<uint32_t>
+mlir::gpu::getKnownDimensionSizeAround(Operation *op, DimensionKind kind,
+ Dimension dim) {
+ if (auto launch = op->getParentOfType<LaunchOp>()) {
KernelDim3 bounds;
- switch (type) {
- case LaunchDims::Block:
+ switch (kind) {
+ case DimensionKind::Other:
+ return std::nullopt;
+ case DimensionKind::Block:
bounds = launch.getBlockSizeOperandValues();
break;
- case LaunchDims::Grid:
+ case DimensionKind::Grid:
bounds = launch.getGridSizeOperandValues();
break;
- case LaunchDims::Cluster:
+ case DimensionKind::Cluster:
if (launch.hasClusterSize()) {
auto clusterBounds = launch.getClusterSizeOperandValues();
if (clusterBounds)
@@ -107,25 +100,27 @@ static std::optional<uint64_t> getKnownLaunchDim(Op op, LaunchDims type) {
}
Value maybeBound = valueByDim(bounds, dim);
APInt value;
- if (matchPattern(maybeBound, m_ConstantInt(&value)))
+ if (maybeBound && matchPattern(maybeBound, m_ConstantInt(&value)))
return value.getZExtValue();
}
- if (auto gpuFunc = op->template getParentOfType<GPUFuncOp>()) {
- auto inherentAttr = getKnownLaunchAttr(gpuFunc, type, dim);
+ if (auto gpuFunc = op->getParentOfType<GPUFuncOp>()) {
+ auto inherentAttr = getKnownLaunchAttr(gpuFunc, kind, dim);
if (inherentAttr)
return inherentAttr;
}
- if (auto func = op->template getParentOfType<FunctionOpInterface>()) {
+ if (auto func = op->getParentOfType<FunctionOpInterface>()) {
StringRef attrName;
- switch (type) {
- case LaunchDims::Block:
+ switch (kind) {
+ case DimensionKind::Other:
+ return std::nullopt;
+ case DimensionKind::Block:
attrName = GPUDialect::KnownBlockSizeAttrHelper::getNameStr();
break;
- case LaunchDims::Grid:
+ case DimensionKind::Grid:
attrName = GPUDialect::KnownGridSizeAttrHelper::getNameStr();
break;
- case LaunchDims::Cluster:
+ case DimensionKind::Cluster:
attrName = GPUDialect::KnownClusterSizeAttrHelper::getNameStr();
break;
}
@@ -146,8 +141,10 @@ void ClusterDimOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
void ClusterDimBlocksOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
SetIntRangeFn setResultRange) {
- if (auto known = getKnownLaunchDim(*this, LaunchDims::Cluster))
- return setResultRange(getResult(), getIndexRange(*known, *known));
+ if (auto known = getKnownDimensionSizeAround(*this, DimensionKind::Cluster,
+ getDimension()))
+ return setResultRange(getResult(),
+ getIndexRange(zext(*known), zext(*known)));
uint64_t max = kMaxClusterDim;
if (auto specified = getUpperBound())
@@ -166,8 +163,9 @@ void ClusterIdOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
void ClusterBlockIdOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
SetIntRangeFn setResultRange) {
uint64_t max = kMaxClusterDim;
- if (auto known = getKnownLaunchDim(*this, LaunchDims::Cluster))
- max = *known;
+ if (auto known = getKnownDimensionSizeAround(*this, DimensionKind::Cluster,
+ getDimension()))
+ max = zext(*known);
if (auto specified = getUpperBound())
max = specified->getZExtValue();
setResultRange(getResult(), getIndexRange(0, max - 1ULL));
@@ -175,11 +173,12 @@ void ClusterBlockIdOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
void BlockDimOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
SetIntRangeFn setResultRange) {
- std::optional<uint64_t> knownVal =
- getKnownLaunchDim(*this, LaunchDims::Block);
+ std::optional<uint32_t> knownVal =
+ getKnownDimensionSizeAround(*this, DimensionKind::Block, getDimension());
if (knownVal)
- return setResultRange(getResult(), getIndexRange(*knownVal, *knownVal));
- ;
+ return setResultRange(getResult(),
+ getIndexRange(zext(*knownVal), zext(*knownVal)));
+
uint64_t max = kMaxDim;
if (auto specified = getUpperBound())
max = specified->getZExtValue();
@@ -189,8 +188,9 @@ void BlockDimOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
void BlockIdOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
SetIntRangeFn setResultRange) {
uint64_t max = kMaxDim;
- if (auto fromContext = getKnownLaunchDim(*this, LaunchDims::Grid))
- max = fromContext.value();
+ if (auto fromContext = getKnownDimensionSizeAround(*this, DimensionKind::Grid,
+ getDimension()))
+ max = zext(*fromContext);
if (auto specified = getUpperBound())
max = specified->getZExtValue();
setResultRange(getResult(), getIndexRange(0, max - 1ULL));
@@ -198,9 +198,11 @@ void BlockIdOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
void GridDimOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
SetIntRangeFn setResultRange) {
- std::optional<uint64_t> knownVal = getKnownLaunchDim(*this, LaunchDims::Grid);
+ std::optional<uint32_t> knownVal =
+ getKnownDimensionSizeAround(*this, DimensionKind::Grid, getDimension());
if (knownVal)
- return setResultRange(getResult(), getIndexRange(*knownVal, *knownVal));
+ return setResultRange(getResult(),
+ getIndexRange(zext(*knownVal), zext(*knownVal)));
uint64_t max = kMaxDim;
if (auto specified = getUpperBound())
max = specified->getZExtValue();
@@ -210,8 +212,9 @@ void GridDimOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
void ThreadIdOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
SetIntRangeFn setResultRange) {
uint64_t max = kMaxDim;
- if (auto fromContext = getKnownLaunchDim(*this, LaunchDims::Block))
- max = fromContext.value();
+ if (auto fromContext = getKnownDimensionSizeAround(
+ *this, DimensionKind::Block, getDimension()))
+ max = zext(*fromContext);
if (auto specified = getUpperBound())
max = specified->getZExtValue();
setResultRange(getResult(), getIndexRange(0, max - 1ULL));
@@ -239,10 +242,12 @@ void GlobalIdOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
return setResultRange(getResult(),
getIndexRange(0, specified->getZExtValue() - 1ULL));
- uint64_t blockDimMax =
- getKnownLaunchDim(*this, LaunchDims::Block).value_or(kMaxDim);
- uint64_t gridDimMax =
- getKnownLaunchDim(*this, LaunchDims::Grid).value_or(kMaxDim);
+ uint64_t blockDimMax = zext(
+ getKnownDimensionSizeAround(*this, DimensionKind::Block, getDimension())
+ .value_or(kMaxDim));
+ uint64_t gridDimMax = zext(
+ getKnownDimensionSizeAround(*this, DimensionKind::Grid, getDimension())
+ .value_or(kMaxDim));
setResultRange(getResult(),
getIndexRange(0, (blockDimMax * gridDimMax) - 1ULL));
}
diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupIdRewriter.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupIdRewriter.cpp
index ad6ee9e070d09..50dbc584a2c01 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SubgroupIdRewriter.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupIdRewriter.cpp
@@ -12,6 +12,7 @@
//
//===----------------------------------------------------------------------===//
+#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/Transforms/Passes.h"
#include "mlir/Dialect/Index/IR/IndexOps.h"
@@ -54,11 +55,39 @@ struct GpuSubgroupIdRewriter final : OpRewritePattern<gpu::SubgroupIdOp> {
Location loc = op->getLoc();
Type indexType = rewriter.getIndexType();
- Value dimX = gpu::BlockDimOp::create(rewriter, loc, gpu::Dimension::x);
- Value dimY = gpu::BlockDimOp::create(rewriter, loc, gpu::Dimension::y);
- Value tidX = gpu::ThreadIdOp::create(rewriter, loc, gpu::Dimension::x);
- Value tidY = gpu::ThreadIdOp::create(rewriter, loc, gpu::Dimension::y);
- Value tidZ = gpu::ThreadIdOp::create(rewriter, loc, gpu::Dimension::z);
+ auto asMaybeIndexAttr = [&](std::optional<uint32_t> bound) -> IntegerAttr {
+ if (!bound)
+ return IntegerAttr();
+ return IntegerAttr::get(
+ indexType, static_cast<int64_t>(static_cast<uint64_t>(*bound)));
+ };
+
+ IntegerAttr maybeKnownDimX =
+ asMaybeIndexAttr(gpu::getKnownDimensionSizeAround(
+ op, gpu::DimensionKind::Block, gpu::Dimension::x));
+ IntegerAttr maybeKnownDimY =
+ asMaybeIndexAttr(gpu::getKnownDimensionSizeAround(
+ op, gpu::DimensionKind::Block, gpu::Dimension::y));
+ IntegerAttr maybeKnownDimZ =
+ asMaybeIndexAttr(gpu::getKnownDimensionSizeAround(
+ op, gpu::DimensionKind::Block, gpu::Dimension::z));
+
+ Value dimX, dimY;
+ if (maybeKnownDimX)
+ dimX = arith::ConstantOp::create(rewriter, loc, maybeKnownDimX);
+ else
+ dimX = gpu::BlockDimOp::create(rewriter, loc, gpu::Dimension::x);
+ if (maybeKnownDimY)
+ dimY = arith::ConstantOp::create(rewriter, loc, maybeKnownDimY);
+ else
+ dimY = gpu::BlockDimOp::create(rewriter, loc, gpu::Dimension::y);
+
+ Value tidX = gpu::ThreadIdOp::create(rewriter, loc, gpu::Dimension::x,
+ maybeKnownDimX);
+ Value tidY = gpu::ThreadIdOp::create(rewriter, loc, gpu::Dimension::y,
+ maybeKnownDimY);
+ Value tidZ = gpu::ThreadIdOp::create(rewriter, loc, gpu::Dimension::z,
+ maybeKnownDimZ);
Value dimYxIdZ =
arith::MulIOp::create(rewriter, loc, indexType, dimY, tidZ);
diff --git a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
index 4837800488e86..929794f035b9f 100644
--- a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
@@ -1179,11 +1179,11 @@ gpu.module @test_module_cluster_block_ops {
%1 = gpu.cluster_block_id y
// CHECK: nvvm.read.ptx.sreg.cluster.ctaid.z range <i32, 0, 2> : i32
%2 = gpu.cluster_block_id z
- // CHECK: nvvm.read.ptx.sreg.cluster.nctaid.x range <i32, 1, 9> : i32
+ // CHECK: nvvm.read.ptx.sreg.cluster.nctaid.x range <i32, 8, 9> : i32
%3 = gpu.cluster_dim_blocks x
- // CHECK: nvvm.read.ptx.sreg.cluster.nctaid.y range <i32, 1, 5> : i32
+ // CHECK: nvvm.read.ptx.sreg.cluster.nctaid.y range <i32, 4, 5> : i32
%4 = gpu.cluster_dim_blocks y
- // CHECK: nvvm.read.ptx.sreg.cluster.nctaid.z range <i32, 1, 3> : i32
+ // CHECK: nvvm.read.ptx.sreg.cluster.nctaid.z range <i32, 2, 3> : i32
%5 = gpu.cluster_dim_blocks z
%6 = arith.addi %0, %1 : index
diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-subgroup-id.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-subgroup-id.mlir
index b44216aab3bff..9cab3ff48f5bf 100644
--- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-subgroup-id.mlir
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-subgroup-id.mlir
@@ -50,12 +50,9 @@ func.func @subgroup_id_with_workgroup_sizes() -> index
// GFX9-DAG: %[[IDX:.+]] = rocdl.workitem.id.x range <i32, 0, 64> : i32
// GFX9-DAG: %[[IDY:.+]] = rocdl.workitem.id.y range <i32, 0, 4> : i32
// GFX9-DAG: %[[IDZ:.+]] = rocdl.workitem.id.z range <i32, 0, 1> : i32
- // GFX9-DAG: %[[DIMX_I64:.+]] = llvm.call @__ockl_get_local_size(%[[C0:.+]]) : (i32) -> (i64 {llvm.range = #llvm.constant_range<i64, 1, 65>})
- // Yes, this is checking after the call that uses it. This prevents collisions with other 0s.
- // GFX9-DAG: %[[C0]] = llvm.mlir.constant(0 : i32) : i32
+ // GFX9-DAG: %[[DIMX_I64:.+]] = llvm.mlir.constant(64 : i64) : i64
// GFX9-DAG: %[[DIMX:.+]] = llvm.trunc %[[DIMX_I64]] overflow<nsw, nuw> : i64 to i32
- // GFX9-DAG: %[[DIMY_I64:.+]] = llvm.call @__ockl_get_local_size(%[[C1:.+]]) : (i32) -> (i64 {llvm.range = #llvm.constant_range<i64, 1, 5>})
- // GFX9-DAG: %[[C1]] = llvm.mlir.constant(1 : i32) : i32
+ // GFX9-DAG: %[[DIMY_I64:.+]] = llvm.mlir.constant(4 : i64) : i64
// GFX9-DAG: %[[DIMY:.+]] = llvm.trunc %[[DIMY_I64]] overflow<nsw, nuw> : i64 to i32
// GFX9: %[[Z_DY:.+]] = llvm.mul %[[DIMY]], %[[IDZ]] overflow<nsw, nuw>
// GFX9: %[[ZY:.+]] = llvm.add %[[IDY]], %[[Z_DY]] overflow<nsw, nuw>
diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
index 3cc9ded6fe916..5eaa2d0b4df28 100755
--- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
@@ -100,18 +100,18 @@ gpu.module @test_module {
// CHECK: rocdl.workgroup.id.z range <i32, 0, 28> : i32
%bIdZ = gpu.block_id z
- // CHECK: llvm.call @__ockl_get_local_size(%{{.*}}) : (i32) -> (i64 {llvm.range = #llvm.constant_range<i64, 1, 9>})
+ // CHECK: llvm.mlir.constant(8 : i64) : i64
%bDimX = gpu.block_dim x
- // CHECK: llvm.call @__ockl_get_local_size(%{{.*}}) : (i32) -> (i64 {llvm.range = #llvm.constant_range<i64, 1, 13>})
+ // CHECK: llvm.mlir.constant(12 : i64) : i64
%bDimY = gpu.block_dim y
- // CHECK: llvm.call @__ockl_get_local_size(%{{.*}}) : (i32) -> (i64 {llvm.range = #llvm.constant_range<i64, 1, 17>})
+ // CHECK: llvm.mlir.constant(16 : i64) : i64
%bDimZ = gpu.block_dim z
- // CHECK: llvm.call @__ockl_get_num_groups(%{{.*}}) : (i32) -> (i64 {llvm.range = #llvm.constant_range<i64, 1, 21>})
+ // CHECK: llvm.mlir.constant(20 : i64) : i64
%gDimX = gpu.grid_dim x
- // CHECK: llvm.call @__ockl_get_num_groups(%{{.*}}) : (i32) -> (i64 {llvm.range = #llvm.constant_range<i64, 1, 25>})
+ // CHECK: llvm.mlir.constant(24 : i64) : i64
%gDimY = gpu.grid_dim y
- // CHECK: llvm.call @__ockl_get_num_groups(%{{.*}}) : (i32) -> (i64 {llvm.range = #llvm.constant_range<i64, 1, 29>})
+ // CHECK: llvm.mlir.constant(28 : i64) : i64
%gDimZ = gpu.grid_dim z
// "Usage" to make the ID calls not die
diff --git a/mlir/test/Dialect/GPU/subgroupId-rewrite.mlir b/mlir/test/Dialect/GPU/subgroupId-rewrite.mlir
index 0d4f4d590bb4e..23cc31baf73b1 100644
--- a/mlir/test/Dialect/GPU/subgroupId-rewrite.mlir
+++ b/mlir/test/Dialect/GPU/subgroupId-rewrite.mlir
@@ -22,3 +22,29 @@ func.func @subgroupId(%sz : index, %mem: memref<index, 1>) {
}
return
}
+
+// CHECK-LABEL: func.func @subgroupIdConsts
+// CHECK-SAME: (%[[SZ:.*]]: index, %[[MEM:.*]]: memref<index, 1>) {
+func.func @subgroupIdConsts(%sz : index, %mem: memref<index, 1>) {
+ %c32 = arith.constant 32 : index
+ %c4 = arith.constant 4 : index
+ %c2 = arith.constant 2 : index
+ gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %sz, %grid_y = %sz, %grid_z = %sz)
+ threads(%tx, %ty, %tz) in (%block_x = %c32, %block_y = %c4, %block_z = %c2) {
+ // CHECK-DAG: %[[DIMX:.*]] = arith.constant 32 : index
+ // CHECK-DAG: %[[DIMY:.*]] = arith.constant 4 : index
+ // CHECK: %[[TIDX:.*]] = gpu.thread_id x upper_bound 32
+ // CHECK-NEXT: %[[TIDY:.*]] = gpu.thread_id y upper_bound 4
+ // CHECK-NEXT: %[[TIDZ:.*]] = gpu.thread_id z upper_bound 2
+ // CHECK-NEXT: %[[T0:.*]] = arith.muli %[[TIDZ]], %[[DIMY]] : index
+ // CHECK-NEXT: %[[T1:.*]] = arith.addi %[[T0]], %[[TIDY]] : index
+ // CHECK-NEXT: %[[T2:.*]] = arith.muli %[[T1]], %[[DIMX]] : index
+ // CHECK-NEXT: %[[T3:.*]] = arith.addi %[[TIDX]], %[[T2]] : index
+ // CHECK-NEXT: %[[T4:.*]] = gpu.subgroup_size : index
+ // CHECK-NEXT: %[[T5:.*]] = arith.divui %[[T3]], %[[T4]] : index
+ %idz = gpu.subgroup_id : index
+ memref.store %idz, %mem[] : memref<index, 1>
+ gpu.terminator
+ }
+ return
+}
More information about the Mlir-commits
mailing list