[Mlir-commits] [mlir] ac7cf4c - [mlir][ROCDL] Improve block/grid_dim handling, fix subgroup ID (#186235)
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Fri Mar 13 09:57:58 PDT 2026
Author: Krzysztof Drewniak
Date: 2026-03-13T09:57:52-07:00
New Revision: ac7cf4c8638f4fa0ebd5efeddc13fef00ed85e46
URL: https://github.com/llvm/llvm-project/commit/ac7cf4c8638f4fa0ebd5efeddc13fef00ed85e46
DIFF: https://github.com/llvm/llvm-project/commit/ac7cf4c8638f4fa0ebd5efeddc13fef00ed85e46.diff
LOG: [mlir][ROCDL] Improve block/grid_dim handling, fix subgroup ID (#186235)
This began as me chasing down the fact that the subgroup_id pattern
introduced lately was causing crashes in translation because of
mismatches between the i64 type of the ockl functions being called and
the i32 type they'd been assigned, and spilled out into a refactor of
how we handle these dimension-lookup functions.
This commit removes the {Block,Grid}Dim{X,Y,Z} ops from the rocdl
dialect, since they were translating to library calls and not
intrinsics, which meant they don't fit into the dialect. Therefore, we
instead add a new pattern that rewrites block/grid dimensions to library
calls. While I'm there, I go ahead and implement support for upper
bounds on these dimensions accessors, adding a fallback bound of 1 <=
size < 1024 to those calls.
This also meant updating the lowering of subgroup_id to use that same
call-generation logic.
While I was here, I factored out the "get block/grid bounds from the
context" logic from the index op lowering template into a separate
function that would be reusable in the new patterns.
This also makes the subgroup_id tests stricter.
---------
Co-authored-by: Claude Opus 4.6 <noreply at anthropic.com>
Co-authored-by: Tim Gymnich <tim at gymni.ch>
Added:
mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.cpp
Modified:
mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
mlir/lib/Conversion/GPUCommon/CMakeLists.txt
mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h
mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp
mlir/test/Conversion/GPUCommon/lower-global-id.mlir
mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-subgroup-id.mlir
mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
mlir/test/Dialect/LLVMIR/rocdl.mlir
mlir/test/Target/LLVMIR/rocdl.mlir
Removed:
################################################################################
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index 1f2d472611120..135d1e4007d49 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -226,23 +226,6 @@ class ROCDL_SpecialIdRegisterOp<string mnemonic> :
}];
}
-// TODO(krzysz00): This should be a lowering pattern, not an op.
-class ROCDL_DimGetterFunctionOp<string mnemonic, string device_function,
- int parameter, list<Trait> traits = []> :
- ROCDL_Op<mnemonic, !listconcat(traits, [Pure])>,
- Results<(outs LLVM_Type:$res)>, Arguments<(ins OptionalAttr<LLVM_ConstantRangeAttr>:$range)> {
- string llvmBuilder = "$res = createDimGetterFunctionCall(builder, op, \""
- # device_function # "\", " # parameter # ");";
- let assemblyFormat = "(`range` $range^)? attr-dict `:` type($res)";
-
- // Temporaly builder until Nvidia ops also support range attributes.
- let builders = [
- OpBuilder<(ins "Type":$resultType), [{
- build($_builder, $_state, resultType, ::mlir::LLVM::ConstantRangeAttr{});
- }]>
- ];
-}
-
//===----------------------------------------------------------------------===//
// ROCDL vector types definitions
//===----------------------------------------------------------------------===//
@@ -451,28 +434,6 @@ def ROCDL_ClusterWorkgroupIdZOp : ROCDL_SpecialIdRegisterOp<"cluster.workgroup.i
def ROCDL_WaveId : ROCDL_SpecialIdRegisterOp<"wave.id">;
def ROCDL_WavefrontSizeOp : ROCDL_SpecialIdRegisterOp<"wavefrontsize">;
-//===----------------------------------------------------------------------===//
-// Thread range and Block range
-//===----------------------------------------------------------------------===//
-
-def ROCDL_BlockDimXOp : ROCDL_DimGetterFunctionOp<"workgroup.dim.x",
- "__ockl_get_local_size", 0>;
-
-def ROCDL_BlockDimYOp : ROCDL_DimGetterFunctionOp<"workgroup.dim.y",
- "__ockl_get_local_size", 1>;
-
-def ROCDL_BlockDimZOp : ROCDL_DimGetterFunctionOp<"workgroup.dim.z",
- "__ockl_get_local_size", 2>;
-
-def ROCDL_GridDimXOp : ROCDL_DimGetterFunctionOp<"grid.dim.x",
- "__ockl_get_num_groups", 0>;
-
-def ROCDL_GridDimYOp : ROCDL_DimGetterFunctionOp<"grid.dim.y",
- "__ockl_get_num_groups", 1>;
-
-def ROCDL_GridDimZOp : ROCDL_DimGetterFunctionOp<"grid.dim.z",
- "__ockl_get_num_groups", 2>;
-
//===----------------------------------------------------------------------===//
// Synchronization primitives
//===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Conversion/GPUCommon/CMakeLists.txt b/mlir/lib/Conversion/GPUCommon/CMakeLists.txt
index ce914c0ea3dd8..31ac47ff35540 100644
--- a/mlir/lib/Conversion/GPUCommon/CMakeLists.txt
+++ b/mlir/lib/Conversion/GPUCommon/CMakeLists.txt
@@ -18,6 +18,7 @@ add_mlir_conversion_library(MLIRGPUToGPURuntimeTransforms
AttrToSPIRVConverter.cpp
GPUToLLVMConversion.cpp
GPUOpsLowering.cpp
+ IndexIntrinsicsOpLowering.cpp
DEPENDS
MLIRConversionPassIncGen
diff --git a/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.cpp b/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.cpp
new file mode 100644
index 0000000000000..ad3ae74d0c683
--- /dev/null
+++ b/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.cpp
@@ -0,0 +1,82 @@
+//===- IndexIntrinsicsOpLowering.cpp - GPU Index Op Lowering --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "IndexIntrinsicsOpLowering.h"
+
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/IR/BuiltinAttributes.h"
+
+using namespace mlir;
+using namespace mlir::gpu::index_lowering;
+
+LLVM::ConstantRangeAttr mlir::gpu::index_lowering::getIndexOpRange(
+ Operation *op, gpu::Dimension dim, std::optional<uint32_t> opUpperBound,
+ IndexKind indexKind, IntrType intrType, unsigned bitWidth) {
+ // Order of priority for bounds:
+ // 1. The upper_bound attribute
+ // 2. Inherent attributes on a surrounding gpu.func
+ // 3. Discardable attributes on a surrounding function of any kind
+ // The below code handles these in reverse order so that more important
+ // sources overwrite less important ones.
+ DenseI32ArrayAttr funcBounds = nullptr;
+ if (auto funcOp = op->getParentOfType<FunctionOpInterface>()) {
+ switch (indexKind) {
+ case IndexKind::Block: {
+ auto blockHelper =
+ gpu::GPUDialect::KnownBlockSizeAttrHelper(op->getContext());
+ if (blockHelper.isAttrPresent(funcOp))
+ funcBounds = blockHelper.getAttr(funcOp);
+ break;
+ }
+ case IndexKind::Grid: {
+ auto gridHelper =
+ gpu::GPUDialect::KnownGridSizeAttrHelper(op->getContext());
+ if (gridHelper.isAttrPresent(funcOp))
+ funcBounds = gridHelper.getAttr(funcOp);
+ break;
+ }
+ case IndexKind::Cluster: {
+ auto clusterHelper =
+ gpu::GPUDialect::KnownClusterSizeAttrHelper(op->getContext());
+ if (clusterHelper.isAttrPresent(funcOp))
+ funcBounds = clusterHelper.getAttr(funcOp);
+ break;
+ }
+ case IndexKind::Other:
+ break;
+ }
+ }
+ if (auto gpuFunc = op->getParentOfType<gpu::GPUFuncOp>()) {
+ switch (indexKind) {
+ case IndexKind::Block:
+ funcBounds = gpuFunc.getKnownBlockSizeAttr();
+ break;
+ case IndexKind::Grid:
+ funcBounds = gpuFunc.getKnownGridSizeAttr();
+ break;
+ case IndexKind::Cluster:
+ funcBounds = gpuFunc.getKnownClusterSizeAttr();
+ break;
+ case IndexKind::Other:
+ break;
+ }
+ }
+ std::optional<uint32_t> upperBound;
+ if (funcBounds)
+ upperBound = funcBounds.asArrayRef()[static_cast<uint32_t>(dim)];
+ if (opUpperBound)
+ upperBound = *opUpperBound;
+
+ if (!upperBound || intrType == IntrType::None)
+ return nullptr;
+
+ uint32_t min = (intrType == IntrType::Dim ? 1u : 0u);
+ uint32_t max =
+ llvm::SaturatingAdd(*upperBound, (intrType == IntrType::Id ? 0u : 1u));
+ return LLVM::ConstantRangeAttr::get(op->getContext(), bitWidth, min, max);
+}
diff --git a/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h b/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h
index ae0239132e7d0..186823e1b40ca 100644
--- a/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h
+++ b/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h
@@ -1,4 +1,4 @@
-//===- IndexIntrinsicsOpLowering.h - GPU IndexOps Lowering class *- C++ -*-===//
+//===- IndexIntrinsicsOpLowering.h - GPU Index Op Lowering ------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -12,7 +12,6 @@
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/IR/BuiltinAttributes.h"
-#include <limits>
namespace mlir {
namespace gpu {
@@ -24,6 +23,16 @@ enum class IntrType : uint32_t {
Dim = 2,
};
+/// Returns a ConstantRangeAttr for a GPU index op, or nullptr if no bounds
+/// are found. `bitWidth` controls the width of the returned range.
+/// Checks the provided upper_bound from the op (highest priority), inherent
+/// attrs on enclosing `gpu.func`s, and discardable attributes on other
+/// enclosing function ops (lowest priority).
+LLVM::ConstantRangeAttr getIndexOpRange(Operation *op, gpu::Dimension dim,
+ std::optional<uint32_t> opUpperBound,
+ IndexKind indexKind, IntrType intrType,
+ unsigned bitWidth);
+
// Rewriting that replaces Op with XOp, YOp, or ZOp depending on the dimension
// that Op operates on. Op is assumed to return an `index` value and
// XOp, YOp and ZOp are assumed to return an `llvm.i32` value. Depending on
@@ -54,7 +63,7 @@ struct OpLowering : public ConvertOpToLLVMPattern<Op> {
LogicalResult
matchAndRewrite(Op op, typename Op::Adaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
- auto loc = op->getLoc();
+ Location loc = op->getLoc();
MLIRContext *context = rewriter.getContext();
Operation *newOp;
switch (op.getDimension()) {
@@ -69,70 +78,13 @@ struct OpLowering : public ConvertOpToLLVMPattern<Op> {
break;
}
- // Order of priority for bounds:
- // 1. The upper_bound attribute
- // 2. Inherent attributes on a surrounding gpu.func
- // 3. Discardable attributes on a surrounding function of any kind
- // The below code handles these in reverse order so that more important
- // sources overwrite less important ones.
- DenseI32ArrayAttr funcBounds = nullptr;
- if (auto funcOp = op->template getParentOfType<FunctionOpInterface>()) {
- switch (indexKind) {
- case IndexKind::Block: {
- auto blockHelper =
- gpu::GPUDialect::KnownBlockSizeAttrHelper(op.getContext());
- if (blockHelper.isAttrPresent(funcOp))
- funcBounds = blockHelper.getAttr(funcOp);
- break;
- }
- case IndexKind::Grid: {
- auto gridHelper =
- gpu::GPUDialect::KnownGridSizeAttrHelper(op.getContext());
- if (gridHelper.isAttrPresent(funcOp))
- funcBounds = gridHelper.getAttr(funcOp);
- break;
- }
- case IndexKind::Cluster: {
- auto clusterHelper =
- gpu::GPUDialect::KnownClusterSizeAttrHelper(op.getContext());
- if (clusterHelper.isAttrPresent(funcOp))
- funcBounds = clusterHelper.getAttr(funcOp);
- break;
- }
- case IndexKind::Other:
- break;
- }
- }
- if (auto gpuFunc = op->template getParentOfType<gpu::GPUFuncOp>()) {
- switch (indexKind) {
- case IndexKind::Block:
- funcBounds = gpuFunc.getKnownBlockSizeAttr();
- break;
- case IndexKind::Grid:
- funcBounds = gpuFunc.getKnownGridSizeAttr();
- break;
- case IndexKind::Cluster:
- funcBounds = gpuFunc.getKnownClusterSizeAttr();
- break;
- case IndexKind::Other:
- break;
- }
- }
- std::optional<int32_t> upperBound;
- if (funcBounds)
- upperBound =
- funcBounds.asArrayRef()[static_cast<uint32_t>(op.getDimension())];
- if (auto opBound = op.getUpperBound())
- upperBound = opBound->getZExtValue();
+ std::optional<uint32_t> opBound;
+ if (auto bound = op.getUpperBound())
+ opBound = static_cast<uint32_t>(bound->getZExtValue());
+ if (auto range = getIndexOpRange(op, op.getDimension(), opBound, indexKind,
+ intrType, /*bitWidth=*/32))
+ newOp->setAttr("range", range);
- if (upperBound && intrType != IntrType::None) {
- int32_t min = (intrType == IntrType::Dim ? 1 : 0);
- int32_t max = *upperBound == std::numeric_limits<int32_t>::max()
- ? *upperBound
- : *upperBound + (intrType == IntrType::Id ? 0 : 1);
- newOp->setAttr("range", LLVM::ConstantRangeAttr::get(
- rewriter.getContext(), 32, min, max));
- }
if (indexBitwidth > 32) {
newOp = LLVM::SExtOp::create(rewriter, loc,
IntegerType::get(context, indexBitwidth),
diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index 65353fedc9c4f..b1a4627977f80 100644
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -102,6 +102,64 @@ static Value getLaneId(RewriterBase &rewriter, Location loc) {
return laneId;
}
+/// Maximum number of threads per block dimension on AMD GPUs.
+static constexpr int64_t kMaxThreadsPerBlockDim = 1024;
+
+/// Emits a call to an OCKL block/grid size function corresponding to
+/// `indexKind` with argument `dim`, querying for upper bounds in the context
+/// surrounding `contextOp` as a fallback for an unknown/unavailable
+/// `opUpperBound`.
+static Value getOcklDim(RewriterBase &rewriter,
+ gpu::index_lowering::IndexKind indexKind,
+ gpu::Dimension dim, Operation *contextOp,
+ std::optional<uint32_t> opUpperBound) {
+ Location loc = contextOp->getLoc();
+ MLIRContext *context = contextOp->getContext();
+
+ auto i32Ty = IntegerType::get(context, 32);
+ auto i64Ty = IntegerType::get(context, 64);
+
+ int32_t dimParam = static_cast<int32_t>(dim);
+
+ StringRef functionName;
+ switch (indexKind) {
+ case gpu::index_lowering::IndexKind::Block:
+ functionName = "__ockl_get_local_size";
+ break;
+ case gpu::index_lowering::IndexKind::Grid:
+ functionName = "__ockl_get_num_groups";
+ break;
+ case gpu::index_lowering::IndexKind::Cluster:
+ case gpu::index_lowering::IndexKind::Other:
+ llvm_unreachable("Not valid index kinds for ockl lookup");
+ }
+
+ // Declare the ockl function: i64 @functionName(i32).
+ auto fnType = LLVM::LLVMFunctionType::get(i64Ty, {i32Ty});
+ Operation *moduleOp = contextOp->getParentWithTrait<OpTrait::SymbolTable>();
+ LLVM::LLVMFuncOp funcOp =
+ getOrDefineFunction(moduleOp, loc, rewriter, functionName, fnType);
+
+ // Create the call.
+ Value dimConst = LLVM::ConstantOp::create(rewriter, loc, i32Ty, dimParam);
+ auto callOp =
+ LLVM::CallOp::create(rewriter, loc, funcOp, ValueRange{dimConst});
+
+ // Set range attribute on the call result if bounds are available.
+ auto range = gpu::index_lowering::getIndexOpRange(
+ contextOp, dim, opUpperBound, indexKind,
+ gpu::index_lowering::IntrType::Dim, /*bitWidth=*/64);
+ // Fall back to the hardware limit for block dimensions.
+ if (!range && indexKind == gpu::index_lowering::IndexKind::Block)
+ range = LLVM::ConstantRangeAttr::get(context, APInt(64, 1),
+ APInt(64, kMaxThreadsPerBlockDim + 1));
+ if (range) {
+ callOp.setResAttrsAttr(rewriter.getArrayAttr(rewriter.getDictionaryAttr(
+ rewriter.getNamedAttr(LLVM::LLVMDialect::getRangeAttrName(), range))));
+ }
+ return callOp.getResult();
+}
+
static constexpr StringLiteral amdgcnDataLayout =
"e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
"-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:"
@@ -110,6 +168,36 @@ static constexpr StringLiteral amdgcnDataLayout =
"64-S32-A5-G1-ni:7:8:9";
namespace {
+
+/// Lowers gpu.block_dim / gpu.grid_dim to direct __ockl_get_local_size /
+/// __ockl_get_num_groups function calls.
+template <typename OpTy>
+struct GPUDimOpToOcklCall final : ConvertOpToLLVMPattern<OpTy> {
+ GPUDimOpToOcklCall(const LLVMTypeConverter &converter,
+ gpu::index_lowering::IndexKind indexKind)
+ : ConvertOpToLLVMPattern<OpTy>(converter), indexKind(indexKind) {}
+
+ LogicalResult
+ matchAndRewrite(OpTy op, typename OpTy::Adaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ Location loc = op.getLoc();
+
+ std::optional<uint32_t> opUpperBound;
+ if (auto bound = op.getUpperBound())
+ opUpperBound = static_cast<uint32_t>(bound->getZExtValue());
+
+ Value ocklCall =
+ getOcklDim(rewriter, indexKind, op.getDimension(), op, opUpperBound);
+ Value result = truncOrExtToLLVMType(rewriter, loc, ocklCall,
+ *this->getTypeConverter());
+ rewriter.replaceOp(op, result);
+ return success();
+ }
+
+private:
+ const gpu::index_lowering::IndexKind indexKind;
+};
+
struct GPULaneIdOpToROCDL : ConvertOpToLLVMPattern<gpu::LaneIdOp> {
using ConvertOpToLLVMPattern<gpu::LaneIdOp>::ConvertOpToLLVMPattern;
@@ -197,16 +285,37 @@ struct GPUSubgroupIdOpToROCDL : ConvertOpToLLVMPattern<gpu::SubgroupIdOp> {
// For older architectures, compute:
// subgroup_id = linearized_thread_id / subgroup_size
// where linearized_thread_id = tid.x + dim.x * (tid.y + dim.y * tid.z)
- Value tidX = ROCDL::ThreadIdXOp::create(rewriter, loc, int32Type);
- Value tidY = ROCDL::ThreadIdYOp::create(rewriter, loc, int32Type);
- Value tidZ = ROCDL::ThreadIdZOp::create(rewriter, loc, int32Type);
- Value dimX = ROCDL::BlockDimXOp::create(rewriter, loc, int32Type);
- Value dimY = ROCDL::BlockDimYOp::create(rewriter, loc, int32Type);
+ auto tidX = ROCDL::ThreadIdXOp::create(rewriter, loc, int32Type);
+ auto tidY = ROCDL::ThreadIdYOp::create(rewriter, loc, int32Type);
+ auto tidZ = ROCDL::ThreadIdZOp::create(rewriter, loc, int32Type);
+ auto setBoundFromContext = [&](Operation *tidOp, gpu::Dimension dim) {
+ if (LLVM::ConstantRangeAttr range =
+ gpu::index_lowering::getIndexOpRange(
+ op, dim, std::nullopt,
+ gpu::index_lowering::IndexKind::Block,
+ gpu::index_lowering::IntrType::Id, 32))
+ tidOp->setAttr("range", range);
+ };
+ setBoundFromContext(tidX, gpu::Dimension::x);
+ setBoundFromContext(tidY, gpu::Dimension::y);
+ setBoundFromContext(tidZ, gpu::Dimension::z);
- // linearized = tid.x + dim.x * (tid.y + dim.y * tid.z)
- // Thread IDs and dimensions are non-negative and small, so use nuw+nsw.
auto flags =
LLVM::IntegerOverflowFlags::nsw | LLVM::IntegerOverflowFlags::nuw;
+
+ auto getBlockDim = [&](gpu::Dimension dim) {
+ Value dim64 =
+ getOcklDim(rewriter, gpu::index_lowering::IndexKind::Block, dim, op,
+ std::nullopt);
+ Value dimTrunc =
+ LLVM::TruncOp::create(rewriter, loc, int32Type, dim64, flags);
+ return dimTrunc;
+ };
+ Value dimX = getBlockDim(gpu::Dimension::x);
+ Value dimY = getBlockDim(gpu::Dimension::y);
+
+ // linearized = tid.x + dim.x * (tid.y + dim.y * tid.z)
+ // Thread IDs and dimensions are non-negative and small, so use nuw+nsw.
Value dimYxTidZ =
LLVM::MulOp::create(rewriter, loc, int32Type, dimY, tidZ, flags);
Value tidYPlusDimYxTidZ =
@@ -626,13 +735,9 @@ void mlir::populateGpuToROCDLConversionPatterns(
patterns.add<gpu::index_lowering::OpLowering<
gpu::BlockIdOp, ROCDL::BlockIdXOp, ROCDL::BlockIdYOp, ROCDL::BlockIdZOp>>(
converter, IndexKind::Grid, IntrType::Id);
- patterns.add<
- gpu::index_lowering::OpLowering<gpu::BlockDimOp, ROCDL::BlockDimXOp,
- ROCDL::BlockDimYOp, ROCDL::BlockDimZOp>>(
- converter, IndexKind::Block, IntrType::Dim);
- patterns.add<gpu::index_lowering::OpLowering<
- gpu::GridDimOp, ROCDL::GridDimXOp, ROCDL::GridDimYOp, ROCDL::GridDimZOp>>(
- converter, IndexKind::Grid, IntrType::Dim);
+ patterns.add<GPUDimOpToOcklCall<gpu::BlockDimOp>>(converter,
+ IndexKind::Block);
+ patterns.add<GPUDimOpToOcklCall<gpu::GridDimOp>>(converter, IndexKind::Grid);
patterns.add<GPUReturnOpLowering>(converter);
patterns.add<GPUFuncOpLowering>(
converter,
diff --git a/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp
index 8142347d80cb8..e1168e75f10da 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp
@@ -17,7 +17,6 @@
#include "mlir/IR/Operation.h"
#include "mlir/Target/LLVMIR/ModuleTranslation.h"
-#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Support/raw_ostream.h"
@@ -26,32 +25,6 @@ using namespace mlir;
using namespace mlir::LLVM;
using mlir::LLVM::detail::createIntrinsicCall;
-// Create a call to ROCm-Device-Library function that returns an ID.
-// This is intended to specifically call device functions that fetch things like
-// block or grid dimensions, and so is limited to functions that take one
-// integer parameter.
-static llvm::Value *createDimGetterFunctionCall(llvm::IRBuilderBase &builder,
- Operation *op, StringRef fnName,
- int parameter) {
- llvm::Module *module = builder.GetInsertBlock()->getModule();
- llvm::FunctionType *functionType = llvm::FunctionType::get(
- llvm::Type::getInt64Ty(module->getContext()), // return type.
- llvm::Type::getInt32Ty(module->getContext()), // parameter type.
- false); // no variadic arguments.
- llvm::Function *fn = dyn_cast<llvm::Function>(
- module->getOrInsertFunction(fnName, functionType).getCallee());
- llvm::Value *fnOp0 = llvm::ConstantInt::get(
- llvm::Type::getInt32Ty(module->getContext()), parameter);
- auto *call = builder.CreateCall(fn, ArrayRef<llvm::Value *>(fnOp0));
- if (auto rangeAttr = op->getAttrOfType<LLVM::ConstantRangeAttr>("range")) {
- // Zero-extend to 64 bits because the GPU dialect uses 32-bit bounds but
- // these ockl functions are defined to be 64-bits
- call->addRangeRetAttr(llvm::ConstantRange(rangeAttr.getLower().zext(64),
- rangeAttr.getUpper().zext(64)));
- }
- return call;
-}
-
namespace {
/// Implementation of the dialect interface that converts operations belonging
/// to the ROCDL dialect to LLVM IR.
diff --git a/mlir/test/Conversion/GPUCommon/lower-global-id.mlir b/mlir/test/Conversion/GPUCommon/lower-global-id.mlir
index b0274e0f9f290..94b9f90052769 100644
--- a/mlir/test/Conversion/GPUCommon/lower-global-id.mlir
+++ b/mlir/test/Conversion/GPUCommon/lower-global-id.mlir
@@ -11,9 +11,8 @@ gpu.module @kernel {
// ROCDL-LABEL: llvm.func @gpu_global_id() -> i64 {
// ROCDL: %[[WORKGROUP_0:.*]] = rocdl.workgroup.id.x : i32
// ROCDL: %[[SEXT_0:.*]] = llvm.sext %[[WORKGROUP_0]] : i32 to i64
-// ROCDL: %[[WORKGROUP_1:.*]] = rocdl.workgroup.dim.x : i32
-// ROCDL: %[[SEXT_1:.*]] = llvm.sext %[[WORKGROUP_1]] : i32 to i64
-// ROCDL: %[[MUL_0:.*]] = llvm.mul %[[SEXT_0]], %[[SEXT_1]] : i64
+// ROCDL: %[[DIM64:.*]] = llvm.call @__ockl_get_local_size(%{{.*}}) : (i32) -> (i64 {llvm.range = #llvm.constant_range<i64, 1, 1025>})
+// ROCDL: %[[MUL_0:.*]] = llvm.mul %[[SEXT_0]], %[[DIM64]] : i64
// ROCDL: %[[WORKITEM_0:.*]] = rocdl.workitem.id.x : i32
// ROCDL: %[[SEXT_2:.*]] = llvm.sext %[[WORKITEM_0]] : i32 to i64
// ROCDL: %[[ADD_0:.*]] = llvm.add %[[SEXT_2]], %[[MUL_0]] : i64
diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-subgroup-id.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-subgroup-id.mlir
index 030eb0e5eb181..b44216aab3bff 100644
--- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-subgroup-id.mlir
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-subgroup-id.mlir
@@ -4,36 +4,66 @@
gpu.module @test_module {
// CHECK-LABEL: func @subgroup_id()
func.func @subgroup_id() -> index {
- // GFX12: rocdl.wave.id : i32
- // GFX12: llvm.sext %{{.*}} : i32 to i64
+ // GFX12: %[[WAVEID:.+]] = rocdl.wave.id : i32
+ // GFX12: llvm.sext %[[WAVEID]] : i32 to i64
- // GFX9-DAG: rocdl.workitem.id.x : i32
- // GFX9-DAG: rocdl.workitem.id.y : i32
- // GFX9-DAG: rocdl.workitem.id.z : i32
- // GFX9-DAG: rocdl.workgroup.dim.x : i32
- // GFX9-DAG: rocdl.workgroup.dim.y : i32
- // GFX9-DAG: llvm.mul %{{.*}}, %{{.*}} overflow<nsw, nuw>
- // GFX9-DAG: llvm.add %{{.*}}, %{{.*}} overflow<nsw, nuw>
- // GFX9: rocdl.wavefrontsize : i32
- // GFX9: llvm.udiv
- // GFX9: llvm.sext %{{.*}} : i32 to i64
+ // GFX9-DAG: %[[IDX:.+]] = rocdl.workitem.id.x : i32
+ // GFX9-DAG: %[[IDY:.+]] = rocdl.workitem.id.y : i32
+ // GFX9-DAG: %[[IDZ:.+]] = rocdl.workitem.id.z : i32
+ // GFX9-DAG: %[[DIMX_I64:.+]] = llvm.call @__ockl_get_local_size(%[[C0:.+]]) : (i32) -> (i64 {llvm.range = #llvm.constant_range<i64, 1, 1025>})
+ // Yes, this is checking after the call that uses it. This prevents collisions with other 0s.
+ // GFX9-DAG: %[[C0]] = llvm.mlir.constant(0 : i32) : i32
+ // GFX9-DAG: %[[DIMX:.+]] = llvm.trunc %[[DIMX_I64]] overflow<nsw, nuw> : i64 to i32
+ // GFX9-DAG: %[[DIMY_I64:.+]] = llvm.call @__ockl_get_local_size(%[[C1:.+]]) : (i32) -> (i64 {llvm.range = #llvm.constant_range<i64, 1, 1025>})
+ // GFX9-DAG: %[[C1]] = llvm.mlir.constant(1 : i32) : i32
+ // GFX9-DAG: %[[DIMY:.+]] = llvm.trunc %[[DIMY_I64]] overflow<nsw, nuw> : i64 to i32
+ // GFX9: %[[Z_DY:.+]] = llvm.mul %[[DIMY]], %[[IDZ]] overflow<nsw, nuw>
+ // GFX9: %[[ZY:.+]] = llvm.add %[[IDY]], %[[Z_DY]] overflow<nsw, nuw>
+ // GFX9: %[[YZ_DX:.+]] = llvm.mul %[[DIMX]], %[[ZY]] overflow<nsw, nuw>
+ // GFX9: %[[ZYX:.+]] = llvm.add %[[IDX]], %[[YZ_DX]] overflow<nsw, nuw>
+ // GFX9: %[[WAVESZ:.+]] = rocdl.wavefrontsize : i32
+ // GFX9: %[[RES:.+]] = llvm.udiv %[[ZYX]], %[[WAVESZ]]
+ // GFX9: llvm.sext %[[RES]] : i32 to i64
%subgroupId = gpu.subgroup_id : index
func.return %subgroupId : index
}
// CHECK-LABEL: func @subgroup_id_with_upper_bound()
func.func @subgroup_id_with_upper_bound() -> index {
- // GFX12: rocdl.wave.id range <i32, 0, 4> : i32
- // GFX12: llvm.sext %{{.*}} : i32 to i64
+ // GFX12: %[[WAVEID:.+]] = rocdl.wave.id range <i32, 0, 4> : i32
+ // GFX12: llvm.sext %[[WAVEID]] : i32 to i64
- // GFX9-DAG: rocdl.workitem.id.x : i32
- // GFX9-DAG: rocdl.workitem.id.y : i32
- // GFX9-DAG: rocdl.workitem.id.z : i32
- // GFX9-DAG: rocdl.workgroup.dim.x : i32
- // GFX9-DAG: rocdl.workgroup.dim.y : i32
- // GFX9: rocdl.wavefrontsize : i32
- // GFX9: llvm.udiv
- // GFX9: llvm.sext %{{.*}} : i32 to i64
+ // Minimal check to ensure we don't set any bounds based on the subgroup ID bound
+ // since we don't know which thread ID they go on to.
+ // GFX9: rocdl.workitem.id.x : i32
+ // GFX9-DAG: llvm.call @__ockl_get_local_size({{.*}}) : (i32) -> (i64 {llvm.range = #llvm.constant_range<i64, 1, 1025>})
+ %subgroupId = gpu.subgroup_id upper_bound 4 : index
+ func.return %subgroupId : index
+}
+
+// CHECK-LABEL: func @subgroup_id_with_workgroup_sizes()
+func.func @subgroup_id_with_workgroup_sizes() -> index
+ attributes {gpu.known_block_size = array<i32: 64, 4, 1>} {
+ // GFX12: %[[WAVEID:.+]] = rocdl.wave.id range <i32, 0, 4> : i32
+ // GFX12: llvm.sext %[[WAVEID]] : i32 to i64
+
+ // GFX9-DAG: %[[IDX:.+]] = rocdl.workitem.id.x range <i32, 0, 64> : i32
+ // GFX9-DAG: %[[IDY:.+]] = rocdl.workitem.id.y range <i32, 0, 4> : i32
+ // GFX9-DAG: %[[IDZ:.+]] = rocdl.workitem.id.z range <i32, 0, 1> : i32
+ // GFX9-DAG: %[[DIMX_I64:.+]] = llvm.call @__ockl_get_local_size(%[[C0:.+]]) : (i32) -> (i64 {llvm.range = #llvm.constant_range<i64, 1, 65>})
+ // Yes, this is checking after the call that uses it. This prevents collisions with other 0s.
+ // GFX9-DAG: %[[C0]] = llvm.mlir.constant(0 : i32) : i32
+ // GFX9-DAG: %[[DIMX:.+]] = llvm.trunc %[[DIMX_I64]] overflow<nsw, nuw> : i64 to i32
+ // GFX9-DAG: %[[DIMY_I64:.+]] = llvm.call @__ockl_get_local_size(%[[C1:.+]]) : (i32) -> (i64 {llvm.range = #llvm.constant_range<i64, 1, 5>})
+ // GFX9-DAG: %[[C1]] = llvm.mlir.constant(1 : i32) : i32
+ // GFX9-DAG: %[[DIMY:.+]] = llvm.trunc %[[DIMY_I64]] overflow<nsw, nuw> : i64 to i32
+ // GFX9: %[[Z_DY:.+]] = llvm.mul %[[DIMY]], %[[IDZ]] overflow<nsw, nuw>
+ // GFX9: %[[ZY:.+]] = llvm.add %[[IDY]], %[[Z_DY]] overflow<nsw, nuw>
+ // GFX9: %[[YZ_DX:.+]] = llvm.mul %[[DIMX]], %[[ZY]] overflow<nsw, nuw>
+ // GFX9: %[[ZYX:.+]] = llvm.add %[[IDX]], %[[YZ_DX]] overflow<nsw, nuw>
+ // GFX9: %[[WAVESZ:.+]] = rocdl.wavefrontsize : i32
+ // GFX9: %[[RES:.+]] = llvm.udiv %[[ZYX]], %[[WAVESZ]]
+ // GFX9: llvm.sext %[[RES]] : i32 to i64
%subgroupId = gpu.subgroup_id upper_bound 4 : index
func.return %subgroupId : index
}
diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
index 9c5c6c7cf9c87..3cc9ded6fe916 100755
--- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
@@ -24,14 +24,14 @@ gpu.module @test_module {
// CHECK: = llvm.sext %{{.*}} : i32 to i64
%tIdZ = gpu.thread_id z
- // CHECK: rocdl.workgroup.dim.x : i32
- // CHECK: = llvm.sext %{{.*}} : i32 to i64
+ // CHECK-DAG: %[[BD_C0:.*]] = llvm.mlir.constant(0 : i32) : i32
+ // CHECK-DAG: %[[BD_C1:.*]] = llvm.mlir.constant(1 : i32) : i32
+ // CHECK-DAG: %[[BD_C2:.*]] = llvm.mlir.constant(2 : i32) : i32
+ // CHECK-DAG: llvm.call @__ockl_get_local_size(%[[BD_C0]]) : (i32) -> (i64 {llvm.range = #llvm.constant_range<i64, 1, 1025>})
%bDimX = gpu.block_dim x
- // CHECK: rocdl.workgroup.dim.y : i32
- // CHECK: = llvm.sext %{{.*}} : i32 to i64
+ // CHECK-DAG: llvm.call @__ockl_get_local_size(%[[BD_C1]]) : (i32) -> (i64 {llvm.range = #llvm.constant_range<i64, 1, 1025>})
%bDimY = gpu.block_dim y
- // CHECK: rocdl.workgroup.dim.z : i32
- // CHECK: = llvm.sext %{{.*}} : i32 to i64
+ // CHECK-DAG: llvm.call @__ockl_get_local_size(%[[BD_C2]]) : (i32) -> (i64 {llvm.range = #llvm.constant_range<i64, 1, 1025>})
%bDimZ = gpu.block_dim z
// CHECK: rocdl.workgroup.id.x : i32
@@ -44,14 +44,14 @@ gpu.module @test_module {
// CHECK: = llvm.sext %{{.*}} : i32 to i64
%bIdZ = gpu.block_id z
- // CHECK: rocdl.grid.dim.x : i32
- // CHECK: = llvm.sext %{{.*}} : i32 to i64
+ // CHECK-DAG: %[[GD_C0:.*]] = llvm.mlir.constant(0 : i32) : i32
+ // CHECK-DAG: %[[GD_C1:.*]] = llvm.mlir.constant(1 : i32) : i32
+ // CHECK-DAG: %[[GD_C2:.*]] = llvm.mlir.constant(2 : i32) : i32
+ // CHECK-DAG: llvm.call @__ockl_get_num_groups(%[[GD_C0]]) : (i32) -> i64
%gDimX = gpu.grid_dim x
- // CHECK: rocdl.grid.dim.y : i32
- // CHECK: = llvm.sext %{{.*}} : i32 to i64
+ // CHECK-DAG: llvm.call @__ockl_get_num_groups(%[[GD_C1]]) : (i32) -> i64
%gDimY = gpu.grid_dim y
- // CHECK: rocdl.grid.dim.z : i32
- // CHECK: = llvm.sext %{{.*}} : i32 to i64
+ // CHECK-DAG: llvm.call @__ockl_get_num_groups(%[[GD_C2]]) : (i32) -> i64
%gDimZ = gpu.grid_dim z
// CHECK: = rocdl.mbcnt.lo %{{.*}}, %{{.*}} {res_attrs = [{llvm.noundef, llvm.range = #llvm.constant_range<i32, 0, 32>}]} : (i32, i32) -> i32
@@ -100,13 +100,33 @@ gpu.module @test_module {
// CHECK: rocdl.workgroup.id.z range <i32, 0, 28> : i32
%bIdZ = gpu.block_id z
+ // CHECK: llvm.call @__ockl_get_local_size(%{{.*}}) : (i32) -> (i64 {llvm.range = #llvm.constant_range<i64, 1, 9>})
+ %bDimX = gpu.block_dim x
+ // CHECK: llvm.call @__ockl_get_local_size(%{{.*}}) : (i32) -> (i64 {llvm.range = #llvm.constant_range<i64, 1, 13>})
+ %bDimY = gpu.block_dim y
+ // CHECK: llvm.call @__ockl_get_local_size(%{{.*}}) : (i32) -> (i64 {llvm.range = #llvm.constant_range<i64, 1, 17>})
+ %bDimZ = gpu.block_dim z
+
+ // CHECK: llvm.call @__ockl_get_num_groups(%{{.*}}) : (i32) -> (i64 {llvm.range = #llvm.constant_range<i64, 1, 21>})
+ %gDimX = gpu.grid_dim x
+ // CHECK: llvm.call @__ockl_get_num_groups(%{{.*}}) : (i32) -> (i64 {llvm.range = #llvm.constant_range<i64, 1, 25>})
+ %gDimY = gpu.grid_dim y
+ // CHECK: llvm.call @__ockl_get_num_groups(%{{.*}}) : (i32) -> (i64 {llvm.range = #llvm.constant_range<i64, 1, 29>})
+ %gDimZ = gpu.grid_dim z
+
// "Usage" to make the ID calls not die
%0 = arith.addi %tIdX, %tIdY : index
%1 = arith.addi %0, %tIdZ : index
%2 = arith.addi %1, %bIdX : index
%3 = arith.addi %2, %bIdY : index
%4 = arith.addi %3, %bIdZ : index
- %5 = arith.index_cast %4 : index to i32
+ %r0 = arith.addi %4, %bDimX : index
+ %r1 = arith.addi %r0, %bDimY : index
+ %r2 = arith.addi %r1, %bDimZ : index
+ %r3 = arith.addi %r2, %gDimX : index
+ %r4 = arith.addi %r3, %gDimY : index
+ %r5 = arith.addi %r4, %gDimZ : index
+ %5 = arith.index_cast %r5 : index to i32
memref.store %5, %place[] : memref<i32>
gpu.return
}
@@ -796,7 +816,8 @@ gpu.module @test_module {
func.func @gpu_dim_int_max_upper_bound()
-> (index) {
- // CHECK32: rocdl.workgroup.dim.x range <i32, 1, 2147483647> : i32
+ // CHECK32: llvm.call @__ockl_get_local_size(%{{.*}}) : (i32) -> (i64 {llvm.range = #llvm.constant_range<i64, 1, 2147483648>})
+ // CHECK32: llvm.trunc %{{.*}} : i64 to i32
%bDimX = gpu.block_dim x upper_bound 2147483647
func.return %bDimX : index
}
diff --git a/mlir/test/Dialect/LLVMIR/rocdl.mlir b/mlir/test/Dialect/LLVMIR/rocdl.mlir
index 1a810dce706bd..1d835b352e519 100644
--- a/mlir/test/Dialect/LLVMIR/rocdl.mlir
+++ b/mlir/test/Dialect/LLVMIR/rocdl.mlir
@@ -26,20 +26,8 @@ func.func @rocdl_special_regs() -> i32 {
%10 = rocdl.cluster.workgroup.id.y : i32
// CHECK: rocdl.cluster.workgroup.id.z : i32
%11 = rocdl.cluster.workgroup.id.z : i32
- // CHECK: rocdl.workgroup.dim.x : i32
- %12 = rocdl.workgroup.dim.x : i32
- // CHECK: rocdl.workgroup.dim.y : i32
- %13 = rocdl.workgroup.dim.y : i32
- // CHECK: rocdl.workgroup.dim.z : i32
- %14 = rocdl.workgroup.dim.z : i32
- // CHECK: rocdl.grid.dim.x : i32
- %15 = rocdl.grid.dim.x : i32
- // CHECK: rocdl.grid.dim.y : i32
- %16 = rocdl.grid.dim.y : i32
- // CHECK: rocdl.grid.dim.z : i32
- %17 = rocdl.grid.dim.z : i32
// CHECK: rocdl.wave.id : i32
- %18 = rocdl.wave.id : i32
+ %12 = rocdl.wave.id : i32
llvm.return %0 : i32
}
diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir
index 78a78c0bd1bbd..d3ac38ecad326 100644
--- a/mlir/test/Target/LLVMIR/rocdl.mlir
+++ b/mlir/test/Target/LLVMIR/rocdl.mlir
@@ -27,37 +27,20 @@ llvm.func @rocdl_special_regs() -> i32 {
// CHECK: call range(i32 0, 16) i32 @llvm.amdgcn.cluster.workgroup.id.z()
%12 = rocdl.cluster.workgroup.id.z range <i32, 0, 16> : i32
- // CHECK: call i64 @__ockl_get_local_size(i32 0)
- %13 = rocdl.workgroup.dim.x : i64
- // CHECK: call i64 @__ockl_get_local_size(i32 1)
- %14 = rocdl.workgroup.dim.y : i64
- // CHECK: call i64 @__ockl_get_local_size(i32 2)
- %15 = rocdl.workgroup.dim.z : i64
-
- // CHECK: call i64 @__ockl_get_num_groups(i32 0)
- %16 = rocdl.grid.dim.x : i64
- // CHECK: call i64 @__ockl_get_num_groups(i32 1)
- %17 = rocdl.grid.dim.y : i64
- // CHECK: call i64 @__ockl_get_num_groups(i32 2)
- %18 = rocdl.grid.dim.z : i64
-
// CHECK: call range(i32 0, 64) i32 @llvm.amdgcn.workitem.id.x()
- %19 = rocdl.workitem.id.x range <i32, 0, 64> : i32
-
- // CHECK: call range(i64 1, 65) i64 @__ockl_get_local_size(i32 0)
- %20 = rocdl.workgroup.dim.x range <i32, 1, 65> : i64
+ %13 = rocdl.workitem.id.x range <i32, 0, 64> : i32
// CHECK: call i32 @llvm.amdgcn.wave.id()
- %21 = rocdl.wave.id : i32
+ %14 = rocdl.wave.id : i32
// CHECK: call range(i32 32, 65) i32 @llvm.amdgcn.wave.id()
- %22 = rocdl.wave.id range <i32, 32, 65> : i32
+ %15 = rocdl.wave.id range <i32, 32, 65> : i32
// CHECK: call i32 @llvm.amdgcn.wavefrontsize()
- %23 = rocdl.wavefrontsize : i32
+ %16 = rocdl.wavefrontsize : i32
// CHECK: call range(i32 32, 65) i32 @llvm.amdgcn.wavefrontsize()
- %24 = rocdl.wavefrontsize range <i32, 32, 65> : i32
+ %17 = rocdl.wavefrontsize range <i32, 32, 65> : i32
llvm.return %1 : i32
}
More information about the Mlir-commits
mailing list