[Mlir-commits] [mlir] [MLIR][ROCDL] Add conversion for gpu.subgroup_id to ROCDL (PR #136405)
Alan Li
llvmlistbot at llvm.org
Mon Apr 28 06:41:55 PDT 2025
https://github.com/lialan updated https://github.com/llvm/llvm-project/pull/136405
From 7c0e3a50fde54ab3179b99c1e3342f4cc3b3ae39 Mon Sep 17 00:00:00 2001
From: Alan Li <me at alanli.org>
Date: Sat, 19 Apr 2025 00:51:15 -0400
Subject: [PATCH 1/2] [MLIR][ROCDL] Add conversion for gpu.subgroup_id to ROCDL
---
.../Conversion/GPUToROCDL/GPUToROCDLPass.h | 7 +-
mlir/include/mlir/Conversion/Passes.td | 4 +
.../GPUToROCDL/LowerGpuOpsToROCDLOps.cpp | 143 +++++++++++-------
.../Conversion/GPUToROCDL/gpu-to-rocdl.mlir | 22 +++
4 files changed, 118 insertions(+), 58 deletions(-)
diff --git a/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h b/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h
index 291b809071ce9..1b265ecfc48e7 100644
--- a/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h
+++ b/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h
@@ -10,6 +10,7 @@
#include "mlir/Conversion/GPUToROCDL/Runtimes.h"
#include "mlir/Conversion/LLVMCommon/LoweringOptions.h"
+#include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
#include <memory>
namespace mlir {
@@ -46,11 +47,7 @@ void configureGpuToROCDLConversionLegality(ConversionTarget &target);
/// index bitwidth used for the lowering of the device side index computations
/// is configurable.
std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
-createLowerGpuOpsToROCDLOpsPass(
- const std::string &chipset = "gfx900",
- unsigned indexBitwidth = kDeriveIndexBitwidthFromDataLayout,
- bool useBarePtrCallConv = false,
- gpu::amd::Runtime runtime = gpu::amd::Runtime::Unknown);
+createLowerGpuOpsToROCDLOpsPass();
} // namespace mlir
diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td
index bbba495e613b2..a558aeffba5e4 100644
--- a/mlir/include/mlir/Conversion/Passes.td
+++ b/mlir/include/mlir/Conversion/Passes.td
@@ -608,6 +608,10 @@ def ConvertGpuOpsToROCDLOps : Pass<"convert-gpu-to-rocdl", "gpu::GPUModuleOp"> {
clEnumValN(::mlir::gpu::amd::Runtime::HIP, "HIP", "HIP"),
clEnumValN(::mlir::gpu::amd::Runtime::OpenCL, "OpenCL",
"OpenCL"))}]>,
+ Option<"subgroupSize", "subgroup-size", "unsigned",
+ "0",
+ "specify subgroup size for the kernel, if left empty, the default "
+ "value will be decided by the target chipset.">,
ListOption<"allowedDialects", "allowed-dialects", "std::string",
"Run conversion patterns of only the specified dialects">,
];
diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index dd16ec4b73e9f..932063aa109ba 100644
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -52,25 +52,6 @@ namespace mlir {
using namespace mlir;
-// Truncate or extend the result depending on the index bitwidth specified
-// by the LLVMTypeConverter options.
-static Value truncOrExtToLLVMType(ConversionPatternRewriter &rewriter,
- Location loc, Value value,
- const LLVMTypeConverter &converter) {
- int64_t intWidth = cast<IntegerType>(value.getType()).getWidth();
- int64_t indexBitwidth = converter.getIndexTypeBitwidth();
- auto indexBitwidthType =
- IntegerType::get(rewriter.getContext(), converter.getIndexTypeBitwidth());
- // TODO: use <=> in C++20.
- if (indexBitwidth > intWidth) {
- return rewriter.create<LLVM::SExtOp>(loc, indexBitwidthType, value);
- }
- if (indexBitwidth < intWidth) {
- return rewriter.create<LLVM::TruncOp>(loc, indexBitwidthType, value);
- }
- return value;
-}
-
/// Returns true if the given `gpu.func` can be safely called using the bare
/// pointer calling convention.
static bool canBeCalledWithBarePointers(gpu::GPUFuncOp func) {
@@ -99,6 +80,26 @@ static constexpr StringLiteral amdgcnDataLayout =
"64-S32-A5-G1-ni:7:8:9";
namespace {
+
+// Truncate or extend the result depending on the index bitwidth specified
+// by the LLVMTypeConverter options.
+static Value truncOrExtToLLVMType(ConversionPatternRewriter &rewriter,
+ Location loc, Value value,
+ const LLVMTypeConverter &converter) {
+ int64_t intWidth = cast<IntegerType>(value.getType()).getWidth();
+ int64_t indexBitwidth = converter.getIndexTypeBitwidth();
+ auto indexBitwidthType =
+ IntegerType::get(rewriter.getContext(), converter.getIndexTypeBitwidth());
+ // TODO: use <=> in C++20.
+ if (indexBitwidth > intWidth) {
+ return rewriter.create<LLVM::SExtOp>(loc, indexBitwidthType, value);
+ }
+ if (indexBitwidth < intWidth) {
+ return rewriter.create<LLVM::TruncOp>(loc, indexBitwidthType, value);
+ }
+ return value;
+}
+
struct GPULaneIdOpToROCDL : ConvertOpToLLVMPattern<gpu::LaneIdOp> {
using ConvertOpToLLVMPattern<gpu::LaneIdOp>::ConvertOpToLLVMPattern;
@@ -117,16 +118,7 @@ struct GPULaneIdOpToROCDL : ConvertOpToLLVMPattern<gpu::LaneIdOp> {
rewriter.create<ROCDL::MbcntLoOp>(loc, intTy, ValueRange{minus1, zero});
Value laneId = rewriter.create<ROCDL::MbcntHiOp>(
loc, intTy, ValueRange{minus1, mbcntLo});
- // Truncate or extend the result depending on the index bitwidth specified
- // by the LLVMTypeConverter options.
- const unsigned indexBitwidth = getTypeConverter()->getIndexTypeBitwidth();
- if (indexBitwidth > 32) {
- laneId = rewriter.create<LLVM::SExtOp>(
- loc, IntegerType::get(context, indexBitwidth), laneId);
- } else if (indexBitwidth < 32) {
- laneId = rewriter.create<LLVM::TruncOp>(
- loc, IntegerType::get(context, indexBitwidth), laneId);
- }
+ laneId = truncOrExtToLLVMType(rewriter, loc, laneId, *getTypeConverter());
rewriter.replaceOp(op, {laneId});
return success();
}
@@ -150,11 +142,11 @@ struct GPUSubgroupSizeOpToROCDL : ConvertOpToLLVMPattern<gpu::SubgroupSizeOp> {
/*bitWidth=*/32, /*lower=*/isBeforeGfx10 ? 64 : 32,
/*upper=*/op.getUpperBoundAttr().getInt() + 1);
}
- Value wavefrontOp = rewriter.create<ROCDL::WavefrontSizeOp>(
+ Value wavefrontSizeOp = rewriter.create<ROCDL::WavefrontSizeOp>(
op.getLoc(), rewriter.getI32Type(), bounds);
- wavefrontOp = truncOrExtToLLVMType(rewriter, op.getLoc(), wavefrontOp,
- *getTypeConverter());
- rewriter.replaceOp(op, {wavefrontOp});
+ wavefrontSizeOp = truncOrExtToLLVMType(
+ rewriter, op.getLoc(), wavefrontSizeOp, *getTypeConverter());
+ rewriter.replaceOp(op, {wavefrontSizeOp});
return success();
}
@@ -239,6 +231,65 @@ struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
}
};
+struct GPUSubgroupIdOpToROCDL final
+ : ConvertOpToLLVMPattern<gpu::SubgroupIdOp> {
+ using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;
+
+ LogicalResult
+ matchAndRewrite(gpu::SubgroupIdOp op, gpu::SubgroupIdOp::Adaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ // Calculation of the thread's subgroup identifier.
+ //
+ // The process involves mapping the thread's 3D identifier within its
+ // workgroup/block (w_id.x, w_id.y, w_id.z) to a 1D linear index.
+ // This linearization assumes a layout where the x-dimension (w_dim.x)
+ // varies most rapidly (i.e., it is the innermost dimension).
+ //
+ // The formula for the linearized thread index is:
+ // L = w_id.x + w_dim.x * (w_id.y + (w_dim.y * w_id.z))
+ //
+ // Subsequently, the range of linearized indices [0, N_threads-1] is
+ // divided into consecutive, non-overlapping segments, each representing
+ // a subgroup of size 'subgroup_size'.
+ //
+ // Example Partitioning (N = subgroup_size):
+ // | Subgroup 0 | Subgroup 1 | Subgroup 2 | ... |
+ // | Indices 0..N-1 | Indices N..2N-1 | Indices 2N..3N-1| ... |
+ //
+ // The subgroup identifier is obtained via integer division of the
+ // linearized thread index by the predefined 'subgroup_size'.
+ //
+ // subgroup_id = floor( L / subgroup_size )
+ // = (w_id.x + w_dim.x * (w_id.y + w_dim.y * w_id.z)) /
+ // subgroup_size
+ auto int32Type = IntegerType::get(rewriter.getContext(), 32);
+ Location loc = op.getLoc();
+ LLVM::IntegerOverflowFlags flags =
+ LLVM::IntegerOverflowFlags::nsw | LLVM::IntegerOverflowFlags::nuw;
+ Value workitemIdX = rewriter.create<ROCDL::ThreadIdXOp>(loc, int32Type);
+ Value workitemIdY = rewriter.create<ROCDL::ThreadIdYOp>(loc, int32Type);
+ Value workitemIdZ = rewriter.create<ROCDL::ThreadIdZOp>(loc, int32Type);
+ Value workitemDimX = rewriter.create<ROCDL::BlockDimXOp>(loc, int32Type);
+ Value workitemDimY = rewriter.create<ROCDL::BlockDimYOp>(loc, int32Type);
+ Value dimYxIdZ = rewriter.create<LLVM::MulOp>(loc, int32Type, workitemDimY,
+ workitemIdZ, flags);
+ Value dimYxIdZPlusIdY = rewriter.create<LLVM::AddOp>(
+ loc, int32Type, dimYxIdZ, workitemIdY, flags);
+ Value dimYxIdZPlusIdYTimesDimX = rewriter.create<LLVM::MulOp>(
+ loc, int32Type, workitemDimX, dimYxIdZPlusIdY, flags);
+ Value workitemIdXPlusDimYxIdZPlusIdYTimesDimX =
+ rewriter.create<LLVM::AddOp>(loc, int32Type, workitemIdX,
+ dimYxIdZPlusIdYTimesDimX, flags);
+ Value subgroupSize = rewriter.create<ROCDL::WavefrontSizeOp>(
+ loc, rewriter.getI32Type(), nullptr);
+ Value waveIdOp = rewriter.create<LLVM::UDivOp>(
+ loc, workitemIdXPlusDimYxIdZPlusIdYTimesDimX, subgroupSize);
+ rewriter.replaceOp(op, {truncOrExtToLLVMType(rewriter, loc, waveIdOp,
+ *getTypeConverter())});
+ return success();
+ }
+};
+
/// Import the GPU Ops to ROCDL Patterns.
#include "GPUToROCDL.cpp.inc"
@@ -249,19 +300,7 @@ struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
// code.
struct LowerGpuOpsToROCDLOpsPass final
: public impl::ConvertGpuOpsToROCDLOpsBase<LowerGpuOpsToROCDLOpsPass> {
- LowerGpuOpsToROCDLOpsPass() = default;
- LowerGpuOpsToROCDLOpsPass(const std::string &chipset, unsigned indexBitwidth,
- bool useBarePtrCallConv,
- gpu::amd::Runtime runtime) {
- if (this->chipset.getNumOccurrences() == 0)
- this->chipset = chipset;
- if (this->indexBitwidth.getNumOccurrences() == 0)
- this->indexBitwidth = indexBitwidth;
- if (this->useBarePtrCallConv.getNumOccurrences() == 0)
- this->useBarePtrCallConv = useBarePtrCallConv;
- if (this->runtime.getNumOccurrences() == 0)
- this->runtime = runtime;
- }
+ using Base::Base;
void getDependentDialects(DialectRegistry ®istry) const override {
Base::getDependentDialects(registry);
@@ -455,17 +494,15 @@ void mlir::populateGpuToROCDLConversionPatterns(
// TODO: Add alignment for workgroup memory
patterns.add<GPUDynamicSharedMemoryOpLowering>(converter);
- patterns.add<GPUShuffleOpLowering, GPULaneIdOpToROCDL>(converter);
+ patterns
+ .add<GPUShuffleOpLowering, GPULaneIdOpToROCDL, GPUSubgroupIdOpToROCDL>(
+ converter);
patterns.add<GPUSubgroupSizeOpToROCDL>(converter, chipset);
populateMathToROCDLConversionPatterns(converter, patterns);
}
std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
-mlir::createLowerGpuOpsToROCDLOpsPass(const std::string &chipset,
- unsigned indexBitwidth,
- bool useBarePtrCallConv,
- gpu::amd::Runtime runtime) {
- return std::make_unique<LowerGpuOpsToROCDLOpsPass>(
- chipset, indexBitwidth, useBarePtrCallConv, runtime);
+mlir::createLowerGpuOpsToROCDLOpsPass() {
+ return std::make_unique<LowerGpuOpsToROCDLOpsPass>();
}
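For readers following the arithmetic in the GPUSubgroupIdOpToROCDL comment above, here is a minimal standalone C++ sketch of the same computation. It is not part of the patch; the helper name and the 32x4x2 workgroup / wave64 numbers are made up for illustration.

#include <cstdio>

// Scalar model of the emitted IR:
//   L = w_id.x + w_dim.x * (w_id.y + w_dim.y * w_id.z)
//   subgroup_id = L / subgroup_size   (the llvm.udiv above)
static unsigned subgroupId(unsigned idX, unsigned idY, unsigned idZ,
                           unsigned dimX, unsigned dimY,
                           unsigned subgroupSize) {
  unsigned linear = idX + dimX * (idY + dimY * idZ);
  return linear / subgroupSize;
}

int main() {
  // Thread (5, 3, 1) in a hypothetical 32x4x2 workgroup on a wave64 target:
  // L = 5 + 32 * (3 + 4 * 1) = 229, so subgroup_id = 229 / 64 = 3.
  std::printf("%u\n", subgroupId(5, 3, 1, 32, 4, 64)); // prints 3
  return 0;
}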
diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
index d28aa9e34c22a..e8868aeda4dcb 100644
--- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
@@ -763,3 +763,25 @@ gpu.module @test_module {
gpu.module @test_custom_data_layout attributes {llvm.data_layout = "e"} {
}
+
+// -----
+
+gpu.module @test_module {
+ // CHECK-LABEL: func @gpu_subgroup_id()
+ func.func @gpu_subgroup_id() -> (index) {
+ // CHECK: %[[widx:.*]] = rocdl.workitem.id.x : i32
+ // CHECK: %[[widy:.*]] = rocdl.workitem.id.y : i32
+ // CHECK: %[[widz:.*]] = rocdl.workitem.id.z : i32
+ // CHECK: %[[dimx:.*]] = rocdl.workgroup.dim.x : i32
+ // CHECK: %[[dimy:.*]] = rocdl.workgroup.dim.y : i32
+ // CHECK: %[[int5:.*]] = llvm.mul %[[dimy]], %[[widz]] overflow<nsw, nuw> : i32
+ // CHECK: %[[int6:.*]] = llvm.add %[[int5]], %[[widy]] overflow<nsw, nuw> : i32
+ // CHECK: %[[int7:.*]] = llvm.mul %[[dimx]], %[[int6]] overflow<nsw, nuw> : i32
+ // CHECK: %[[int8:.*]] = llvm.add %[[widx]], %[[int7]] overflow<nsw, nuw> : i32
+ // CHECK: %[[wavefrontsize:.*]] = rocdl.wavefrontsize : i32
+ // CHECK: %[[result:.*]] = llvm.udiv %[[int8]], %[[wavefrontsize]] : i32
+ // CHECK: = llvm.sext %[[result]] : i32 to i64
+ %subgroupId = gpu.subgroup_id : index
+ func.return %subgroupId : index
+ }
+}
From 293f6245a5bfe467fcfec91f1f98142ac4c5a09a Mon Sep 17 00:00:00 2001
From: Alan Li <me at alanli.org>
Date: Mon, 28 Apr 2025 09:40:50 -0400
Subject: [PATCH 2/2] Another update
---
mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h | 1 -
mlir/include/mlir/Conversion/Passes.td | 4 ----
mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp | 3 +--
3 files changed, 1 insertion(+), 7 deletions(-)
diff --git a/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h b/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h
index 1b265ecfc48e7..b6d051cd0af96 100644
--- a/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h
+++ b/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h
@@ -10,7 +10,6 @@
#include "mlir/Conversion/GPUToROCDL/Runtimes.h"
#include "mlir/Conversion/LLVMCommon/LoweringOptions.h"
-#include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
#include <memory>
namespace mlir {
diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td
index a558aeffba5e4..bbba495e613b2 100644
--- a/mlir/include/mlir/Conversion/Passes.td
+++ b/mlir/include/mlir/Conversion/Passes.td
@@ -608,10 +608,6 @@ def ConvertGpuOpsToROCDLOps : Pass<"convert-gpu-to-rocdl", "gpu::GPUModuleOp"> {
clEnumValN(::mlir::gpu::amd::Runtime::HIP, "HIP", "HIP"),
clEnumValN(::mlir::gpu::amd::Runtime::OpenCL, "OpenCL",
"OpenCL"))}]>,
- Option<"subgroupSize", "subgroup-size", "unsigned",
- "0",
- "specify subgroup size for the kernel, if left empty, the default "
- "value will be decided by the target chipset.">,
ListOption<"allowedDialects", "allowed-dialects", "std::string",
"Run conversion patterns of only the specified dialects">,
];
diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index 932063aa109ba..7e96fd24c3e2f 100644
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -90,7 +90,6 @@ static Value truncOrExtToLLVMType(ConversionPatternRewriter &rewriter,
int64_t indexBitwidth = converter.getIndexTypeBitwidth();
auto indexBitwidthType =
IntegerType::get(rewriter.getContext(), converter.getIndexTypeBitwidth());
- // TODO: use <=> in C++20.
if (indexBitwidth > intWidth) {
return rewriter.create<LLVM::SExtOp>(loc, indexBitwidthType, value);
}
@@ -281,7 +280,7 @@ struct GPUSubgroupIdOpToROCDL final
rewriter.create<LLVM::AddOp>(loc, int32Type, workitemIdX,
dimYxIdZPlusIdYTimesDimX, flags);
Value subgroupSize = rewriter.create<ROCDL::WavefrontSizeOp>(
- loc, rewriter.getI32Type(), nullptr);
+ loc, rewriter.getI32Type(), /*upper_bound = */ nullptr);
Value waveIdOp = rewriter.create<LLVM::UDivOp>(
loc, workitemIdXPlusDimYxIdZPlusIdYTimesDimX, subgroupSize);
rewriter.replaceOp(op, {truncOrExtToLLVMType(rewriter, loc, waveIdOp,