[Mlir-commits] [mlir] [MLIR][ROCDL] Add conversion for gpu.subgroup_id to ROCDL (PR #136405)
Alan Li
llvmlistbot at llvm.org
Fri Apr 25 16:54:25 PDT 2025
https://github.com/lialan updated https://github.com/llvm/llvm-project/pull/136405
>From 4c8b7169f4fefee8ae441bc843f13fc869d2345e Mon Sep 17 00:00:00 2001
From: Alan Li <me at alanli.org>
Date: Sat, 19 Apr 2025 00:51:15 -0400
Subject: [PATCH] [MLIR][ROCDL] Add conversion for gpu.subgroup_id to ROCDL
---
.../Conversion/GPUToROCDL/GPUToROCDLPass.h | 7 +-
mlir/include/mlir/Conversion/Passes.td | 4 +
.../GPUToROCDL/LowerGpuOpsToROCDLOps.cpp | 141 +++++++++++-------
.../Conversion/GPUToROCDL/gpu-to-rocdl.mlir | 22 +++
4 files changed, 116 insertions(+), 58 deletions(-)
diff --git a/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h b/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h
index 291b809071ce9..1b265ecfc48e7 100644
--- a/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h
+++ b/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h
@@ -10,6 +10,7 @@
#include "mlir/Conversion/GPUToROCDL/Runtimes.h"
#include "mlir/Conversion/LLVMCommon/LoweringOptions.h"
+#include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
#include <memory>
namespace mlir {
@@ -46,11 +47,7 @@ void configureGpuToROCDLConversionLegality(ConversionTarget &target);
/// index bitwidth used for the lowering of the device side index computations
/// is configurable.
std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
-createLowerGpuOpsToROCDLOpsPass(
- const std::string &chipset = "gfx900",
- unsigned indexBitwidth = kDeriveIndexBitwidthFromDataLayout,
- bool useBarePtrCallConv = false,
- gpu::amd::Runtime runtime = gpu::amd::Runtime::Unknown);
+createLowerGpuOpsToROCDLOpsPass();
} // namespace mlir
diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td
index bbba495e613b2..a558aeffba5e4 100644
--- a/mlir/include/mlir/Conversion/Passes.td
+++ b/mlir/include/mlir/Conversion/Passes.td
@@ -608,6 +608,10 @@ def ConvertGpuOpsToROCDLOps : Pass<"convert-gpu-to-rocdl", "gpu::GPUModuleOp"> {
clEnumValN(::mlir::gpu::amd::Runtime::HIP, "HIP", "HIP"),
clEnumValN(::mlir::gpu::amd::Runtime::OpenCL, "OpenCL",
"OpenCL"))}]>,
+ Option<"subgroupSize", "subgroup-size", "unsigned",
+ "0",
+ "specify subgroup size for the kernel, if left empty, the default "
+ "value will be decided by the target chipset.">,
ListOption<"allowedDialects", "allowed-dialects", "std::string",
"Run conversion patterns of only the specified dialects">,
];
diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index 0a6fb6451f700..932063aa109ba 100644
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -52,25 +52,6 @@ namespace mlir {
using namespace mlir;
-// Truncate or extend the result depending on the index bitwidth specified
-// by the LLVMTypeConverter options.
-static Value truncOrExtToLLVMType(ConversionPatternRewriter &rewriter,
- Location loc, Value value,
- const LLVMTypeConverter &converter) {
- int64_t intWidth = cast<IntegerType>(value.getType()).getWidth();
- int64_t indexBitwidth = converter.getIndexTypeBitwidth();
- auto indexBitwidthType =
- IntegerType::get(rewriter.getContext(), converter.getIndexTypeBitwidth());
- // TODO: use <=> in C++20.
- if (indexBitwidth > intWidth) {
- return rewriter.create<LLVM::SExtOp>(loc, indexBitwidthType, value);
- }
- if (indexBitwidth < intWidth) {
- return rewriter.create<LLVM::TruncOp>(loc, indexBitwidthType, value);
- }
- return value;
-}
-
/// Returns true if the given `gpu.func` can be safely called using the bare
/// pointer calling convention.
static bool canBeCalledWithBarePointers(gpu::GPUFuncOp func) {
@@ -99,6 +80,26 @@ static constexpr StringLiteral amdgcnDataLayout =
"64-S32-A5-G1-ni:7:8:9";
namespace {
+
+// Truncate or extend the result depending on the index bitwidth specified
+// by the LLVMTypeConverter options.
+static Value truncOrExtToLLVMType(ConversionPatternRewriter &rewriter,
+ Location loc, Value value,
+ const LLVMTypeConverter &converter) {
+ int64_t intWidth = cast<IntegerType>(value.getType()).getWidth();
+ int64_t indexBitwidth = converter.getIndexTypeBitwidth();
+ auto indexBitwidthType =
+ IntegerType::get(rewriter.getContext(), converter.getIndexTypeBitwidth());
+ // TODO: use <=> in C++20.
+ if (indexBitwidth > intWidth) {
+ return rewriter.create<LLVM::SExtOp>(loc, indexBitwidthType, value);
+ }
+ if (indexBitwidth < intWidth) {
+ return rewriter.create<LLVM::TruncOp>(loc, indexBitwidthType, value);
+ }
+ return value;
+}
+
struct GPULaneIdOpToROCDL : ConvertOpToLLVMPattern<gpu::LaneIdOp> {
using ConvertOpToLLVMPattern<gpu::LaneIdOp>::ConvertOpToLLVMPattern;
@@ -117,16 +118,7 @@ struct GPULaneIdOpToROCDL : ConvertOpToLLVMPattern<gpu::LaneIdOp> {
rewriter.create<ROCDL::MbcntLoOp>(loc, intTy, ValueRange{minus1, zero});
Value laneId = rewriter.create<ROCDL::MbcntHiOp>(
loc, intTy, ValueRange{minus1, mbcntLo});
- // Truncate or extend the result depending on the index bitwidth specified
- // by the LLVMTypeConverter options.
- const unsigned indexBitwidth = getTypeConverter()->getIndexTypeBitwidth();
- if (indexBitwidth > 32) {
- laneId = rewriter.create<LLVM::SExtOp>(
- loc, IntegerType::get(context, indexBitwidth), laneId);
- } else if (indexBitwidth < 32) {
- laneId = rewriter.create<LLVM::TruncOp>(
- loc, IntegerType::get(context, indexBitwidth), laneId);
- }
+ laneId = truncOrExtToLLVMType(rewriter, loc, laneId, *getTypeConverter());
rewriter.replaceOp(op, {laneId});
return success();
}
@@ -150,11 +142,11 @@ struct GPUSubgroupSizeOpToROCDL : ConvertOpToLLVMPattern<gpu::SubgroupSizeOp> {
/*bitWidth=*/32, /*lower=*/isBeforeGfx10 ? 64 : 32,
/*upper=*/op.getUpperBoundAttr().getInt() + 1);
}
- Value wavefrontOp = rewriter.create<ROCDL::WavefrontSizeOp>(
+ Value wavefrontSizeOp = rewriter.create<ROCDL::WavefrontSizeOp>(
op.getLoc(), rewriter.getI32Type(), bounds);
- wavefrontOp = truncOrExtToLLVMType(rewriter, op.getLoc(), wavefrontOp,
- *getTypeConverter());
- rewriter.replaceOp(op, {wavefrontOp});
+ wavefrontSizeOp = truncOrExtToLLVMType(
+ rewriter, op.getLoc(), wavefrontSizeOp, *getTypeConverter());
+ rewriter.replaceOp(op, {wavefrontSizeOp});
return success();
}
@@ -239,6 +231,65 @@ struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
}
};
+struct GPUSubgroupIdOpToROCDL final
+ : ConvertOpToLLVMPattern<gpu::SubgroupIdOp> {
+ using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;
+
+ LogicalResult
+ matchAndRewrite(gpu::SubgroupIdOp op, gpu::SubgroupIdOp::Adaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ // Calculation of the thread's subgroup identifier.
+ //
+ // The process involves mapping the thread's 3D identifier within its
+ // workgroup/block (w_id.x, w_id.y, w_id.z) to a 1D linear index.
+ // This linearization assumes a layout where the x-dimension (w_dim.x)
+ // varies most rapidly (i.e., it is the innermost dimension).
+ //
+ // The formula for the linearized thread index is:
+ // L = w_id.x + w_dim.x * (w_id.y + (w_dim.y * w_id.z))
+ //
+ // Subsequently, the range of linearized indices [0, N_threads-1] is
+ // divided into consecutive, non-overlapping segments, each representing
+ // a subgroup of size 'subgroup_size'.
+ //
+ // Example Partitioning (N = subgroup_size):
+ // | Subgroup 0 | Subgroup 1 | Subgroup 2 | ... |
+ // | Indices 0..N-1 | Indices N..2N-1 | Indices 2N..3N-1| ... |
+ //
+ // The subgroup identifier is obtained via integer division of the
+ // linearized thread index by the predefined 'subgroup_size'.
+ //
+ // subgroup_id = floor( L / subgroup_size )
+ // = (w_id.x + w_dim.x * (w_id.y + w_dim.y * w_id.z)) /
+ // subgroup_size
+ auto int32Type = IntegerType::get(rewriter.getContext(), 32);
+ Location loc = op.getLoc();
+ LLVM::IntegerOverflowFlags flags =
+ LLVM::IntegerOverflowFlags::nsw | LLVM::IntegerOverflowFlags::nuw;
+ Value workitemIdX = rewriter.create<ROCDL::ThreadIdXOp>(loc, int32Type);
+ Value workitemIdY = rewriter.create<ROCDL::ThreadIdYOp>(loc, int32Type);
+ Value workitemIdZ = rewriter.create<ROCDL::ThreadIdZOp>(loc, int32Type);
+ Value workitemDimX = rewriter.create<ROCDL::BlockDimXOp>(loc, int32Type);
+ Value workitemDimY = rewriter.create<ROCDL::BlockDimYOp>(loc, int32Type);
+ Value dimYxIdZ = rewriter.create<LLVM::MulOp>(loc, int32Type, workitemDimY,
+ workitemIdZ, flags);
+ Value dimYxIdZPlusIdY = rewriter.create<LLVM::AddOp>(
+ loc, int32Type, dimYxIdZ, workitemIdY, flags);
+ Value dimYxIdZPlusIdYTimesDimX = rewriter.create<LLVM::MulOp>(
+ loc, int32Type, workitemDimX, dimYxIdZPlusIdY, flags);
+ Value workitemIdXPlusDimYxIdZPlusIdYTimesDimX =
+ rewriter.create<LLVM::AddOp>(loc, int32Type, workitemIdX,
+ dimYxIdZPlusIdYTimesDimX, flags);
+ Value subgroupSize = rewriter.create<ROCDL::WavefrontSizeOp>(
+ loc, rewriter.getI32Type(), nullptr);
+ Value waveIdOp = rewriter.create<LLVM::UDivOp>(
+ loc, workitemIdXPlusDimYxIdZPlusIdYTimesDimX, subgroupSize);
+ rewriter.replaceOp(op, {truncOrExtToLLVMType(rewriter, loc, waveIdOp,
+ *getTypeConverter())});
+ return success();
+ }
+};
+
/// Import the GPU Ops to ROCDL Patterns.
#include "GPUToROCDL.cpp.inc"
@@ -249,19 +300,7 @@ struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
// code.
struct LowerGpuOpsToROCDLOpsPass final
: public impl::ConvertGpuOpsToROCDLOpsBase<LowerGpuOpsToROCDLOpsPass> {
- LowerGpuOpsToROCDLOpsPass() = default;
- LowerGpuOpsToROCDLOpsPass(const std::string &chipset, unsigned indexBitwidth,
- bool useBarePtrCallConv,
- gpu::amd::Runtime runtime) {
- if (this->chipset.getNumOccurrences() == 0)
- this->chipset = chipset;
- if (this->indexBitwidth.getNumOccurrences() == 0)
- this->indexBitwidth = indexBitwidth;
- if (this->useBarePtrCallConv.getNumOccurrences() == 0)
- this->useBarePtrCallConv = useBarePtrCallConv;
- if (this->runtime.getNumOccurrences() == 0)
- this->runtime = runtime;
- }
+ using Base::Base;
void getDependentDialects(DialectRegistry &registry) const override {
Base::getDependentDialects(registry);
@@ -456,7 +495,7 @@ void mlir::populateGpuToROCDLConversionPatterns(
patterns.add<GPUDynamicSharedMemoryOpLowering>(converter);
patterns
- .add<GPUShuffleOpLowering, GPULaneIdOpToROCDL, GPUSubgroupSizeOpToROCDL>(
+ .add<GPUShuffleOpLowering, GPULaneIdOpToROCDL, GPUSubgroupIdOpToROCDL>(
converter);
patterns.add<GPUSubgroupSizeOpToROCDL>(converter, chipset);
@@ -464,10 +503,6 @@ void mlir::populateGpuToROCDLConversionPatterns(
}
std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
-mlir::createLowerGpuOpsToROCDLOpsPass(const std::string &chipset,
- unsigned indexBitwidth,
- bool useBarePtrCallConv,
- gpu::amd::Runtime runtime) {
- return std::make_unique<LowerGpuOpsToROCDLOpsPass>(
- chipset, indexBitwidth, useBarePtrCallConv, runtime);
+mlir::createLowerGpuOpsToROCDLOpsPass() {
+ return std::make_unique<LowerGpuOpsToROCDLOpsPass>();
}
diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
index d28aa9e34c22a..e8868aeda4dcb 100644
--- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
@@ -763,3 +763,25 @@ gpu.module @test_module {
gpu.module @test_custom_data_layout attributes {llvm.data_layout = "e"} {
}
+
+// -----
+
+gpu.module @test_module {
+ // CHECK-LABEL: func @gpu_subgroup_id()
+ func.func @gpu_subgroup_id() -> (index) {
+ // CHECK: %[[widx:.*]] = rocdl.workitem.id.x : i32
+ // CHECK: %[[widy:.*]] = rocdl.workitem.id.y : i32
+ // CHECK: %[[widz:.*]] = rocdl.workitem.id.z : i32
+ // CHECK: %[[dimx:.*]] = rocdl.workgroup.dim.x : i32
+ // CHECK: %[[dimy:.*]] = rocdl.workgroup.dim.y : i32
+ // CHECK: %[[int5:.*]] = llvm.mul %[[dimy]], %[[widz]] overflow<nsw, nuw> : i32
+ // CHECK: %[[int6:.*]] = llvm.add %[[int5]], %[[widy]] overflow<nsw, nuw> : i32
+ // CHECK: %[[int7:.*]] = llvm.mul %[[dimx]], %[[int6]] overflow<nsw, nuw> : i32
+ // CHECK: %[[int8:.*]] = llvm.add %[[widx]], %[[int7]] overflow<nsw, nuw> : i32
+ // CHECK: %[[wavefrontsize:.*]] = rocdl.wavefrontsize : i32
+ // CHECK: %[[result:.*]] = llvm.udiv %[[int8]], %[[wavefrontsize]] : i32
+ // CHECK: = llvm.sext %[[result]] : i32 to i64
+ %subgroupId = gpu.subgroup_id : index
+ func.return %subgroupId : index
+ }
+}
More information about the Mlir-commits
mailing list