[Mlir-commits] [mlir] [MLIR][ROCDL] Lower `gpu.subgroup_size` to `wavefrontsize` (PR #137360)
Alan Li
llvmlistbot at llvm.org
Fri Apr 25 13:20:27 PDT 2025
https://github.com/lialan updated https://github.com/llvm/llvm-project/pull/137360
>From de4364f23cb912a66505fb86c5de9539b2af3d7a Mon Sep 17 00:00:00 2001
From: Alan Li <me at alanli.org>
Date: Fri, 25 Apr 2025 12:37:59 -0400
Subject: [PATCH 1/3] [MLIR][ROCDL] Lower `gpu.subgroup_size` to `wavefrontsize`
---
mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td | 2 +
.../GPUToROCDL/LowerGpuOpsToROCDLOps.cpp | 37 ++++++++++++++++++-
.../Conversion/GPUToROCDL/gpu-to-rocdl.mlir | 10 +++--
mlir/test/Target/LLVMIR/rocdl.mlir | 4 ++
4 files changed, 49 insertions(+), 4 deletions(-)
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index 186a4f53f93cb..93e59e0e7e6be 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -216,6 +216,8 @@ def ROCDL_BlockIdXOp : ROCDL_SpecialIdRegisterOp<"workgroup.id.x">;
def ROCDL_BlockIdYOp : ROCDL_SpecialIdRegisterOp<"workgroup.id.y">;
def ROCDL_BlockIdZOp : ROCDL_SpecialIdRegisterOp<"workgroup.id.z">;
+def ROCDL_WavefrontSizeOp : ROCDL_SpecialIdRegisterOp<"wavefrontsize">;
+
//===----------------------------------------------------------------------===//
// Thread range and Block range
//===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index e6dd6f135884e..d17fb4716d331 100644
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -52,6 +52,25 @@ namespace mlir {
using namespace mlir;
+// Truncate or extend the result depending on the index bitwidth specified
+// by the LLVMTypeConverter options.
+static Value truncOrExtToLLVMType(ConversionPatternRewriter &rewriter,
+ Location loc, Value value,
+ const LLVMTypeConverter &converter) {
+ int64_t intWidth = cast<IntegerType>(value.getType()).getWidth();
+ int64_t indexBitwidth = converter.getIndexTypeBitwidth();
+ auto indexBitwidthType =
+ IntegerType::get(rewriter.getContext(), converter.getIndexTypeBitwidth());
+ // TODO: use <=> in C++20.
+ if (indexBitwidth > intWidth) {
+ return rewriter.create<LLVM::SExtOp>(loc, indexBitwidthType, value);
+ }
+ if (indexBitwidth < intWidth) {
+ return rewriter.create<LLVM::TruncOp>(loc, indexBitwidthType, value);
+ }
+ return value;
+}
+
/// Returns true if the given `gpu.func` can be safely called using the bare
/// pointer calling convention.
static bool canBeCalledWithBarePointers(gpu::GPUFuncOp func) {
@@ -113,6 +132,20 @@ struct GPULaneIdOpToROCDL : ConvertOpToLLVMPattern<gpu::LaneIdOp> {
}
};
+struct GPUSubgroupSizeOpToROCDL : ConvertOpToLLVMPattern<gpu::SubgroupSizeOp> {
+ using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;
+ LogicalResult
+ matchAndRewrite(gpu::SubgroupSizeOp op, gpu::SubgroupSizeOp::Adaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ Value wavefrontOp = rewriter.create<ROCDL::WavefrontSizeOp>(
+ op.getLoc(), IntegerType::get(rewriter.getContext(), 32));
+ wavefrontOp = truncOrExtToLLVMType(rewriter, op.getLoc(), wavefrontOp,
+ *getTypeConverter());
+ rewriter.replaceOp(op, {wavefrontOp});
+ return success();
+ }
+};
+
struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
using ConvertOpToLLVMPattern<gpu::ShuffleOp>::ConvertOpToLLVMPattern;
@@ -405,7 +438,9 @@ void mlir::populateGpuToROCDLConversionPatterns(
// TODO: Add alignment for workgroup memory
patterns.add<GPUDynamicSharedMemoryOpLowering>(converter);
- patterns.add<GPUShuffleOpLowering, GPULaneIdOpToROCDL>(converter);
+ patterns
+ .add<GPUShuffleOpLowering, GPULaneIdOpToROCDL, GPUSubgroupSizeOpToROCDL>(
+ converter);
populateMathToROCDLConversionPatterns(converter, patterns);
}
diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
index 071cae9d5789f..5e3cad0cf26b0 100644
--- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
@@ -11,7 +11,7 @@ gpu.module @test_module {
func.func @gpu_index_ops()
-> (index, index, index, index, index, index,
index, index, index, index, index, index,
- index) {
+ index, index) {
// CHECK32-NOT: = llvm.sext %{{.*}} : i32 to i64
// CHECK: rocdl.workitem.id.x : i32
@@ -59,12 +59,16 @@ gpu.module @test_module {
// CHECK: = llvm.sext %{{.*}} : i32 to i64
%laneId = gpu.lane_id
+ // CHECK: = rocdl.wavefrontsize : i32
+ // CHECK: = llvm.sext %{{.*}} : i32 to i64
+ %subgroupSize = gpu.subgroup_size : index
+
func.return %tIdX, %tIdY, %tIdZ, %bDimX, %bDimY, %bDimZ,
%bIdX, %bIdY, %bIdZ, %gDimX, %gDimY, %gDimZ,
- %laneId
+ %laneId, %subgroupSize
: index, index, index, index, index, index,
index, index, index, index, index, index,
- index
+ index, index
}
}
diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir
index 3db1f7b2b6427..3a0d3943fe207 100644
--- a/mlir/test/Target/LLVMIR/rocdl.mlir
+++ b/mlir/test/Target/LLVMIR/rocdl.mlir
@@ -32,6 +32,10 @@ llvm.func @rocdl_special_regs() -> i32 {
// CHECK: call range(i64 1, 65) i64 @__ockl_get_local_size(i32 0)
%14 = rocdl.workgroup.dim.x range <i32, 1, 65> : i64
+
+ // CHECK: call i32 $llvm.amdgcn.wavefrontsize()
+ %15 = rocdl.wavefrontsize : i32
+
llvm.return %1 : i32
}
>From 1c75a81472535e0b27706df898d6799de32734a9 Mon Sep 17 00:00:00 2001
From: Alan Li <me at alanli.org>
Date: Fri, 25 Apr 2025 16:06:04 -0400
Subject: [PATCH 2/3] updates
---
mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td | 5 ++++-
.../Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp | 2 +-
mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir | 10 +++++++---
mlir/test/Target/LLVMIR/rocdl.mlir | 5 ++++-
4 files changed, 16 insertions(+), 6 deletions(-)
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index 93e59e0e7e6be..3511f71b32866 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -216,7 +216,10 @@ def ROCDL_BlockIdXOp : ROCDL_SpecialIdRegisterOp<"workgroup.id.x">;
def ROCDL_BlockIdYOp : ROCDL_SpecialIdRegisterOp<"workgroup.id.y">;
def ROCDL_BlockIdZOp : ROCDL_SpecialIdRegisterOp<"workgroup.id.z">;
-def ROCDL_WavefrontSizeOp : ROCDL_SpecialIdRegisterOp<"wavefrontsize">;
+def ROCDL_WavefrontSizeOp : ROCDL_IntrPure1Op<"wavefrontsize">,
+ Arguments<(ins OptionalAttr<IndexAttr>:$upper_bound)> {
+ let assemblyFormat = "(`upper_bound` $upper_bound^)? attr-dict `:` type($res)";
+}
//===----------------------------------------------------------------------===//
// Thread range and Block range
diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index d17fb4716d331..e196aa17d61c2 100644
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -138,7 +138,7 @@ struct GPUSubgroupSizeOpToROCDL : ConvertOpToLLVMPattern<gpu::SubgroupSizeOp> {
matchAndRewrite(gpu::SubgroupSizeOp op, gpu::SubgroupSizeOp::Adaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
Value wavefrontOp = rewriter.create<ROCDL::WavefrontSizeOp>(
- op.getLoc(), IntegerType::get(rewriter.getContext(), 32));
+ op.getLoc(), rewriter.getI32Type(), op.getUpperBoundAttr());
wavefrontOp = truncOrExtToLLVMType(rewriter, op.getLoc(), wavefrontOp,
*getTypeConverter());
rewriter.replaceOp(op, {wavefrontOp});
diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
index 5e3cad0cf26b0..3ed291ce11c4e 100644
--- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
@@ -11,7 +11,7 @@ gpu.module @test_module {
func.func @gpu_index_ops()
-> (index, index, index, index, index, index,
index, index, index, index, index, index,
- index, index) {
+ index, index, index) {
// CHECK32-NOT: = llvm.sext %{{.*}} : i32 to i64
// CHECK: rocdl.workitem.id.x : i32
@@ -63,12 +63,16 @@ gpu.module @test_module {
// CHECK: = llvm.sext %{{.*}} : i32 to i64
%subgroupSize = gpu.subgroup_size : index
+ // CHECK: = rocdl.wavefrontsize upper_bound 64 : i32
+ // CHECK: = llvm.sext %{{.*}} : i32 to i64
+ %subgroupSize2 = gpu.subgroup_size upper_bound 64 : index
+
func.return %tIdX, %tIdY, %tIdZ, %bDimX, %bDimY, %bDimZ,
%bIdX, %bIdY, %bIdZ, %gDimX, %gDimY, %gDimZ,
- %laneId, %subgroupSize
+ %laneId, %subgroupSize, %subgroupSize2
: index, index, index, index, index, index,
index, index, index, index, index, index,
- index, index
+ index, index, index
}
}
diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir
index 3a0d3943fe207..66be3dea66630 100644
--- a/mlir/test/Target/LLVMIR/rocdl.mlir
+++ b/mlir/test/Target/LLVMIR/rocdl.mlir
@@ -33,9 +33,12 @@ llvm.func @rocdl_special_regs() -> i32 {
// CHECK: call range(i64 1, 65) i64 @__ockl_get_local_size(i32 0)
%14 = rocdl.workgroup.dim.x range <i32, 1, 65> : i64
- // CHECK: call i32 $llvm.amdgcn.wavefrontsize()
+ // CHECK: call i32 @llvm.amdgcn.wavefrontsize()
%15 = rocdl.wavefrontsize : i32
+ // CHECK: call i32 @llvm.amdgcn.wavefrontsize()
+ %16 = rocdl.wavefrontsize upper_bound 32 : i32
+
llvm.return %1 : i32
}
>From a29668da34fd9a5a0546c615c062ba277276d951 Mon Sep 17 00:00:00 2001
From: Alan Li <me at alanli.org>
Date: Fri, 25 Apr 2025 16:20:07 -0400
Subject: [PATCH 3/3] Use range instead of single upper_bound
---
mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td | 5 +----
mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp | 8 +++++++-
mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir | 2 +-
mlir/test/Target/LLVMIR/rocdl.mlir | 4 ++--
4 files changed, 11 insertions(+), 8 deletions(-)
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index 3511f71b32866..93e59e0e7e6be 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -216,10 +216,7 @@ def ROCDL_BlockIdXOp : ROCDL_SpecialIdRegisterOp<"workgroup.id.x">;
def ROCDL_BlockIdYOp : ROCDL_SpecialIdRegisterOp<"workgroup.id.y">;
def ROCDL_BlockIdZOp : ROCDL_SpecialIdRegisterOp<"workgroup.id.z">;
-def ROCDL_WavefrontSizeOp : ROCDL_IntrPure1Op<"wavefrontsize">,
- Arguments<(ins OptionalAttr<IndexAttr>:$upper_bound)> {
- let assemblyFormat = "(`upper_bound` $upper_bound^)? attr-dict `:` type($res)";
-}
+def ROCDL_WavefrontSizeOp : ROCDL_SpecialIdRegisterOp<"wavefrontsize">;
//===----------------------------------------------------------------------===//
// Thread range and Block range
diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index e196aa17d61c2..c328ff96feb4e 100644
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -137,8 +137,14 @@ struct GPUSubgroupSizeOpToROCDL : ConvertOpToLLVMPattern<gpu::SubgroupSizeOp> {
LogicalResult
matchAndRewrite(gpu::SubgroupSizeOp op, gpu::SubgroupSizeOp::Adaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
+ LLVM::ConstantRangeAttr bounds = nullptr;
+ if (auto upperBoundAttr = op.getUpperBoundAttr()) {
+ bounds = rewriter.getAttr<LLVM::ConstantRangeAttr>(
+ /*bitWidth=*/32, /*lower=*/32,
+ /*upper=*/op.getUpperBoundAttr().getInt());
+ }
Value wavefrontOp = rewriter.create<ROCDL::WavefrontSizeOp>(
- op.getLoc(), rewriter.getI32Type(), op.getUpperBoundAttr());
+ op.getLoc(), rewriter.getI32Type(), bounds);
wavefrontOp = truncOrExtToLLVMType(rewriter, op.getLoc(), wavefrontOp,
*getTypeConverter());
rewriter.replaceOp(op, {wavefrontOp});
diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
index 3ed291ce11c4e..640df84dcba8a 100644
--- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
@@ -63,7 +63,7 @@ gpu.module @test_module {
// CHECK: = llvm.sext %{{.*}} : i32 to i64
%subgroupSize = gpu.subgroup_size : index
- // CHECK: = rocdl.wavefrontsize upper_bound 64 : i32
+ // CHECK: = rocdl.wavefrontsize range <i32, 32, 64> : i32
// CHECK: = llvm.sext %{{.*}} : i32 to i64
%subgroupSize2 = gpu.subgroup_size upper_bound 64 : index
diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir
index 66be3dea66630..663f115a7c5ce 100644
--- a/mlir/test/Target/LLVMIR/rocdl.mlir
+++ b/mlir/test/Target/LLVMIR/rocdl.mlir
@@ -36,8 +36,8 @@ llvm.func @rocdl_special_regs() -> i32 {
// CHECK: call i32 @llvm.amdgcn.wavefrontsize()
%15 = rocdl.wavefrontsize : i32
- // CHECK: call i32 @llvm.amdgcn.wavefrontsize()
- %16 = rocdl.wavefrontsize upper_bound 32 : i32
+ // CHECK: call range(i32 32, 64) i32 @llvm.amdgcn.wavefrontsize()
+ %16 = rocdl.wavefrontsize range <i32, 32, 64> : i32
llvm.return %1 : i32
}
More information about the Mlir-commits
mailing list