[Mlir-commits] [mlir] 6e47937 - [MLIR][ROCDL] Lower `gpu.subgroup_size` to `wavefrontsize` (#137360)
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Fri Apr 25 16:21:18 PDT 2025
Author: Alan Li
Date: 2025-04-25T19:21:15-04:00
New Revision: 6e47937eed35caecd80ff24cba9bb26259b7e8c1
URL: https://github.com/llvm/llvm-project/commit/6e47937eed35caecd80ff24cba9bb26259b7e8c1
DIFF: https://github.com/llvm/llvm-project/commit/6e47937eed35caecd80ff24cba9bb26259b7e8c1.diff
LOG: [MLIR][ROCDL] Lower `gpu.subgroup_size` to `wavefrontsize` (#137360)
Added:
Modified:
mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h
mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
mlir/test/Target/LLVMIR/rocdl.mlir
Removed:
################################################################################
diff --git a/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h b/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h
index 1a917932a9a84..291b809071ce9 100644
--- a/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h
+++ b/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h
@@ -20,6 +20,10 @@ class RewritePatternSet;
template <typename OpT>
class OperationPass;
+namespace amdgpu {
+struct Chipset;
+} // namespace amdgpu
+
namespace gpu {
class GPUModuleOp;
} // namespace gpu
@@ -32,7 +36,8 @@ class GPUModuleOp;
/// The resulting pattern set should be run over a gpu.module op
void populateGpuToROCDLConversionPatterns(const LLVMTypeConverter &converter,
RewritePatternSet &patterns,
- gpu::amd::Runtime runtime);
+ gpu::amd::Runtime runtime,
+ amdgpu::Chipset chipset);
/// Configure target to convert from the GPU dialect to ROCDL.
void configureGpuToROCDLConversionLegality(ConversionTarget &target);
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index 186a4f53f93cb..93e59e0e7e6be 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -216,6 +216,8 @@ def ROCDL_BlockIdXOp : ROCDL_SpecialIdRegisterOp<"workgroup.id.x">;
def ROCDL_BlockIdYOp : ROCDL_SpecialIdRegisterOp<"workgroup.id.y">;
def ROCDL_BlockIdZOp : ROCDL_SpecialIdRegisterOp<"workgroup.id.z">;
+def ROCDL_WavefrontSizeOp : ROCDL_SpecialIdRegisterOp<"wavefrontsize">;
+
//===----------------------------------------------------------------------===//
// Thread range and Block range
//===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index 8002d08fdbd27..40e27caedaf89 100644
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -52,6 +52,25 @@ namespace mlir {
using namespace mlir;
+/// Sign-extends or truncates the integer `value` to the index bitwidth
+/// configured on `converter`. Returns `value` unchanged when the widths
+/// already match, so no extra operation is created in that case.
+static Value truncOrExtToLLVMType(ConversionPatternRewriter &rewriter,
+                                  Location loc, Value value,
+                                  const LLVMTypeConverter &converter) {
+  // `value` is cast to IntegerType, so it must already be an integer.
+  const unsigned intWidth = cast<IntegerType>(value.getType()).getWidth();
+  const unsigned indexBitwidth = converter.getIndexTypeBitwidth();
+  if (indexBitwidth == intWidth)
+    return value;
+  // Build the target type only once a conversion is known to be needed.
+  auto indexBitwidthType =
+      IntegerType::get(rewriter.getContext(), indexBitwidth);
+  if (indexBitwidth > intWidth)
+    return rewriter.create<LLVM::SExtOp>(loc, indexBitwidthType, value);
+  return rewriter.create<LLVM::TruncOp>(loc, indexBitwidthType, value);
+}
+
/// Returns true if the given `gpu.func` can be safely called using the bare
/// pointer calling convention.
static bool canBeCalledWithBarePointers(gpu::GPUFuncOp func) {
@@ -113,6 +132,35 @@ struct GPULaneIdOpToROCDL : ConvertOpToLLVMPattern<gpu::LaneIdOp> {
}
};
+/// Lowers `gpu.subgroup_size` to `rocdl.wavefrontsize` at the index bitwidth.
+struct GPUSubgroupSizeOpToROCDL : ConvertOpToLLVMPattern<gpu::SubgroupSizeOp> {
+  using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;
+
+  GPUSubgroupSizeOpToROCDL(const LLVMTypeConverter &converter,
+                           amdgpu::Chipset chipset)
+      : ConvertOpToLLVMPattern<gpu::SubgroupSizeOp>(converter),
+        chipset(chipset) {}
+
+  LogicalResult
+  matchAndRewrite(gpu::SubgroupSizeOp op, gpu::SubgroupSizeOp::Adaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    Location loc = op.getLoc();
+    // Only a bounded op gets a range: [64 pre-gfx10 else 32, upper_bound + 1).
+    LLVM::ConstantRangeAttr bounds = nullptr;
+    if (auto upperBoundAttr = op.getUpperBoundAttr())
+      bounds = rewriter.getAttr<LLVM::ConstantRangeAttr>(
+          /*bitWidth=*/32, /*lower=*/chipset.majorVersion < 10 ? 64 : 32,
+          /*upper=*/upperBoundAttr.getInt() + 1);
+    Value size = rewriter.create<ROCDL::WavefrontSizeOp>(
+        loc, rewriter.getI32Type(), bounds);
+    size = truncOrExtToLLVMType(rewriter, loc, size, *getTypeConverter());
+    rewriter.replaceOp(op, {size});
+    return success();
+  }
+
+  const amdgpu::Chipset chipset;
+};
+
struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
using ConvertOpToLLVMPattern<gpu::ShuffleOp>::ConvertOpToLLVMPattern;
@@ -322,7 +370,8 @@ struct LowerGpuOpsToROCDLOpsPass final
populateAMDGPUToROCDLConversionPatterns(converter, llvmPatterns,
*maybeChipset);
- populateGpuToROCDLConversionPatterns(converter, llvmPatterns, runtime);
+ populateGpuToROCDLConversionPatterns(converter, llvmPatterns, runtime,
+ *maybeChipset);
configureGpuToROCDLConversionLegality(target);
if (failed(applyPartialConversion(m, target, std::move(llvmPatterns))))
signalPassFailure();
@@ -370,7 +419,7 @@ void mlir::configureGpuToROCDLConversionLegality(ConversionTarget &target) {
void mlir::populateGpuToROCDLConversionPatterns(
const LLVMTypeConverter &converter, RewritePatternSet &patterns,
- mlir::gpu::amd::Runtime runtime) {
+ mlir::gpu::amd::Runtime runtime, amdgpu::Chipset chipset) {
using gpu::index_lowering::IndexKind;
using gpu::index_lowering::IntrType;
using mlir::gpu::amd::Runtime;
@@ -408,7 +457,10 @@ void mlir::populateGpuToROCDLConversionPatterns(
// TODO: Add alignment for workgroup memory
patterns.add<GPUDynamicSharedMemoryOpLowering>(converter);
- patterns.add<GPUShuffleOpLowering, GPULaneIdOpToROCDL>(converter);
+  patterns.add<GPUShuffleOpLowering, GPULaneIdOpToROCDL>(converter);
+  // GPUSubgroupSizeOpToROCDL reads the chipset, so register it exactly once,
+  // through the constructor that receives `chipset`.
+  patterns.add<GPUSubgroupSizeOpToROCDL>(converter, chipset);
populateMathToROCDLConversionPatterns(converter, patterns);
}
diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
index 999b383a3b8db..d28aa9e34c22a 100644
--- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
@@ -11,7 +11,7 @@ gpu.module @test_module {
func.func @gpu_index_ops()
-> (index, index, index, index, index, index,
index, index, index, index, index, index,
- index) {
+ index, index, index) {
// CHECK32-NOT: = llvm.sext %{{.*}} : i32 to i64
// CHECK: rocdl.workitem.id.x : i32
@@ -59,12 +59,20 @@ gpu.module @test_module {
// CHECK: = llvm.sext %{{.*}} : i32 to i64
%laneId = gpu.lane_id
+ // CHECK: = rocdl.wavefrontsize : i32
+ // CHECK: = llvm.sext %{{.*}} : i32 to i64
+ %subgroupSize = gpu.subgroup_size : index
+
+ // CHECK: = rocdl.wavefrontsize range <i32, 64, 65> : i32
+ // CHECK: = llvm.sext %{{.*}} : i32 to i64
+ %subgroupSize2 = gpu.subgroup_size upper_bound 64 : index
+
func.return %tIdX, %tIdY, %tIdZ, %bDimX, %bDimY, %bDimZ,
%bIdX, %bIdY, %bIdZ, %gDimX, %gDimY, %gDimZ,
- %laneId
+ %laneId, %subgroupSize, %subgroupSize2
: index, index, index, index, index, index,
index, index, index, index, index, index,
- index
+ index, index, index
}
}
diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir
index 3db1f7b2b6427..af47582dd0bfb 100644
--- a/mlir/test/Target/LLVMIR/rocdl.mlir
+++ b/mlir/test/Target/LLVMIR/rocdl.mlir
@@ -32,6 +32,13 @@ llvm.func @rocdl_special_regs() -> i32 {
// CHECK: call range(i64 1, 65) i64 @__ockl_get_local_size(i32 0)
%14 = rocdl.workgroup.dim.x range <i32, 1, 65> : i64
+
+ // CHECK: call i32 @llvm.amdgcn.wavefrontsize()
+ %15 = rocdl.wavefrontsize : i32
+
+ // CHECK: call range(i32 32, 65) i32 @llvm.amdgcn.wavefrontsize()
+ %16 = rocdl.wavefrontsize range <i32, 32, 65> : i32
+
llvm.return %1 : i32
}
More information about the Mlir-commits
mailing list