[Mlir-commits] [mlir] [MLIR][ROCDL] Add conversion for gpu.subgroup_id to ROCDL (PR #136405)
Alan Li
llvmlistbot at llvm.org
Sat Apr 19 20:06:47 PDT 2025
https://github.com/lialan updated https://github.com/llvm/llvm-project/pull/136405
>From d75c2106ae48c3e86a2a572d2efe715c54cbdcfb Mon Sep 17 00:00:00 2001
From: Alan Li <me at alanli.org>
Date: Sat, 19 Apr 2025 00:51:15 -0400
Subject: [PATCH 1/2] [MLIR][ROCDL] Add conversion for gpu.subgroup_id to ROCDL
Creates the `rocdl.wave_id` op, which is translated to the LLVM intrinsic
`llvm.amdgcn.s.get.waveid.in.workgroup` (Clang builtin: `__builtin_amdgcn_s_get_waveid_in_workgroup`).
---
mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td | 8 ++++
.../GPUToROCDL/LowerGpuOpsToROCDLOps.cpp | 48 ++++++++++++++-----
.../Conversion/GPUToROCDL/gpu-to-rocdl.mlir | 10 ++--
mlir/test/Target/LLVMIR/rocdl.mlir | 6 +++
4 files changed, 58 insertions(+), 14 deletions(-)
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index 186a4f53f93cb..09d22da0d4c72 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -204,6 +204,14 @@ def ROCDL_ReadlaneOp : ROCDL_IntrOp<"readlane", [], [0], [AllTypesMatch<["res",
}];
}
+// the intrinsic function name is too long so we use a shorter name for rocdl.
+def ROCDL_WaveIdOp : LLVM_IntrOpBase<ROCDL_Dialect, "wave_id",
+ "amdgcn_s_get_waveid_in_workgroup", [], [], [Pure], 1>,
+ Arguments<(ins)> {
+ let results = (outs LLVM_Type:$res);
+ let assemblyFormat = "attr-dict `:` type($res)";
+}
+
//===----------------------------------------------------------------------===//
// Thread index and Block index
//===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index e6dd6f135884e..315bc7157cd83 100644
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -80,6 +80,24 @@ static constexpr StringLiteral amdgcnDataLayout =
"64-S32-A5-G1-ni:7:8:9";
namespace {
+
+// Truncate or extend the result depending on the index bitwidth specified
+// by the LLVMTypeConverter options.
+static Value truncOrExtToLLVMType(ConversionPatternRewriter &rewriter,
+ Location loc, Value value,
+ const LLVMTypeConverter *converter) {
+ auto intWidth = cast<IntegerType>(value.getType()).getWidth();
+ auto indexBitwidth = converter->getIndexTypeBitwidth();
+ if (indexBitwidth > intWidth) {
+ return rewriter.create<LLVM::SExtOp>(
+ loc, IntegerType::get(rewriter.getContext(), indexBitwidth), value);
+ } else if (indexBitwidth < intWidth) {
+ return rewriter.create<LLVM::TruncOp>(
+ loc, IntegerType::get(rewriter.getContext(), indexBitwidth), value);
+ }
+ return value;
+}
+
struct GPULaneIdOpToROCDL : ConvertOpToLLVMPattern<gpu::LaneIdOp> {
using ConvertOpToLLVMPattern<gpu::LaneIdOp>::ConvertOpToLLVMPattern;
@@ -98,16 +116,7 @@ struct GPULaneIdOpToROCDL : ConvertOpToLLVMPattern<gpu::LaneIdOp> {
rewriter.create<ROCDL::MbcntLoOp>(loc, intTy, ValueRange{minus1, zero});
Value laneId = rewriter.create<ROCDL::MbcntHiOp>(
loc, intTy, ValueRange{minus1, mbcntLo});
- // Truncate or extend the result depending on the index bitwidth specified
- // by the LLVMTypeConverter options.
- const unsigned indexBitwidth = getTypeConverter()->getIndexTypeBitwidth();
- if (indexBitwidth > 32) {
- laneId = rewriter.create<LLVM::SExtOp>(
- loc, IntegerType::get(context, indexBitwidth), laneId);
- } else if (indexBitwidth < 32) {
- laneId = rewriter.create<LLVM::TruncOp>(
- loc, IntegerType::get(context, indexBitwidth), laneId);
- }
+ laneId = truncOrExtToLLVMType(rewriter, loc, laneId, getTypeConverter());
rewriter.replaceOp(op, {laneId});
return success();
}
@@ -190,6 +199,21 @@ struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
}
};
+struct GPUSubgroupIdOpToROCDL : ConvertOpToLLVMPattern<gpu::SubgroupIdOp> {
+ using ConvertOpToLLVMPattern<gpu::SubgroupIdOp>::ConvertOpToLLVMPattern;
+
+ LogicalResult
+ matchAndRewrite(gpu::SubgroupIdOp op, gpu::SubgroupIdOp::Adaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ auto int32Type = IntegerType::get(rewriter.getContext(), 32);
+ Value waveIdOp = rewriter.create<ROCDL::WaveIdOp>(op.getLoc(), int32Type);
+ waveIdOp = truncOrExtToLLVMType(rewriter, op.getLoc(), waveIdOp,
+ getTypeConverter());
+ rewriter.replaceOp(op, {waveIdOp});
+ return success();
+ }
+};
+
/// Import the GPU Ops to ROCDL Patterns.
#include "GPUToROCDL.cpp.inc"
@@ -405,7 +429,9 @@ void mlir::populateGpuToROCDLConversionPatterns(
// TODO: Add alignment for workgroup memory
patterns.add<GPUDynamicSharedMemoryOpLowering>(converter);
- patterns.add<GPUShuffleOpLowering, GPULaneIdOpToROCDL>(converter);
+ patterns
+ .add<GPUShuffleOpLowering, GPULaneIdOpToROCDL, GPUSubgroupIdOpToROCDL>(
+ converter);
populateMathToROCDLConversionPatterns(converter, patterns);
}
diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
index 071cae9d5789f..a06b77dcff038 100644
--- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
@@ -11,7 +11,7 @@ gpu.module @test_module {
func.func @gpu_index_ops()
-> (index, index, index, index, index, index,
index, index, index, index, index, index,
- index) {
+ index, index) {
// CHECK32-NOT: = llvm.sext %{{.*}} : i32 to i64
// CHECK: rocdl.workitem.id.x : i32
@@ -59,12 +59,16 @@ gpu.module @test_module {
// CHECK: = llvm.sext %{{.*}} : i32 to i64
%laneId = gpu.lane_id
+ // CHECK: = rocdl.wave_id : i32
+ // CHECK: = llvm.sext %{{.*}} : i32 to i64
+ %waveId = gpu.subgroup_id : index
+
func.return %tIdX, %tIdY, %tIdZ, %bDimX, %bDimY, %bDimZ,
%bIdX, %bIdY, %bIdZ, %gDimX, %gDimY, %gDimZ,
- %laneId
+ %laneId, %waveId
: index, index, index, index, index, index,
index, index, index, index, index, index,
- index
+ index, index
}
}
diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir
index 3db1f7b2b6427..f5767dd1fc95a 100644
--- a/mlir/test/Target/LLVMIR/rocdl.mlir
+++ b/mlir/test/Target/LLVMIR/rocdl.mlir
@@ -88,6 +88,12 @@ llvm.func @rocdl.lane_id() -> i32 {
llvm.return %3 : i32
}
+llvm.func @rocdl.wave_id() -> i32 {
+ // CHECK: call i32 @llvm.amdgcn.s.get.waveid.in.workgroup()
+ %0 = rocdl.wave_id : i32
+ llvm.return %0 : i32
+}
+
llvm.func @rocdl.swizzle(%src : i32) -> i32 {
// CHECK-LABEL: rocdl.swizzle
// CHECK: call i32 @llvm.amdgcn.ds.swizzle
>From 2fc9d61012255e266eb43c32fc1c5329d512a6e0 Mon Sep 17 00:00:00 2001
From: Alan Li <me at alanli.org>
Date: Sat, 19 Apr 2025 23:04:32 -0400
Subject: [PATCH 2/2] Update comments; also add a guard that rejects chipsets older than gfx10 (e.g. gfx9).
---
.../Conversion/GPUToROCDL/GPUToROCDLPass.h | 4 +-
mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td | 5 ++-
.../GPUToROCDL/LowerGpuOpsToROCDLOps.cpp | 43 ++++++++++++-------
3 files changed, 33 insertions(+), 19 deletions(-)
diff --git a/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h b/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h
index 1a917932a9a84..2d8aaf8371627 100644
--- a/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h
+++ b/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h
@@ -10,6 +10,7 @@
#include "mlir/Conversion/GPUToROCDL/Runtimes.h"
#include "mlir/Conversion/LLVMCommon/LoweringOptions.h"
+#include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
#include <memory>
namespace mlir {
@@ -32,7 +33,8 @@ class GPUModuleOp;
/// The resulting pattern set should be run over a gpu.module op
void populateGpuToROCDLConversionPatterns(const LLVMTypeConverter &converter,
RewritePatternSet &patterns,
- gpu::amd::Runtime runtime);
+ gpu::amd::Runtime runtime,
+ mlir::amdgpu::Chipset chipset);
/// Configure target to convert from the GPU dialect to ROCDL.
void configureGpuToROCDLConversionLegality(ConversionTarget &target);
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index 09d22da0d4c72..f5da5e1fcfa19 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -204,9 +204,10 @@ def ROCDL_ReadlaneOp : ROCDL_IntrOp<"readlane", [], [0], [AllTypesMatch<["res",
}];
}
-// the intrinsic function name is too long so we use a shorter name for rocdl.
+// The LLVM intrinsic function name is rather a mouthful,
+// so here we opt to use a shorter rocdl name.
def ROCDL_WaveIdOp : LLVM_IntrOpBase<ROCDL_Dialect, "wave_id",
- "amdgcn_s_get_waveid_in_workgroup", [], [], [Pure], 1>,
+ "amdgcn_s_get_waveid_in_workgroup", [], [], [], 1>,
Arguments<(ins)> {
let results = (outs LLVM_Type:$res);
let assemblyFormat = "attr-dict `:` type($res)";
diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index 315bc7157cd83..be3ecae005ff3 100644
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -85,15 +85,15 @@ namespace {
// by the LLVMTypeConverter options.
static Value truncOrExtToLLVMType(ConversionPatternRewriter &rewriter,
Location loc, Value value,
- const LLVMTypeConverter *converter) {
+ const LLVMTypeConverter &converter) {
auto intWidth = cast<IntegerType>(value.getType()).getWidth();
- auto indexBitwidth = converter->getIndexTypeBitwidth();
+ auto indexBitwidth = converter.getIndexTypeBitwidth();
+ auto indexBitwidthType =
+ IntegerType::get(rewriter.getContext(), converter.getIndexTypeBitwidth());
if (indexBitwidth > intWidth) {
- return rewriter.create<LLVM::SExtOp>(
- loc, IntegerType::get(rewriter.getContext(), indexBitwidth), value);
+ return rewriter.create<LLVM::SExtOp>(loc, indexBitwidthType, value);
} else if (indexBitwidth < intWidth) {
- return rewriter.create<LLVM::TruncOp>(
- loc, IntegerType::get(rewriter.getContext(), indexBitwidth), value);
+ return rewriter.create<LLVM::TruncOp>(loc, indexBitwidthType, value);
}
return value;
}
@@ -116,7 +116,7 @@ struct GPULaneIdOpToROCDL : ConvertOpToLLVMPattern<gpu::LaneIdOp> {
rewriter.create<ROCDL::MbcntLoOp>(loc, intTy, ValueRange{minus1, zero});
Value laneId = rewriter.create<ROCDL::MbcntHiOp>(
loc, intTy, ValueRange{minus1, mbcntLo});
- laneId = truncOrExtToLLVMType(rewriter, loc, laneId, getTypeConverter());
+ laneId = truncOrExtToLLVMType(rewriter, loc, laneId, *getTypeConverter());
rewriter.replaceOp(op, {laneId});
return success();
}
@@ -199,16 +199,27 @@ struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
}
};
-struct GPUSubgroupIdOpToROCDL : ConvertOpToLLVMPattern<gpu::SubgroupIdOp> {
- using ConvertOpToLLVMPattern<gpu::SubgroupIdOp>::ConvertOpToLLVMPattern;
+struct GPUSubgroupIdOpToROCDL final
+ : ConvertOpToLLVMPattern<gpu::SubgroupIdOp> {
+ using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;
+
+ GPUSubgroupIdOpToROCDL(MLIRContext *ctx, mlir::amdgpu::Chipset chipset)
+ : ConvertOpToLLVMPattern(ctx), chipset(chipset) {}
+
+ mlir::amdgpu::Chipset chipset;
LogicalResult
matchAndRewrite(gpu::SubgroupIdOp op, gpu::SubgroupIdOp::Adaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
+ if (chipset.majorVersion < 10) {
+ return rewriter.notifyMatchFailure(
+ op, "SubgroupIdOp is not yet supported on this architecture");
+ }
+
auto int32Type = IntegerType::get(rewriter.getContext(), 32);
Value waveIdOp = rewriter.create<ROCDL::WaveIdOp>(op.getLoc(), int32Type);
waveIdOp = truncOrExtToLLVMType(rewriter, op.getLoc(), waveIdOp,
- getTypeConverter());
+ *getTypeConverter());
rewriter.replaceOp(op, {waveIdOp});
return success();
}
@@ -343,7 +354,8 @@ struct LowerGpuOpsToROCDLOpsPass final
populateAMDGPUToROCDLConversionPatterns(converter, llvmPatterns,
*maybeChipset);
- populateGpuToROCDLConversionPatterns(converter, llvmPatterns, runtime);
+ populateGpuToROCDLConversionPatterns(converter, llvmPatterns, runtime,
+ *maybeChipset);
configureGpuToROCDLConversionLegality(target);
if (failed(applyPartialConversion(m, target, std::move(llvmPatterns))))
signalPassFailure();
@@ -391,7 +403,7 @@ void mlir::configureGpuToROCDLConversionLegality(ConversionTarget &target) {
void mlir::populateGpuToROCDLConversionPatterns(
const LLVMTypeConverter &converter, RewritePatternSet &patterns,
- mlir::gpu::amd::Runtime runtime) {
+ mlir::gpu::amd::Runtime runtime, mlir::amdgpu::Chipset chipset) {
using gpu::index_lowering::IndexKind;
using gpu::index_lowering::IntrType;
using mlir::gpu::amd::Runtime;
@@ -429,10 +441,9 @@ void mlir::populateGpuToROCDLConversionPatterns(
// TODO: Add alignment for workgroup memory
patterns.add<GPUDynamicSharedMemoryOpLowering>(converter);
- patterns
- .add<GPUShuffleOpLowering, GPULaneIdOpToROCDL, GPUSubgroupIdOpToROCDL>(
- converter);
-
+ patterns.add<GPUShuffleOpLowering, GPULaneIdOpToROCDL>(converter);
+ patterns.add(
+ std::make_unique<GPUSubgroupIdOpToROCDL>(patterns.getContext(), chipset));
populateMathToROCDLConversionPatterns(converter, patterns);
}
More information about the Mlir-commits
mailing list