[Mlir-commits] [mlir] [mlir][amdgpu] Add `rocdl.s.waitcnt` wrapper (PR #149670)
Ivan Butygin
llvmlistbot at llvm.org
Sun Jul 20 01:29:58 PDT 2025
https://github.com/Hardcode84 updated https://github.com/llvm/llvm-project/pull/149670
>From 442ed16790f1a491aa7f6607d5d3664338954454 Mon Sep 17 00:00:00 2001
From: Ivan Butygin <ivan.butygin at gmail.com>
Date: Sat, 19 Jul 2025 20:11:13 +0200
Subject: [PATCH 1/2] [mlir][amdgpu] Add `amdgpu.waitcnt` wrapper
The main motivation is to pass vmcnt/expcnt/lgkmcnt values directly and delegate architecture-dependent bitpacking to the amdgpu->rocdl lowering.
Only gfx9 bitpacking support is added as part of this commit.
---
mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 20 +++++++
.../AMDGPUToROCDL/AMDGPUToROCDL.cpp | 52 +++++++++++++++++--
.../Conversion/AMDGPUToROCDL/waitcnt.mlir | 20 +++++++
mlir/test/Dialect/AMDGPU/ops.mlir | 13 +++++
4 files changed, 102 insertions(+), 3 deletions(-)
create mode 100644 mlir/test/Conversion/AMDGPUToROCDL/waitcnt.mlir
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 80959ffbaf426..cecb936e18ae3 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -717,6 +717,26 @@ def AMDGPU_SchedBarrierOp :
}];
}
+def AMDGPU_WaitcntOp :
+ AMDGPU_Op<"waitcnt">,
+ Arguments<(ins
+ OptionalAttr<I32Attr>:$vmcnt,
+ OptionalAttr<I32Attr>:$expcnt,
+ OptionalAttr<I32Attr>:$lgkmcnt
+ )>
+ {
+ let summary = "Wrapper on ROCDL SWaitcntOp";
+ let description = [{
+ Convenience wrapper on `rocdl.s.waitcnt`. Hides the architecture specific
+ bitpacking from user. Missing values will be assumed maximum values supported
+ by the architecture. Large values will also be clamped to the maximum
+ supported values.
+ }];
+ let assemblyFormat = [{
+ (`vmcnt` `(` $vmcnt^ `)` )? (`expcnt` `(` $expcnt^ `)` )? (`lgkmcnt` `(` $lgkmcnt^ `)`)? attr-dict
+ }];
+}
+
def AMDGPU_MFMAPermB : I32EnumAttr<"MFMAPermB",
"The possible permutations of the lanes storing B available in an MFMA",
[
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index ef35ee208f002..af588d5b70a45 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -419,6 +419,52 @@ struct RawBufferOpLowering : public ConvertOpToLLVMPattern<GpuOp> {
}
};
+// TODO: AMDGPU backend already have all this bitpacking logic, we should move
+// it to some common place.
+static FailureOr<unsigned> encodeWaitcnt(Chipset chipset, unsigned vmcnt,
+ unsigned expcnt, unsigned lgkmcnt) {
+ if (chipset.majorVersion == 9) {
+ vmcnt = std::min(63u, vmcnt);
+ expcnt = std::min(7u, expcnt);
+ lgkmcnt = std::min(15u, lgkmcnt);
+ unsigned lowBits = vmcnt & 0xF;
+ unsigned highBits = (vmcnt >> 4) << 14;
+ unsigned otherCnts = (expcnt << 4) | (lgkmcnt << 8);
+ return lowBits | highBits | otherCnts;
+ }
+ return failure();
+}
+
+struct WaitcntOpLowering : public ConvertOpToLLVMPattern<WaitcntOp> {
+ WaitcntOpLowering(const LLVMTypeConverter &converter, Chipset chipset)
+ : ConvertOpToLLVMPattern<WaitcntOp>(converter), chipset(chipset) {}
+
+ Chipset chipset;
+
+ LogicalResult
+ matchAndRewrite(WaitcntOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ auto getVal = [](Attribute attr) -> unsigned {
+ if (attr)
+ return cast<IntegerAttr>(attr).getInt();
+
+ // This value will be clamped to the maximum value for the chipset.
+ return 1024 * 1024;
+ };
+ unsigned vmcnt = getVal(adaptor.getVmcntAttr());
+ unsigned expcnt = getVal(adaptor.getExpcntAttr());
+ unsigned lgkmcnt = getVal(adaptor.getLgkmcntAttr());
+
+ FailureOr<unsigned> waitcnt =
+ encodeWaitcnt(chipset, vmcnt, expcnt, lgkmcnt);
+ if (failed(waitcnt))
+ return op.emitOpError("unsupported chipset");
+
+ rewriter.replaceOpWithNewOp<ROCDL::SWaitcntOp>(op, *waitcnt);
+ return success();
+ }
+};
+
struct LDSBarrierOpLowering : public ConvertOpToLLVMPattern<LDSBarrierOp> {
LDSBarrierOpLowering(const LLVMTypeConverter &converter, Chipset chipset)
: ConvertOpToLLVMPattern<LDSBarrierOp>(converter), chipset(chipset) {}
@@ -1825,9 +1871,9 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
ROCDL::RawPtrBufferAtomicUminOp>,
RawBufferOpLowering<RawBufferAtomicCmpswapOp,
ROCDL::RawPtrBufferAtomicCmpSwap>,
- AMDGPUDPPLowering, LDSBarrierOpLowering, SchedBarrierOpLowering,
- MFMAOpLowering, ScaledMFMAOpLowering, WMMAOpLowering,
- ExtPackedFp8OpLowering, ScaledExtPackedOpLowering,
+ AMDGPUDPPLowering, WaitcntOpLowering, LDSBarrierOpLowering,
+ SchedBarrierOpLowering, MFMAOpLowering, ScaledMFMAOpLowering,
+ WMMAOpLowering, ExtPackedFp8OpLowering, ScaledExtPackedOpLowering,
PackedScaledTruncOpLowering, PackedTrunc2xFp8OpLowering,
PackedStochRoundFp8OpLowering, GatherToLDSOpLowering,
TransposeLoadOpLowering>(converter, chipset);
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/waitcnt.mlir b/mlir/test/Conversion/AMDGPUToROCDL/waitcnt.mlir
new file mode 100644
index 0000000000000..9c785670198ae
--- /dev/null
+++ b/mlir/test/Conversion/AMDGPUToROCDL/waitcnt.mlir
@@ -0,0 +1,20 @@
+// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx942 | FileCheck %s --check-prefixes=CHECK,GFX9
+// TODO: Add more chipsets support
+
+
+// CHECK-LABEL: func @waitcnt
+func.func @waitcnt() {
+ // GFX9: rocdl.s.waitcnt 53119
+ amdgpu.waitcnt
+
+ // GFX9: rocdl.s.waitcnt 3952
+ amdgpu.waitcnt vmcnt(0)
+
+ // GFX9: rocdl.s.waitcnt 53007
+ amdgpu.waitcnt expcnt(0)
+
+ // GFX9: rocdl.s.waitcnt 49279
+ amdgpu.waitcnt lgkmcnt(0)
+
+ return
+}
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index 5559ac8f1a5c3..b126b23cb8156 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -504,3 +504,16 @@ func.func @gather_to_lds(%idx1 : index, %idx2 : index, %mem1 : memref<32xf16>, %
amdgpu.gather_to_lds %mem1[%idx1], %smem2[%idx1, %idx2] : vector<2xf16>, memref<32xf16>, memref<32x32xf16, #gpu.address_space<workgroup>>
func.return
}
+
+// CHECK-LABEL: func @waitcnt
+func.func @waitcnt() {
+ // CHECK: amdgpu.waitcnt vmcnt(1) expcnt(2) lgkmcnt(3)
+ // CHECK: amdgpu.waitcnt vmcnt(1)
+ // CHECK: amdgpu.waitcnt expcnt(2)
+ // CHECK: amdgpu.waitcnt lgkmcnt(3)
+ amdgpu.waitcnt vmcnt(1) expcnt(2) lgkmcnt(3)
+ amdgpu.waitcnt vmcnt(1)
+ amdgpu.waitcnt expcnt(2)
+ amdgpu.waitcnt lgkmcnt(3)
+ func.return
+}
>From bfacb4de1dab85b6f7423effe1bc4e9097babd90 Mon Sep 17 00:00:00 2001
From: Ivan Butygin <ivan.butygin at gmail.com>
Date: Sun, 20 Jul 2025 10:29:25 +0200
Subject: [PATCH 2/2] more chipsets
Signed-off-by: Ivan Butygin <ivan.butygin at gmail.com>
---
.../AMDGPUToROCDL/AMDGPUToROCDL.cpp | 30 +++++++++++++++++++
.../Conversion/AMDGPUToROCDL/waitcnt.mlir | 11 ++++++-
2 files changed, 40 insertions(+), 1 deletion(-)
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index af588d5b70a45..1940ef8775688 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -421,8 +421,23 @@ struct RawBufferOpLowering : public ConvertOpToLLVMPattern<GpuOp> {
// TODO: AMDGPU backend already have all this bitpacking logic, we should move
// it to some common place.
+/// \details \p Vmcnt, \p Expcnt and \p Lgkmcnt are decoded as follows:
+/// \p Vmcnt = \p Waitcnt[3:0] (pre-gfx9)
+/// \p Vmcnt = \p Waitcnt[15:14,3:0] (gfx9,10)
+/// \p Vmcnt = \p Waitcnt[15:10] (gfx11)
+/// \p Expcnt = \p Waitcnt[6:4] (pre-gfx11)
+/// \p Expcnt = \p Waitcnt[2:0] (gfx11)
+/// \p Lgkmcnt = \p Waitcnt[11:8] (pre-gfx10)
+/// \p Lgkmcnt = \p Waitcnt[13:8] (gfx10)
+/// \p Lgkmcnt = \p Waitcnt[9:4] (gfx11)
static FailureOr<unsigned> encodeWaitcnt(Chipset chipset, unsigned vmcnt,
unsigned expcnt, unsigned lgkmcnt) {
+ if (chipset.majorVersion < 9) {
+ vmcnt = std::min(15u, vmcnt);
+ expcnt = std::min(7u, expcnt);
+ lgkmcnt = std::min(15u, lgkmcnt);
+ return vmcnt | (expcnt << 4) | (lgkmcnt << 8);
+ }
if (chipset.majorVersion == 9) {
vmcnt = std::min(63u, vmcnt);
expcnt = std::min(7u, expcnt);
@@ -432,6 +447,21 @@ static FailureOr<unsigned> encodeWaitcnt(Chipset chipset, unsigned vmcnt,
unsigned otherCnts = (expcnt << 4) | (lgkmcnt << 8);
return lowBits | highBits | otherCnts;
}
+ if (chipset.majorVersion == 10) {
+ vmcnt = std::min(63u, vmcnt);
+ expcnt = std::min(7u, expcnt);
+ lgkmcnt = std::min(63u, lgkmcnt);
+ unsigned lowBits = vmcnt & 0xF;
+ unsigned highBits = (vmcnt >> 4) << 14;
+ unsigned otherCnts = (expcnt << 4) | (lgkmcnt << 8);
+ return lowBits | highBits | otherCnts;
+ }
+ if (chipset.majorVersion == 11) {
+ vmcnt = std::min(63u, vmcnt);
+ expcnt = std::min(7u, expcnt);
+ lgkmcnt = std::min(63u, lgkmcnt);
+ return (vmcnt << 10) | expcnt | (lgkmcnt << 4);
+ }
return failure();
}
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/waitcnt.mlir b/mlir/test/Conversion/AMDGPUToROCDL/waitcnt.mlir
index 9c785670198ae..71617df05eb60 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/waitcnt.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/waitcnt.mlir
@@ -1,19 +1,28 @@
// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx942 | FileCheck %s --check-prefixes=CHECK,GFX9
-// TODO: Add more chipsets support
+// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx1030 | FileCheck %s --check-prefixes=CHECK,GFX10
+// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx1100 | FileCheck %s --check-prefixes=CHECK,GFX11
// CHECK-LABEL: func @waitcnt
func.func @waitcnt() {
// GFX9: rocdl.s.waitcnt 53119
+ // GFX10: rocdl.s.waitcnt 65407
+ // GFX11: rocdl.s.waitcnt 65527
amdgpu.waitcnt
// GFX9: rocdl.s.waitcnt 3952
+ // GFX10: rocdl.s.waitcnt 16240
+ // GFX11: rocdl.s.waitcnt 1015
amdgpu.waitcnt vmcnt(0)
// GFX9: rocdl.s.waitcnt 53007
+ // GFX10: rocdl.s.waitcnt 65295
+ // GFX11: rocdl.s.waitcnt 65520
amdgpu.waitcnt expcnt(0)
// GFX9: rocdl.s.waitcnt 49279
+ // GFX10: rocdl.s.waitcnt 49279
+ // GFX11: rocdl.s.waitcnt 64519
amdgpu.waitcnt lgkmcnt(0)
return
More information about the Mlir-commits
mailing list