[Mlir-commits] [mlir] [mlir][amdgpu] Add `rocdl.s.waitcnt` wrapper (PR #149670)
Ivan Butygin
llvmlistbot at llvm.org
Tue Jul 22 11:46:31 PDT 2025
https://github.com/Hardcode84 updated https://github.com/llvm/llvm-project/pull/149670
>From f3bc55c8f3a1bb0aabbed5e110831278f2d20ec5 Mon Sep 17 00:00:00 2001
From: Ivan Butygin <ivan.butygin at gmail.com>
Date: Sat, 19 Jul 2025 20:11:13 +0200
Subject: [PATCH 1/6] [mlir][amdgpu] Add `amdgpu.waitcnt` wrapper
The main motivation is to pass vmcnt/expcnt/lgkmcnt values directly and delegate the architecture-dependent bitpacking to the amdgpu->rocdl lowering.
Only gfx9 bitpacking support is added as part of this commit.
---
mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 20 +++++++
.../AMDGPUToROCDL/AMDGPUToROCDL.cpp | 52 +++++++++++++++++--
.../Conversion/AMDGPUToROCDL/waitcnt.mlir | 20 +++++++
mlir/test/Dialect/AMDGPU/ops.mlir | 13 +++++
4 files changed, 102 insertions(+), 3 deletions(-)
create mode 100644 mlir/test/Conversion/AMDGPUToROCDL/waitcnt.mlir
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 5a53b15a9c679..7fe1ef37e1f9b 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -719,6 +719,26 @@ def AMDGPU_SchedBarrierOp :
}];
}
+def AMDGPU_WaitcntOp :
+ AMDGPU_Op<"waitcnt">,
+ Arguments<(ins
+ OptionalAttr<I32Attr>:$vmcnt,
+ OptionalAttr<I32Attr>:$expcnt,
+ OptionalAttr<I32Attr>:$lgkmcnt
+ )>
+ {
+ let summary = "Wrapper on ROCDL SWaitcntOp";
+ let description = [{
+ Covenience wrapper on `rocdl.s.waitcnt`. Hides the architecture specific
+ bitpacking from user. Missing values will be assumed maximum values supported
+ by the architecture. Large values will also be clamped to the maximum
+ supported values.
+ }];
+ let assemblyFormat = [{
+ (`vmcnt` `(` $vmcnt^ `)` )? (`expcnt` `(` $expcnt^ `)` )? (`lgkmcnt` `(` $lgkmcnt^ `)`)? attr-dict
+ }];
+}
+
def AMDGPU_MFMAPermB : I32EnumAttr<"MFMAPermB",
"The possible permutations of the lanes storing B available in an MFMA",
[
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index ef35ee208f002..af588d5b70a45 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -419,6 +419,52 @@ struct RawBufferOpLowering : public ConvertOpToLLVMPattern<GpuOp> {
}
};
+// TODO: AMDGPU backend already have all this bitpacking logic, we should move
+// it to some common place.
+static FailureOr<unsigned> encodeWaitcnt(Chipset chipset, unsigned vmcnt,
+ unsigned expcnt, unsigned lgkmcnt) {
+ if (chipset.majorVersion == 9) {
+ vmcnt = std::min(63u, vmcnt);
+ expcnt = std::min(7u, expcnt);
+ lgkmcnt = std::min(15u, lgkmcnt);
+ unsigned lowBits = vmcnt & 0xF;
+ unsigned highBits = (vmcnt >> 4) << 14;
+ unsigned otherCnts = (expcnt << 4) | (lgkmcnt << 8);
+ return lowBits | highBits | otherCnts;
+ }
+ return failure();
+}
+
+struct WaitcntOpLowering : public ConvertOpToLLVMPattern<WaitcntOp> {
+ WaitcntOpLowering(const LLVMTypeConverter &converter, Chipset chipset)
+ : ConvertOpToLLVMPattern<WaitcntOp>(converter), chipset(chipset) {}
+
+ Chipset chipset;
+
+ LogicalResult
+ matchAndRewrite(WaitcntOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ auto getVal = [](Attribute attr) -> unsigned {
+ if (attr)
+ return cast<IntegerAttr>(attr).getInt();
+
+ // This value will be clamped to the maximum value for the chipset.
+ return 1024 * 1024;
+ };
+ unsigned vmcnt = getVal(adaptor.getVmcntAttr());
+ unsigned expcnt = getVal(adaptor.getExpcntAttr());
+ unsigned lgkmcnt = getVal(adaptor.getLgkmcntAttr());
+
+ FailureOr<unsigned> waitcnt =
+ encodeWaitcnt(chipset, vmcnt, expcnt, lgkmcnt);
+ if (failed(waitcnt))
+ return op.emitOpError("unsupported chipset");
+
+ rewriter.replaceOpWithNewOp<ROCDL::SWaitcntOp>(op, *waitcnt);
+ return success();
+ }
+};
+
struct LDSBarrierOpLowering : public ConvertOpToLLVMPattern<LDSBarrierOp> {
LDSBarrierOpLowering(const LLVMTypeConverter &converter, Chipset chipset)
: ConvertOpToLLVMPattern<LDSBarrierOp>(converter), chipset(chipset) {}
@@ -1825,9 +1871,9 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
ROCDL::RawPtrBufferAtomicUminOp>,
RawBufferOpLowering<RawBufferAtomicCmpswapOp,
ROCDL::RawPtrBufferAtomicCmpSwap>,
- AMDGPUDPPLowering, LDSBarrierOpLowering, SchedBarrierOpLowering,
- MFMAOpLowering, ScaledMFMAOpLowering, WMMAOpLowering,
- ExtPackedFp8OpLowering, ScaledExtPackedOpLowering,
+ AMDGPUDPPLowering, WaitcntOpLowering, LDSBarrierOpLowering,
+ SchedBarrierOpLowering, MFMAOpLowering, ScaledMFMAOpLowering,
+ WMMAOpLowering, ExtPackedFp8OpLowering, ScaledExtPackedOpLowering,
PackedScaledTruncOpLowering, PackedTrunc2xFp8OpLowering,
PackedStochRoundFp8OpLowering, GatherToLDSOpLowering,
TransposeLoadOpLowering>(converter, chipset);
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/waitcnt.mlir b/mlir/test/Conversion/AMDGPUToROCDL/waitcnt.mlir
new file mode 100644
index 0000000000000..9c785670198ae
--- /dev/null
+++ b/mlir/test/Conversion/AMDGPUToROCDL/waitcnt.mlir
@@ -0,0 +1,20 @@
+// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx942 | FileCheck %s --check-prefixes=CHECK,GFX9
+// TODO: Add more chipsets support
+
+
+// CHECK-LABEL: func @waitcnt
+func.func @waitcnt() {
+ // GFX9: rocdl.s.waitcnt 53119
+ amdgpu.waitcnt
+
+ // GFX9: rocdl.s.waitcnt 3952
+ amdgpu.waitcnt vmcnt(0)
+
+ // GFX9: rocdl.s.waitcnt 53007
+ amdgpu.waitcnt expcnt(0)
+
+ // GFX9: rocdl.s.waitcnt 49279
+ amdgpu.waitcnt lgkmcnt(0)
+
+ return
+}
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index fe2b32be04de4..086b5884be5c7 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -548,3 +548,16 @@ func.func @gather_to_lds(%idx1 : index, %idx2 : index, %mem1 : memref<32xf16>, %
amdgpu.gather_to_lds %mem1[%idx1], %smem2[%idx1, %idx2] : vector<2xf16>, memref<32xf16>, memref<32x32xf16, #gpu.address_space<workgroup>>
func.return
}
+
+// CHECK-LABEL: func @waitcnt
+func.func @waitcnt() {
+ // CHECK: amdgpu.waitcnt vmcnt(1) expcnt(2) lgkmcnt(3)
+ // CHECK: amdgpu.waitcnt vmcnt(1)
+ // CHECK: amdgpu.waitcnt expcnt(2)
+ // CHECK: amdgpu.waitcnt lgkmcnt(3)
+ amdgpu.waitcnt vmcnt(1) expcnt(2) lgkmcnt(3)
+ amdgpu.waitcnt vmcnt(1)
+ amdgpu.waitcnt expcnt(2)
+ amdgpu.waitcnt lgkmcnt(3)
+ func.return
+}
>From 5320853675ede402b99d9fbcc4446134e80ad12b Mon Sep 17 00:00:00 2001
From: Ivan Butygin <ivan.butygin at gmail.com>
Date: Sun, 20 Jul 2025 10:29:25 +0200
Subject: [PATCH 2/6] more chipsets
Signed-off-by: Ivan Butygin <ivan.butygin at gmail.com>
---
.../AMDGPUToROCDL/AMDGPUToROCDL.cpp | 30 +++++++++++++++++++
.../Conversion/AMDGPUToROCDL/waitcnt.mlir | 11 ++++++-
2 files changed, 40 insertions(+), 1 deletion(-)
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index af588d5b70a45..1940ef8775688 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -421,8 +421,23 @@ struct RawBufferOpLowering : public ConvertOpToLLVMPattern<GpuOp> {
// TODO: AMDGPU backend already have all this bitpacking logic, we should move
// it to some common place.
+/// \details \p Vmcnt, \p Expcnt and \p Lgkmcnt are decoded as follows:
+/// \p Vmcnt = \p Waitcnt[3:0] (pre-gfx9)
+/// \p Vmcnt = \p Waitcnt[15:14,3:0] (gfx9,10)
+/// \p Vmcnt = \p Waitcnt[15:10] (gfx11)
+/// \p Expcnt = \p Waitcnt[6:4] (pre-gfx11)
+/// \p Expcnt = \p Waitcnt[2:0] (gfx11)
+/// \p Lgkmcnt = \p Waitcnt[11:8] (pre-gfx10)
+/// \p Lgkmcnt = \p Waitcnt[13:8] (gfx10)
+/// \p Lgkmcnt = \p Waitcnt[9:4] (gfx11)
static FailureOr<unsigned> encodeWaitcnt(Chipset chipset, unsigned vmcnt,
unsigned expcnt, unsigned lgkmcnt) {
+ if (chipset.majorVersion < 9) {
+ vmcnt = std::min(15u, vmcnt);
+ expcnt = std::min(7u, expcnt);
+ lgkmcnt = std::min(15u, lgkmcnt);
+ return vmcnt | (expcnt << 4) | (lgkmcnt << 8);
+ }
if (chipset.majorVersion == 9) {
vmcnt = std::min(63u, vmcnt);
expcnt = std::min(7u, expcnt);
@@ -432,6 +447,21 @@ static FailureOr<unsigned> encodeWaitcnt(Chipset chipset, unsigned vmcnt,
unsigned otherCnts = (expcnt << 4) | (lgkmcnt << 8);
return lowBits | highBits | otherCnts;
}
+ if (chipset.majorVersion == 10) {
+ vmcnt = std::min(63u, vmcnt);
+ expcnt = std::min(7u, expcnt);
+ lgkmcnt = std::min(63u, lgkmcnt);
+ unsigned lowBits = vmcnt & 0xF;
+ unsigned highBits = (vmcnt >> 4) << 14;
+ unsigned otherCnts = (expcnt << 4) | (lgkmcnt << 8);
+ return lowBits | highBits | otherCnts;
+ }
+ if (chipset.majorVersion == 11) {
+ vmcnt = std::min(63u, vmcnt);
+ expcnt = std::min(7u, expcnt);
+ lgkmcnt = std::min(63u, lgkmcnt);
+ return (vmcnt << 10) | expcnt | (lgkmcnt << 4);
+ }
return failure();
}
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/waitcnt.mlir b/mlir/test/Conversion/AMDGPUToROCDL/waitcnt.mlir
index 9c785670198ae..71617df05eb60 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/waitcnt.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/waitcnt.mlir
@@ -1,19 +1,28 @@
// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx942 | FileCheck %s --check-prefixes=CHECK,GFX9
-// TODO: Add more chipsets support
+// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx1030 | FileCheck %s --check-prefixes=CHECK,GFX10
+// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx1100 | FileCheck %s --check-prefixes=CHECK,GFX11
// CHECK-LABEL: func @waitcnt
func.func @waitcnt() {
// GFX9: rocdl.s.waitcnt 53119
+ // GFX10: rocdl.s.waitcnt 65407
+ // GFX11: rocdl.s.waitcnt 65527
amdgpu.waitcnt
// GFX9: rocdl.s.waitcnt 3952
+ // GFX10: rocdl.s.waitcnt 16240
+ // GFX11: rocdl.s.waitcnt 1015
amdgpu.waitcnt vmcnt(0)
// GFX9: rocdl.s.waitcnt 53007
+ // GFX10: rocdl.s.waitcnt 65295
+ // GFX11: rocdl.s.waitcnt 65520
amdgpu.waitcnt expcnt(0)
// GFX9: rocdl.s.waitcnt 49279
+ // GFX10: rocdl.s.waitcnt 49279
+ // GFX11: rocdl.s.waitcnt 64519
amdgpu.waitcnt lgkmcnt(0)
return
>From a8569157e2995cad79515eb2206ea6756f7bc5d6 Mon Sep 17 00:00:00 2001
From: Ivan Butygin <ivan.butygin at gmail.com>
Date: Mon, 21 Jul 2025 22:22:12 +0200
Subject: [PATCH 3/6] oilist
Signed-off-by: Ivan Butygin <ivan.butygin at gmail.com>
---
mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 2 +-
mlir/test/Dialect/AMDGPU/ops.mlir | 2 ++
2 files changed, 3 insertions(+), 1 deletion(-)
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 7fe1ef37e1f9b..481cebdf30852 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -735,7 +735,7 @@ def AMDGPU_WaitcntOp :
supported values.
}];
let assemblyFormat = [{
- (`vmcnt` `(` $vmcnt^ `)` )? (`expcnt` `(` $expcnt^ `)` )? (`lgkmcnt` `(` $lgkmcnt^ `)`)? attr-dict
+ oilist( `vmcnt` `(` $vmcnt `)` | `expcnt` `(` $expcnt `)` | `lgkmcnt` `(` $lgkmcnt `)` ) attr-dict
}];
}
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index 086b5884be5c7..82dd2bec248a7 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -552,10 +552,12 @@ func.func @gather_to_lds(%idx1 : index, %idx2 : index, %mem1 : memref<32xf16>, %
// CHECK-LABEL: func @waitcnt
func.func @waitcnt() {
// CHECK: amdgpu.waitcnt vmcnt(1) expcnt(2) lgkmcnt(3)
+ // CHECK: amdgpu.waitcnt vmcnt(3) expcnt(2) lgkmcnt(1)
// CHECK: amdgpu.waitcnt vmcnt(1)
// CHECK: amdgpu.waitcnt expcnt(2)
// CHECK: amdgpu.waitcnt lgkmcnt(3)
amdgpu.waitcnt vmcnt(1) expcnt(2) lgkmcnt(3)
+ amdgpu.waitcnt lgkmcnt(1) expcnt(2) vmcnt(3)
amdgpu.waitcnt vmcnt(1)
amdgpu.waitcnt expcnt(2)
amdgpu.waitcnt lgkmcnt(3)
>From 811633e21a309c2b97ba342fa7ec1a9a2f955884 Mon Sep 17 00:00:00 2001
From: Ivan Butygin <ivan.butygin at gmail.com>
Date: Tue, 22 Jul 2025 19:37:52 +0200
Subject: [PATCH 4/6] switch to new api
---
mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 25 ++++++-----
.../AMDGPUToROCDL/AMDGPUToROCDL.cpp | 42 ++++++++++++++-----
.../AMDGPUToROCDL/memory_counter_wait.mlir | 42 +++++++++++++++++++
.../Conversion/AMDGPUToROCDL/waitcnt.mlir | 29 -------------
mlir/test/Dialect/AMDGPU/ops.mlir | 26 ++++++------
5 files changed, 102 insertions(+), 62 deletions(-)
create mode 100644 mlir/test/Conversion/AMDGPUToROCDL/memory_counter_wait.mlir
delete mode 100644 mlir/test/Conversion/AMDGPUToROCDL/waitcnt.mlir
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 481cebdf30852..b237f7b5749e7 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -719,23 +719,26 @@ def AMDGPU_SchedBarrierOp :
}];
}
-def AMDGPU_WaitcntOp :
- AMDGPU_Op<"waitcnt">,
+def AMDGPU_MemoryCounterWaitOp :
+ AMDGPU_Op<"memory_counter_wait">,
Arguments<(ins
- OptionalAttr<I32Attr>:$vmcnt,
- OptionalAttr<I32Attr>:$expcnt,
- OptionalAttr<I32Attr>:$lgkmcnt
+ OptionalAttr<I32Attr>:$load,
+ OptionalAttr<I32Attr>:$store,
+ OptionalAttr<I32Attr>:$ds,
+ OptionalAttr<I32Attr>:$exp
)>
{
- let summary = "Wrapper on ROCDL SWaitcntOp";
+ let summary = "Wait for specified hardware counters";
let description = [{
- Covenience wrapper on `rocdl.s.waitcnt`. Hides the architecture specific
- bitpacking from user. Missing values will be assumed maximum values supported
- by the architecture. Large values will also be clamped to the maximum
- supported values.
+ Wait for the specified counters to be less-than or equal-to the provided
+ values before continuing.
+
+ Counters can lower to different instructions on different architectures,
+ including clamping to some HW-supported max value or combining multiple
+ counters into one.
}];
let assemblyFormat = [{
- oilist( `vmcnt` `(` $vmcnt `)` | `expcnt` `(` $expcnt `)` | `lgkmcnt` `(` $lgkmcnt `)` ) attr-dict
+ oilist( `load` `(` $load `)` | `store` `(` $store `)` | `ds` `(` $ds `)` | `exp` `(` $exp `)` ) attr-dict
}];
}
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 1940ef8775688..057dfced09087 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -465,15 +465,35 @@ static FailureOr<unsigned> encodeWaitcnt(Chipset chipset, unsigned vmcnt,
return failure();
}
-struct WaitcntOpLowering : public ConvertOpToLLVMPattern<WaitcntOp> {
- WaitcntOpLowering(const LLVMTypeConverter &converter, Chipset chipset)
- : ConvertOpToLLVMPattern<WaitcntOp>(converter), chipset(chipset) {}
+struct MemoryCounterWaitOpLowering
+ : public ConvertOpToLLVMPattern<MemoryCounterWaitOp> {
+ MemoryCounterWaitOpLowering(const LLVMTypeConverter &converter,
+ Chipset chipset)
+ : ConvertOpToLLVMPattern<MemoryCounterWaitOp>(converter),
+ chipset(chipset) {}
Chipset chipset;
LogicalResult
- matchAndRewrite(WaitcntOp op, OpAdaptor adaptor,
+ matchAndRewrite(MemoryCounterWaitOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
+ if (chipset.majorVersion >= 12) {
+ Location loc = op.getLoc();
+ if (auto ds = adaptor.getDs())
+ rewriter.create<ROCDL::WaitDscntOp>(loc, *ds);
+
+ if (auto load = adaptor.getLoad())
+ rewriter.create<ROCDL::WaitLoadcntOp>(loc, *load);
+
+ if (auto store = adaptor.getStore())
+ rewriter.create<ROCDL::WaitStorecntOp>(loc, *store);
+
+ if (auto exp = adaptor.getExp())
+ rewriter.create<ROCDL::WaitExpcntOp>(loc, *exp);
+
+ return success();
+ }
+
auto getVal = [](Attribute attr) -> unsigned {
if (attr)
return cast<IntegerAttr>(attr).getInt();
@@ -481,12 +501,14 @@ struct WaitcntOpLowering : public ConvertOpToLLVMPattern<WaitcntOp> {
// This value will be clamped to the maximum value for the chipset.
return 1024 * 1024;
};
- unsigned vmcnt = getVal(adaptor.getVmcntAttr());
- unsigned expcnt = getVal(adaptor.getExpcntAttr());
- unsigned lgkmcnt = getVal(adaptor.getLgkmcntAttr());
+ unsigned ds = getVal(adaptor.getDsAttr());
+ unsigned load = getVal(adaptor.getLoadAttr());
+ unsigned store = getVal(adaptor.getStoreAttr());
+ unsigned exp = getVal(adaptor.getExpAttr());
+
+ unsigned vmcnt = std::min(load, store);
- FailureOr<unsigned> waitcnt =
- encodeWaitcnt(chipset, vmcnt, expcnt, lgkmcnt);
+ FailureOr<unsigned> waitcnt = encodeWaitcnt(chipset, vmcnt, exp, ds);
if (failed(waitcnt))
return op.emitOpError("unsupported chipset");
@@ -1901,7 +1923,7 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
ROCDL::RawPtrBufferAtomicUminOp>,
RawBufferOpLowering<RawBufferAtomicCmpswapOp,
ROCDL::RawPtrBufferAtomicCmpSwap>,
- AMDGPUDPPLowering, WaitcntOpLowering, LDSBarrierOpLowering,
+ AMDGPUDPPLowering, MemoryCounterWaitOpLowering, LDSBarrierOpLowering,
SchedBarrierOpLowering, MFMAOpLowering, ScaledMFMAOpLowering,
WMMAOpLowering, ExtPackedFp8OpLowering, ScaledExtPackedOpLowering,
PackedScaledTruncOpLowering, PackedTrunc2xFp8OpLowering,
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/memory_counter_wait.mlir b/mlir/test/Conversion/AMDGPUToROCDL/memory_counter_wait.mlir
new file mode 100644
index 0000000000000..1016ee859e462
--- /dev/null
+++ b/mlir/test/Conversion/AMDGPUToROCDL/memory_counter_wait.mlir
@@ -0,0 +1,42 @@
+// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx942 | FileCheck %s --check-prefixes=CHECK,GFX9
+// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx1030 | FileCheck %s --check-prefixes=CHECK,GFX10
+// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx1100 | FileCheck %s --check-prefixes=CHECK,GFX11
+// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx1201 | FileCheck %s --check-prefixes=CHECK,GFX12
+
+// CHECK-LABEL: func @memory_counter_wait
+func.func @memory_counter_wait() {
+ // GFX9: rocdl.s.waitcnt 53119
+ // GFX10: rocdl.s.waitcnt 65407
+ // GFX11: rocdl.s.waitcnt 65527
+ // GFX12-NOT: rocdl.s.wait.loadcnt
+ // GFX12-NOT: rocdl.s.wait.storecnt
+ // GFX12-NOT: rocdl.s.wait.expcnt
+ // GFX12-NOT: rocdl.s.wait.dscnt
+ amdgpu.memory_counter_wait
+
+ // GFX9: rocdl.s.waitcnt 3952
+ // GFX10: rocdl.s.waitcnt 16240
+ // GFX11: rocdl.s.waitcnt 1015
+ // GFX12: rocdl.s.wait.loadcnt 0
+ amdgpu.memory_counter_wait load(0)
+
+ // GFX9: rocdl.s.waitcnt 3952
+ // GFX10: rocdl.s.waitcnt 16240
+ // GFX11: rocdl.s.waitcnt 1015
+ // GFX12: rocdl.s.wait.storecnt 0
+ amdgpu.memory_counter_wait store(0)
+
+ // GFX9: rocdl.s.waitcnt 53007
+ // GFX10: rocdl.s.waitcnt 65295
+ // GFX11: rocdl.s.waitcnt 65520
+ // GFX12: rocdl.s.wait.expcnt 0
+ amdgpu.memory_counter_wait exp(0)
+
+ // GFX9: rocdl.s.waitcnt 49279
+ // GFX10: rocdl.s.waitcnt 49279
+ // GFX11: rocdl.s.waitcnt 64519
+ // GFX12: rocdl.s.wait.dscnt 0
+ amdgpu.memory_counter_wait ds(0)
+
+ return
+}
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/waitcnt.mlir b/mlir/test/Conversion/AMDGPUToROCDL/waitcnt.mlir
deleted file mode 100644
index 71617df05eb60..0000000000000
--- a/mlir/test/Conversion/AMDGPUToROCDL/waitcnt.mlir
+++ /dev/null
@@ -1,29 +0,0 @@
-// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx942 | FileCheck %s --check-prefixes=CHECK,GFX9
-// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx1030 | FileCheck %s --check-prefixes=CHECK,GFX10
-// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx1100 | FileCheck %s --check-prefixes=CHECK,GFX11
-
-
-// CHECK-LABEL: func @waitcnt
-func.func @waitcnt() {
- // GFX9: rocdl.s.waitcnt 53119
- // GFX10: rocdl.s.waitcnt 65407
- // GFX11: rocdl.s.waitcnt 65527
- amdgpu.waitcnt
-
- // GFX9: rocdl.s.waitcnt 3952
- // GFX10: rocdl.s.waitcnt 16240
- // GFX11: rocdl.s.waitcnt 1015
- amdgpu.waitcnt vmcnt(0)
-
- // GFX9: rocdl.s.waitcnt 53007
- // GFX10: rocdl.s.waitcnt 65295
- // GFX11: rocdl.s.waitcnt 65520
- amdgpu.waitcnt expcnt(0)
-
- // GFX9: rocdl.s.waitcnt 49279
- // GFX10: rocdl.s.waitcnt 49279
- // GFX11: rocdl.s.waitcnt 64519
- amdgpu.waitcnt lgkmcnt(0)
-
- return
-}
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index 82dd2bec248a7..fe78b5365745a 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -549,17 +549,19 @@ func.func @gather_to_lds(%idx1 : index, %idx2 : index, %mem1 : memref<32xf16>, %
func.return
}
-// CHECK-LABEL: func @waitcnt
-func.func @waitcnt() {
- // CHECK: amdgpu.waitcnt vmcnt(1) expcnt(2) lgkmcnt(3)
- // CHECK: amdgpu.waitcnt vmcnt(3) expcnt(2) lgkmcnt(1)
- // CHECK: amdgpu.waitcnt vmcnt(1)
- // CHECK: amdgpu.waitcnt expcnt(2)
- // CHECK: amdgpu.waitcnt lgkmcnt(3)
- amdgpu.waitcnt vmcnt(1) expcnt(2) lgkmcnt(3)
- amdgpu.waitcnt lgkmcnt(1) expcnt(2) vmcnt(3)
- amdgpu.waitcnt vmcnt(1)
- amdgpu.waitcnt expcnt(2)
- amdgpu.waitcnt lgkmcnt(3)
+// CHECK-LABEL: func @memory_counter_wait
+func.func @memory_counter_wait() {
+ // CHECK: amdgpu.memory_counter_wait load(1) store(2) ds(3) exp(4)
+ // CHECK: amdgpu.memory_counter_wait load(4) store(2) ds(3) exp(1)
+ // CHECK: amdgpu.memory_counter_wait load(1)
+ // CHECK: amdgpu.memory_counter_wait store(2)
+ // CHECK: amdgpu.memory_counter_wait ds(3)
+ // CHECK: amdgpu.memory_counter_wait exp(4)
+ amdgpu.memory_counter_wait load(1) store(2) ds(3) exp(4)
+ amdgpu.memory_counter_wait exp(1) store(2) ds(3) load(4)
+ amdgpu.memory_counter_wait load(1)
+ amdgpu.memory_counter_wait store(2)
+ amdgpu.memory_counter_wait ds(3)
+ amdgpu.memory_counter_wait exp(4)
func.return
}
>From b02a25cb8a0746340d1b1bf826f6f42a3f880162 Mon Sep 17 00:00:00 2001
From: Ivan Butygin <ivan.butygin at gmail.com>
Date: Tue, 22 Jul 2025 19:41:42 +0200
Subject: [PATCH 5/6] erase op
---
mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp | 1 +
1 file changed, 1 insertion(+)
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 057dfced09087..0501aa968347f 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -491,6 +491,7 @@ struct MemoryCounterWaitOpLowering
if (auto exp = adaptor.getExp())
rewriter.create<ROCDL::WaitExpcntOp>(loc, *exp);
+ rewriter.eraseOp(op);
return success();
}
>From 0660da5c64bcba3b6f3e52d8ed8553607566b39f Mon Sep 17 00:00:00 2001
From: Ivan Butygin <ivan.butygin at gmail.com>
Date: Tue, 22 Jul 2025 20:46:08 +0200
Subject: [PATCH 6/6] comments
---
.../AMDGPUToROCDL/AMDGPUToROCDL.cpp | 26 +++++++++----------
1 file changed, 13 insertions(+), 13 deletions(-)
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 0501aa968347f..93d220f56026b 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -421,15 +421,15 @@ struct RawBufferOpLowering : public ConvertOpToLLVMPattern<GpuOp> {
// TODO: AMDGPU backend already have all this bitpacking logic, we should move
// it to some common place.
-/// \details \p Vmcnt, \p Expcnt and \p Lgkmcnt are decoded as follows:
-/// \p Vmcnt = \p Waitcnt[3:0] (pre-gfx9)
-/// \p Vmcnt = \p Waitcnt[15:14,3:0] (gfx9,10)
-/// \p Vmcnt = \p Waitcnt[15:10] (gfx11)
-/// \p Expcnt = \p Waitcnt[6:4] (pre-gfx11)
-/// \p Expcnt = \p Waitcnt[2:0] (gfx11)
-/// \p Lgkmcnt = \p Waitcnt[11:8] (pre-gfx10)
-/// \p Lgkmcnt = \p Waitcnt[13:8] (gfx10)
-/// \p Lgkmcnt = \p Waitcnt[9:4] (gfx11)
+/// Vmcnt, Expcnt and Lgkmcnt are decoded as follows:
+/// Vmcnt = Waitcnt[3:0] (pre-gfx9)
+/// Vmcnt = Waitcnt[15:14,3:0] (gfx9,10)
+/// Vmcnt = Waitcnt[15:10] (gfx11)
+/// Expcnt = Waitcnt[6:4] (pre-gfx11)
+/// Expcnt = Waitcnt[2:0] (gfx11)
+/// Lgkmcnt = Waitcnt[11:8] (pre-gfx10)
+/// Lgkmcnt = Waitcnt[13:8] (gfx10)
+/// Lgkmcnt = Waitcnt[9:4] (gfx11)
static FailureOr<unsigned> encodeWaitcnt(Chipset chipset, unsigned vmcnt,
unsigned expcnt, unsigned lgkmcnt) {
if (chipset.majorVersion < 9) {
@@ -479,16 +479,16 @@ struct MemoryCounterWaitOpLowering
ConversionPatternRewriter &rewriter) const override {
if (chipset.majorVersion >= 12) {
Location loc = op.getLoc();
- if (auto ds = adaptor.getDs())
+ if (std::optional<int> ds = adaptor.getDs())
rewriter.create<ROCDL::WaitDscntOp>(loc, *ds);
- if (auto load = adaptor.getLoad())
+ if (std::optional<int> load = adaptor.getLoad())
rewriter.create<ROCDL::WaitLoadcntOp>(loc, *load);
- if (auto store = adaptor.getStore())
+ if (std::optional<int> store = adaptor.getStore())
rewriter.create<ROCDL::WaitStorecntOp>(loc, *store);
- if (auto exp = adaptor.getExp())
+ if (std::optional<int> exp = adaptor.getExp())
rewriter.create<ROCDL::WaitExpcntOp>(loc, *exp);
rewriter.eraseOp(op);
More information about the Mlir-commits
mailing list