[Mlir-commits] [mlir] f88d060 - [mlir][amdgpu] `memory_counter_wait` tensor counter support (#171153)

Mon Dec 8 09:02:44 PST 2025

Author: Ivan Butygin
Date: 2025-12-08T20:02:40+03:00
New Revision: f88d060c4176d17df56587a083944637ca865cb3

URL: https://github.com/llvm/llvm-project/commit/f88d060c4176d17df56587a083944637ca865cb3
DIFF: https://github.com/llvm/llvm-project/commit/f88d060c4176d17df56587a083944637ca865cb3.diff

LOG: [mlir][amdgpu] `memory_counter_wait` tensor counter support (#171153)

Added: 
    mlir/test/Conversion/AMDGPUToROCDL/memory_counter_wait_tensor.mlir
    mlir/test/Conversion/AMDGPUToROCDL/memory_counter_wait_unsupported.mlir

Modified: 
    mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
    mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
    mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
    mlir/test/Conversion/AMDGPUToROCDL/memory_counter_wait.mlir
    mlir/test/Dialect/AMDGPU/canonicalize.mlir
    mlir/test/Dialect/AMDGPU/ops.mlir

Removed: 
    


################################################################################
diff  --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index ba078f52d24f6..56160d3e8fe85 100644

--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -906,7 +906,8 @@ def AMDGPU_MemoryCounterWaitOp :
       OptionalAttr<I32Attr>:$load,
       OptionalAttr<I32Attr>:$store,
       OptionalAttr<I32Attr>:$ds,
-      OptionalAttr<I32Attr>:$exp
+      OptionalAttr<I32Attr>:$exp,
+      OptionalAttr<I32Attr>:$tensor
     )>
   {
   let summary = "Wait for specified hardware counters";
@@ -919,7 +920,7 @@ def AMDGPU_MemoryCounterWaitOp :
     counters into one.
   }];
   let assemblyFormat = [{
-    oilist( `load` `(` $load `)` | `store` `(` $store `)` | `ds` `(` $ds `)` | `exp` `(` $exp `)` ) attr-dict
+    oilist( `load` `(` $load `)` | `store` `(` $store `)` | `ds` `(` $ds `)` | `exp` `(` $exp `)` | `tensor` `(` $tensor `)` ) attr-dict
   }];
 
   let hasCanonicalizer = 1;

diff  --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index f3b0da0120998..7584b17075225 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -506,10 +506,16 @@ struct MemoryCounterWaitOpLowering
       if (std::optional<int> exp = adaptor.getExp())
         ROCDL::WaitExpcntOp::create(rewriter, loc, *exp);
 
+      if (std::optional<int> tensor = adaptor.getTensor())
+        ROCDL::WaitTensorcntOp::create(rewriter, loc, *tensor);
+
       rewriter.eraseOp(op);
       return success();
     }
 
+    if (adaptor.getTensor())
+      return op.emitOpError("unsupported chipset");
+
     auto getVal = [](Attribute attr) -> unsigned {
       if (attr)
         return cast<IntegerAttr>(attr).getInt();

diff  --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
index 4a85db3ecf6f8..b7a665b0f5367 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
@@ -614,10 +614,12 @@ struct FuseMemoryCounterWaitOp final : OpRewritePattern<MemoryCounterWaitOp> {
 
     auto setters = {&MemoryCounterWaitOp::setLoad,
                     &MemoryCounterWaitOp::setStore, &MemoryCounterWaitOp::setDs,
-                    &MemoryCounterWaitOp::setExp};
-    auto lhsVals = {op.getLoad(), op.getStore(), op.getDs(), op.getExp()};
+                    &MemoryCounterWaitOp::setExp,
+                    &MemoryCounterWaitOp::setTensor};
+    auto lhsVals = {op.getLoad(), op.getStore(), op.getDs(), op.getExp(),
+                    op.getTensor()};
     auto rhsVals = {next.getLoad(), next.getStore(), next.getDs(),
-                    next.getExp()};
+                    next.getExp(), next.getTensor()};
     rewriter.modifyOpInPlace(op, [&] {
       for (auto [setter, lhs, rhs] :
            llvm::zip_equal(setters, lhsVals, rhsVals)) {

diff  --git a/mlir/test/Conversion/AMDGPUToROCDL/memory_counter_wait.mlir b/mlir/test/Conversion/AMDGPUToROCDL/memory_counter_wait.mlir
index 1016ee859e462..537ef59b503a6 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/memory_counter_wait.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/memory_counter_wait.mlir
@@ -1,7 +1,7 @@
-// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx942 | FileCheck %s --check-prefixes=CHECK,GFX9
-// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx1030 | FileCheck %s --check-prefixes=CHECK,GFX10
-// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx1100 | FileCheck %s --check-prefixes=CHECK,GFX11
-// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx1201 | FileCheck %s --check-prefixes=CHECK,GFX12
+// RUN: mlir-opt %s --convert-amdgpu-to-rocdl=chipset=gfx942 | FileCheck %s --check-prefixes=CHECK,GFX9
+// RUN: mlir-opt %s --convert-amdgpu-to-rocdl=chipset=gfx1030 | FileCheck %s --check-prefixes=CHECK,GFX10
+// RUN: mlir-opt %s --convert-amdgpu-to-rocdl=chipset=gfx1100 | FileCheck %s --check-prefixes=CHECK,GFX11
+// RUN: mlir-opt %s --convert-amdgpu-to-rocdl=chipset=gfx1201 | FileCheck %s --check-prefixes=CHECK,GFX12
 
 // CHECK-LABEL: func @memory_counter_wait
 func.func @memory_counter_wait() {

diff  --git a/mlir/test/Conversion/AMDGPUToROCDL/memory_counter_wait_tensor.mlir b/mlir/test/Conversion/AMDGPUToROCDL/memory_counter_wait_tensor.mlir
new file mode 100644
index 0000000000000..5b29e01abebdb
--- /dev/null
+++ b/mlir/test/Conversion/AMDGPUToROCDL/memory_counter_wait_tensor.mlir
@@ -0,0 +1,9 @@
+// RUN: mlir-opt %s --convert-amdgpu-to-rocdl=chipset=gfx1250 | FileCheck %s
+
+// CHECK-LABEL: func @memory_counter_wait_tensor
+func.func @memory_counter_wait_tensor() {
+  // CHECK: rocdl.s.wait.tensorcnt 3
+  amdgpu.memory_counter_wait tensor(3)
+
+  return
+}

diff  --git a/mlir/test/Conversion/AMDGPUToROCDL/memory_counter_wait_unsupported.mlir b/mlir/test/Conversion/AMDGPUToROCDL/memory_counter_wait_unsupported.mlir
new file mode 100644
index 0000000000000..1d2f692bee488
--- /dev/null
+++ b/mlir/test/Conversion/AMDGPUToROCDL/memory_counter_wait_unsupported.mlir
@@ -0,0 +1,11 @@
+// RUN: mlir-opt %s --verify-diagnostics --convert-amdgpu-to-rocdl=chipset=gfx942
+// RUN: mlir-opt %s --verify-diagnostics --convert-amdgpu-to-rocdl=chipset=gfx1030
+// RUN: mlir-opt %s --verify-diagnostics --convert-amdgpu-to-rocdl=chipset=gfx1100
+
+func.func @memory_counter_wait_tensor() {
+  // expected-error @below{{failed to legalize operation 'amdgpu.memory_counter_wait'}}
+  // expected-error @below{{'amdgpu.memory_counter_wait' op unsupported chipset}}
+  amdgpu.memory_counter_wait tensor(0)
+
+  return
+}

diff  --git a/mlir/test/Dialect/AMDGPU/canonicalize.mlir b/mlir/test/Dialect/AMDGPU/canonicalize.mlir
index c66e9ed5d6f6d..cff1d3f2ac1fd 100644
--- a/mlir/test/Dialect/AMDGPU/canonicalize.mlir
+++ b/mlir/test/Dialect/AMDGPU/canonicalize.mlir
@@ -250,10 +250,10 @@ func.func @scaled_mfma_ugly_shapes(%opA: vector<32xf4E2M1FN>, %opB: vector<32xf4
 // CHECK-LABEL fuse_memory_counter_wait
 func.func @fuse_memory_counter_wait() {
   //      CHECK: amdgpu.memory_counter_wait
-  // CHECK-SAME: load(1) store(2) ds(2) exp(1)
+  // CHECK-SAME: load(1) store(2) ds(2) exp(1) tensor(0)
   // CHECK-NEXT: return
-  amdgpu.memory_counter_wait load(1) store(2) ds(3) exp(4)
-  amdgpu.memory_counter_wait load(4) store(3) ds(2) exp(1)
+  amdgpu.memory_counter_wait load(1) store(2) ds(3) exp(4) tensor(5)
+  amdgpu.memory_counter_wait load(4) store(3) ds(2) exp(1) tensor(0)
   return
 }
 

diff  --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index cf3f7a9cb08a2..651aff4a0d22a 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -671,18 +671,20 @@ func.func @gather_to_lds(%idx1 : index, %idx2 : index, %mem1 : memref<32xf16>, %
 
 // CHECK-LABEL: func @memory_counter_wait
 func.func @memory_counter_wait() {
-  // CHECK: amdgpu.memory_counter_wait load(1) store(2) ds(3) exp(4)
-  // CHECK: amdgpu.memory_counter_wait load(4) store(2) ds(3) exp(1)
+  // CHECK: amdgpu.memory_counter_wait load(1) store(2) ds(3) exp(4) tensor(5)
+  // CHECK: amdgpu.memory_counter_wait load(4) store(2) ds(3) exp(1) tensor(0)
   // CHECK: amdgpu.memory_counter_wait load(1)
   // CHECK: amdgpu.memory_counter_wait store(2)
   // CHECK: amdgpu.memory_counter_wait ds(3)
   // CHECK: amdgpu.memory_counter_wait exp(4)
-  amdgpu.memory_counter_wait load(1) store(2) ds(3) exp(4)
-  amdgpu.memory_counter_wait exp(1) store(2) ds(3) load(4)
+  // CHECK: amdgpu.memory_counter_wait tensor(5)
+  amdgpu.memory_counter_wait load(1) store(2) ds(3) exp(4) tensor(5)
+  amdgpu.memory_counter_wait tensor(0) exp(1) store(2) ds(3) load(4)
   amdgpu.memory_counter_wait load(1)
   amdgpu.memory_counter_wait store(2)
   amdgpu.memory_counter_wait ds(3)
   amdgpu.memory_counter_wait exp(4)
+  amdgpu.memory_counter_wait tensor(5)
   func.return
 }