[Mlir-commits] [mlir] [mlir][nvgpu] update commit group and wait async ops (PR #130482)

lonely eagle llvmlistbot at llvm.org
Thu Mar 13 19:44:58 PDT 2025


https://github.com/linuxlonelyeagle updated https://github.com/llvm/llvm-project/pull/130482

>From 2dbb60534c2966f7616ddbd72d600f1cf9ba70aa Mon Sep 17 00:00:00 2001
From: linuxlonelyeagle <2020382038 at qq.com>
Date: Sun, 9 Mar 2025 17:35:14 +0800
Subject: [PATCH 1/3] update async wait op.

---
 .../include/mlir/Dialect/NVGPU/IR/NVGPUOps.td |  5 ++---
 .../NVGPU/Transforms/CreateAsyncGroups.cpp    |  7 ++-----
 .../Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir |  6 +++---
 .../Dialect/NVGPU/optimize-shared-memory.mlir | 14 +++++++-------
 mlir/test/Dialect/NVGPU/roundtrip.mlir        |  4 ++--
 .../NVGPU/transform-create-async-groups.mlir  |  6 +++---
 .../NVGPU/transform-pipeline-shared.mlir      | 19 +++++++------------
 7 files changed, 26 insertions(+), 35 deletions(-)

diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td
index eb0fb90d271ed..8d8eddf18efc2 100644
--- a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td
+++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td
@@ -294,10 +294,9 @@ def NVGPU_DeviceAsyncWaitOp : NVGPU_Op<"device_async_wait", []> {
     nvgpu.device_async_wait %0
     ```
   }];
-  let arguments = (ins NVGPU_DeviceAsyncToken:$asyncDependencies,
-                       OptionalAttr<I32Attr>:$numGroups);
+  let arguments = (ins OptionalAttr<I32Attr>:$numGroups);
   let assemblyFormat = [{
-    $asyncDependencies attr-dict
+    attr-dict
   }];
 }
 
diff --git a/mlir/lib/Dialect/NVGPU/Transforms/CreateAsyncGroups.cpp b/mlir/lib/Dialect/NVGPU/Transforms/CreateAsyncGroups.cpp
index 10bc1993ffd96..ed37af096751f 100644
--- a/mlir/lib/Dialect/NVGPU/Transforms/CreateAsyncGroups.cpp
+++ b/mlir/lib/Dialect/NVGPU/Transforms/CreateAsyncGroups.cpp
@@ -265,11 +265,8 @@ void nvgpu::createAsyncGroups(RewriterBase &rewriter, Operation *op,
     }
 
     // Create the group and wait for it right after.
-    Value groupToken = rewriter.create<nvgpu::DeviceAsyncCreateGroupOp>(
-        op->getLoc(), nvgpu::DeviceAsyncTokenType::get(op->getContext()),
-        tokens);
-    rewriter.create<nvgpu::DeviceAsyncWaitOp>(op->getLoc(), groupToken,
-                                              nullptr);
+    rewriter.create<nvgpu::DeviceAsyncCreateGroupOp>(op->getLoc(), nvgpu::DeviceAsyncTokenType::get(op->getContext()), tokens);
+    rewriter.create<nvgpu::DeviceAsyncWaitOp>(op->getLoc(), nullptr);
     // Clean up old stores.
     for (Operation *writeOp : group)
       rewriter.eraseOp(writeOp);
diff --git a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
index 6b59b5e4343b4..7cbf39cb97dc8 100644
--- a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
@@ -243,7 +243,7 @@ func.func @async_cp(
   // CHECK: nvvm.cp.async.commit.group
   %1 = nvgpu.device_async_create_group %0
   // CHECK: nvvm.cp.async.wait.group 1
-  nvgpu.device_async_wait %1 { numGroups = 1 : i32 }
+  nvgpu.device_async_wait { numGroups = 1 : i32 }
 
   // CHECK: nvvm.cp.async.shared.global %{{.*}}, %{{.*}}, 16, cache = cg
   %2 = nvgpu.device_async_copy %src[%i, %i], %dst[%i, %i, %i], 4 {bypassL1}: memref<128x128xf32> to memref<3x16x128xf32, 3>
@@ -301,7 +301,7 @@ func.func @async_cp_zfill_f32_align4(
   // CHECK: nvvm.cp.async.commit.group
   %1 = nvgpu.device_async_create_group %0
   // CHECK: nvvm.cp.async.wait.group 1
-  nvgpu.device_async_wait %1 { numGroups = 1 : i32 }
+  nvgpu.device_async_wait { numGroups = 1 : i32 }
 
   return
 }
@@ -336,7 +336,7 @@ func.func @async_cp_zfill_f32_align1(
   // CHECK: nvvm.cp.async.commit.group
   %1 = nvgpu.device_async_create_group %0
   // CHECK: nvvm.cp.async.wait.group 1
-  nvgpu.device_async_wait %1 { numGroups = 1 : i32 }
+  nvgpu.device_async_wait { numGroups = 1 : i32 }
 
   return
 }
diff --git a/mlir/test/Dialect/NVGPU/optimize-shared-memory.mlir b/mlir/test/Dialect/NVGPU/optimize-shared-memory.mlir
index 7477e18728677..144e422f6c2b3 100644
--- a/mlir/test/Dialect/NVGPU/optimize-shared-memory.mlir
+++ b/mlir/test/Dialect/NVGPU/optimize-shared-memory.mlir
@@ -20,7 +20,7 @@ func.func @optimize_128x32xf16_32x128xf16(%arg0: memref<128x128xf16>,
   %0 = nvgpu.device_async_copy %arg0[%ldRow, %ldCol], %shm[%stRow, %stCol], 8
       : memref<128x128xf16> to memref<128x32xf16, 3>
   %1 = nvgpu.device_async_create_group %0
-  nvgpu.device_async_wait %1 { numGroups = 1 : i32}
+  nvgpu.device_async_wait { numGroups = 1 : i32}
 
   // CHECK: [[c6:%.+]] = arith.constant 6 : index
   // CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c6]]
@@ -40,7 +40,7 @@ func.func @optimize_128x32xf16_32x128xf16(%arg0: memref<128x128xf16>,
   %2 = nvgpu.device_async_copy %arg0[%ldRow, %ldCol], %shmB[%stRow, %stCol], 8
       : memref<128x128xf16> to memref<32x128xf16, 3>
   %3 = nvgpu.device_async_create_group %0
-  nvgpu.device_async_wait %1 { numGroups = 1 : i32}
+  nvgpu.device_async_wait { numGroups = 1 : i32}
 
   // CHECK: [[c15:%.+]] = arith.constant 15 : index
   // CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c15]]
@@ -77,7 +77,7 @@ func.func @optimize_64x16xf32_16x64xf32(%arg0: memref<128x128xf32>,
   %0 = nvgpu.device_async_copy %arg0[%ldRow, %ldCol], %shm[%stRow, %stCol], 4
       : memref<128x128xf32> to memref<64x16xf32, 3>
   %1 = nvgpu.device_async_create_group %0
-  nvgpu.device_async_wait %1 { numGroups = 1 : i32}
+  nvgpu.device_async_wait { numGroups = 1 : i32}
 
   // CHECK: [[c6:%.+]] = arith.constant 6 : index
   // CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c6]]
@@ -133,7 +133,7 @@ func.func @optimize_64x16xf32_16x64xf32(%arg0: memref<128x128xf32>,
   %2 = nvgpu.device_async_copy %arg0[%ldRow, %ldCol], %shmB[%stRow, %stCol], 4
       : memref<128x128xf32> to memref<16x64xf32, 3>
   %3 = nvgpu.device_async_create_group %0
-  nvgpu.device_async_wait %1 { numGroups = 1 : i32}
+  nvgpu.device_async_wait { numGroups = 1 : i32}
 
   // CHECK: [[c15:%.+]] = arith.constant 15 : index
   // CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c15]]
@@ -178,7 +178,7 @@ func.func @small_column_size_f64(%arg0: memref<32x32xf64>,
   %0 = nvgpu.device_async_copy %arg0[%ldRow, %ldCol], %shm[%stRow, %stCol], 2
       : memref<32x32xf64> to memref<32x4xf64, 3>
   %1 = nvgpu.device_async_create_group %0
-  nvgpu.device_async_wait %1 { numGroups = 1 : i32}
+  nvgpu.device_async_wait { numGroups = 1 : i32}
 
   // CHECK: [[c6:%.+]] = arith.constant 4 : index
   // CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c6]]
@@ -204,7 +204,7 @@ func.func @too_small_column_size_f16(%arg0: memref<128x128xf16>,
   %0 = nvgpu.device_async_copy %arg0[%ldRow, %ldCol], %shm[%stRow, %stCol], 8
       : memref<128x128xf16> to memref<128x8xf16, 3>
   %1 = nvgpu.device_async_create_group %0
-  nvgpu.device_async_wait %1 { numGroups = 1 : i32}
+  nvgpu.device_async_wait { numGroups = 1 : i32}
 
   // CHECK: nvgpu.ldmatrix [[shm]][[[fragRow]], [[fragCol]]]
   %mat = nvgpu.ldmatrix %shm[%fragRow, %fragCol] {numTiles = 1 : i32, transpose = false}
@@ -230,7 +230,7 @@ func.func @abort_if_subview(%arg0: memref<128x128xf16>,
   %0 = nvgpu.device_async_copy %arg0[%ldRow, %ldCol], %shm[%stRow, %stCol], 8
       : memref<128x128xf16> to memref<128x32xf16, 3>
   %1 = nvgpu.device_async_create_group %0
-  nvgpu.device_async_wait %1 { numGroups = 1 : i32}
+  nvgpu.device_async_wait { numGroups = 1 : i32}
 
   // CHECK: nvgpu.ldmatrix [[shmView]][[[fragRow]], [[fragCol]]]
   %mat = nvgpu.ldmatrix %shmView[%fragRow, %fragCol] {numTiles = 1 : i32, transpose = false}
diff --git a/mlir/test/Dialect/NVGPU/roundtrip.mlir b/mlir/test/Dialect/NVGPU/roundtrip.mlir
index ad516b4d2c200..71f8f52bcbc64 100644
--- a/mlir/test/Dialect/NVGPU/roundtrip.mlir
+++ b/mlir/test/Dialect/NVGPU/roundtrip.mlir
@@ -65,7 +65,7 @@ func.func @async_cp(%dst : memref<2x7x5xf32, 3>, %src : memref<4x5xf32>){
   %0 = nvgpu.device_async_copy %src[%c0, %c0], %dst[%c0, %c0, %c0], 4 : memref<4x5xf32> to memref<2x7x5xf32, 3>
   // CHECK: %{{.*}} = nvgpu.device_async_create_group
   %token = nvgpu.device_async_create_group %0
-  // CHECK: nvgpu.device_async_wait %{{.*}} {numGroups = 1 : i32}
-  nvgpu.device_async_wait %token {numGroups = 1 : i32}
+  // CHECK: nvgpu.device_async_wait {numGroups = 1 : i32}
+  nvgpu.device_async_wait {numGroups = 1 : i32}
   return
 }
diff --git a/mlir/test/Dialect/NVGPU/transform-create-async-groups.mlir b/mlir/test/Dialect/NVGPU/transform-create-async-groups.mlir
index 8290001c45856..f325dac6f7303 100644
--- a/mlir/test/Dialect/NVGPU/transform-create-async-groups.mlir
+++ b/mlir/test/Dialect/NVGPU/transform-create-async-groups.mlir
@@ -18,7 +18,7 @@ builtin.module {
     %2 = vector.transfer_read %a[%c0, %c4], %cst_0 {in_bounds = [true]} : memref<1024x1024xf32>, vector<1xf32>
     vector.transfer_write %2, %0[%c0, %c4, %c0] {in_bounds = [true]} : vector<1xf32>, memref<4x32x16xf32, #gpu.address_space<workgroup>>
     // CHECK: %[[G:.*]] = nvgpu.device_async_create_group %[[CP0]], %[[CP1]]
-    // CHECK: nvgpu.device_async_wait %[[G]]
+    // CHECK: nvgpu.device_async_wait
     return
   }
 
@@ -52,7 +52,7 @@ builtin.module {
     %2 = vector.transfer_read %a[%c0, %c4], %cst_0 {in_bounds = [true]} : memref<1024x1024xf32>, vector<1xf32>
     vector.transfer_write %2, %0[%c0, %c4, %c0] {in_bounds = [true]} : vector<1xf32>, memref<4x32x16xf32, #gpu.address_space<workgroup>>
     // CHECK: %[[G:.*]] = nvgpu.device_async_create_group %[[CP0]], %[[CP1]]
-    // CHECK: nvgpu.device_async_wait %[[G]]
+    // CHECK: nvgpu.device_async_wait
     return
   }
 
@@ -84,7 +84,7 @@ builtin.module {
     %2 = vector.load %a[%c0, %c4] : memref<1024x1024xf32>, vector<1xf32>
     vector.store %2, %0[%c0, %c4, %c0] : memref<4x32x16xf32, #gpu.address_space<workgroup>>, vector<1xf32>
     // CHECK: %[[G:.*]] = nvgpu.device_async_create_group %[[CP0]], %[[CP1]]
-    // CHECK: nvgpu.device_async_wait %[[G]]
+    // CHECK: nvgpu.device_async_wait
     return
   }
 
diff --git a/mlir/test/Dialect/NVGPU/transform-pipeline-shared.mlir b/mlir/test/Dialect/NVGPU/transform-pipeline-shared.mlir
index e959949babd9e..e93a6a40391bb 100644
--- a/mlir/test/Dialect/NVGPU/transform-pipeline-shared.mlir
+++ b/mlir/test/Dialect/NVGPU/transform-pipeline-shared.mlir
@@ -94,13 +94,11 @@ func.func @async_depth_2_predicated(%global: memref<?xf32>, %alloc_size: index)
   %c0f = arith.constant 0.0 : f32
   // CHECK: %[[TOKEN0:.+]] = nvgpu.device_async_copy
   // CHECK: %[[TOKEN1:.+]] = nvgpu.device_async_copy
-  // CHECK: scf.for %[[I:.+]] = {{.*}} iter_args
-  // CHECK-SAME: %[[ITER_ARG0:.+]] = %[[TOKEN0]]
-  // CHECK-SAME: %[[ITER_ARG1:.+]] = %[[TOKEN1]]
+  // CHECK: scf.for %[[I:.+]] = {{.*}}
   scf.for %i = %c0 to %c98 step %c4 {
     // Condition for the predication "select" below.
     // CHECK:   %[[CMP0:.+]] = arith.cmpi slt, %[[I]], %[[C90]]
-    // CHECK:   nvgpu.device_async_wait %[[ITER_ARG0]] {numGroups = 1
+    // CHECK:   nvgpu.device_async_wait {numGroups = 1
     // Original "select" with updated induction variable.
     // CHECK:   %[[I_PLUS_8:.+]] = arith.addi %[[I]], %[[C8]]
     // CHECK:   %[[CMP1:.+]] = arith.cmpi slt, %[[I_PLUS_8]], %[[C96]]
@@ -122,9 +120,7 @@ func.func @async_depth_2_predicated(%global: memref<?xf32>, %alloc_size: index)
     %token = nvgpu.device_async_copy %global[%i], %shared[%i], 4, %read_size
       : memref<?xf32> to memref<?xf32, #gpu.address_space<workgroup>>
 
-    nvgpu.device_async_wait %token
-
-    // CHECK: scf.yield %[[ITER_ARG1]], %[[ASYNC_TOKEN]]
+    nvgpu.device_async_wait
   }
  // There is no need to wait for the last copies as it was fully predicated
   // out and doesn't load the original data.
@@ -156,12 +152,11 @@ func.func @async_depth_2_peeled(%global: memref<?xf32>) {
   // CHECK: nvgpu.device_async_copy
   // CHECK: nvgpu.device_async_copy
   // CHECK: scf.for
-  // CHECK:   nvgpu.device_async_wait %{{.*}} {numGroups = 1
+  // CHECK:   nvgpu.device_async_wait {numGroups = 1
   // CHECK:   arith.select
   // CHECK:   nvgpu.device_async_copy
-  // CHECK:   scf.yield
-  // CHECK: nvgpu.device_async_wait %{{.*}} {numGroups = 1
-  // CHECK: nvgpu.device_async_wait %{{.*}} {numGroups = 0
+  // CHECK: nvgpu.device_async_wait {numGroups = 1
+  // CHECK: nvgpu.device_async_wait {numGroups = 0
   scf.for %i = %c0 to %c98 step %c4 {
     %c96 = arith.constant 96 : index
     %cond = arith.cmpi slt, %i, %c96 : index
@@ -169,7 +164,7 @@ func.func @async_depth_2_peeled(%global: memref<?xf32>) {
     %read_size = arith.select %cond, %c4, %c2 : index
     %token = nvgpu.device_async_copy %global[%i], %shared[%i], 4, %read_size
       : memref<?xf32> to memref<?xf32, #gpu.address_space<workgroup>>
-    nvgpu.device_async_wait %token
+    nvgpu.device_async_wait
   }
   return
 }

>From 2001dc0c4283baacadaaf5e93132a0f7b527c36f Mon Sep 17 00:00:00 2001
From: linuxlonelyeagle <2020382038 at qq.com>
Date: Sun, 9 Mar 2025 18:52:08 +0800
Subject: [PATCH 2/3] update create_group op and update doc.

---
 .../include/mlir/Dialect/NVGPU/IR/NVGPUOps.td | 32 +++++++++++--------
 .../Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp    |  7 +---
 .../NVGPU/Transforms/CreateAsyncGroups.cpp    |  2 +-
 .../Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir |  6 ++--
 .../Dialect/NVGPU/optimize-shared-memory.mlir | 14 ++++----
 mlir/test/Dialect/NVGPU/roundtrip.mlir        |  4 +--
 .../NVGPU/transform-create-async-groups.mlir  |  6 ++--
 7 files changed, 36 insertions(+), 35 deletions(-)

diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td
index 8d8eddf18efc2..03a9485e26bc7 100644
--- a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td
+++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td
@@ -216,15 +216,13 @@ def NVGPU_DeviceAsyncCopyOp : NVGPU_Op<"device_async_copy", [
     // copy 2.
     %cp2 = nvgpu.device_async_copy %C[%c0], %D[%c0], 4 : memref<16xf32> to memref<16xf32, 3>
     // group 1 contains copy 1 and copy 2.
-    %token1 = nvgpu.device_async_create_group %cp1, %cp2
+    nvgpu.device_async_create_group %cp1, %cp2
     // copy 3.
     %cp3 = nvgpu.device_async_copy %E[%c0], %F[%c0], 4 : memref<16xf32> to memref<16xf32, 3>
     // group 2 contains copy 3.
-    %token2 = nvgpu.device_async_create_group %cp3
-    // after the wait copy 1 and copy 2 are complete.
-    nvgpu.device_async_wait %token1
-    // after the wait copy 3 is complete.
-    nvgpu.device_async_wait %token2
+    nvgpu.device_async_create_group %cp3
+    // after the wait copy 1, copy 2 and copy 3 are complete.
+    nvgpu.device_async_wait
     ```
 
     Example:
@@ -255,9 +253,7 @@ def NVGPU_DeviceAsyncCreateGroupOp : NVGPU_Op<"device_async_create_group", []> {
     The `nvgpu.device_async_create_group` op creates a group of memory accesses
     containing all the pending `device_async_copy` operations associated with
     argument tokens. Each token can only be part of one group.
-
-    It returns a token that can be use to wait until the group fully completes.
-
+
     This is meant to be used with `nvgpu.device_async_wait` to synchronize copies
     as explained in those ops descriptions.
 
@@ -266,10 +262,10 @@ def NVGPU_DeviceAsyncCreateGroupOp : NVGPU_Op<"device_async_create_group", []> {
     Example:
 
     ```mlir
-    %0 = nvgpu.device_async_create_group
-  ```
+    %cp = nvgpu.device_async_copy %E[%c0], %F[%c0], 4 : memref<16xf32> to memref<16xf32, 3>
+    nvgpu.device_async_create_group %cp
+    ```
   }];
-  let results = (outs NVGPU_DeviceAsyncToken:$asyncToken);
   let arguments = (ins Variadic<NVGPU_DeviceAsyncToken>:$inputTokens);
   let assemblyFormat = [{
     $inputTokens attr-dict
@@ -291,7 +287,17 @@ def NVGPU_DeviceAsyncWaitOp : NVGPU_Op<"device_async_wait", []> {
     Example:
 
     ```mlir
-    nvgpu.device_async_wait %0
+    // copy 1.
+    %cp1 = nvgpu.device_async_copy %A[%c0], %B[%c0], 4 :memref<16xf32> to memref<16xf32, 3>
+    nvgpu.device_async_create_group %cp1
+    // copy 2.
+    %cp2 = nvgpu.device_async_copy %C[%c0], %D[%c0], 4 : memref<16xf32> to memref<16xf32, 3>
+    nvgpu.device_async_create_group %cp2
+    // copy 3.
+    %cp3 = nvgpu.device_async_copy %E[%c0], %F[%c0], 4 : memref<16xf32> to memref<16xf32, 3>
+    nvgpu.device_async_create_group %cp3
+    // after the wait copy 1 and copy 2 are complete.
+    nvgpu.device_async_wait {numGroups = 1 : i32}
     ```
   }];
   let arguments = (ins OptionalAttr<I32Attr>:$numGroups);
diff --git a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
index f53de416f2abd..3bf1fd04d1759 100644
--- a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
+++ b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
@@ -733,12 +733,7 @@ struct NVGPUAsyncCreateGroupLowering
   LogicalResult
   matchAndRewrite(nvgpu::DeviceAsyncCreateGroupOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
-    rewriter.create<NVVM::CpAsyncCommitGroupOp>(op.getLoc());
-    // Drop the result token.
-    Value zero = rewriter.create<LLVM::ConstantOp>(
-        op->getLoc(), IntegerType::get(op.getContext(), 32),
-        rewriter.getI32IntegerAttr(0));
-    rewriter.replaceOp(op, zero);
+    rewriter.replaceOpWithNewOp<NVVM::CpAsyncCommitGroupOp>(op);
     return success();
   }
 };
diff --git a/mlir/lib/Dialect/NVGPU/Transforms/CreateAsyncGroups.cpp b/mlir/lib/Dialect/NVGPU/Transforms/CreateAsyncGroups.cpp
index ed37af096751f..08794b2b328fa 100644
--- a/mlir/lib/Dialect/NVGPU/Transforms/CreateAsyncGroups.cpp
+++ b/mlir/lib/Dialect/NVGPU/Transforms/CreateAsyncGroups.cpp
@@ -265,7 +265,7 @@ void nvgpu::createAsyncGroups(RewriterBase &rewriter, Operation *op,
     }
 
     // Create the group and wait for it right after.
-    rewriter.create<nvgpu::DeviceAsyncCreateGroupOp>(op->getLoc(), nvgpu::DeviceAsyncTokenType::get(op->getContext()), tokens);
+    rewriter.create<nvgpu::DeviceAsyncCreateGroupOp>(op->getLoc(), tokens);
     rewriter.create<nvgpu::DeviceAsyncWaitOp>(op->getLoc(), nullptr);
     // Clean up old stores.
     for (Operation *writeOp : group)
diff --git a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
index 7cbf39cb97dc8..524eb3e1fa7b1 100644
--- a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
@@ -241,7 +241,7 @@ func.func @async_cp(
   // CHECK-DAG: nvvm.cp.async.shared.global %[[ADDRESSDST]], %[[CAST2]], 16, cache = ca
   %0 = nvgpu.device_async_copy %src[%i, %i], %dst[%i, %i, %i], 4 : memref<128x128xf32> to memref<3x16x128xf32, 3>
   // CHECK: nvvm.cp.async.commit.group
-  %1 = nvgpu.device_async_create_group %0
+  nvgpu.device_async_create_group %0
   // CHECK: nvvm.cp.async.wait.group 1
   nvgpu.device_async_wait { numGroups = 1 : i32 }
 
@@ -299,7 +299,7 @@ func.func @async_cp_zfill_f32_align4(
   // CHECK-DAG: nvvm.cp.async.shared.global %[[ADDRESSDST]], %[[CAST2]], 16, cache = cg, %[[c5]]
   %0 = nvgpu.device_async_copy %src[%i, %i], %dst[%i, %i, %i], 4, %srcElements {bypassL1}: memref<128x128xf32> to memref<3x16x128xf32, 3>
   // CHECK: nvvm.cp.async.commit.group
-  %1 = nvgpu.device_async_create_group %0
+  nvgpu.device_async_create_group %0
   // CHECK: nvvm.cp.async.wait.group 1
   nvgpu.device_async_wait { numGroups = 1 : i32 }
 
@@ -334,7 +334,7 @@ func.func @async_cp_zfill_f32_align1(
   // CHECK-DAG: nvvm.cp.async.shared.global %[[ADDRESSDST]], %[[CAST2]], 4, cache = ca, %[[c5]]
   %0 = nvgpu.device_async_copy %src[%i, %i], %dst[%i, %i, %i], 1, %srcElements : memref<128x128xf32> to memref<3x16x128xf32, 3>
   // CHECK: nvvm.cp.async.commit.group
-  %1 = nvgpu.device_async_create_group %0
+  nvgpu.device_async_create_group %0
   // CHECK: nvvm.cp.async.wait.group 1
   nvgpu.device_async_wait { numGroups = 1 : i32 }
 
diff --git a/mlir/test/Dialect/NVGPU/optimize-shared-memory.mlir b/mlir/test/Dialect/NVGPU/optimize-shared-memory.mlir
index 144e422f6c2b3..610afb56d3175 100644
--- a/mlir/test/Dialect/NVGPU/optimize-shared-memory.mlir
+++ b/mlir/test/Dialect/NVGPU/optimize-shared-memory.mlir
@@ -19,7 +19,7 @@ func.func @optimize_128x32xf16_32x128xf16(%arg0: memref<128x128xf16>,
   // CHECK: nvgpu.device_async_copy [[arg0]][[[ldRow]], [[ldCol]]], [[shm]][[[stRow]], [[stColPerm]]]
   %0 = nvgpu.device_async_copy %arg0[%ldRow, %ldCol], %shm[%stRow, %stCol], 8
       : memref<128x128xf16> to memref<128x32xf16, 3>
-  %1 = nvgpu.device_async_create_group %0
+  nvgpu.device_async_create_group %0
   nvgpu.device_async_wait { numGroups = 1 : i32}
 
   // CHECK: [[c6:%.+]] = arith.constant 6 : index
@@ -39,7 +39,7 @@ func.func @optimize_128x32xf16_32x128xf16(%arg0: memref<128x128xf16>,
   // CHECK: nvgpu.device_async_copy [[arg0]][[[ldRow]], [[ldCol]]], [[shmB]][[[stRow]], [[stColPerm]]]
   %2 = nvgpu.device_async_copy %arg0[%ldRow, %ldCol], %shmB[%stRow, %stCol], 8
       : memref<128x128xf16> to memref<32x128xf16, 3>
-  %3 = nvgpu.device_async_create_group %0
+  nvgpu.device_async_create_group %0
   nvgpu.device_async_wait { numGroups = 1 : i32}
 
   // CHECK: [[c15:%.+]] = arith.constant 15 : index
@@ -76,7 +76,7 @@ func.func @optimize_64x16xf32_16x64xf32(%arg0: memref<128x128xf32>,
   // CHECK: nvgpu.device_async_copy [[arg0]][[[ldRow]], [[ldCol]]], [[shm]][[[stRow]], [[stColPerm]]]
   %0 = nvgpu.device_async_copy %arg0[%ldRow, %ldCol], %shm[%stRow, %stCol], 4
       : memref<128x128xf32> to memref<64x16xf32, 3>
-  %1 = nvgpu.device_async_create_group %0
+  nvgpu.device_async_create_group %0
   nvgpu.device_async_wait { numGroups = 1 : i32}
 
   // CHECK: [[c6:%.+]] = arith.constant 6 : index
@@ -132,7 +132,7 @@ func.func @optimize_64x16xf32_16x64xf32(%arg0: memref<128x128xf32>,
   // CHECK: nvgpu.device_async_copy [[arg0]][[[ldRow]], [[ldCol]]], [[shmB]][[[stRow]], [[stColPerm]]]
   %2 = nvgpu.device_async_copy %arg0[%ldRow, %ldCol], %shmB[%stRow, %stCol], 4
       : memref<128x128xf32> to memref<16x64xf32, 3>
-  %3 = nvgpu.device_async_create_group %0
+  nvgpu.device_async_create_group %0
   nvgpu.device_async_wait { numGroups = 1 : i32}
 
   // CHECK: [[c15:%.+]] = arith.constant 15 : index
@@ -177,7 +177,7 @@ func.func @small_column_size_f64(%arg0: memref<32x32xf64>,
   // CHECK: nvgpu.device_async_copy [[arg0]][[[ldRow]], [[ldCol]]], [[shm]][[[stRow]], [[stColPerm]]]
   %0 = nvgpu.device_async_copy %arg0[%ldRow, %ldCol], %shm[%stRow, %stCol], 2
       : memref<32x32xf64> to memref<32x4xf64, 3>
-  %1 = nvgpu.device_async_create_group %0
+  nvgpu.device_async_create_group %0
   nvgpu.device_async_wait { numGroups = 1 : i32}
 
   // CHECK: [[c6:%.+]] = arith.constant 4 : index
@@ -203,7 +203,7 @@ func.func @too_small_column_size_f16(%arg0: memref<128x128xf16>,
   // CHECK: nvgpu.device_async_copy [[arg0]][[[ldRow]], [[ldCol]]], [[shm]][[[stRow]], [[stCol]]]
   %0 = nvgpu.device_async_copy %arg0[%ldRow, %ldCol], %shm[%stRow, %stCol], 8
       : memref<128x128xf16> to memref<128x8xf16, 3>
-  %1 = nvgpu.device_async_create_group %0
+  nvgpu.device_async_create_group %0
   nvgpu.device_async_wait { numGroups = 1 : i32}
 
   // CHECK: nvgpu.ldmatrix [[shm]][[[fragRow]], [[fragCol]]]
@@ -229,7 +229,7 @@ func.func @abort_if_subview(%arg0: memref<128x128xf16>,
   // CHECK: nvgpu.device_async_copy [[arg0]][[[ldRow]], [[ldCol]]], [[shm]][[[stRow]], [[stCol]]]
   %0 = nvgpu.device_async_copy %arg0[%ldRow, %ldCol], %shm[%stRow, %stCol], 8
       : memref<128x128xf16> to memref<128x32xf16, 3>
-  %1 = nvgpu.device_async_create_group %0
+  nvgpu.device_async_create_group %0
   nvgpu.device_async_wait { numGroups = 1 : i32}
 
   // CHECK: nvgpu.ldmatrix [[shmView]][[[fragRow]], [[fragCol]]]
diff --git a/mlir/test/Dialect/NVGPU/roundtrip.mlir b/mlir/test/Dialect/NVGPU/roundtrip.mlir
index 71f8f52bcbc64..dbd9c368d9e47 100644
--- a/mlir/test/Dialect/NVGPU/roundtrip.mlir
+++ b/mlir/test/Dialect/NVGPU/roundtrip.mlir
@@ -63,8 +63,8 @@ func.func @async_cp(%dst : memref<2x7x5xf32, 3>, %src : memref<4x5xf32>){
   %c0 = arith.constant 0 : index
   // CHECK: nvgpu.device_async_copy %{{.*}}[{{.*}}, {{.*}}], %{{.*}}[{{.*}}, {{.*}}, {{.*}}], 4 : memref<4x5xf32> to memref<2x7x5xf32, 3>
   %0 = nvgpu.device_async_copy %src[%c0, %c0], %dst[%c0, %c0, %c0], 4 : memref<4x5xf32> to memref<2x7x5xf32, 3>
-  // CHECK: %{{.*}} = nvgpu.device_async_create_group
-  %token = nvgpu.device_async_create_group %0
+  // CHECK: nvgpu.device_async_create_group
+  nvgpu.device_async_create_group %0
   // CHECK: nvgpu.device_async_wait {numGroups = 1 : i32}
   nvgpu.device_async_wait {numGroups = 1 : i32}
   return
diff --git a/mlir/test/Dialect/NVGPU/transform-create-async-groups.mlir b/mlir/test/Dialect/NVGPU/transform-create-async-groups.mlir
index f325dac6f7303..aaaeb50854dc4 100644
--- a/mlir/test/Dialect/NVGPU/transform-create-async-groups.mlir
+++ b/mlir/test/Dialect/NVGPU/transform-create-async-groups.mlir
@@ -17,7 +17,7 @@ builtin.module {
     // CHECK: %[[CP1:.*]] = nvgpu.device_async_copy {{.*}}, {{.*}}, 1
     %2 = vector.transfer_read %a[%c0, %c4], %cst_0 {in_bounds = [true]} : memref<1024x1024xf32>, vector<1xf32>
     vector.transfer_write %2, %0[%c0, %c4, %c0] {in_bounds = [true]} : vector<1xf32>, memref<4x32x16xf32, #gpu.address_space<workgroup>>
-    // CHECK: %[[G:.*]] = nvgpu.device_async_create_group %[[CP0]], %[[CP1]]
+    // CHECK: nvgpu.device_async_create_group %[[CP0]], %[[CP1]]
     // CHECK: nvgpu.device_async_wait
     return
   }
@@ -51,7 +51,7 @@ builtin.module {
     // CHECK: %[[CP1:.*]] = nvgpu.device_async_copy {{.*}}, {{.*}}, 1 :
     %2 = vector.transfer_read %a[%c0, %c4], %cst_0 {in_bounds = [true]} : memref<1024x1024xf32>, vector<1xf32>
     vector.transfer_write %2, %0[%c0, %c4, %c0] {in_bounds = [true]} : vector<1xf32>, memref<4x32x16xf32, #gpu.address_space<workgroup>>
-    // CHECK: %[[G:.*]] = nvgpu.device_async_create_group %[[CP0]], %[[CP1]]
+    // CHECK: nvgpu.device_async_create_group %[[CP0]], %[[CP1]]
     // CHECK: nvgpu.device_async_wait
     return
   }
@@ -83,7 +83,7 @@ builtin.module {
     // CHECK: %[[CP1:.*]] = nvgpu.device_async_copy {{.*}}, {{.*}}, 1 :
     %2 = vector.load %a[%c0, %c4] : memref<1024x1024xf32>, vector<1xf32>
     vector.store %2, %0[%c0, %c4, %c0] : memref<4x32x16xf32, #gpu.address_space<workgroup>>, vector<1xf32>
-    // CHECK: %[[G:.*]] = nvgpu.device_async_create_group %[[CP0]], %[[CP1]]
+    // CHECK: nvgpu.device_async_create_group %[[CP0]], %[[CP1]]
     // CHECK: nvgpu.device_async_wait
     return
   }

>From bc929f3bbc498ef12116ad1b5f19c5a723a4c4a7 Mon Sep 17 00:00:00 2001
From: linuxlonelyeagle <2020382038 at qq.com>
Date: Fri, 14 Mar 2025 10:44:35 +0800
Subject: [PATCH 3/3] update description and add prop-attr.

---
 mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td       | 12 +++++++-----
 mlir/include/mlir/Dialect/NVGPU/IR/NVGPUTypes.td     |  1 -
 mlir/test/Dialect/NVGPU/roundtrip.mlir               |  2 +-
 .../Dialect/NVGPU/transform-pipeline-shared.mlir     |  8 ++++----
 4 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td
index 03a9485e26bc7..581bd5005873d 100644
--- a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td
+++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td
@@ -275,14 +275,16 @@ def NVGPU_DeviceAsyncCreateGroupOp : NVGPU_Op<"device_async_create_group", []> {
 def NVGPU_DeviceAsyncWaitOp : NVGPU_Op<"device_async_wait", []> {
   let summary = "Wait for async gpu ops to complete.";
   let description = [{
-    The `nvgpu.device_async_wait` op will block the execution thread until the group
-    associated with the source token is fully completed.
+    The `nvgpu.device_async_wait` op will block the execution thread until only
+    `$numGroups` or fewer of the most recent async copy groups are pending and
+    all the prior async copy groups committed by the executing threads are complete.
 
     The optional `$numGroups` attribute gives an upper bound of the number of
     groups uncompleted when the wait can unblock the thread. For example,  if
    16 async groups are pushed and `$numGroups` is set to 12, then the thread
-    will unblock when 12 groups or fewer are in flight (4 groups have
-    completed).
+    will unblock when 12 groups or fewer are in flight (4 groups have completed).
+    Its default value is 0, which means the thread waits for all previously
+    committed groups to complete.
 
     Example:
 
@@ -302,7 +304,7 @@ def NVGPU_DeviceAsyncWaitOp : NVGPU_Op<"device_async_wait", []> {
   }];
   let arguments = (ins OptionalAttr<I32Attr>:$numGroups);
   let assemblyFormat = [{
-    attr-dict
+    prop-dict attr-dict
   }];
 }
 
diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUTypes.td b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUTypes.td
index 8836a1a9dfcd8..ee71a145734ae 100644
--- a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUTypes.td
+++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUTypes.td
@@ -10,7 +10,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-
 #ifndef MLIR_DIALECT_NVGPU_IR_NVGPUTYPES_TD
 #define MLIR_DIALECT_NVGPU_IR_NVGPUTYPES_TD
 
diff --git a/mlir/test/Dialect/NVGPU/roundtrip.mlir b/mlir/test/Dialect/NVGPU/roundtrip.mlir
index dbd9c368d9e47..bb79c288fd064 100644
--- a/mlir/test/Dialect/NVGPU/roundtrip.mlir
+++ b/mlir/test/Dialect/NVGPU/roundtrip.mlir
@@ -65,7 +65,7 @@ func.func @async_cp(%dst : memref<2x7x5xf32, 3>, %src : memref<4x5xf32>){
   %0 = nvgpu.device_async_copy %src[%c0, %c0], %dst[%c0, %c0, %c0], 4 : memref<4x5xf32> to memref<2x7x5xf32, 3>
   // CHECK: nvgpu.device_async_create_group
   nvgpu.device_async_create_group %0
-  // CHECK: nvgpu.device_async_wait {numGroups = 1 : i32}
+  // CHECK: nvgpu.device_async_wait <{numGroups = 1 : i32}>
   nvgpu.device_async_wait {numGroups = 1 : i32}
   return
 }
diff --git a/mlir/test/Dialect/NVGPU/transform-pipeline-shared.mlir b/mlir/test/Dialect/NVGPU/transform-pipeline-shared.mlir
index e93a6a40391bb..f17475622a240 100644
--- a/mlir/test/Dialect/NVGPU/transform-pipeline-shared.mlir
+++ b/mlir/test/Dialect/NVGPU/transform-pipeline-shared.mlir
@@ -98,7 +98,7 @@ func.func @async_depth_2_predicated(%global: memref<?xf32>, %alloc_size: index)
   scf.for %i = %c0 to %c98 step %c4 {
     // Condition for the predication "select" below.
     // CHECK:   %[[CMP0:.+]] = arith.cmpi slt, %[[I]], %[[C90]]
-    // CHECK:   nvgpu.device_async_wait {numGroups = 1
+    // CHECK:   nvgpu.device_async_wait <{numGroups = 1 : i32}>
     // Original "select" with updated induction variable.
     // CHECK:   %[[I_PLUS_8:.+]] = arith.addi %[[I]], %[[C8]]
     // CHECK:   %[[CMP1:.+]] = arith.cmpi slt, %[[I_PLUS_8]], %[[C96]]
@@ -152,11 +152,11 @@ func.func @async_depth_2_peeled(%global: memref<?xf32>) {
   // CHECK: nvgpu.device_async_copy
   // CHECK: nvgpu.device_async_copy
   // CHECK: scf.for
-  // CHECK:   nvgpu.device_async_wait {numGroups = 1
+  // CHECK:   nvgpu.device_async_wait <{numGroups = 1 : i32}>
   // CHECK:   arith.select
   // CHECK:   nvgpu.device_async_copy
-  // CHECK: nvgpu.device_async_wait {numGroups = 1
-  // CHECK: nvgpu.device_async_wait {numGroups = 0
+  // CHECK: nvgpu.device_async_wait <{numGroups = 1 : i32}>
+  // CHECK: nvgpu.device_async_wait <{numGroups = 0 : i32}>
   scf.for %i = %c0 to %c98 step %c4 {
     %c96 = arith.constant 96 : index
     %cond = arith.cmpi slt, %i, %c96 : index



More information about the Mlir-commits mailing list