[Mlir-commits] [mlir] [mlir][nvgpu] update commit group and wait async ops (PR #130482)
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Sun Mar 9 04:06:43 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-mlir
Author: lonely eagle (linuxlonelyeagle)
<details>
<summary>Changes</summary>
There seem to be some errors in the current definitions of `nvgpu.device_async_create_group` and `nvgpu.device_async_wait`.
`nvgpu.device_async_wait` should not have operands. If it had operands, the semantics would be to wait only for the copy operations named by those operands; but it actually waits for all groups, so it should not specify an operand. You can also see from the lowering pattern in nvgpu-to-nvvm that its operand is unused. Since `nvgpu.device_async_wait` no longer has operands, this PR also removes the result from `nvgpu.device_async_create_group`.
In addition to this, corrections were made to the documentation and examples.
---
Full diff: https://github.com/llvm/llvm-project/pull/130482.diff
8 Files Affected:
- (modified) mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td (+21-16)
- (modified) mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp (+1-6)
- (modified) mlir/lib/Dialect/NVGPU/Transforms/CreateAsyncGroups.cpp (+2-5)
- (modified) mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir (+6-6)
- (modified) mlir/test/Dialect/NVGPU/optimize-shared-memory.mlir (+14-14)
- (modified) mlir/test/Dialect/NVGPU/roundtrip.mlir (+4-4)
- (modified) mlir/test/Dialect/NVGPU/transform-create-async-groups.mlir (+6-6)
- (modified) mlir/test/Dialect/NVGPU/transform-pipeline-shared.mlir (+7-12)
``````````diff
diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td
index eb0fb90d271ed..03a9485e26bc7 100644
--- a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td
+++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td
@@ -216,15 +216,13 @@ def NVGPU_DeviceAsyncCopyOp : NVGPU_Op<"device_async_copy", [
// copy 2.
%cp2 = nvgpu.device_async_copy %C[%c0], %D[%c0], 4 : memref<16xf32> to memref<16xf32, 3>
// group 1 contains copy 1 and copy 2.
- %token1 = nvgpu.device_async_create_group %cp1, %cp2
+ nvgpu.device_async_create_group %cp1, %cp2
// copy 3.
%cp3 = nvgpu.device_async_copy %E[%c0], %F[%c0], 4 : memref<16xf32> to memref<16xf32, 3>
// group 2 contains copy 3.
- %token2 = nvgpu.device_async_create_group %cp3
- // after the wait copy 1 and copy 2 are complete.
- nvgpu.device_async_wait %token1
- // after the wait copy 3 is complete.
- nvgpu.device_async_wait %token2
+ nvgpu.device_async_create_group %cp3
+ // after the wait copy 1, copy 2 and copy 3 are complete.
+ nvgpu.device_async_wait
```
Example:
@@ -255,9 +253,7 @@ def NVGPU_DeviceAsyncCreateGroupOp : NVGPU_Op<"device_async_create_group", []> {
The `nvgpu.device_async_create_group` op creates a group of memory accesses
containing all the pending `device_async_copy` operations associated with
argument tokens. Each token can only be part of one group.
-
- It returns a token that can be use to wait until the group fully completes.
-
+
This is meant to be used with `nvgpu.device_async_wait` to synchronize copies
as explained in those ops descriptions.
@@ -266,10 +262,10 @@ def NVGPU_DeviceAsyncCreateGroupOp : NVGPU_Op<"device_async_create_group", []> {
Example:
```mlir
- %0 = nvgpu.device_async_create_group
- ```
+ %cp = nvgpu.device_async_copy %E[%c0], %F[%c0], 4 : memref<16xf32> to memref<16xf32, 3>
+ nvgpu.device_async_create_group %cp
+ ```
}];
- let results = (outs NVGPU_DeviceAsyncToken:$asyncToken);
let arguments = (ins Variadic<NVGPU_DeviceAsyncToken>:$inputTokens);
let assemblyFormat = [{
$inputTokens attr-dict
@@ -291,13 +287,22 @@ def NVGPU_DeviceAsyncWaitOp : NVGPU_Op<"device_async_wait", []> {
Example:
```mlir
- nvgpu.device_async_wait %0
+ // copy 1.
+ %cp1 = nvgpu.device_async_copy %A[%c0], %B[%c0], 4 :memref<16xf32> to memref<16xf32, 3>
+ nvgpu.device_async_create_group %cp1
+ // copy 2.
+ %cp2 = nvgpu.device_async_copy %C[%c0], %D[%c0], 4 : memref<16xf32> to memref<16xf32, 3>
+ nvgpu.device_async_create_group %cp2
+ // copy 3.
+ %cp3 = nvgpu.device_async_copy %E[%c0], %F[%c0], 4 : memref<16xf32> to memref<16xf32, 3>
+ nvgpu.device_async_create_group %cp3
+ // after the wait copy 1 and copy 2 are complete.
+ nvgpu.device_async_wait {numGroups = 1 : i32}
```
}];
- let arguments = (ins NVGPU_DeviceAsyncToken:$asyncDependencies,
- OptionalAttr<I32Attr>:$numGroups);
+ let arguments = (ins OptionalAttr<I32Attr>:$numGroups);
let assemblyFormat = [{
- $asyncDependencies attr-dict
+ attr-dict
}];
}
diff --git a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
index f53de416f2abd..3bf1fd04d1759 100644
--- a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
+++ b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
@@ -733,12 +733,7 @@ struct NVGPUAsyncCreateGroupLowering
LogicalResult
matchAndRewrite(nvgpu::DeviceAsyncCreateGroupOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
- rewriter.create<NVVM::CpAsyncCommitGroupOp>(op.getLoc());
- // Drop the result token.
- Value zero = rewriter.create<LLVM::ConstantOp>(
- op->getLoc(), IntegerType::get(op.getContext(), 32),
- rewriter.getI32IntegerAttr(0));
- rewriter.replaceOp(op, zero);
+ rewriter.replaceOpWithNewOp<NVVM::CpAsyncCommitGroupOp>(op);
return success();
}
};
diff --git a/mlir/lib/Dialect/NVGPU/Transforms/CreateAsyncGroups.cpp b/mlir/lib/Dialect/NVGPU/Transforms/CreateAsyncGroups.cpp
index 10bc1993ffd96..08794b2b328fa 100644
--- a/mlir/lib/Dialect/NVGPU/Transforms/CreateAsyncGroups.cpp
+++ b/mlir/lib/Dialect/NVGPU/Transforms/CreateAsyncGroups.cpp
@@ -265,11 +265,8 @@ void nvgpu::createAsyncGroups(RewriterBase &rewriter, Operation *op,
}
// Create the group and wait for it right after.
- Value groupToken = rewriter.create<nvgpu::DeviceAsyncCreateGroupOp>(
- op->getLoc(), nvgpu::DeviceAsyncTokenType::get(op->getContext()),
- tokens);
- rewriter.create<nvgpu::DeviceAsyncWaitOp>(op->getLoc(), groupToken,
- nullptr);
+ rewriter.create<nvgpu::DeviceAsyncCreateGroupOp>(op->getLoc(), tokens);
+ rewriter.create<nvgpu::DeviceAsyncWaitOp>(op->getLoc(), nullptr);
// Clean up old stores.
for (Operation *writeOp : group)
rewriter.eraseOp(writeOp);
diff --git a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
index 6b59b5e4343b4..524eb3e1fa7b1 100644
--- a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
@@ -241,9 +241,9 @@ func.func @async_cp(
// CHECK-DAG: nvvm.cp.async.shared.global %[[ADDRESSDST]], %[[CAST2]], 16, cache = ca
%0 = nvgpu.device_async_copy %src[%i, %i], %dst[%i, %i, %i], 4 : memref<128x128xf32> to memref<3x16x128xf32, 3>
// CHECK: nvvm.cp.async.commit.group
- %1 = nvgpu.device_async_create_group %0
+ nvgpu.device_async_create_group %0
// CHECK: nvvm.cp.async.wait.group 1
- nvgpu.device_async_wait %1 { numGroups = 1 : i32 }
+ nvgpu.device_async_wait { numGroups = 1 : i32 }
// CHECK: nvvm.cp.async.shared.global %{{.*}}, %{{.*}}, 16, cache = cg
%2 = nvgpu.device_async_copy %src[%i, %i], %dst[%i, %i, %i], 4 {bypassL1}: memref<128x128xf32> to memref<3x16x128xf32, 3>
@@ -299,9 +299,9 @@ func.func @async_cp_zfill_f32_align4(
// CHECK-DAG: nvvm.cp.async.shared.global %[[ADDRESSDST]], %[[CAST2]], 16, cache = cg, %[[c5]]
%0 = nvgpu.device_async_copy %src[%i, %i], %dst[%i, %i, %i], 4, %srcElements {bypassL1}: memref<128x128xf32> to memref<3x16x128xf32, 3>
// CHECK: nvvm.cp.async.commit.group
- %1 = nvgpu.device_async_create_group %0
+ nvgpu.device_async_create_group %0
// CHECK: nvvm.cp.async.wait.group 1
- nvgpu.device_async_wait %1 { numGroups = 1 : i32 }
+ nvgpu.device_async_wait { numGroups = 1 : i32 }
return
}
@@ -334,9 +334,9 @@ func.func @async_cp_zfill_f32_align1(
// CHECK-DAG: nvvm.cp.async.shared.global %[[ADDRESSDST]], %[[CAST2]], 4, cache = ca, %[[c5]]
%0 = nvgpu.device_async_copy %src[%i, %i], %dst[%i, %i, %i], 1, %srcElements : memref<128x128xf32> to memref<3x16x128xf32, 3>
// CHECK: nvvm.cp.async.commit.group
- %1 = nvgpu.device_async_create_group %0
+ nvgpu.device_async_create_group %0
// CHECK: nvvm.cp.async.wait.group 1
- nvgpu.device_async_wait %1 { numGroups = 1 : i32 }
+ nvgpu.device_async_wait { numGroups = 1 : i32 }
return
}
diff --git a/mlir/test/Dialect/NVGPU/optimize-shared-memory.mlir b/mlir/test/Dialect/NVGPU/optimize-shared-memory.mlir
index 7477e18728677..610afb56d3175 100644
--- a/mlir/test/Dialect/NVGPU/optimize-shared-memory.mlir
+++ b/mlir/test/Dialect/NVGPU/optimize-shared-memory.mlir
@@ -19,8 +19,8 @@ func.func @optimize_128x32xf16_32x128xf16(%arg0: memref<128x128xf16>,
// CHECK: nvgpu.device_async_copy [[arg0]][[[ldRow]], [[ldCol]]], [[shm]][[[stRow]], [[stColPerm]]]
%0 = nvgpu.device_async_copy %arg0[%ldRow, %ldCol], %shm[%stRow, %stCol], 8
: memref<128x128xf16> to memref<128x32xf16, 3>
- %1 = nvgpu.device_async_create_group %0
- nvgpu.device_async_wait %1 { numGroups = 1 : i32}
+ nvgpu.device_async_create_group %0
+ nvgpu.device_async_wait { numGroups = 1 : i32}
// CHECK: [[c6:%.+]] = arith.constant 6 : index
// CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c6]]
@@ -39,8 +39,8 @@ func.func @optimize_128x32xf16_32x128xf16(%arg0: memref<128x128xf16>,
// CHECK: nvgpu.device_async_copy [[arg0]][[[ldRow]], [[ldCol]]], [[shmB]][[[stRow]], [[stColPerm]]]
%2 = nvgpu.device_async_copy %arg0[%ldRow, %ldCol], %shmB[%stRow, %stCol], 8
: memref<128x128xf16> to memref<32x128xf16, 3>
- %3 = nvgpu.device_async_create_group %0
- nvgpu.device_async_wait %1 { numGroups = 1 : i32}
+ nvgpu.device_async_create_group %0
+ nvgpu.device_async_wait { numGroups = 1 : i32}
// CHECK: [[c15:%.+]] = arith.constant 15 : index
// CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c15]]
@@ -76,8 +76,8 @@ func.func @optimize_64x16xf32_16x64xf32(%arg0: memref<128x128xf32>,
// CHECK: nvgpu.device_async_copy [[arg0]][[[ldRow]], [[ldCol]]], [[shm]][[[stRow]], [[stColPerm]]]
%0 = nvgpu.device_async_copy %arg0[%ldRow, %ldCol], %shm[%stRow, %stCol], 4
: memref<128x128xf32> to memref<64x16xf32, 3>
- %1 = nvgpu.device_async_create_group %0
- nvgpu.device_async_wait %1 { numGroups = 1 : i32}
+ nvgpu.device_async_create_group %0
+ nvgpu.device_async_wait { numGroups = 1 : i32}
// CHECK: [[c6:%.+]] = arith.constant 6 : index
// CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c6]]
@@ -132,8 +132,8 @@ func.func @optimize_64x16xf32_16x64xf32(%arg0: memref<128x128xf32>,
// CHECK: nvgpu.device_async_copy [[arg0]][[[ldRow]], [[ldCol]]], [[shmB]][[[stRow]], [[stColPerm]]]
%2 = nvgpu.device_async_copy %arg0[%ldRow, %ldCol], %shmB[%stRow, %stCol], 4
: memref<128x128xf32> to memref<16x64xf32, 3>
- %3 = nvgpu.device_async_create_group %0
- nvgpu.device_async_wait %1 { numGroups = 1 : i32}
+ nvgpu.device_async_create_group %0
+ nvgpu.device_async_wait { numGroups = 1 : i32}
// CHECK: [[c15:%.+]] = arith.constant 15 : index
// CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c15]]
@@ -177,8 +177,8 @@ func.func @small_column_size_f64(%arg0: memref<32x32xf64>,
// CHECK: nvgpu.device_async_copy [[arg0]][[[ldRow]], [[ldCol]]], [[shm]][[[stRow]], [[stColPerm]]]
%0 = nvgpu.device_async_copy %arg0[%ldRow, %ldCol], %shm[%stRow, %stCol], 2
: memref<32x32xf64> to memref<32x4xf64, 3>
- %1 = nvgpu.device_async_create_group %0
- nvgpu.device_async_wait %1 { numGroups = 1 : i32}
+ nvgpu.device_async_create_group %0
+ nvgpu.device_async_wait { numGroups = 1 : i32}
// CHECK: [[c6:%.+]] = arith.constant 4 : index
// CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c6]]
@@ -203,8 +203,8 @@ func.func @too_small_column_size_f16(%arg0: memref<128x128xf16>,
// CHECK: nvgpu.device_async_copy [[arg0]][[[ldRow]], [[ldCol]]], [[shm]][[[stRow]], [[stCol]]]
%0 = nvgpu.device_async_copy %arg0[%ldRow, %ldCol], %shm[%stRow, %stCol], 8
: memref<128x128xf16> to memref<128x8xf16, 3>
- %1 = nvgpu.device_async_create_group %0
- nvgpu.device_async_wait %1 { numGroups = 1 : i32}
+ nvgpu.device_async_create_group %0
+ nvgpu.device_async_wait { numGroups = 1 : i32}
// CHECK: nvgpu.ldmatrix [[shm]][[[fragRow]], [[fragCol]]]
%mat = nvgpu.ldmatrix %shm[%fragRow, %fragCol] {numTiles = 1 : i32, transpose = false}
@@ -229,8 +229,8 @@ func.func @abort_if_subview(%arg0: memref<128x128xf16>,
// CHECK: nvgpu.device_async_copy [[arg0]][[[ldRow]], [[ldCol]]], [[shm]][[[stRow]], [[stCol]]]
%0 = nvgpu.device_async_copy %arg0[%ldRow, %ldCol], %shm[%stRow, %stCol], 8
: memref<128x128xf16> to memref<128x32xf16, 3>
- %1 = nvgpu.device_async_create_group %0
- nvgpu.device_async_wait %1 { numGroups = 1 : i32}
+ nvgpu.device_async_create_group %0
+ nvgpu.device_async_wait { numGroups = 1 : i32}
// CHECK: nvgpu.ldmatrix [[shmView]][[[fragRow]], [[fragCol]]]
%mat = nvgpu.ldmatrix %shmView[%fragRow, %fragCol] {numTiles = 1 : i32, transpose = false}
diff --git a/mlir/test/Dialect/NVGPU/roundtrip.mlir b/mlir/test/Dialect/NVGPU/roundtrip.mlir
index ad516b4d2c200..dbd9c368d9e47 100644
--- a/mlir/test/Dialect/NVGPU/roundtrip.mlir
+++ b/mlir/test/Dialect/NVGPU/roundtrip.mlir
@@ -63,9 +63,9 @@ func.func @async_cp(%dst : memref<2x7x5xf32, 3>, %src : memref<4x5xf32>){
%c0 = arith.constant 0 : index
// CHECK: nvgpu.device_async_copy %{{.*}}[{{.*}}, {{.*}}], %{{.*}}[{{.*}}, {{.*}}, {{.*}}], 4 : memref<4x5xf32> to memref<2x7x5xf32, 3>
%0 = nvgpu.device_async_copy %src[%c0, %c0], %dst[%c0, %c0, %c0], 4 : memref<4x5xf32> to memref<2x7x5xf32, 3>
- // CHECK: %{{.*}} = nvgpu.device_async_create_group
- %token = nvgpu.device_async_create_group %0
- // CHECK: nvgpu.device_async_wait %{{.*}} {numGroups = 1 : i32}
- nvgpu.device_async_wait %token {numGroups = 1 : i32}
+ // CHECK: nvgpu.device_async_create_group
+ nvgpu.device_async_create_group %0
+ // CHECK: nvgpu.device_async_wait {numGroups = 1 : i32}
+ nvgpu.device_async_wait {numGroups = 1 : i32}
return
}
diff --git a/mlir/test/Dialect/NVGPU/transform-create-async-groups.mlir b/mlir/test/Dialect/NVGPU/transform-create-async-groups.mlir
index 8290001c45856..aaaeb50854dc4 100644
--- a/mlir/test/Dialect/NVGPU/transform-create-async-groups.mlir
+++ b/mlir/test/Dialect/NVGPU/transform-create-async-groups.mlir
@@ -17,8 +17,8 @@ builtin.module {
// CHECK: %[[CP1:.*]] = nvgpu.device_async_copy {{.*}}, {{.*}}, 1
%2 = vector.transfer_read %a[%c0, %c4], %cst_0 {in_bounds = [true]} : memref<1024x1024xf32>, vector<1xf32>
vector.transfer_write %2, %0[%c0, %c4, %c0] {in_bounds = [true]} : vector<1xf32>, memref<4x32x16xf32, #gpu.address_space<workgroup>>
- // CHECK: %[[G:.*]] = nvgpu.device_async_create_group %[[CP0]], %[[CP1]]
- // CHECK: nvgpu.device_async_wait %[[G]]
+ // CHECK: nvgpu.device_async_create_group %[[CP0]], %[[CP1]]
+ // CHECK: nvgpu.device_async_wait
return
}
@@ -51,8 +51,8 @@ builtin.module {
// CHECK: %[[CP1:.*]] = nvgpu.device_async_copy {{.*}}, {{.*}}, 1 :
%2 = vector.transfer_read %a[%c0, %c4], %cst_0 {in_bounds = [true]} : memref<1024x1024xf32>, vector<1xf32>
vector.transfer_write %2, %0[%c0, %c4, %c0] {in_bounds = [true]} : vector<1xf32>, memref<4x32x16xf32, #gpu.address_space<workgroup>>
- // CHECK: %[[G:.*]] = nvgpu.device_async_create_group %[[CP0]], %[[CP1]]
- // CHECK: nvgpu.device_async_wait %[[G]]
+ // CHECK: nvgpu.device_async_create_group %[[CP0]], %[[CP1]]
+ // CHECK: nvgpu.device_async_wait
return
}
@@ -83,8 +83,8 @@ builtin.module {
// CHECK: %[[CP1:.*]] = nvgpu.device_async_copy {{.*}}, {{.*}}, 1 :
%2 = vector.load %a[%c0, %c4] : memref<1024x1024xf32>, vector<1xf32>
vector.store %2, %0[%c0, %c4, %c0] : memref<4x32x16xf32, #gpu.address_space<workgroup>>, vector<1xf32>
- // CHECK: %[[G:.*]] = nvgpu.device_async_create_group %[[CP0]], %[[CP1]]
- // CHECK: nvgpu.device_async_wait %[[G]]
+ // CHECK: nvgpu.device_async_create_group %[[CP0]], %[[CP1]]
+ // CHECK: nvgpu.device_async_wait
return
}
diff --git a/mlir/test/Dialect/NVGPU/transform-pipeline-shared.mlir b/mlir/test/Dialect/NVGPU/transform-pipeline-shared.mlir
index e959949babd9e..e93a6a40391bb 100644
--- a/mlir/test/Dialect/NVGPU/transform-pipeline-shared.mlir
+++ b/mlir/test/Dialect/NVGPU/transform-pipeline-shared.mlir
@@ -94,13 +94,11 @@ func.func @async_depth_2_predicated(%global: memref<?xf32>, %alloc_size: index)
%c0f = arith.constant 0.0 : f32
// CHECK: %[[TOKEN0:.+]] = nvgpu.device_async_copy
// CHECK: %[[TOKEN1:.+]] = nvgpu.device_async_copy
- // CHECK: scf.for %[[I:.+]] = {{.*}} iter_args
- // CHECK-SAME: %[[ITER_ARG0:.+]] = %[[TOKEN0]]
- // CHECK-SAME: %[[ITER_ARG1:.+]] = %[[TOKEN1]]
+ // CHECK: scf.for %[[I:.+]] = {{.*}}
scf.for %i = %c0 to %c98 step %c4 {
// Condition for the predication "select" below.
// CHECK: %[[CMP0:.+]] = arith.cmpi slt, %[[I]], %[[C90]]
- // CHECK: nvgpu.device_async_wait %[[ITER_ARG0]] {numGroups = 1
+ // CHECK: nvgpu.device_async_wait {numGroups = 1
// Original "select" with updated induction variable.
// CHECK: %[[I_PLUS_8:.+]] = arith.addi %[[I]], %[[C8]]
// CHECK: %[[CMP1:.+]] = arith.cmpi slt, %[[I_PLUS_8]], %[[C96]]
@@ -122,9 +120,7 @@ func.func @async_depth_2_predicated(%global: memref<?xf32>, %alloc_size: index)
%token = nvgpu.device_async_copy %global[%i], %shared[%i], 4, %read_size
: memref<?xf32> to memref<?xf32, #gpu.address_space<workgroup>>
- nvgpu.device_async_wait %token
-
- // CHECK: scf.yield %[[ITER_ARG1]], %[[ASYNC_TOKEN]]
+ nvgpu.device_async_wait
}
// There is no need to wait for the last copies as it it was fully predicated
// out and doesn't load the original data.
@@ -156,12 +152,11 @@ func.func @async_depth_2_peeled(%global: memref<?xf32>) {
// CHECK: nvgpu.device_async_copy
// CHECK: nvgpu.device_async_copy
// CHECK: scf.for
- // CHECK: nvgpu.device_async_wait %{{.*}} {numGroups = 1
+ // CHECK: nvgpu.device_async_wait {numGroups = 1
// CHECK: arith.select
// CHECK: nvgpu.device_async_copy
- // CHECK: scf.yield
- // CHECK: nvgpu.device_async_wait %{{.*}} {numGroups = 1
- // CHECK: nvgpu.device_async_wait %{{.*}} {numGroups = 0
+ // CHECK: nvgpu.device_async_wait {numGroups = 1
+ // CHECK: nvgpu.device_async_wait {numGroups = 0
scf.for %i = %c0 to %c98 step %c4 {
%c96 = arith.constant 96 : index
%cond = arith.cmpi slt, %i, %c96 : index
@@ -169,7 +164,7 @@ func.func @async_depth_2_peeled(%global: memref<?xf32>) {
%read_size = arith.select %cond, %c4, %c2 : index
%token = nvgpu.device_async_copy %global[%i], %shared[%i], 4, %read_size
: memref<?xf32> to memref<?xf32, #gpu.address_space<workgroup>>
- nvgpu.device_async_wait %token
+ nvgpu.device_async_wait
}
return
}
``````````
</details>
https://github.com/llvm/llvm-project/pull/130482
More information about the Mlir-commits
mailing list