[Mlir-commits] [mlir] [mlir] adapt sm_90 integration test `mbarrier.group` (PR #67423)
llvmlistbot at llvm.org
Tue Sep 26 05:24:05 PDT 2023
llvmbot wrote:
@llvm/pr-subscribers-mlir-gpu
#65951 improved mbarrier support. This PR adapts the integration tests to the new usage.
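In short, the tests switch from `!nvgpu.mbarrier.barrier` to `!nvgpu.mbarrier.group` and address the barrier through an explicit index. A minimal sketch of the updated pattern, mirroring the diffs below (the SSA names `%bar`, `%numThreads`, `%tmaDesc`, `%txBytes`, `%phase`, `%ticks` and the aliases `!tensorMap`, `!shmem` are placeholders, not names from the tests):

```mlir
// Barrier type after #65951: a group of mbarriers in workgroup memory.
!barrierType = !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>>

// Create the group and initialize barrier 0 with the participating thread count.
%bar = nvgpu.mbarrier.create -> !barrierType
nvgpu.mbarrier.init %bar[%c0], %numThreads : !barrierType

// The TMA load, transaction-count arrival, and wait now all take the same barrier index.
nvgpu.tma.async.load %tmaDesc[%c0, %c0], %bar[%c0] to %shmem : !tensorMap, !barrierType -> !shmem
nvgpu.mbarrier.arrive.expect_tx %bar[%c0], %txBytes : !barrierType
nvgpu.mbarrier.try_wait.parity %bar[%c0], %phase, %ticks : !barrierType
```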
---
Full diff: https://github.com/llvm/llvm-project/pull/67423.diff
3 Files Affected:
- (modified) mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir (+6-6)
- (modified) mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir (+16-21)
- (modified) mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir (+7-6)
``````````diff
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir
index c9538ea3e6af531..aa11773defdb15f 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir
@@ -35,7 +35,7 @@
// |-------------------------------|
-!barrierType = !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
+!barrierType = !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>>
!tokenType = !nvgpu.mbarrier.token
!lhs = memref<128x64xf16>
@@ -93,21 +93,21 @@ module @mymod {
// Step 6. Initialize the mbarrier
%9 = nvgpu.mbarrier.create -> !barrierType
- nvgpu.mbarrier.init %9, %5 : !barrierType
+ nvgpu.mbarrier.init %9[%c0], %5 : !barrierType
%10 = arith.cmpi eq, %6, %c0 : index
// Step 7. First thread does TMA load
scf.if %10 {
gpu.printf "[GPU] TMA SIZE %d\0A" %c8192 : index
- nvgpu.tma.async.load %3[%c0, %c0], %9 to %7 : !lhsTensorMap, !barrierType -> !shmemlhs
- nvgpu.mbarrier.arrive.expect_tx %9, %c8192 : !barrierType
+ nvgpu.tma.async.load %3[%c0, %c0], %9[%c0] to %7 : !lhsTensorMap, !barrierType -> !shmemlhs
+ nvgpu.mbarrier.arrive.expect_tx %9[%c0], %c8192 : !barrierType
} else {
- nvgpu.mbarrier.arrive.expect_tx %9, %c0 : !barrierType
+ nvgpu.mbarrier.arrive.expect_tx %9[%c0], %c0 : !barrierType
}
// Step 8. Wait until TMA is done
- nvgpu.mbarrier.try_wait.parity %9, %c0, %c10000000 : !barrierType
+ nvgpu.mbarrier.try_wait.parity %9[%c0], %c0, %c10000000 : !barrierType
// Step 9. Print loaded data in 128b swizzled
scf.if %10 {
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir
index c75be107ca4c276..5c465f7de8abdb5 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir
@@ -40,7 +40,7 @@
// |-------------------------------|
-!barrierType = !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
+!barrierType = !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>>
!tokenType = !nvgpu.mbarrier.token
!lhs = memref<128x64xf16>
@@ -96,28 +96,22 @@ module @mymod {
memref.store %vL32, %lhs32[%j, %i] : memref<128x64xf32>
}
}
-
- // Step 2. Print on the host
- %lhs32_unranked = memref.cast %lhs32 : memref<128x64xf32> to memref<*xf32>
- call @printMemrefF32(%lhs32_unranked) : (memref<*xf32>) -> ()
- %rhs32_unranked = memref.cast %rhs32 : memref<64x128xf32> to memref<*xf32>
- call @printMemrefF32(%rhs32_unranked) : (memref<*xf32>) -> ()
- // Step 3. Copy host to device
+ // Step 2. Copy host to device
%0 = gpu.wait async
%d_glbmem_lhs, %asyncToken = gpu.alloc async [%0] () : !lhs
%d_glbmem_rhs, %asyncToken_2 = gpu.alloc async [%0] () : !rhs
%1 = gpu.memcpy async [%0] %d_glbmem_lhs, %lhs : !lhs, !lhs
%2 = gpu.memcpy async [%0] %d_glbmem_rhs, %rhs : !rhs, !rhs
- // Step 4. Create TMA tensor descriptor
+ // Step 3. Create TMA tensor descriptor
%d_lhs_unranked = memref.cast %d_glbmem_lhs :!lhs to memref<*xf16>
%d_rhs_unranked = memref.cast %d_glbmem_rhs :!rhs to memref<*xf16>
%d_lhsTensorMap = nvgpu.tma.create.descriptor %d_lhs_unranked box[%c128, %c64] : memref<*xf16> -> !lhsTensorMap
%d_rhsTensorMap = nvgpu.tma.create.descriptor %d_rhs_unranked box[%c64, %c64] : memref<*xf16> -> !rhsTensorMap
- // Step 5. Launch a GPU kernel
+ // Step 4. Launch a GPU kernel
gpu.launch blocks(%arg0, %arg1, %arg2) in (%arg6 = %c1, %arg7 = %c1, %arg8 = %c1) threads(%arg3, %arg4, %arg5) in (%arg9 = %c128, %arg10 = %c1, %arg11 = %c1) {
%5 = gpu.block_dim x
%6 = gpu.thread_id x
@@ -125,27 +119,27 @@ module @mymod {
%rhsShmem = memref.get_global @bufferRhsGlobal : !shmemrhs
%rhsShmem2 = memref.subview %rhsShmem[%c32, %c0][%c32, %c128][%c1, %c1] : !shmemrhs to memref<?x?xf16, strided<[?, ?], offset: ?>, 3>
- // Step 6. Initialize the mbarrier
+ // Step 5. Initialize the mbarrier
%9 = nvgpu.mbarrier.create -> !barrierType
- nvgpu.mbarrier.init %9, %5 : !barrierType
+ nvgpu.mbarrier.init %9[%c0], %5 : !barrierType
%10 = arith.cmpi eq, %6, %c0 : index
- // Step 7. First thread does TMA load
+ // Step 6. First thread does TMA load
scf.if %10 {
gpu.printf "[GPU] TMA SIZE %d\0A" %c32768 : index
- nvgpu.tma.async.load %d_lhsTensorMap[%c0, %c0], %9 to %lhsShmem : !lhsTensorMap, !barrierType -> !shmemlhs
- nvgpu.tma.async.load %d_rhsTensorMap[%c0, %c0], %9 to %rhsShmem : !rhsTensorMap, !barrierType -> !shmemrhs
- nvgpu.tma.async.load %d_rhsTensorMap[%c64, %c0], %9 to %rhsShmem2 : !rhsTensorMap, !barrierType -> memref<?x?xf16, strided<[?, ?], offset: ?>, 3>
- nvgpu.mbarrier.arrive.expect_tx %9, %c32768 : !barrierType
+ nvgpu.tma.async.load %d_lhsTensorMap[%c0, %c0], %9[%c0] to %lhsShmem : !lhsTensorMap, !barrierType -> !shmemlhs
+ nvgpu.tma.async.load %d_rhsTensorMap[%c0, %c0], %9[%c0] to %rhsShmem : !rhsTensorMap, !barrierType -> !shmemrhs
+ nvgpu.tma.async.load %d_rhsTensorMap[%c64, %c0], %9[%c0] to %rhsShmem2 : !rhsTensorMap, !barrierType -> memref<?x?xf16, strided<[?, ?], offset: ?>, 3>
+ nvgpu.mbarrier.arrive.expect_tx %9[%c0], %c32768 : !barrierType
} else {
- nvgpu.mbarrier.arrive.expect_tx %9, %c0 : !barrierType
+ nvgpu.mbarrier.arrive.expect_tx %9[%c0], %c0 : !barrierType
}
- // Step 8. Wait until TMA is done
- nvgpu.mbarrier.try_wait.parity %9, %c0, %c10000000 : !barrierType
+ // Step 7. Wait until TMA is done
+ nvgpu.mbarrier.try_wait.parity %9[%c0], %c0, %c10000000 : !barrierType
- // Step 9. Print loaded data in 128b swizzled
+ // Step 8. Print loaded data in 128b swizzled
scf.if %10 {
gpu.printf "===--- Matrix B ---=== %d \n" %c-1_i32 : i32
scf.for %ii = %c0 to %c64 step %c1 {
@@ -158,6 +152,7 @@ module @mymod {
}
gpu.printf "===----------------=== %d \n" %c-1_i32 : i32
}
+ gpu.barrier
gpu.terminator
}
return
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir
index 11cf63548a551bb..5331ebb87d37de5 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir
@@ -39,6 +39,7 @@
// RUN: --entry-point-result=void \
// RUN: | FileCheck %s
+
// CHECK: [GPU] TMA BEFORE lhs[45][7] 0.000000
// CHECK: [GPU] TMA BEFORE rhs[7][0] 0.000000
// CHECK: [GPU] TMA LOADED lhs[45][7] 7.000000
@@ -87,21 +88,21 @@ module @mymod {
%7 = memref.get_global @bufferLhsGlobal : memref<64x8xf32, 3>
%8 = memref.get_global @bufferRhsGlobal : memref<8x128xf32, 3>
%9 = nvgpu.mbarrier.create -> <memorySpace = #gpu.address_space<workgroup>>
- nvgpu.mbarrier.init %9, %5 : <memorySpace = #gpu.address_space<workgroup>>
+ nvgpu.mbarrier.init %9[%c0], %5 : <memorySpace = #gpu.address_space<workgroup>>
gpu.barrier
%10 = arith.cmpi eq, %6, %c0 : index
scf.if %10 {
- nvgpu.mbarrier.arrive.expect_tx %9, %c6144 : <memorySpace = #gpu.address_space<workgroup>>
+ nvgpu.mbarrier.arrive.expect_tx %9[%c0], %c6144 : <memorySpace = #gpu.address_space<workgroup>>
%11 = memref.load %7[%c0, %c0] : memref<64x8xf32, 3>
%12 = memref.load %8[%c0, %c0] : memref<8x128xf32, 3>
gpu.printf "[GPU] TMA BEFORE lhs[45][7] %f\0A" %11 : f32
gpu.printf "[GPU] TMA BEFORE rhs[7][0] %f\0A" %12 : f32
- nvgpu.tma.async.load %3[%c0, %c0], %9 to %7 : <tensor = memref<64x8xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>> -> memref<64x8xf32, 3>
- nvgpu.tma.async.load %4[%c0, %c0], %9 to %8 : <tensor = memref<8x128xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>> -> memref<8x128xf32, 3>
+ nvgpu.tma.async.load %3[%c0, %c0], %9[%c0] to %7 : <tensor = memref<64x8xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>> -> memref<64x8xf32, 3>
+ nvgpu.tma.async.load %4[%c0, %c0], %9[%c0] to %8 : <tensor = memref<8x128xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>> -> memref<8x128xf32, 3>
} else {
- nvgpu.mbarrier.arrive.expect_tx %9, %c0 : <memorySpace = #gpu.address_space<workgroup>>
+ nvgpu.mbarrier.arrive.expect_tx %9[%c0], %c0 : <memorySpace = #gpu.address_space<workgroup>>
}
- nvgpu.mbarrier.try_wait.parity %9, %c0, %c10000000 : <memorySpace = #gpu.address_space<workgroup>>
+ nvgpu.mbarrier.try_wait.parity %9[%c0], %c0, %c10000000 : <memorySpace = #gpu.address_space<workgroup>>
scf.if %10 {
%11 = memref.load %7[%c45, %c7] : memref<64x8xf32, 3>
%12 = memref.load %8[%c7, %c0] : memref<8x128xf32, 3>
``````````
https://github.com/llvm/llvm-project/pull/67423