[Mlir-commits] [mlir] [NVGPU] Fix nvdsl examples - take 2 (PR #167321)
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Mon Nov 10 06:39:57 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-mlir-nvgpu
Author: Giacomo Castiglioni (castigli)
<details>
<summary>Changes</summary>
This PR re-lands https://github.com/llvm/llvm-project/pull/156830
This PR aims to fix the nvdsl examples, which had fallen out of sync because they were not being tested in CI.
The fixed bugs were related to the following PRs:
- move to nanobind #<!-- -->118583
- split gpu module initialization #<!-- -->135478
- gpu dialect python API change #<!-- -->163883
---
Patch is 56.82 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/167321.diff
9 Files Affected:
- (modified) mlir/test/Examples/NVGPU/Ch0.py (+23-3)
- (modified) mlir/test/Examples/NVGPU/Ch1.py (+34-9)
- (modified) mlir/test/Examples/NVGPU/Ch2.py (+50-9)
- (modified) mlir/test/Examples/NVGPU/Ch3.py (+77-7)
- (modified) mlir/test/Examples/NVGPU/Ch4.py (+156-10)
- (modified) mlir/test/Examples/NVGPU/Ch5.py (+170-11)
- (modified) mlir/test/Examples/NVGPU/lit.local.cfg (+1-1)
- (modified) mlir/test/Examples/NVGPU/tools/nvdsl.py (+17-18)
- (modified) mlir/test/Examples/NVGPU/tools/nvgpucompiler.py (+3-1)
``````````diff
diff --git a/mlir/test/Examples/NVGPU/Ch0.py b/mlir/test/Examples/NVGPU/Ch0.py
index 8f60088178d11..e09720a0f3b75 100644
--- a/mlir/test/Examples/NVGPU/Ch0.py
+++ b/mlir/test/Examples/NVGPU/Ch0.py
@@ -1,5 +1,9 @@
# RUN: env SUPPORT_LIB=%mlir_cuda_runtime \
-# RUN: %PYTHON %s | FileCheck %s
+# RUN: sh -c 'if [[ "%mlir_run_cuda_sm90_tests" == "1" ]]; \
+# RUN: then %PYTHON %s | FileCheck %s; \
+# RUN: else export MLIR_NVDSL_PRINT_IR=1; \
+# RUN: %PYTHON %s | FileCheck %s --check-prefix=DUMPIR; fi'
+
# ===----------------------------------------------------------------------===//
# Chapter 0 : Hello World
@@ -33,7 +37,7 @@ def kernel():
# + operator generates arith.addi
myValue = alpha + tidx
# Print from a GPU thread
- gpu.printf("GPU thread %llu has %llu\n", [tidx, myValue])
+ gpu.printf("GPU thread %llu has %llu\n", tidx, myValue)
# 3. Call the GPU kernel
kernel()
@@ -43,8 +47,24 @@ def kernel():
# 4. The `mlir_func` decorator JIT compiles the IR and executes the MLIR function.
main(alpha)
-
# CHECK: GPU thread 0 has 100
# CHECK: GPU thread 1 has 101
# CHECK: GPU thread 2 has 102
# CHECK: GPU thread 3 has 103
+
+# DUMPIR: func.func @main(%arg0: index) attributes {llvm.emit_c_interface} {
+# DUMPIR: %[[C0_I32:.*]] = arith.constant 0 : i32
+# DUMPIR: %[[C1:.*]] = arith.constant 1 : index
+# DUMPIR: %[[C1_0:.*]] = arith.constant 1 : index
+# DUMPIR: %[[C1_1:.*]] = arith.constant 1 : index
+# DUMPIR: %[[C4:.*]] = arith.constant 4 : index
+# DUMPIR: %[[C1_2:.*]] = arith.constant 1 : index
+# DUMPIR: %[[C1_3:.*]] = arith.constant 1 : index
+# DUMPIR: gpu.launch blocks(%arg1, %arg2, %arg3) in (%arg7 = %[[C1]], %arg8 = %[[C1_0]], %arg9 = %[[C1_1]]) threads(%arg4, %arg5, %arg6) in (%arg10 = %[[C4]], %arg11 = %[[C1_2]], %arg12 = %[[C1_3]]) dynamic_shared_memory_size %[[C0_I32]] {
+# DUMPIR: %[[TIDX:.*]] = gpu.thread_id x
+# DUMPIR: %[[MYVAL:.*]] = arith.addi %arg0, %[[TIDX]] : index
+# DUMPIR: gpu.printf "GPU thread %llu has %llu\0A", %[[TIDX]], %[[MYVAL]] : index, index
+# DUMPIR: gpu.terminator
+# DUMPIR: }
+# DUMPIR: return
+# DUMPIR: }
diff --git a/mlir/test/Examples/NVGPU/Ch1.py b/mlir/test/Examples/NVGPU/Ch1.py
index cfb48d56f8d49..6e44e4d04fa06 100644
--- a/mlir/test/Examples/NVGPU/Ch1.py
+++ b/mlir/test/Examples/NVGPU/Ch1.py
@@ -1,5 +1,9 @@
# RUN: env SUPPORT_LIB=%mlir_cuda_runtime \
-# RUN: %PYTHON %s | FileCheck %s
+# RUN: sh -c 'if [[ "%mlir_run_cuda_sm90_tests" == "1" ]]; \
+# RUN: then %PYTHON %s | FileCheck %s; \
+# RUN: else export MLIR_NVDSL_PRINT_IR=1; \
+# RUN: %PYTHON %s | FileCheck %s --check-prefix=DUMPIR; fi'
+
# ===----------------------------------------------------------------------===//
# Chapter 1 : 2D Saxpy
@@ -24,12 +28,12 @@
def saxpy(x, y, alpha):
# 1. Use MLIR GPU dialect to allocate and copy memory
token_ty = gpu.AsyncTokenType.get()
- t1 = gpu.wait(token_ty, [])
+ t1 = gpu.wait([])
x_dev, t2 = gpu.alloc(x.type, token_ty, [t1], [], [])
y_dev, t3 = gpu.alloc(y.type, token_ty, [t2], [], [])
t4 = gpu.memcpy(token_ty, [t3], x_dev, x)
t5 = gpu.memcpy(token_ty, [t4], y_dev, y)
- t6 = gpu.wait(token_ty, [t5])
+ t6 = gpu.wait([t5])
# 2. Compute 2D SAXPY kernel
@NVDSL.mlir_gpu_launch(grid=(M, 1, 1), block=(N, 1, 1))
@@ -47,7 +51,7 @@ def saxpy_kernel():
saxpy_kernel()
t7 = gpu.memcpy(token_ty, [t6], y, y_dev)
- gpu.wait(token_ty, [t7])
+ gpu.wait([t7])
# 3. Pass numpy arrays to MLIR
@@ -56,11 +60,32 @@ def saxpy_kernel():
alpha = 2.0
x = np.random.randn(M, N).astype(np.float32)
y = np.ones((M, N), np.float32)
+
saxpy(x, y, alpha)
-# 4. Verify MLIR with reference computation
-ref = np.ones((M, N), np.float32)
-ref += x * alpha
-np.testing.assert_allclose(y, ref, rtol=5e-03, atol=1e-01)
-print("PASS")
+if os.getenv("MLIR_NVDSL_PRINT_IR") != "1":
+ # 4. Verify MLIR with reference computation
+ ref = np.ones((M, N), np.float32)
+ ref += x * alpha
+ np.testing.assert_allclose(y, ref, rtol=5e-03, atol=1e-01)
+ print("PASS")
# CHECK-NOT: Mismatched elements
+# CHECK: PASS
+
+# DUMPIR: func.func @saxpy(%[[ARG0:.*]]: memref<256x32xf32>, %[[ARG1:.*]]: memref<256x32xf32>, %[[ARG2:.*]]: f32) attributes {llvm.emit_c_interface} {
+# DUMPIR: %[[WAIT0:.*]] = gpu.wait async
+# DUMPIR: %[[MEMREF:.*]], %[[ASYNC0:.*]] = gpu.alloc async [%[[WAIT0]]] () : memref<256x32xf32>
+# DUMPIR: %[[MEMREF0:.*]], %[[ASYNC1:.*]] = gpu.alloc async [%[[ASYNC0]]] () : memref<256x32xf32>
+# DUMPIR: %[[MEMCPY1:.*]] = gpu.memcpy async [%[[ASYNC1]]] %[[MEMREF]], %[[ARG0]] : memref<256x32xf32>, memref<256x32xf32>
+# DUMPIR: %[[MEMCPY2:.*]] = gpu.memcpy async [%[[MEMCPY1]]] %[[MEMREF0]], %[[ARG1]] : memref<256x32xf32>, memref<256x32xf32>
+# DUMPIR: %[[WAIT1:.*]] = gpu.wait async [%[[MEMCPY2]]]
+# DUMPIR: %[[LD0:.*]] = memref.load %[[MEMREF]][%{{.*}}, %{{.*}}] : memref<256x32xf32>
+# DUMPIR: %[[LD1:.*]] = memref.load %[[MEMREF0]][%{{.*}}, %{{.*}}] : memref<256x32xf32>
+# DUMPIR: %[[MUL:.*]] = arith.mulf %[[LD0]], %[[ARG2]] : f32
+# DUMPIR: %[[ADD:.*]] = arith.addf %[[LD1]], %[[MUL]] : f32
+# DUMPIR: memref.store %[[ADD]], %[[MEMREF0]][%{{.*}}, %{{.*}}] : memref<256x32xf32>
+# DUMPIR: gpu.terminator
+# DUMPIR: %[[MEMCPY3:.*]] = gpu.memcpy async [%[[WAIT1]]] %[[ARG1]], %[[MEMREF0]] : memref<256x32xf32>, memref<256x32xf32>
+# DUMPIR: %[[WAIT2:.*]] = gpu.wait async [%[[MEMCPY3]]]
+# DUMPIR: return
+# DUMPIR: }
diff --git a/mlir/test/Examples/NVGPU/Ch2.py b/mlir/test/Examples/NVGPU/Ch2.py
index 729913c6d5c4f..aba610cee0b34 100644
--- a/mlir/test/Examples/NVGPU/Ch2.py
+++ b/mlir/test/Examples/NVGPU/Ch2.py
@@ -1,5 +1,9 @@
# RUN: env SUPPORT_LIB=%mlir_cuda_runtime \
-# RUN: %PYTHON %s | FileCheck %s
+# RUN: sh -c 'if [[ "%mlir_run_cuda_sm90_tests" == "1" ]]; \
+# RUN: then %PYTHON %s | FileCheck %s; \
+# RUN: else export MLIR_NVDSL_PRINT_IR=1; \
+# RUN: %PYTHON %s | FileCheck %s --check-prefix=DUMPIR; fi'
+
# ===----------------------------------------------------------------------===//
# Chapter 2 : 2D Saxpy with TMA
@@ -28,12 +32,12 @@
@NVDSL.mlir_func
def saxpy(x, y, alpha):
token_ty = gpu.AsyncTokenType.get()
- t1 = gpu.wait(token_ty, [])
+ t1 = gpu.wait([])
x_dev, t2 = gpu.alloc(x.type, token_ty, [t1], [], [])
y_dev, t3 = gpu.alloc(y.type, token_ty, [t2], [], [])
t4 = gpu.memcpy(token_ty, [t3], x_dev, x)
t5 = gpu.memcpy(token_ty, [t4], y_dev, y)
- t6 = gpu.wait(token_ty, [t5])
+ t6 = gpu.wait([t5])
x_tma = TMA([1, N], x.type)
y_tma = TMA([1, N], y.type)
@@ -74,7 +78,7 @@ def saxpy_tma_kernel():
saxpy_tma_kernel()
t7 = gpu.memcpy(token_ty, [t6], y, y_dev)
- gpu.wait(token_ty, [t7])
+ gpu.wait([t7])
# 3. Pass numpy arrays to MLIR
@@ -85,9 +89,46 @@ def saxpy_tma_kernel():
y = np.ones((M, N), np.float32)
saxpy(x, y, alpha)
-# 4. Verify MLIR with reference computation
-ref = np.ones((M, N), np.float32)
-ref += x * alpha
-np.testing.assert_allclose(y, ref, rtol=5e-03, atol=1e-01)
-print("PASS")
+if os.getenv("MLIR_NVDSL_PRINT_IR") != "1":
+ # 4. Verify MLIR with reference computation
+ ref = np.ones((M, N), np.float32)
+ ref += x * alpha
+ np.testing.assert_allclose(y, ref, rtol=5e-03, atol=1e-01)
+ print("PASS")
# CHECK-NOT: Mismatched elements
+# CHECK: PASS
+
+# DUMPIR: func.func @saxpy(%{{.*}}: memref<256x32xf32>, %[[ARG1:.*]]: memref<256x32xf32>, %[[ARG2:.*]]: f32) attributes {llvm.emit_c_interface} {
+# DUMPIR: %[[WAIT0:.*]] = gpu.wait async
+# DUMPIR: %[[MEMREF:.*]], %[[ASYNC0:.*]] = gpu.alloc async [%[[WAIT0]]] () : memref<256x32xf32>
+# DUMPIR: %[[CAST:.*]] = memref.cast %[[MEMREF]] : memref<256x32xf32> to memref<*xf32>
+# DUMPIR: %[[C1:.*]] = arith.constant 1 : index
+# DUMPIR: %[[C32:.*]] = arith.constant 32 : index
+# DUMPIR: %[[TMA0:.*]] = nvgpu.tma.create.descriptor %[[CAST]] box[%[[C1]], %[[C32]]] : memref<*xf32> -> <tensor = memref<1x32xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>
+# DUMPIR: %[[C0:.*]] = arith.constant 0 : index
+# DUMPIR: %[[EQ:.*]] = arith.cmpi eq, %{{.*}}, %[[C0]] : index
+# DUMPIR: %[[MB:.*]] = nvgpu.mbarrier.create -> <memorySpace = #gpu.address_space<workgroup>>
+# DUMPIR: %[[C0_10:.*]] = arith.constant 0 : index
+# DUMPIR: %[[C1_11:.*]] = arith.constant 1 : index
+# DUMPIR: nvgpu.mbarrier.init %[[MB]][%[[C0_10]]], %[[C1_11]], predicate = %[[EQ]] : <memorySpace = #gpu.address_space<workgroup>>
+# DUMPIR: %[[DSM0:.*]] = gpu.dynamic_shared_memory : memref<?xi8, #gpu.address_space<workgroup>>
+# DUMPIR: %[[C0_12:.*]] = arith.constant 0 : index
+# DUMPIR: %[[VIEW:.*]] = memref.view %[[DSM0]][%[[C0_12]]][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<1x32xf32, #gpu.address_space<workgroup>>
+# DUMPIR: %[[DSM1:.*]] = gpu.dynamic_shared_memory : memref<?xi8, #gpu.address_space<workgroup>>
+# DUMPIR: %[[C128:.*]] = arith.constant 128 : index
+# DUMPIR: %[[VIEW_13:.*]] = memref.view %[[DSM1]][%[[C128]]][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<1x32xf32, #gpu.address_space<workgroup>>
+# DUMPIR: nvgpu.tma.async.load %[[TMA0]][%{{.*}}, %{{.*}}], %[[MB]][%{{.*}}] to %[[VIEW]], predicate = %[[EQ]] : <tensor = memref<1x32xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>> -> memref<1x32xf32, #gpu.address_space<workgroup>>
+# DUMPIR: nvgpu.mbarrier.arrive.expect_tx %[[MB]][%{{.*}}], %{{.*}}, predicate = %[[EQ]] : <memorySpace = #gpu.address_space<workgroup>>
+# DUMPIR: %[[C0_20:.*]] = arith.constant 0 : index
+# DUMPIR: %[[C10000000:.*]] = arith.constant 10000000 : index
+# DUMPIR: %[[FALSE:.*]] = arith.constant false
+# DUMPIR: nvgpu.mbarrier.try_wait.parity %[[MB]][%[[C0_20]]], %[[FALSE]], %[[C10000000]] : <memorySpace = #gpu.address_space<workgroup>>
+# DUMPIR: %[[C0_21:.*]] = arith.constant 0 : index
+# DUMPIR: %[[LD0:.*]] = memref.load %[[VIEW]][%[[C0_21]], %{{.*}}] : memref<1x32xf32, #gpu.address_space<workgroup>>
+# DUMPIR: %[[C0_22:.*]] = arith.constant 0 : index
+# DUMPIR: %[[LD1:.*]] = memref.load %[[VIEW_13]][%[[C0_22]], %{{.*}}] : memref<1x32xf32, #gpu.address_space<workgroup>>
+# DUMPIR: memref.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<256x32xf32>
+# DUMPIR: %[[MEMCPY3:.*]] = gpu.memcpy async [%{{.*}}] %[[ARG1]], %{{.*}} : memref<256x32xf32>, memref<256x32xf32>
+# DUMPIR: %{{.*}} = gpu.wait async [%[[MEMCPY3]]]
+# DUMPIR: return
+# DUMPIR: }
diff --git a/mlir/test/Examples/NVGPU/Ch3.py b/mlir/test/Examples/NVGPU/Ch3.py
index eb96b11c63416..fe11575416866 100644
--- a/mlir/test/Examples/NVGPU/Ch3.py
+++ b/mlir/test/Examples/NVGPU/Ch3.py
@@ -1,5 +1,9 @@
# RUN: env SUPPORT_LIB=%mlir_cuda_runtime \
-# RUN: %PYTHON %s | FileCheck %s
+# RUN: sh -c 'if [[ "%mlir_run_cuda_sm90_tests" == "1" ]]; \
+# RUN: then %PYTHON %s | FileCheck %s; \
+# RUN: else export MLIR_NVDSL_PRINT_IR=1; \
+# RUN: %PYTHON %s | FileCheck %s --check-prefix=DUMPIR; fi'
+
# ===----------------------------------------------------------------------===//
# Chapter 3 : GEMM 128x128x64 with Tensor Core
@@ -60,13 +64,13 @@ def tma_load(
@NVDSL.mlir_func
def gemm_128_128_64(a, b, d):
token_ty = gpu.AsyncTokenType.get()
- t1 = gpu.wait(token_ty, [])
+ t1 = gpu.wait([])
a_dev, t2 = gpu.alloc(a.type, token_ty, [t1], [], [])
b_dev, t3 = gpu.alloc(b.type, token_ty, [t2], [], [])
d_dev, t4 = gpu.alloc(d.type, token_ty, [t3], [], [])
t5 = gpu.memcpy(token_ty, [t4], a_dev, a)
t6 = gpu.memcpy(token_ty, [t5], b_dev, b)
- t7 = gpu.wait(token_ty, [t6])
+ t7 = gpu.wait([t6])
sw = nvgpu.TensorMapSwizzleKind.SWIZZLE_128B
a_tma = TMA([128, 64], a.type, swizzle=sw)
@@ -111,7 +115,7 @@ def gemm_tma_kernel():
gemm_tma_kernel()
t8 = gpu.memcpy(token_ty, [t7], d, d_dev)
- gpu.wait(None, [t8])
+ gpu.wait([t8])
# Python pass arguments to MLIR
@@ -123,7 +127,73 @@ def gemm_tma_kernel():
d = np.zeros((M, N), np.float32)
gemm_128_128_64(a, b, d)
-ref_d = a.astype(np.float16) @ b.astype(np.float16)
-np.testing.assert_allclose(d, ref_d, rtol=5e-03, atol=1e-01)
-print("PASS")
+if os.getenv("MLIR_NVDSL_PRINT_IR") != "1":
+ # Verify MLIR program with reference computation in python
+ ref_d = a.astype(np.float16) @ b.astype(np.float16)
+ np.testing.assert_allclose(d, ref_d, rtol=5e-03, atol=1e-01)
+ print("PASS")
# CHECK-NOT: Mismatched elements
+# CHECK: PASS
+
+# DUMPIR: func.func @gemm_128_128_64(%{{.*}}: memref<128x64xf16>, %{{.*}}: memref<64x128xf16>, %[[ARG2:.*]]: memref<128x128xf32>) attributes {llvm.emit_c_interface} {
+# DUMPIR: %[[C128:.*]] = arith.constant 128 : index
+# DUMPIR: %[[C64:.*]] = arith.constant 64 : index
+# DUMPIR: %[[TMA0:.*]] = nvgpu.tma.create.descriptor %{{.*}} box[%[[C128]], %[[C64]]] : memref<*xf16> -> <tensor = memref<128x64xf16, 3>, swizzle = swizzle_128b, l2promo = none, oob = zero, interleave = none>
+# DUMPIR: %[[CAST1:.*]] = memref.cast %{{.*}} : memref<64x128xf16> to memref<*xf16>
+# DUMPIR: %[[C64_5:.*]] = arith.constant 64 : index
+# DUMPIR: %[[C64_6:.*]] = arith.constant 64 : index
+# DUMPIR: %[[TMA1:.*]] = nvgpu.tma.create.descriptor %[[CAST1]] box[%[[C64_5]], %[[C64_6]]] : memref<*xf16> -> <tensor = memref<64x64xf16, 3>, swizzle = swizzle_128b, l2promo = none, oob = zero, interleave = none>
+# DUMPIR: %[[THREADID:.*]] = gpu.thread_id x
+# DUMPIR: %[[MB:.*]] = nvgpu.mbarrier.create -> <memorySpace = #gpu.address_space<workgroup>>
+# DUMPIR: %[[C0:.*]] = arith.constant 0 : index
+# DUMPIR: %[[EQ:.*]] = arith.cmpi eq, %[[THREADID]], %[[C0]] : index
+# DUMPIR: %[[C0_12:.*]] = arith.constant 0 : index
+# DUMPIR: %[[C1_13:.*]] = arith.constant 1 : index
+# DUMPIR: nvgpu.mbarrier.init %[[MB]][%[[C0_12]]], %[[C1_13]], predicate = %[[EQ]] : <memorySpace = #gpu.address_space<workgroup>>
+# DUMPIR: nvgpu.tma.prefetch.descriptor %[[TMA0]], predicate = %[[EQ]] : <tensor = memref<128x64xf16, 3>, swizzle = swizzle_128b, l2promo = none, oob = zero, interleave = none>
+# DUMPIR: nvgpu.tma.prefetch.descriptor %[[TMA1]], predicate = %[[EQ]] : <tensor = memref<64x64xf16, 3>, swizzle = swizzle_128b, l2promo = none, oob = zero, interleave = none>
+# DUMPIR: %[[DSM0:.*]] = gpu.dynamic_shared_memory : memref<?xi8, #gpu.address_space<workgroup>>
+# DUMPIR: %[[C0_14:.*]] = arith.constant 0 : index
+# DUMPIR: %[[VIEW:.*]] = memref.view %[[DSM0]][%[[C0_14]]][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<128x64xf16, #gpu.address_space<workgroup>>
+# DUMPIR: %[[DSM1:.*]] = gpu.dynamic_shared_memory : memref<?xi8, #gpu.address_space<workgroup>>
+# DUMPIR: %[[C16384:.*]] = arith.constant 16384 : index
+# DUMPIR: %[[VIEW_15:.*]] = memref.view %[[DSM1]][%[[C16384]]][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<64x128xf16, #gpu.address_space<workgroup>>
+# DUMPIR: %[[DSM2:.*]] = gpu.dynamic_shared_memory : memref<?xi8, #gpu.address_space<workgroup>>
+# DUMPIR: %[[C0_16:.*]] = arith.constant 0 : index
+# DUMPIR: %[[VIEW_17:.*]] = memref.view %[[DSM2]][%[[C0_16]]][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<128x64xf16, #gpu.address_space<workgroup>>
+# DUMPIR: %[[DSM3:.*]] = gpu.dynamic_shared_memory : memref<?xi8, #gpu.address_space<workgroup>>
+# DUMPIR: %[[C16384_18:.*]] = arith.constant 16384 : index
+# DUMPIR: %[[VIEW_19:.*]] = memref.view %[[DSM3]][%[[C16384_18]]][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<64x64xf16, #gpu.address_space<workgroup>>
+# DUMPIR: %[[DSM4:.*]] = gpu.dynamic_shared_memory : memref<?xi8, #gpu.address_space<workgroup>>
+# DUMPIR: %[[C24576:.*]] = arith.constant 24576 : index
+# DUMPIR: %[[VIEW_20:.*]] = memref.view %[[DSM4]][%[[C24576]]][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<64x64xf16, #gpu.address_space<workgroup>>
+# DUMPIR: %[[C0_21:.*]] = arith.constant 0 : index
+# DUMPIR: %[[C32768:.*]] = arith.constant 32768 : index
+# DUMPIR: nvgpu.mbarrier.arrive.expect_tx %[[MB]][%[[C0_21]]], %[[C32768]], predicate = %[[EQ]] : <memorySpace = #gpu.address_space<workgroup>>
+# DUMPIR: %[[C0_22:.*]] = arith.constant 0 : index
+# DUMPIR: %[[C0_23:.*]] = arith.constant 0 : index
+# DUMPIR: %[[C0_24:.*]] = arith.constant 0 : index
+# DUMPIR: nvgpu.tma.async.load %[[TMA0]][%[[C0_23]], %[[C0_24]]], %[[MB]][%[[C0_22]]] to %[[VIEW_17]], predicate = %[[EQ]] : <tensor = memref<128x64xf16, 3>, swizzle = swizzle_128b, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>> -> memref<128x64xf16, #gpu.address_space<workgroup>>
+# DUMPIR: %[[C0_25:.*]] = arith.constant 0 : index
+# DUMPIR: %[[C0_26:.*]] = arith.constant 0 : index
+# DUMPIR: %[[C0_27:.*]] = arith.constant 0 : index
+# DUMPIR: nvgpu.tma.async.load %[[TMA1]][%[[C0_26]], %[[C0_27]]], %[[MB]][%[[C0_25]]] to %[[VIEW_19]], predicate = %[[EQ]] : <tensor = memref<64x64xf16, 3>, swizzle = swizzle_128b, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>> -> memref<64x64xf16, #gpu.address_space<workgroup>>
+# DUMPIR: %[[C0_28:.*]] = arith.constant 0 : index
+# DUMPIR: %[[C64_29:.*]] = arith.constant 64 : index
+# DUMPIR: %[[C0_30:.*]] = arith.constant 0 : index
+# DUMPIR: nvgpu.tma.async.load %[[TMA1]][%[[C64_29]], %[[C0_30]]], %[[MB]][%[[C0_28]]] to %[[VIEW_20]], predicate = %[[EQ]] : <tensor = memref<64x64xf16, 3>, swizzle = swizzle_128b, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>> -> memref<64x64xf16, #gpu.address_space<workgroup>>
+# DUMPIR: %[[C0_31:.*]] = arith.constant 0 : index
+# DUMPIR: %[[C10000000:.*]] = arith.constant 10000000 : index
+# DUMPIR: %[[FALSE:.*]] = arith.constant false
+# DUMPIR: nvgpu.mbarrier.try_wait.parity %[[MB]][%[[C0_31]]], %[[FALSE]], %[[C10000000]] : <memorySpace = #gpu.address_space<workgroup>>
+# DUMPIR: %[[WG_ACC:.*]] = nvgpu.warpgroup.mma.init.accumulator -> <fragmented = vector<128x128xf32>>
+# DUMPIR: %[[GEN0:.*]] = nvgpu.warpgroup.generate.descriptor %[[VIEW]], %[[TMA0]] : memref<128x64xf16, #gpu.address_space<workgroup>>, <tensor = memref<128x64xf16, 3>, swizzle = swizzle_128b, l2promo = none, oob = zero, interleave = none> -> <tensor = memref<128x64xf16, #gpu.address_space<workgroup>>>
+# DUMPIR: %[[GEN1:.*]] = nvgpu.warpgroup.generate.descriptor %[[VIEW_15]], %[[TMA1]] : memref<64x128xf16, #gpu.address_space<workgroup>>, <tensor = memref<64x64xf16, 3>, swizzle = swizzle_128b, l2promo = none, oob = zero, interleave = none> -> <tensor = memref<64x128xf16, #gpu.address_space<workgroup>>>
+# DUMPIR: %[[MMA:.*]] = nvgpu.warpgroup.mma %[[GEN0]], %[[GEN1]], %[[WG_ACC]] {transposeB} : <tensor = memref<128x64xf16, #gpu.address_space<workgroup>>>, <tensor = memref<64x128xf16, #gpu.address_space<workgroup>>>, <fragmented = vector<128x128xf32>> -> <fragmented = vector<128x128xf32>>
+# DUMPIR: nvgpu.warpgroup.mma.store %[[MMA]], %{{.*}} : <fragmented = vector<128x128xf32>> to memref<128x128xf32>
+# DUMPIR: gpu.terminator
+# DUMPIR: }
+# DUMPIR: %[[CPY3:.*]] = gpu.memcpy async [%{{.*}}] %[[ARG2]], %{{.*}} : memref<128x128xf32>, memref<128x128xf32>
+# DUMPIR: gpu.wait async [%[[CPY3]]]
+# DUMPIR: return
+# DUMPIR: }
diff --git a/mlir/test/Examples/NVGPU/Ch4.py b/mlir/test/Examples/NVGPU/Ch4.py
index 0e3460ff8d63b..dffafda7f21c9 100644
--- a/mlir/test/Examples/NVGPU/Ch4.py
+++ b/mlir/test/Examples/NVGPU/Ch4.py
@@ -1,5 +1,9 @@
# RUN: env SUPPORT_LIB=%mlir_cuda_runtime \
-# RUN: %PYTHON %s | FileCheck %s
+# RUN: sh -c 'if [[ "%mlir_run_cuda_sm90_tests" == "1" ]]; \
+# RUN: then %PYTHON %s | FileCheck %s; \
+# RUN: else export MLIR_NVDSL_PRINT_IR=1; \
+# RUN: %PYTHON %s | FileCheck %s --check-prefix=DUMPIR; fi'
+
# ===----------------------------------------------------------------------===//
# Chapter 4 : Multistage GEMM with Tensor Core
@@ -259,13 +263,13 @@ def epilogue(D: WGMMAMatrix, d_dev):
@N...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/167321
More information about the Mlir-commits mailing list