[Mlir-commits] [mlir] [NVGPU] Fix nvdsl examples (PR #156830)
Giacomo Castiglioni
llvmlistbot at llvm.org
Mon Nov 10 01:22:28 PST 2025
================
@@ -56,11 +60,43 @@ def saxpy_kernel():
alpha = 2.0
x = np.random.randn(M, N).astype(np.float32)
y = np.ones((M, N), np.float32)
+
saxpy(x, y, alpha)
-# 4. Verify MLIR with reference computation
-ref = np.ones((M, N), np.float32)
-ref += x * alpha
-np.testing.assert_allclose(y, ref, rtol=5e-03, atol=1e-01)
-print("PASS")
+if os.getenv("MLIR_NVDSL_PRINT_IR") != "1":
+ # 4. Verify MLIR with reference computation
+ ref = np.ones((M, N), np.float32)
+ ref += x * alpha
+ np.testing.assert_allclose(y, ref, rtol=5e-03, atol=1e-01)
+ print("PASS")
# CHECK-NOT: Mismatched elements
+# CHECK: PASS
+
+# DUMPIR: func.func @saxpy(%arg0: memref<256x32xf32>, %arg1: memref<256x32xf32>, %arg2: f32) attributes {llvm.emit_c_interface} {
+# DUMPIR: %[[WAIT0:.*]] = gpu.wait async
+# DUMPIR: %[[MEMREF:.*]], %[[ASYNC0:.*]] = gpu.alloc async [%[[WAIT0]]] () : memref<256x32xf32>
+# DUMPIR: %[[MEMREF0:.*]], %[[ASYNC1:.*]] = gpu.alloc async [%[[ASYNC0]]] () : memref<256x32xf32>
+# DUMPIR: %[[MEMCPY1:.*]] = gpu.memcpy async [%[[ASYNC1]]] %[[MEMREF]], %arg0 : memref<256x32xf32>, memref<256x32xf32>
+# DUMPIR: %[[MEMCPY2:.*]] = gpu.memcpy async [%[[MEMCPY1]]] %[[MEMREF0]], %arg1 : memref<256x32xf32>, memref<256x32xf32>
+# DUMPIR: %[[WAIT1:.*]] = gpu.wait async [%[[MEMCPY2]]]
+# DUMPIR: %[[C256:.*]] = arith.constant 256 : index
+# DUMPIR: %[[C1:.*]] = arith.constant 1 : index
+# DUMPIR: %[[C1_2:.*]] = arith.constant 1 : index
+# DUMPIR: %[[C32:.*]] = arith.constant 32 : index
+# DUMPIR: %[[C1_3:.*]] = arith.constant 1 : index
+# DUMPIR: %[[C1_4:.*]] = arith.constant 1 : index
+# DUMPIR: %[[C0_I32:.*]] = arith.constant 0 : i32
+# DUMPIR: gpu.launch blocks(%arg3, %arg4, %arg5) in (%arg9 = %[[C256]], %arg10 = %[[C1]], %arg11 = %[[C1_2]]) threads(%arg6, %arg7, %arg8) in (%arg12 = %[[C32]], %arg13 = %[[C1_3]], %arg14 = %[[C1_4]]) dynamic_shared_memory_size %[[C0_I32]] {
+# DUMPIR: %[[BLOCKID:.*]] = gpu.block_id x
+# DUMPIR: %[[THREADID:.*]] = gpu.thread_id x
+# DUMPIR: %[[LD0:.*]] = memref.load %[[MEMREF]][%[[BLOCKID]], %[[THREADID]]] : memref<256x32xf32>
+# DUMPIR: %[[LD1:.*]] = memref.load %[[MEMREF0]][%[[BLOCKID]], %[[THREADID]]] : memref<256x32xf32>
+# DUMPIR: %[[MUL:.*]] = arith.mulf %[[LD0]], %arg2 : f32
+# DUMPIR: %[[ADD:.*]] = arith.addf %[[LD1]], %[[MUL]] : f32
+# DUMPIR: memref.store %[[ADD]], %[[MEMREF0]][%[[BLOCKID]], %[[THREADID]]] : memref<256x32xf32>
+# DUMPIR: gpu.terminator
+# DUMPIR: }
+# DUMPIR: %[[MEMCPY3:.*]] = gpu.memcpy async [%[[WAIT1]]] %arg1, %[[MEMREF0]] : memref<256x32xf32>, memref<256x32xf32>
+# DUMPIR: %[[WAIT2:.*]] = gpu.wait async [%[[MEMCPY3]]]
+# DUMPIR: return
+# DUMPIR: }
----------------
castigli wrote:
Yes, I will update the test as part of the new PR.
https://github.com/llvm/llvm-project/pull/156830
More information about the Mlir-commits mailing list