[Mlir-commits] [mlir] [mlir][gpu]Add GPUToXeVM lowering pipeline pass. (PR #161216)
Md Abdullah Shahneous Bari
llvmlistbot at llvm.org
Tue Oct 14 14:12:22 PDT 2025
================
@@ -0,0 +1,121 @@
+// RUN: mlir-opt %s --gpu-lower-to-xevm-pipeline="xegpu-op-level=lane" \
+// RUN: | mlir-runner \
+// RUN: --shared-libs=%mlir_levelzero_runtime \
+// RUN: --shared-libs=%mlir_runner_utils \
+// RUN: --entry-point-result=void \
+// RUN: | FileCheck %s
+
+module @gemm attributes {gpu.container_module} {
+ gpu.module @kernel {
+ gpu.func @simple_gemm(%a: memref<256x256xf16>, %b: memref<256x256xf16>, %c: memref<256x256xf32>) kernel {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %c8 = arith.constant 8 : index
+ %c16 = arith.constant 16 : index
+ %c32 = arith.constant 32 : index
+ %c256 = arith.constant 256 : index
+ %block_x = gpu.block_id x
+ %block_y = gpu.block_id y
+ %x_block_offset = arith.muli %block_x, %c8 : index
+ %y_block_offset = arith.muli %block_y, %c16 : index
+
+ %c_tdesc = xegpu.create_nd_tdesc %c : memref<256x256xf32> -> !xegpu.tensor_desc<8x16xf32>
+ %c_init_value = xegpu.load_nd %c_tdesc[%x_block_offset, %y_block_offset] : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32>
+ %a_tdesc = xegpu.create_nd_tdesc %a : memref<256x256xf16> -> !xegpu.tensor_desc<8x16xf16>
+ %b_tdesc = xegpu.create_nd_tdesc %b : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16>
+
+ %r = scf.for %k = %c0 to %c256 step %c16 iter_args(%arg_c = %c_init_value) -> (vector<8xf32>) {
+ %a_val = xegpu.load_nd %a_tdesc[%x_block_offset, %k] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16>
+ %b_val = xegpu.load_nd %b_tdesc[%k, %y_block_offset] : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16>
+ %dpas = xegpu.dpas %a_val, %b_val, %arg_c : vector<8xf16>, vector<16xf16>, vector<8xf32> -> vector<8xf32>
+ scf.yield %dpas : vector<8xf32>
+ }
+ xegpu.store_nd %r, %c_tdesc[%x_block_offset, %y_block_offset] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<8xf32>, !xegpu.tensor_desc<8x16xf32>
+ gpu.return
+ }
+ }
+
+ func.func @test(%a : memref<256x256xf16>, %b : memref<256x256xf16>, %c : memref<256x256xf32>) -> memref<256x256xf32> attributes {llvm.emit_c_interface} {
+ %c1 = arith.constant 1 : index
+ %c16 = arith.constant 16 : index
+ %c32 = arith.constant 32 : index
+ %memref_a = gpu.alloc () : memref<256x256xf16>
+ gpu.memcpy %memref_a, %a : memref<256x256xf16>, memref<256x256xf16>
+ %memref_b = gpu.alloc () : memref<256x256xf16>
+ gpu.memcpy %memref_b, %b : memref<256x256xf16>, memref<256x256xf16>
+ %memref_c = gpu.alloc () : memref<256x256xf32>
+ gpu.memcpy %memref_c, %c : memref<256x256xf32>, memref<256x256xf32>
+ gpu.launch_func @kernel::@simple_gemm blocks in (%c32, %c16, %c1) threads in (%c16, %c1, %c1) args(%memref_a : memref<256x256xf16>, %memref_b : memref<256x256xf16>, %memref_c : memref<256x256xf32>)
+ gpu.wait // Wait for the kernel to finish.
----------------
mshahneo wrote:
Actually, we do use async — through the `GpuAsyncRegionPass`. This pass adds the `async` keyword to GPU ops based on dependency analysis, and it is necessary to generate the asynchronous chain. These chains essentially get converted to `mgpuCreateStream()` calls.
https://github.com/llvm/llvm-project/pull/161216
More information about the Mlir-commits
mailing list