[Mlir-commits] [mlir] [mlir][XeGPU][VectorToXeGPU] Use 'xegpu.load' to lower 1D 'vector.transfer_read' for PVC & BMG (PR #168910)
llvmlistbot at llvm.org
Thu Nov 20 09:44:51 PST 2025
llvmbot wrote:
@llvm/pr-subscribers-mlir
Author: Dmitry Chigarev (dchigarev)
Changes:
The PR changes `TransferReadLowering` to always use `xegpu.load` (rather than `xegpu.load_nd`) for 1D cases, since `xegpu.load` has a more developed interface (e.g. layout capabilities).
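
For reference, a minimal before/after sketch of the 1D lowering, reconstructed from the updated `@load_1D_vector` CHECK lines in the diff below (the input `vector.transfer_read`, the SSA value names, and the exact offset arithmetic are illustrative, not verbatim pass output):

```mlir
// Input: a 1D in-bounds read from @load_1D_vector (reconstructed).
%pad = arith.constant 0.0 : f32
%0 = vector.transfer_read %source[%offset, %offset, %offset], %pad
       {in_bounds = [true]} : memref<8x16x32xf32>, vector<8xf32>

// After this PR the 1D case goes through the scattered (load.gather) path
// and roughly becomes:
%mask     = arith.constant dense<true> : vector<8xi1>
%step     = vector.step : vector<8xindex>
// ... scalar base offset linearized with arith.muli / arith.addi ...
%splat    = vector.broadcast %linear_off : index to vector<8xindex>
%indices  = arith.addi %splat, %step : vector<8xindex>
%base     = memref.extract_aligned_pointer_as_index %source
              : memref<8x16x32xf32> -> index
%base_i64 = arith.index_cast %base : index to i64
%vec      = xegpu.load %base_i64[%indices], %mask
              : i64, vector<8xindex>, vector<8xi1> -> vector<8xf32>
```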
---
Full diff: https://github.com/llvm/llvm-project/pull/168910.diff
2 Files Affected:
- (modified) mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp (+6-1)
- (modified) mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir (+63-19)
``````````diff
diff --git a/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp b/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp
index 1b4d1a42614ea..4358ef07da91d 100644
--- a/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp
+++ b/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp
@@ -519,8 +519,13 @@ struct TransferReadLowering : public OpRewritePattern<vector::TransferReadOp> {
return lowerToScatteredLoadOp(readOp, rewriter);
}
- // Perform common data transfer checks.
VectorType vecTy = readOp.getVectorType();
+
+ // Lower using load.gather in 1D case
+ if (vecTy.getRank() == 1 && !readOp.hasOutOfBoundsDim())
+ return lowerToScatteredLoadOp(readOp, rewriter);
+
+ // Perform common data transfer checks.
if (failed(storeLoadPreconditions(rewriter, readOp, vecTy)))
return failure();
diff --git a/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir b/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir
index c87a5304babfe..8bb272b1fe5fc 100644
--- a/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir
+++ b/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir
@@ -11,14 +11,15 @@ gpu.func @load_1D_vector(%source: memref<8x16x32xf32>, %offset: index) -> vector
// LOAD-ND-LABEL: @load_1D_vector(
// LOAD-ND-SAME: %[[SRC:.+]]: memref<8x16x32xf32>,
-// LOAD-ND-SAME: %[[OFFSET:.+]]: index
-// LOAD-ND: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFFSET]], %[[OFFSET]], 0]
-// LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc
-// LOAD-ND-SAME: %[[COLLAPSED]]
-// LOAD-ND-SAME: memref<32xf32, strided<[1], offset: ?>> -> !xegpu.tensor_desc<8xf32,
-// LOAD-ND-SAME: boundary_check = false
-// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFFSET]]]{{.*}}-> vector<8xf32>
-// LOAD-ND: return %[[VEC]]
+// LOAD-ND: %[[CST:.+]] = arith.constant dense<true> : vector<8xi1>
+// LOAD-ND: %[[STEP:.+]] = vector.step : vector<8xindex>
+// LOAD-ND-COUNT2: arith.muli {{.*}} : index
+// LOAD-ND-COUNT2: arith.addi {{.*}} : index
+// LOAD-ND: %[[SPLAT:.+]] = vector.broadcast {{.*}}: index to vector<8xindex>
+// LOAD-ND: %[[IDX:.+]] = arith.addi %[[SPLAT]], %[[STEP]] : vector<8xindex>
+// LOAD-ND: %[[COLLAPSE:.+]] = memref.extract_aligned_pointer_as_index %[[SRC]] : memref<8x16x32xf32> -> index
+// LOAD-ND: %[[COLLAPSE_I:.+]] = arith.index_cast %[[COLLAPSE]] : index to i64
+// LOAD-ND: %[[VEC:.+]] = xegpu.load %[[COLLAPSE_I]]{{\[}}%[[IDX]]{{\]}}, %[[CST]] : i64, vector<8xindex>, vector<8xi1> -> vector<8xf32>
// LOAD-GATHER-LABEL: @load_1D_vector(
// LOAD-GATHER-SAME: %[[SRC:.+]]: memref<8x16x32xf32>,
@@ -404,7 +405,7 @@ gpu.func @no_load_unsupported_map(%source: memref<16x32x64xf32>,
// -----
gpu.module @xevm_module {
-gpu.func @load_from_subview(%source: memref<4096x4096xf16>, %off1: index, %off2: index) -> vector<8xf16> {
+gpu.func @load_from_subview_1D(%source: memref<4096x4096xf16>, %off1: index, %off2: index) -> vector<8xf16> {
%c0 = arith.constant 0.0 : f16
%subview = memref.subview %source[%off1, %off2] [256, 256] [1, 1] : memref<4096x4096xf16> to memref<256x256xf16, strided<[4096, 1], offset: ?>>
%0 = vector.transfer_read %subview[%off2, %off2], %c0
@@ -412,19 +413,23 @@ gpu.func @load_from_subview(%source: memref<4096x4096xf16>, %off1: index, %off2:
gpu.return %0 : vector<8xf16>
}
-// LOAD-ND-LABEL: @load_from_subview(
+// LOAD-ND-LABEL: @load_from_subview_1D(
// LOAD-ND-SAME: %[[SRC:.+]]: memref<4096x4096xf16>,
// LOAD-ND-SAME: %[[OFF1:.+]]: index, %[[OFF2:.+]]: index
+// LOAD-ND: %[[CST:.+]] = arith.constant dense<true> : vector<8xi1>
// LOAD-ND: %[[SUBVIEW:.+]] = memref.subview %[[SRC]][%[[OFF1]], %[[OFF2]]] [256, 256] [1, 1] : memref<4096x4096xf16> to memref<256x256xf16, strided<[4096, 1], offset: ?>>
-// LOAD-ND: %[[COLLAPSED:.+]] = memref.subview %[[SUBVIEW]][%[[OFF2]], 0]
-// LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc
-// LOAD-ND-SAME: %[[COLLAPSED]]
-// LOAD-ND-SAME: memref<256xf16, strided<[1], offset: ?>> -> !xegpu.tensor_desc<8xf16,
-// LOAD-ND-SAME: boundary_check = false
-// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFF2]]]{{.*}}-> vector<8xf16>
-// LOAD-ND: return %[[VEC]]
-
-// LOAD-GATHER-LABEL: @load_from_subview(
+// LOAD-ND: %[[BB:.+]], %[[OFFSET:.+]],{{.*}},{{.*}} = memref.extract_strided_metadata %[[SUBVIEW]] : memref<256x256xf16, strided<[4096, 1], offset: ?>> -> memref<f16>, index, index, index, index, index
+// LOAD-ND: %[[STEP:.+]] = vector.step : vector<8xindex>
+// LOAD-ND: arith.muli {{.*}} : index
+// LOAD-ND: arith.addi %[[OFFSET]]{{.*}} : index
+// LOAD-ND: arith.addi {{.*}} : index
+// LOAD-ND: %[[SPLAT:.+]] = vector.broadcast {{.*}}: index to vector<8xindex>
+// LOAD-ND: %[[IDX:.+]] = arith.addi %[[SPLAT]], %[[STEP]] : vector<8xindex>
+// LOAD-ND: %[[COLLAPSE:.+]] = memref.extract_aligned_pointer_as_index %[[SUBVIEW]] : memref<256x256xf16, strided<[4096, 1], offset: ?>> -> index
+// LOAD-ND: %[[COLLAPSE_I:.+]] = arith.index_cast %[[COLLAPSE]] : index to i64
+// LOAD-ND: %[[VEC:.+]] = xegpu.load %[[COLLAPSE_I]]{{\[}}%[[IDX]]{{\]}}, %[[CST]] : i64, vector<8xindex>, vector<8xi1> -> vector<8xf16>
+
+// LOAD-GATHER-LABEL: @load_from_subview_1D(
// LOAD-GATHER-SAME: %[[SRC:.+]]: memref<4096x4096xf16>,
// LOAD-GATHER-SAME: %[[OFF1:.+]]: index, %[[OFF2:.+]]: index
// LOAD-GATHER: %[[CST:.+]] = arith.constant dense<true> : vector<8xi1>
@@ -440,3 +445,42 @@ gpu.func @load_from_subview(%source: memref<4096x4096xf16>, %off1: index, %off2:
// LOAD-GATHER: %[[COLLAPSE_I:.+]] = arith.index_cast %[[COLLAPSE]] : index to i64
// LOAD-GATHER: %[[VEC:.+]] = xegpu.load %[[COLLAPSE_I]]{{\[}}%[[IDX]]{{\]}}, %[[CST]] : i64, vector<8xindex>, vector<8xi1> -> vector<8xf16>
}
+
+// -----
+gpu.module @xevm_module {
+gpu.func @load_from_subview_2D(%source: memref<4096x4096xf16>, %off1: index, %off2: index) -> vector<8x16xf16> {
+ %c0 = arith.constant 0.0 : f16
+ %subview = memref.subview %source[%off1, %off2] [256, 256] [1, 1] : memref<4096x4096xf16> to memref<256x256xf16, strided<[4096, 1], offset: ?>>
+ %0 = vector.transfer_read %subview[%off2, %off2], %c0
+ {in_bounds = [true, true]} : memref<256x256xf16, strided<[4096, 1], offset: ?>>, vector<8x16xf16>
+ gpu.return %0 : vector<8x16xf16>
+}
+
+// LOAD-ND-LABEL: @load_from_subview_2D(
+// LOAD-ND-SAME: %[[SRC:.+]]: memref<4096x4096xf16>,
+// LOAD-ND-SAME: %[[OFF1:.+]]: index, %[[OFF2:.+]]: index
+// LOAD-ND: %[[SUBVIEW:.+]] = memref.subview %[[SRC]][%[[OFF1]], %[[OFF2]]] [256, 256] [1, 1] : memref<4096x4096xf16> to memref<256x256xf16, strided<[4096, 1], offset: ?>>
+// LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc
+// LOAD-ND-SAME: %[[SUBVIEW]]
+// LOAD-ND-SAME: memref<256x256xf16, strided<[4096, 1], offset: ?>> -> !xegpu.tensor_desc<8x16xf16,
+// LOAD-ND-SAME: boundary_check = false
+// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFF2]], %[[OFF2]]]{{.*}}-> vector<8x16xf16>
+// LOAD-ND: return %[[VEC]]
+
+// LOAD-GATHER-LABEL: @load_from_subview_2D(
+// LOAD-GATHER-SAME: %[[SRC:.+]]: memref<4096x4096xf16>,
+// LOAD-GATHER-SAME: %[[OFF1:.+]]: index, %[[OFF2:.+]]: index
+// LOAD-GATHER: %[[CST:.+]] = arith.constant dense<true> : vector<8x16xi1>
+// LOAD-GATHER: %[[SUBVIEW:.+]] = memref.subview %[[SRC]][%[[OFF1]], %[[OFF2]]] [256, 256] [1, 1] : memref<4096x4096xf16> to memref<256x256xf16, strided<[4096, 1], offset: ?>>
+// LOAD-GATHER: %[[BB:.+]], %[[OFFSET:.+]],{{.*}},{{.*}} = memref.extract_strided_metadata %[[SUBVIEW]] : memref<256x256xf16, strided<[4096, 1], offset: ?>> -> memref<f16>, index, index, index, index, index
+// LOAD-GATHER-COUNT2: vector.step
+// LOAD-GATHER-COUNT2: vector.shape_cast
+// LOAD-GATHER-COUNT2: vector.broadcast
+// LOAD-GATHER-COUNT2: arith.muli {{.*}} : index
+// LOAD-GATHER-COUNT2: arith.addi {{.*}} : index
+// LOAD-GATHER: %[[SPLAT:.+]] = vector.broadcast {{.*}}: index to vector<8x16xindex>
+// LOAD-GATHER: %[[IDX:.+]] = arith.addi %[[SPLAT]], {{.*}} : vector<8x16xindex>
+// LOAD-GATHER: %[[COLLAPSE:.+]] = memref.extract_aligned_pointer_as_index %[[SUBVIEW]] : memref<256x256xf16, strided<[4096, 1], offset: ?>> -> index
+// LOAD-GATHER: %[[COLLAPSE_I:.+]] = arith.index_cast %[[COLLAPSE]] : index to i64
+// LOAD-GATHER: %[[VEC:.+]] = xegpu.load %[[COLLAPSE_I]]{{\[}}%[[IDX]]{{\]}}, %[[CST]] : i64, vector<8x16xindex>, vector<8x16xi1> -> vector<8x16xf16>
+}
``````````
https://github.com/llvm/llvm-project/pull/168910