[Mlir-commits] [mlir] 3925d11 - [MLIR][XeGPU] Decompose unsupported 'vector.transfer_read'-transpose-permutations (#182875)
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Thu Mar 5 08:08:26 PST 2026
Author: Dmitry Chigarev
Date: 2026-03-05T17:08:22+01:00
New Revision: 3925d112c44413f858bcb50c00125e039cef170e
URL: https://github.com/llvm/llvm-project/commit/3925d112c44413f858bcb50c00125e039cef170e
DIFF: https://github.com/llvm/llvm-project/commit/3925d112c44413f858bcb50c00125e039cef170e.diff
LOG: [MLIR][XeGPU] Decompose unsupported 'vector.transfer_read'-transpose-permutations (#182875)
This PR adds a pattern to the `vector-to-xegpu` pass that decomposes a
`vector.transfer_read` whose transpose permutation is unsupported
(because of its element type) into a `vector.transfer_read` followed
by a `vector.transpose`:
Example:
```mlir
// input-ir:
%0 = vector.transfer_read %source[%offset, %offset], %c0
{permutation_map = affine_map<(d0, d1) -> (d1, d0)>,
in_bounds = [true, true]} : memref<32x64xf16>, vector<8x16xf16>
// mlir-opt %s --convert-vector-to-xegpu
// before PR (no conversion because of unsupported type):
%0 = vector.transfer_read %source[%offset, %offset], %c0
{permutation_map = affine_map<(d0, d1) -> (d1, d0)>,
in_bounds = [true, true]} : memref<32x64xf16>, vector<8x16xf16>
// mlir-opt %s --convert-vector-to-xegpu
// after PR (decomposed + converted):
%0 = xegpu.load_nd %source[%offset, %offset]
%1 = vector.transpose %0
```
---------
Signed-off-by: dchigarev <dmitry.chigarev at intel.com>
Added:
Modified:
mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp
mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir
Removed:
################################################################################
diff --git a/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp b/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp
index eb45e323ef849..0eac704779e7d 100644
--- a/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp
+++ b/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp
@@ -558,15 +558,16 @@ struct TransferReadLowering : public OpRewritePattern<vector::TransferReadOp> {
return lowerToScatteredLoadOp(readOp, rewriter);
}
- VectorType vecTy = readOp.getVectorType();
+ VectorType loadedVecTy = readOp.getVectorType();
// Lower using load.gather in 1D case
- if (vecTy.getRank() == 1 && !readOp.hasOutOfBoundsDim())
+ if (loadedVecTy.getRank() == 1 && !readOp.hasOutOfBoundsDim())
return lowerToScatteredLoadOp(readOp, rewriter);
// Perform common data transfer checks.
auto readMemTy = cast<MemRefType>(readOp.getShapedType());
- if (failed(storeLoadPreconditions(rewriter, readOp, vecTy, readMemTy)))
+ if (failed(
+ storeLoadPreconditions(rewriter, readOp, loadedVecTy, readMemTy)))
return failure();
bool isOutOfBounds = readOp.hasOutOfBoundsDim();
@@ -576,40 +577,44 @@ struct TransferReadLowering : public OpRewritePattern<vector::TransferReadOp> {
AffineMap readMap = readOp.getPermutationMap();
bool isTransposeLoad = !readMap.isMinorIdentity();
-
- Type elementType = vecTy.getElementType();
- unsigned minTransposeBitWidth = 32;
- if (isTransposeLoad &&
- elementType.getIntOrFloatBitWidth() < minTransposeBitWidth)
- return rewriter.notifyMatchFailure(
- readOp, "Unsupported data type for transposition");
-
- // If load is transposed, get the base shape for the tensor descriptor.
- SmallVector<int64_t> descShape(vecTy.getShape());
- if (isTransposeLoad)
- std::reverse(descShape.begin(), descShape.end());
+ auto elementType = loadedVecTy.getElementType();
+
+ SmallVector<int64_t> descShape(loadedVecTy.getShape());
+ if (isTransposeLoad) {
+ // If load is transposed, then the shape of the source-descriptor
+ // is the opposite from the result-shape. Applying the permutation
+ // to get the reversive shape.
+ auto inversedMap = inversePermutation(readMap);
+ descShape = applyPermutationMap(inversedMap, loadedVecTy.getShape());
+ loadedVecTy = VectorType::get(descShape, elementType);
+ }
auto descType = xegpu::TensorDescType::get(
descShape, elementType, /*array_length=*/1,
/*boundary_check=*/isOutOfBounds, xegpu::MemorySpace::Global);
-
- DenseI64ArrayAttr transposeAttr =
- !isTransposeLoad ? nullptr
- : DenseI64ArrayAttr::get(rewriter.getContext(),
- ArrayRef<int64_t>{1, 0});
auto [src, indices] = convertMemrefAndOffsetsToTargetRank(
rewriter, loc, readOp.getBase(), getAsOpFoldResult(readOp.getIndices()),
- vecTy.getRank());
+ loadedVecTy.getRank());
// By default, no specific caching policy is assigned.
xegpu::CachePolicyAttr hint = nullptr;
xegpu::CreateNdDescOp ndDesc = createNdDescriptor(
rewriter, loc, descType, dyn_cast<TypedValue<MemRefType>>(src));
- auto loadOp = xegpu::LoadNdOp::create(rewriter, loc, vecTy, ndDesc, indices,
- /*packed=*/nullptr, transposeAttr,
- /*l1_hint=*/hint,
- /*l2_hint=*/hint, /*l3_hint=*/hint,
- /*layout=*/nullptr);
- rewriter.replaceOp(readOp, loadOp);
+ Operation *loadedOp =
+ xegpu::LoadNdOp::create(rewriter, loc, loadedVecTy, ndDesc, indices,
+ /*packed=*/nullptr, /*transpose=*/nullptr,
+ /*l1_hint=*/hint,
+ /*l2_hint=*/hint, /*l3_hint=*/hint,
+ /*layout=*/nullptr);
+ if (isTransposeLoad) {
+ // Transposing the loaded vector with a separate vector.transpose
+ // operation
+ auto range = llvm::seq<int64_t>(0, readMap.getResults().size());
+ SmallVector<int64_t> perm(range.begin(), range.end());
+ auto permApplied = applyPermutationMap<int64_t>(readMap, perm);
+ loadedOp = vector::TransposeOp::create(
+ rewriter, loc, loadedOp->getResult(0), permApplied);
+ }
+ rewriter.replaceOp(readOp, loadedOp);
return success();
}
diff --git a/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir b/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir
index b58f9b30ed726..1a19c8a13f120 100644
--- a/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir
+++ b/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir
@@ -120,9 +120,10 @@ gpu.func @load_transposed(%source: memref<32x64xf32>,
// LOAD-ND-SAME: %[[OFFSET2:.+]]: index
// LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[SRC]]
// LOAD-ND-SAME: memref<32x64xf32> -> !xegpu.tensor_desc<16x8xf32
-// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFFSET1]], %[[OFFSET2]]] <{transpose = array<i64: 1, 0>}>
-// LOAD-ND-SAME: -> vector<8x16xf32>
-// LOAD-ND: return %[[VEC]]
+// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFFSET1]], %[[OFFSET2]]]
+// LOAD-ND-SAME: -> vector<16x8xf32>
+// LOAD-ND: %[[VEC_TRANSPOSED:.+]] = vector.transpose %[[VEC]], [1, 0] : vector<16x8xf32> to vector<8x16xf32>
+// LOAD-ND: return %[[VEC_TRANSPOSED]]
// LOAD-GATHER-LABEL: @load_transposed(
@@ -297,7 +298,8 @@ gpu.func @load_transpose_f16(%source: memref<32x64xf16>,
}
// LOAD-ND-LABEL: @load_transpose_f16(
-// LOAD-ND: vector.transfer_read
+// LOAD-ND: %[[LOAD:.*]] = xegpu.load_nd
+// LOAD-ND: vector.transpose %[[LOAD]], [1, 0] : vector<16x8xf16> to vector<8x16xf16>
// LOAD-GATHER-LABEL: @load_transpose_f16(
// LOAD-GATHER-SAME: %[[SRC:.+]]: memref<32x64xf16>,
More information about the Mlir-commits
mailing list