[Mlir-commits] [mlir] [AMDGPU] Adding FoldMemRefOpsIntoTransposeLoadOp pattern (PR #183330)
Zhuoran Yin
llvmlistbot at llvm.org
Wed Feb 25 08:22:57 PST 2026
https://github.com/jerryyin created https://github.com/llvm/llvm-project/pull/183330
Before this fix, a trivial expand_shape was not folded into the index computation. This later forces the expand_shape to be materialized, unnecessarily, into an extract_strided_metadata and a reinterpret_cast. The example below shows the motivating case: a source IR that cannot be folded today.
```mlir
%expanded = memref.expand_shape %buf [[0, 1], [2, 3]]
: memref<32x128xf16, strided<[128, 1], offset: ?>, #gpu.address_space<workgroup>>
into memref<1x32x8x16xf16, strided<..., offset: ?>, #gpu.address_space<workgroup>>
amdgpu.transpose_load %expanded[%i, %j, %k, %l]
: memref<1x32x8x16xf16, ...> -> vector<4xf16>
```
With this pattern, which matches the behavior of the more generic `FoldMemRefAliasOpsPass`, the expand_shape can now be folded into the transpose_load op, just as it is for other loads/stores.
The current `FoldMemRefAliasOps` pass doesn't use a more generic interface yet — it still uses hardcoded overloads. This PR continues the pragmatic approach of providing a dedicated folding pattern (as is already done for `GatherToLDSOp`).
>From 963eb8b2971e99f4bd8d2ed4f01ce35e79ec0bbb Mon Sep 17 00:00:00 2001
From: jerryyin <zhuoryin at amd.com>
Date: Wed, 25 Feb 2026 16:01:00 +0000
Subject: [PATCH] Adding FoldMemRefOpsIntoTransposeLoadOp pattern
---
.../AMDGPU/Transforms/FoldMemRefsOps.cpp | 24 +++-
.../Dialect/AMDGPU/amdgpu-fold-memrefs.mlir | 103 ++++++++++++++++++
2 files changed, 126 insertions(+), 1 deletion(-)
diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/FoldMemRefsOps.cpp b/mlir/lib/Dialect/AMDGPU/Transforms/FoldMemRefsOps.cpp
index 9a6915b003df4..21269ef009a6f 100644
--- a/mlir/lib/Dialect/AMDGPU/Transforms/FoldMemRefsOps.cpp
+++ b/mlir/lib/Dialect/AMDGPU/Transforms/FoldMemRefsOps.cpp
@@ -100,8 +100,30 @@ struct FoldMemRefOpsIntoGatherToLDSOp final : OpRewritePattern<GatherToLDSOp> {
}
};
+struct FoldMemRefOpsIntoTransposeLoadOp final
+ : OpRewritePattern<TransposeLoadOp> {
+ using OpRewritePattern::OpRewritePattern;
+ LogicalResult matchAndRewrite(TransposeLoadOp op,
+ PatternRewriter &rewriter) const override {
+ Location loc = op.getLoc();
+
+ SmallVector<Value> sourceIndices;
+ Value memrefSource;
+
+ if (failed(foldMemrefViewOp(rewriter, loc, op.getSrc(), op.getSrcIndices(),
+ sourceIndices, memrefSource, "source")))
+ return failure();
+
+ rewriter.replaceOpWithNewOp<TransposeLoadOp>(op, op.getResult().getType(),
+ memrefSource, sourceIndices);
+ return success();
+ }
+};
+
void populateAmdgpuFoldMemRefOpsPatterns(RewritePatternSet &patterns,
PatternBenefit benefit) {
- patterns.add<FoldMemRefOpsIntoGatherToLDSOp>(patterns.getContext(), benefit);
+ patterns
+ .add<FoldMemRefOpsIntoGatherToLDSOp, FoldMemRefOpsIntoTransposeLoadOp>(
+ patterns.getContext(), benefit);
}
} // namespace mlir::amdgpu
diff --git a/mlir/test/Dialect/AMDGPU/amdgpu-fold-memrefs.mlir b/mlir/test/Dialect/AMDGPU/amdgpu-fold-memrefs.mlir
index 77a5e4af4c1d1..11571b060a0b8 100644
--- a/mlir/test/Dialect/AMDGPU/amdgpu-fold-memrefs.mlir
+++ b/mlir/test/Dialect/AMDGPU/amdgpu-fold-memrefs.mlir
@@ -183,3 +183,106 @@ func.func @test_async_flag_preserved(%offset_i: index, %offset_j: index) {
: vector<8xf16>, memref<32x64xf16, strided<[128, 1]>>, memref<64x64xf16, #gpu_lds_addrspace>
func.return
}
+
+// -----
+
+#gpu_wg = #gpu.address_space<workgroup>
+
+// CHECK: func @test_transpose_load_subview
+// CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index
+func.func @test_transpose_load_subview(%offset_i: index, %offset_j: index) -> vector<4xf16> {
+ // CHECK: %[[ALLOC:.*]] = memref.alloc() : memref<64x128xf16, #gpu.address_space<workgroup>>
+ // CHECK: amdgpu.transpose_load %[[ALLOC]][%[[ARG0]], %[[ARG1]]]
+ // CHECK-SAME: memref<64x128xf16, #gpu.address_space<workgroup>> -> vector<4xf16>
+
+ %alloc = memref.alloc() : memref<64x128xf16, #gpu_wg>
+ %subview = memref.subview %alloc[0, 0][32, 64][1, 1]
+ : memref<64x128xf16, #gpu_wg> to memref<32x64xf16, strided<[128, 1]>, #gpu_wg>
+ %result = amdgpu.transpose_load %subview[%offset_i, %offset_j]
+ : memref<32x64xf16, strided<[128, 1]>, #gpu_wg> -> vector<4xf16>
+ return %result : vector<4xf16>
+}
+
+// -----
+
+#gpu_wg = #gpu.address_space<workgroup>
+
+// CHECK: #[[MAP:.*]] = affine_map<()[s0] -> (s0 + 32)>
+// CHECK: #[[MAP1:.*]] = affine_map<()[s0] -> (s0 + 64)>
+
+// CHECK: func @test_transpose_load_subview_offset
+// CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index
+func.func @test_transpose_load_subview_offset(%offset_i: index, %offset_j: index) -> vector<4xf16> {
+ // CHECK: %[[ALLOC:.*]] = memref.alloc() : memref<64x128xf16, #gpu.address_space<workgroup>>
+ // CHECK: %[[IDX0:.*]] = affine.apply #[[MAP]]()[%[[ARG0]]]
+ // CHECK: %[[IDX1:.*]] = affine.apply #[[MAP1]]()[%[[ARG1]]]
+ // CHECK: amdgpu.transpose_load %[[ALLOC]][%[[IDX0]], %[[IDX1]]]
+ // CHECK-SAME: memref<64x128xf16, #gpu.address_space<workgroup>> -> vector<4xf16>
+
+ %alloc = memref.alloc() : memref<64x128xf16, #gpu_wg>
+ %subview = memref.subview %alloc[32, 64][32, 64][1, 1]
+ : memref<64x128xf16, #gpu_wg>
+ to memref<32x64xf16, strided<[128, 1], offset: 4160>, #gpu_wg>
+ %result = amdgpu.transpose_load %subview[%offset_i, %offset_j]
+ : memref<32x64xf16, strided<[128, 1], offset: 4160>, #gpu_wg> -> vector<4xf16>
+ return %result : vector<4xf16>
+}
+
+// -----
+
+#gpu_wg = #gpu.address_space<workgroup>
+
+// CHECK: func @test_transpose_load_expand_shape
+// CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index
+func.func @test_transpose_load_expand_shape(%offset_i: index, %offset_j: index) -> vector<4xf16> {
+ // CHECK: %[[ALLOC:.*]] = memref.alloc() : memref<4096xf16, #gpu.address_space<workgroup>>
+ // CHECK: %[[IDX:.*]] = affine.linearize_index [%[[ARG0]], %[[ARG1]]] by (32, 128) : index
+ // CHECK: amdgpu.transpose_load %[[ALLOC]][%[[IDX]]]
+ // CHECK-SAME: memref<4096xf16, #gpu.address_space<workgroup>> -> vector<4xf16>
+
+ %alloc = memref.alloc() : memref<4096xf16, #gpu_wg>
+ %expand = memref.expand_shape %alloc [[0, 1]] output_shape [32, 128]
+ : memref<4096xf16, #gpu_wg> into memref<32x128xf16, #gpu_wg>
+ %result = amdgpu.transpose_load %expand[%offset_i, %offset_j]
+ : memref<32x128xf16, #gpu_wg> -> vector<4xf16>
+ return %result : vector<4xf16>
+}
+
+// -----
+
+#gpu_wg = #gpu.address_space<workgroup>
+
+// CHECK: func @test_transpose_load_collapse_shape
+// CHECK-SAME: %[[ARG0:.*]]: index
+func.func @test_transpose_load_collapse_shape(%offset_i: index) -> vector<4xf16> {
+ // CHECK: %[[ALLOC:.*]] = memref.alloc() : memref<32x128xf16, #gpu.address_space<workgroup>>
+ // CHECK: %[[INDICES:.*]]:2 = affine.delinearize_index %[[ARG0]] into (32, 128) : index, index
+ // CHECK: amdgpu.transpose_load %[[ALLOC]][%[[INDICES]]#0, %[[INDICES]]#1]
+ // CHECK-SAME: memref<32x128xf16, #gpu.address_space<workgroup>> -> vector<4xf16>
+
+ %alloc = memref.alloc() : memref<32x128xf16, #gpu_wg>
+ %collapse = memref.collapse_shape %alloc [[0, 1]]
+ : memref<32x128xf16, #gpu_wg> into memref<4096xf16, #gpu_wg>
+ %result = amdgpu.transpose_load %collapse[%offset_i]
+ : memref<4096xf16, #gpu_wg> -> vector<4xf16>
+ return %result : vector<4xf16>
+}
+
+// -----
+
+#gpu_wg = #gpu.address_space<workgroup>
+
+// CHECK: func @test_transpose_load_nop
+// CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index
+func.func @test_transpose_load_nop(%offset_i: index, %offset_j: index) -> vector<4xf16> {
+ // CHECK: %[[ALLOC:.*]] = memref.alloc() : memref<32x128xf16, #gpu.address_space<workgroup>>
+ // CHECK: amdgpu.transpose_load %[[ALLOC]][%[[ARG0]], %[[ARG1]]]
+ // CHECK-SAME: memref<32x128xf16, #gpu.address_space<workgroup>> -> vector<4xf16>
+ // CHECK-NOT: subview
+ // CHECK-NOT: expand_shape
+
+ %alloc = memref.alloc() : memref<32x128xf16, #gpu_wg>
+ %result = amdgpu.transpose_load %alloc[%offset_i, %offset_j]
+ : memref<32x128xf16, #gpu_wg> -> vector<4xf16>
+ return %result : vector<4xf16>
+}
More information about the Mlir-commits
mailing list