[Mlir-commits] [mlir] 642763c - [AMDGPU] Adding FoldMemRefOpsIntoTransposeLoadOp pattern (#183330)
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Wed Feb 25 13:58:48 PST 2026
Author: Zhuoran Yin
Date: 2026-02-25T16:58:44-05:00
New Revision: 642763c553fdf6e699a3dd867b984b5c8044c07e
URL: https://github.com/llvm/llvm-project/commit/642763c553fdf6e699a3dd867b984b5c8044c07e
DIFF: https://github.com/llvm/llvm-project/commit/642763c553fdf6e699a3dd867b984b5c8044c07e.diff
LOG: [AMDGPU] Adding FoldMemRefOpsIntoTransposeLoadOp pattern (#183330)
Before this change, a trivial expand_shape feeding transpose_load was not
folded into the index computation. The expand_shape would later be
materialized, unnecessarily, as an extract_strided_metadata plus a
reinterpret_cast. The example below shows the motivating source IR, which
cannot be folded today.
```mlir
%expanded = memref.expand_shape %buf [[0, 1], [2, 3]]
: memref<32x128xf16, strided<[128, 1], offset: ?>, #gpu.address_space<workgroup>>
into memref<1x32x8x16xf16, strided<..., offset: ?>, #gpu.address_space<workgroup>>
amdgpu.transpose_load %expanded[%i, %j, %k, %l]
: memref<1x32x8x16xf16, ...> -> vector<4xf16>
```
With this pattern, which mirrors the more generic
`FoldMemRefAliasOpsPass`, the expand_shape can now fold into the
transpose_load op just as it does for other loads/stores.
The current `FoldMemRefAliasOps` pass doesn't use a generic interface
yet — it still uses hardcoded overloads. This PR continues the pragmatic
approach of providing a dedicated folding pattern (as was done for
`GatherToLDSOp`).
Added:
Modified:
mlir/lib/Dialect/AMDGPU/Transforms/FoldMemRefsOps.cpp
mlir/test/Dialect/AMDGPU/amdgpu-fold-memrefs.mlir
Removed:
################################################################################
diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/FoldMemRefsOps.cpp b/mlir/lib/Dialect/AMDGPU/Transforms/FoldMemRefsOps.cpp
index 9a6915b003df4..19b4341cc27a1 100644
--- a/mlir/lib/Dialect/AMDGPU/Transforms/FoldMemRefsOps.cpp
+++ b/mlir/lib/Dialect/AMDGPU/Transforms/FoldMemRefsOps.cpp
@@ -66,7 +66,7 @@ static LogicalResult foldMemrefViewOp(PatternRewriter &rewriter, Location loc,
}
struct FoldMemRefOpsIntoGatherToLDSOp final : OpRewritePattern<GatherToLDSOp> {
- using OpRewritePattern::OpRewritePattern;
+ using Base::Base;
LogicalResult matchAndRewrite(GatherToLDSOp op,
PatternRewriter &rewriter) const override {
Location loc = op.getLoc();
@@ -100,8 +100,29 @@ struct FoldMemRefOpsIntoGatherToLDSOp final : OpRewritePattern<GatherToLDSOp> {
}
};
+struct FoldMemRefOpsIntoTransposeLoadOp final
+ : OpRewritePattern<TransposeLoadOp> {
+ using Base::Base;
+ LogicalResult matchAndRewrite(TransposeLoadOp op,
+ PatternRewriter &rewriter) const override {
+ SmallVector<Value> sourceIndices;
+ Value memrefSource;
+
+ if (failed(foldMemrefViewOp(rewriter, op.getLoc(), op.getSrc(),
+ op.getSrcIndices(), sourceIndices, memrefSource,
+ "source")))
+ return failure();
+
+ rewriter.replaceOpWithNewOp<TransposeLoadOp>(op, op.getResult().getType(),
+ memrefSource, sourceIndices);
+ return success();
+ }
+};
+
void populateAmdgpuFoldMemRefOpsPatterns(RewritePatternSet &patterns,
PatternBenefit benefit) {
- patterns.add<FoldMemRefOpsIntoGatherToLDSOp>(patterns.getContext(), benefit);
+ patterns
+ .add<FoldMemRefOpsIntoGatherToLDSOp, FoldMemRefOpsIntoTransposeLoadOp>(
+ patterns.getContext(), benefit);
}
} // namespace mlir::amdgpu
diff --git a/mlir/test/Dialect/AMDGPU/amdgpu-fold-memrefs.mlir b/mlir/test/Dialect/AMDGPU/amdgpu-fold-memrefs.mlir
index 77a5e4af4c1d1..11571b060a0b8 100644
--- a/mlir/test/Dialect/AMDGPU/amdgpu-fold-memrefs.mlir
+++ b/mlir/test/Dialect/AMDGPU/amdgpu-fold-memrefs.mlir
@@ -183,3 +183,106 @@ func.func @test_async_flag_preserved(%offset_i: index, %offset_j: index) {
: vector<8xf16>, memref<32x64xf16, strided<[128, 1]>>, memref<64x64xf16, #gpu_lds_addrspace>
func.return
}
+
+// -----
+
+#gpu_wg = #gpu.address_space<workgroup>
+
+// CHECK: func @test_transpose_load_subview
+// CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index
+func.func @test_transpose_load_subview(%offset_i: index, %offset_j: index) -> vector<4xf16> {
+ // CHECK: %[[ALLOC:.*]] = memref.alloc() : memref<64x128xf16, #gpu.address_space<workgroup>>
+ // CHECK: amdgpu.transpose_load %[[ALLOC]][%[[ARG0]], %[[ARG1]]]
+ // CHECK-SAME: memref<64x128xf16, #gpu.address_space<workgroup>> -> vector<4xf16>
+
+ %alloc = memref.alloc() : memref<64x128xf16, #gpu_wg>
+ %subview = memref.subview %alloc[0, 0][32, 64][1, 1]
+ : memref<64x128xf16, #gpu_wg> to memref<32x64xf16, strided<[128, 1]>, #gpu_wg>
+ %result = amdgpu.transpose_load %subview[%offset_i, %offset_j]
+ : memref<32x64xf16, strided<[128, 1]>, #gpu_wg> -> vector<4xf16>
+ return %result : vector<4xf16>
+}
+
+// -----
+
+#gpu_wg = #gpu.address_space<workgroup>
+
+// CHECK: #[[MAP:.*]] = affine_map<()[s0] -> (s0 + 32)>
+// CHECK: #[[MAP1:.*]] = affine_map<()[s0] -> (s0 + 64)>
+
+// CHECK: func @test_transpose_load_subview_offset
+// CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index
+func.func @test_transpose_load_subview_offset(%offset_i: index, %offset_j: index) -> vector<4xf16> {
+ // CHECK: %[[ALLOC:.*]] = memref.alloc() : memref<64x128xf16, #gpu.address_space<workgroup>>
+ // CHECK: %[[IDX0:.*]] = affine.apply #[[MAP]]()[%[[ARG0]]]
+ // CHECK: %[[IDX1:.*]] = affine.apply #[[MAP1]]()[%[[ARG1]]]
+ // CHECK: amdgpu.transpose_load %[[ALLOC]][%[[IDX0]], %[[IDX1]]]
+ // CHECK-SAME: memref<64x128xf16, #gpu.address_space<workgroup>> -> vector<4xf16>
+
+ %alloc = memref.alloc() : memref<64x128xf16, #gpu_wg>
+ %subview = memref.subview %alloc[32, 64][32, 64][1, 1]
+ : memref<64x128xf16, #gpu_wg>
+ to memref<32x64xf16, strided<[128, 1], offset: 4160>, #gpu_wg>
+ %result = amdgpu.transpose_load %subview[%offset_i, %offset_j]
+ : memref<32x64xf16, strided<[128, 1], offset: 4160>, #gpu_wg> -> vector<4xf16>
+ return %result : vector<4xf16>
+}
+
+// -----
+
+#gpu_wg = #gpu.address_space<workgroup>
+
+// CHECK: func @test_transpose_load_expand_shape
+// CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index
+func.func @test_transpose_load_expand_shape(%offset_i: index, %offset_j: index) -> vector<4xf16> {
+ // CHECK: %[[ALLOC:.*]] = memref.alloc() : memref<4096xf16, #gpu.address_space<workgroup>>
+ // CHECK: %[[IDX:.*]] = affine.linearize_index [%[[ARG0]], %[[ARG1]]] by (32, 128) : index
+ // CHECK: amdgpu.transpose_load %[[ALLOC]][%[[IDX]]]
+ // CHECK-SAME: memref<4096xf16, #gpu.address_space<workgroup>> -> vector<4xf16>
+
+ %alloc = memref.alloc() : memref<4096xf16, #gpu_wg>
+ %expand = memref.expand_shape %alloc [[0, 1]] output_shape [32, 128]
+ : memref<4096xf16, #gpu_wg> into memref<32x128xf16, #gpu_wg>
+ %result = amdgpu.transpose_load %expand[%offset_i, %offset_j]
+ : memref<32x128xf16, #gpu_wg> -> vector<4xf16>
+ return %result : vector<4xf16>
+}
+
+// -----
+
+#gpu_wg = #gpu.address_space<workgroup>
+
+// CHECK: func @test_transpose_load_collapse_shape
+// CHECK-SAME: %[[ARG0:.*]]: index
+func.func @test_transpose_load_collapse_shape(%offset_i: index) -> vector<4xf16> {
+ // CHECK: %[[ALLOC:.*]] = memref.alloc() : memref<32x128xf16, #gpu.address_space<workgroup>>
+ // CHECK: %[[INDICES:.*]]:2 = affine.delinearize_index %[[ARG0]] into (32, 128) : index, index
+ // CHECK: amdgpu.transpose_load %[[ALLOC]][%[[INDICES]]#0, %[[INDICES]]#1]
+ // CHECK-SAME: memref<32x128xf16, #gpu.address_space<workgroup>> -> vector<4xf16>
+
+ %alloc = memref.alloc() : memref<32x128xf16, #gpu_wg>
+ %collapse = memref.collapse_shape %alloc [[0, 1]]
+ : memref<32x128xf16, #gpu_wg> into memref<4096xf16, #gpu_wg>
+ %result = amdgpu.transpose_load %collapse[%offset_i]
+ : memref<4096xf16, #gpu_wg> -> vector<4xf16>
+ return %result : vector<4xf16>
+}
+
+// -----
+
+#gpu_wg = #gpu.address_space<workgroup>
+
+// CHECK: func @test_transpose_load_nop
+// CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index
+func.func @test_transpose_load_nop(%offset_i: index, %offset_j: index) -> vector<4xf16> {
+ // CHECK: %[[ALLOC:.*]] = memref.alloc() : memref<32x128xf16, #gpu.address_space<workgroup>>
+ // CHECK: amdgpu.transpose_load %[[ALLOC]][%[[ARG0]], %[[ARG1]]]
+ // CHECK-SAME: memref<32x128xf16, #gpu.address_space<workgroup>> -> vector<4xf16>
+ // CHECK-NOT: subview
+ // CHECK-NOT: expand_shape
+
+ %alloc = memref.alloc() : memref<32x128xf16, #gpu_wg>
+ %result = amdgpu.transpose_load %alloc[%offset_i, %offset_j]
+ : memref<32x128xf16, #gpu_wg> -> vector<4xf16>
+ return %result : vector<4xf16>
+}
More information about the Mlir-commits
mailing list