[Mlir-commits] [mlir] [AMDGPU] fold `memref.subview` into `amdgpu.gather_to_lds` (PR #149851)
Alan Li
llvmlistbot at llvm.org
Mon Jul 21 11:49:38 PDT 2025
https://github.com/lialan updated https://github.com/llvm/llvm-project/pull/149851
>From 9f6afe18bceeca2b9d6e26368be2e06bbaf870a9 Mon Sep 17 00:00:00 2001
From: Alan Li <me at alanli.org>
Date: Mon, 21 Jul 2025 16:33:54 +0000
Subject: [PATCH 1/4] [AMDGPU] fold memref.subview into amdgpu.gather_to_lds
---
.../mlir/Dialect/AMDGPU/Transforms/Passes.h | 6 +-
.../mlir/Dialect/AMDGPU/Transforms/Passes.td | 12 ++++
.../Dialect/AMDGPU/Transforms/CMakeLists.txt | 3 +-
.../AMDGPU/Transforms/FoldSubviewOps.cpp | 65 +++++++++++++++++++
.../Dialect/AMDGPU/amdgpu-fold-subviews.mlir | 50 ++++++++++++++
5 files changed, 134 insertions(+), 2 deletions(-)
create mode 100644 mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp
create mode 100644 mlir/test/Dialect/AMDGPU/amdgpu-fold-subviews.mlir
diff --git a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h
index cc2f543e79f69..a61903609aaff 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h
@@ -22,8 +22,9 @@ class ConversionTarget;
namespace amdgpu {
#define GEN_PASS_DECL_AMDGPUEMULATEATOMICSPASS
-#define GEN_PASS_DECL_AMDGPURESOLVESTRIDEDMETADATAPASS
+#define GEN_PASS_DECL_AMDGPUFOLDSUBVIEWOPSPASS
#define GEN_PASS_DECL_AMDGPUMASKEDLOADTOLOADPASS
+#define GEN_PASS_DECL_AMDGPURESOLVESTRIDEDMETADATAPASS
#define GEN_PASS_REGISTRATION
#include "mlir/Dialect/AMDGPU/Transforms/Passes.h.inc"
@@ -38,6 +39,9 @@ void populateAmdgpuResolveStridedMetadataPatterns(RewritePatternSet &patterns,
void populateAmdgpuMaskedloadToLoadPatterns(RewritePatternSet &patterns,
PatternBenefit benefit = 1);
+void populateAmdgpuFoldSubviewOpsPatterns(RewritePatternSet &patterns,
+ PatternBenefit benefit = 1);
+
} // namespace amdgpu
} // namespace mlir
diff --git a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td
index 8d0e6829ab0cc..7529511b0ea76 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td
@@ -70,4 +70,16 @@ def AmdgpuMaskedloadToLoadPass : Pass<"amdgpu-maskedload-to-load"> {
"memref::MemRefDialect"
];
}
+
+def AmdgpuFoldSubviewOpsPass : Pass<"amdgpu-fold-subview-ops"> {
+ let summary = "Fold subview operations into their parent operations";
+ let description = [{
+ This pass identifies `memref.subview` source of `GatherToLDSOp` and
+ attempts to fold the source op, potentially simplifying the overall
+ operation and improving performance.
+ }];
+ let dependentDialects = [
+ "memref::MemRefDialect"
+ ];
+}
#endif // MLIR_DIALECT_AMDGPU_TRANSFORMS_PASSES_TD_
diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt b/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt
index 17bbe54ea6c0c..20621ec0d55a4 100644
--- a/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt
@@ -1,7 +1,8 @@
add_mlir_dialect_library(MLIRAMDGPUTransforms
EmulateAtomics.cpp
- ResolveStridedMetadata.cpp
+ FoldSubviewOps.cpp
MaskedloadToLoad.cpp
+ ResolveStridedMetadata.cpp
ADDITIONAL_HEADER_DIRS
{$MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/AMDGPU/Transforms
diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp b/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp
new file mode 100644
index 0000000000000..a962f7a2526b2
--- /dev/null
+++ b/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp
@@ -0,0 +1,65 @@
+//===- FoldSubviewOps.cpp - AMDGPU fold subview ops ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/AMDGPU/Transforms/Passes.h"
+
+#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
+#include "mlir/Dialect/Affine/ViewLikeInterfaceUtils.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+
+namespace mlir::amdgpu {
+#define GEN_PASS_DEF_AMDGPUFOLDSUBVIEWOPSPASS
+#include "mlir/Dialect/AMDGPU/Transforms/Passes.h.inc"
+} // namespace mlir::amdgpu
+
+using namespace mlir;
+using namespace mlir::amdgpu;
+
+namespace {
+struct AmdgpuFoldSubviewOpsPass
+ : public amdgpu::impl::AmdgpuFoldSubviewOpsPassBase<
+ AmdgpuFoldSubviewOpsPass> {
+ void runOnOperation() override {
+ RewritePatternSet patterns(&getContext());
+ populateAmdgpuFoldSubviewOpsPatterns(patterns);
+ if (failed(applyPatternsGreedily(getOperation(), std::move(patterns))))
+ signalPassFailure();
+ }
+};
+
+struct FoldSubviewIntoGatherToLDSOp : public OpRewritePattern<GatherToLDSOp> {
+ using OpRewritePattern<GatherToLDSOp>::OpRewritePattern;
+ LogicalResult matchAndRewrite(GatherToLDSOp op,
+ PatternRewriter &rewriter) const override {
+ Location loc = op.getLoc();
+
+ // Check if the source is a subview operation:
+ auto subviewOp = dyn_cast<memref::SubViewOp>(op.getSrc().getDefiningOp());
+ if (!subviewOp)
+ return rewriter.notifyMatchFailure(
+ loc, "GatherToLDSOp can only be folded if the source is a SubviewOp");
+
+ SmallVector<Value> sourceIndices;
+ mlir::affine::resolveIndicesIntoOpWithOffsetsAndStrides(
+ rewriter, loc, subviewOp.getMixedOffsets(), subviewOp.getMixedStrides(),
+ subviewOp.getDroppedDims(), op.getSrcIndices(), sourceIndices);
+
+ rewriter.replaceOpWithNewOp<GatherToLDSOp>(
+ op, subviewOp.getSource(), sourceIndices, op.getDst(),
+ op.getDstIndices(), op.getTransferType());
+
+ return success();
+ }
+};
+} // namespace
+
+void mlir::amdgpu::populateAmdgpuFoldSubviewOpsPatterns(
+ RewritePatternSet &patterns, PatternBenefit benefit) {
+ patterns.add<FoldSubviewIntoGatherToLDSOp>(patterns.getContext(), benefit);
+}
diff --git a/mlir/test/Dialect/AMDGPU/amdgpu-fold-subviews.mlir b/mlir/test/Dialect/AMDGPU/amdgpu-fold-subviews.mlir
new file mode 100644
index 0000000000000..d582991c3622f
--- /dev/null
+++ b/mlir/test/Dialect/AMDGPU/amdgpu-fold-subviews.mlir
@@ -0,0 +1,50 @@
+// RUN: mlir-opt -amdgpu-fold-subview-ops -split-input-file %s | FileCheck %s
+
+#gpu_lds_addrspace = 3
+
+// CHECK: func @test_memref
+// CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index
+func.func @test_memref(%offset_i: index, %offset_j: index) {
+ // CHECK: %[[C0:.*]] = arith.constant 0 : index
+ // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, 3>
+ // CHECK: %[[MEM:.*]] = memref.alloc() : memref<64x128xf16>
+ // CHECK: %[[MEM]][%[[ARG0]], %[[ARG1]]], %[[LOCAL]][%[[C0]], %[[C0]]]
+ // CHECK-SAME: vector<8xf16>, memref<64x128xf16>, memref<64x64xf16, 3>
+
+ %alloc = memref.alloc() : memref<64x64xf16, #gpu_lds_addrspace>
+ %mem = memref.alloc() : memref<64x128xf16>
+ %subview = memref.subview %mem[0, 0][32, 64][1, 1] : memref<64x128xf16> to memref<32x64xf16, strided<[128, 1]>>
+ %c0 = arith.constant 0 : index
+ amdgpu.gather_to_lds %subview[%offset_i, %offset_j], %alloc[%c0, %c0]
+ : vector<8xf16>, memref<32x64xf16, strided<[128, 1]>>, memref<64x64xf16, #gpu_lds_addrspace>
+ func.return
+}
+
+// -----
+
+#gpu_lds_addrspace = 3
+
+// CHECK: #[[MAP:.*]] = affine_map<()[s0] -> (s0 + 32)>
+// CHECK: #[[MAP1:.*]] = affine_map<()[s0] -> (s0 + 64)>
+
+// CHECK: func @subview_folding_offset
+// CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index
+func.func @subview_folding_offset(%offset_i: index, %offset_j: index) {
+ // CHECK: %[[C0:.*]] = arith.constant 0 : index
+ // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, 3>
+ // CHECK: %[[MEM:.*]] = memref.alloc() : memref<64x128xf16>
+
+ // CHECK: %[[IDX0:.*]] = affine.apply #[[MAP]]()[%[[ARG0]]]
+ // CHECK: %[[IDX1:.*]] = affine.apply #[[MAP1]]()[%[[ARG1]]]
+
+ // CHECK: %[[MEM]][%[[IDX0]], %[[IDX1]]], %[[LOCAL]][%[[C0]], %[[C0]]]
+ // CHECK-SAME: vector<8xf16>, memref<64x128xf16>, memref<64x64xf16, 3>
+
+ %alloc = memref.alloc() : memref<64x64xf16, #gpu_lds_addrspace>
+ %mem = memref.alloc() : memref<64x128xf16>
+ %subview = memref.subview %mem[32, 64][32, 64][1, 1] : memref<64x128xf16> to memref<32x64xf16, strided<[128, 1], offset: 4160>>
+ %c0 = arith.constant 0 : index
+ amdgpu.gather_to_lds %subview[%offset_i, %offset_j], %alloc[%c0, %c0]
+ : vector<8xf16>, memref<32x64xf16, strided<[128, 1], offset: 4160>>, memref<64x64xf16, #gpu_lds_addrspace>
+ func.return
+}
>From 71fe3aa49154184123546c40c72d695680be7133 Mon Sep 17 00:00:00 2001
From: Alan Li <alan.li at me.com>
Date: Mon, 21 Jul 2025 14:21:05 -0400
Subject: [PATCH 2/4] Update
mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp
Co-authored-by: Copilot <175728472+Copilot at users.noreply.github.com>
---
mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp b/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp
index a962f7a2526b2..7b81800f07ab2 100644
--- a/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp
+++ b/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp
@@ -43,7 +43,7 @@ struct FoldSubviewIntoGatherToLDSOp : public OpRewritePattern<GatherToLDSOp> {
auto subviewOp = dyn_cast<memref::SubViewOp>(op.getSrc().getDefiningOp());
if (!subviewOp)
return rewriter.notifyMatchFailure(
- loc, "GatherToLDSOp can only be folded if the source is a SubviewOp");
+ loc, "GatherToLDSOp folding is currently supported only when the source is a SubviewOp. This is one specific pattern, and other scenarios may be added in the future.");
SmallVector<Value> sourceIndices;
mlir::affine::resolveIndicesIntoOpWithOffsetsAndStrides(
>From bd4ade5466db59f84e88dc62773c38a40bb05c77 Mon Sep 17 00:00:00 2001
From: Alan Li <alan.li at me.com>
Date: Mon, 21 Jul 2025 14:21:15 -0400
Subject: [PATCH 3/4] Update
mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td
Co-authored-by: Copilot <175728472+Copilot at users.noreply.github.com>
---
mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td
index 7529511b0ea76..fad939ced9877 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td
@@ -74,8 +74,8 @@ def AmdgpuMaskedloadToLoadPass : Pass<"amdgpu-maskedload-to-load"> {
def AmdgpuFoldSubviewOpsPass : Pass<"amdgpu-fold-subview-ops"> {
let summary = "Fold subview operations into their parent operations";
let description = [{
- This pass identifies `memref.subview` source of `GatherToLDSOp` and
- attempts to fold the source op, potentially simplifying the overall
+ This pass identifies `memref.subview` sources of `GatherToLDSOp` and
+ attempts to fold the source ops, potentially simplifying the overall
operation and improving performance.
}];
let dependentDialects = [
>From 9552f4ed9b2857c79fedb2faab32cdaddd8dfda1 Mon Sep 17 00:00:00 2001
From: Alan Li <me at alanli.org>
Date: Mon, 21 Jul 2025 14:49:21 -0400
Subject: [PATCH 4/4] linting
---
mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp b/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp
index 7b81800f07ab2..adbdf4b856bd5 100644
--- a/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp
+++ b/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp
@@ -43,7 +43,9 @@ struct FoldSubviewIntoGatherToLDSOp : public OpRewritePattern<GatherToLDSOp> {
auto subviewOp = dyn_cast<memref::SubViewOp>(op.getSrc().getDefiningOp());
if (!subviewOp)
return rewriter.notifyMatchFailure(
- loc, "GatherToLDSOp folding is currently supported only when the source is a SubviewOp. This is one specific pattern, and other scenarios may be added in the future.");
+ loc, "GatherToLDSOp folding is currently supported only when the "
+ "source is a SubviewOp. This is one specific pattern, and other "
+ "scenarios may be added in the future.");
SmallVector<Value> sourceIndices;
mlir::affine::resolveIndicesIntoOpWithOffsetsAndStrides(
More information about the Mlir-commits
mailing list