[Mlir-commits] [mlir] [AMDGPU] fold `memref.subview` into `amdgpu.gather_to_lds` (PR #149851)

Tue Jul 22 07:07:25 PDT 2025

https://github.com/lialan updated https://github.com/llvm/llvm-project/pull/149851

>From 9f6afe18bceeca2b9d6e26368be2e06bbaf870a9 Mon Sep 17 00:00:00 2001
From: Alan Li <me at alanli.org>
Date: Mon, 21 Jul 2025 16:33:54 +0000
Subject: [PATCH 1/7] [AMDGPU] fold memref.subview into amdgpu.gather_to_lds

---
 .../mlir/Dialect/AMDGPU/Transforms/Passes.h   |  6 +-
 .../mlir/Dialect/AMDGPU/Transforms/Passes.td  | 12 ++++
 .../Dialect/AMDGPU/Transforms/CMakeLists.txt  |  3 +-
 .../AMDGPU/Transforms/FoldSubviewOps.cpp      | 65 +++++++++++++++++++
 .../Dialect/AMDGPU/amdgpu-fold-subviews.mlir  | 50 ++++++++++++++
 5 files changed, 134 insertions(+), 2 deletions(-)
 create mode 100644 mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp
 create mode 100644 mlir/test/Dialect/AMDGPU/amdgpu-fold-subviews.mlir

diff --git a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h
index cc2f543e79f69..a61903609aaff 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h
@@ -22,8 +22,9 @@ class ConversionTarget;
 namespace amdgpu {
 
 #define GEN_PASS_DECL_AMDGPUEMULATEATOMICSPASS
-#define GEN_PASS_DECL_AMDGPURESOLVESTRIDEDMETADATAPASS
+#define GEN_PASS_DECL_AMDGPUFOLDSUBVIEWOPSPASS
 #define GEN_PASS_DECL_AMDGPUMASKEDLOADTOLOADPASS
+#define GEN_PASS_DECL_AMDGPURESOLVESTRIDEDMETADATAPASS
 #define GEN_PASS_REGISTRATION
 #include "mlir/Dialect/AMDGPU/Transforms/Passes.h.inc"
 
@@ -38,6 +39,9 @@ void populateAmdgpuResolveStridedMetadataPatterns(RewritePatternSet &patterns,
 void populateAmdgpuMaskedloadToLoadPatterns(RewritePatternSet &patterns,
                                             PatternBenefit benefit = 1);
 
+void populateAmdgpuFoldSubviewOpsPatterns(RewritePatternSet &patterns,
+                                          PatternBenefit benefit = 1);
+
 } // namespace amdgpu
 } // namespace mlir
 
diff --git a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td
index 8d0e6829ab0cc..7529511b0ea76 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td
@@ -70,4 +70,16 @@ def AmdgpuMaskedloadToLoadPass : Pass<"amdgpu-maskedload-to-load"> {
     "memref::MemRefDialect"
   ];
 }
+
+def AmdgpuFoldSubviewOpsPass : Pass<"amdgpu-fold-subview-ops"> {
+  let summary = "Fold subview operations into their parent operations";
+  let description = [{
+    This pass identifies `memref.subview` source of `GatherToLDSOp` and
+    attempts to fold the source op, potentially simplifying the overall
+    operation and improving performance.
+  }];
+  let dependentDialects = [
+    "memref::MemRefDialect"
+  ];
+}
 #endif // MLIR_DIALECT_AMDGPU_TRANSFORMS_PASSES_TD_
diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt b/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt
index 17bbe54ea6c0c..20621ec0d55a4 100644
--- a/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt
@@ -1,7 +1,8 @@
 add_mlir_dialect_library(MLIRAMDGPUTransforms
   EmulateAtomics.cpp
-  ResolveStridedMetadata.cpp
+  FoldSubviewOps.cpp
   MaskedloadToLoad.cpp
+  ResolveStridedMetadata.cpp
 
   ADDITIONAL_HEADER_DIRS
   {$MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/AMDGPU/Transforms
diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp b/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp
new file mode 100644
index 0000000000000..a962f7a2526b2
--- /dev/null
+++ b/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp
@@ -0,0 +1,65 @@
+//===- FoldSubviewOps.cpp - AMDGPU fold subview ops ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/AMDGPU/Transforms/Passes.h"
+
+#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
+#include "mlir/Dialect/Affine/ViewLikeInterfaceUtils.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+
+namespace mlir::amdgpu {
+#define GEN_PASS_DEF_AMDGPUFOLDSUBVIEWOPSPASS
+#include "mlir/Dialect/AMDGPU/Transforms/Passes.h.inc"
+} // namespace mlir::amdgpu
+
+using namespace mlir;
+using namespace mlir::amdgpu;
+
+namespace {
+struct AmdgpuFoldSubviewOpsPass
+    : public amdgpu::impl::AmdgpuFoldSubviewOpsPassBase<
+          AmdgpuFoldSubviewOpsPass> {
+  void runOnOperation() override {
+    RewritePatternSet patterns(&getContext());
+    populateAmdgpuFoldSubviewOpsPatterns(patterns);
+    if (failed(applyPatternsGreedily(getOperation(), std::move(patterns))))
+      signalPassFailure();
+  }
+};
+
+struct FoldSubviewIntoGatherToLDSOp : public OpRewritePattern<GatherToLDSOp> {
+  using OpRewritePattern<GatherToLDSOp>::OpRewritePattern;
+  LogicalResult matchAndRewrite(GatherToLDSOp op,
+                                PatternRewriter &rewriter) const override {
+    Location loc = op.getLoc();
+
+    // Check if the source is a subview operation:
+    auto subviewOp = dyn_cast<memref::SubViewOp>(op.getSrc().getDefiningOp());
+    if (!subviewOp)
+      return rewriter.notifyMatchFailure(
+          loc, "GatherToLDSOp can only be folded if the source is a SubviewOp");
+
+    SmallVector<Value> sourceIndices;
+    mlir::affine::resolveIndicesIntoOpWithOffsetsAndStrides(
+        rewriter, loc, subviewOp.getMixedOffsets(), subviewOp.getMixedStrides(),
+        subviewOp.getDroppedDims(), op.getSrcIndices(), sourceIndices);
+
+    rewriter.replaceOpWithNewOp<GatherToLDSOp>(
+        op, subviewOp.getSource(), sourceIndices, op.getDst(),
+        op.getDstIndices(), op.getTransferType());
+
+    return success();
+  }
+};
+} // namespace
+
+void mlir::amdgpu::populateAmdgpuFoldSubviewOpsPatterns(
+    RewritePatternSet &patterns, PatternBenefit benefit) {
+  patterns.add<FoldSubviewIntoGatherToLDSOp>(patterns.getContext(), benefit);
+}
diff --git a/mlir/test/Dialect/AMDGPU/amdgpu-fold-subviews.mlir b/mlir/test/Dialect/AMDGPU/amdgpu-fold-subviews.mlir
new file mode 100644
index 0000000000000..d582991c3622f
--- /dev/null
+++ b/mlir/test/Dialect/AMDGPU/amdgpu-fold-subviews.mlir
@@ -0,0 +1,50 @@
+// RUN: mlir-opt -amdgpu-fold-subview-ops -split-input-file %s | FileCheck %s
+
+#gpu_lds_addrspace = 3
+
+// CHECK: func @test_memref
+// CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index
+func.func @test_memref(%offset_i: index, %offset_j: index) {
+  // CHECK: %[[C0:.*]] = arith.constant 0 : index
+  // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, 3>
+  // CHECK: %[[MEM:.*]] = memref.alloc() : memref<64x128xf16>
+  // CHECK:  %[[MEM]][%arg0, %arg1], %[[LOCAL]][%[[C0]], %[[C0]]]
+  // CHECK-SAME: vector<8xf16>, memref<64x128xf16>, memref<64x64xf16, 3>
+
+  %alloc = memref.alloc() : memref<64x64xf16, #gpu_lds_addrspace>
+  %mem = memref.alloc() : memref<64x128xf16>
+  %subview = memref.subview %mem[0, 0][32, 64][1, 1] : memref<64x128xf16> to memref<32x64xf16, strided<[128, 1]>>
+  %c0 = arith.constant 0 : index
+  amdgpu.gather_to_lds %subview[%offset_i, %offset_j], %alloc[%c0, %c0]
+    : vector<8xf16>, memref<32x64xf16, strided<[128, 1]>>, memref<64x64xf16, #gpu_lds_addrspace>
+  func.return
+}
+
+// -----
+
+#gpu_lds_addrspace = 3
+
+// CHECK: #[[MAP:.*]] = affine_map<()[s0] -> (s0 + 32)>
+// CHECK: #[[MAP1:.*]] = affine_map<()[s0] -> (s0 + 64)>
+
+// CHECK: func @subview_folding_offset
+// CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index
+func.func @subview_folding_offset(%offset_i: index, %offset_j: index) {
+  // CHECK: %[[C0:.*]] = arith.constant 0 : index
+  // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, 3>
+  // CHECK: %[[MEM:.*]] = memref.alloc() : memref<64x128xf16>
+
+  // CHECK: %[[IDX0:.*]] = affine.apply #[[MAP]]()[%[[ARG0]]]
+  // CHECK: %[[IDX1:.*]] = affine.apply #[[MAP1]]()[%[[ARG1]]]
+
+  // CHECK:  %[[MEM]][%[[IDX0]], %[[IDX1]]], %[[LOCAL]][%[[C0]], %[[C0]]]
+  // CHECK-SAME: vector<8xf16>, memref<64x128xf16>, memref<64x64xf16, 3>
+
+  %alloc = memref.alloc() : memref<64x64xf16, #gpu_lds_addrspace>
+  %mem = memref.alloc() : memref<64x128xf16>
+  %subview = memref.subview %mem[32, 64][32, 64][1, 1] : memref<64x128xf16> to memref<32x64xf16, strided<[128, 1], offset: 4160>>
+  %c0 = arith.constant 0 : index
+  amdgpu.gather_to_lds %subview[%offset_i, %offset_j], %alloc[%c0, %c0]
+    : vector<8xf16>, memref<32x64xf16, strided<[128, 1], offset: 4160>>, memref<64x64xf16, #gpu_lds_addrspace>
+  func.return
+}

>From 71fe3aa49154184123546c40c72d695680be7133 Mon Sep 17 00:00:00 2001
From: Alan Li <alan.li at me.com>
Date: Mon, 21 Jul 2025 14:21:05 -0400
Subject: [PATCH 2/7] Update
 mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp

Co-authored-by: Copilot <175728472+Copilot at users.noreply.github.com>
---
 mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp b/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp
index a962f7a2526b2..7b81800f07ab2 100644
--- a/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp
+++ b/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp
@@ -43,7 +43,7 @@ struct FoldSubviewIntoGatherToLDSOp : public OpRewritePattern<GatherToLDSOp> {
     auto subviewOp = dyn_cast<memref::SubViewOp>(op.getSrc().getDefiningOp());
     if (!subviewOp)
       return rewriter.notifyMatchFailure(
-          loc, "GatherToLDSOp can only be folded if the source is a SubviewOp");
+          loc, "GatherToLDSOp folding is currently supported only when the source is a SubviewOp. This is one specific pattern, and other scenarios may be added in the future.");
 
     SmallVector<Value> sourceIndices;
     mlir::affine::resolveIndicesIntoOpWithOffsetsAndStrides(

>From bd4ade5466db59f84e88dc62773c38a40bb05c77 Mon Sep 17 00:00:00 2001
From: Alan Li <alan.li at me.com>
Date: Mon, 21 Jul 2025 14:21:15 -0400
Subject: [PATCH 3/7] Update
 mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td

Co-authored-by: Copilot <175728472+Copilot at users.noreply.github.com>
---
 mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td
index 7529511b0ea76..fad939ced9877 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td
@@ -74,8 +74,8 @@ def AmdgpuMaskedloadToLoadPass : Pass<"amdgpu-maskedload-to-load"> {
 def AmdgpuFoldSubviewOpsPass : Pass<"amdgpu-fold-subview-ops"> {
   let summary = "Fold subview operations into their parent operations";
   let description = [{
-    This pass identifies `memref.subview` source of `GatherToLDSOp` and
-    attempts to fold the source op, potentially simplifying the overall
+    This pass identifies `memref.subview` sources of `GatherToLDSOp` and
+    attempts to fold the source ops, potentially simplifying the overall
     operation and improving performance.
   }];
   let dependentDialects = [

>From 9552f4ed9b2857c79fedb2faab32cdaddd8dfda1 Mon Sep 17 00:00:00 2001
From: Alan Li <me at alanli.org>
Date: Mon, 21 Jul 2025 14:49:21 -0400
Subject: [PATCH 4/7] linting

---
 mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp b/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp
index 7b81800f07ab2..adbdf4b856bd5 100644
--- a/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp
+++ b/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp
@@ -43,7 +43,9 @@ struct FoldSubviewIntoGatherToLDSOp : public OpRewritePattern<GatherToLDSOp> {
     auto subviewOp = dyn_cast<memref::SubViewOp>(op.getSrc().getDefiningOp());
     if (!subviewOp)
       return rewriter.notifyMatchFailure(
-          loc, "GatherToLDSOp folding is currently supported only when the source is a SubviewOp. This is one specific pattern, and other scenarios may be added in the future.");
+          loc, "GatherToLDSOp folding is currently supported only when the "
+               "source is a SubviewOp. This is one specific pattern, and other "
+               "scenarios may be added in the future.");
 
     SmallVector<Value> sourceIndices;
     mlir::affine::resolveIndicesIntoOpWithOffsetsAndStrides(

>From 06e283175dd68291a41ea5bcb88b0788444fa096 Mon Sep 17 00:00:00 2001
From: Alan Li <me at alanli.org>
Date: Mon, 21 Jul 2025 19:40:58 +0000
Subject: [PATCH 5/7] Move to FoldMemRefAliasOps

---
 .../mlir/Dialect/AMDGPU/Transforms/Passes.h   |  6 +-
 .../Dialect/AMDGPU/Transforms/CMakeLists.txt  |  3 +-
 .../AMDGPU/Transforms/FoldSubviewOps.cpp      | 67 -------------------
 .../MemRef/Transforms/FoldMemRefAliasOps.cpp  | 31 ++++++++-
 4 files changed, 31 insertions(+), 76 deletions(-)
 delete mode 100644 mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp

diff --git a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h
index a61903609aaff..cc2f543e79f69 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h
@@ -22,9 +22,8 @@ class ConversionTarget;
 namespace amdgpu {
 
 #define GEN_PASS_DECL_AMDGPUEMULATEATOMICSPASS
-#define GEN_PASS_DECL_AMDGPUFOLDSUBVIEWOPSPASS
-#define GEN_PASS_DECL_AMDGPUMASKEDLOADTOLOADPASS
 #define GEN_PASS_DECL_AMDGPURESOLVESTRIDEDMETADATAPASS
+#define GEN_PASS_DECL_AMDGPUMASKEDLOADTOLOADPASS
 #define GEN_PASS_REGISTRATION
 #include "mlir/Dialect/AMDGPU/Transforms/Passes.h.inc"
 
@@ -39,9 +38,6 @@ void populateAmdgpuResolveStridedMetadataPatterns(RewritePatternSet &patterns,
 void populateAmdgpuMaskedloadToLoadPatterns(RewritePatternSet &patterns,
                                             PatternBenefit benefit = 1);
 
-void populateAmdgpuFoldSubviewOpsPatterns(RewritePatternSet &patterns,
-                                          PatternBenefit benefit = 1);
-
 } // namespace amdgpu
 } // namespace mlir
 
diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt b/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt
index 20621ec0d55a4..17bbe54ea6c0c 100644
--- a/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt
@@ -1,8 +1,7 @@
 add_mlir_dialect_library(MLIRAMDGPUTransforms
   EmulateAtomics.cpp
-  FoldSubviewOps.cpp
-  MaskedloadToLoad.cpp
   ResolveStridedMetadata.cpp
+  MaskedloadToLoad.cpp
 
   ADDITIONAL_HEADER_DIRS
   {$MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/AMDGPU/Transforms
diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp b/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp
deleted file mode 100644
index adbdf4b856bd5..0000000000000
--- a/mlir/lib/Dialect/AMDGPU/Transforms/FoldSubviewOps.cpp
+++ /dev/null
@@ -1,67 +0,0 @@
-//===- FoldSubviewOps.cpp - AMDGPU fold subview ops ---------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "mlir/Dialect/AMDGPU/Transforms/Passes.h"
-
-#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
-#include "mlir/Dialect/Affine/ViewLikeInterfaceUtils.h"
-#include "mlir/Dialect/MemRef/IR/MemRef.h"
-#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
-
-namespace mlir::amdgpu {
-#define GEN_PASS_DEF_AMDGPUFOLDSUBVIEWOPSPASS
-#include "mlir/Dialect/AMDGPU/Transforms/Passes.h.inc"
-} // namespace mlir::amdgpu
-
-using namespace mlir;
-using namespace mlir::amdgpu;
-
-namespace {
-struct AmdgpuFoldSubviewOpsPass
-    : public amdgpu::impl::AmdgpuFoldSubviewOpsPassBase<
-          AmdgpuFoldSubviewOpsPass> {
-  void runOnOperation() override {
-    RewritePatternSet patterns(&getContext());
-    populateAmdgpuFoldSubviewOpsPatterns(patterns);
-    if (failed(applyPatternsGreedily(getOperation(), std::move(patterns))))
-      signalPassFailure();
-  }
-};
-
-struct FoldSubviewIntoGatherToLDSOp : public OpRewritePattern<GatherToLDSOp> {
-  using OpRewritePattern<GatherToLDSOp>::OpRewritePattern;
-  LogicalResult matchAndRewrite(GatherToLDSOp op,
-                                PatternRewriter &rewriter) const override {
-    Location loc = op.getLoc();
-
-    // Check if the source is a subview operation:
-    auto subviewOp = dyn_cast<memref::SubViewOp>(op.getSrc().getDefiningOp());
-    if (!subviewOp)
-      return rewriter.notifyMatchFailure(
-          loc, "GatherToLDSOp folding is currently supported only when the "
-               "source is a SubviewOp. This is one specific pattern, and other "
-               "scenarios may be added in the future.");
-
-    SmallVector<Value> sourceIndices;
-    mlir::affine::resolveIndicesIntoOpWithOffsetsAndStrides(
-        rewriter, loc, subviewOp.getMixedOffsets(), subviewOp.getMixedStrides(),
-        subviewOp.getDroppedDims(), op.getSrcIndices(), sourceIndices);
-
-    rewriter.replaceOpWithNewOp<GatherToLDSOp>(
-        op, subviewOp.getSource(), sourceIndices, op.getDst(),
-        op.getDstIndices(), op.getTransferType());
-
-    return success();
-  }
-};
-} // namespace
-
-void mlir::amdgpu::populateAmdgpuFoldSubviewOpsPatterns(
-    RewritePatternSet &patterns, PatternBenefit benefit) {
-  patterns.add<FoldSubviewIntoGatherToLDSOp>(patterns.getContext(), benefit);
-}
diff --git a/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp b/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp
index 89be188af9129..d6b5e3ecfa72b 100644
--- a/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp
+++ b/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp
@@ -20,6 +20,7 @@
 #include "mlir/Dialect/MemRef/Transforms/Transforms.h"
 #include "mlir/Dialect/MemRef/Utils/MemRefUtils.h"
 #include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
+#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "mlir/IR/AffineMap.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
@@ -732,6 +733,32 @@ LogicalResult NVGPUAsyncCopyOpSubViewOpFolder::matchAndRewrite(
   return success();
 }
 
+struct FoldSubviewIntoGatherToLDSOp
+    : public OpRewritePattern<amdgpu::GatherToLDSOp> {
+  using OpRewritePattern<amdgpu::GatherToLDSOp>::OpRewritePattern;
+  LogicalResult
+  matchAndRewrite(amdgpu::GatherToLDSOp op, PatternRewriter &rewriter) const override {
+    Location loc = op.getLoc();
+
+    // Check if the source is a subview operation:
+    auto subviewOp = op.getSrc().getDefiningOp<memref::SubViewOp>();
+    if (!subviewOp)
+      return rewriter.notifyMatchFailure(
+          loc, "GatherToLDSOp can only be folded if the source is a SubviewOp");
+
+    SmallVector<Value> sourceIndices;
+     mlir::affine::resolveIndicesIntoOpWithOffsetsAndStrides(
+         rewriter, loc, subviewOp.getMixedOffsets(), subviewOp.getMixedStrides(),
+         subviewOp.getDroppedDims(), op.getSrcIndices(), sourceIndices);
+
+     rewriter.replaceOpWithNewOp<admgpu::GatherToLDSOp>(
+         op, subviewOp.getSource(), sourceIndices, op.getDst(), op.getDstIndices(),
+         op.getTransferType());
+
+     return success();
+  }
+};
+
 void memref::populateFoldMemRefAliasOpPatterns(RewritePatternSet &patterns) {
   patterns.add<LoadOpOfSubViewOpFolder<affine::AffineLoadOp>,
                LoadOpOfSubViewOpFolder<memref::LoadOp>,
@@ -762,8 +789,8 @@ void memref::populateFoldMemRefAliasOpPatterns(RewritePatternSet &patterns) {
                StoreOpOfCollapseShapeOpFolder<memref::StoreOp>,
                StoreOpOfCollapseShapeOpFolder<vector::StoreOp>,
                StoreOpOfCollapseShapeOpFolder<vector::MaskedStoreOp>,
-               SubViewOfSubViewFolder, NVGPUAsyncCopyOpSubViewOpFolder>(
-      patterns.getContext());
+               SubViewOfSubViewFolder, NVGPUAsyncCopyOpSubViewOpFolder,
+               FoldSubviewIntoGatherToLDSOp>(patterns.getContext());
 }
 
 //===----------------------------------------------------------------------===//

>From 97ec2bf6d8e956e92dfb54cbf50e9d0c292d6af2 Mon Sep 17 00:00:00 2001
From: Alan Li <me at alanli.org>
Date: Mon, 21 Jul 2025 19:59:35 +0000
Subject: [PATCH 6/7] updating tests

---
 mlir/test/Dialect/AMDGPU/amdgpu-fold-subviews.mlir | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mlir/test/Dialect/AMDGPU/amdgpu-fold-subviews.mlir b/mlir/test/Dialect/AMDGPU/amdgpu-fold-subviews.mlir
index d582991c3622f..a0f02a9bc9340 100644
--- a/mlir/test/Dialect/AMDGPU/amdgpu-fold-subviews.mlir
+++ b/mlir/test/Dialect/AMDGPU/amdgpu-fold-subviews.mlir
@@ -8,7 +8,7 @@ func.func @test_memref(%offset_i: index, %offset_j: index) {
   // CHECK: %[[C0:.*]] = arith.constant 0 : index
   // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, 3>
   // CHECK: %[[MEM:.*]] = memref.alloc() : memref<64x128xf16>
-  // CHECK:  %[[MEM]][%arg0, %arg1], %[[LOCAL]][%[[C0]], %[[C0]]]
+  // CHECK:  amdgpu.gather_to_lds %[[MEM]][%arg0, %arg1], %[[LOCAL]][%[[C0]], %[[C0]]]
   // CHECK-SAME: vector<8xf16>, memref<64x128xf16>, memref<64x64xf16, 3>
 
   %alloc = memref.alloc() : memref<64x64xf16, #gpu_lds_addrspace>
@@ -37,7 +37,7 @@ func.func @subview_folding_offset(%offset_i: index, %offset_j: index) {
   // CHECK: %[[IDX0:.*]] = affine.apply #[[MAP]]()[%[[ARG0]]]
   // CHECK: %[[IDX1:.*]] = affine.apply #[[MAP1]]()[%[[ARG1]]]
 
-  // CHECK:  %[[MEM]][%[[IDX0]], %[[IDX1]]], %[[LOCAL]][%[[C0]], %[[C0]]]
+  // CHECK:  amdgpu.gather_to_lds %[[MEM]][%[[IDX0]], %[[IDX1]]], %[[LOCAL]][%[[C0]], %[[C0]]]
   // CHECK-SAME: vector<8xf16>, memref<64x128xf16>, memref<64x64xf16, 3>
 
   %alloc = memref.alloc() : memref<64x64xf16, #gpu_lds_addrspace>

>From addc07a168463b82d5b33269e69faf1d806a3eb9 Mon Sep 17 00:00:00 2001
From: Alan Li <me at alanli.org>
Date: Tue, 22 Jul 2025 10:06:18 -0400
Subject: [PATCH 7/7] Still ... merge it with FoldMemRefAliasOps pass.

---
 .../mlir/Dialect/AMDGPU/Transforms/Passes.td  | 12 --------
 .../MemRef/Transforms/FoldMemRefAliasOps.cpp  | 28 +++++++++----------
 .../Dialect/AMDGPU/amdgpu-fold-subviews.mlir  |  2 +-
 3 files changed, 15 insertions(+), 27 deletions(-)

diff --git a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td
index fad939ced9877..8d0e6829ab0cc 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td
@@ -70,16 +70,4 @@ def AmdgpuMaskedloadToLoadPass : Pass<"amdgpu-maskedload-to-load"> {
     "memref::MemRefDialect"
   ];
 }
-
-def AmdgpuFoldSubviewOpsPass : Pass<"amdgpu-fold-subview-ops"> {
-  let summary = "Fold subview operations into their parent operations";
-  let description = [{
-    This pass identifies `memref.subview` sources of `GatherToLDSOp` and
-    attempts to fold the source ops, potentially simplifying the overall
-    operation and improving performance.
-  }];
-  let dependentDialects = [
-    "memref::MemRefDialect"
-  ];
-}
 #endif // MLIR_DIALECT_AMDGPU_TRANSFORMS_PASSES_TD_
diff --git a/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp b/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp
index d6b5e3ecfa72b..d3b2862e660bd 100644
--- a/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp
+++ b/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp
@@ -11,6 +11,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Affine/ViewLikeInterfaceUtils.h"
 #include "mlir/Dialect/Arith/Utils/Utils.h"
@@ -20,7 +21,6 @@
 #include "mlir/Dialect/MemRef/Transforms/Transforms.h"
 #include "mlir/Dialect/MemRef/Utils/MemRefUtils.h"
 #include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
-#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "mlir/IR/AffineMap.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
@@ -733,11 +733,11 @@ LogicalResult NVGPUAsyncCopyOpSubViewOpFolder::matchAndRewrite(
   return success();
 }
 
-struct FoldSubviewIntoGatherToLDSOp
-    : public OpRewritePattern<amdgpu::GatherToLDSOp> {
-  using OpRewritePattern<amdgpu::GatherToLDSOp>::OpRewritePattern;
-  LogicalResult
-  matchAndRewrite(amdgpu::GatherToLDSOp op, PatternRewriter &rewriter) const override {
+struct FoldSubviewIntoAMDGPUGatherToLDSOp final
+    : OpRewritePattern<amdgpu::GatherToLDSOp> {
+  using OpRewritePattern::OpRewritePattern;
+  LogicalResult matchAndRewrite(amdgpu::GatherToLDSOp op,
+                                PatternRewriter &rewriter) const override {
     Location loc = op.getLoc();
 
     // Check if the source is a subview operation:
@@ -747,15 +747,15 @@ struct FoldSubviewIntoGatherToLDSOp
           loc, "GatherToLDSOp can only be folded if the source is a SubviewOp");
 
     SmallVector<Value> sourceIndices;
-     mlir::affine::resolveIndicesIntoOpWithOffsetsAndStrides(
-         rewriter, loc, subviewOp.getMixedOffsets(), subviewOp.getMixedStrides(),
-         subviewOp.getDroppedDims(), op.getSrcIndices(), sourceIndices);
+    mlir::affine::resolveIndicesIntoOpWithOffsetsAndStrides(
+        rewriter, loc, subviewOp.getMixedOffsets(), subviewOp.getMixedStrides(),
+        subviewOp.getDroppedDims(), op.getSrcIndices(), sourceIndices);
 
-     rewriter.replaceOpWithNewOp<admgpu::GatherToLDSOp>(
-         op, subviewOp.getSource(), sourceIndices, op.getDst(), op.getDstIndices(),
-         op.getTransferType());
+    rewriter.replaceOpWithNewOp<amdgpu::GatherToLDSOp>(
+        op, subviewOp.getSource(), sourceIndices, op.getDst(),
+        op.getDstIndices(), op.getTransferType());
 
-     return success();
+    return success();
   }
 };
 
@@ -790,7 +790,7 @@ void memref::populateFoldMemRefAliasOpPatterns(RewritePatternSet &patterns) {
                StoreOpOfCollapseShapeOpFolder<vector::StoreOp>,
                StoreOpOfCollapseShapeOpFolder<vector::MaskedStoreOp>,
                SubViewOfSubViewFolder, NVGPUAsyncCopyOpSubViewOpFolder,
-               FoldSubviewIntoGatherToLDSOp>(patterns.getContext());
+               FoldSubviewIntoAMDGPUGatherToLDSOp>(patterns.getContext());
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/AMDGPU/amdgpu-fold-subviews.mlir b/mlir/test/Dialect/AMDGPU/amdgpu-fold-subviews.mlir
index a0f02a9bc9340..f1953ddbd7a78 100644
--- a/mlir/test/Dialect/AMDGPU/amdgpu-fold-subviews.mlir
+++ b/mlir/test/Dialect/AMDGPU/amdgpu-fold-subviews.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt -amdgpu-fold-subview-ops -split-input-file %s | FileCheck %s
+// RUN: mlir-opt --fold-memref-alias-ops --split-input-file %s | FileCheck %s
 
 #gpu_lds_addrspace = 3