[Mlir-commits] [mlir] [mlir][memref][NVGPU] Move NVGPU ops to IndexedAccessOpInterface (PR #190430)

Fri Apr 3 16:31:35 PDT 2026

https://github.com/krzysz00 created https://github.com/llvm/llvm-project/pull/190430

This removes the need for the memref dialect to know about nvgpu operations (though we still haven't converted
ExtractAddressComputations to t new interface, so we can't remove the dependency just yet).

ldmatrix is defined to access a 1-D region of memory in order to enable folding in arbitrary expand_ and collpapse_shapes, as its underlying lowering is jut a scalar getStridedElementPtr()

>From 5fb8381b61d4a6d40d93139e8c918dd4cc874d67 Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
Date: Fri, 3 Apr 2026 23:20:49 +0000
Subject: [PATCH] [mlir][memref][NVGPU] Move NVGPU ops to
 IndexedAccessOpInterface

This removes the need for the memref dialect to know about nvgpu
operations (though we still haven't converted
ExtractAddressComputations to t new interface, so we can't remove the
dependency just yet).

ldmatrix is defined to access a 1-D region of memory in order to
enable folding in arbitrary expand_ and collpapse_shapes, as its
underlying lowering is jut a scalar getStridedElementPtr()
---
 .../include/mlir/Dialect/NVGPU/IR/NVGPUOps.td |  2 +
 .../Transforms/MemoryAccessOpInterfacesImpl.h | 21 +++++
 .../MemRef/Transforms/FoldMemRefAliasOps.cpp  | 76 +--------------
 mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp    |  4 +
 .../Dialect/NVGPU/Transforms/CMakeLists.txt   |  1 +
 .../MemoryAccessOpInterfacesImpl.cpp          | 93 +++++++++++++++++++
 mlir/lib/RegisterAllDialects.cpp              |  2 +
 .../Dialect/MemRef/fold-memref-alias-ops.mlir | 80 ----------------
 .../Dialect/NVGPU/fold-memref-alias-ops.mlir  | 93 +++++++++++++++++++
 9 files changed, 217 insertions(+), 155 deletions(-)
 create mode 100644 mlir/include/mlir/Dialect/NVGPU/Transforms/MemoryAccessOpInterfacesImpl.h
 create mode 100644 mlir/lib/Dialect/NVGPU/Transforms/MemoryAccessOpInterfacesImpl.cpp
 create mode 100644 mlir/test/Dialect/NVGPU/fold-memref-alias-ops.mlir

diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td
index 73d86283a5940..4c11725405ea5 100644
--- a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td
+++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td
@@ -30,6 +30,7 @@ include "mlir/Dialect/NVGPU/IR/NVGPUTypes.td"
 class NVGPU_Op<string mnemonic, list<Trait> traits = []> :
   Op<NVGPU_Dialect, mnemonic, traits> {}
 
+// Promises IndexedAccessOpInterface.
 def NVGPU_LdMatrixOp : NVGPU_Op<"ldmatrix", [
                                 MemoryEffects<[MemRead]>,
                                 PredOpTrait<"srcMemref and res have same element type",
@@ -183,6 +184,7 @@ def NVGPU_MmaSparseSyncOp : NVGPU_MmaSyncOp<"mma.sp.sync"> {
   let extraClassDeclaration = extraBaseClassDeclaration;
 }
 
+// Promises IndexedMemCopyOpInterface.
 def NVGPU_DeviceAsyncCopyOp : NVGPU_Op<"device_async_copy", [
                                        AttrSizedOperandSegments]> {
   let summary = "device-side asynchronous copy";
diff --git a/mlir/include/mlir/Dialect/NVGPU/Transforms/MemoryAccessOpInterfacesImpl.h b/mlir/include/mlir/Dialect/NVGPU/Transforms/MemoryAccessOpInterfacesImpl.h
new file mode 100644
index 0000000000000..50d2223912a27
--- /dev/null
+++ b/mlir/include/mlir/Dialect/NVGPU/Transforms/MemoryAccessOpInterfacesImpl.h
@@ -0,0 +1,21 @@
+//===- MemoryAccessOpInterfacesImpl.h -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_NVGPU_TRANSFORMS_MEMORYACCESSOPINTERFACESIMPL_H
+#define MLIR_DIALECT_NVGPU_TRANSFORMS_MEMORYACCESSOPINTERFACESIMPL_H
+
+namespace mlir {
+
+class DialectRegistry;
+
+namespace nvgpu {
+void registerMemoryAccessOpInterfacesExternalModels(DialectRegistry &registry);
+} // namespace nvgpu
+} // namespace mlir
+
+#endif // MLIR_DIALECT_NVGPU_TRANSFORMS_MEMORYACCESSOPINTERFACESIMPL_H
diff --git a/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp b/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp
index 6f2752932422a..d42d4f4599e18 100644
--- a/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp
+++ b/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp
@@ -19,7 +19,6 @@
 #include "mlir/Dialect/MemRef/Transforms/Passes.h"
 #include "mlir/Dialect/MemRef/Transforms/Transforms.h"
 #include "mlir/Dialect/MemRef/Utils/MemRefUtils.h"
-#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "mlir/IR/AffineExpr.h"
 #include "mlir/IR/AffineMap.h"
@@ -76,10 +75,6 @@ static Value getMemRefOperand(vector::TransferReadOp op) {
   return op.getBase();
 }
 
-static Value getMemRefOperand(nvgpu::LdMatrixOp op) {
-  return op.getSrcMemref();
-}
-
 static Value getMemRefOperand(vector::LoadOp op) { return op.getBase(); }
 
 static Value getMemRefOperand(vector::StoreOp op) { return op.getBase(); }
@@ -208,17 +203,6 @@ class SubViewOfSubViewFolder : public OpRewritePattern<memref::SubViewOp> {
   }
 };
 
-/// Folds nvgpu.device_async_copy subviews into the copy itself. This pattern
-/// is folds subview on src and dst memref of the copy.
-class NVGPUAsyncCopyOpSubViewOpFolder final
-    : public OpRewritePattern<nvgpu::DeviceAsyncCopyOp> {
-public:
-  using OpRewritePattern<nvgpu::DeviceAsyncCopyOp>::OpRewritePattern;
-
-  LogicalResult matchAndRewrite(nvgpu::DeviceAsyncCopyOp copyOp,
-                                PatternRewriter &rewriter) const override;
-};
-
 /// Merges subview operations with load/store like operations unless such a
 /// merger would cause the strides between dimensions accessed by that operaton
 /// to change.
@@ -377,11 +361,6 @@ LogicalResult LoadOpOfSubViewOpFolder<OpTy>::matchAndRewrite(
             op, op.getType(), subViewOp.getSource(), sourceIndices,
             op.getLeadDimension(), op.getTransposeAttr());
       })
-      .Case([&](nvgpu::LdMatrixOp op) {
-        rewriter.replaceOpWithNewOp<nvgpu::LdMatrixOp>(
-            op, op.getType(), subViewOp.getSource(), sourceIndices,
-            op.getTranspose(), op.getNumTiles());
-      })
       .DefaultUnreachable("unexpected operation");
   return success();
 }
@@ -822,57 +801,6 @@ LogicalResult IndexedMemCopyOpOfCollapseShapeOpFolder::matchAndRewrite(
   return success();
 }
 
-LogicalResult NVGPUAsyncCopyOpSubViewOpFolder::matchAndRewrite(
-    nvgpu::DeviceAsyncCopyOp copyOp, PatternRewriter &rewriter) const {
-
-  LLVM_DEBUG(DBGS() << "copyOp       : " << copyOp << "\n");
-
-  auto srcSubViewOp =
-      copyOp.getSrc().template getDefiningOp<memref::SubViewOp>();
-  auto dstSubViewOp =
-      copyOp.getDst().template getDefiningOp<memref::SubViewOp>();
-
-  if (!(srcSubViewOp || dstSubViewOp))
-    return rewriter.notifyMatchFailure(copyOp, "does not use subview ops for "
-                                               "source or destination");
-
-  // If the source is a subview, we need to resolve the indices.
-  SmallVector<Value> foldedSrcIndices(copyOp.getSrcIndices().begin(),
-                                      copyOp.getSrcIndices().end());
-
-  if (srcSubViewOp) {
-    LLVM_DEBUG(DBGS() << "srcSubViewOp : " << srcSubViewOp << "\n");
-    affine::resolveIndicesIntoOpWithOffsetsAndStrides(
-        rewriter, copyOp.getLoc(), srcSubViewOp.getMixedOffsets(),
-        srcSubViewOp.getMixedStrides(), srcSubViewOp.getDroppedDims(),
-        copyOp.getSrcIndices(), foldedSrcIndices);
-  }
-
-  // If the destination is a subview, we need to resolve the indices.
-  SmallVector<Value> foldedDstIndices(copyOp.getDstIndices().begin(),
-                                      copyOp.getDstIndices().end());
-
-  if (dstSubViewOp) {
-    LLVM_DEBUG(DBGS() << "dstSubViewOp : " << dstSubViewOp << "\n");
-    affine::resolveIndicesIntoOpWithOffsetsAndStrides(
-        rewriter, copyOp.getLoc(), dstSubViewOp.getMixedOffsets(),
-        dstSubViewOp.getMixedStrides(), dstSubViewOp.getDroppedDims(),
-        copyOp.getDstIndices(), foldedDstIndices);
-  }
-
-  // Replace the copy op with a new copy op that uses the source and destination
-  // of the subview.
-  rewriter.replaceOpWithNewOp<nvgpu::DeviceAsyncCopyOp>(
-      copyOp, nvgpu::DeviceAsyncTokenType::get(copyOp.getContext()),
-      (dstSubViewOp ? dstSubViewOp.getSource() : copyOp.getDst()),
-      foldedDstIndices,
-      (srcSubViewOp ? srcSubViewOp.getSource() : copyOp.getSrc()),
-      foldedSrcIndices, copyOp.getDstElements(), copyOp.getSrcElements(),
-      copyOp.getBypassL1Attr());
-
-  return success();
-}
-
 void memref::populateFoldMemRefAliasOpPatterns(RewritePatternSet &patterns) {
   patterns.add<
       // Interface-based patterns to which we will be migrating.
@@ -881,7 +809,6 @@ void memref::populateFoldMemRefAliasOpPatterns(RewritePatternSet &patterns) {
       IndexedMemCopyOpOfExpandShapeOpFolder,
       IndexedMemCopyOpOfCollapseShapeOpFolder,
       // The old way of doing things. Don't add more of these.
-      LoadOpOfSubViewOpFolder<nvgpu::LdMatrixOp>,
       LoadOpOfSubViewOpFolder<vector::LoadOp>,
       LoadOpOfSubViewOpFolder<vector::MaskedLoadOp>,
       LoadOpOfSubViewOpFolder<vector::TransferReadOp>,
@@ -899,8 +826,7 @@ void memref::populateFoldMemRefAliasOpPatterns(RewritePatternSet &patterns) {
       LoadOpOfCollapseShapeOpFolder<vector::MaskedLoadOp>,
       StoreOpOfCollapseShapeOpFolder<vector::StoreOp>,
       StoreOpOfCollapseShapeOpFolder<vector::MaskedStoreOp>,
-      SubViewOfSubViewFolder, NVGPUAsyncCopyOpSubViewOpFolder>(
-      patterns.getContext());
+      SubViewOfSubViewFolder>(patterns.getContext());
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp b/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
index 237aab4d7f309..b60658c7e3041 100644
--- a/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
+++ b/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
@@ -12,6 +12,7 @@
 
 #include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/MemRef/IR/MemoryAccessOpInterfaces.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinAttributes.h"
 #include "mlir/IR/BuiltinTypes.h"
@@ -40,6 +41,9 @@ void NVGPUDialect::initialize() {
 #define GET_OP_LIST
 #include "mlir/Dialect/NVGPU/IR/NVGPUOps.cpp.inc"
       >();
+  declarePromisedInterfaces<memref::IndexedAccessOpInterface, LdMatrixOp>();
+  declarePromisedInterfaces<memref::IndexedMemCopyOpInterface,
+                            DeviceAsyncCopyOp>();
 }
 
 bool NVGPUDialect::isSharedMemoryAddressSpace(Attribute memorySpace) {
diff --git a/mlir/lib/Dialect/NVGPU/Transforms/CMakeLists.txt b/mlir/lib/Dialect/NVGPU/Transforms/CMakeLists.txt
index 3f967d2b189be..8852ed7fb30a8 100644
--- a/mlir/lib/Dialect/NVGPU/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/NVGPU/Transforms/CMakeLists.txt
@@ -1,5 +1,6 @@
 add_mlir_dialect_library(MLIRNVGPUTransforms
   CreateAsyncGroups.cpp
+  MemoryAccessOpInterfacesImpl.cpp
   OptimizeSharedMemory.cpp
   MmaSyncTF32Transform.cpp
   Utils.cpp
diff --git a/mlir/lib/Dialect/NVGPU/Transforms/MemoryAccessOpInterfacesImpl.cpp b/mlir/lib/Dialect/NVGPU/Transforms/MemoryAccessOpInterfacesImpl.cpp
new file mode 100644
index 0000000000000..92a4834c481c0
--- /dev/null
+++ b/mlir/lib/Dialect/NVGPU/Transforms/MemoryAccessOpInterfacesImpl.cpp
@@ -0,0 +1,93 @@
+//===- MemoryAccessOpInterfacesImpl.cpp -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/NVGPU/Transforms/MemoryAccessOpInterfacesImpl.h"
+
+#include "mlir/Dialect/MemRef/IR/MemoryAccessOpInterfaces.h"
+#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
+#include "mlir/IR/Dialect.h"
+#include "mlir/IR/Operation.h"
+#include "mlir/IR/PatternMatch.h"
+
+using namespace mlir;
+using namespace mlir::memref;
+using namespace mlir::nvgpu;
+
+namespace {
+struct LdMatrixOpInterface final
+    : IndexedAccessOpInterface::ExternalModel<LdMatrixOpInterface, LdMatrixOp> {
+  TypedValue<MemRefType> getAccessedMemref(Operation *op) const {
+    return cast<LdMatrixOp>(op).getSrcMemref();
+  }
+
+  Operation::operand_range getIndices(Operation *op) const {
+    return cast<LdMatrixOp>(op).getIndices();
+  }
+
+  SmallVector<int64_t> getAccessedShape(Operation *op) const {
+    VectorType vecTy = cast<LdMatrixOp>(op).getRes().getType();
+    // The 2-D nature of the result is an artifact of this operation returning
+    // a struct of vectors and doesn't reflect any strides that need to be
+    // preserved.
+    return SmallVector<int64_t>{vecTy.getNumElements()};
+  }
+
+  std::optional<SmallVector<Value>>
+  updateMemrefAndIndices(Operation *op, RewriterBase &rewriter, Value newMemref,
+                         ValueRange newIndices) const {
+    auto ldMatrixOp = cast<LdMatrixOp>(op);
+    rewriter.modifyOpInPlace(ldMatrixOp, [&]() {
+      ldMatrixOp.getSrcMemrefMutable().assign(newMemref);
+      ldMatrixOp.getIndicesMutable().assign(newIndices);
+    });
+    return std::nullopt;
+  }
+
+  bool hasInboundsIndices(Operation *) const { return true; }
+};
+
+struct DeviceAsyncCopyOpInterface final
+    : IndexedMemCopyOpInterface::ExternalModel<DeviceAsyncCopyOpInterface,
+                                               DeviceAsyncCopyOp> {
+  TypedValue<MemRefType> getSrc(Operation *op) const {
+    return cast<DeviceAsyncCopyOp>(op).getSrc();
+  }
+
+  Operation::operand_range getSrcIndices(Operation *op) const {
+    return cast<DeviceAsyncCopyOp>(op).getSrcIndices();
+  }
+
+  TypedValue<MemRefType> getDst(Operation *op) const {
+    return cast<DeviceAsyncCopyOp>(op).getDst();
+  }
+
+  Operation::operand_range getDstIndices(Operation *op) const {
+    return cast<DeviceAsyncCopyOp>(op).getDstIndices();
+  }
+
+  void setMemrefsAndIndices(Operation *op, RewriterBase &rewriter, Value newSrc,
+                            ValueRange newSrcIndices, Value newDst,
+                            ValueRange newDstIndices) const {
+    auto copyOp = cast<DeviceAsyncCopyOp>(op);
+    rewriter.modifyOpInPlace(copyOp, [&]() {
+      copyOp.getSrcMutable().assign(newSrc);
+      copyOp.getSrcIndicesMutable().assign(newSrcIndices);
+      copyOp.getDstMutable().assign(newDst);
+      copyOp.getDstIndicesMutable().assign(newDstIndices);
+    });
+  }
+};
+} // namespace
+
+void mlir::nvgpu::registerMemoryAccessOpInterfacesExternalModels(
+    DialectRegistry &registry) {
+  registry.addExtension(+[](MLIRContext *ctx, nvgpu::NVGPUDialect *dialect) {
+    LdMatrixOp::attachInterface<LdMatrixOpInterface>(*ctx);
+    DeviceAsyncCopyOp::attachInterface<DeviceAsyncCopyOpInterface>(*ctx);
+  });
+}
diff --git a/mlir/lib/RegisterAllDialects.cpp b/mlir/lib/RegisterAllDialects.cpp
index ea5698f39c0b0..bf994500a19f0 100644
--- a/mlir/lib/RegisterAllDialects.cpp
+++ b/mlir/lib/RegisterAllDialects.cpp
@@ -59,6 +59,7 @@
 #include "mlir/Dialect/MemRef/Transforms/BufferViewFlowOpInterfaceImpl.h"
 #include "mlir/Dialect/MemRef/Transforms/RuntimeOpVerification.h"
 #include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
+#include "mlir/Dialect/NVGPU/Transforms/MemoryAccessOpInterfacesImpl.h"
 #include "mlir/Dialect/OpenACC/OpenACC.h"
 #include "mlir/Dialect/OpenMP/OpenMPDialect.h"
 #include "mlir/Dialect/PDL/IR/PDL.h"
@@ -179,6 +180,7 @@ void mlir::registerAllDialects(DialectRegistry &registry) {
   memref::registerValueBoundsOpInterfaceExternalModels(registry);
   memref::registerMemorySlotExternalModels(registry);
   ml_program::registerBufferizableOpInterfaceExternalModels(registry);
+  nvgpu::registerMemoryAccessOpInterfacesExternalModels(registry);
   scf::registerBufferDeallocationOpInterfaceExternalModels(registry);
   scf::registerBufferizableOpInterfaceExternalModels(registry);
   scf::registerValueBoundsOpInterfaceExternalModels(registry);
diff --git a/mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir b/mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir
index 114ba86cda718..52c9ad590eda0 100644
--- a/mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir
+++ b/mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir
@@ -594,86 +594,6 @@ func.func @fold_gpu_subgroup_mma_load_matrix_2d(%arg0 : memref<128x128xf32>, %ar
 
 // -----
 
-
-func.func @fold_nvgpu_device_async_copy_zero_sub_idx(%gmem_memref_3d : memref<2x128x768xf16>, %idx_1 : index, %idx_2 : index, %idx_3 : index) {
-
-  %c0 = arith.constant 0 : index
-  %smem_memref_4d = memref.alloc() : memref<5x1x64x64xf16, #gpu.address_space<workgroup>>
-  %gmem_memref_subview_2d = memref.subview %gmem_memref_3d[%idx_1, %idx_2, %idx_3] [1, 1, 8] [1, 1, 1] : memref<2x128x768xf16> to memref<1x8xf16, strided<[98304, 1], offset: ?>>
-  %async_token = nvgpu.device_async_copy %gmem_memref_subview_2d[%c0, %c0], %smem_memref_4d[%c0, %c0, %c0, %c0], 8 {bypassL1} : memref<1x8xf16, strided<[98304, 1], offset: ?>> to memref<5x1x64x64xf16, #gpu.address_space<workgroup>>
-  return
-}
-
-// CHECK-LABEL: func.func @fold_nvgpu_device_async_copy_zero_sub_idx
-//  CHECK-SAME: (%[[GMEM_MEMREF_3d:.+]]: memref<2x128x768xf16>, %[[IDX_1:.+]]: index, %[[IDX_2:.+]]: index, %[[IDX_3:.+]]: index)
-//   CHECK-DAG: %[[c0:.+]] = arith.constant 0 : index
-//   CHECK-DAG: %[[SMEM_MEMREF_4d:.+]] = memref.alloc() : memref<5x1x64x64xf16, #gpu.address_space<workgroup>>
-//       CHECK: nvgpu.device_async_copy %[[GMEM_MEMREF_3d]][%[[IDX_1]], %[[IDX_2]], %[[IDX_3]]], %[[SMEM_MEMREF_4d]][%[[c0]], %[[c0]], %[[c0]], %[[c0]]], 8 {bypassL1} : memref<2x128x768xf16> to memref<5x1x64x64xf16, #gpu.address_space<workgroup>>
-
-// -----
-
-
-func.func @fold_src_nvgpu_device_async_copy(%gmem_memref_3d : memref<2x128x768xf16>, %src_idx_0 : index, %src_idx_1 : index, %src_idx_2 : index, %src_sub_idx_0 : index, %src_sub_idx_1 : index) {
-  %c0 = arith.constant 0 : index
-  %smem_memref_4d = memref.alloc() : memref<5x1x64x64xf16, #gpu.address_space<workgroup>>
-  %gmem_memref_subview_2d = memref.subview %gmem_memref_3d[%src_idx_0, %src_idx_1, %src_idx_2] [1, 1, 8] [1, 1, 1] : memref<2x128x768xf16> to memref<1x8xf16, strided<[98304, 1], offset: ?>>
-  %async_token = nvgpu.device_async_copy %gmem_memref_subview_2d[%src_sub_idx_0, %src_sub_idx_1], %smem_memref_4d[%c0, %c0, %c0, %c0], 8 {bypassL1} : memref<1x8xf16, strided<[98304, 1], offset: ?>> to memref<5x1x64x64xf16, #gpu.address_space<workgroup>>
-  return
-}
-
-//   CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (s0 + s1)>
-//       CHECK: func.func @fold_src_nvgpu_device_async_copy
-//  CHECK-SAME: (%[[GMEM_MEMREF_3d:.+]]: memref<2x128x768xf16>, %[[SRC_IDX_0:.+]]: index, %[[SRC_IDX_1:.+]]: index, %[[SRC_IDX_2:.+]]: index, %[[SRC_SUB_IDX_0:.+]]: index, %[[SRC_SUB_IDX_1:.+]]: index)
-//   CHECK-DAG: %[[c0:.+]] = arith.constant 0 : index
-//   CHECK-DAG: %[[RESOLVED_SRC_IDX_0:.+]] = affine.apply #[[MAP]]()[%[[SRC_IDX_0]], %[[SRC_SUB_IDX_0]]]
-//   CHECK-DAG: %[[RESOLVED_SRC_IDX_1:.+]] = affine.apply #[[MAP]]()[%[[SRC_IDX_2]], %[[SRC_SUB_IDX_1]]]
-//   CHECK-DAG: nvgpu.device_async_copy %[[GMEM_MEMREF_3d]][%[[RESOLVED_SRC_IDX_0]], %[[SRC_IDX_1]], %[[RESOLVED_SRC_IDX_1]]], %[[SMEM_MEMREF_4d]][%[[c0]], %[[c0]], %[[c0]], %[[c0]]], 8 {bypassL1} : memref<2x128x768xf16> to memref<5x1x64x64xf16, #gpu.address_space<workgroup>>
-
-// -----
-
-
-func.func @fold_src_fold_dest_nvgpu_device_async_copy(%gmem_memref_3d : memref<2x128x768xf16>, %src_idx_0 : index, %src_idx_1 : index, %src_idx_2 : index, %src_sub_idx_0 : index, %src_sub_idx_1 : index, %dest_idx_0 : index, %dest_idx_1 : index, %dest_idx_2 : index, %dest_idx_3 : index, %dest_sub_idx_0 : index, %dest_sub_idx_1 : index) {
-  %c0 = arith.constant 0 : index
-  %smem_memref_4d = memref.alloc() : memref<5x1x64x64xf16, #gpu.address_space<workgroup>>
-  %gmem_memref_subview_2d = memref.subview %gmem_memref_3d[%src_idx_0, %src_idx_1, %src_idx_2] [1, 1, 8] [1, 1, 1] : memref<2x128x768xf16> to memref<1x8xf16, strided<[98304, 1], offset: ?>>
-  %smem_memref_2d = memref.subview %smem_memref_4d[%dest_idx_0, %dest_idx_1, %dest_idx_2, %dest_idx_3] [1, 1, 1, 8] [1, 1, 1, 1] : memref<5x1x64x64xf16, #gpu.address_space<workgroup>> to memref<1x8xf16, strided<[4096, 1], offset: ?>, #gpu.address_space<workgroup>>
-  %async_token = nvgpu.device_async_copy %gmem_memref_subview_2d[%src_sub_idx_0, %src_sub_idx_1], %smem_memref_2d[%dest_sub_idx_0, %dest_sub_idx_1], 8 {bypassL1} : memref<1x8xf16, strided<[98304, 1], offset: ?>> to memref<1x8xf16, strided<[4096, 1], offset: ?>, #gpu.address_space<workgroup>>
-  return
-}
-
-//   CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (s0 + s1)>
-//       CHECK: func.func @fold_src_fold_dest_nvgpu_device_async_copy
-//  CHECK-SAME: (%[[GMEM_MEMREF_3d:.+]]: memref<2x128x768xf16>, %[[SRC_IDX_0:.+]]: index, %[[SRC_IDX_1:.+]]: index, %[[SRC_IDX_2:.+]]: index, %[[SRC_SUB_IDX_0:.+]]: index, %[[SRC_SUB_IDX_1:.+]]: index, %[[DEST_IDX_0:.+]]: index, %[[DEST_IDX_1:.+]]: index, %[[DEST_IDX_2:.+]]: index, %[[DEST_IDX_3:.+]]: index, %[[DEST_SUB_IDX_0:.+]]: index, %[[DEST_SUB_IDX_1:.+]]: index)
-//   CHECK-DAG: %[[RESOLVED_SRC_IDX_0:.+]] = affine.apply #[[MAP]]()[%[[SRC_IDX_0]], %[[SRC_SUB_IDX_0]]]
-//   CHECK-DAG: %[[RESOLVED_SRC_IDX_1:.+]] = affine.apply #[[MAP]]()[%[[SRC_IDX_2]], %[[SRC_SUB_IDX_1]]]
-//   CHECK-DAG: %[[RESOLVED_DST_IDX_1:.+]] = affine.apply #[[MAP]]()[%[[DEST_IDX_1]], %[[DEST_SUB_IDX_0]]]
-//   CHECK-DAG: %[[RESOLVED_DST_IDX_3:.+]] = affine.apply #[[MAP]]()[%[[DEST_IDX_3]], %[[DEST_SUB_IDX_1]]]
-//   CHECK-DAG: nvgpu.device_async_copy %[[GMEM_MEMREF_3d]][%[[RESOLVED_SRC_IDX_0]], %[[SRC_IDX_1]], %[[RESOLVED_SRC_IDX_1]]], %[[SMEM_MEMREF_4d]][%[[DEST_IDX_0]], %[[RESOLVED_DST_IDX_1]], %[[DEST_IDX_2]], %[[RESOLVED_DST_IDX_3]]], 8 {bypassL1} : memref<2x128x768xf16> to memref<5x1x64x64xf16, #gpu.address_space<workgroup>>
-
-// -----
-
-#map = affine_map<()[s0] -> (-s0 + 4)>
-#map1 = affine_map<()[s0] -> (-s0 + 32)>
-
-func.func @test_ldmatrix(%arg0: memref<4x32x32xf16, 3>, %arg1: index, %arg2: index, %arg3: index) -> vector<4x2xf16> {
-  %c0 = arith.constant 0 : index
-  %0 = affine.apply #map()[%arg1]
-  %1 = affine.apply #map1()[%arg2]
-  %2 = affine.apply #map1()[%arg3]
-  %subview = memref.subview %arg0[%arg1, %arg2, %arg3] [%0, %1, %2] [1, 1, 1] : memref<4x32x32xf16, 3> to memref<?x?x?xf16, strided<[1024, 32, 1], offset: ?>, 3>
-  %3 = nvgpu.ldmatrix %subview[%c0, %c0, %c0] {numTiles = 4 : i32, transpose = false} : memref<?x?x?xf16, strided<[1024, 32, 1], offset: ?>, 3> -> vector<4x2xf16>
-  return %3 : vector<4x2xf16>
-}
-
-//      CHECK: func @test_ldmatrix
-// CHECK-SAME:   %[[ARG0:[a-zA-Z0-9_]+]]: memref<4x32x32xf16, 3>
-// CHECK-SAME:   %[[ARG1:[a-zA-Z0-9_]+]]: index
-// CHECK-SAME:   %[[ARG2:[a-zA-Z0-9_]+]]: index
-// CHECK-SAME:   %[[ARG3:[a-zA-Z0-9_]+]]: index
-//      CHECK:   nvgpu.ldmatrix %[[ARG0]][%[[ARG1]], %[[ARG2]], %[[ARG3]]] {numTiles = 4 : i32, transpose = false} : memref<4x32x32xf16, 3> -> vector<4x2xf16>
-
-// -----
-
 func.func @fold_vector_load_subview(%src : memref<24x64xf32>,
                                     %off1 : index,
                                     %off2 : index,
diff --git a/mlir/test/Dialect/NVGPU/fold-memref-alias-ops.mlir b/mlir/test/Dialect/NVGPU/fold-memref-alias-ops.mlir
new file mode 100644
index 0000000000000..44dcffcc1f00d
--- /dev/null
+++ b/mlir/test/Dialect/NVGPU/fold-memref-alias-ops.mlir
@@ -0,0 +1,93 @@
+// RUN: mlir-opt -fold-memref-alias-ops -split-input-file %s | FileCheck %s
+
+func.func @fold_nvgpu_device_async_copy_zero_sub_idx(%gmem_memref_3d : memref<2x128x768xf16>, %idx_1 : index, %idx_2 : index, %idx_3 : index) {
+  %c0 = arith.constant 0 : index
+  %smem_memref_4d = memref.alloc() : memref<5x1x64x64xf16, #gpu.address_space<workgroup>>
+  %gmem_memref_subview_2d = memref.subview %gmem_memref_3d[%idx_1, %idx_2, %idx_3] [1, 1, 8] [1, 1, 1] : memref<2x128x768xf16> to memref<1x8xf16, strided<[98304, 1], offset: ?>>
+  %async_token = nvgpu.device_async_copy %gmem_memref_subview_2d[%c0, %c0], %smem_memref_4d[%c0, %c0, %c0, %c0], 8 {bypassL1} : memref<1x8xf16, strided<[98304, 1], offset: ?>> to memref<5x1x64x64xf16, #gpu.address_space<workgroup>>
+  return
+}
+
+// CHECK-LABEL: func.func @fold_nvgpu_device_async_copy_zero_sub_idx
+//  CHECK-SAME: (%[[GMEM_MEMREF_3d:.+]]: memref<2x128x768xf16>, %[[IDX_1:.+]]: index, %[[IDX_2:.+]]: index, %[[IDX_3:.+]]: index)
+//   CHECK-DAG: %[[c0:.+]] = arith.constant 0 : index
+//   CHECK-DAG: %[[SMEM_MEMREF_4d:.+]] = memref.alloc() : memref<5x1x64x64xf16, #gpu.address_space<workgroup>>
+//       CHECK: nvgpu.device_async_copy %[[GMEM_MEMREF_3d]][%[[IDX_1]], %[[IDX_2]], %[[IDX_3]]], %[[SMEM_MEMREF_4d]][%[[c0]], %[[c0]], %[[c0]], %[[c0]]], 8 {bypassL1} : memref<2x128x768xf16> to memref<5x1x64x64xf16, #gpu.address_space<workgroup>>
+
+// -----
+
+
+func.func @fold_src_nvgpu_device_async_copy(%gmem_memref_3d : memref<2x128x768xf16>, %src_idx_0 : index, %src_idx_1 : index, %src_idx_2 : index, %src_sub_idx_0 : index, %src_sub_idx_1 : index) {
+  %c0 = arith.constant 0 : index
+  %smem_memref_4d = memref.alloc() : memref<5x1x64x64xf16, #gpu.address_space<workgroup>>
+  %gmem_memref_subview_2d = memref.subview %gmem_memref_3d[%src_idx_0, %src_idx_1, %src_idx_2] [1, 1, 8] [1, 1, 1] : memref<2x128x768xf16> to memref<1x8xf16, strided<[98304, 1], offset: ?>>
+  %async_token = nvgpu.device_async_copy %gmem_memref_subview_2d[%src_sub_idx_0, %src_sub_idx_1], %smem_memref_4d[%c0, %c0, %c0, %c0], 8 {bypassL1} : memref<1x8xf16, strided<[98304, 1], offset: ?>> to memref<5x1x64x64xf16, #gpu.address_space<workgroup>>
+  return
+}
+
+//   CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (s0 + s1)>
+//       CHECK: func.func @fold_src_nvgpu_device_async_copy
+//  CHECK-SAME: (%[[GMEM_MEMREF_3d:.+]]: memref<2x128x768xf16>, %[[SRC_IDX_0:.+]]: index, %[[SRC_IDX_1:.+]]: index, %[[SRC_IDX_2:.+]]: index, %[[SRC_SUB_IDX_0:.+]]: index, %[[SRC_SUB_IDX_1:.+]]: index)
+//   CHECK-DAG: %[[c0:.+]] = arith.constant 0 : index
+//   CHECK-DAG: %[[RESOLVED_SRC_IDX_0:.+]] = affine.apply #[[MAP]]()[%[[SRC_IDX_0]], %[[SRC_SUB_IDX_0]]]
+//   CHECK-DAG: %[[RESOLVED_SRC_IDX_1:.+]] = affine.apply #[[MAP]]()[%[[SRC_IDX_2]], %[[SRC_SUB_IDX_1]]]
+//   CHECK-DAG: nvgpu.device_async_copy %[[GMEM_MEMREF_3d]][%[[RESOLVED_SRC_IDX_0]], %[[SRC_IDX_1]], %[[RESOLVED_SRC_IDX_1]]], %[[SMEM_MEMREF_4d]][%[[c0]], %[[c0]], %[[c0]], %[[c0]]], 8 {bypassL1} : memref<2x128x768xf16> to memref<5x1x64x64xf16, #gpu.address_space<workgroup>>
+
+// -----
+
+
+func.func @fold_src_fold_dest_nvgpu_device_async_copy(%gmem_memref_3d : memref<2x128x768xf16>, %src_idx_0 : index, %src_idx_1 : index, %src_idx_2 : index, %src_sub_idx_0 : index, %src_sub_idx_1 : index, %dest_idx_0 : index, %dest_idx_1 : index, %dest_idx_2 : index, %dest_idx_3 : index, %dest_sub_idx_0 : index, %dest_sub_idx_1 : index) {
+  %c0 = arith.constant 0 : index
+  %smem_memref_4d = memref.alloc() : memref<5x1x64x64xf16, #gpu.address_space<workgroup>>
+  %gmem_memref_subview_2d = memref.subview %gmem_memref_3d[%src_idx_0, %src_idx_1, %src_idx_2] [1, 1, 8] [1, 1, 1] : memref<2x128x768xf16> to memref<1x8xf16, strided<[98304, 1], offset: ?>>
+  %smem_memref_2d = memref.subview %smem_memref_4d[%dest_idx_0, %dest_idx_1, %dest_idx_2, %dest_idx_3] [1, 1, 1, 8] [1, 1, 1, 1] : memref<5x1x64x64xf16, #gpu.address_space<workgroup>> to memref<1x8xf16, strided<[4096, 1], offset: ?>, #gpu.address_space<workgroup>>
+  %async_token = nvgpu.device_async_copy %gmem_memref_subview_2d[%src_sub_idx_0, %src_sub_idx_1], %smem_memref_2d[%dest_sub_idx_0, %dest_sub_idx_1], 8 {bypassL1} : memref<1x8xf16, strided<[98304, 1], offset: ?>> to memref<1x8xf16, strided<[4096, 1], offset: ?>, #gpu.address_space<workgroup>>
+  return
+}
+
+//   CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (s0 + s1)>
+//       CHECK: func.func @fold_src_fold_dest_nvgpu_device_async_copy
+//  CHECK-SAME: (%[[GMEM_MEMREF_3d:.+]]: memref<2x128x768xf16>, %[[SRC_IDX_0:.+]]: index, %[[SRC_IDX_1:.+]]: index, %[[SRC_IDX_2:.+]]: index, %[[SRC_SUB_IDX_0:.+]]: index, %[[SRC_SUB_IDX_1:.+]]: index, %[[DEST_IDX_0:.+]]: index, %[[DEST_IDX_1:.+]]: index, %[[DEST_IDX_2:.+]]: index, %[[DEST_IDX_3:.+]]: index, %[[DEST_SUB_IDX_0:.+]]: index, %[[DEST_SUB_IDX_1:.+]]: index)
+//   CHECK-DAG: %[[RESOLVED_SRC_IDX_0:.+]] = affine.apply #[[MAP]]()[%[[SRC_IDX_0]], %[[SRC_SUB_IDX_0]]]
+//   CHECK-DAG: %[[RESOLVED_SRC_IDX_1:.+]] = affine.apply #[[MAP]]()[%[[SRC_IDX_2]], %[[SRC_SUB_IDX_1]]]
+//   CHECK-DAG: %[[RESOLVED_DST_IDX_1:.+]] = affine.apply #[[MAP]]()[%[[DEST_IDX_1]], %[[DEST_SUB_IDX_0]]]
+//   CHECK-DAG: %[[RESOLVED_DST_IDX_3:.+]] = affine.apply #[[MAP]]()[%[[DEST_IDX_3]], %[[DEST_SUB_IDX_1]]]
+//   CHECK-DAG: nvgpu.device_async_copy %[[GMEM_MEMREF_3d]][%[[RESOLVED_SRC_IDX_0]], %[[SRC_IDX_1]], %[[RESOLVED_SRC_IDX_1]]], %[[SMEM_MEMREF_4d]][%[[DEST_IDX_0]], %[[RESOLVED_DST_IDX_1]], %[[DEST_IDX_2]], %[[RESOLVED_DST_IDX_3]]], 8 {bypassL1} : memref<2x128x768xf16> to memref<5x1x64x64xf16, #gpu.address_space<workgroup>>
+
+// -----
+
+#map = affine_map<()[s0] -> (-s0 + 4)>
+#map1 = affine_map<()[s0] -> (-s0 + 32)>
+
+func.func @test_ldmatrix(%arg0: memref<4x32x32xf16, 3>, %arg1: index, %arg2: index, %arg3: index) -> vector<4x2xf16> {
+  %c0 = arith.constant 0 : index
+  %0 = affine.apply #map()[%arg1]
+  %1 = affine.apply #map1()[%arg2]
+  %2 = affine.apply #map1()[%arg3]
+  %subview = memref.subview %arg0[%arg1, %arg2, %arg3] [%0, %1, %2] [1, 1, 1] : memref<4x32x32xf16, 3> to memref<?x?x?xf16, strided<[1024, 32, 1], offset: ?>, 3>
+  %3 = nvgpu.ldmatrix %subview[%c0, %c0, %c0] {numTiles = 4 : i32, transpose = false} : memref<?x?x?xf16, strided<[1024, 32, 1], offset: ?>, 3> -> vector<4x2xf16>
+  return %3 : vector<4x2xf16>
+}
+
+//      CHECK: func @test_ldmatrix
+// CHECK-SAME:   %[[ARG0:[a-zA-Z0-9_]+]]: memref<4x32x32xf16, 3>
+// CHECK-SAME:   %[[ARG1:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME:   %[[ARG2:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME:   %[[ARG3:[a-zA-Z0-9_]+]]: index
+//      CHECK:   nvgpu.ldmatrix %[[ARG0]][%[[ARG1]], %[[ARG2]], %[[ARG3]]] {numTiles = 4 : i32, transpose = false} : memref<4x32x32xf16, 3> -> vector<4x2xf16>
+
+// -----
+
+func.func @ldmatrix_expand(%arg0: memref<4096xf16, 3>, %arg1: index, %arg2: index, %arg3: index) -> vector<4x2xf16> {
+  %exp = memref.expand_shape %arg0 [[0, 1, 2]] output_shape [4, 32, 32] : memref<4096xf16, 3> into memref<4x32x32xf16, 3>
+  %3 = nvgpu.ldmatrix %exp[%arg1, %arg2, %arg3] {numTiles = 4 : i32, transpose = false} : memref<4x32x32xf16, 3> -> vector<4x2xf16>
+  return %3 : vector<4x2xf16>
+}
+
+//      CHECK: func @ldmatrix_expand
+// CHECK-SAME:   %[[ARG0:[a-zA-Z0-9_]+]]: memref<4096xf16, 3>
+// CHECK-SAME:   %[[ARG1:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME:   %[[ARG2:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME:   %[[ARG3:[a-zA-Z0-9_]+]]: index
+//      CHECK:   %[[LIN:[a-zA-Z0-9_]+]] = affine.linearize_index disjoint [%[[ARG1]], %[[ARG2]], %[[ARG3]]] by (4, 32, 32)
+//      CHECK:   nvgpu.ldmatrix %[[ARG0]][%[[LIN]]] {numTiles = 4 : i32, transpose = false} : memref<4096xf16, 3> -> vector<4x2xf16>