[Mlir-commits] [mlir] 247a9bf - [mlir][AMDGPU] Add folders for memref aliases to TDM base creation (#184567)

Wed Mar 4 07:26:46 PST 2026

Author: Krzysztof Drewniak
Date: 2026-03-04T07:26:40-08:00
New Revision: 247a9bfc26ad1f062c52f7b4625ec0e0710a6424

URL: https://github.com/llvm/llvm-project/commit/247a9bfc26ad1f062c52f7b4625ec0e0710a6424
DIFF: https://github.com/llvm/llvm-project/commit/247a9bfc26ad1f062c52f7b4625ec0e0710a6424.diff

LOG: [mlir][AMDGPU] Add folders for memref aliases to TDM base creation (#184567)

The TDM base creation (amdgpu.make_tdm_base and
amdgpu.make_gather_tdm_base) take references to a
`%memref[%i0, %i1,, ...]` for the starting point of the tiles in
global/shared memory that the TDM descriptor refers to. Memory alias ops
can be safely folded into these operations, since these two memref
operands are just pointers to a scalar starting pint and don't have
semantics that depend on the memref layout (except to the extent that it
defines a location in memory).

While I'm here, I've cleaned up a few things, like the incorrect file
header and fixed the tests to not use integer address spaces.

Co-authored-by: Claude Opus 4.6 <noreply at anthropic.com>

Added: 
    

Modified: 
    mlir/lib/Dialect/AMDGPU/Transforms/FoldMemRefsOps.cpp
    mlir/test/Dialect/AMDGPU/amdgpu-fold-memrefs.mlir

Removed: 
    


################################################################################
diff  --git a/mlir/lib/Dialect/AMDGPU/Transforms/FoldMemRefsOps.cpp b/mlir/lib/Dialect/AMDGPU/Transforms/FoldMemRefsOps.cpp
index 19b4341cc27a1..8570927a77794 100644

--- a/mlir/lib/Dialect/AMDGPU/Transforms/FoldMemRefsOps.cpp
+++ b/mlir/lib/Dialect/AMDGPU/Transforms/FoldMemRefsOps.cpp
@@ -1,4 +1,4 @@
-//===- FoldSubviewOps.cpp - AMDGPU fold subview ops -----------------------===//
+//===- FoldMemRefsOps.cpp - AMDGPU fold memref ops ------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -92,6 +92,9 @@ struct FoldMemRefOpsIntoGatherToLDSOp final : OpRewritePattern<GatherToLDSOp> {
       destIndices = op.getDstIndices();
     }
 
+    if (failed(foldSrcResult) && failed(foldDstResult))
+      return rewriter.notifyMatchFailure(op, "no fold found");
+
     rewriter.replaceOpWithNewOp<GatherToLDSOp>(
         op, memrefSource, sourceIndices, memrefDest, destIndices,
         op.getTransferType(), op.getAsync());
@@ -100,6 +103,41 @@ struct FoldMemRefOpsIntoGatherToLDSOp final : OpRewritePattern<GatherToLDSOp> {
   }
 };
 
+template <typename OpTy>
+struct FoldMemRefOpsIntoDmaBaseOp final : OpRewritePattern<OpTy> {
+  using OpRewritePattern<OpTy>::OpRewritePattern;
+  LogicalResult matchAndRewrite(OpTy op,
+                                PatternRewriter &rewriter) const override {
+    Location loc = op.getLoc();
+
+    SmallVector<Value> globalIndices, ldsIndices;
+    Value globalBase, ldsBase;
+
+    LogicalResult didFoldGlobal =
+        foldMemrefViewOp(rewriter, loc, op.getGlobal(), op.getGlobalIndices(),
+                         globalIndices, globalBase, "global");
+    if (failed(didFoldGlobal)) {
+      globalBase = op.getGlobal();
+      globalIndices = op.getGlobalIndices();
+    }
+
+    LogicalResult didFoldLds =
+        foldMemrefViewOp(rewriter, loc, op.getLds(), op.getLdsIndices(),
+                         ldsIndices, ldsBase, "lds");
+    if (failed(didFoldLds)) {
+      ldsBase = op.getLds();
+      ldsIndices = op.getLdsIndices();
+    }
+
+    if (failed(didFoldGlobal) && failed(didFoldLds))
+      return rewriter.notifyMatchFailure(op, "no fold found");
+
+    rewriter.replaceOpWithNewOp<OpTy>(op, op.getBase().getType(), globalBase,
+                                      globalIndices, ldsBase, ldsIndices);
+    return success();
+  }
+};
+
 struct FoldMemRefOpsIntoTransposeLoadOp final
     : OpRewritePattern<TransposeLoadOp> {
   using Base::Base;
@@ -121,8 +159,10 @@ struct FoldMemRefOpsIntoTransposeLoadOp final
 
 void populateAmdgpuFoldMemRefOpsPatterns(RewritePatternSet &patterns,
                                          PatternBenefit benefit) {
-  patterns
-      .add<FoldMemRefOpsIntoGatherToLDSOp, FoldMemRefOpsIntoTransposeLoadOp>(
-          patterns.getContext(), benefit);
+  patterns.add<FoldMemRefOpsIntoGatherToLDSOp,
+               FoldMemRefOpsIntoDmaBaseOp<MakeDmaBaseOp>,
+               FoldMemRefOpsIntoDmaBaseOp<MakeGatherDmaBaseOp>,
+               FoldMemRefOpsIntoTransposeLoadOp>(patterns.getContext(),
+                                                 benefit);
 }
 } // namespace mlir::amdgpu

diff  --git a/mlir/test/Dialect/AMDGPU/amdgpu-fold-memrefs.mlir b/mlir/test/Dialect/AMDGPU/amdgpu-fold-memrefs.mlir
index 11571b060a0b8..4fc6bc1846c3d 100644
--- a/mlir/test/Dialect/AMDGPU/amdgpu-fold-memrefs.mlir
+++ b/mlir/test/Dialect/AMDGPU/amdgpu-fold-memrefs.mlir
@@ -1,15 +1,15 @@
 // RUN: mlir-opt --amdgpu-fold-memrefs-ops --split-input-file %s | FileCheck %s
 
-#gpu_lds_addrspace = 3
+#gpu_lds_addrspace = #gpu.address_space<workgroup>
 
 // CHECK: func @test_subview_folding
 // CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index
 func.func @test_subview_folding(%offset_i: index, %offset_j: index) {
-  // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, 3>
+  // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
   // CHECK: %[[MEM:.*]] = memref.alloc() : memref<64x128xf16>
   // CHECK: %[[C0:.*]] = arith.constant 0 : index
   // CHECK: amdgpu.gather_to_lds %[[MEM]][%[[ARG0]], %[[ARG1]]], %[[LOCAL]][%[[C0]], %[[C0]]]
-  // CHECK-SAME: vector<8xf16>, memref<64x128xf16>, memref<64x64xf16, 3>
+  // CHECK-SAME: vector<8xf16>, memref<64x128xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
 
   %alloc = memref.alloc() : memref<64x64xf16, #gpu_lds_addrspace>
   %mem = memref.alloc() : memref<64x128xf16>
@@ -22,7 +22,7 @@ func.func @test_subview_folding(%offset_i: index, %offset_j: index) {
 
 // -----
 
-#gpu_lds_addrspace = 3
+#gpu_lds_addrspace = #gpu.address_space<workgroup>
 
 // CHECK: #[[MAP:.*]] = affine_map<()[s0] -> (s0 + 32)>
 // CHECK: #[[MAP1:.*]] = affine_map<()[s0] -> (s0 + 64)>
@@ -30,13 +30,13 @@ func.func @test_subview_folding(%offset_i: index, %offset_j: index) {
 // CHECK: func @subview_folding_offset
 // CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index
 func.func @subview_folding_offset(%offset_i: index, %offset_j: index) {
-  // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, 3>
+  // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
   // CHECK: %[[MEM:.*]] = memref.alloc() : memref<64x128xf16>
   // CHECK: %[[C0:.*]] = arith.constant 0 : index
   // CHECK: %[[IDX0:.*]] = affine.apply #[[MAP]]()[%[[ARG0]]]
   // CHECK: %[[IDX1:.*]] = affine.apply #[[MAP1]]()[%[[ARG1]]]
   // CHECK: amdgpu.gather_to_lds %[[MEM]][%[[IDX0]], %[[IDX1]]], %[[LOCAL]][%[[C0]], %[[C0]]]
-  // CHECK-SAME: vector<8xf16>, memref<64x128xf16>, memref<64x64xf16, 3>
+  // CHECK-SAME: vector<8xf16>, memref<64x128xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
 
   %alloc = memref.alloc() : memref<64x64xf16, #gpu_lds_addrspace>
   %mem = memref.alloc() : memref<64x128xf16>
@@ -49,18 +49,18 @@ func.func @subview_folding_offset(%offset_i: index, %offset_j: index) {
 
 // -----
 
-#gpu_lds_addrspace = 3
+#gpu_lds_addrspace = #gpu.address_space<workgroup>
 
 // CHECK: func @test_expand_shape
 // CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index
 func.func @test_expand_shape(%offset_i: index, %offset_j: index) {
-  // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<4096xf16, 3>
+  // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<4096xf16, #gpu.address_space<workgroup>>
   // CHECK: %[[MEM:.*]] = memref.alloc() : memref<8192xf16>
   // CHECK: %[[C0:.*]] = arith.constant 0 : index
   // CHECK: %[[IDXM:.*]] = affine.linearize_index [%[[ARG0]], %[[ARG1]]] by (64, 128) : index
   // CHECK: %[[IDXL:.*]] = affine.linearize_index [%[[C0]], %[[C0]]] by (64, 64) : index
   // CHECK: amdgpu.gather_to_lds %[[MEM]][%[[IDXM]]], %[[LOCAL]][%[[IDXL]]]
-  // CHECK-SAME: vector<8xf16>, memref<8192xf16>, memref<4096xf16, 3>
+  // CHECK-SAME: vector<8xf16>, memref<8192xf16>, memref<4096xf16, #gpu.address_space<workgroup>>
 
   %alloc = memref.alloc() : memref<4096xf16, #gpu_lds_addrspace>
   %mem = memref.alloc() : memref<8192xf16>
@@ -74,18 +74,18 @@ func.func @test_expand_shape(%offset_i: index, %offset_j: index) {
 
 // -----
 
-#gpu_lds_addrspace = 3
+#gpu_lds_addrspace = #gpu.address_space<workgroup>
 
 // CHECK: func @test_collapse_shape
 // CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index
 func.func @test_collapse_shape(%offset_i: index, %offset_j: index) {
-  // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, 3>
+  // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
   // CHECK: %[[MEM:.*]] = memref.alloc() : memref<64x128xf16>
   // CHECK: %[[C0:.*]] = arith.constant 0 : index
   // CHECK: %[[INDICES_MEM:.*]]:2 = affine.delinearize_index %[[ARG0]] into (64, 128) : index, index
   // CHECK: %[[INDICES_LDS:.*]]:2 = affine.delinearize_index %[[ARG1]] into (64, 64) : index, index
   // CHECK: amdgpu.gather_to_lds %[[MEM]][%[[INDICES_MEM]]#0, %[[INDICES_MEM]]#1], %[[LOCAL]][%[[INDICES_LDS]]#0, %[[INDICES_LDS]]#1]
-  // CHECK-SAME: vector<8xf16>, memref<64x128xf16>, memref<64x64xf16, 3>
+  // CHECK-SAME: vector<8xf16>, memref<64x128xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
 
   %alloc = memref.alloc() : memref<64x64xf16, #gpu_lds_addrspace>
   %collapse_alloc = memref.collapse_shape %alloc [[0, 1]] : memref<64x64xf16, #gpu_lds_addrspace> into memref<4096xf16, #gpu_lds_addrspace>
@@ -100,17 +100,17 @@ func.func @test_collapse_shape(%offset_i: index, %offset_j: index) {
 
 // -----
 
-#gpu_lds_addrspace = 3
+#gpu_lds_addrspace = #gpu.address_space<workgroup>
 
 
 // CHECK: func @test_expand_shape_src_raw_buffer
 // CHECK-SAME: %[[ARG0:.*]]: memref<8192xf16, #amdgpu.address_space<fat_raw_buffer>>, %[[ARG1:.*]]: index, %[[ARG2:.*]]: index
 func.func @test_expand_shape_src_raw_buffer(%mem : memref<8192xf16, #amdgpu.address_space<fat_raw_buffer>>, %offset_i: index, %offset_j: index) {
-  // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<4096xf16, 3>
+  // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<4096xf16, #gpu.address_space<workgroup>>
   // CHECK: %[[C0:.*]] = arith.constant 0 : index
   // CHECK: %[[IDXM:.*]] = affine.linearize_index [%[[ARG1]], %[[ARG2]]] by (64, 128) : index
   // CHECK: amdgpu.gather_to_lds %[[ARG0]][%[[IDXM]]], %[[LOCAL]][%[[C0]]]
-  // CHECK-SAME: vector<8xf16>, memref<8192xf16, #amdgpu.address_space<fat_raw_buffer>>, memref<4096xf16, 3>
+  // CHECK-SAME: vector<8xf16>, memref<8192xf16, #amdgpu.address_space<fat_raw_buffer>>, memref<4096xf16, #gpu.address_space<workgroup>>
 
   %alloc = memref.alloc() : memref<4096xf16, #gpu_lds_addrspace>
   %expand_mem = memref.expand_shape %mem [[0, 1]] output_shape [64, 128] : memref<8192xf16, #amdgpu.address_space<fat_raw_buffer>> into memref<64x128xf16, #amdgpu.address_space<fat_raw_buffer>>
@@ -123,17 +123,17 @@ func.func @test_expand_shape_src_raw_buffer(%mem : memref<8192xf16, #amdgpu.addr
 
 // -----
 
-#gpu_lds_addrspace = 3
+#gpu_lds_addrspace = #gpu.address_space<workgroup>
 
 // CHECK: func @test_expand_shape_dst_only
 // CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index
 func.func @test_expand_shape_dst_only(%offset_i: index, %offset_j: index) {
-  // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<4096xf16, 3>
+  // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<4096xf16, #gpu.address_space<workgroup>>
   // CHECK: %[[MEM:.*]] = memref.alloc() : memref<8192xf16>
   // CHECK: %[[C0:.*]] = arith.constant 0 : index
   // CHECK: %[[IDX_LDS:.*]] = affine.linearize_index [%[[ARG1]], %[[C0]]] by (64, 64) : index
   // CHECK: amdgpu.gather_to_lds %[[MEM]][%[[ARG0]]], %[[LOCAL]][%[[IDX_LDS]]]
-  // CHECK-SAME: vector<8xf16>, memref<8192xf16>, memref<4096xf16, 3>
+  // CHECK-SAME: vector<8xf16>, memref<8192xf16>, memref<4096xf16, #gpu.address_space<workgroup>>
 
   %alloc = memref.alloc() : memref<4096xf16, #gpu_lds_addrspace>
   %mem = memref.alloc() : memref<8192xf16>
@@ -147,14 +147,14 @@ func.func @test_expand_shape_dst_only(%offset_i: index, %offset_j: index) {
 
 // -----
 
-#gpu_lds_addrspace = 3
+#gpu_lds_addrspace = #gpu.address_space<workgroup>
 
 // CHECK: func @test_nop
 // CHECK-SAME: %[[ARG0:.*]]: memref<8192xf16, #amdgpu.address_space<fat_raw_buffer>>, %[[ARG1:.*]]: index, %[[ARG2:.*]]: index
 func.func @test_nop(%mem : memref<8192xf16, #amdgpu.address_space<fat_raw_buffer>>, %offset_i: index, %offset_j: index) {
-  // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<4096xf16, 3>
+  // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<4096xf16, #gpu.address_space<workgroup>>
   // CHECK: amdgpu.gather_to_lds %[[ARG0]][%[[ARG1]]], %[[LOCAL]][%[[ARG2]]]
-  // CHECK-SAME: vector<8xf16>, memref<8192xf16, #amdgpu.address_space<fat_raw_buffer>>, memref<4096xf16, 3>
+  // CHECK-SAME: vector<8xf16>, memref<8192xf16, #amdgpu.address_space<fat_raw_buffer>>, memref<4096xf16, #gpu.address_space<workgroup>>
 
   %alloc = memref.alloc() : memref<4096xf16, #gpu_lds_addrspace>
   amdgpu.gather_to_lds %mem[%offset_i], %alloc[%offset_j]
@@ -164,16 +164,16 @@ func.func @test_nop(%mem : memref<8192xf16, #amdgpu.address_space<fat_raw_buffer
 
 // -----
 
-#gpu_lds_addrspace = 3
+#gpu_lds_addrspace = #gpu.address_space<workgroup>
 
 // CHECK: func @test_async_flag_preserved
 // CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index
 func.func @test_async_flag_preserved(%offset_i: index, %offset_j: index) {
-  // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, 3>
+  // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
   // CHECK: %[[MEM:.*]] = memref.alloc() : memref<64x128xf16>
   // CHECK: %[[C0:.*]] = arith.constant 0 : index
   // CHECK: amdgpu.gather_to_lds async %[[MEM]][%[[ARG0]], %[[ARG1]]], %[[LOCAL]][%[[C0]], %[[C0]]]
-  // CHECK-SAME: vector<8xf16>, memref<64x128xf16>, memref<64x64xf16, 3>
+  // CHECK-SAME: vector<8xf16>, memref<64x128xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
 
   %alloc = memref.alloc() : memref<64x64xf16, #gpu_lds_addrspace>
   %mem = memref.alloc() : memref<64x128xf16>
@@ -286,3 +286,135 @@ func.func @test_transpose_load_nop(%offset_i: index, %offset_j: index) -> vector
     : memref<32x128xf16, #gpu_wg> -> vector<4xf16>
   return %result : vector<4xf16>
 }
+
+// -----
+
+#gpu_lds_addrspace = #gpu.address_space<workgroup>
+#gpu_global_addrspace = #gpu.address_space<global>
+
+// CHECK: func @test_make_dma_base_subview
+// CHECK-SAME: %[[MEM:.*]]: memref<64x128xf16, #gpu.address_space<global>>, %[[LDS:.*]]: memref<64x64xf16, #gpu.address_space<workgroup>>, %[[GLOBAL_I:.*]]: index, %[[GLOBAL_J:.*]]: index, %[[LDS_I:.*]]: index, %[[LDS_J:.*]]: index
+func.func @test_make_dma_base_subview(%mem: memref<64x128xf16, #gpu_global_addrspace>, %lds: memref<64x64xf16, #gpu_lds_addrspace>, %global_i: index, %global_j: index, %lds_i: index, %lds_j: index) {
+  // CHECK: amdgpu.make_dma_base %[[MEM]][%[[GLOBAL_I]], %[[GLOBAL_J]]], %[[LDS]][%[[LDS_I]], %[[LDS_J]]]
+  // CHECK-SAME: memref<64x128xf16, #gpu.address_space<global>>, memref<64x64xf16, #gpu.address_space<workgroup>> -> !amdgpu.tdm_base<f16>
+
+  %subview = memref.subview %mem[0, 0][32, 64][1, 1] : memref<64x128xf16, #gpu_global_addrspace> to memref<32x64xf16, strided<[128, 1]>, #gpu_global_addrspace>
+  %base = amdgpu.make_dma_base %subview[%global_i, %global_j], %lds[%lds_i, %lds_j]
+    : memref<32x64xf16, strided<[128, 1]>, #gpu_global_addrspace>, memref<64x64xf16, #gpu_lds_addrspace> -> !amdgpu.tdm_base<f16>
+  func.return
+}
+
+// -----
+
+#gpu_lds_addrspace = #gpu.address_space<workgroup>
+#gpu_global_addrspace = #gpu.address_space<global>
+
+// CHECK: func @test_make_dma_base_expand_shape
+// CHECK-SAME: %[[MEM:.*]]: memref<64x128xf16, #gpu.address_space<global>>, %[[LDS:.*]]: memref<4096xf16, #gpu.address_space<workgroup>>, %[[GLOBAL_I:.*]]: index, %[[GLOBAL_J:.*]]: index, %[[LDS_I:.*]]: index, %[[LDS_J:.*]]: index
+func.func @test_make_dma_base_expand_shape(%mem: memref<64x128xf16, #gpu_global_addrspace>, %lds: memref<4096xf16, #gpu_lds_addrspace>, %global_i: index, %global_j: index, %lds_i: index, %lds_j: index) {
+  // CHECK: %[[IDX:.*]] = affine.linearize_index [%[[LDS_I]], %[[LDS_J]]] by (64, 64) : index
+  // CHECK: amdgpu.make_dma_base %[[MEM]][%[[GLOBAL_I]], %[[GLOBAL_J]]], %[[LDS]][%[[IDX]]]
+  // CHECK-SAME: memref<64x128xf16, #gpu.address_space<global>>, memref<4096xf16, #gpu.address_space<workgroup>> -> !amdgpu.tdm_base<f16>
+
+  %expand_lds = memref.expand_shape %lds [[0, 1]] output_shape [64, 64] : memref<4096xf16, #gpu_lds_addrspace> into memref<64x64xf16, #gpu_lds_addrspace>
+  %base = amdgpu.make_dma_base %mem[%global_i, %global_j], %expand_lds[%lds_i, %lds_j]
+    : memref<64x128xf16, #gpu_global_addrspace>, memref<64x64xf16, #gpu_lds_addrspace> -> !amdgpu.tdm_base<f16>
+  func.return
+}
+
+// -----
+
+#gpu_lds_addrspace = #gpu.address_space<workgroup>
+#gpu_global_addrspace = #gpu.address_space<global>
+
+// CHECK: func @test_make_gather_dma_base_subview
+// CHECK-SAME: %[[MEM:.*]]: memref<64x128xf16, #gpu.address_space<global>>, %[[LDS:.*]]: memref<64x64xf16, #gpu.address_space<workgroup>>, %[[GLOBAL_I:.*]]: index, %[[GLOBAL_J:.*]]: index, %[[LDS_I:.*]]: index, %[[LDS_J:.*]]: index
+func.func @test_make_gather_dma_base_subview(%mem: memref<64x128xf16, #gpu_global_addrspace>, %lds: memref<64x64xf16, #gpu_lds_addrspace>, %global_i: index, %global_j: index, %lds_i: index, %lds_j: index) {
+  // CHECK: amdgpu.make_gather_dma_base %[[MEM]][%[[GLOBAL_I]], %[[GLOBAL_J]]], %[[LDS]][%[[LDS_I]], %[[LDS_J]]]
+  // CHECK-SAME: memref<64x128xf16, #gpu.address_space<global>>, memref<64x64xf16, #gpu.address_space<workgroup>> -> !amdgpu.tdm_gather_base<f16, i16>
+
+  %subview = memref.subview %mem[0, 0][32, 64][1, 1] : memref<64x128xf16, #gpu_global_addrspace> to memref<32x64xf16, strided<[128, 1]>, #gpu_global_addrspace>
+  %base = amdgpu.make_gather_dma_base %subview[%global_i, %global_j], %lds[%lds_i, %lds_j]
+    : memref<32x64xf16, strided<[128, 1]>, #gpu_global_addrspace>, memref<64x64xf16, #gpu_lds_addrspace> -> !amdgpu.tdm_gather_base<f16, i16>
+  func.return
+}
+
+// -----
+
+#gpu_lds_addrspace = #gpu.address_space<workgroup>
+#gpu_global_addrspace = #gpu.address_space<global>
+
+// CHECK: func @test_make_gather_dma_base_collapse_shape
+// CHECK-SAME: %[[MEM:.*]]: memref<64x128xf16, #gpu.address_space<global>>, %[[LDS:.*]]: memref<64x64xf16, #gpu.address_space<workgroup>>, %[[GLOBAL_I:.*]]: index, %[[GLOBAL_J:.*]]: index, %[[LDS_IDX:.*]]: index
+func.func @test_make_gather_dma_base_collapse_shape(%mem: memref<64x128xf16, #gpu_global_addrspace>, %lds: memref<64x64xf16, #gpu_lds_addrspace>, %global_i: index, %global_j: index, %lds_idx: index) {
+  // CHECK: %[[INDICES:.*]]:2 = affine.delinearize_index %[[LDS_IDX]] into (64, 64) : index, index
+  // CHECK: amdgpu.make_gather_dma_base %[[MEM]][%[[GLOBAL_I]], %[[GLOBAL_J]]], %[[LDS]][%[[INDICES]]#0, %[[INDICES]]#1]
+  // CHECK-SAME: memref<64x128xf16, #gpu.address_space<global>>, memref<64x64xf16, #gpu.address_space<workgroup>> -> !amdgpu.tdm_gather_base<f16, i16>
+
+  %collapse_lds = memref.collapse_shape %lds [[0, 1]] : memref<64x64xf16, #gpu_lds_addrspace> into memref<4096xf16, #gpu_lds_addrspace>
+  %base = amdgpu.make_gather_dma_base %mem[%global_i, %global_j], %collapse_lds[%lds_idx]
+    : memref<64x128xf16, #gpu_global_addrspace>, memref<4096xf16, #gpu_lds_addrspace> -> !amdgpu.tdm_gather_base<f16, i16>
+  func.return
+}
+
+// -----
+
+#gpu_lds_addrspace = #gpu.address_space<workgroup>
+#gpu_global_addrspace = #gpu.address_space<global>
+
+// CHECK: #[[BOTH_MAP:.*]] = affine_map<()[s0] -> (s0 + 32)>
+// CHECK: #[[BOTH_MAP1:.*]] = affine_map<()[s0] -> (s0 + 64)>
+
+// CHECK: func @test_make_dma_base_both_fold
+// CHECK-SAME: %[[MEM:.*]]: memref<64x128xf16, #gpu.address_space<global>>, %[[LDS:.*]]: memref<4096xf16, #gpu.address_space<workgroup>>, %[[GLOBAL_I:.*]]: index, %[[GLOBAL_J:.*]]: index, %[[LDS_I:.*]]: index, %[[LDS_J:.*]]: index
+func.func @test_make_dma_base_both_fold(%mem: memref<64x128xf16, #gpu_global_addrspace>, %lds: memref<4096xf16, #gpu_lds_addrspace>, %global_i: index, %global_j: index, %lds_i: index, %lds_j: index) {
+  // CHECK: %[[GI:.*]] = affine.apply #[[BOTH_MAP]]()[%[[GLOBAL_I]]]
+  // CHECK: %[[GJ:.*]] = affine.apply #[[BOTH_MAP1]]()[%[[GLOBAL_J]]]
+  // CHECK: %[[IDX:.*]] = affine.linearize_index [%[[LDS_I]], %[[LDS_J]]] by (64, 64) : index
+  // CHECK: amdgpu.make_dma_base %[[MEM]][%[[GI]], %[[GJ]]], %[[LDS]][%[[IDX]]]
+  // CHECK-SAME: memref<64x128xf16, #gpu.address_space<global>>, memref<4096xf16, #gpu.address_space<workgroup>> -> !amdgpu.tdm_base<f16>
+
+  %subview = memref.subview %mem[32, 64][32, 64][1, 1] : memref<64x128xf16, #gpu_global_addrspace> to memref<32x64xf16, strided<[128, 1], offset: 4160>, #gpu_global_addrspace>
+  %expand_lds = memref.expand_shape %lds [[0, 1]] output_shape [64, 64] : memref<4096xf16, #gpu_lds_addrspace> into memref<64x64xf16, #gpu_lds_addrspace>
+  %base = amdgpu.make_dma_base %subview[%global_i, %global_j], %expand_lds[%lds_i, %lds_j]
+    : memref<32x64xf16, strided<[128, 1], offset: 4160>, #gpu_global_addrspace>, memref<64x64xf16, #gpu_lds_addrspace> -> !amdgpu.tdm_base<f16>
+  func.return
+}
+
+// -----
+
+#gpu_lds_addrspace = #gpu.address_space<workgroup>
+#gpu_global_addrspace = #gpu.address_space<global>
+
+// CHECK: func @test_make_dma_base_nop
+// CHECK-SAME: %[[MEM:.*]]: memref<64x128xf16, #gpu.address_space<global>>, %[[LDS:.*]]: memref<64x64xf16, #gpu.address_space<workgroup>>, %[[GLOBAL_I:.*]]: index, %[[GLOBAL_J:.*]]: index, %[[LDS_I:.*]]: index, %[[LDS_J:.*]]: index
+func.func @test_make_dma_base_nop(%mem: memref<64x128xf16, #gpu_global_addrspace>, %lds: memref<64x64xf16, #gpu_lds_addrspace>, %global_i: index, %global_j: index, %lds_i: index, %lds_j: index) {
+  // CHECK: amdgpu.make_dma_base %[[MEM]][%[[GLOBAL_I]], %[[GLOBAL_J]]], %[[LDS]][%[[LDS_I]], %[[LDS_J]]]
+  // CHECK-SAME: memref<64x128xf16, #gpu.address_space<global>>, memref<64x64xf16, #gpu.address_space<workgroup>> -> !amdgpu.tdm_base<f16>
+  // CHECK-NOT: subview
+  // CHECK-NOT: expand_shape
+  // CHECK-NOT: collapse_shape
+
+  %base = amdgpu.make_dma_base %mem[%global_i, %global_j], %lds[%lds_i, %lds_j]
+    : memref<64x128xf16, #gpu_global_addrspace>, memref<64x64xf16, #gpu_lds_addrspace> -> !amdgpu.tdm_base<f16>
+  func.return
+}
+
+// -----
+
+#gpu_lds_addrspace = #gpu.address_space<workgroup>
+#gpu_global_addrspace = #gpu.address_space<global>
+
+// CHECK: func @test_make_gather_dma_base_nop
+// CHECK-SAME: %[[MEM:.*]]: memref<64x128xf16, #gpu.address_space<global>>, %[[LDS:.*]]: memref<64x64xf16, #gpu.address_space<workgroup>>, %[[GLOBAL_I:.*]]: index, %[[GLOBAL_J:.*]]: index, %[[LDS_I:.*]]: index, %[[LDS_J:.*]]: index
+func.func @test_make_gather_dma_base_nop(%mem: memref<64x128xf16, #gpu_global_addrspace>, %lds: memref<64x64xf16, #gpu_lds_addrspace>, %global_i: index, %global_j: index, %lds_i: index, %lds_j: index) {
+  // CHECK: amdgpu.make_gather_dma_base %[[MEM]][%[[GLOBAL_I]], %[[GLOBAL_J]]], %[[LDS]][%[[LDS_I]], %[[LDS_J]]]
+  // CHECK-SAME: memref<64x128xf16, #gpu.address_space<global>>, memref<64x64xf16, #gpu.address_space<workgroup>> -> !amdgpu.tdm_gather_base<f16, i16>
+  // CHECK-NOT: subview
+  // CHECK-NOT: expand_shape
+  // CHECK-NOT: collapse_shape
+
+  %base = amdgpu.make_gather_dma_base %mem[%global_i, %global_j], %lds[%lds_i, %lds_j]
+    : memref<64x128xf16, #gpu_global_addrspace>, memref<64x64xf16, #gpu_lds_addrspace> -> !amdgpu.tdm_gather_base<f16, i16>
+  func.return
+}