[Mlir-commits] [mlir] [mlir][AMDGPU] Move memory access op folding to memref interfaces (PR #197310)

Tue May 12 14:49:42 PDT 2026

https://github.com/krzysz00 created https://github.com/llvm/llvm-project/pull/197310

This PR implements IndexedAccessOpInterface and
IndexedMemCopyOpInterface for relevant ops in the AMDGPU dialect, removing the custom folding pass we used to have now that there are interfaces for this sort of thing.

As a result:

- The in-bounds semantics of various AMDGPU ops have been clarified
- Interface methods to enable oob checks on DMA operations have been added (to prevent accidental `disjoint`ing and the like)
- Said memref rewrite patterns have been hardened to allow for mixed tensor/memref semantics.
- Helpers for detecting memory spaces were factored out of `AMDGPUOps.cpp` so that they could be re-used in the interface implementations.

>From 98f8bfcc100901357f6b95a34e8edc36630d8d5a Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
Date: Fri, 8 May 2026 15:22:23 +0000
Subject: [PATCH] [mlir][AMDGPU] Move memory access op folding to memref
 interfaces

This PR implements IndexedAccessOpInterface and
IndexedMemCopyOpInterface for relevant ops in the AMDGPU dialect,
removing the custom folding pass we used to have now that there are
interfaces for this sort of thing.

As a result:

- The in-bounds semantics of various AMDGPU ops have been clarified
- Interface methods to enable oob checks on DMA operations have been
  added (to prevent accidental `disjoint`ing and the like)
- Said memref rewrite patterns have been hardened to allow for mixed
  tensor/memref semantics.
- Helpers for detecting memory spaces were factored out of
  `AMDGPUOps.cpp` so that they could be re-used in the interface
  implementations.
---
 .../mlir/Dialect/AMDGPU/AMDGPUIRUtils.h       |  61 ++++
 .../mlir/Dialect/AMDGPU/IR/AMDGPUOps.td       |  52 +++-
 .../Transforms/MemoryAccessOpInterfacesImpl.h |  21 ++
 .../mlir/Dialect/AMDGPU/Transforms/Passes.h   |   4 -
 .../mlir/Dialect/AMDGPU/Transforms/Passes.td  |  11 -
 .../MemRef/IR/MemoryAccessOpInterfaces.td     |  38 ++-
 mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp  |   8 +
 mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp      |  57 +---
 .../Dialect/AMDGPU/Transforms/CMakeLists.txt  |   2 +-
 .../AMDGPU/Transforms/FoldMemRefsOps.cpp      | 211 -------------
 .../MemoryAccessOpInterfacesImpl.cpp          | 258 +++++++++++++++
 .../MemRef/IR/MemoryAccessOpInterfaces.cpp    |   4 +-
 .../MemRef/Transforms/FoldMemRefAliasOps.cpp  |  59 ++--
 mlir/lib/RegisterAllDialects.cpp              |   2 +
 ...emrefs.mlir => fold-memref-alias-ops.mlir} | 294 ++++++++++++++----
 mlir/test/Dialect/AMDGPU/invalid.mlir         |  10 +
 mlir/test/Dialect/AMDGPU/ops.mlir             |   9 +
 17 files changed, 742 insertions(+), 359 deletions(-)
 create mode 100644 mlir/include/mlir/Dialect/AMDGPU/AMDGPUIRUtils.h
 create mode 100644 mlir/include/mlir/Dialect/AMDGPU/Transforms/MemoryAccessOpInterfacesImpl.h
 delete mode 100644 mlir/lib/Dialect/AMDGPU/Transforms/FoldMemRefsOps.cpp
 create mode 100644 mlir/lib/Dialect/AMDGPU/Transforms/MemoryAccessOpInterfacesImpl.cpp
 rename mlir/test/Dialect/AMDGPU/{amdgpu-fold-memrefs.mlir => fold-memref-alias-ops.mlir} (62%)

diff --git a/mlir/include/mlir/Dialect/AMDGPU/AMDGPUIRUtils.h b/mlir/include/mlir/Dialect/AMDGPU/AMDGPUIRUtils.h
new file mode 100644
index 0000000000000..597b4eea72d50
--- /dev/null
+++ b/mlir/include/mlir/Dialect/AMDGPU/AMDGPUIRUtils.h
@@ -0,0 +1,61 @@
+//===- AMDGPUIRUtils.h - AMDGPU dialect IR utilities -----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_AMDGPU_AMDGPUIRUTILS_H
+#define MLIR_DIALECT_AMDGPU_AMDGPUIRUTILS_H
+
+#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/Support/LLVM.h"
+#include "llvm/Support/Compiler.h"
+
+namespace mlir::amdgpu {
+
+// Integer memory-space attributes are deprecated, but still accepted here for
+// compatibility with existing IR.
+
+inline bool isGlobalMemorySpace(Attribute memorySpace) {
+  if (!memorySpace)
+    return true;
+  if (auto gpuMemorySpace = dyn_cast<gpu::AddressSpaceAttr>(memorySpace))
+    return gpuMemorySpace.getValue() == gpu::AddressSpace::Global;
+  if (LLVM_UNLIKELY(isa<IntegerAttr>(memorySpace))) {
+    int64_t intMemorySpace = cast<IntegerAttr>(memorySpace).getInt();
+    return intMemorySpace == 0 || intMemorySpace == 1;
+  }
+  return false;
+}
+
+inline bool isWorkgroupMemorySpace(Attribute memorySpace) {
+  if (!memorySpace)
+    return false;
+  if (auto gpuMemorySpace = dyn_cast<gpu::AddressSpaceAttr>(memorySpace))
+    return gpuMemorySpace.getValue() == gpu::AddressSpace::Workgroup;
+  if (LLVM_UNLIKELY(isa<IntegerAttr>(memorySpace))) {
+    int64_t intMemorySpace = cast<IntegerAttr>(memorySpace).getInt();
+    return intMemorySpace == 3;
+  }
+  return false;
+}
+
+inline bool isFatRawBufferMemorySpace(Attribute memorySpace) {
+  if (!memorySpace)
+    return false;
+  if (auto amdgpuMemorySpace = dyn_cast<amdgpu::AddressSpaceAttr>(memorySpace))
+    return amdgpuMemorySpace.getValue() == amdgpu::AddressSpace::FatRawBuffer;
+  if (LLVM_UNLIKELY(isa<IntegerAttr>(memorySpace))) {
+    int64_t intMemorySpace = cast<IntegerAttr>(memorySpace).getInt();
+    return intMemorySpace == 7;
+  }
+  return false;
+}
+
+} // namespace mlir::amdgpu
+
+#endif // MLIR_DIALECT_AMDGPU_AMDGPUIRUTILS_H
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
index 7d33ca163fb2f..d554e27e38dd6 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
@@ -1318,6 +1318,7 @@ def AMDGPU_SparseWMMAOp :
   let hasVerifier = 1;
 }
 
+// Promises IndexedMemCopyOpInterface.
 def AMDGPU_GatherToLDSOp :
     AMDGPU_Op<"gather_to_lds", [AttrSizedOperandSegments]>,
     Arguments<(ins
@@ -1336,8 +1337,12 @@ def AMDGPU_GatherToLDSOp :
     Operands:
     * `$src`: global memory (including fat buffer) memref to read from.
     * `$srcIndices`: indices into `$src` to read from for this thread.
+      These indices must be non-negative and in-bounds when `$src` is not a
+      fat raw buffer. Fat raw buffer sources permit out-of-bounds indices with
+      raw buffer semantics.
     * `$dst`: LDS memory memref to write to.
     * `$dstIndices`: base indices into `$dst` to write to for the subgroup of this thread.
+      These indices must be non-negative and in-bounds.
       The elements gathered by the subgroup will be written contiguously in order of lane ID
       starting at `$dst[$dstIndices]`. Byte-sized (ex. i8) or short-sized (ex. i16)
       types will be zero-padded/extended to 32 bits before being written. 96-bit types
@@ -1366,6 +1371,7 @@ def AMDGPU_GatherToLDSOp :
   let hasCanonicalizer = 1;
 }
 
+// Promises IndexedMemCopyOpInterface.
 def AMDGPU_GlobalLoadAsyncToLDSOp :
     AMDGPU_Op<"global_load_async_to_lds", [AttrSizedOperandSegments]>,
     Arguments<(ins
@@ -1384,8 +1390,13 @@ def AMDGPU_GlobalLoadAsyncToLDSOp :
 
     * `$src`: global memory memref to read from (global addrspace only, no fat buffer).
     * `$srcIndices`: indices into `$src` for this thread's global read location.
+      These indices must be non-negative and in-bounds.
     * `$dst`: LDS memref to write to (workgroup addrspace).
     * `$dstIndices`: indices into `$dst` for this thread's LDS write location.
+      These indices must be non-negative and in-bounds when `$mask` is not
+      provided. When `$mask` is provided, the destination indices are not
+      guaranteed to be in-bounds because masked-off lanes may carry invalid
+      destination indices.
     * `$transferType`: type of data to be transferred. Must be 8, 32, 64 or 128 bit scalar
      or vector type.
     * `$mask`: optional per-thread mask. When false, the thread's LDS write
@@ -1416,6 +1427,7 @@ def AMDGPU_GlobalLoadAsyncToLDSOp :
   let hasCanonicalizer = 1;
 }
 
+// Promises IndexedAccessOpInterface.
 def AMDGPU_TransposeLoadOp :
     AMDGPU_Op<"transpose_load", [SameVariadicOperandSize]>,
     Arguments<(ins Arg<AnyMemRef, "buffer to transpose load from", [MemRead]>:$src, Variadic<Index>:$srcIndices)>,
@@ -1439,7 +1451,9 @@ def AMDGPU_TransposeLoadOp :
     ```
     Operands:
     * `$src`: LDS memref to read from.
-    * `$srcIndices`: indices into `$src` to read from for this thread.
+    * `$srcIndices`: indices into `$src` to read from for this thread. Indices
+      must be non-negative and in-bounds for the corresponding dimension of
+      `$src`, matching the constraints of `memref.load`.
     * `$result`: target register this transpose load instruction will write to.
 
     Note: Lowering is only supported on gfx950 and up.
@@ -1450,6 +1464,7 @@ def AMDGPU_TransposeLoadOp :
   let hasVerifier = 1;
 }
 
+// Promises IndexedAccessOpInterface.
 def AMDGPU_GlobalTransposeLoadOp :
     AMDGPU_Op<"global_transpose_load", [SameVariadicOperandSize]>,
     Arguments<(ins Arg<AnyMemRef, "buffer to transpose load from", [MemRead]>:$src,
@@ -1643,6 +1658,7 @@ class AMDGPU_DmaBaseOp<string mnemonic, Type outType> :
   }];
 }
 
+// Promises IndexedMemCopyOpInterface.
 def AMDGPU_MakeGatherDmaBaseOp : AMDGPU_DmaBaseOp<"make_gather_dma_base", AMDGPU_TDMGatherBaseType> {
   let summary = "Pair of based addresses used when moving tiles between LDS and global memory.";
 
@@ -1650,6 +1666,9 @@ def AMDGPU_MakeGatherDmaBaseOp : AMDGPU_DmaBaseOp<"make_gather_dma_base", AMDGPU
     This operation creates a pair of addresses that will be used by `tensor_load_to_lds`
     and `tensor_store_from_lds`.
 
+    The global and LDS indices must be non-negative and in-bounds for the
+    corresponding dimensions of their memrefs.
+
     This operation creates a value corresponding to the tensor descriptor (D#) group 0
     found in TensorLoadToLDSOp and TensorStoreFromLDSOp in the rocdl dialect.
 
@@ -1674,7 +1693,7 @@ def AMDGPU_MakeGatherDmaBaseOp : AMDGPU_DmaBaseOp<"make_gather_dma_base", AMDGPU
   }];
 }
 
-
+// Promises IndexedMemCopyOpInterface.
 def AMDGPU_MakeDmaBaseOp : AMDGPU_DmaBaseOp<"make_dma_base", AMDGPU_TDMBaseType> {
 
   let summary = "Pair of based addresses used when moving tiles between LDS and global memory.";
@@ -1682,6 +1701,9 @@ def AMDGPU_MakeDmaBaseOp : AMDGPU_DmaBaseOp<"make_dma_base", AMDGPU_TDMBaseType>
     This operation creates a pair of addresses that will be used by tensor_load_to_lds
     and tensor_store_from_lds.
 
+    The global and LDS indices must be non-negative and in-bounds for the
+    corresponding dimensions of their memrefs.
+
     This operation creates a value corresponding to the tensor descriptor (D#) group 0
     found in TensorLoadToLDSOp and TensorStoreFromLDSOp in the rocdl dialect.
 
@@ -1771,12 +1793,20 @@ class AMDGPU_MakeDescriptorOp<string mnemonic> :
 
 }
 
+// Promises IndexedAccessOpInterface for the optional atomic barrier address.
 def AMDGPU_MakeGatherDmaDescriptorOp : AMDGPU_MakeDescriptorOp<"make_gather_dma_descriptor"> {
   dag args = (ins AMDGPU_TDMGatherBaseType: $base,
                   AnyTypeOf<[VectorOfMinMaxLengthAndType<1, 8, [I32]>,
                              VectorOfMinMaxLengthAndType<1, 16, [I16]>]>: $indices);
   let arguments = !con(args, baseArgs);
   let summary = "Make all descriptor groups needed by TensorLoadToLDS/TensorStoreFromLDS.";
+  let description = [{
+    Make all descriptor groups needed by tensor memory operations in gather
+    mode.
+
+    If an atomic barrier is provided, its indices must be non-negative and
+    in-bounds for the corresponding dimensions of the barrier memref.
+  }];
 
   let assemblyFormat = [{
     $base `[` $indices `]`
@@ -1801,6 +1831,7 @@ def AMDGPU_MakeGatherDmaDescriptorOp : AMDGPU_MakeDescriptorOp<"make_gather_dma_
   }];
 }
 
+// Promises IndexedAccessOpInterface for the optional atomic barrier address.
 def AMDGPU_MakeDmaDescriptorOp : AMDGPU_MakeDescriptorOp<"make_dma_descriptor"> {
   dag args = (ins AMDGPU_TDMBaseType: $base);
   let arguments = !con(args, baseArgs);
@@ -1829,6 +1860,8 @@ def AMDGPU_MakeDmaDescriptorOp : AMDGPU_MakeDescriptorOp<"make_dma_descriptor">
 
      If an atomic barrier is provided, it will be arrived at **once** after
      each load/store using this descriptor is completed.
+     Its indices must be non-negative and in-bounds for the corresponding
+     dimensions of the barrier memref.
 
      2D and 3D tensors may be iterated over by setting $global_increment, $lds_increment, and $iteration_count.
      $global_increment determines how much to increment the starting global memory address per iteration in units of the $base's element type.
@@ -1911,6 +1944,7 @@ def AMDGPU_TensorStoreFromLDSOp :
 // being hoisted out of loops.
 //===----------------------------------------------------------------------===//
 
+// Promises IndexedAccessOpInterface.
 def AMDGPU_DsBarrierInitOp :
     AMDGPU_Op<"ds_barrier_init">,
     Arguments<(ins Arg<MemRefOf<[AMDGPU_DsBarrierStateType]>, "barrier(s)",
@@ -1923,6 +1957,9 @@ def AMDGPU_DsBarrierInitOp :
     initialize the barrier structure so that the pending and init counts are equal to
     `participants - 1`, which will have its high bits masked off, and its phase is equal to 0.
 
+    The indices must be non-negative and in-bounds for the corresponding
+    dimensions of `base`.
+
     Note that we subtract 1 from `participants` when constructing the barrier state
     to provide clearer high-level semantics.
 
@@ -1948,6 +1985,7 @@ def AMDGPU_DsBarrierInitOp :
   let hasVerifier = 1;
 }
 
+// Promises IndexedAccessOpInterface.
 def AMDGPU_DsBarrierPollStateOp :
     AMDGPU_Op<"ds_barrier_poll_state">,
     Arguments<(ins Arg<MemRefOf<[AMDGPU_DsBarrierStateType]>, "barrier(s)",
@@ -1960,6 +1998,8 @@ def AMDGPU_DsBarrierPollStateOp :
 
     This will ultimately act like a `memref.load`, but this operation will ensure
     that appropriate atomic orderings and syncscopes are set.
+    The indices must be non-negative and in-bounds for the corresponding
+    dimensions of `base`.
 
     Example:
     ```mlir
@@ -1976,6 +2016,7 @@ def AMDGPU_DsBarrierPollStateOp :
   let hasVerifier = 1;
 }
 
+// Promises IndexedAccessOpInterface.
 def AMDGPU_DsAsyncBarrierArriveOp :
     AMDGPU_Op<"ds_async_barrier_arrive">,
     Arguments<(ins Arg<MemRefOf<[AMDGPU_DsBarrierStateType]>, "barrier(s)",
@@ -1986,6 +2027,9 @@ def AMDGPU_DsAsyncBarrierArriveOp :
     Add a arrival to the LDS barrier at `base[indices]` to the sequence of pending
     asynchronous memory operations.
 
+    The indices must be non-negative and in-bounds for the corresponding
+    dimensions of `base`.
+
     This will add an "asynchronous memory operation" to the in-order list of pending
     asynchronous loads from global memory to LDS. When the queue of such operations
     issued before this operation is complete, the specified barrier will be arrived at,
@@ -2009,6 +2053,7 @@ def AMDGPU_DsAsyncBarrierArriveOp :
   let hasVerifier = 1;
 }
 
+// Promises IndexedAccessOpInterface.
 def AMDGPU_DsBarrierArriveOp :
     AMDGPU_Op<"ds_barrier_arrive">,
     Arguments<(ins Arg<MemRefOf<[AMDGPU_DsBarrierStateType]>, "barrier(s)",
@@ -2021,6 +2066,9 @@ def AMDGPU_DsBarrierArriveOp :
     Atomically arrive at the LDS barrier at `base[indices]` and decrement it by `count`,
     rolling over the phase if needed and returning the old barrier state.
 
+    The indices must be non-negative and in-bounds for the corresponding
+    dimensions of `base`.
+
     `count` is the number of participants that should be subtracted from the barrier's
     pending count **per lane that executes the operation**.
 
diff --git a/mlir/include/mlir/Dialect/AMDGPU/Transforms/MemoryAccessOpInterfacesImpl.h b/mlir/include/mlir/Dialect/AMDGPU/Transforms/MemoryAccessOpInterfacesImpl.h
new file mode 100644
index 0000000000000..96cf6b1d01259
--- /dev/null
+++ b/mlir/include/mlir/Dialect/AMDGPU/Transforms/MemoryAccessOpInterfacesImpl.h
@@ -0,0 +1,21 @@
+//===- MemoryAccessOpInterfacesImpl.h ---------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_AMDGPU_TRANSFORMS_MEMORYACCESSOPINTERFACESIMPL_H
+#define MLIR_DIALECT_AMDGPU_TRANSFORMS_MEMORYACCESSOPINTERFACESIMPL_H
+
+namespace mlir {
+
+class DialectRegistry;
+
+namespace amdgpu {
+void registerMemoryAccessOpInterfacesExternalModels(DialectRegistry &registry);
+} // namespace amdgpu
+} // namespace mlir
+
+#endif // MLIR_DIALECT_AMDGPU_TRANSFORMS_MEMORYACCESSOPINTERFACESIMPL_H
diff --git a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h
index 58b9c74b2f8e0..48e7658568f86 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h
@@ -22,7 +22,6 @@ class ConversionTarget;
 namespace amdgpu {
 
 #define GEN_PASS_DECL_AMDGPUEMULATEATOMICSPASS
-#define GEN_PASS_DECL_AMDGPUFOLDMEMREFOPSPASS
 #define GEN_PASS_DECL_AMDGPUMASKEDLOADTOLOADPASS
 #define GEN_PASS_DECL_AMDGPURESOLVESTRIDEDMETADATAPASS
 #define GEN_PASS_REGISTRATION
@@ -39,9 +38,6 @@ void populateAmdgpuResolveStridedMetadataPatterns(RewritePatternSet &patterns,
 void populateAmdgpuMaskedloadToLoadPatterns(RewritePatternSet &patterns,
                                             PatternBenefit benefit = 1);
 
-void populateAmdgpuFoldMemRefOpsPatterns(RewritePatternSet &patterns,
-                                         PatternBenefit benefit = 1);
-
 } // namespace amdgpu
 } // namespace mlir
 
diff --git a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td
index 8664f971cabde..7dd7ac750a9eb 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td
@@ -71,15 +71,4 @@ def AmdgpuMaskedloadToLoadPass : Pass<"amdgpu-maskedload-to-load"> {
   ];
 }
 
-def AmdgpuFoldMemRefOpsPass : Pass<"amdgpu-fold-memrefs-ops"> {
-  let summary = "Fold memref operations into their parent operations";
-  let description = [{
-    This pass identifies memref operations (subview, expand_shape, collapse_shape)
-    that are sources of `GatherToLDSOp` and attempts to fold the source ops,
-    potentially simplifying the overall operation and improving performance.
-  }];
-  let dependentDialects = [
-    "memref::MemRefDialect"
-  ];
-}
 #endif // MLIR_DIALECT_AMDGPU_TRANSFORMS_PASSES_TD_
diff --git a/mlir/include/mlir/Dialect/MemRef/IR/MemoryAccessOpInterfaces.td b/mlir/include/mlir/Dialect/MemRef/IR/MemoryAccessOpInterfaces.td
index 0f1ef521afc57..b599eca9f38d5 100644
--- a/mlir/include/mlir/Dialect/MemRef/IR/MemoryAccessOpInterfaces.td
+++ b/mlir/include/mlir/Dialect/MemRef/IR/MemoryAccessOpInterfaces.td
@@ -34,8 +34,8 @@ def IndexedAccessOpInterface : OpInterface<"IndexedAccessOpInterface"> {
   let methods =
     [InterfaceMethod<
       /*desc=*/[{
-        Return the accessed memref. If the operation is still in tensor form, return
-        the null value.
+        Return the accessed memref. If the operation is not accessing memory through
+        a memref in its current form, return the null value.
       }],
       /*retType=*/"::mlir::TypedValue<::mlir::MemRefType>",
       /*methodName=*/"getAccessedMemref",
@@ -140,7 +140,8 @@ def IndexedMemCopyOpInterface : OpInterface<"IndexedMemCopyOpInterface"> {
   let methods =
     [InterfaceMethod<
       /*desc=*/[{
-        Return the source memref for this copy operation.
+        Return the source memref for this copy operation. If the operation is not
+        currently copying from a memref source, return the null value.
       }],
       /*retType=*/"::mlir::TypedValue<::mlir::MemRefType>",
       /*methodName=*/"getSrc",
@@ -157,7 +158,8 @@ def IndexedMemCopyOpInterface : OpInterface<"IndexedMemCopyOpInterface"> {
       /*args=*/(ins)>,
     InterfaceMethod<
       /*desc=*/[{
-        Return the destination memref for this copy operation.
+        Return the destination memref for this copy operation. If the operation is
+        not currently copying to a memref destination, return the null value.
       }],
       /*retType=*/"::mlir::TypedValue<::mlir::MemRefType>",
       /*methodName=*/"getDst",
@@ -194,7 +196,33 @@ def IndexedMemCopyOpInterface : OpInterface<"IndexedMemCopyOpInterface"> {
       /*methodName=*/"setMemrefsAndIndices",
       /*args=*/(ins "::mlir::RewriterBase&":$rewriter, "::mlir::Value":$newSrc,
         "::mlir::ValueRange":$newSrcIndices, "::mlir::Value":$newDst,
-        "::mlir::ValueRange":$newDstIndices)>];
+        "::mlir::ValueRange":$newDstIndices)>,
+    InterfaceMethod<
+      /*desc=*/[{
+        Return true if, either by definition or due to some attribute, it's
+        known that the source indices are non-negative and less than the size of
+        the dimension they index.
+      }],
+      /*retType=*/"bool",
+      /*methodName=*/"hasInboundsSrcIndices",
+      /*args=*/(ins),
+      /*methodBody=*/[{}],
+      /*defaultImplementation=*/[{
+        return true;
+      }]>,
+    InterfaceMethod<
+      /*desc=*/[{
+        Return true if, either by definition or due to some attribute, it's
+        known that the destination indices are non-negative and less than the
+        size of the dimension they index.
+      }],
+      /*retType=*/"bool",
+      /*methodName=*/"hasInboundsDstIndices",
+      /*args=*/(ins),
+      /*methodBody=*/[{}],
+      /*defaultImplementation=*/[{
+        return true;
+      }]>];
   let verify = [{
     return ::mlir::memref::detail::verifyIndexedMemCopyOpInterface($_op);
   }];
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
index 1d4e5eddce019..92e05b397c099 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
@@ -15,6 +15,7 @@
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
+#include "mlir/Dialect/MemRef/IR/MemoryAccessOpInterfaces.h"
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/DialectImplementation.h"
 #include "mlir/Transforms/InliningUtils.h"
@@ -41,4 +42,11 @@ void AMDGPUDialect::initialize() {
   registerTypes();
   registerAttributes();
   addInterfaces<AMDGPUInlinerInterface>();
+  declarePromisedInterfaces<
+      memref::IndexedAccessOpInterface, TransposeLoadOp, GlobalTransposeLoadOp,
+      MakeDmaDescriptorOp, MakeGatherDmaDescriptorOp, DsBarrierInitOp,
+      DsBarrierPollStateOp, DsAsyncBarrierArriveOp, DsBarrierArriveOp>();
+  declarePromisedInterfaces<memref::IndexedMemCopyOpInterface, GatherToLDSOp,
+                            GlobalLoadAsyncToLDSOp, MakeDmaBaseOp,
+                            MakeGatherDmaBaseOp>();
 }
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp
index 209d52ec7a1c8..263d6087e87a4 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUOps.cpp
@@ -13,6 +13,7 @@
 
 #include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
 
+#include "mlir/Dialect/AMDGPU/AMDGPUIRUtils.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
@@ -150,43 +151,13 @@ LogicalResult FatRawBufferCastOp::verify() {
   return success();
 }
 
-static bool hasGlobalMemorySpace(Attribute memorySpace) {
-  if (!memorySpace)
-    return true;
-  if (auto intMemorySpace = dyn_cast<IntegerAttr>(memorySpace))
-    return intMemorySpace.getInt() == 0 || intMemorySpace.getInt() == 1;
-  if (auto gpuMemorySpace = dyn_cast<gpu::AddressSpaceAttr>(memorySpace))
-    return gpuMemorySpace.getValue() == gpu::AddressSpace::Global;
-  return false;
-}
-
-static bool hasWorkgroupMemorySpace(Attribute memorySpace) {
-  if (!memorySpace)
-    return false;
-  if (auto intMemorySpace = dyn_cast<IntegerAttr>(memorySpace))
-    return intMemorySpace.getInt() == 3;
-  if (auto gpuMemorySpace = dyn_cast<gpu::AddressSpaceAttr>(memorySpace))
-    return gpuMemorySpace.getValue() == gpu::AddressSpace::Workgroup;
-  return false;
-}
-
-static bool hasFatRawBufferMemorySpace(Attribute memorySpace) {
-  if (!memorySpace)
-    return false;
-  if (auto intMemorySpace = dyn_cast<IntegerAttr>(memorySpace))
-    return intMemorySpace.getInt() == 7;
-  if (auto gpuMemorySpace = dyn_cast<amdgpu::AddressSpaceAttr>(memorySpace))
-    return gpuMemorySpace.getValue() == amdgpu::AddressSpace::FatRawBuffer;
-  return false;
-}
-
 //===----------------------------------------------------------------------===//
 // RawBuffer*Op
 //===----------------------------------------------------------------------===//
 template <typename T>
 static LogicalResult verifyRawBufferOp(T &op) {
   MemRefType bufferType = llvm::cast<MemRefType>(op.getMemref().getType());
-  bool isGlobal = hasGlobalMemorySpace(bufferType.getMemorySpace());
+  bool isGlobal = isGlobalMemorySpace(bufferType.getMemorySpace());
 
   if (!isGlobal)
     return op.emitOpError(
@@ -985,12 +956,12 @@ LogicalResult GatherToLDSOp::verify() {
     return emitOpError(
         "Transfering type size must be 8, 16, 32, 96 or 128 bits");
 
-  if (!hasGlobalMemorySpace(srcType.getMemorySpace()) &&
-      !hasFatRawBufferMemorySpace(srcType.getMemorySpace()))
+  if (!isGlobalMemorySpace(srcType.getMemorySpace()) &&
+      !isFatRawBufferMemorySpace(srcType.getMemorySpace()))
     return emitOpError(
         "source memory address space must be global or fat raw buffer");
 
-  if (!hasWorkgroupMemorySpace(dstType.getMemorySpace()))
+  if (!isWorkgroupMemorySpace(dstType.getMemorySpace()))
     return emitOpError("destination memory address space must be Workgroup");
 
   return success();
@@ -1056,10 +1027,10 @@ LogicalResult GlobalLoadAsyncToLDSOp::verify() {
   if (!llvm::is_contained({8, 32, 64, 128}, transferSize))
     return emitOpError("transfer type size must be 8, 32, 64, or 128 bits");
 
-  if (!hasGlobalMemorySpace(srcType.getMemorySpace()))
+  if (!isGlobalMemorySpace(srcType.getMemorySpace()))
     return emitOpError("source memory address space must be global");
 
-  if (!hasWorkgroupMemorySpace(dstType.getMemorySpace()))
+  if (!isWorkgroupMemorySpace(dstType.getMemorySpace()))
     return emitOpError("destination memory address space must be Workgroup");
 
   return success();
@@ -1101,7 +1072,7 @@ LogicalResult TransposeLoadOp::verify() {
           verifyIndexCount(*this, "source", srcType, getSrcIndices().size())))
     return failure();
 
-  if (!hasWorkgroupMemorySpace(srcType.getMemorySpace()))
+  if (!isWorkgroupMemorySpace(srcType.getMemorySpace()))
     return emitOpError("source memory address space must be Workgroup");
 
   auto transferType = cast<VectorType>(getType());
@@ -1141,7 +1112,7 @@ LogicalResult GlobalTransposeLoadOp::verify() {
           verifyIndexCount(*this, "source", srcType, getSrcIndices().size())))
     return failure();
 
-  if (!hasGlobalMemorySpace(srcType.getMemorySpace()))
+  if (!isGlobalMemorySpace(srcType.getMemorySpace()))
     return emitOpError("source memory address space must be Global");
 
   auto resultType = cast<VectorType>(getType());
@@ -1184,10 +1155,10 @@ static LogicalResult verifyBase(BaseOp op) {
       failed(verifyIndexCount(op, "lds", ldsType, op.getLdsIndices().size())))
     return failure();
 
-  if (!hasWorkgroupMemorySpace(ldsType.getMemorySpace()))
+  if (!isWorkgroupMemorySpace(ldsType.getMemorySpace()))
     return op.emitOpError(
         "lds memref must have workgroup address space attribute.");
-  if (!hasGlobalMemorySpace(globalType.getMemorySpace()))
+  if (!isGlobalMemorySpace(globalType.getMemorySpace()))
     return op.emitOpError(
         "global memref must have global address space attribute.");
 
@@ -1268,7 +1239,7 @@ static LogicalResult verifyDescriptorOp(DescriptorOp op) {
       return failure();
 
     bool barrierInLDS =
-        hasWorkgroupMemorySpace(atomicBarrierAddressType.getMemorySpace());
+        isWorkgroupMemorySpace(atomicBarrierAddressType.getMemorySpace());
     if (!barrierInLDS)
       return op.emitOpError("atomic barrier address must be in LDS.");
   }
@@ -1489,7 +1460,7 @@ static LogicalResult verifyDsBarrierOpCommon(T &op) {
           verifyIndexCount(op, "barrier", memrefType, op.getIndices().size())))
     return failure();
 
-  if (!hasWorkgroupMemorySpace(memrefType.getMemorySpace()))
+  if (!isWorkgroupMemorySpace(memrefType.getMemorySpace()))
     return op.emitOpError("barrier must be in workgroup (LDS) memory");
 
   return success();
@@ -1524,7 +1495,7 @@ LogicalResult GlobalPrefetchOp::verify() {
   Attribute memSpace = src.getMemorySpace();
   if (!memSpace)
     return this->emitOpError("the source must have address space attribute");
-  if (!hasGlobalMemorySpace(memSpace))
+  if (!isGlobalMemorySpace(memSpace))
     return this->emitOpError("the source must reside in global address space");
 
   const LoadTemporalHint temporalHint = getTemporalHint();
diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt b/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt
index d35853bb6a416..29baef635ec80 100644
--- a/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt
@@ -1,7 +1,7 @@
 add_mlir_dialect_library(MLIRAMDGPUTransforms
   EmulateAtomics.cpp
-  FoldMemRefsOps.cpp
   MaskedloadToLoad.cpp
+  MemoryAccessOpInterfacesImpl.cpp
   ResolveStridedMetadata.cpp
 
   ADDITIONAL_HEADER_DIRS
diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/FoldMemRefsOps.cpp b/mlir/lib/Dialect/AMDGPU/Transforms/FoldMemRefsOps.cpp
deleted file mode 100644
index 24c30525957c7..0000000000000
--- a/mlir/lib/Dialect/AMDGPU/Transforms/FoldMemRefsOps.cpp
+++ /dev/null
@@ -1,211 +0,0 @@
-//===- FoldMemRefsOps.cpp - AMDGPU fold memref ops ------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "mlir/Dialect/AMDGPU/Transforms/Passes.h"
-
-#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
-#include "mlir/Dialect/Affine/ViewLikeInterfaceUtils.h"
-#include "mlir/Dialect/MemRef/IR/MemRef.h"
-#include "mlir/Dialect/MemRef/Utils/MemRefUtils.h"
-#include "mlir/Transforms/WalkPatternRewriteDriver.h"
-#include "llvm/ADT/TypeSwitch.h"
-
-namespace mlir::amdgpu {
-#define GEN_PASS_DEF_AMDGPUFOLDMEMREFOPSPASS
-#include "mlir/Dialect/AMDGPU/Transforms/Passes.h.inc"
-
-struct AmdgpuFoldMemRefOpsPass final
-    : amdgpu::impl::AmdgpuFoldMemRefOpsPassBase<AmdgpuFoldMemRefOpsPass> {
-  void runOnOperation() override {
-    RewritePatternSet patterns(&getContext());
-    populateAmdgpuFoldMemRefOpsPatterns(patterns);
-    walkAndApplyPatterns(getOperation(), std::move(patterns));
-  }
-};
-
-static LogicalResult foldMemrefViewOp(PatternRewriter &rewriter, Location loc,
-                                      Value view, mlir::OperandRange indices,
-                                      SmallVectorImpl<Value> &resolvedIndices,
-                                      Value &memrefBase, StringRef role) {
-  Operation *defOp = view.getDefiningOp();
-  if (!defOp) {
-    return failure();
-  }
-  return llvm::TypeSwitch<Operation *, LogicalResult>(defOp)
-      .Case([&](memref::SubViewOp subviewOp) {
-        mlir::affine::resolveIndicesIntoOpWithOffsetsAndStrides(
-            rewriter, loc, subviewOp.getMixedOffsets(),
-            subviewOp.getMixedStrides(), subviewOp.getDroppedDims(), indices,
-            resolvedIndices);
-        memrefBase = subviewOp.getSource();
-        return success();
-      })
-      .Case([&](memref::ExpandShapeOp expandShapeOp) {
-        // The lack of inbounds is conservative and will be fixed.
-        mlir::memref::resolveSourceIndicesExpandShape(
-            loc, rewriter, expandShapeOp, indices, resolvedIndices, false);
-        memrefBase = expandShapeOp.getViewSource();
-        return success();
-      })
-      .Case([&](memref::CollapseShapeOp collapseShapeOp) {
-        // The collapse shape in-bounds-ness is defaulted to false
-        // conservatively.
-        mlir::memref::resolveSourceIndicesCollapseShape(
-            loc, rewriter, collapseShapeOp, indices, resolvedIndices, false);
-        memrefBase = collapseShapeOp.getViewSource();
-        return success();
-      })
-      .Default([&](Operation *op) {
-        return rewriter.notifyMatchFailure(
-            op, (role + " producer is not one of SubViewOp, ExpandShapeOp, or "
-                        "CollapseShapeOp")
-                    .str());
-      });
-}
-
-struct FoldMemRefOpsIntoGatherToLDSOp final : OpRewritePattern<GatherToLDSOp> {
-  using Base::Base;
-  LogicalResult matchAndRewrite(GatherToLDSOp op,
-                                PatternRewriter &rewriter) const override {
-    Location loc = op.getLoc();
-
-    SmallVector<Value> sourceIndices, destIndices;
-    Value memrefSource, memrefDest;
-
-    auto foldSrcResult =
-        foldMemrefViewOp(rewriter, loc, op.getSrc(), op.getSrcIndices(),
-                         sourceIndices, memrefSource, "source");
-
-    if (failed(foldSrcResult)) {
-      memrefSource = op.getSrc();
-      sourceIndices = op.getSrcIndices();
-    }
-
-    auto foldDstResult =
-        foldMemrefViewOp(rewriter, loc, op.getDst(), op.getDstIndices(),
-                         destIndices, memrefDest, "destination");
-
-    if (failed(foldDstResult)) {
-      memrefDest = op.getDst();
-      destIndices = op.getDstIndices();
-    }
-
-    if (failed(foldSrcResult) && failed(foldDstResult))
-      return rewriter.notifyMatchFailure(op, "no fold found");
-
-    rewriter.replaceOpWithNewOp<GatherToLDSOp>(
-        op, memrefSource, sourceIndices, memrefDest, destIndices,
-        op.getTransferType(), op.getAsync());
-
-    return success();
-  }
-};
-
-struct FoldMemRefOpsIntoGlobalLoadAsyncToLDSOp final
-    : OpRewritePattern<GlobalLoadAsyncToLDSOp> {
-  using Base::Base;
-  LogicalResult matchAndRewrite(GlobalLoadAsyncToLDSOp op,
-                                PatternRewriter &rewriter) const override {
-    Location loc = op.getLoc();
-
-    SmallVector<Value> sourceIndices, destIndices;
-    Value memrefSource, memrefDest;
-
-    auto foldSrcResult =
-        foldMemrefViewOp(rewriter, loc, op.getSrc(), op.getSrcIndices(),
-                         sourceIndices, memrefSource, "source");
-
-    if (failed(foldSrcResult)) {
-      memrefSource = op.getSrc();
-      sourceIndices = op.getSrcIndices();
-    }
-
-    auto foldDstResult =
-        foldMemrefViewOp(rewriter, loc, op.getDst(), op.getDstIndices(),
-                         destIndices, memrefDest, "destination");
-
-    if (failed(foldDstResult)) {
-      memrefDest = op.getDst();
-      destIndices = op.getDstIndices();
-    }
-
-    if (failed(foldSrcResult) && failed(foldDstResult))
-      return rewriter.notifyMatchFailure(op, "no fold found");
-
-    rewriter.replaceOpWithNewOp<GlobalLoadAsyncToLDSOp>(
-        op, memrefSource, sourceIndices, memrefDest, destIndices,
-        op.getTransferType(), op.getMask());
-
-    return success();
-  }
-};
-
-template <typename OpTy>
-struct FoldMemRefOpsIntoDmaBaseOp final : OpRewritePattern<OpTy> {
-  using OpRewritePattern<OpTy>::OpRewritePattern;
-  LogicalResult matchAndRewrite(OpTy op,
-                                PatternRewriter &rewriter) const override {
-    Location loc = op.getLoc();
-
-    SmallVector<Value> globalIndices, ldsIndices;
-    Value globalBase, ldsBase;
-
-    LogicalResult didFoldGlobal =
-        foldMemrefViewOp(rewriter, loc, op.getGlobal(), op.getGlobalIndices(),
-                         globalIndices, globalBase, "global");
-    if (failed(didFoldGlobal)) {
-      globalBase = op.getGlobal();
-      globalIndices = op.getGlobalIndices();
-    }
-
-    LogicalResult didFoldLds =
-        foldMemrefViewOp(rewriter, loc, op.getLds(), op.getLdsIndices(),
-                         ldsIndices, ldsBase, "lds");
-    if (failed(didFoldLds)) {
-      ldsBase = op.getLds();
-      ldsIndices = op.getLdsIndices();
-    }
-
-    if (failed(didFoldGlobal) && failed(didFoldLds))
-      return rewriter.notifyMatchFailure(op, "no fold found");
-
-    rewriter.replaceOpWithNewOp<OpTy>(op, op.getBase().getType(), globalBase,
-                                      globalIndices, ldsBase, ldsIndices);
-    return success();
-  }
-};
-
-struct FoldMemRefOpsIntoTransposeLoadOp final
-    : OpRewritePattern<TransposeLoadOp> {
-  using Base::Base;
-  LogicalResult matchAndRewrite(TransposeLoadOp op,
-                                PatternRewriter &rewriter) const override {
-    SmallVector<Value> sourceIndices;
-    Value memrefSource;
-
-    if (failed(foldMemrefViewOp(rewriter, op.getLoc(), op.getSrc(),
-                                op.getSrcIndices(), sourceIndices, memrefSource,
-                                "source")))
-      return failure();
-
-    rewriter.replaceOpWithNewOp<TransposeLoadOp>(op, op.getResult().getType(),
-                                                 memrefSource, sourceIndices);
-    return success();
-  }
-};
-
-void populateAmdgpuFoldMemRefOpsPatterns(RewritePatternSet &patterns,
-                                         PatternBenefit benefit) {
-  patterns.add<FoldMemRefOpsIntoGatherToLDSOp,
-               FoldMemRefOpsIntoGlobalLoadAsyncToLDSOp,
-               FoldMemRefOpsIntoDmaBaseOp<MakeDmaBaseOp>,
-               FoldMemRefOpsIntoDmaBaseOp<MakeGatherDmaBaseOp>,
-               FoldMemRefOpsIntoTransposeLoadOp>(patterns.getContext(),
-                                                 benefit);
-}
-} // namespace mlir::amdgpu
diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/MemoryAccessOpInterfacesImpl.cpp b/mlir/lib/Dialect/AMDGPU/Transforms/MemoryAccessOpInterfacesImpl.cpp
new file mode 100644
index 0000000000000..abe9f4abc8ee8
--- /dev/null
+++ b/mlir/lib/Dialect/AMDGPU/Transforms/MemoryAccessOpInterfacesImpl.cpp
@@ -0,0 +1,258 @@
+//===- MemoryAccessOpInterfacesImpl.cpp -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// Implement memref dialect interfaces that enable manipulating memref indexing
+// in passes like FoldMemRefAliasOps.
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/AMDGPU/Transforms/MemoryAccessOpInterfacesImpl.h"
+
+#include "mlir/Dialect/AMDGPU/AMDGPUIRUtils.h"
+#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
+#include "mlir/Dialect/MemRef/IR/MemoryAccessOpInterfaces.h"
+#include "mlir/IR/Dialect.h"
+#include "mlir/IR/Operation.h"
+#include "mlir/IR/PatternMatch.h"
+
+using namespace mlir;
+using namespace mlir::amdgpu;
+using namespace mlir::memref;
+
+namespace {
+template <typename OpTy>
+struct TransposeLoadAccess final
+    : IndexedAccessOpInterface::ExternalModel<TransposeLoadAccess<OpTy>, OpTy> {
+  TypedValue<MemRefType> getAccessedMemref(Operation *op) const {
+    return cast<TypedValue<MemRefType>>(cast<OpTy>(op).getSrc());
+  }
+
+  Operation::operand_range getIndices(Operation *op) const {
+    return cast<OpTy>(op).getSrcIndices();
+  }
+
+  SmallVector<int64_t> getAccessedShape(Operation *op) const {
+    return {cast<VectorType>(cast<OpTy>(op).getResult().getType())
+                .getNumElements()};
+  }
+
+  std::optional<SmallVector<Value>>
+  updateMemrefAndIndices(Operation *op, RewriterBase &rewriter, Value newMemref,
+                         ValueRange newIndices) const {
+    auto accessOp = cast<OpTy>(op);
+    rewriter.modifyOpInPlace(accessOp, [&]() {
+      accessOp.getSrcMutable().assign(newMemref);
+      accessOp.getSrcIndicesMutable().assign(newIndices);
+    });
+    return std::nullopt;
+  }
+
+  bool hasInboundsIndices(Operation *) const { return true; }
+};
+
+template <typename OpTy>
+struct BaseAndIndicesAccess final
+    : IndexedAccessOpInterface::ExternalModel<BaseAndIndicesAccess<OpTy>,
+                                              OpTy> {
+  TypedValue<MemRefType> getAccessedMemref(Operation *op) const {
+    return cast<TypedValue<MemRefType>>(cast<OpTy>(op).getBase());
+  }
+
+  Operation::operand_range getIndices(Operation *op) const {
+    return cast<OpTy>(op).getIndices();
+  }
+
+  SmallVector<int64_t> getAccessedShape(Operation *) const { return {}; }
+
+  std::optional<SmallVector<Value>>
+  updateMemrefAndIndices(Operation *op, RewriterBase &rewriter, Value newMemref,
+                         ValueRange newIndices) const {
+    auto accessOp = cast<OpTy>(op);
+    rewriter.modifyOpInPlace(accessOp, [&]() {
+      accessOp.getBaseMutable().assign(newMemref);
+      accessOp.getIndicesMutable().assign(newIndices);
+    });
+    return std::nullopt;
+  }
+
+  bool hasInboundsIndices(Operation *) const { return true; }
+};
+
+template <typename OpTy>
+struct DescriptorAtomicBarrierAccess final
+    : IndexedAccessOpInterface::ExternalModel<
+          DescriptorAtomicBarrierAccess<OpTy>, OpTy> {
+  TypedValue<MemRefType> getAccessedMemref(Operation *op) const {
+    Value memref = cast<OpTy>(op).getAtomicBarrierAddress();
+    if (!memref)
+      return {};
+    return cast<TypedValue<MemRefType>>(memref);
+  }
+
+  Operation::operand_range getIndices(Operation *op) const {
+    return cast<OpTy>(op).getAtomicBarrierIndices();
+  }
+
+  SmallVector<int64_t> getAccessedShape(Operation *) const { return {}; }
+
+  std::optional<SmallVector<Value>>
+  updateMemrefAndIndices(Operation *op, RewriterBase &rewriter, Value newMemref,
+                         ValueRange newIndices) const {
+    auto accessOp = cast<OpTy>(op);
+    rewriter.modifyOpInPlace(accessOp, [&]() {
+      accessOp.getAtomicBarrierAddressMutable().assign(newMemref);
+      accessOp.getAtomicBarrierIndicesMutable().assign(newIndices);
+    });
+    return std::nullopt;
+  }
+
+  bool hasInboundsIndices(Operation *) const { return true; }
+};
+
+struct GatherToLDSCopy final
+    : IndexedMemCopyOpInterface::ExternalModel<GatherToLDSCopy, GatherToLDSOp> {
+  TypedValue<MemRefType> getSrc(Operation *op) const {
+    return cast<TypedValue<MemRefType>>(cast<GatherToLDSOp>(op).getSrc());
+  }
+
+  Operation::operand_range getSrcIndices(Operation *op) const {
+    return cast<GatherToLDSOp>(op).getSrcIndices();
+  }
+
+  TypedValue<MemRefType> getDst(Operation *op) const {
+    return cast<TypedValue<MemRefType>>(cast<GatherToLDSOp>(op).getDst());
+  }
+
+  Operation::operand_range getDstIndices(Operation *op) const {
+    return cast<GatherToLDSOp>(op).getDstIndices();
+  }
+
+  void setMemrefsAndIndices(Operation *op, RewriterBase &rewriter, Value newSrc,
+                            ValueRange newSrcIndices, Value newDst,
+                            ValueRange newDstIndices) const {
+    auto copyOp = cast<GatherToLDSOp>(op);
+    rewriter.modifyOpInPlace(copyOp, [&]() {
+      copyOp.getSrcMutable().assign(newSrc);
+      copyOp.getSrcIndicesMutable().assign(newSrcIndices);
+      copyOp.getDstMutable().assign(newDst);
+      copyOp.getDstIndicesMutable().assign(newDstIndices);
+    });
+  }
+
+  bool hasInboundsSrcIndices(Operation *op) const {
+    MemRefType srcType = cast<GatherToLDSOp>(op).getSrc().getType();
+    return !isFatRawBufferMemorySpace(srcType.getMemorySpace());
+  }
+
+  bool hasInboundsDstIndices(Operation *) const { return true; }
+};
+
+struct GlobalLoadAsyncToLDSCopy final
+    : IndexedMemCopyOpInterface::ExternalModel<GlobalLoadAsyncToLDSCopy,
+                                               GlobalLoadAsyncToLDSOp> {
+  TypedValue<MemRefType> getSrc(Operation *op) const {
+    return cast<TypedValue<MemRefType>>(
+        cast<GlobalLoadAsyncToLDSOp>(op).getSrc());
+  }
+
+  Operation::operand_range getSrcIndices(Operation *op) const {
+    return cast<GlobalLoadAsyncToLDSOp>(op).getSrcIndices();
+  }
+
+  TypedValue<MemRefType> getDst(Operation *op) const {
+    return cast<TypedValue<MemRefType>>(
+        cast<GlobalLoadAsyncToLDSOp>(op).getDst());
+  }
+
+  Operation::operand_range getDstIndices(Operation *op) const {
+    return cast<GlobalLoadAsyncToLDSOp>(op).getDstIndices();
+  }
+
+  void setMemrefsAndIndices(Operation *op, RewriterBase &rewriter, Value newSrc,
+                            ValueRange newSrcIndices, Value newDst,
+                            ValueRange newDstIndices) const {
+    auto copyOp = cast<GlobalLoadAsyncToLDSOp>(op);
+    rewriter.modifyOpInPlace(copyOp, [&]() {
+      copyOp.getSrcMutable().assign(newSrc);
+      copyOp.getSrcIndicesMutable().assign(newSrcIndices);
+      copyOp.getDstMutable().assign(newDst);
+      copyOp.getDstIndicesMutable().assign(newDstIndices);
+    });
+  }
+
+  bool hasInboundsSrcIndices(Operation *) const { return true; }
+
+  bool hasInboundsDstIndices(Operation *op) const {
+    // Masked lanes may carry out-of-bounds destination indices; lowering
+    // replaces their destination pointer with -1 before the instruction uses
+    // it.
+    return !cast<GlobalLoadAsyncToLDSOp>(op).getMask();
+  }
+};
+
+template <typename OpTy>
+struct DmaBaseCopy final
+    : IndexedMemCopyOpInterface::ExternalModel<DmaBaseCopy<OpTy>, OpTy> {
+  TypedValue<MemRefType> getSrc(Operation *op) const {
+    return cast<TypedValue<MemRefType>>(cast<OpTy>(op).getGlobal());
+  }
+
+  Operation::operand_range getSrcIndices(Operation *op) const {
+    return cast<OpTy>(op).getGlobalIndices();
+  }
+
+  TypedValue<MemRefType> getDst(Operation *op) const {
+    return cast<TypedValue<MemRefType>>(cast<OpTy>(op).getLds());
+  }
+
+  Operation::operand_range getDstIndices(Operation *op) const {
+    return cast<OpTy>(op).getLdsIndices();
+  }
+
+  void setMemrefsAndIndices(Operation *op, RewriterBase &rewriter, Value newSrc,
+                            ValueRange newSrcIndices, Value newDst,
+                            ValueRange newDstIndices) const {
+    auto copyOp = cast<OpTy>(op);
+    rewriter.modifyOpInPlace(copyOp, [&]() {
+      copyOp.getGlobalMutable().assign(newSrc);
+      copyOp.getGlobalIndicesMutable().assign(newSrcIndices);
+      copyOp.getLdsMutable().assign(newDst);
+      copyOp.getLdsIndicesMutable().assign(newDstIndices);
+    });
+  }
+
+  bool hasInboundsSrcIndices(Operation *) const { return true; }
+
+  bool hasInboundsDstIndices(Operation *) const { return true; }
+};
+} // namespace
+
+void mlir::amdgpu::registerMemoryAccessOpInterfacesExternalModels(
+    DialectRegistry &registry) {
+  registry.addExtension(+[](MLIRContext *ctx, amdgpu::AMDGPUDialect *) {
+    TransposeLoadOp::attachInterface<TransposeLoadAccess<TransposeLoadOp>>(
+        *ctx);
+    GlobalTransposeLoadOp::attachInterface<
+        TransposeLoadAccess<GlobalTransposeLoadOp>>(*ctx);
+    MakeDmaDescriptorOp::attachInterface<
+        DescriptorAtomicBarrierAccess<MakeDmaDescriptorOp>>(*ctx);
+    MakeGatherDmaDescriptorOp::attachInterface<
+        DescriptorAtomicBarrierAccess<MakeGatherDmaDescriptorOp>>(*ctx);
+    DsBarrierInitOp::attachInterface<BaseAndIndicesAccess<DsBarrierInitOp>>(
+        *ctx);
+    DsBarrierPollStateOp::attachInterface<
+        BaseAndIndicesAccess<DsBarrierPollStateOp>>(*ctx);
+    DsAsyncBarrierArriveOp::attachInterface<
+        BaseAndIndicesAccess<DsAsyncBarrierArriveOp>>(*ctx);
+    DsBarrierArriveOp::attachInterface<BaseAndIndicesAccess<DsBarrierArriveOp>>(
+        *ctx);
+    GatherToLDSOp::attachInterface<GatherToLDSCopy>(*ctx);
+    GlobalLoadAsyncToLDSOp::attachInterface<GlobalLoadAsyncToLDSCopy>(*ctx);
+    MakeDmaBaseOp::attachInterface<DmaBaseCopy<MakeDmaBaseOp>>(*ctx);
+    MakeGatherDmaBaseOp::attachInterface<DmaBaseCopy<MakeGatherDmaBaseOp>>(
+        *ctx);
+  });
+}
diff --git a/mlir/lib/Dialect/MemRef/IR/MemoryAccessOpInterfaces.cpp b/mlir/lib/Dialect/MemRef/IR/MemoryAccessOpInterfaces.cpp
index c71df9a2015f7..9a76f07f96f23 100644
--- a/mlir/lib/Dialect/MemRef/IR/MemoryAccessOpInterfaces.cpp
+++ b/mlir/lib/Dialect/MemRef/IR/MemoryAccessOpInterfaces.cpp
@@ -25,7 +25,7 @@ LogicalResult detail::verifyIndexedAccessOpInterface(Operation *op) {
 
   TypedValue<MemRefType> memref = iface.getAccessedMemref();
   if (!memref) {
-    // Some operations can carry tensors, this is fine.
+    // Some operations do not always access memory through a memref.
     return success();
   }
   if (memref.getType().getRank() !=
@@ -45,7 +45,7 @@ LogicalResult detail::verifyIndexedMemCopyOpInterface(Operation *op) {
   TypedValue<MemRefType> src = iface.getSrc();
   TypedValue<MemRefType> dst = iface.getDst();
   if (!src || !dst) {
-    // Allow operations to not always have memref arguments.
+    // Allow operations to not always copy between memref arguments.
     return success();
   }
   if (src.getType().getRank() !=
diff --git a/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp b/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp
index f5c5a48e7f543..014f913cbac92 100644
--- a/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp
+++ b/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp
@@ -218,7 +218,11 @@ struct TransferOpOfCollapseShapeOpFolder final
 LogicalResult
 AccessOpOfSubViewOpFolder::matchAndRewrite(memref::IndexedAccessOpInterface op,
                                            PatternRewriter &rewriter) const {
-  auto subview = op.getAccessedMemref().getDefiningOp<memref::SubViewOp>();
+  TypedValue<MemRefType> accessedMemref = op.getAccessedMemref();
+  if (!accessedMemref)
+    return rewriter.notifyMatchFailure(op, "not accessing a memref");
+
+  auto subview = accessedMemref.getDefiningOp<memref::SubViewOp>();
   if (!subview)
     return rewriter.notifyMatchFailure(op, "not accessing a subview");
 
@@ -262,7 +266,11 @@ AccessOpOfSubViewOpFolder::matchAndRewrite(memref::IndexedAccessOpInterface op,
 
 LogicalResult AccessOpOfExpandShapeOpFolder::matchAndRewrite(
     memref::IndexedAccessOpInterface op, PatternRewriter &rewriter) const {
-  auto expand = op.getAccessedMemref().getDefiningOp<memref::ExpandShapeOp>();
+  TypedValue<MemRefType> accessedMemref = op.getAccessedMemref();
+  if (!accessedMemref)
+    return rewriter.notifyMatchFailure(op, "not accessing a memref");
+
+  auto expand = accessedMemref.getDefiningOp<memref::ExpandShapeOp>();
   if (!expand)
     return rewriter.notifyMatchFailure(op, "not accessing an expand_shape");
 
@@ -294,8 +302,11 @@ LogicalResult AccessOpOfExpandShapeOpFolder::matchAndRewrite(
 
 LogicalResult AccessOpOfCollapseShapeOpFolder::matchAndRewrite(
     memref::IndexedAccessOpInterface op, PatternRewriter &rewriter) const {
-  auto collapse =
-      op.getAccessedMemref().getDefiningOp<memref::CollapseShapeOp>();
+  TypedValue<MemRefType> accessedMemref = op.getAccessedMemref();
+  if (!accessedMemref)
+    return rewriter.notifyMatchFailure(op, "not accessing a memref");
+
+  auto collapse = accessedMemref.getDefiningOp<memref::CollapseShapeOp>();
   if (!collapse)
     return rewriter.notifyMatchFailure(op, "not accessing a collapse_shape");
 
@@ -328,15 +339,17 @@ LogicalResult AccessOpOfCollapseShapeOpFolder::matchAndRewrite(
 
 LogicalResult IndexedMemCopyOpOfSubViewOpFolder::matchAndRewrite(
     memref::IndexedMemCopyOpInterface op, PatternRewriter &rewriter) const {
-  auto srcSubview = op.getSrc().getDefiningOp<memref::SubViewOp>();
-  auto dstSubview = op.getDst().getDefiningOp<memref::SubViewOp>();
+  TypedValue<MemRefType> src = op.getSrc();
+  TypedValue<MemRefType> dst = op.getDst();
+  auto srcSubview = src ? src.getDefiningOp<memref::SubViewOp>() : nullptr;
+  auto dstSubview = dst ? dst.getDefiningOp<memref::SubViewOp>() : nullptr;
   if (!srcSubview && !dstSubview)
     return rewriter.notifyMatchFailure(
         op, "no subviews found on indexed copy inputs");
 
-  Value newSrc = op.getSrc();
+  Value newSrc = src;
   SmallVector<Value> newSrcIndices = llvm::to_vector(op.getSrcIndices());
-  Value newDst = op.getDst();
+  Value newDst = dst;
   SmallVector<Value> newDstIndices = llvm::to_vector(op.getDstIndices());
   if (srcSubview) {
     newSrc = srcSubview.getSource();
@@ -361,29 +374,31 @@ LogicalResult IndexedMemCopyOpOfSubViewOpFolder::matchAndRewrite(
 
 LogicalResult IndexedMemCopyOpOfExpandShapeOpFolder::matchAndRewrite(
     memref::IndexedMemCopyOpInterface op, PatternRewriter &rewriter) const {
-  auto srcExpand = op.getSrc().getDefiningOp<memref::ExpandShapeOp>();
-  auto dstExpand = op.getDst().getDefiningOp<memref::ExpandShapeOp>();
+  TypedValue<MemRefType> src = op.getSrc();
+  TypedValue<MemRefType> dst = op.getDst();
+  auto srcExpand = src ? src.getDefiningOp<memref::ExpandShapeOp>() : nullptr;
+  auto dstExpand = dst ? dst.getDefiningOp<memref::ExpandShapeOp>() : nullptr;
   if (!srcExpand && !dstExpand)
     return rewriter.notifyMatchFailure(
         op, "no expand_shapes found on indexed copy inputs");
 
-  Value newSrc = op.getSrc();
+  Value newSrc = src;
   SmallVector<Value> newSrcIndices = llvm::to_vector(op.getSrcIndices());
-  Value newDst = op.getDst();
+  Value newDst = dst;
   SmallVector<Value> newDstIndices = llvm::to_vector(op.getDstIndices());
   if (srcExpand) {
     newSrc = srcExpand.getViewSource();
     newSrcIndices.clear();
     memref::resolveSourceIndicesExpandShape(op.getLoc(), rewriter, srcExpand,
                                             op.getSrcIndices(), newSrcIndices,
-                                            /*startsInbounds=*/true);
+                                            op.hasInboundsSrcIndices());
   }
   if (dstExpand) {
     newDst = dstExpand.getViewSource();
     newDstIndices.clear();
     memref::resolveSourceIndicesExpandShape(op.getLoc(), rewriter, dstExpand,
                                             op.getDstIndices(), newDstIndices,
-                                            /*startsInbounds=*/true);
+                                            op.hasInboundsDstIndices());
   }
   op.setMemrefsAndIndices(rewriter, newSrc, newSrcIndices, newDst,
                           newDstIndices);
@@ -392,29 +407,33 @@ LogicalResult IndexedMemCopyOpOfExpandShapeOpFolder::matchAndRewrite(
 
 LogicalResult IndexedMemCopyOpOfCollapseShapeOpFolder::matchAndRewrite(
     memref::IndexedMemCopyOpInterface op, PatternRewriter &rewriter) const {
-  auto srcCollapse = op.getSrc().getDefiningOp<memref::CollapseShapeOp>();
-  auto dstCollapse = op.getDst().getDefiningOp<memref::CollapseShapeOp>();
+  TypedValue<MemRefType> src = op.getSrc();
+  TypedValue<MemRefType> dst = op.getDst();
+  auto srcCollapse =
+      src ? src.getDefiningOp<memref::CollapseShapeOp>() : nullptr;
+  auto dstCollapse =
+      dst ? dst.getDefiningOp<memref::CollapseShapeOp>() : nullptr;
   if (!srcCollapse && !dstCollapse)
     return rewriter.notifyMatchFailure(
         op, "no collapse_shapes found on indexed copy inputs");
 
-  Value newSrc = op.getSrc();
+  Value newSrc = src;
   SmallVector<Value> newSrcIndices = llvm::to_vector(op.getSrcIndices());
-  Value newDst = op.getDst();
+  Value newDst = dst;
   SmallVector<Value> newDstIndices = llvm::to_vector(op.getDstIndices());
   if (srcCollapse) {
     newSrc = srcCollapse.getViewSource();
     newSrcIndices.clear();
     memref::resolveSourceIndicesCollapseShape(
         op.getLoc(), rewriter, srcCollapse, op.getSrcIndices(), newSrcIndices,
-        /*startsInbounds=*/true);
+        op.hasInboundsSrcIndices());
   }
   if (dstCollapse) {
     newDst = dstCollapse.getViewSource();
     newDstIndices.clear();
     memref::resolveSourceIndicesCollapseShape(
         op.getLoc(), rewriter, dstCollapse, op.getDstIndices(), newDstIndices,
-        /*startsInbounds=*/true);
+        op.hasInboundsDstIndices());
   }
   op.setMemrefsAndIndices(rewriter, newSrc, newSrcIndices, newDst,
                           newDstIndices);
diff --git a/mlir/lib/RegisterAllDialects.cpp b/mlir/lib/RegisterAllDialects.cpp
index 01a7401db4710..2f55296f424cd 100644
--- a/mlir/lib/RegisterAllDialects.cpp
+++ b/mlir/lib/RegisterAllDialects.cpp
@@ -14,6 +14,7 @@
 #include "mlir/InitAllDialects.h"
 
 #include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
+#include "mlir/Dialect/AMDGPU/Transforms/MemoryAccessOpInterfacesImpl.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Affine/IR/ValueBoundsOpInterfaceImpl.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
@@ -160,6 +161,7 @@ void mlir::registerAllDialects(DialectRegistry &registry) {
 
   // Register all external models.
   affine::registerValueBoundsOpInterfaceExternalModels(registry);
+  amdgpu::registerMemoryAccessOpInterfacesExternalModels(registry);
   arith::registerBufferDeallocationOpInterfaceExternalModels(registry);
   arith::registerBufferizableOpInterfaceExternalModels(registry);
   arith::registerBufferViewFlowOpInterfaceExternalModels(registry);
diff --git a/mlir/test/Dialect/AMDGPU/amdgpu-fold-memrefs.mlir b/mlir/test/Dialect/AMDGPU/fold-memref-alias-ops.mlir
similarity index 62%
rename from mlir/test/Dialect/AMDGPU/amdgpu-fold-memrefs.mlir
rename to mlir/test/Dialect/AMDGPU/fold-memref-alias-ops.mlir
index 82f76953e2522..65834a66af7b6 100644
--- a/mlir/test/Dialect/AMDGPU/amdgpu-fold-memrefs.mlir
+++ b/mlir/test/Dialect/AMDGPU/fold-memref-alias-ops.mlir
@@ -1,13 +1,13 @@
-// RUN: mlir-opt --amdgpu-fold-memrefs-ops --split-input-file %s | FileCheck %s
+// RUN: mlir-opt --fold-memref-alias-ops --split-input-file %s | FileCheck %s
 
 #gpu_lds_addrspace = #gpu.address_space<workgroup>
 
 // CHECK: func @test_subview_folding
 // CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index
 func.func @test_subview_folding(%offset_i: index, %offset_j: index) {
-  // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
-  // CHECK: %[[MEM:.*]] = memref.alloc() : memref<64x128xf16>
-  // CHECK: %[[C0:.*]] = arith.constant 0 : index
+  // CHECK-DAG: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
+  // CHECK-DAG: %[[MEM:.*]] = memref.alloc() : memref<64x128xf16>
+  // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
   // CHECK: amdgpu.gather_to_lds %[[MEM]][%[[ARG0]], %[[ARG1]]], %[[LOCAL]][%[[C0]], %[[C0]]]
   // CHECK-SAME: vector<8xf16>, memref<64x128xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
 
@@ -30,9 +30,9 @@ func.func @test_subview_folding(%offset_i: index, %offset_j: index) {
 // CHECK: func @subview_folding_offset
 // CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index
 func.func @subview_folding_offset(%offset_i: index, %offset_j: index) {
-  // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
-  // CHECK: %[[MEM:.*]] = memref.alloc() : memref<64x128xf16>
-  // CHECK: %[[C0:.*]] = arith.constant 0 : index
+  // CHECK-DAG: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
+  // CHECK-DAG: %[[MEM:.*]] = memref.alloc() : memref<64x128xf16>
+  // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
   // CHECK: %[[IDX0:.*]] = affine.apply #[[MAP]]()[%[[ARG0]]]
   // CHECK: %[[IDX1:.*]] = affine.apply #[[MAP1]]()[%[[ARG1]]]
   // CHECK: amdgpu.gather_to_lds %[[MEM]][%[[IDX0]], %[[IDX1]]], %[[LOCAL]][%[[C0]], %[[C0]]]
@@ -54,12 +54,11 @@ func.func @subview_folding_offset(%offset_i: index, %offset_j: index) {
 // CHECK: func @test_expand_shape
 // CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index
 func.func @test_expand_shape(%offset_i: index, %offset_j: index) {
-  // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<4096xf16, #gpu.address_space<workgroup>>
-  // CHECK: %[[MEM:.*]] = memref.alloc() : memref<8192xf16>
-  // CHECK: %[[C0:.*]] = arith.constant 0 : index
-  // CHECK: %[[IDXM:.*]] = affine.linearize_index [%[[ARG0]], %[[ARG1]]] by (64, 128) : index
-  // CHECK: %[[IDXL:.*]] = affine.linearize_index [%[[C0]], %[[C0]]] by (64, 64) : index
-  // CHECK: amdgpu.gather_to_lds %[[MEM]][%[[IDXM]]], %[[LOCAL]][%[[IDXL]]]
+  // CHECK-DAG: %[[LOCAL:.*]] = memref.alloc() : memref<4096xf16, #gpu.address_space<workgroup>>
+  // CHECK-DAG: %[[MEM:.*]] = memref.alloc() : memref<8192xf16>
+  // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+  // CHECK: %[[IDXM:.*]] = affine.linearize_index disjoint [%[[ARG0]], %[[ARG1]]] by (64, 128) : index
+  // CHECK: amdgpu.gather_to_lds %[[MEM]][%[[IDXM]]], %[[LOCAL]][%[[C0]]]
   // CHECK-SAME: vector<8xf16>, memref<8192xf16>, memref<4096xf16, #gpu.address_space<workgroup>>
 
   %alloc = memref.alloc() : memref<4096xf16, #gpu_lds_addrspace>
@@ -79,11 +78,10 @@ func.func @test_expand_shape(%offset_i: index, %offset_j: index) {
 // CHECK: func @test_collapse_shape
 // CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index
 func.func @test_collapse_shape(%offset_i: index, %offset_j: index) {
-  // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
-  // CHECK: %[[MEM:.*]] = memref.alloc() : memref<64x128xf16>
-  // CHECK: %[[C0:.*]] = arith.constant 0 : index
-  // CHECK: %[[INDICES_MEM:.*]]:2 = affine.delinearize_index %[[ARG0]] into (128) : index, index
-  // CHECK: %[[INDICES_LDS:.*]]:2 = affine.delinearize_index %[[ARG1]] into (64) : index, index
+  // CHECK-DAG: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
+  // CHECK-DAG: %[[MEM:.*]] = memref.alloc() : memref<64x128xf16>
+  // CHECK: %[[INDICES_MEM:.*]]:2 = affine.delinearize_index %[[ARG0]] into (64, 128) : index, index
+  // CHECK: %[[INDICES_LDS:.*]]:2 = affine.delinearize_index %[[ARG1]] into (64, 64) : index, index
   // CHECK: amdgpu.gather_to_lds %[[MEM]][%[[INDICES_MEM]]#0, %[[INDICES_MEM]]#1], %[[LOCAL]][%[[INDICES_LDS]]#0, %[[INDICES_LDS]]#1]
   // CHECK-SAME: vector<8xf16>, memref<64x128xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
 
@@ -106,8 +104,8 @@ func.func @test_collapse_shape(%offset_i: index, %offset_j: index) {
 // CHECK: func @test_expand_shape_src_raw_buffer
 // CHECK-SAME: %[[ARG0:.*]]: memref<8192xf16, #amdgpu.address_space<fat_raw_buffer>>, %[[ARG1:.*]]: index, %[[ARG2:.*]]: index
 func.func @test_expand_shape_src_raw_buffer(%mem : memref<8192xf16, #amdgpu.address_space<fat_raw_buffer>>, %offset_i: index, %offset_j: index) {
-  // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<4096xf16, #gpu.address_space<workgroup>>
-  // CHECK: %[[C0:.*]] = arith.constant 0 : index
+  // CHECK-DAG: %[[LOCAL:.*]] = memref.alloc() : memref<4096xf16, #gpu.address_space<workgroup>>
+  // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
   // CHECK: %[[IDXM:.*]] = affine.linearize_index [%[[ARG1]], %[[ARG2]]] by (64, 128) : index
   // CHECK: amdgpu.gather_to_lds %[[ARG0]][%[[IDXM]]], %[[LOCAL]][%[[C0]]]
   // CHECK-SAME: vector<8xf16>, memref<8192xf16, #amdgpu.address_space<fat_raw_buffer>>, memref<4096xf16, #gpu.address_space<workgroup>>
@@ -125,13 +123,37 @@ func.func @test_expand_shape_src_raw_buffer(%mem : memref<8192xf16, #amdgpu.addr
 
 #gpu_lds_addrspace = #gpu.address_space<workgroup>
 
+// CHECK: func @test_src_raw_buffer_collapse_shape
+// CHECK-SAME: %[[MEM:.*]]: memref<64x128xf16, #amdgpu.address_space<fat_raw_buffer>>, %[[IDX:.*]]: index
+func.func @test_src_raw_buffer_collapse_shape(%mem : memref<64x128xf16, #amdgpu.address_space<fat_raw_buffer>>, %offset: index) {
+  // CHECK-DAG: %[[LOCAL:.*]] = memref.alloc() : memref<4096xf16, #gpu.address_space<workgroup>>
+  // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+  // CHECK: %[[INDICES:.*]]:2 = affine.delinearize_index %[[IDX]] into (128) : index, index
+  // CHECK: amdgpu.gather_to_lds %[[MEM]][%[[INDICES]]#0, %[[INDICES]]#1], %[[LOCAL]][%[[C0]]]
+  // CHECK-SAME: vector<8xf16>, memref<64x128xf16, #amdgpu.address_space<fat_raw_buffer>>, memref<4096xf16, #gpu.address_space<workgroup>>
+
+  %alloc = memref.alloc() : memref<4096xf16, #gpu_lds_addrspace>
+  %collapse_mem = memref.collapse_shape %mem [[0, 1]]
+    : memref<64x128xf16, #amdgpu.address_space<fat_raw_buffer>>
+    into memref<8192xf16, #amdgpu.address_space<fat_raw_buffer>>
+  %c0 = arith.constant 0 : index
+  amdgpu.gather_to_lds %collapse_mem[%offset], %alloc[%c0]
+    : vector<8xf16>, memref<8192xf16, #amdgpu.address_space<fat_raw_buffer>>,
+      memref<4096xf16, #gpu_lds_addrspace>
+  func.return
+}
+
+// -----
+
+#gpu_lds_addrspace = #gpu.address_space<workgroup>
+
 // CHECK: func @test_expand_shape_dst_only
 // CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index
 func.func @test_expand_shape_dst_only(%offset_i: index, %offset_j: index) {
-  // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<4096xf16, #gpu.address_space<workgroup>>
-  // CHECK: %[[MEM:.*]] = memref.alloc() : memref<8192xf16>
-  // CHECK: %[[C0:.*]] = arith.constant 0 : index
-  // CHECK: %[[IDX_LDS:.*]] = affine.linearize_index [%[[ARG1]], %[[C0]]] by (64, 64) : index
+  // CHECK-DAG: %[[LOCAL:.*]] = memref.alloc() : memref<4096xf16, #gpu.address_space<workgroup>>
+  // CHECK-DAG: %[[MEM:.*]] = memref.alloc() : memref<8192xf16>
+  // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+  // CHECK: %[[IDX_LDS:.*]] = affine.linearize_index disjoint [%[[ARG1]], %[[C0]]] by (64, 64) : index
   // CHECK: amdgpu.gather_to_lds %[[MEM]][%[[ARG0]]], %[[LOCAL]][%[[IDX_LDS]]]
   // CHECK-SAME: vector<8xf16>, memref<8192xf16>, memref<4096xf16, #gpu.address_space<workgroup>>
 
@@ -152,7 +174,7 @@ func.func @test_expand_shape_dst_only(%offset_i: index, %offset_j: index) {
 // CHECK: func @test_nop
 // CHECK-SAME: %[[ARG0:.*]]: memref<8192xf16, #amdgpu.address_space<fat_raw_buffer>>, %[[ARG1:.*]]: index, %[[ARG2:.*]]: index
 func.func @test_nop(%mem : memref<8192xf16, #amdgpu.address_space<fat_raw_buffer>>, %offset_i: index, %offset_j: index) {
-  // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<4096xf16, #gpu.address_space<workgroup>>
+  // CHECK-DAG: %[[LOCAL:.*]] = memref.alloc() : memref<4096xf16, #gpu.address_space<workgroup>>
   // CHECK: amdgpu.gather_to_lds %[[ARG0]][%[[ARG1]]], %[[LOCAL]][%[[ARG2]]]
   // CHECK-SAME: vector<8xf16>, memref<8192xf16, #amdgpu.address_space<fat_raw_buffer>>, memref<4096xf16, #gpu.address_space<workgroup>>
 
@@ -169,9 +191,9 @@ func.func @test_nop(%mem : memref<8192xf16, #amdgpu.address_space<fat_raw_buffer
 // CHECK: func @test_async_flag_preserved
 // CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index
 func.func @test_async_flag_preserved(%offset_i: index, %offset_j: index) {
-  // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
-  // CHECK: %[[MEM:.*]] = memref.alloc() : memref<64x128xf16>
-  // CHECK: %[[C0:.*]] = arith.constant 0 : index
+  // CHECK-DAG: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
+  // CHECK-DAG: %[[MEM:.*]] = memref.alloc() : memref<64x128xf16>
+  // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
   // CHECK: amdgpu.gather_to_lds async %[[MEM]][%[[ARG0]], %[[ARG1]]], %[[LOCAL]][%[[C0]], %[[C0]]]
   // CHECK-SAME: vector<8xf16>, memref<64x128xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
 
@@ -191,7 +213,7 @@ func.func @test_async_flag_preserved(%offset_i: index, %offset_j: index) {
 // CHECK: func @test_transpose_load_subview
 // CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index
 func.func @test_transpose_load_subview(%offset_i: index, %offset_j: index) -> vector<4xf16> {
-  // CHECK: %[[ALLOC:.*]] = memref.alloc() : memref<64x128xf16, #gpu.address_space<workgroup>>
+  // CHECK-DAG: %[[ALLOC:.*]] = memref.alloc() : memref<64x128xf16, #gpu.address_space<workgroup>>
   // CHECK: amdgpu.transpose_load %[[ALLOC]][%[[ARG0]], %[[ARG1]]]
   // CHECK-SAME: memref<64x128xf16, #gpu.address_space<workgroup>> -> vector<4xf16>
 
@@ -213,7 +235,7 @@ func.func @test_transpose_load_subview(%offset_i: index, %offset_j: index) -> ve
 // CHECK: func @test_transpose_load_subview_offset
 // CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index
 func.func @test_transpose_load_subview_offset(%offset_i: index, %offset_j: index) -> vector<4xf16> {
-  // CHECK: %[[ALLOC:.*]] = memref.alloc() : memref<64x128xf16, #gpu.address_space<workgroup>>
+  // CHECK-DAG: %[[ALLOC:.*]] = memref.alloc() : memref<64x128xf16, #gpu.address_space<workgroup>>
   // CHECK: %[[IDX0:.*]] = affine.apply #[[MAP]]()[%[[ARG0]]]
   // CHECK: %[[IDX1:.*]] = affine.apply #[[MAP1]]()[%[[ARG1]]]
   // CHECK: amdgpu.transpose_load %[[ALLOC]][%[[IDX0]], %[[IDX1]]]
@@ -235,8 +257,8 @@ func.func @test_transpose_load_subview_offset(%offset_i: index, %offset_j: index
 // CHECK: func @test_transpose_load_expand_shape
 // CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index
 func.func @test_transpose_load_expand_shape(%offset_i: index, %offset_j: index) -> vector<4xf16> {
-  // CHECK: %[[ALLOC:.*]] = memref.alloc() : memref<4096xf16, #gpu.address_space<workgroup>>
-  // CHECK: %[[IDX:.*]] = affine.linearize_index [%[[ARG0]], %[[ARG1]]] by (32, 128) : index
+  // CHECK-DAG: %[[ALLOC:.*]] = memref.alloc() : memref<4096xf16, #gpu.address_space<workgroup>>
+  // CHECK: %[[IDX:.*]] = affine.linearize_index disjoint [%[[ARG0]], %[[ARG1]]] by (32, 128) : index
   // CHECK: amdgpu.transpose_load %[[ALLOC]][%[[IDX]]]
   // CHECK-SAME: memref<4096xf16, #gpu.address_space<workgroup>> -> vector<4xf16>
 
@@ -255,8 +277,8 @@ func.func @test_transpose_load_expand_shape(%offset_i: index, %offset_j: index)
 // CHECK: func @test_transpose_load_collapse_shape
 // CHECK-SAME: %[[ARG0:.*]]: index
 func.func @test_transpose_load_collapse_shape(%offset_i: index) -> vector<4xf16> {
-  // CHECK: %[[ALLOC:.*]] = memref.alloc() : memref<32x128xf16, #gpu.address_space<workgroup>>
-  // CHECK: %[[INDICES:.*]]:2 = affine.delinearize_index %[[ARG0]] into (128) : index, index
+  // CHECK-DAG: %[[ALLOC:.*]] = memref.alloc() : memref<32x128xf16, #gpu.address_space<workgroup>>
+  // CHECK: %[[INDICES:.*]]:2 = affine.delinearize_index %[[ARG0]] into (32, 128) : index, index
   // CHECK: amdgpu.transpose_load %[[ALLOC]][%[[INDICES]]#0, %[[INDICES]]#1]
   // CHECK-SAME: memref<32x128xf16, #gpu.address_space<workgroup>> -> vector<4xf16>
 
@@ -275,7 +297,7 @@ func.func @test_transpose_load_collapse_shape(%offset_i: index) -> vector<4xf16>
 // CHECK: func @test_transpose_load_nop
 // CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index
 func.func @test_transpose_load_nop(%offset_i: index, %offset_j: index) -> vector<4xf16> {
-  // CHECK: %[[ALLOC:.*]] = memref.alloc() : memref<32x128xf16, #gpu.address_space<workgroup>>
+  // CHECK-DAG: %[[ALLOC:.*]] = memref.alloc() : memref<32x128xf16, #gpu.address_space<workgroup>>
   // CHECK: amdgpu.transpose_load %[[ALLOC]][%[[ARG0]], %[[ARG1]]]
   // CHECK-SAME: memref<32x128xf16, #gpu.address_space<workgroup>> -> vector<4xf16>
   // CHECK-NOT: subview
@@ -294,14 +316,15 @@ func.func @test_transpose_load_nop(%offset_i: index, %offset_j: index) -> vector
 
 // CHECK: func @test_make_dma_base_subview
 // CHECK-SAME: %[[MEM:.*]]: memref<64x128xf16, #gpu.address_space<global>>, %[[LDS:.*]]: memref<64x64xf16, #gpu.address_space<workgroup>>, %[[GLOBAL_I:.*]]: index, %[[GLOBAL_J:.*]]: index, %[[LDS_I:.*]]: index, %[[LDS_J:.*]]: index
-func.func @test_make_dma_base_subview(%mem: memref<64x128xf16, #gpu_global_addrspace>, %lds: memref<64x64xf16, #gpu_lds_addrspace>, %global_i: index, %global_j: index, %lds_i: index, %lds_j: index) {
-  // CHECK: amdgpu.make_dma_base %[[MEM]][%[[GLOBAL_I]], %[[GLOBAL_J]]], %[[LDS]][%[[LDS_I]], %[[LDS_J]]]
+func.func @test_make_dma_base_subview(%mem: memref<64x128xf16, #gpu_global_addrspace>, %lds: memref<64x64xf16, #gpu_lds_addrspace>, %global_i: index, %global_j: index, %lds_i: index, %lds_j: index) -> !amdgpu.tdm_base<f16> {
+  // CHECK: %[[BASE:.*]] = amdgpu.make_dma_base %[[MEM]][%[[GLOBAL_I]], %[[GLOBAL_J]]], %[[LDS]][%[[LDS_I]], %[[LDS_J]]]
   // CHECK-SAME: memref<64x128xf16, #gpu.address_space<global>>, memref<64x64xf16, #gpu.address_space<workgroup>> -> !amdgpu.tdm_base<f16>
+  // CHECK: return %[[BASE]] : !amdgpu.tdm_base<f16>
 
   %subview = memref.subview %mem[0, 0][32, 64][1, 1] : memref<64x128xf16, #gpu_global_addrspace> to memref<32x64xf16, strided<[128, 1]>, #gpu_global_addrspace>
   %base = amdgpu.make_dma_base %subview[%global_i, %global_j], %lds[%lds_i, %lds_j]
     : memref<32x64xf16, strided<[128, 1]>, #gpu_global_addrspace>, memref<64x64xf16, #gpu_lds_addrspace> -> !amdgpu.tdm_base<f16>
-  func.return
+  func.return %base : !amdgpu.tdm_base<f16>
 }
 
 // -----
@@ -311,15 +334,16 @@ func.func @test_make_dma_base_subview(%mem: memref<64x128xf16, #gpu_global_addrs
 
 // CHECK: func @test_make_dma_base_expand_shape
 // CHECK-SAME: %[[MEM:.*]]: memref<64x128xf16, #gpu.address_space<global>>, %[[LDS:.*]]: memref<4096xf16, #gpu.address_space<workgroup>>, %[[GLOBAL_I:.*]]: index, %[[GLOBAL_J:.*]]: index, %[[LDS_I:.*]]: index, %[[LDS_J:.*]]: index
-func.func @test_make_dma_base_expand_shape(%mem: memref<64x128xf16, #gpu_global_addrspace>, %lds: memref<4096xf16, #gpu_lds_addrspace>, %global_i: index, %global_j: index, %lds_i: index, %lds_j: index) {
-  // CHECK: %[[IDX:.*]] = affine.linearize_index [%[[LDS_I]], %[[LDS_J]]] by (64, 64) : index
-  // CHECK: amdgpu.make_dma_base %[[MEM]][%[[GLOBAL_I]], %[[GLOBAL_J]]], %[[LDS]][%[[IDX]]]
+func.func @test_make_dma_base_expand_shape(%mem: memref<64x128xf16, #gpu_global_addrspace>, %lds: memref<4096xf16, #gpu_lds_addrspace>, %global_i: index, %global_j: index, %lds_i: index, %lds_j: index) -> !amdgpu.tdm_base<f16> {
+  // CHECK: %[[IDX:.*]] = affine.linearize_index disjoint [%[[LDS_I]], %[[LDS_J]]] by (64, 64) : index
+  // CHECK: %[[BASE:.*]] = amdgpu.make_dma_base %[[MEM]][%[[GLOBAL_I]], %[[GLOBAL_J]]], %[[LDS]][%[[IDX]]]
   // CHECK-SAME: memref<64x128xf16, #gpu.address_space<global>>, memref<4096xf16, #gpu.address_space<workgroup>> -> !amdgpu.tdm_base<f16>
+  // CHECK: return %[[BASE]] : !amdgpu.tdm_base<f16>
 
   %expand_lds = memref.expand_shape %lds [[0, 1]] output_shape [64, 64] : memref<4096xf16, #gpu_lds_addrspace> into memref<64x64xf16, #gpu_lds_addrspace>
   %base = amdgpu.make_dma_base %mem[%global_i, %global_j], %expand_lds[%lds_i, %lds_j]
     : memref<64x128xf16, #gpu_global_addrspace>, memref<64x64xf16, #gpu_lds_addrspace> -> !amdgpu.tdm_base<f16>
-  func.return
+  func.return %base : !amdgpu.tdm_base<f16>
 }
 
 // -----
@@ -329,14 +353,15 @@ func.func @test_make_dma_base_expand_shape(%mem: memref<64x128xf16, #gpu_global_
 
 // CHECK: func @test_make_gather_dma_base_subview
 // CHECK-SAME: %[[MEM:.*]]: memref<64x128xf16, #gpu.address_space<global>>, %[[LDS:.*]]: memref<64x64xf16, #gpu.address_space<workgroup>>, %[[GLOBAL_I:.*]]: index, %[[GLOBAL_J:.*]]: index, %[[LDS_I:.*]]: index, %[[LDS_J:.*]]: index
-func.func @test_make_gather_dma_base_subview(%mem: memref<64x128xf16, #gpu_global_addrspace>, %lds: memref<64x64xf16, #gpu_lds_addrspace>, %global_i: index, %global_j: index, %lds_i: index, %lds_j: index) {
-  // CHECK: amdgpu.make_gather_dma_base %[[MEM]][%[[GLOBAL_I]], %[[GLOBAL_J]]], %[[LDS]][%[[LDS_I]], %[[LDS_J]]]
+func.func @test_make_gather_dma_base_subview(%mem: memref<64x128xf16, #gpu_global_addrspace>, %lds: memref<64x64xf16, #gpu_lds_addrspace>, %global_i: index, %global_j: index, %lds_i: index, %lds_j: index) -> !amdgpu.tdm_gather_base<f16, i16> {
+  // CHECK: %[[BASE:.*]] = amdgpu.make_gather_dma_base %[[MEM]][%[[GLOBAL_I]], %[[GLOBAL_J]]], %[[LDS]][%[[LDS_I]], %[[LDS_J]]]
   // CHECK-SAME: memref<64x128xf16, #gpu.address_space<global>>, memref<64x64xf16, #gpu.address_space<workgroup>> -> !amdgpu.tdm_gather_base<f16, i16>
+  // CHECK: return %[[BASE]] : !amdgpu.tdm_gather_base<f16, i16>
 
   %subview = memref.subview %mem[0, 0][32, 64][1, 1] : memref<64x128xf16, #gpu_global_addrspace> to memref<32x64xf16, strided<[128, 1]>, #gpu_global_addrspace>
   %base = amdgpu.make_gather_dma_base %subview[%global_i, %global_j], %lds[%lds_i, %lds_j]
     : memref<32x64xf16, strided<[128, 1]>, #gpu_global_addrspace>, memref<64x64xf16, #gpu_lds_addrspace> -> !amdgpu.tdm_gather_base<f16, i16>
-  func.return
+  func.return %base : !amdgpu.tdm_gather_base<f16, i16>
 }
 
 // -----
@@ -346,15 +371,16 @@ func.func @test_make_gather_dma_base_subview(%mem: memref<64x128xf16, #gpu_globa
 
 // CHECK: func @test_make_gather_dma_base_collapse_shape
 // CHECK-SAME: %[[MEM:.*]]: memref<64x128xf16, #gpu.address_space<global>>, %[[LDS:.*]]: memref<64x64xf16, #gpu.address_space<workgroup>>, %[[GLOBAL_I:.*]]: index, %[[GLOBAL_J:.*]]: index, %[[LDS_IDX:.*]]: index
-func.func @test_make_gather_dma_base_collapse_shape(%mem: memref<64x128xf16, #gpu_global_addrspace>, %lds: memref<64x64xf16, #gpu_lds_addrspace>, %global_i: index, %global_j: index, %lds_idx: index) {
-  // CHECK: %[[INDICES:.*]]:2 = affine.delinearize_index %[[LDS_IDX]] into (64) : index, index
-  // CHECK: amdgpu.make_gather_dma_base %[[MEM]][%[[GLOBAL_I]], %[[GLOBAL_J]]], %[[LDS]][%[[INDICES]]#0, %[[INDICES]]#1]
+func.func @test_make_gather_dma_base_collapse_shape(%mem: memref<64x128xf16, #gpu_global_addrspace>, %lds: memref<64x64xf16, #gpu_lds_addrspace>, %global_i: index, %global_j: index, %lds_idx: index) -> !amdgpu.tdm_gather_base<f16, i16> {
+  // CHECK: %[[INDICES:.*]]:2 = affine.delinearize_index %[[LDS_IDX]] into (64, 64) : index, index
+  // CHECK: %[[BASE:.*]] = amdgpu.make_gather_dma_base %[[MEM]][%[[GLOBAL_I]], %[[GLOBAL_J]]], %[[LDS]][%[[INDICES]]#0, %[[INDICES]]#1]
   // CHECK-SAME: memref<64x128xf16, #gpu.address_space<global>>, memref<64x64xf16, #gpu.address_space<workgroup>> -> !amdgpu.tdm_gather_base<f16, i16>
+  // CHECK: return %[[BASE]] : !amdgpu.tdm_gather_base<f16, i16>
 
   %collapse_lds = memref.collapse_shape %lds [[0, 1]] : memref<64x64xf16, #gpu_lds_addrspace> into memref<4096xf16, #gpu_lds_addrspace>
   %base = amdgpu.make_gather_dma_base %mem[%global_i, %global_j], %collapse_lds[%lds_idx]
     : memref<64x128xf16, #gpu_global_addrspace>, memref<4096xf16, #gpu_lds_addrspace> -> !amdgpu.tdm_gather_base<f16, i16>
-  func.return
+  func.return %base : !amdgpu.tdm_gather_base<f16, i16>
 }
 
 // -----
@@ -367,18 +393,19 @@ func.func @test_make_gather_dma_base_collapse_shape(%mem: memref<64x128xf16, #gp
 
 // CHECK: func @test_make_dma_base_both_fold
 // CHECK-SAME: %[[MEM:.*]]: memref<64x128xf16, #gpu.address_space<global>>, %[[LDS:.*]]: memref<4096xf16, #gpu.address_space<workgroup>>, %[[GLOBAL_I:.*]]: index, %[[GLOBAL_J:.*]]: index, %[[LDS_I:.*]]: index, %[[LDS_J:.*]]: index
-func.func @test_make_dma_base_both_fold(%mem: memref<64x128xf16, #gpu_global_addrspace>, %lds: memref<4096xf16, #gpu_lds_addrspace>, %global_i: index, %global_j: index, %lds_i: index, %lds_j: index) {
+func.func @test_make_dma_base_both_fold(%mem: memref<64x128xf16, #gpu_global_addrspace>, %lds: memref<4096xf16, #gpu_lds_addrspace>, %global_i: index, %global_j: index, %lds_i: index, %lds_j: index) -> !amdgpu.tdm_base<f16> {
   // CHECK: %[[GI:.*]] = affine.apply #[[BOTH_MAP]]()[%[[GLOBAL_I]]]
   // CHECK: %[[GJ:.*]] = affine.apply #[[BOTH_MAP1]]()[%[[GLOBAL_J]]]
-  // CHECK: %[[IDX:.*]] = affine.linearize_index [%[[LDS_I]], %[[LDS_J]]] by (64, 64) : index
-  // CHECK: amdgpu.make_dma_base %[[MEM]][%[[GI]], %[[GJ]]], %[[LDS]][%[[IDX]]]
+  // CHECK: %[[IDX:.*]] = affine.linearize_index disjoint [%[[LDS_I]], %[[LDS_J]]] by (64, 64) : index
+  // CHECK: %[[BASE:.*]] = amdgpu.make_dma_base %[[MEM]][%[[GI]], %[[GJ]]], %[[LDS]][%[[IDX]]]
   // CHECK-SAME: memref<64x128xf16, #gpu.address_space<global>>, memref<4096xf16, #gpu.address_space<workgroup>> -> !amdgpu.tdm_base<f16>
+  // CHECK: return %[[BASE]] : !amdgpu.tdm_base<f16>
 
   %subview = memref.subview %mem[32, 64][32, 64][1, 1] : memref<64x128xf16, #gpu_global_addrspace> to memref<32x64xf16, strided<[128, 1], offset: 4160>, #gpu_global_addrspace>
   %expand_lds = memref.expand_shape %lds [[0, 1]] output_shape [64, 64] : memref<4096xf16, #gpu_lds_addrspace> into memref<64x64xf16, #gpu_lds_addrspace>
   %base = amdgpu.make_dma_base %subview[%global_i, %global_j], %expand_lds[%lds_i, %lds_j]
     : memref<32x64xf16, strided<[128, 1], offset: 4160>, #gpu_global_addrspace>, memref<64x64xf16, #gpu_lds_addrspace> -> !amdgpu.tdm_base<f16>
-  func.return
+  func.return %base : !amdgpu.tdm_base<f16>
 }
 
 // -----
@@ -388,16 +415,17 @@ func.func @test_make_dma_base_both_fold(%mem: memref<64x128xf16, #gpu_global_add
 
 // CHECK: func @test_make_dma_base_nop
 // CHECK-SAME: %[[MEM:.*]]: memref<64x128xf16, #gpu.address_space<global>>, %[[LDS:.*]]: memref<64x64xf16, #gpu.address_space<workgroup>>, %[[GLOBAL_I:.*]]: index, %[[GLOBAL_J:.*]]: index, %[[LDS_I:.*]]: index, %[[LDS_J:.*]]: index
-func.func @test_make_dma_base_nop(%mem: memref<64x128xf16, #gpu_global_addrspace>, %lds: memref<64x64xf16, #gpu_lds_addrspace>, %global_i: index, %global_j: index, %lds_i: index, %lds_j: index) {
-  // CHECK: amdgpu.make_dma_base %[[MEM]][%[[GLOBAL_I]], %[[GLOBAL_J]]], %[[LDS]][%[[LDS_I]], %[[LDS_J]]]
+func.func @test_make_dma_base_nop(%mem: memref<64x128xf16, #gpu_global_addrspace>, %lds: memref<64x64xf16, #gpu_lds_addrspace>, %global_i: index, %global_j: index, %lds_i: index, %lds_j: index) -> !amdgpu.tdm_base<f16> {
+  // CHECK: %[[BASE:.*]] = amdgpu.make_dma_base %[[MEM]][%[[GLOBAL_I]], %[[GLOBAL_J]]], %[[LDS]][%[[LDS_I]], %[[LDS_J]]]
   // CHECK-SAME: memref<64x128xf16, #gpu.address_space<global>>, memref<64x64xf16, #gpu.address_space<workgroup>> -> !amdgpu.tdm_base<f16>
+  // CHECK: return %[[BASE]] : !amdgpu.tdm_base<f16>
   // CHECK-NOT: subview
   // CHECK-NOT: expand_shape
   // CHECK-NOT: collapse_shape
 
   %base = amdgpu.make_dma_base %mem[%global_i, %global_j], %lds[%lds_i, %lds_j]
     : memref<64x128xf16, #gpu_global_addrspace>, memref<64x64xf16, #gpu_lds_addrspace> -> !amdgpu.tdm_base<f16>
-  func.return
+  func.return %base : !amdgpu.tdm_base<f16>
 }
 
 // -----
@@ -407,16 +435,17 @@ func.func @test_make_dma_base_nop(%mem: memref<64x128xf16, #gpu_global_addrspace
 
 // CHECK: func @test_make_gather_dma_base_nop
 // CHECK-SAME: %[[MEM:.*]]: memref<64x128xf16, #gpu.address_space<global>>, %[[LDS:.*]]: memref<64x64xf16, #gpu.address_space<workgroup>>, %[[GLOBAL_I:.*]]: index, %[[GLOBAL_J:.*]]: index, %[[LDS_I:.*]]: index, %[[LDS_J:.*]]: index
-func.func @test_make_gather_dma_base_nop(%mem: memref<64x128xf16, #gpu_global_addrspace>, %lds: memref<64x64xf16, #gpu_lds_addrspace>, %global_i: index, %global_j: index, %lds_i: index, %lds_j: index) {
-  // CHECK: amdgpu.make_gather_dma_base %[[MEM]][%[[GLOBAL_I]], %[[GLOBAL_J]]], %[[LDS]][%[[LDS_I]], %[[LDS_J]]]
+func.func @test_make_gather_dma_base_nop(%mem: memref<64x128xf16, #gpu_global_addrspace>, %lds: memref<64x64xf16, #gpu_lds_addrspace>, %global_i: index, %global_j: index, %lds_i: index, %lds_j: index) -> !amdgpu.tdm_gather_base<f16, i16> {
+  // CHECK: %[[BASE:.*]] = amdgpu.make_gather_dma_base %[[MEM]][%[[GLOBAL_I]], %[[GLOBAL_J]]], %[[LDS]][%[[LDS_I]], %[[LDS_J]]]
   // CHECK-SAME: memref<64x128xf16, #gpu.address_space<global>>, memref<64x64xf16, #gpu.address_space<workgroup>> -> !amdgpu.tdm_gather_base<f16, i16>
+  // CHECK: return %[[BASE]] : !amdgpu.tdm_gather_base<f16, i16>
   // CHECK-NOT: subview
   // CHECK-NOT: expand_shape
   // CHECK-NOT: collapse_shape
 
   %base = amdgpu.make_gather_dma_base %mem[%global_i, %global_j], %lds[%lds_i, %lds_j]
     : memref<64x128xf16, #gpu_global_addrspace>, memref<64x64xf16, #gpu_lds_addrspace> -> !amdgpu.tdm_gather_base<f16, i16>
-  func.return
+  func.return %base : !amdgpu.tdm_gather_base<f16, i16>
 }
 
 // -----
@@ -456,7 +485,7 @@ func.func @test_global_load_async_to_lds_both_fold_masked(%src: memref<64x128xf3
 // CHECK: func @test_global_load_async_to_lds_no_mask_dst_collapse
 // CHECK-SAME: %[[SRC:.*]]: memref<8192xi32, #gpu.address_space<global>>, %[[LDS:.*]]: memref<64x64xi32, #gpu.address_space<workgroup>>, %[[SRC_IDX:.*]]: index, %[[DST_IDX:.*]]: index
 func.func @test_global_load_async_to_lds_no_mask_dst_collapse(%src: memref<8192xi32, #gpu_global_addrspace>, %lds: memref<64x64xi32, #gpu_lds_addrspace>, %src_idx: index, %dst_idx: index) {
-  // CHECK: %[[INDICES:.*]]:2 = affine.delinearize_index %[[DST_IDX]] into (64) : index, index
+  // CHECK: %[[INDICES:.*]]:2 = affine.delinearize_index %[[DST_IDX]] into (64, 64) : index, index
   // CHECK: amdgpu.global_load_async_to_lds %[[SRC]][%[[SRC_IDX]]], %[[LDS]][%[[INDICES]]#0, %[[INDICES]]#1] :
   // CHECK-SAME: i32, memref<8192xi32, #gpu.address_space<global>>, memref<64x64xi32, #gpu.address_space<workgroup>>
 
@@ -468,3 +497,148 @@ func.func @test_global_load_async_to_lds_no_mask_dst_collapse(%src: memref<8192x
       memref<4096xi32, #gpu_lds_addrspace>
   func.return
 }
+
+// -----
+
+#gpu_lds_addrspace = #gpu.address_space<workgroup>
+#gpu_global_addrspace = #gpu.address_space<global>
+
+// CHECK: func @test_global_load_async_to_lds_masked_dst_collapse
+// CHECK-SAME: %[[SRC:.*]]: memref<8192xi32, #gpu.address_space<global>>, %[[LDS:.*]]: memref<64x64xi32, #gpu.address_space<workgroup>>, %[[SRC_IDX:.*]]: index, %[[DST_IDX:.*]]: index, %[[MASK:.*]]: i1
+func.func @test_global_load_async_to_lds_masked_dst_collapse(%src: memref<8192xi32, #gpu_global_addrspace>, %lds: memref<64x64xi32, #gpu_lds_addrspace>, %src_idx: index, %dst_idx: index, %mask: i1) {
+  // CHECK: %[[INDICES:.*]]:2 = affine.delinearize_index %[[DST_IDX]] into (64) : index, index
+  // CHECK: amdgpu.global_load_async_to_lds %[[SRC]][%[[SRC_IDX]]], %[[LDS]][%[[INDICES]]#0, %[[INDICES]]#1], %[[MASK]] :
+  // CHECK-SAME: i32, memref<8192xi32, #gpu.address_space<global>>, memref<64x64xi32, #gpu.address_space<workgroup>>
+
+  %collapse_lds = memref.collapse_shape %lds [[0, 1]]
+    : memref<64x64xi32, #gpu_lds_addrspace>
+    into memref<4096xi32, #gpu_lds_addrspace>
+  amdgpu.global_load_async_to_lds %src[%src_idx], %collapse_lds[%dst_idx], %mask
+    : i32, memref<8192xi32, #gpu_global_addrspace>,
+      memref<4096xi32, #gpu_lds_addrspace>
+  func.return
+}
+
+// -----
+
+#gpu_global_addrspace = #gpu.address_space<global>
+
+// CHECK: func @test_global_transpose_load_expand_shape
+// CHECK-SAME: %[[SRC:.*]]: memref<8192xf16, #gpu.address_space<global>>, %[[I:.*]]: index, %[[J:.*]]: index
+func.func @test_global_transpose_load_expand_shape(%src: memref<8192xf16, #gpu_global_addrspace>, %i: index, %j: index) -> vector<8xf16> {
+  // CHECK: %[[IDX:.*]] = affine.linearize_index disjoint [%[[I]], %[[J]]] by (64, 128) : index
+  // CHECK: amdgpu.global_transpose_load %[[SRC]][%[[IDX]]]
+  // CHECK-SAME: memref<8192xf16, #gpu.address_space<global>> -> vector<8xf16>
+
+  %expand_src = memref.expand_shape %src [[0, 1]] output_shape [64, 128]
+    : memref<8192xf16, #gpu_global_addrspace>
+    into memref<64x128xf16, #gpu_global_addrspace>
+  %result = amdgpu.global_transpose_load %expand_src[%i, %j]
+    : memref<64x128xf16, #gpu_global_addrspace> -> vector<8xf16>
+  return %result : vector<8xf16>
+}
+
+// -----
+
+#gpu_wg = #gpu.address_space<workgroup>
+
+// CHECK: func @test_ds_barrier_ops_expand_shape
+// CHECK-SAME: %[[BARRIER:.*]]: memref<16x!amdgpu.ds_barrier_state, #gpu.address_space<workgroup>>, %[[I:.*]]: index, %[[J:.*]]: index, %[[PARTICIPANTS:.*]]: i32, %[[COUNT:.*]]: i64
+func.func @test_ds_barrier_ops_expand_shape(%barrier: memref<16x!amdgpu.ds_barrier_state, #gpu_wg>, %i: index, %j: index, %participants: i32, %count: i64) -> !amdgpu.ds_barrier_state {
+  // CHECK: %[[INIT_IDX:.*]] = affine.linearize_index disjoint [%[[I]], %[[J]]] by (4, 4) : index
+  // CHECK: amdgpu.ds_barrier_init %[[BARRIER]][%[[INIT_IDX]]], %[[PARTICIPANTS]]
+  // CHECK-SAME: memref<16x!amdgpu.ds_barrier_state, #gpu.address_space<workgroup>>, i32
+  // CHECK: %[[POLL_IDX:.*]] = affine.linearize_index disjoint [%[[I]], %[[J]]] by (4, 4) : index
+  // CHECK: %[[STATE:.*]] = amdgpu.ds_barrier_poll_state %[[BARRIER]][%[[POLL_IDX]]]
+  // CHECK-SAME: memref<16x!amdgpu.ds_barrier_state, #gpu.address_space<workgroup>> -> !amdgpu.ds_barrier_state
+  // CHECK: %[[ASYNC_IDX:.*]] = affine.linearize_index disjoint [%[[I]], %[[J]]] by (4, 4) : index
+  // CHECK: amdgpu.ds_async_barrier_arrive %[[BARRIER]][%[[ASYNC_IDX]]]
+  // CHECK-SAME: memref<16x!amdgpu.ds_barrier_state, #gpu.address_space<workgroup>>
+  // CHECK: %[[ARRIVE_IDX:.*]] = affine.linearize_index disjoint [%[[I]], %[[J]]] by (4, 4) : index
+  // CHECK: %[[OLD:.*]] = amdgpu.ds_barrier_arrive %[[BARRIER]][%[[ARRIVE_IDX]]], %[[COUNT]]
+  // CHECK-SAME: memref<16x!amdgpu.ds_barrier_state, #gpu.address_space<workgroup>>, i64 -> !amdgpu.ds_barrier_state
+
+  %expand_barrier = memref.expand_shape %barrier [[0, 1]] output_shape [4, 4]
+    : memref<16x!amdgpu.ds_barrier_state, #gpu_wg>
+    into memref<4x4x!amdgpu.ds_barrier_state, #gpu_wg>
+  amdgpu.ds_barrier_init %expand_barrier[%i, %j], %participants
+    : memref<4x4x!amdgpu.ds_barrier_state, #gpu_wg>, i32
+  %state = amdgpu.ds_barrier_poll_state %expand_barrier[%i, %j]
+    : memref<4x4x!amdgpu.ds_barrier_state, #gpu_wg> -> !amdgpu.ds_barrier_state
+  amdgpu.ds_async_barrier_arrive %expand_barrier[%i, %j]
+    : memref<4x4x!amdgpu.ds_barrier_state, #gpu_wg>
+  %old_state = amdgpu.ds_barrier_arrive %expand_barrier[%i, %j], %count
+    : memref<4x4x!amdgpu.ds_barrier_state, #gpu_wg>, i64 -> !amdgpu.ds_barrier_state
+  return %old_state : !amdgpu.ds_barrier_state
+}
+
+// -----
+
+#gpu_wg = #gpu.address_space<workgroup>
+
+// CHECK: func @test_make_dma_descriptor_atomic_barrier_expand_shape
+// CHECK-SAME: %[[BASE:.*]]: !amdgpu.tdm_base<i32>, %[[BARRIER:.*]]: memref<16x!amdgpu.ds_barrier_state, #gpu.address_space<workgroup>>, %[[I:.*]]: index, %[[J:.*]]: index
+func.func @test_make_dma_descriptor_atomic_barrier_expand_shape(%base: !amdgpu.tdm_base<i32>, %barrier: memref<16x!amdgpu.ds_barrier_state, #gpu_wg>, %i: index, %j: index) -> !amdgpu.tdm_descriptor {
+  // CHECK: %[[BARRIER_IDX:.*]] = affine.linearize_index disjoint [%[[I]], %[[J]]] by (4, 4) : index
+  // CHECK: %[[DESC:.*]] = amdgpu.make_dma_descriptor %[[BASE]]
+  // CHECK-SAME: globalSize [64, 64]
+  // CHECK-SAME: globalStride [64, 1]
+  // CHECK-SAME: sharedSize [64, 64]
+  // CHECK-SAME: atomicBarrier(%[[BARRIER]][%[[BARRIER_IDX]]] : memref<16x!amdgpu.ds_barrier_state, #gpu.address_space<workgroup>>)
+
+  %expand_barrier = memref.expand_shape %barrier [[0, 1]] output_shape [4, 4]
+    : memref<16x!amdgpu.ds_barrier_state, #gpu_wg>
+    into memref<4x4x!amdgpu.ds_barrier_state, #gpu_wg>
+  %desc = amdgpu.make_dma_descriptor %base
+    globalSize [64, 64]
+    globalStride [64, 1]
+    sharedSize [64, 64]
+    atomicBarrier(%expand_barrier[%i, %j] : memref<4x4x!amdgpu.ds_barrier_state, #gpu_wg>)
+    : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+  return %desc : !amdgpu.tdm_descriptor
+}
+
+// -----
+
+#gpu_wg = #gpu.address_space<workgroup>
+
+// CHECK: func @test_make_gather_dma_descriptor_atomic_barrier_expand_shape
+// CHECK-SAME: %[[BASE:.*]]: !amdgpu.tdm_gather_base<i32, i32>, %[[INDICES:.*]]: vector<8xi32>, %[[BARRIER:.*]]: memref<16x!amdgpu.ds_barrier_state, #gpu.address_space<workgroup>>, %[[I:.*]]: index, %[[J:.*]]: index
+func.func @test_make_gather_dma_descriptor_atomic_barrier_expand_shape(%base: !amdgpu.tdm_gather_base<i32, i32>, %indices: vector<8xi32>, %barrier: memref<16x!amdgpu.ds_barrier_state, #gpu_wg>, %i: index, %j: index) -> !amdgpu.tdm_descriptor {
+  // CHECK: %[[BARRIER_IDX:.*]] = affine.linearize_index disjoint [%[[I]], %[[J]]] by (4, 4) : index
+  // CHECK: %[[DESC:.*]] = amdgpu.make_gather_dma_descriptor %[[BASE]][%[[INDICES]]]
+  // CHECK-SAME: globalSize [64, 64]
+  // CHECK-SAME: globalStride [64, 1]
+  // CHECK-SAME: sharedSize [64, 64]
+  // CHECK-SAME: atomicBarrier(%[[BARRIER]][%[[BARRIER_IDX]]] : memref<16x!amdgpu.ds_barrier_state, #gpu.address_space<workgroup>>)
+
+  %expand_barrier = memref.expand_shape %barrier [[0, 1]] output_shape [4, 4]
+    : memref<16x!amdgpu.ds_barrier_state, #gpu_wg>
+    into memref<4x4x!amdgpu.ds_barrier_state, #gpu_wg>
+  %desc = amdgpu.make_gather_dma_descriptor %base[%indices]
+    globalSize [64, 64]
+    globalStride [64, 1]
+    sharedSize [64, 64]
+    atomicBarrier(%expand_barrier[%i, %j] : memref<4x4x!amdgpu.ds_barrier_state, #gpu_wg>)
+    : !amdgpu.tdm_gather_base<i32, i32>, vector<8xi32> -> !amdgpu.tdm_descriptor
+  return %desc : !amdgpu.tdm_descriptor
+}
+
+// -----
+
+// CHECK: func @test_make_dma_descriptor_no_atomic_barrier
+// CHECK-SAME: %[[BASE:.*]]: !amdgpu.tdm_base<i32>
+func.func @test_make_dma_descriptor_no_atomic_barrier(%base: !amdgpu.tdm_base<i32>) -> !amdgpu.tdm_descriptor {
+  // CHECK: %[[DESC:.*]] = amdgpu.make_dma_descriptor %[[BASE]]
+  // CHECK-SAME: globalSize [64, 64]
+  // CHECK-SAME: globalStride [64, 1]
+  // CHECK-SAME: sharedSize [64, 64]
+  // CHECK-NOT: atomicBarrier
+
+  %desc = amdgpu.make_dma_descriptor %base
+    globalSize [64, 64]
+    globalStride [64, 1]
+    sharedSize [64, 64]
+    : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+  return %desc : !amdgpu.tdm_descriptor
+}
diff --git a/mlir/test/Dialect/AMDGPU/invalid.mlir b/mlir/test/Dialect/AMDGPU/invalid.mlir
index 4e4cfe53298c7..d176e44736315 100644
--- a/mlir/test/Dialect/AMDGPU/invalid.mlir
+++ b/mlir/test/Dialect/AMDGPU/invalid.mlir
@@ -377,6 +377,16 @@ func.func @gather_to_lds_wrong_num_indices(%idx : index,
 
 // -----
 
+func.func @gather_to_lds_bad_integer_address_space(%idx : index,
+    %src : memref<32xf16, 8>,
+    %dst : memref<32xf16, #gpu.address_space<workgroup>>) {
+  // expected-error@+1 {{'amdgpu.gather_to_lds' op source memory address space must be global or fat raw buffer}}
+  amdgpu.gather_to_lds %src[%idx], %dst[%idx] : vector<2xf16>, memref<32xf16, 8>, memref<32xf16, #gpu.address_space<workgroup>>
+  func.return
+}
+
+// -----
+
 func.func @global_load_async_to_lds_non_lds(%idx1 : index,
     %mem1 : memref<32xf32, #gpu.address_space<global>>,
     %mem2 : memref<32xf32>) {
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index b05627d6ee967..2aaa61d14db95 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -700,6 +700,15 @@ func.func @gather_to_lds(%idx1 : index, %idx2 : index, %mem1 : memref<32xf16>, %
   func.return
 }
 
+// CHECK-LABEL: func @gather_to_lds_integer_fat_raw_buffer_address_space
+func.func @gather_to_lds_integer_fat_raw_buffer_address_space(%idx : index,
+    %mem : memref<32xf16, 7>,
+    %smem : memref<32xf16, #gpu.address_space<workgroup>>) {
+  // CHECK: amdgpu.gather_to_lds
+  amdgpu.gather_to_lds %mem[%idx], %smem[%idx] : vector<2xf16>, memref<32xf16, 7>, memref<32xf16, #gpu.address_space<workgroup>>
+  func.return
+}
+
 // CHECK-LABEL: func @gather_to_lds_0d
 func.func @gather_to_lds_0d(%mem1 : memref<f16>, %smem1 : memref<f16, #gpu.address_space<workgroup>>) {
   // CHECK: amdgpu.gather_to_lds async %{{.*}}[], %{{.*}}[]