[Mlir-commits] [mlir] [mlir][memref][NVGPU] Move NVGPU ops to IndexedAccessOpInterface (PR #190430)
Krzysztof Drewniak
llvmlistbot at llvm.org
Fri Apr 3 16:31:35 PDT 2026
https://github.com/krzysz00 created https://github.com/llvm/llvm-project/pull/190430
This removes the need for the memref dialect to know about nvgpu operations (though we still haven't converted
ExtractAddressComputations to t new interface, so we can't remove the dependency just yet).
ldmatrix is defined to access a 1-D region of memory in order to enable folding in arbitrary expand_ and collpapse_shapes, as its underlying lowering is jut a scalar getStridedElementPtr()
>From 5fb8381b61d4a6d40d93139e8c918dd4cc874d67 Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
Date: Fri, 3 Apr 2026 23:20:49 +0000
Subject: [PATCH] [mlir][memref][NVGPU] Move NVGPU ops to
IndexedAccessOpInterface
This removes the need for the memref dialect to know about nvgpu
operations (though we still haven't converted
ExtractAddressComputations to t new interface, so we can't remove the
dependency just yet).
ldmatrix is defined to access a 1-D region of memory in order to
enable folding in arbitrary expand_ and collpapse_shapes, as its
underlying lowering is jut a scalar getStridedElementPtr()
---
.../include/mlir/Dialect/NVGPU/IR/NVGPUOps.td | 2 +
.../Transforms/MemoryAccessOpInterfacesImpl.h | 21 +++++
.../MemRef/Transforms/FoldMemRefAliasOps.cpp | 76 +--------------
mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp | 4 +
.../Dialect/NVGPU/Transforms/CMakeLists.txt | 1 +
.../MemoryAccessOpInterfacesImpl.cpp | 93 +++++++++++++++++++
mlir/lib/RegisterAllDialects.cpp | 2 +
.../Dialect/MemRef/fold-memref-alias-ops.mlir | 80 ----------------
.../Dialect/NVGPU/fold-memref-alias-ops.mlir | 93 +++++++++++++++++++
9 files changed, 217 insertions(+), 155 deletions(-)
create mode 100644 mlir/include/mlir/Dialect/NVGPU/Transforms/MemoryAccessOpInterfacesImpl.h
create mode 100644 mlir/lib/Dialect/NVGPU/Transforms/MemoryAccessOpInterfacesImpl.cpp
create mode 100644 mlir/test/Dialect/NVGPU/fold-memref-alias-ops.mlir
diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td
index 73d86283a5940..4c11725405ea5 100644
--- a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td
+++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td
@@ -30,6 +30,7 @@ include "mlir/Dialect/NVGPU/IR/NVGPUTypes.td"
class NVGPU_Op<string mnemonic, list<Trait> traits = []> :
Op<NVGPU_Dialect, mnemonic, traits> {}
+// Promises IndexedAccessOpInterface.
def NVGPU_LdMatrixOp : NVGPU_Op<"ldmatrix", [
MemoryEffects<[MemRead]>,
PredOpTrait<"srcMemref and res have same element type",
@@ -183,6 +184,7 @@ def NVGPU_MmaSparseSyncOp : NVGPU_MmaSyncOp<"mma.sp.sync"> {
let extraClassDeclaration = extraBaseClassDeclaration;
}
+// Promises IndexedMemCopyOpInterface.
def NVGPU_DeviceAsyncCopyOp : NVGPU_Op<"device_async_copy", [
AttrSizedOperandSegments]> {
let summary = "device-side asynchronous copy";
diff --git a/mlir/include/mlir/Dialect/NVGPU/Transforms/MemoryAccessOpInterfacesImpl.h b/mlir/include/mlir/Dialect/NVGPU/Transforms/MemoryAccessOpInterfacesImpl.h
new file mode 100644
index 0000000000000..50d2223912a27
--- /dev/null
+++ b/mlir/include/mlir/Dialect/NVGPU/Transforms/MemoryAccessOpInterfacesImpl.h
@@ -0,0 +1,21 @@
+//===- MemoryAccessOpInterfacesImpl.h -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_NVGPU_TRANSFORMS_MEMORYACCESSOPINTERFACESIMPL_H
+#define MLIR_DIALECT_NVGPU_TRANSFORMS_MEMORYACCESSOPINTERFACESIMPL_H
+
+namespace mlir {
+
+class DialectRegistry;
+
+namespace nvgpu {
+void registerMemoryAccessOpInterfacesExternalModels(DialectRegistry ®istry);
+} // namespace nvgpu
+} // namespace mlir
+
+#endif // MLIR_DIALECT_NVGPU_TRANSFORMS_MEMORYACCESSOPINTERFACESIMPL_H
diff --git a/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp b/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp
index 6f2752932422a..d42d4f4599e18 100644
--- a/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp
+++ b/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp
@@ -19,7 +19,6 @@
#include "mlir/Dialect/MemRef/Transforms/Passes.h"
#include "mlir/Dialect/MemRef/Transforms/Transforms.h"
#include "mlir/Dialect/MemRef/Utils/MemRefUtils.h"
-#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/AffineMap.h"
@@ -76,10 +75,6 @@ static Value getMemRefOperand(vector::TransferReadOp op) {
return op.getBase();
}
-static Value getMemRefOperand(nvgpu::LdMatrixOp op) {
- return op.getSrcMemref();
-}
-
static Value getMemRefOperand(vector::LoadOp op) { return op.getBase(); }
static Value getMemRefOperand(vector::StoreOp op) { return op.getBase(); }
@@ -208,17 +203,6 @@ class SubViewOfSubViewFolder : public OpRewritePattern<memref::SubViewOp> {
}
};
-/// Folds nvgpu.device_async_copy subviews into the copy itself. This pattern
-/// is folds subview on src and dst memref of the copy.
-class NVGPUAsyncCopyOpSubViewOpFolder final
- : public OpRewritePattern<nvgpu::DeviceAsyncCopyOp> {
-public:
- using OpRewritePattern<nvgpu::DeviceAsyncCopyOp>::OpRewritePattern;
-
- LogicalResult matchAndRewrite(nvgpu::DeviceAsyncCopyOp copyOp,
- PatternRewriter &rewriter) const override;
-};
-
/// Merges subview operations with load/store like operations unless such a
/// merger would cause the strides between dimensions accessed by that operaton
/// to change.
@@ -377,11 +361,6 @@ LogicalResult LoadOpOfSubViewOpFolder<OpTy>::matchAndRewrite(
op, op.getType(), subViewOp.getSource(), sourceIndices,
op.getLeadDimension(), op.getTransposeAttr());
})
- .Case([&](nvgpu::LdMatrixOp op) {
- rewriter.replaceOpWithNewOp<nvgpu::LdMatrixOp>(
- op, op.getType(), subViewOp.getSource(), sourceIndices,
- op.getTranspose(), op.getNumTiles());
- })
.DefaultUnreachable("unexpected operation");
return success();
}
@@ -822,57 +801,6 @@ LogicalResult IndexedMemCopyOpOfCollapseShapeOpFolder::matchAndRewrite(
return success();
}
-LogicalResult NVGPUAsyncCopyOpSubViewOpFolder::matchAndRewrite(
- nvgpu::DeviceAsyncCopyOp copyOp, PatternRewriter &rewriter) const {
-
- LLVM_DEBUG(DBGS() << "copyOp : " << copyOp << "\n");
-
- auto srcSubViewOp =
- copyOp.getSrc().template getDefiningOp<memref::SubViewOp>();
- auto dstSubViewOp =
- copyOp.getDst().template getDefiningOp<memref::SubViewOp>();
-
- if (!(srcSubViewOp || dstSubViewOp))
- return rewriter.notifyMatchFailure(copyOp, "does not use subview ops for "
- "source or destination");
-
- // If the source is a subview, we need to resolve the indices.
- SmallVector<Value> foldedSrcIndices(copyOp.getSrcIndices().begin(),
- copyOp.getSrcIndices().end());
-
- if (srcSubViewOp) {
- LLVM_DEBUG(DBGS() << "srcSubViewOp : " << srcSubViewOp << "\n");
- affine::resolveIndicesIntoOpWithOffsetsAndStrides(
- rewriter, copyOp.getLoc(), srcSubViewOp.getMixedOffsets(),
- srcSubViewOp.getMixedStrides(), srcSubViewOp.getDroppedDims(),
- copyOp.getSrcIndices(), foldedSrcIndices);
- }
-
- // If the destination is a subview, we need to resolve the indices.
- SmallVector<Value> foldedDstIndices(copyOp.getDstIndices().begin(),
- copyOp.getDstIndices().end());
-
- if (dstSubViewOp) {
- LLVM_DEBUG(DBGS() << "dstSubViewOp : " << dstSubViewOp << "\n");
- affine::resolveIndicesIntoOpWithOffsetsAndStrides(
- rewriter, copyOp.getLoc(), dstSubViewOp.getMixedOffsets(),
- dstSubViewOp.getMixedStrides(), dstSubViewOp.getDroppedDims(),
- copyOp.getDstIndices(), foldedDstIndices);
- }
-
- // Replace the copy op with a new copy op that uses the source and destination
- // of the subview.
- rewriter.replaceOpWithNewOp<nvgpu::DeviceAsyncCopyOp>(
- copyOp, nvgpu::DeviceAsyncTokenType::get(copyOp.getContext()),
- (dstSubViewOp ? dstSubViewOp.getSource() : copyOp.getDst()),
- foldedDstIndices,
- (srcSubViewOp ? srcSubViewOp.getSource() : copyOp.getSrc()),
- foldedSrcIndices, copyOp.getDstElements(), copyOp.getSrcElements(),
- copyOp.getBypassL1Attr());
-
- return success();
-}
-
void memref::populateFoldMemRefAliasOpPatterns(RewritePatternSet &patterns) {
patterns.add<
// Interface-based patterns to which we will be migrating.
@@ -881,7 +809,6 @@ void memref::populateFoldMemRefAliasOpPatterns(RewritePatternSet &patterns) {
IndexedMemCopyOpOfExpandShapeOpFolder,
IndexedMemCopyOpOfCollapseShapeOpFolder,
// The old way of doing things. Don't add more of these.
- LoadOpOfSubViewOpFolder<nvgpu::LdMatrixOp>,
LoadOpOfSubViewOpFolder<vector::LoadOp>,
LoadOpOfSubViewOpFolder<vector::MaskedLoadOp>,
LoadOpOfSubViewOpFolder<vector::TransferReadOp>,
@@ -899,8 +826,7 @@ void memref::populateFoldMemRefAliasOpPatterns(RewritePatternSet &patterns) {
LoadOpOfCollapseShapeOpFolder<vector::MaskedLoadOp>,
StoreOpOfCollapseShapeOpFolder<vector::StoreOp>,
StoreOpOfCollapseShapeOpFolder<vector::MaskedStoreOp>,
- SubViewOfSubViewFolder, NVGPUAsyncCopyOpSubViewOpFolder>(
- patterns.getContext());
+ SubViewOfSubViewFolder>(patterns.getContext());
}
//===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp b/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
index 237aab4d7f309..b60658c7e3041 100644
--- a/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
+++ b/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
@@ -12,6 +12,7 @@
#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/MemRef/IR/MemoryAccessOpInterfaces.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/BuiltinTypes.h"
@@ -40,6 +41,9 @@ void NVGPUDialect::initialize() {
#define GET_OP_LIST
#include "mlir/Dialect/NVGPU/IR/NVGPUOps.cpp.inc"
>();
+ declarePromisedInterfaces<memref::IndexedAccessOpInterface, LdMatrixOp>();
+ declarePromisedInterfaces<memref::IndexedMemCopyOpInterface,
+ DeviceAsyncCopyOp>();
}
bool NVGPUDialect::isSharedMemoryAddressSpace(Attribute memorySpace) {
diff --git a/mlir/lib/Dialect/NVGPU/Transforms/CMakeLists.txt b/mlir/lib/Dialect/NVGPU/Transforms/CMakeLists.txt
index 3f967d2b189be..8852ed7fb30a8 100644
--- a/mlir/lib/Dialect/NVGPU/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/NVGPU/Transforms/CMakeLists.txt
@@ -1,5 +1,6 @@
add_mlir_dialect_library(MLIRNVGPUTransforms
CreateAsyncGroups.cpp
+ MemoryAccessOpInterfacesImpl.cpp
OptimizeSharedMemory.cpp
MmaSyncTF32Transform.cpp
Utils.cpp
diff --git a/mlir/lib/Dialect/NVGPU/Transforms/MemoryAccessOpInterfacesImpl.cpp b/mlir/lib/Dialect/NVGPU/Transforms/MemoryAccessOpInterfacesImpl.cpp
new file mode 100644
index 0000000000000..92a4834c481c0
--- /dev/null
+++ b/mlir/lib/Dialect/NVGPU/Transforms/MemoryAccessOpInterfacesImpl.cpp
@@ -0,0 +1,93 @@
+//===- MemoryAccessOpInterfacesImpl.cpp -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/NVGPU/Transforms/MemoryAccessOpInterfacesImpl.h"
+
+#include "mlir/Dialect/MemRef/IR/MemoryAccessOpInterfaces.h"
+#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
+#include "mlir/IR/Dialect.h"
+#include "mlir/IR/Operation.h"
+#include "mlir/IR/PatternMatch.h"
+
+using namespace mlir;
+using namespace mlir::memref;
+using namespace mlir::nvgpu;
+
+namespace {
+struct LdMatrixOpInterface final
+ : IndexedAccessOpInterface::ExternalModel<LdMatrixOpInterface, LdMatrixOp> {
+ TypedValue<MemRefType> getAccessedMemref(Operation *op) const {
+ return cast<LdMatrixOp>(op).getSrcMemref();
+ }
+
+ Operation::operand_range getIndices(Operation *op) const {
+ return cast<LdMatrixOp>(op).getIndices();
+ }
+
+ SmallVector<int64_t> getAccessedShape(Operation *op) const {
+ VectorType vecTy = cast<LdMatrixOp>(op).getRes().getType();
+ // The 2-D nature of the result is an artifact of this operation returning
+ // a struct of vectors and doesn't reflect any strides that need to be
+ // preserved.
+ return SmallVector<int64_t>{vecTy.getNumElements()};
+ }
+
+ std::optional<SmallVector<Value>>
+ updateMemrefAndIndices(Operation *op, RewriterBase &rewriter, Value newMemref,
+ ValueRange newIndices) const {
+ auto ldMatrixOp = cast<LdMatrixOp>(op);
+ rewriter.modifyOpInPlace(ldMatrixOp, [&]() {
+ ldMatrixOp.getSrcMemrefMutable().assign(newMemref);
+ ldMatrixOp.getIndicesMutable().assign(newIndices);
+ });
+ return std::nullopt;
+ }
+
+ bool hasInboundsIndices(Operation *) const { return true; }
+};
+
+struct DeviceAsyncCopyOpInterface final
+ : IndexedMemCopyOpInterface::ExternalModel<DeviceAsyncCopyOpInterface,
+ DeviceAsyncCopyOp> {
+ TypedValue<MemRefType> getSrc(Operation *op) const {
+ return cast<DeviceAsyncCopyOp>(op).getSrc();
+ }
+
+ Operation::operand_range getSrcIndices(Operation *op) const {
+ return cast<DeviceAsyncCopyOp>(op).getSrcIndices();
+ }
+
+ TypedValue<MemRefType> getDst(Operation *op) const {
+ return cast<DeviceAsyncCopyOp>(op).getDst();
+ }
+
+ Operation::operand_range getDstIndices(Operation *op) const {
+ return cast<DeviceAsyncCopyOp>(op).getDstIndices();
+ }
+
+ void setMemrefsAndIndices(Operation *op, RewriterBase &rewriter, Value newSrc,
+ ValueRange newSrcIndices, Value newDst,
+ ValueRange newDstIndices) const {
+ auto copyOp = cast<DeviceAsyncCopyOp>(op);
+ rewriter.modifyOpInPlace(copyOp, [&]() {
+ copyOp.getSrcMutable().assign(newSrc);
+ copyOp.getSrcIndicesMutable().assign(newSrcIndices);
+ copyOp.getDstMutable().assign(newDst);
+ copyOp.getDstIndicesMutable().assign(newDstIndices);
+ });
+ }
+};
+} // namespace
+
+void mlir::nvgpu::registerMemoryAccessOpInterfacesExternalModels(
+ DialectRegistry ®istry) {
+ registry.addExtension(+[](MLIRContext *ctx, nvgpu::NVGPUDialect *dialect) {
+ LdMatrixOp::attachInterface<LdMatrixOpInterface>(*ctx);
+ DeviceAsyncCopyOp::attachInterface<DeviceAsyncCopyOpInterface>(*ctx);
+ });
+}
diff --git a/mlir/lib/RegisterAllDialects.cpp b/mlir/lib/RegisterAllDialects.cpp
index ea5698f39c0b0..bf994500a19f0 100644
--- a/mlir/lib/RegisterAllDialects.cpp
+++ b/mlir/lib/RegisterAllDialects.cpp
@@ -59,6 +59,7 @@
#include "mlir/Dialect/MemRef/Transforms/BufferViewFlowOpInterfaceImpl.h"
#include "mlir/Dialect/MemRef/Transforms/RuntimeOpVerification.h"
#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
+#include "mlir/Dialect/NVGPU/Transforms/MemoryAccessOpInterfacesImpl.h"
#include "mlir/Dialect/OpenACC/OpenACC.h"
#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
#include "mlir/Dialect/PDL/IR/PDL.h"
@@ -179,6 +180,7 @@ void mlir::registerAllDialects(DialectRegistry ®istry) {
memref::registerValueBoundsOpInterfaceExternalModels(registry);
memref::registerMemorySlotExternalModels(registry);
ml_program::registerBufferizableOpInterfaceExternalModels(registry);
+ nvgpu::registerMemoryAccessOpInterfacesExternalModels(registry);
scf::registerBufferDeallocationOpInterfaceExternalModels(registry);
scf::registerBufferizableOpInterfaceExternalModels(registry);
scf::registerValueBoundsOpInterfaceExternalModels(registry);
diff --git a/mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir b/mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir
index 114ba86cda718..52c9ad590eda0 100644
--- a/mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir
+++ b/mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir
@@ -594,86 +594,6 @@ func.func @fold_gpu_subgroup_mma_load_matrix_2d(%arg0 : memref<128x128xf32>, %ar
// -----
-
-func.func @fold_nvgpu_device_async_copy_zero_sub_idx(%gmem_memref_3d : memref<2x128x768xf16>, %idx_1 : index, %idx_2 : index, %idx_3 : index) {
-
- %c0 = arith.constant 0 : index
- %smem_memref_4d = memref.alloc() : memref<5x1x64x64xf16, #gpu.address_space<workgroup>>
- %gmem_memref_subview_2d = memref.subview %gmem_memref_3d[%idx_1, %idx_2, %idx_3] [1, 1, 8] [1, 1, 1] : memref<2x128x768xf16> to memref<1x8xf16, strided<[98304, 1], offset: ?>>
- %async_token = nvgpu.device_async_copy %gmem_memref_subview_2d[%c0, %c0], %smem_memref_4d[%c0, %c0, %c0, %c0], 8 {bypassL1} : memref<1x8xf16, strided<[98304, 1], offset: ?>> to memref<5x1x64x64xf16, #gpu.address_space<workgroup>>
- return
-}
-
-// CHECK-LABEL: func.func @fold_nvgpu_device_async_copy_zero_sub_idx
-// CHECK-SAME: (%[[GMEM_MEMREF_3d:.+]]: memref<2x128x768xf16>, %[[IDX_1:.+]]: index, %[[IDX_2:.+]]: index, %[[IDX_3:.+]]: index)
-// CHECK-DAG: %[[c0:.+]] = arith.constant 0 : index
-// CHECK-DAG: %[[SMEM_MEMREF_4d:.+]] = memref.alloc() : memref<5x1x64x64xf16, #gpu.address_space<workgroup>>
-// CHECK: nvgpu.device_async_copy %[[GMEM_MEMREF_3d]][%[[IDX_1]], %[[IDX_2]], %[[IDX_3]]], %[[SMEM_MEMREF_4d]][%[[c0]], %[[c0]], %[[c0]], %[[c0]]], 8 {bypassL1} : memref<2x128x768xf16> to memref<5x1x64x64xf16, #gpu.address_space<workgroup>>
-
-// -----
-
-
-func.func @fold_src_nvgpu_device_async_copy(%gmem_memref_3d : memref<2x128x768xf16>, %src_idx_0 : index, %src_idx_1 : index, %src_idx_2 : index, %src_sub_idx_0 : index, %src_sub_idx_1 : index) {
- %c0 = arith.constant 0 : index
- %smem_memref_4d = memref.alloc() : memref<5x1x64x64xf16, #gpu.address_space<workgroup>>
- %gmem_memref_subview_2d = memref.subview %gmem_memref_3d[%src_idx_0, %src_idx_1, %src_idx_2] [1, 1, 8] [1, 1, 1] : memref<2x128x768xf16> to memref<1x8xf16, strided<[98304, 1], offset: ?>>
- %async_token = nvgpu.device_async_copy %gmem_memref_subview_2d[%src_sub_idx_0, %src_sub_idx_1], %smem_memref_4d[%c0, %c0, %c0, %c0], 8 {bypassL1} : memref<1x8xf16, strided<[98304, 1], offset: ?>> to memref<5x1x64x64xf16, #gpu.address_space<workgroup>>
- return
-}
-
-// CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (s0 + s1)>
-// CHECK: func.func @fold_src_nvgpu_device_async_copy
-// CHECK-SAME: (%[[GMEM_MEMREF_3d:.+]]: memref<2x128x768xf16>, %[[SRC_IDX_0:.+]]: index, %[[SRC_IDX_1:.+]]: index, %[[SRC_IDX_2:.+]]: index, %[[SRC_SUB_IDX_0:.+]]: index, %[[SRC_SUB_IDX_1:.+]]: index)
-// CHECK-DAG: %[[c0:.+]] = arith.constant 0 : index
-// CHECK-DAG: %[[RESOLVED_SRC_IDX_0:.+]] = affine.apply #[[MAP]]()[%[[SRC_IDX_0]], %[[SRC_SUB_IDX_0]]]
-// CHECK-DAG: %[[RESOLVED_SRC_IDX_1:.+]] = affine.apply #[[MAP]]()[%[[SRC_IDX_2]], %[[SRC_SUB_IDX_1]]]
-// CHECK-DAG: nvgpu.device_async_copy %[[GMEM_MEMREF_3d]][%[[RESOLVED_SRC_IDX_0]], %[[SRC_IDX_1]], %[[RESOLVED_SRC_IDX_1]]], %[[SMEM_MEMREF_4d]][%[[c0]], %[[c0]], %[[c0]], %[[c0]]], 8 {bypassL1} : memref<2x128x768xf16> to memref<5x1x64x64xf16, #gpu.address_space<workgroup>>
-
-// -----
-
-
-func.func @fold_src_fold_dest_nvgpu_device_async_copy(%gmem_memref_3d : memref<2x128x768xf16>, %src_idx_0 : index, %src_idx_1 : index, %src_idx_2 : index, %src_sub_idx_0 : index, %src_sub_idx_1 : index, %dest_idx_0 : index, %dest_idx_1 : index, %dest_idx_2 : index, %dest_idx_3 : index, %dest_sub_idx_0 : index, %dest_sub_idx_1 : index) {
- %c0 = arith.constant 0 : index
- %smem_memref_4d = memref.alloc() : memref<5x1x64x64xf16, #gpu.address_space<workgroup>>
- %gmem_memref_subview_2d = memref.subview %gmem_memref_3d[%src_idx_0, %src_idx_1, %src_idx_2] [1, 1, 8] [1, 1, 1] : memref<2x128x768xf16> to memref<1x8xf16, strided<[98304, 1], offset: ?>>
- %smem_memref_2d = memref.subview %smem_memref_4d[%dest_idx_0, %dest_idx_1, %dest_idx_2, %dest_idx_3] [1, 1, 1, 8] [1, 1, 1, 1] : memref<5x1x64x64xf16, #gpu.address_space<workgroup>> to memref<1x8xf16, strided<[4096, 1], offset: ?>, #gpu.address_space<workgroup>>
- %async_token = nvgpu.device_async_copy %gmem_memref_subview_2d[%src_sub_idx_0, %src_sub_idx_1], %smem_memref_2d[%dest_sub_idx_0, %dest_sub_idx_1], 8 {bypassL1} : memref<1x8xf16, strided<[98304, 1], offset: ?>> to memref<1x8xf16, strided<[4096, 1], offset: ?>, #gpu.address_space<workgroup>>
- return
-}
-
-// CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (s0 + s1)>
-// CHECK: func.func @fold_src_fold_dest_nvgpu_device_async_copy
-// CHECK-SAME: (%[[GMEM_MEMREF_3d:.+]]: memref<2x128x768xf16>, %[[SRC_IDX_0:.+]]: index, %[[SRC_IDX_1:.+]]: index, %[[SRC_IDX_2:.+]]: index, %[[SRC_SUB_IDX_0:.+]]: index, %[[SRC_SUB_IDX_1:.+]]: index, %[[DEST_IDX_0:.+]]: index, %[[DEST_IDX_1:.+]]: index, %[[DEST_IDX_2:.+]]: index, %[[DEST_IDX_3:.+]]: index, %[[DEST_SUB_IDX_0:.+]]: index, %[[DEST_SUB_IDX_1:.+]]: index)
-// CHECK-DAG: %[[RESOLVED_SRC_IDX_0:.+]] = affine.apply #[[MAP]]()[%[[SRC_IDX_0]], %[[SRC_SUB_IDX_0]]]
-// CHECK-DAG: %[[RESOLVED_SRC_IDX_1:.+]] = affine.apply #[[MAP]]()[%[[SRC_IDX_2]], %[[SRC_SUB_IDX_1]]]
-// CHECK-DAG: %[[RESOLVED_DST_IDX_1:.+]] = affine.apply #[[MAP]]()[%[[DEST_IDX_1]], %[[DEST_SUB_IDX_0]]]
-// CHECK-DAG: %[[RESOLVED_DST_IDX_3:.+]] = affine.apply #[[MAP]]()[%[[DEST_IDX_3]], %[[DEST_SUB_IDX_1]]]
-// CHECK-DAG: nvgpu.device_async_copy %[[GMEM_MEMREF_3d]][%[[RESOLVED_SRC_IDX_0]], %[[SRC_IDX_1]], %[[RESOLVED_SRC_IDX_1]]], %[[SMEM_MEMREF_4d]][%[[DEST_IDX_0]], %[[RESOLVED_DST_IDX_1]], %[[DEST_IDX_2]], %[[RESOLVED_DST_IDX_3]]], 8 {bypassL1} : memref<2x128x768xf16> to memref<5x1x64x64xf16, #gpu.address_space<workgroup>>
-
-// -----
-
-#map = affine_map<()[s0] -> (-s0 + 4)>
-#map1 = affine_map<()[s0] -> (-s0 + 32)>
-
-func.func @test_ldmatrix(%arg0: memref<4x32x32xf16, 3>, %arg1: index, %arg2: index, %arg3: index) -> vector<4x2xf16> {
- %c0 = arith.constant 0 : index
- %0 = affine.apply #map()[%arg1]
- %1 = affine.apply #map1()[%arg2]
- %2 = affine.apply #map1()[%arg3]
- %subview = memref.subview %arg0[%arg1, %arg2, %arg3] [%0, %1, %2] [1, 1, 1] : memref<4x32x32xf16, 3> to memref<?x?x?xf16, strided<[1024, 32, 1], offset: ?>, 3>
- %3 = nvgpu.ldmatrix %subview[%c0, %c0, %c0] {numTiles = 4 : i32, transpose = false} : memref<?x?x?xf16, strided<[1024, 32, 1], offset: ?>, 3> -> vector<4x2xf16>
- return %3 : vector<4x2xf16>
-}
-
-// CHECK: func @test_ldmatrix
-// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: memref<4x32x32xf16, 3>
-// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]+]]: index
-// CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]+]]: index
-// CHECK-SAME: %[[ARG3:[a-zA-Z0-9_]+]]: index
-// CHECK: nvgpu.ldmatrix %[[ARG0]][%[[ARG1]], %[[ARG2]], %[[ARG3]]] {numTiles = 4 : i32, transpose = false} : memref<4x32x32xf16, 3> -> vector<4x2xf16>
-
-// -----
-
func.func @fold_vector_load_subview(%src : memref<24x64xf32>,
%off1 : index,
%off2 : index,
diff --git a/mlir/test/Dialect/NVGPU/fold-memref-alias-ops.mlir b/mlir/test/Dialect/NVGPU/fold-memref-alias-ops.mlir
new file mode 100644
index 0000000000000..44dcffcc1f00d
--- /dev/null
+++ b/mlir/test/Dialect/NVGPU/fold-memref-alias-ops.mlir
@@ -0,0 +1,93 @@
+// RUN: mlir-opt -fold-memref-alias-ops -split-input-file %s | FileCheck %s
+
+func.func @fold_nvgpu_device_async_copy_zero_sub_idx(%gmem_memref_3d : memref<2x128x768xf16>, %idx_1 : index, %idx_2 : index, %idx_3 : index) {
+ %c0 = arith.constant 0 : index
+ %smem_memref_4d = memref.alloc() : memref<5x1x64x64xf16, #gpu.address_space<workgroup>>
+ %gmem_memref_subview_2d = memref.subview %gmem_memref_3d[%idx_1, %idx_2, %idx_3] [1, 1, 8] [1, 1, 1] : memref<2x128x768xf16> to memref<1x8xf16, strided<[98304, 1], offset: ?>>
+ %async_token = nvgpu.device_async_copy %gmem_memref_subview_2d[%c0, %c0], %smem_memref_4d[%c0, %c0, %c0, %c0], 8 {bypassL1} : memref<1x8xf16, strided<[98304, 1], offset: ?>> to memref<5x1x64x64xf16, #gpu.address_space<workgroup>>
+ return
+}
+
+// CHECK-LABEL: func.func @fold_nvgpu_device_async_copy_zero_sub_idx
+// CHECK-SAME: (%[[GMEM_MEMREF_3d:.+]]: memref<2x128x768xf16>, %[[IDX_1:.+]]: index, %[[IDX_2:.+]]: index, %[[IDX_3:.+]]: index)
+// CHECK-DAG: %[[c0:.+]] = arith.constant 0 : index
+// CHECK-DAG: %[[SMEM_MEMREF_4d:.+]] = memref.alloc() : memref<5x1x64x64xf16, #gpu.address_space<workgroup>>
+// CHECK: nvgpu.device_async_copy %[[GMEM_MEMREF_3d]][%[[IDX_1]], %[[IDX_2]], %[[IDX_3]]], %[[SMEM_MEMREF_4d]][%[[c0]], %[[c0]], %[[c0]], %[[c0]]], 8 {bypassL1} : memref<2x128x768xf16> to memref<5x1x64x64xf16, #gpu.address_space<workgroup>>
+
+// -----
+
+
+func.func @fold_src_nvgpu_device_async_copy(%gmem_memref_3d : memref<2x128x768xf16>, %src_idx_0 : index, %src_idx_1 : index, %src_idx_2 : index, %src_sub_idx_0 : index, %src_sub_idx_1 : index) {
+ %c0 = arith.constant 0 : index
+ %smem_memref_4d = memref.alloc() : memref<5x1x64x64xf16, #gpu.address_space<workgroup>>
+ %gmem_memref_subview_2d = memref.subview %gmem_memref_3d[%src_idx_0, %src_idx_1, %src_idx_2] [1, 1, 8] [1, 1, 1] : memref<2x128x768xf16> to memref<1x8xf16, strided<[98304, 1], offset: ?>>
+ %async_token = nvgpu.device_async_copy %gmem_memref_subview_2d[%src_sub_idx_0, %src_sub_idx_1], %smem_memref_4d[%c0, %c0, %c0, %c0], 8 {bypassL1} : memref<1x8xf16, strided<[98304, 1], offset: ?>> to memref<5x1x64x64xf16, #gpu.address_space<workgroup>>
+ return
+}
+
+// CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (s0 + s1)>
+// CHECK: func.func @fold_src_nvgpu_device_async_copy
+// CHECK-SAME: (%[[GMEM_MEMREF_3d:.+]]: memref<2x128x768xf16>, %[[SRC_IDX_0:.+]]: index, %[[SRC_IDX_1:.+]]: index, %[[SRC_IDX_2:.+]]: index, %[[SRC_SUB_IDX_0:.+]]: index, %[[SRC_SUB_IDX_1:.+]]: index)
+// CHECK-DAG: %[[c0:.+]] = arith.constant 0 : index
+// CHECK-DAG: %[[RESOLVED_SRC_IDX_0:.+]] = affine.apply #[[MAP]]()[%[[SRC_IDX_0]], %[[SRC_SUB_IDX_0]]]
+// CHECK-DAG: %[[RESOLVED_SRC_IDX_1:.+]] = affine.apply #[[MAP]]()[%[[SRC_IDX_2]], %[[SRC_SUB_IDX_1]]]
+// CHECK-DAG: nvgpu.device_async_copy %[[GMEM_MEMREF_3d]][%[[RESOLVED_SRC_IDX_0]], %[[SRC_IDX_1]], %[[RESOLVED_SRC_IDX_1]]], %[[SMEM_MEMREF_4d]][%[[c0]], %[[c0]], %[[c0]], %[[c0]]], 8 {bypassL1} : memref<2x128x768xf16> to memref<5x1x64x64xf16, #gpu.address_space<workgroup>>
+
+// -----
+
+
+func.func @fold_src_fold_dest_nvgpu_device_async_copy(%gmem_memref_3d : memref<2x128x768xf16>, %src_idx_0 : index, %src_idx_1 : index, %src_idx_2 : index, %src_sub_idx_0 : index, %src_sub_idx_1 : index, %dest_idx_0 : index, %dest_idx_1 : index, %dest_idx_2 : index, %dest_idx_3 : index, %dest_sub_idx_0 : index, %dest_sub_idx_1 : index) {
+ %c0 = arith.constant 0 : index
+ %smem_memref_4d = memref.alloc() : memref<5x1x64x64xf16, #gpu.address_space<workgroup>>
+ %gmem_memref_subview_2d = memref.subview %gmem_memref_3d[%src_idx_0, %src_idx_1, %src_idx_2] [1, 1, 8] [1, 1, 1] : memref<2x128x768xf16> to memref<1x8xf16, strided<[98304, 1], offset: ?>>
+ %smem_memref_2d = memref.subview %smem_memref_4d[%dest_idx_0, %dest_idx_1, %dest_idx_2, %dest_idx_3] [1, 1, 1, 8] [1, 1, 1, 1] : memref<5x1x64x64xf16, #gpu.address_space<workgroup>> to memref<1x8xf16, strided<[4096, 1], offset: ?>, #gpu.address_space<workgroup>>
+ %async_token = nvgpu.device_async_copy %gmem_memref_subview_2d[%src_sub_idx_0, %src_sub_idx_1], %smem_memref_2d[%dest_sub_idx_0, %dest_sub_idx_1], 8 {bypassL1} : memref<1x8xf16, strided<[98304, 1], offset: ?>> to memref<1x8xf16, strided<[4096, 1], offset: ?>, #gpu.address_space<workgroup>>
+ return
+}
+
+// CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (s0 + s1)>
+// CHECK: func.func @fold_src_fold_dest_nvgpu_device_async_copy
+// CHECK-SAME: (%[[GMEM_MEMREF_3d:.+]]: memref<2x128x768xf16>, %[[SRC_IDX_0:.+]]: index, %[[SRC_IDX_1:.+]]: index, %[[SRC_IDX_2:.+]]: index, %[[SRC_SUB_IDX_0:.+]]: index, %[[SRC_SUB_IDX_1:.+]]: index, %[[DEST_IDX_0:.+]]: index, %[[DEST_IDX_1:.+]]: index, %[[DEST_IDX_2:.+]]: index, %[[DEST_IDX_3:.+]]: index, %[[DEST_SUB_IDX_0:.+]]: index, %[[DEST_SUB_IDX_1:.+]]: index)
+// CHECK-DAG: %[[RESOLVED_SRC_IDX_0:.+]] = affine.apply #[[MAP]]()[%[[SRC_IDX_0]], %[[SRC_SUB_IDX_0]]]
+// CHECK-DAG: %[[RESOLVED_SRC_IDX_1:.+]] = affine.apply #[[MAP]]()[%[[SRC_IDX_2]], %[[SRC_SUB_IDX_1]]]
+// CHECK-DAG: %[[RESOLVED_DST_IDX_1:.+]] = affine.apply #[[MAP]]()[%[[DEST_IDX_1]], %[[DEST_SUB_IDX_0]]]
+// CHECK-DAG: %[[RESOLVED_DST_IDX_3:.+]] = affine.apply #[[MAP]]()[%[[DEST_IDX_3]], %[[DEST_SUB_IDX_1]]]
+// CHECK-DAG: nvgpu.device_async_copy %[[GMEM_MEMREF_3d]][%[[RESOLVED_SRC_IDX_0]], %[[SRC_IDX_1]], %[[RESOLVED_SRC_IDX_1]]], %[[SMEM_MEMREF_4d]][%[[DEST_IDX_0]], %[[RESOLVED_DST_IDX_1]], %[[DEST_IDX_2]], %[[RESOLVED_DST_IDX_3]]], 8 {bypassL1} : memref<2x128x768xf16> to memref<5x1x64x64xf16, #gpu.address_space<workgroup>>
+
+// -----
+
+#map = affine_map<()[s0] -> (-s0 + 4)>
+#map1 = affine_map<()[s0] -> (-s0 + 32)>
+
+func.func @test_ldmatrix(%arg0: memref<4x32x32xf16, 3>, %arg1: index, %arg2: index, %arg3: index) -> vector<4x2xf16> {
+ %c0 = arith.constant 0 : index
+ %0 = affine.apply #map()[%arg1]
+ %1 = affine.apply #map1()[%arg2]
+ %2 = affine.apply #map1()[%arg3]
+ %subview = memref.subview %arg0[%arg1, %arg2, %arg3] [%0, %1, %2] [1, 1, 1] : memref<4x32x32xf16, 3> to memref<?x?x?xf16, strided<[1024, 32, 1], offset: ?>, 3>
+ %3 = nvgpu.ldmatrix %subview[%c0, %c0, %c0] {numTiles = 4 : i32, transpose = false} : memref<?x?x?xf16, strided<[1024, 32, 1], offset: ?>, 3> -> vector<4x2xf16>
+ return %3 : vector<4x2xf16>
+}
+
+// CHECK: func @test_ldmatrix
+// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: memref<4x32x32xf16, 3>
+// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME: %[[ARG3:[a-zA-Z0-9_]+]]: index
+// CHECK: nvgpu.ldmatrix %[[ARG0]][%[[ARG1]], %[[ARG2]], %[[ARG3]]] {numTiles = 4 : i32, transpose = false} : memref<4x32x32xf16, 3> -> vector<4x2xf16>
+
+// -----
+
+func.func @ldmatrix_expand(%arg0: memref<4096xf16, 3>, %arg1: index, %arg2: index, %arg3: index) -> vector<4x2xf16> {
+ %exp = memref.expand_shape %arg0 [[0, 1, 2]] output_shape [4, 32, 32] : memref<4096xf16, 3> into memref<4x32x32xf16, 3>
+ %3 = nvgpu.ldmatrix %exp[%arg1, %arg2, %arg3] {numTiles = 4 : i32, transpose = false} : memref<4x32x32xf16, 3> -> vector<4x2xf16>
+ return %3 : vector<4x2xf16>
+}
+
+// CHECK: func @ldmatrix_expand
+// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: memref<4096xf16, 3>
+// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME: %[[ARG3:[a-zA-Z0-9_]+]]: index
+// CHECK: %[[LIN:[a-zA-Z0-9_]+]] = affine.linearize_index disjoint [%[[ARG1]], %[[ARG2]], %[[ARG3]]] by (4, 32, 32)
+// CHECK: nvgpu.ldmatrix %[[ARG0]][%[[LIN]]] {numTiles = 4 : i32, transpose = false} : memref<4096xf16, 3> -> vector<4x2xf16>
More information about the Mlir-commits
mailing list