[Mlir-commits] [mlir] dbc63f1 - [AMDGPU] fold `memref.subview/expand_shape/collapse_shape` into `amdgpu.gather_to_lds` (#149851)
llvmlistbot at llvm.org
Wed Jul 23 08:22:44 PDT 2025
Author: Alan Li
Date: 2025-07-23T11:22:41-04:00
New Revision: dbc63f1e3724b6f2348c431dc1216537d9c042e8
URL: https://github.com/llvm/llvm-project/commit/dbc63f1e3724b6f2348c431dc1216537d9c042e8
DIFF: https://github.com/llvm/llvm-project/commit/dbc63f1e3724b6f2348c431dc1216537d9c042e8.diff
LOG: [AMDGPU] fold `memref.subview/expand_shape/collapse_shape` into `amdgpu.gather_to_lds` (#149851)
This PR adds a new optimization pass that folds
`memref.subview`/`expand_shape`/`collapse_shape` ops into their consumer
`amdgpu.gather_to_lds` operations (see the before/after sketch below).
* Implements a new pass `AmdgpuFoldMemRefOpsPass` with pattern
`FoldMemRefOpsIntoGatherToLDSOp`
* Adds corresponding folding tests
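
As an illustrative before/after sketch (adapted from the new
`amdgpu-fold-memrefs.mlir` test added below; exact output can differ, e.g.
non-zero subview offsets are resolved through `affine.apply`):

  // Before: the gather reads through a zero-offset subview of %mem.
  %subview = memref.subview %mem[0, 0][32, 64][1, 1]
      : memref<64x128xf16> to memref<32x64xf16, strided<[128, 1]>>
  amdgpu.gather_to_lds %subview[%offset_i, %offset_j], %alloc[%c0, %c0]
      : vector<8xf16>, memref<32x64xf16, strided<[128, 1]>>, memref<64x64xf16, 3>

  // After: the subview is folded away; indices are resolved against %mem.
  amdgpu.gather_to_lds %mem[%offset_i, %offset_j], %alloc[%c0, %c0]
      : vector<8xf16>, memref<64x128xf16>, memref<64x64xf16, 3>

For `expand_shape` and `collapse_shape` sources, the resolved indices go through
`affine.linearize_index` / `affine.delinearize_index` instead, as covered by the
added tests.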
---------
Co-authored-by: Copilot <175728472+Copilot at users.noreply.github.com>
Added:
mlir/lib/Dialect/AMDGPU/Transforms/FoldMemRefsOps.cpp
mlir/test/Dialect/AMDGPU/amdgpu-fold-memrefs.mlir
Modified:
mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h
mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td
mlir/include/mlir/Dialect/MemRef/Utils/MemRefUtils.h
mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt
mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp
mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp
Removed:
################################################################################
diff --git a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h
index cc2f543e79f69..58b9c74b2f8e0 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h
@@ -22,8 +22,9 @@ class ConversionTarget;
namespace amdgpu {
#define GEN_PASS_DECL_AMDGPUEMULATEATOMICSPASS
-#define GEN_PASS_DECL_AMDGPURESOLVESTRIDEDMETADATAPASS
+#define GEN_PASS_DECL_AMDGPUFOLDMEMREFOPSPASS
#define GEN_PASS_DECL_AMDGPUMASKEDLOADTOLOADPASS
+#define GEN_PASS_DECL_AMDGPURESOLVESTRIDEDMETADATAPASS
#define GEN_PASS_REGISTRATION
#include "mlir/Dialect/AMDGPU/Transforms/Passes.h.inc"
@@ -38,6 +39,9 @@ void populateAmdgpuResolveStridedMetadataPatterns(RewritePatternSet &patterns,
void populateAmdgpuMaskedloadToLoadPatterns(RewritePatternSet &patterns,
PatternBenefit benefit = 1);
+void populateAmdgpuFoldMemRefOpsPatterns(RewritePatternSet &patterns,
+ PatternBenefit benefit = 1);
+
} // namespace amdgpu
} // namespace mlir
diff --git a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td
index 8d0e6829ab0cc..8664f971cabde 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td
@@ -70,4 +70,16 @@ def AmdgpuMaskedloadToLoadPass : Pass<"amdgpu-maskedload-to-load"> {
"memref::MemRefDialect"
];
}
+
+def AmdgpuFoldMemRefOpsPass : Pass<"amdgpu-fold-memrefs-ops"> {
+  let summary = "Fold memref view operations into their consumer operations";
+  let description = [{
+    This pass identifies memref operations (subview, expand_shape, collapse_shape)
+    whose results are sources of `GatherToLDSOp` and attempts to fold them into
+    the consumer op, simplifying the IR and potentially improving performance.
+  }];
+ let dependentDialects = [
+ "memref::MemRefDialect"
+ ];
+}
#endif // MLIR_DIALECT_AMDGPU_TRANSFORMS_PASSES_TD_
diff --git a/mlir/include/mlir/Dialect/MemRef/Utils/MemRefUtils.h b/mlir/include/mlir/Dialect/MemRef/Utils/MemRefUtils.h
index 34ad279a07a8b..dd3b3dea6ef26 100644
--- a/mlir/include/mlir/Dialect/MemRef/Utils/MemRefUtils.h
+++ b/mlir/include/mlir/Dialect/MemRef/Utils/MemRefUtils.h
@@ -116,6 +116,43 @@ inline bool isSameViewOrTrivialAlias(MemrefValue a, MemrefValue b) {
/// the source memref (i.e. implements ViewLikeOpInterface).
MemrefValue skipViewLikeOps(MemrefValue source);
+/// Given the 'indices' of a load/store operation where the memref is a result
+/// of an expand_shape op, returns the indices w.r.t. the source memref of the
+/// expand_shape op. For example:
+///
+/// %0 = ... : memref<12x42xf32>
+/// %1 = memref.expand_shape %0 [[0, 1], [2]]
+///    : memref<12x42xf32> into memref<2x6x42xf32>
+/// %2 = load %1[%i1, %i2, %i3] : memref<2x6x42xf32>
+///
+/// could be folded into
+///
+/// %2 = load %0[6 * %i1 + %i2, %i3] :
+///          memref<12x42xf32>
+LogicalResult resolveSourceIndicesExpandShape(
+ Location loc, PatternRewriter &rewriter,
+ memref::ExpandShapeOp expandShapeOp, ValueRange indices,
+ SmallVectorImpl<Value> &sourceIndices, bool startsInbounds);
+
+/// Given the 'indices' of a load/store operation where the memref is a result
+/// of a collapse_shape op, returns the indices w.r.t. the source memref of
+/// the collapse_shape op. For example:
+///
+/// %0 = ... : memref<2x6x42xf32>
+/// %1 = memref.collapse_shape %0 [[0, 1], [2]]
+/// : memref<2x6x42xf32> into memref<12x42xf32>
+/// %2 = load %1[%i1, %i2] : memref<12x42xf32>
+///
+/// could be folded into
+///
+/// %2 = load %0[%i1 / 6, %i1 % 6, %i2] :
+/// memref<2x6x42xf32>
+LogicalResult
+resolveSourceIndicesCollapseShape(Location loc, PatternRewriter &rewriter,
+ memref::CollapseShapeOp collapseShapeOp,
+ ValueRange indices,
+ SmallVectorImpl<Value> &sourceIndices);
+
} // namespace memref
} // namespace mlir
diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt b/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt
index 17bbe54ea6c0c..3b0c072ed1217 100644
--- a/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt
@@ -1,7 +1,8 @@
add_mlir_dialect_library(MLIRAMDGPUTransforms
EmulateAtomics.cpp
- ResolveStridedMetadata.cpp
+ FoldMemRefsOps.cpp
MaskedloadToLoad.cpp
+ ResolveStridedMetadata.cpp
ADDITIONAL_HEADER_DIRS
{$MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/AMDGPU/Transforms
diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/FoldMemRefsOps.cpp b/mlir/lib/Dialect/AMDGPU/Transforms/FoldMemRefsOps.cpp
new file mode 100644
index 0000000000000..a3fdc7ee385ed
--- /dev/null
+++ b/mlir/lib/Dialect/AMDGPU/Transforms/FoldMemRefsOps.cpp
@@ -0,0 +1,97 @@
+//===- FoldMemRefsOps.cpp - AMDGPU fold memref ops ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/AMDGPU/Transforms/Passes.h"
+
+#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
+#include "mlir/Dialect/Affine/ViewLikeInterfaceUtils.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/MemRef/Utils/MemRefUtils.h"
+#include "mlir/Transforms/WalkPatternRewriteDriver.h"
+#include "llvm/ADT/TypeSwitch.h"
+
+namespace mlir::amdgpu {
+#define GEN_PASS_DEF_AMDGPUFOLDMEMREFOPSPASS
+#include "mlir/Dialect/AMDGPU/Transforms/Passes.h.inc"
+
+struct AmdgpuFoldMemRefOpsPass final
+ : amdgpu::impl::AmdgpuFoldMemRefOpsPassBase<AmdgpuFoldMemRefOpsPass> {
+ void runOnOperation() override {
+ RewritePatternSet patterns(&getContext());
+ populateAmdgpuFoldMemRefOpsPatterns(patterns);
+ walkAndApplyPatterns(getOperation(), std::move(patterns));
+ }
+};
+
+struct FoldMemRefOpsIntoGatherToLDSOp final : OpRewritePattern<GatherToLDSOp> {
+ using OpRewritePattern::OpRewritePattern;
+ LogicalResult matchAndRewrite(GatherToLDSOp op,
+ PatternRewriter &rewriter) const override {
+ Location loc = op.getLoc();
+
+ Value memrefSource;
+ SmallVector<Value> sourceIndices;
+ auto foldResult =
+ llvm::TypeSwitch<Operation *, LogicalResult>(
+ op.getSrc().getDefiningOp())
+ .Case<memref::SubViewOp>([&](memref::SubViewOp subviewOp) {
+ // If the source is a SubViewOp, we can directly rewrite the
+ // GatherToLDSOp.
+ mlir::affine::resolveIndicesIntoOpWithOffsetsAndStrides(
+ rewriter, loc, subviewOp.getMixedOffsets(),
+ subviewOp.getMixedStrides(), subviewOp.getDroppedDims(),
+ op.getSrcIndices(), sourceIndices);
+ memrefSource = subviewOp.getSource();
+ return success();
+ })
+ .Case<memref::ExpandShapeOp>(
+ [&](memref::ExpandShapeOp expandShapeOp) {
+ if (failed(mlir::memref::resolveSourceIndicesExpandShape(
+ loc, rewriter, expandShapeOp, op.getSrcIndices(),
+ sourceIndices, false))) {
+ return failure();
+ }
+ memrefSource = expandShapeOp.getViewSource();
+ return success();
+ })
+ .Case<memref::CollapseShapeOp>(
+ [&](memref::CollapseShapeOp collapseShapeOp) {
+ if (failed(mlir::memref::resolveSourceIndicesCollapseShape(
+ loc, rewriter, collapseShapeOp, op.getSrcIndices(),
+ sourceIndices))) {
+ return failure();
+ }
+ memrefSource = collapseShapeOp.getViewSource();
+ return success();
+ })
+ .Default([&](Operation *op) {
+ // If the source is not a SubViewOp, ExpandShapeOp, or
+ // CollapseShapeOp, we cannot fold the GatherToLDSOp.
+ return rewriter.notifyMatchFailure(
+ op,
+ "source producer is not one of SubViewOp, ExpandShapeOp, or "
+ "CollapseShapeOp");
+ });
+
+ if (failed(foldResult)) {
+ return failure();
+ }
+
+ rewriter.replaceOpWithNewOp<GatherToLDSOp>(op, memrefSource, sourceIndices,
+ op.getDst(), op.getDstIndices(),
+ op.getTransferType());
+
+ return success();
+ }
+};
+
+void populateAmdgpuFoldMemRefOpsPatterns(RewritePatternSet &patterns,
+ PatternBenefit benefit) {
+ patterns.add<FoldMemRefOpsIntoGatherToLDSOp>(patterns.getContext(), benefit);
+}
+} // namespace mlir::amdgpu
diff --git a/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp b/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp
index 89be188af9129..24da447ad7685 100644
--- a/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp
+++ b/mlir/lib/Dialect/MemRef/Transforms/FoldMemRefAliasOps.cpp
@@ -44,97 +44,6 @@ using namespace mlir;
// Utility functions
//===----------------------------------------------------------------------===//
-/// Given the 'indices' of a load/store operation where the memref is a result
-/// of a expand_shape op, returns the indices w.r.t to the source memref of the
-/// expand_shape op. For example
-///
-/// %0 = ... : memref<12x42xf32>
-/// %1 = memref.expand_shape %0 [[0, 1], [2]]
-/// : memref<12x42xf32> into memref<2x6x42xf32>
-/// %2 = load %1[%i1, %i2, %i3] : memref<2x6x42xf32
-///
-/// could be folded into
-///
-/// %2 = load %0[6 * i1 + i2, %i3] :
-/// memref<12x42xf32>
-static LogicalResult resolveSourceIndicesExpandShape(
- Location loc, PatternRewriter &rewriter,
- memref::ExpandShapeOp expandShapeOp, ValueRange indices,
- SmallVectorImpl<Value> &sourceIndices, bool startsInbounds) {
- SmallVector<OpFoldResult> destShape = expandShapeOp.getMixedOutputShape();
-
- // Traverse all reassociation groups to determine the appropriate indices
- // corresponding to each one of them post op folding.
- for (ArrayRef<int64_t> group : expandShapeOp.getReassociationIndices()) {
- assert(!group.empty() && "association indices groups cannot be empty");
- int64_t groupSize = group.size();
- if (groupSize == 1) {
- sourceIndices.push_back(indices[group[0]]);
- continue;
- }
- SmallVector<OpFoldResult> groupBasis =
- llvm::map_to_vector(group, [&](int64_t d) { return destShape[d]; });
- SmallVector<Value> groupIndices =
- llvm::map_to_vector(group, [&](int64_t d) { return indices[d]; });
- Value collapsedIndex = rewriter.create<affine::AffineLinearizeIndexOp>(
- loc, groupIndices, groupBasis, /*disjoint=*/startsInbounds);
- sourceIndices.push_back(collapsedIndex);
- }
- return success();
-}
-
-/// Given the 'indices' of a load/store operation where the memref is a result
-/// of a collapse_shape op, returns the indices w.r.t to the source memref of
-/// the collapse_shape op. For example
-///
-/// %0 = ... : memref<2x6x42xf32>
-/// %1 = memref.collapse_shape %0 [[0, 1], [2]]
-/// : memref<2x6x42xf32> into memref<12x42xf32>
-/// %2 = load %1[%i1, %i2] : memref<12x42xf32>
-///
-/// could be folded into
-///
-/// %2 = load %0[%i1 / 6, %i1 % 6, %i2] :
-/// memref<2x6x42xf32>
-static LogicalResult
-resolveSourceIndicesCollapseShape(Location loc, PatternRewriter &rewriter,
- memref::CollapseShapeOp collapseShapeOp,
- ValueRange indices,
- SmallVectorImpl<Value> &sourceIndices) {
- // Note: collapse_shape requires a strided memref, we can do this.
- auto metadata = rewriter.create<memref::ExtractStridedMetadataOp>(
- loc, collapseShapeOp.getSrc());
- SmallVector<OpFoldResult> sourceSizes = metadata.getConstifiedMixedSizes();
- for (auto [index, group] :
- llvm::zip(indices, collapseShapeOp.getReassociationIndices())) {
- assert(!group.empty() && "association indices groups cannot be empty");
- int64_t groupSize = group.size();
-
- if (groupSize == 1) {
- sourceIndices.push_back(index);
- continue;
- }
-
- SmallVector<OpFoldResult> basis =
- llvm::map_to_vector(group, [&](int64_t d) { return sourceSizes[d]; });
- auto delinearize = rewriter.create<affine::AffineDelinearizeIndexOp>(
- loc, index, basis, /*hasOuterBound=*/true);
- llvm::append_range(sourceIndices, delinearize.getResults());
- }
- if (collapseShapeOp.getReassociationIndices().empty()) {
- auto zeroAffineMap = rewriter.getConstantAffineMap(0);
- int64_t srcRank =
- cast<MemRefType>(collapseShapeOp.getViewSource().getType()).getRank();
- OpFoldResult ofr = affine::makeComposedFoldedAffineApply(
- rewriter, loc, zeroAffineMap, ArrayRef<OpFoldResult>{});
- for (int64_t i = 0; i < srcRank; i++) {
- sourceIndices.push_back(
- getValueOrCreateConstantIndexOp(rewriter, loc, ofr));
- }
- }
- return success();
-}
-
/// Helpers to access the memref operand for each op.
template <typename LoadOrStoreOpTy>
static Value getMemRefOperand(LoadOrStoreOpTy op) {
diff --git a/mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp b/mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp
index a50b4cfc74708..97fe3cb5b4705 100644
--- a/mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp
+++ b/mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp
@@ -12,6 +12,7 @@
#include "mlir/Dialect/MemRef/Utils/MemRefUtils.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Arith/Utils/Utils.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Interfaces/ViewLikeInterface.h"
#include "llvm/ADT/STLExtras.h"
@@ -217,5 +218,70 @@ MemrefValue skipViewLikeOps(MemrefValue source) {
return source;
}
+LogicalResult resolveSourceIndicesExpandShape(
+ Location loc, PatternRewriter &rewriter,
+ memref::ExpandShapeOp expandShapeOp, ValueRange indices,
+ SmallVectorImpl<Value> &sourceIndices, bool startsInbounds) {
+ SmallVector<OpFoldResult> destShape = expandShapeOp.getMixedOutputShape();
+
+ // Traverse all reassociation groups to determine the appropriate indices
+ // corresponding to each one of them post op folding.
+ for (ArrayRef<int64_t> group : expandShapeOp.getReassociationIndices()) {
+ assert(!group.empty() && "association indices groups cannot be empty");
+ int64_t groupSize = group.size();
+ if (groupSize == 1) {
+ sourceIndices.push_back(indices[group[0]]);
+ continue;
+ }
+ SmallVector<OpFoldResult> groupBasis =
+ llvm::map_to_vector(group, [&](int64_t d) { return destShape[d]; });
+ SmallVector<Value> groupIndices =
+ llvm::map_to_vector(group, [&](int64_t d) { return indices[d]; });
+ Value collapsedIndex = rewriter.create<affine::AffineLinearizeIndexOp>(
+ loc, groupIndices, groupBasis, /*disjoint=*/startsInbounds);
+ sourceIndices.push_back(collapsedIndex);
+ }
+ return success();
+}
+
+LogicalResult
+resolveSourceIndicesCollapseShape(Location loc, PatternRewriter &rewriter,
+ memref::CollapseShapeOp collapseShapeOp,
+ ValueRange indices,
+ SmallVectorImpl<Value> &sourceIndices) {
+  // Note: collapse_shape requires a strided memref; extract its metadata here.
+ auto metadata = rewriter.create<memref::ExtractStridedMetadataOp>(
+ loc, collapseShapeOp.getSrc());
+ SmallVector<OpFoldResult> sourceSizes = metadata.getConstifiedMixedSizes();
+ for (auto [index, group] :
+ llvm::zip(indices, collapseShapeOp.getReassociationIndices())) {
+ assert(!group.empty() && "association indices groups cannot be empty");
+ int64_t groupSize = group.size();
+
+ if (groupSize == 1) {
+ sourceIndices.push_back(index);
+ continue;
+ }
+
+ SmallVector<OpFoldResult> basis =
+ llvm::map_to_vector(group, [&](int64_t d) { return sourceSizes[d]; });
+ auto delinearize = rewriter.create<affine::AffineDelinearizeIndexOp>(
+ loc, index, basis, /*hasOuterBound=*/true);
+ llvm::append_range(sourceIndices, delinearize.getResults());
+ }
+ if (collapseShapeOp.getReassociationIndices().empty()) {
+ auto zeroAffineMap = rewriter.getConstantAffineMap(0);
+ int64_t srcRank =
+ cast<MemRefType>(collapseShapeOp.getViewSource().getType()).getRank();
+ OpFoldResult ofr = affine::makeComposedFoldedAffineApply(
+ rewriter, loc, zeroAffineMap, ArrayRef<OpFoldResult>{});
+ for (int64_t i = 0; i < srcRank; i++) {
+ sourceIndices.push_back(
+ getValueOrCreateConstantIndexOp(rewriter, loc, ofr));
+ }
+ }
+ return success();
+}
+
} // namespace memref
} // namespace mlir
diff --git a/mlir/test/Dialect/AMDGPU/amdgpu-fold-memrefs.mlir b/mlir/test/Dialect/AMDGPU/amdgpu-fold-memrefs.mlir
new file mode 100644
index 0000000000000..57afa127c9da8
--- /dev/null
+++ b/mlir/test/Dialect/AMDGPU/amdgpu-fold-memrefs.mlir
@@ -0,0 +1,94 @@
+// RUN: mlir-opt --amdgpu-fold-memrefs-ops --split-input-file %s | FileCheck %s
+
+#gpu_lds_addrspace = 3
+
+// CHECK: func @test_subview_folding
+// CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index
+func.func @test_subview_folding(%offset_i: index, %offset_j: index) {
+ // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, 3>
+ // CHECK: %[[MEM:.*]] = memref.alloc() : memref<64x128xf16>
+ // CHECK: %[[C0:.*]] = arith.constant 0 : index
+ // CHECK: amdgpu.gather_to_lds %[[MEM]][%[[ARG0]], %[[ARG1]]], %[[LOCAL]][%[[C0]], %[[C0]]]
+ // CHECK-SAME: vector<8xf16>, memref<64x128xf16>, memref<64x64xf16, 3>
+
+ %alloc = memref.alloc() : memref<64x64xf16, #gpu_lds_addrspace>
+ %mem = memref.alloc() : memref<64x128xf16>
+ %subview = memref.subview %mem[0, 0][32, 64][1, 1] : memref<64x128xf16> to memref<32x64xf16, strided<[128, 1]>>
+ %c0 = arith.constant 0 : index
+ amdgpu.gather_to_lds %subview[%offset_i, %offset_j], %alloc[%c0, %c0]
+ : vector<8xf16>, memref<32x64xf16, strided<[128, 1]>>, memref<64x64xf16, #gpu_lds_addrspace>
+ func.return
+}
+
+// -----
+
+#gpu_lds_addrspace = 3
+
+// CHECK: #[[MAP:.*]] = affine_map<()[s0] -> (s0 + 32)>
+// CHECK: #[[MAP1:.*]] = affine_map<()[s0] -> (s0 + 64)>
+
+// CHECK: func @subview_folding_offset
+// CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index
+func.func @subview_folding_offset(%offset_i: index, %offset_j: index) {
+ // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, 3>
+ // CHECK: %[[MEM:.*]] = memref.alloc() : memref<64x128xf16>
+ // CHECK: %[[C0:.*]] = arith.constant 0 : index
+ // CHECK: %[[IDX0:.*]] = affine.apply #[[MAP]]()[%[[ARG0]]]
+ // CHECK: %[[IDX1:.*]] = affine.apply #[[MAP1]]()[%[[ARG1]]]
+ // CHECK: amdgpu.gather_to_lds %[[MEM]][%[[IDX0]], %[[IDX1]]], %[[LOCAL]][%[[C0]], %[[C0]]]
+ // CHECK-SAME: vector<8xf16>, memref<64x128xf16>, memref<64x64xf16, 3>
+
+ %alloc = memref.alloc() : memref<64x64xf16, #gpu_lds_addrspace>
+ %mem = memref.alloc() : memref<64x128xf16>
+ %subview = memref.subview %mem[32, 64][32, 64][1, 1] : memref<64x128xf16> to memref<32x64xf16, strided<[128, 1], offset: 4160>>
+ %c0 = arith.constant 0 : index
+ amdgpu.gather_to_lds %subview[%offset_i, %offset_j], %alloc[%c0, %c0]
+ : vector<8xf16>, memref<32x64xf16, strided<[128, 1], offset: 4160>>, memref<64x64xf16, #gpu_lds_addrspace>
+ func.return
+}
+
+// -----
+
+#gpu_lds_addrspace = 3
+
+// CHECK: func @test_expand_shape
+// CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index
+func.func @test_expand_shape(%offset_i: index, %offset_j: index) {
+ // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, 3>
+ // CHECK: %[[MEM:.*]] = memref.alloc() : memref<8192xf16>
+ // CHECK: %[[C0:.*]] = arith.constant 0 : index
+ // CHECK: %[[IDX:.*]] = affine.linearize_index [%[[ARG0]], %[[ARG1]]] by (64, 128) : index
+ // CHECK: amdgpu.gather_to_lds %[[MEM]][%[[IDX]]], %[[LOCAL]][%[[C0]], %[[C0]]]
+ // CHECK-SAME: vector<8xf16>, memref<8192xf16>, memref<64x64xf16, 3>
+
+ %alloc = memref.alloc() : memref<64x64xf16, #gpu_lds_addrspace>
+ %mem = memref.alloc() : memref<8192xf16>
+ %expand = memref.expand_shape %mem [[0, 1]] output_shape [64, 128] : memref<8192xf16> into memref<64x128xf16>
+ %c0 = arith.constant 0 : index
+ amdgpu.gather_to_lds %expand[%offset_i, %offset_j], %alloc[%c0, %c0]
+ : vector<8xf16>, memref<64x128xf16>, memref<64x64xf16, #gpu_lds_addrspace>
+ func.return
+}
+
+// -----
+
+#gpu_lds_addrspace = 3
+
+// CHECK: func @test_collapse_shape
+// CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index
+func.func @test_collapse_shape(%offset_i: index, %offset_j: index) {
+ // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, 3>
+ // CHECK: %[[MEM:.*]] = memref.alloc() : memref<64x128xf16>
+ // CHECK: %[[C0:.*]] = arith.constant 0 : index
+ // CHECK: %[[INDICES:.*]]:2 = affine.delinearize_index %[[ARG0]] into (64, 128) : index, index
+ // CHECK: amdgpu.gather_to_lds %[[MEM]][%[[INDICES]]#0, %[[INDICES]]#1], %[[LOCAL]][%[[C0]], %[[C0]]]
+ // CHECK-SAME: vector<8xf16>, memref<64x128xf16>, memref<64x64xf16, 3>
+
+ %alloc = memref.alloc() : memref<64x64xf16, #gpu_lds_addrspace>
+ %mem = memref.alloc() : memref<64x128xf16>
+ %collapse = memref.collapse_shape %mem [[0, 1]] : memref<64x128xf16> into memref<8192xf16>
+ %c0 = arith.constant 0 : index
+ amdgpu.gather_to_lds %collapse[%offset_i], %alloc[%c0, %c0]
+ : vector<8xf16>, memref<8192xf16>, memref<64x64xf16, #gpu_lds_addrspace>
+ func.return
+}
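
For reference, a hypothetical input (not part of this commit's tests) that the
pattern is expected to leave unchanged: the source of `amdgpu.gather_to_lds`
comes directly from `memref.alloc` rather than from a
subview/expand_shape/collapse_shape, so `FoldMemRefOpsIntoGatherToLDSOp`
reports a match failure and the op stays as written.

  // Hypothetical example: no view-producing op feeds the gather, so nothing folds.
  func.func @no_fold_direct_alloc(%offset_i: index, %offset_j: index) {
    %alloc = memref.alloc() : memref<64x64xf16, 3>
    %mem = memref.alloc() : memref<64x128xf16>
    %c0 = arith.constant 0 : index
    amdgpu.gather_to_lds %mem[%offset_i, %offset_j], %alloc[%c0, %c0]
      : vector<8xf16>, memref<64x128xf16>, memref<64x64xf16, 3>
    func.return
  }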