[Mlir-commits] [mlir] fe52a6c - [AMDGPU][MLIR] Add FoldMemRefOpsIntoGlobalLoadAsyncToLDSOp (#195982)
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Wed May 6 19:01:17 PDT 2026
Author: Eric
Date: 2026-05-06T19:01:12-07:00
New Revision: fe52a6c99b6b1fa8222b28ffc70d9f43effdfbca
URL: https://github.com/llvm/llvm-project/commit/fe52a6c99b6b1fa8222b28ffc70d9f43effdfbca
DIFF: https://github.com/llvm/llvm-project/commit/fe52a6c99b6b1fa8222b28ffc70d9f43effdfbca.diff
LOG: [AMDGPU][MLIR] Add FoldMemRefOpsIntoGlobalLoadAsyncToLDSOp (#195982)
Add folders for memref aliases for gfx1250 global async to lds.
Assisted by: cursor
Signed-off-by: Eric Feng <Eric.Feng at amd.com>
Added:
Modified:
mlir/lib/Dialect/AMDGPU/Transforms/FoldMemRefsOps.cpp
mlir/test/Dialect/AMDGPU/amdgpu-fold-memrefs.mlir
Removed:
################################################################################
diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/FoldMemRefsOps.cpp b/mlir/lib/Dialect/AMDGPU/Transforms/FoldMemRefsOps.cpp
index 8570927a77794..53cb673ced999 100644
--- a/mlir/lib/Dialect/AMDGPU/Transforms/FoldMemRefsOps.cpp
+++ b/mlir/lib/Dialect/AMDGPU/Transforms/FoldMemRefsOps.cpp
@@ -103,6 +103,45 @@ struct FoldMemRefOpsIntoGatherToLDSOp final : OpRewritePattern<GatherToLDSOp> {
}
};
+struct FoldMemRefOpsIntoGlobalLoadAsyncToLDSOp final
+ : OpRewritePattern<GlobalLoadAsyncToLDSOp> {
+ using Base::Base;
+ LogicalResult matchAndRewrite(GlobalLoadAsyncToLDSOp op,
+ PatternRewriter &rewriter) const override {
+ Location loc = op.getLoc();
+
+ SmallVector<Value> sourceIndices, destIndices;
+ Value memrefSource, memrefDest;
+
+ auto foldSrcResult =
+ foldMemrefViewOp(rewriter, loc, op.getSrc(), op.getSrcIndices(),
+ sourceIndices, memrefSource, "source");
+
+ if (failed(foldSrcResult)) {
+ memrefSource = op.getSrc();
+ sourceIndices = op.getSrcIndices();
+ }
+
+ auto foldDstResult =
+ foldMemrefViewOp(rewriter, loc, op.getDst(), op.getDstIndices(),
+ destIndices, memrefDest, "destination");
+
+ if (failed(foldDstResult)) {
+ memrefDest = op.getDst();
+ destIndices = op.getDstIndices();
+ }
+
+ if (failed(foldSrcResult) && failed(foldDstResult))
+ return rewriter.notifyMatchFailure(op, "no fold found");
+
+ rewriter.replaceOpWithNewOp<GlobalLoadAsyncToLDSOp>(
+ op, memrefSource, sourceIndices, memrefDest, destIndices,
+ op.getTransferType(), op.getMask());
+
+ return success();
+ }
+};
+
template <typename OpTy>
struct FoldMemRefOpsIntoDmaBaseOp final : OpRewritePattern<OpTy> {
using OpRewritePattern<OpTy>::OpRewritePattern;
@@ -160,6 +199,7 @@ struct FoldMemRefOpsIntoTransposeLoadOp final
void populateAmdgpuFoldMemRefOpsPatterns(RewritePatternSet &patterns,
PatternBenefit benefit) {
patterns.add<FoldMemRefOpsIntoGatherToLDSOp,
+ FoldMemRefOpsIntoGlobalLoadAsyncToLDSOp,
FoldMemRefOpsIntoDmaBaseOp<MakeDmaBaseOp>,
FoldMemRefOpsIntoDmaBaseOp<MakeGatherDmaBaseOp>,
FoldMemRefOpsIntoTransposeLoadOp>(patterns.getContext(),
diff --git a/mlir/test/Dialect/AMDGPU/amdgpu-fold-memrefs.mlir b/mlir/test/Dialect/AMDGPU/amdgpu-fold-memrefs.mlir
index 4fc6bc1846c3d..1274fe59f8be5 100644
--- a/mlir/test/Dialect/AMDGPU/amdgpu-fold-memrefs.mlir
+++ b/mlir/test/Dialect/AMDGPU/amdgpu-fold-memrefs.mlir
@@ -418,3 +418,53 @@ func.func @test_make_gather_dma_base_nop(%mem: memref<64x128xf16, #gpu_global_ad
: memref<64x128xf16, #gpu_global_addrspace>, memref<64x64xf16, #gpu_lds_addrspace> -> !amdgpu.tdm_gather_base<f16, i16>
func.return
}
+
+// -----
+
+#gpu_lds_addrspace = #gpu.address_space<workgroup>
+#gpu_global_addrspace = #gpu.address_space<global>
+
+// CHECK: #[[GLOBAL_ASYNC_MAP:.*]] = affine_map<()[s0] -> (s0 + 32)>
+// CHECK: #[[GLOBAL_ASYNC_MAP1:.*]] = affine_map<()[s0] -> (s0 + 64)>
+
+// CHECK: func @test_global_load_async_to_lds_both_fold_masked
+// CHECK-SAME: %[[SRC:.*]]: memref<64x128xf32, #gpu.address_space<global>>, %[[LDS:.*]]: memref<4096xf32, #gpu.address_space<workgroup>>, %[[GLOBAL_I:.*]]: index, %[[GLOBAL_J:.*]]: index, %[[LDS_I:.*]]: index, %[[LDS_J:.*]]: index, %[[MASK:.*]]: i1
+func.func @test_global_load_async_to_lds_both_fold_masked(%src: memref<64x128xf32, #gpu_global_addrspace>, %lds: memref<4096xf32, #gpu_lds_addrspace>, %global_i: index, %global_j: index, %lds_i: index, %lds_j: index, %mask: i1) {
+ // CHECK: %[[GI:.*]] = affine.apply #[[GLOBAL_ASYNC_MAP]]()[%[[GLOBAL_I]]]
+ // CHECK: %[[GJ:.*]] = affine.apply #[[GLOBAL_ASYNC_MAP1]]()[%[[GLOBAL_J]]]
+ // CHECK: %[[LDS_IDX:.*]] = affine.linearize_index [%[[LDS_I]], %[[LDS_J]]] by (64, 64) : index
+ // CHECK: amdgpu.global_load_async_to_lds %[[SRC]][%[[GI]], %[[GJ]]], %[[LDS]][%[[LDS_IDX]]], %[[MASK]]
+ // CHECK-SAME: vector<4xf32>, memref<64x128xf32, #gpu.address_space<global>>, memref<4096xf32, #gpu.address_space<workgroup>>
+
+ %subview = memref.subview %src[32, 64][32, 64][1, 1]
+ : memref<64x128xf32, #gpu_global_addrspace>
+ to memref<32x64xf32, strided<[128, 1], offset: 4160>, #gpu_global_addrspace>
+ %expand_lds = memref.expand_shape %lds [[0, 1]] output_shape [64, 64]
+ : memref<4096xf32, #gpu_lds_addrspace>
+ into memref<64x64xf32, #gpu_lds_addrspace>
+ amdgpu.global_load_async_to_lds %subview[%global_i, %global_j], %expand_lds[%lds_i, %lds_j], %mask
+ : vector<4xf32>, memref<32x64xf32, strided<[128, 1], offset: 4160>, #gpu_global_addrspace>,
+ memref<64x64xf32, #gpu_lds_addrspace>
+ func.return
+}
+
+// -----
+
+#gpu_lds_addrspace = #gpu.address_space<workgroup>
+#gpu_global_addrspace = #gpu.address_space<global>
+
+// CHECK: func @test_global_load_async_to_lds_no_mask_dst_collapse
+// CHECK-SAME: %[[SRC:.*]]: memref<8192xi32, #gpu.address_space<global>>, %[[LDS:.*]]: memref<64x64xi32, #gpu.address_space<workgroup>>, %[[SRC_IDX:.*]]: index, %[[DST_IDX:.*]]: index
+func.func @test_global_load_async_to_lds_no_mask_dst_collapse(%src: memref<8192xi32, #gpu_global_addrspace>, %lds: memref<64x64xi32, #gpu_lds_addrspace>, %src_idx: index, %dst_idx: index) {
+ // CHECK: %[[INDICES:.*]]:2 = affine.delinearize_index %[[DST_IDX]] into (64, 64) : index, index
+ // CHECK: amdgpu.global_load_async_to_lds %[[SRC]][%[[SRC_IDX]]], %[[LDS]][%[[INDICES]]#0, %[[INDICES]]#1] :
+ // CHECK-SAME: i32, memref<8192xi32, #gpu.address_space<global>>, memref<64x64xi32, #gpu.address_space<workgroup>>
+
+ %collapse_lds = memref.collapse_shape %lds [[0, 1]]
+ : memref<64x64xi32, #gpu_lds_addrspace>
+ into memref<4096xi32, #gpu_lds_addrspace>
+ amdgpu.global_load_async_to_lds %src[%src_idx], %collapse_lds[%dst_idx]
+ : i32, memref<8192xi32, #gpu_global_addrspace>,
+ memref<4096xi32, #gpu_lds_addrspace>
+ func.return
+}
More information about the Mlir-commits
mailing list