[Mlir-commits] [mlir] 247a9bf - [mlir][AMDGPU] Add folders for memref aliases to TDM base creation (#184567)
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Wed Mar 4 07:26:46 PST 2026
Author: Krzysztof Drewniak
Date: 2026-03-04T07:26:40-08:00
New Revision: 247a9bfc26ad1f062c52f7b4625ec0e0710a6424
URL: https://github.com/llvm/llvm-project/commit/247a9bfc26ad1f062c52f7b4625ec0e0710a6424
DIFF: https://github.com/llvm/llvm-project/commit/247a9bfc26ad1f062c52f7b4625ec0e0710a6424.diff
LOG: [mlir][AMDGPU] Add folders for memref aliases to TDM base creation (#184567)
The TDM base creation ops (amdgpu.make_dma_base and
amdgpu.make_gather_dma_base) take references to a
`%memref[%i0, %i1, ...]` for the starting point of the tiles in
global/shared memory that the TDM descriptor refers to. Memory alias ops
can be safely folded into these operations, since these two memref
operands are just pointers to a scalar starting point and don't have
semantics that depend on the memref layout (except to the extent that it
defines a location in memory).
While I'm here, I've cleaned up a few things: I fixed the incorrect file
header and changed the tests to not use integer address spaces.
Co-authored-by: Claude Opus 4.6 <noreply at anthropic.com>
Added:
Modified:
mlir/lib/Dialect/AMDGPU/Transforms/FoldMemRefsOps.cpp
mlir/test/Dialect/AMDGPU/amdgpu-fold-memrefs.mlir
Removed:
################################################################################
diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/FoldMemRefsOps.cpp b/mlir/lib/Dialect/AMDGPU/Transforms/FoldMemRefsOps.cpp
index 19b4341cc27a1..8570927a77794 100644
--- a/mlir/lib/Dialect/AMDGPU/Transforms/FoldMemRefsOps.cpp
+++ b/mlir/lib/Dialect/AMDGPU/Transforms/FoldMemRefsOps.cpp
@@ -1,4 +1,4 @@
-//===- FoldSubviewOps.cpp - AMDGPU fold subview ops -----------------------===//
+//===- FoldMemRefsOps.cpp - AMDGPU fold memref ops ------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -92,6 +92,9 @@ struct FoldMemRefOpsIntoGatherToLDSOp final : OpRewritePattern<GatherToLDSOp> {
destIndices = op.getDstIndices();
}
+ if (failed(foldSrcResult) && failed(foldDstResult))
+ return rewriter.notifyMatchFailure(op, "no fold found");
+
rewriter.replaceOpWithNewOp<GatherToLDSOp>(
op, memrefSource, sourceIndices, memrefDest, destIndices,
op.getTransferType(), op.getAsync());
@@ -100,6 +103,41 @@ struct FoldMemRefOpsIntoGatherToLDSOp final : OpRewritePattern<GatherToLDSOp> {
}
};
+template <typename OpTy>
+struct FoldMemRefOpsIntoDmaBaseOp final : OpRewritePattern<OpTy> {
+ using OpRewritePattern<OpTy>::OpRewritePattern;
+ LogicalResult matchAndRewrite(OpTy op,
+ PatternRewriter &rewriter) const override {
+ Location loc = op.getLoc();
+
+ SmallVector<Value> globalIndices, ldsIndices;
+ Value globalBase, ldsBase;
+
+ LogicalResult didFoldGlobal =
+ foldMemrefViewOp(rewriter, loc, op.getGlobal(), op.getGlobalIndices(),
+ globalIndices, globalBase, "global");
+ if (failed(didFoldGlobal)) {
+ globalBase = op.getGlobal();
+ globalIndices = op.getGlobalIndices();
+ }
+
+ LogicalResult didFoldLds =
+ foldMemrefViewOp(rewriter, loc, op.getLds(), op.getLdsIndices(),
+ ldsIndices, ldsBase, "lds");
+ if (failed(didFoldLds)) {
+ ldsBase = op.getLds();
+ ldsIndices = op.getLdsIndices();
+ }
+
+ if (failed(didFoldGlobal) && failed(didFoldLds))
+ return rewriter.notifyMatchFailure(op, "no fold found");
+
+ rewriter.replaceOpWithNewOp<OpTy>(op, op.getBase().getType(), globalBase,
+ globalIndices, ldsBase, ldsIndices);
+ return success();
+ }
+};
+
struct FoldMemRefOpsIntoTransposeLoadOp final
: OpRewritePattern<TransposeLoadOp> {
using Base::Base;
@@ -121,8 +159,10 @@ struct FoldMemRefOpsIntoTransposeLoadOp final
void populateAmdgpuFoldMemRefOpsPatterns(RewritePatternSet &patterns,
PatternBenefit benefit) {
- patterns
- .add<FoldMemRefOpsIntoGatherToLDSOp, FoldMemRefOpsIntoTransposeLoadOp>(
- patterns.getContext(), benefit);
+ patterns.add<FoldMemRefOpsIntoGatherToLDSOp,
+ FoldMemRefOpsIntoDmaBaseOp<MakeDmaBaseOp>,
+ FoldMemRefOpsIntoDmaBaseOp<MakeGatherDmaBaseOp>,
+ FoldMemRefOpsIntoTransposeLoadOp>(patterns.getContext(),
+ benefit);
}
} // namespace mlir::amdgpu
diff --git a/mlir/test/Dialect/AMDGPU/amdgpu-fold-memrefs.mlir b/mlir/test/Dialect/AMDGPU/amdgpu-fold-memrefs.mlir
index 11571b060a0b8..4fc6bc1846c3d 100644
--- a/mlir/test/Dialect/AMDGPU/amdgpu-fold-memrefs.mlir
+++ b/mlir/test/Dialect/AMDGPU/amdgpu-fold-memrefs.mlir
@@ -1,15 +1,15 @@
// RUN: mlir-opt --amdgpu-fold-memrefs-ops --split-input-file %s | FileCheck %s
-#gpu_lds_addrspace = 3
+#gpu_lds_addrspace = #gpu.address_space<workgroup>
// CHECK: func @test_subview_folding
// CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index
func.func @test_subview_folding(%offset_i: index, %offset_j: index) {
- // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, 3>
+ // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
// CHECK: %[[MEM:.*]] = memref.alloc() : memref<64x128xf16>
// CHECK: %[[C0:.*]] = arith.constant 0 : index
// CHECK: amdgpu.gather_to_lds %[[MEM]][%[[ARG0]], %[[ARG1]]], %[[LOCAL]][%[[C0]], %[[C0]]]
- // CHECK-SAME: vector<8xf16>, memref<64x128xf16>, memref<64x64xf16, 3>
+ // CHECK-SAME: vector<8xf16>, memref<64x128xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
%alloc = memref.alloc() : memref<64x64xf16, #gpu_lds_addrspace>
%mem = memref.alloc() : memref<64x128xf16>
@@ -22,7 +22,7 @@ func.func @test_subview_folding(%offset_i: index, %offset_j: index) {
// -----
-#gpu_lds_addrspace = 3
+#gpu_lds_addrspace = #gpu.address_space<workgroup>
// CHECK: #[[MAP:.*]] = affine_map<()[s0] -> (s0 + 32)>
// CHECK: #[[MAP1:.*]] = affine_map<()[s0] -> (s0 + 64)>
@@ -30,13 +30,13 @@ func.func @test_subview_folding(%offset_i: index, %offset_j: index) {
// CHECK: func @subview_folding_offset
// CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index
func.func @subview_folding_offset(%offset_i: index, %offset_j: index) {
- // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, 3>
+ // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
// CHECK: %[[MEM:.*]] = memref.alloc() : memref<64x128xf16>
// CHECK: %[[C0:.*]] = arith.constant 0 : index
// CHECK: %[[IDX0:.*]] = affine.apply #[[MAP]]()[%[[ARG0]]]
// CHECK: %[[IDX1:.*]] = affine.apply #[[MAP1]]()[%[[ARG1]]]
// CHECK: amdgpu.gather_to_lds %[[MEM]][%[[IDX0]], %[[IDX1]]], %[[LOCAL]][%[[C0]], %[[C0]]]
- // CHECK-SAME: vector<8xf16>, memref<64x128xf16>, memref<64x64xf16, 3>
+ // CHECK-SAME: vector<8xf16>, memref<64x128xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
%alloc = memref.alloc() : memref<64x64xf16, #gpu_lds_addrspace>
%mem = memref.alloc() : memref<64x128xf16>
@@ -49,18 +49,18 @@ func.func @subview_folding_offset(%offset_i: index, %offset_j: index) {
// -----
-#gpu_lds_addrspace = 3
+#gpu_lds_addrspace = #gpu.address_space<workgroup>
// CHECK: func @test_expand_shape
// CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index
func.func @test_expand_shape(%offset_i: index, %offset_j: index) {
- // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<4096xf16, 3>
+ // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<4096xf16, #gpu.address_space<workgroup>>
// CHECK: %[[MEM:.*]] = memref.alloc() : memref<8192xf16>
// CHECK: %[[C0:.*]] = arith.constant 0 : index
// CHECK: %[[IDXM:.*]] = affine.linearize_index [%[[ARG0]], %[[ARG1]]] by (64, 128) : index
// CHECK: %[[IDXL:.*]] = affine.linearize_index [%[[C0]], %[[C0]]] by (64, 64) : index
// CHECK: amdgpu.gather_to_lds %[[MEM]][%[[IDXM]]], %[[LOCAL]][%[[IDXL]]]
- // CHECK-SAME: vector<8xf16>, memref<8192xf16>, memref<4096xf16, 3>
+ // CHECK-SAME: vector<8xf16>, memref<8192xf16>, memref<4096xf16, #gpu.address_space<workgroup>>
%alloc = memref.alloc() : memref<4096xf16, #gpu_lds_addrspace>
%mem = memref.alloc() : memref<8192xf16>
@@ -74,18 +74,18 @@ func.func @test_expand_shape(%offset_i: index, %offset_j: index) {
// -----
-#gpu_lds_addrspace = 3
+#gpu_lds_addrspace = #gpu.address_space<workgroup>
// CHECK: func @test_collapse_shape
// CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index
func.func @test_collapse_shape(%offset_i: index, %offset_j: index) {
- // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, 3>
+ // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
// CHECK: %[[MEM:.*]] = memref.alloc() : memref<64x128xf16>
// CHECK: %[[C0:.*]] = arith.constant 0 : index
// CHECK: %[[INDICES_MEM:.*]]:2 = affine.delinearize_index %[[ARG0]] into (64, 128) : index, index
// CHECK: %[[INDICES_LDS:.*]]:2 = affine.delinearize_index %[[ARG1]] into (64, 64) : index, index
// CHECK: amdgpu.gather_to_lds %[[MEM]][%[[INDICES_MEM]]#0, %[[INDICES_MEM]]#1], %[[LOCAL]][%[[INDICES_LDS]]#0, %[[INDICES_LDS]]#1]
- // CHECK-SAME: vector<8xf16>, memref<64x128xf16>, memref<64x64xf16, 3>
+ // CHECK-SAME: vector<8xf16>, memref<64x128xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
%alloc = memref.alloc() : memref<64x64xf16, #gpu_lds_addrspace>
%collapse_alloc = memref.collapse_shape %alloc [[0, 1]] : memref<64x64xf16, #gpu_lds_addrspace> into memref<4096xf16, #gpu_lds_addrspace>
@@ -100,17 +100,17 @@ func.func @test_collapse_shape(%offset_i: index, %offset_j: index) {
// -----
-#gpu_lds_addrspace = 3
+#gpu_lds_addrspace = #gpu.address_space<workgroup>
// CHECK: func @test_expand_shape_src_raw_buffer
// CHECK-SAME: %[[ARG0:.*]]: memref<8192xf16, #amdgpu.address_space<fat_raw_buffer>>, %[[ARG1:.*]]: index, %[[ARG2:.*]]: index
func.func @test_expand_shape_src_raw_buffer(%mem : memref<8192xf16, #amdgpu.address_space<fat_raw_buffer>>, %offset_i: index, %offset_j: index) {
- // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<4096xf16, 3>
+ // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<4096xf16, #gpu.address_space<workgroup>>
// CHECK: %[[C0:.*]] = arith.constant 0 : index
// CHECK: %[[IDXM:.*]] = affine.linearize_index [%[[ARG1]], %[[ARG2]]] by (64, 128) : index
// CHECK: amdgpu.gather_to_lds %[[ARG0]][%[[IDXM]]], %[[LOCAL]][%[[C0]]]
- // CHECK-SAME: vector<8xf16>, memref<8192xf16, #amdgpu.address_space<fat_raw_buffer>>, memref<4096xf16, 3>
+ // CHECK-SAME: vector<8xf16>, memref<8192xf16, #amdgpu.address_space<fat_raw_buffer>>, memref<4096xf16, #gpu.address_space<workgroup>>
%alloc = memref.alloc() : memref<4096xf16, #gpu_lds_addrspace>
%expand_mem = memref.expand_shape %mem [[0, 1]] output_shape [64, 128] : memref<8192xf16, #amdgpu.address_space<fat_raw_buffer>> into memref<64x128xf16, #amdgpu.address_space<fat_raw_buffer>>
@@ -123,17 +123,17 @@ func.func @test_expand_shape_src_raw_buffer(%mem : memref<8192xf16, #amdgpu.addr
// -----
-#gpu_lds_addrspace = 3
+#gpu_lds_addrspace = #gpu.address_space<workgroup>
// CHECK: func @test_expand_shape_dst_only
// CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index
func.func @test_expand_shape_dst_only(%offset_i: index, %offset_j: index) {
- // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<4096xf16, 3>
+ // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<4096xf16, #gpu.address_space<workgroup>>
// CHECK: %[[MEM:.*]] = memref.alloc() : memref<8192xf16>
// CHECK: %[[C0:.*]] = arith.constant 0 : index
// CHECK: %[[IDX_LDS:.*]] = affine.linearize_index [%[[ARG1]], %[[C0]]] by (64, 64) : index
// CHECK: amdgpu.gather_to_lds %[[MEM]][%[[ARG0]]], %[[LOCAL]][%[[IDX_LDS]]]
- // CHECK-SAME: vector<8xf16>, memref<8192xf16>, memref<4096xf16, 3>
+ // CHECK-SAME: vector<8xf16>, memref<8192xf16>, memref<4096xf16, #gpu.address_space<workgroup>>
%alloc = memref.alloc() : memref<4096xf16, #gpu_lds_addrspace>
%mem = memref.alloc() : memref<8192xf16>
@@ -147,14 +147,14 @@ func.func @test_expand_shape_dst_only(%offset_i: index, %offset_j: index) {
// -----
-#gpu_lds_addrspace = 3
+#gpu_lds_addrspace = #gpu.address_space<workgroup>
// CHECK: func @test_nop
// CHECK-SAME: %[[ARG0:.*]]: memref<8192xf16, #amdgpu.address_space<fat_raw_buffer>>, %[[ARG1:.*]]: index, %[[ARG2:.*]]: index
func.func @test_nop(%mem : memref<8192xf16, #amdgpu.address_space<fat_raw_buffer>>, %offset_i: index, %offset_j: index) {
- // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<4096xf16, 3>
+ // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<4096xf16, #gpu.address_space<workgroup>>
// CHECK: amdgpu.gather_to_lds %[[ARG0]][%[[ARG1]]], %[[LOCAL]][%[[ARG2]]]
- // CHECK-SAME: vector<8xf16>, memref<8192xf16, #amdgpu.address_space<fat_raw_buffer>>, memref<4096xf16, 3>
+ // CHECK-SAME: vector<8xf16>, memref<8192xf16, #amdgpu.address_space<fat_raw_buffer>>, memref<4096xf16, #gpu.address_space<workgroup>>
%alloc = memref.alloc() : memref<4096xf16, #gpu_lds_addrspace>
amdgpu.gather_to_lds %mem[%offset_i], %alloc[%offset_j]
@@ -164,16 +164,16 @@ func.func @test_nop(%mem : memref<8192xf16, #amdgpu.address_space<fat_raw_buffer
// -----
-#gpu_lds_addrspace = 3
+#gpu_lds_addrspace = #gpu.address_space<workgroup>
// CHECK: func @test_async_flag_preserved
// CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index
func.func @test_async_flag_preserved(%offset_i: index, %offset_j: index) {
- // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, 3>
+ // CHECK: %[[LOCAL:.*]] = memref.alloc() : memref<64x64xf16, #gpu.address_space<workgroup>>
// CHECK: %[[MEM:.*]] = memref.alloc() : memref<64x128xf16>
// CHECK: %[[C0:.*]] = arith.constant 0 : index
// CHECK: amdgpu.gather_to_lds async %[[MEM]][%[[ARG0]], %[[ARG1]]], %[[LOCAL]][%[[C0]], %[[C0]]]
- // CHECK-SAME: vector<8xf16>, memref<64x128xf16>, memref<64x64xf16, 3>
+ // CHECK-SAME: vector<8xf16>, memref<64x128xf16>, memref<64x64xf16, #gpu.address_space<workgroup>>
%alloc = memref.alloc() : memref<64x64xf16, #gpu_lds_addrspace>
%mem = memref.alloc() : memref<64x128xf16>
@@ -286,3 +286,135 @@ func.func @test_transpose_load_nop(%offset_i: index, %offset_j: index) -> vector
: memref<32x128xf16, #gpu_wg> -> vector<4xf16>
return %result : vector<4xf16>
}
+
+// -----
+
+#gpu_lds_addrspace = #gpu.address_space<workgroup>
+#gpu_global_addrspace = #gpu.address_space<global>
+
+// CHECK: func @test_make_dma_base_subview
+// CHECK-SAME: %[[MEM:.*]]: memref<64x128xf16, #gpu.address_space<global>>, %[[LDS:.*]]: memref<64x64xf16, #gpu.address_space<workgroup>>, %[[GLOBAL_I:.*]]: index, %[[GLOBAL_J:.*]]: index, %[[LDS_I:.*]]: index, %[[LDS_J:.*]]: index
+func.func @test_make_dma_base_subview(%mem: memref<64x128xf16, #gpu_global_addrspace>, %lds: memref<64x64xf16, #gpu_lds_addrspace>, %global_i: index, %global_j: index, %lds_i: index, %lds_j: index) {
+ // CHECK: amdgpu.make_dma_base %[[MEM]][%[[GLOBAL_I]], %[[GLOBAL_J]]], %[[LDS]][%[[LDS_I]], %[[LDS_J]]]
+ // CHECK-SAME: memref<64x128xf16, #gpu.address_space<global>>, memref<64x64xf16, #gpu.address_space<workgroup>> -> !amdgpu.tdm_base<f16>
+
+ %subview = memref.subview %mem[0, 0][32, 64][1, 1] : memref<64x128xf16, #gpu_global_addrspace> to memref<32x64xf16, strided<[128, 1]>, #gpu_global_addrspace>
+ %base = amdgpu.make_dma_base %subview[%global_i, %global_j], %lds[%lds_i, %lds_j]
+ : memref<32x64xf16, strided<[128, 1]>, #gpu_global_addrspace>, memref<64x64xf16, #gpu_lds_addrspace> -> !amdgpu.tdm_base<f16>
+ func.return
+}
+
+// -----
+
+#gpu_lds_addrspace = #gpu.address_space<workgroup>
+#gpu_global_addrspace = #gpu.address_space<global>
+
+// CHECK: func @test_make_dma_base_expand_shape
+// CHECK-SAME: %[[MEM:.*]]: memref<64x128xf16, #gpu.address_space<global>>, %[[LDS:.*]]: memref<4096xf16, #gpu.address_space<workgroup>>, %[[GLOBAL_I:.*]]: index, %[[GLOBAL_J:.*]]: index, %[[LDS_I:.*]]: index, %[[LDS_J:.*]]: index
+func.func @test_make_dma_base_expand_shape(%mem: memref<64x128xf16, #gpu_global_addrspace>, %lds: memref<4096xf16, #gpu_lds_addrspace>, %global_i: index, %global_j: index, %lds_i: index, %lds_j: index) {
+ // CHECK: %[[IDX:.*]] = affine.linearize_index [%[[LDS_I]], %[[LDS_J]]] by (64, 64) : index
+ // CHECK: amdgpu.make_dma_base %[[MEM]][%[[GLOBAL_I]], %[[GLOBAL_J]]], %[[LDS]][%[[IDX]]]
+ // CHECK-SAME: memref<64x128xf16, #gpu.address_space<global>>, memref<4096xf16, #gpu.address_space<workgroup>> -> !amdgpu.tdm_base<f16>
+
+ %expand_lds = memref.expand_shape %lds [[0, 1]] output_shape [64, 64] : memref<4096xf16, #gpu_lds_addrspace> into memref<64x64xf16, #gpu_lds_addrspace>
+ %base = amdgpu.make_dma_base %mem[%global_i, %global_j], %expand_lds[%lds_i, %lds_j]
+ : memref<64x128xf16, #gpu_global_addrspace>, memref<64x64xf16, #gpu_lds_addrspace> -> !amdgpu.tdm_base<f16>
+ func.return
+}
+
+// -----
+
+#gpu_lds_addrspace = #gpu.address_space<workgroup>
+#gpu_global_addrspace = #gpu.address_space<global>
+
+// CHECK: func @test_make_gather_dma_base_subview
+// CHECK-SAME: %[[MEM:.*]]: memref<64x128xf16, #gpu.address_space<global>>, %[[LDS:.*]]: memref<64x64xf16, #gpu.address_space<workgroup>>, %[[GLOBAL_I:.*]]: index, %[[GLOBAL_J:.*]]: index, %[[LDS_I:.*]]: index, %[[LDS_J:.*]]: index
+func.func @test_make_gather_dma_base_subview(%mem: memref<64x128xf16, #gpu_global_addrspace>, %lds: memref<64x64xf16, #gpu_lds_addrspace>, %global_i: index, %global_j: index, %lds_i: index, %lds_j: index) {
+ // CHECK: amdgpu.make_gather_dma_base %[[MEM]][%[[GLOBAL_I]], %[[GLOBAL_J]]], %[[LDS]][%[[LDS_I]], %[[LDS_J]]]
+ // CHECK-SAME: memref<64x128xf16, #gpu.address_space<global>>, memref<64x64xf16, #gpu.address_space<workgroup>> -> !amdgpu.tdm_gather_base<f16, i16>
+
+ %subview = memref.subview %mem[0, 0][32, 64][1, 1] : memref<64x128xf16, #gpu_global_addrspace> to memref<32x64xf16, strided<[128, 1]>, #gpu_global_addrspace>
+ %base = amdgpu.make_gather_dma_base %subview[%global_i, %global_j], %lds[%lds_i, %lds_j]
+ : memref<32x64xf16, strided<[128, 1]>, #gpu_global_addrspace>, memref<64x64xf16, #gpu_lds_addrspace> -> !amdgpu.tdm_gather_base<f16, i16>
+ func.return
+}
+
+// -----
+
+#gpu_lds_addrspace = #gpu.address_space<workgroup>
+#gpu_global_addrspace = #gpu.address_space<global>
+
+// CHECK: func @test_make_gather_dma_base_collapse_shape
+// CHECK-SAME: %[[MEM:.*]]: memref<64x128xf16, #gpu.address_space<global>>, %[[LDS:.*]]: memref<64x64xf16, #gpu.address_space<workgroup>>, %[[GLOBAL_I:.*]]: index, %[[GLOBAL_J:.*]]: index, %[[LDS_IDX:.*]]: index
+func.func @test_make_gather_dma_base_collapse_shape(%mem: memref<64x128xf16, #gpu_global_addrspace>, %lds: memref<64x64xf16, #gpu_lds_addrspace>, %global_i: index, %global_j: index, %lds_idx: index) {
+ // CHECK: %[[INDICES:.*]]:2 = affine.delinearize_index %[[LDS_IDX]] into (64, 64) : index, index
+ // CHECK: amdgpu.make_gather_dma_base %[[MEM]][%[[GLOBAL_I]], %[[GLOBAL_J]]], %[[LDS]][%[[INDICES]]#0, %[[INDICES]]#1]
+ // CHECK-SAME: memref<64x128xf16, #gpu.address_space<global>>, memref<64x64xf16, #gpu.address_space<workgroup>> -> !amdgpu.tdm_gather_base<f16, i16>
+
+ %collapse_lds = memref.collapse_shape %lds [[0, 1]] : memref<64x64xf16, #gpu_lds_addrspace> into memref<4096xf16, #gpu_lds_addrspace>
+ %base = amdgpu.make_gather_dma_base %mem[%global_i, %global_j], %collapse_lds[%lds_idx]
+ : memref<64x128xf16, #gpu_global_addrspace>, memref<4096xf16, #gpu_lds_addrspace> -> !amdgpu.tdm_gather_base<f16, i16>
+ func.return
+}
+
+// -----
+
+#gpu_lds_addrspace = #gpu.address_space<workgroup>
+#gpu_global_addrspace = #gpu.address_space<global>
+
+// CHECK: #[[BOTH_MAP:.*]] = affine_map<()[s0] -> (s0 + 32)>
+// CHECK: #[[BOTH_MAP1:.*]] = affine_map<()[s0] -> (s0 + 64)>
+
+// CHECK: func @test_make_dma_base_both_fold
+// CHECK-SAME: %[[MEM:.*]]: memref<64x128xf16, #gpu.address_space<global>>, %[[LDS:.*]]: memref<4096xf16, #gpu.address_space<workgroup>>, %[[GLOBAL_I:.*]]: index, %[[GLOBAL_J:.*]]: index, %[[LDS_I:.*]]: index, %[[LDS_J:.*]]: index
+func.func @test_make_dma_base_both_fold(%mem: memref<64x128xf16, #gpu_global_addrspace>, %lds: memref<4096xf16, #gpu_lds_addrspace>, %global_i: index, %global_j: index, %lds_i: index, %lds_j: index) {
+ // CHECK: %[[GI:.*]] = affine.apply #[[BOTH_MAP]]()[%[[GLOBAL_I]]]
+ // CHECK: %[[GJ:.*]] = affine.apply #[[BOTH_MAP1]]()[%[[GLOBAL_J]]]
+ // CHECK: %[[IDX:.*]] = affine.linearize_index [%[[LDS_I]], %[[LDS_J]]] by (64, 64) : index
+ // CHECK: amdgpu.make_dma_base %[[MEM]][%[[GI]], %[[GJ]]], %[[LDS]][%[[IDX]]]
+ // CHECK-SAME: memref<64x128xf16, #gpu.address_space<global>>, memref<4096xf16, #gpu.address_space<workgroup>> -> !amdgpu.tdm_base<f16>
+
+ %subview = memref.subview %mem[32, 64][32, 64][1, 1] : memref<64x128xf16, #gpu_global_addrspace> to memref<32x64xf16, strided<[128, 1], offset: 4160>, #gpu_global_addrspace>
+ %expand_lds = memref.expand_shape %lds [[0, 1]] output_shape [64, 64] : memref<4096xf16, #gpu_lds_addrspace> into memref<64x64xf16, #gpu_lds_addrspace>
+ %base = amdgpu.make_dma_base %subview[%global_i, %global_j], %expand_lds[%lds_i, %lds_j]
+ : memref<32x64xf16, strided<[128, 1], offset: 4160>, #gpu_global_addrspace>, memref<64x64xf16, #gpu_lds_addrspace> -> !amdgpu.tdm_base<f16>
+ func.return
+}
+
+// -----
+
+#gpu_lds_addrspace = #gpu.address_space<workgroup>
+#gpu_global_addrspace = #gpu.address_space<global>
+
+// CHECK: func @test_make_dma_base_nop
+// CHECK-SAME: %[[MEM:.*]]: memref<64x128xf16, #gpu.address_space<global>>, %[[LDS:.*]]: memref<64x64xf16, #gpu.address_space<workgroup>>, %[[GLOBAL_I:.*]]: index, %[[GLOBAL_J:.*]]: index, %[[LDS_I:.*]]: index, %[[LDS_J:.*]]: index
+func.func @test_make_dma_base_nop(%mem: memref<64x128xf16, #gpu_global_addrspace>, %lds: memref<64x64xf16, #gpu_lds_addrspace>, %global_i: index, %global_j: index, %lds_i: index, %lds_j: index) {
+ // CHECK: amdgpu.make_dma_base %[[MEM]][%[[GLOBAL_I]], %[[GLOBAL_J]]], %[[LDS]][%[[LDS_I]], %[[LDS_J]]]
+ // CHECK-SAME: memref<64x128xf16, #gpu.address_space<global>>, memref<64x64xf16, #gpu.address_space<workgroup>> -> !amdgpu.tdm_base<f16>
+ // CHECK-NOT: subview
+ // CHECK-NOT: expand_shape
+ // CHECK-NOT: collapse_shape
+
+ %base = amdgpu.make_dma_base %mem[%global_i, %global_j], %lds[%lds_i, %lds_j]
+ : memref<64x128xf16, #gpu_global_addrspace>, memref<64x64xf16, #gpu_lds_addrspace> -> !amdgpu.tdm_base<f16>
+ func.return
+}
+
+// -----
+
+#gpu_lds_addrspace = #gpu.address_space<workgroup>
+#gpu_global_addrspace = #gpu.address_space<global>
+
+// CHECK: func @test_make_gather_dma_base_nop
+// CHECK-SAME: %[[MEM:.*]]: memref<64x128xf16, #gpu.address_space<global>>, %[[LDS:.*]]: memref<64x64xf16, #gpu.address_space<workgroup>>, %[[GLOBAL_I:.*]]: index, %[[GLOBAL_J:.*]]: index, %[[LDS_I:.*]]: index, %[[LDS_J:.*]]: index
+func.func @test_make_gather_dma_base_nop(%mem: memref<64x128xf16, #gpu_global_addrspace>, %lds: memref<64x64xf16, #gpu_lds_addrspace>, %global_i: index, %global_j: index, %lds_i: index, %lds_j: index) {
+ // CHECK: amdgpu.make_gather_dma_base %[[MEM]][%[[GLOBAL_I]], %[[GLOBAL_J]]], %[[LDS]][%[[LDS_I]], %[[LDS_J]]]
+ // CHECK-SAME: memref<64x128xf16, #gpu.address_space<global>>, memref<64x64xf16, #gpu.address_space<workgroup>> -> !amdgpu.tdm_gather_base<f16, i16>
+ // CHECK-NOT: subview
+ // CHECK-NOT: expand_shape
+ // CHECK-NOT: collapse_shape
+
+ %base = amdgpu.make_gather_dma_base %mem[%global_i, %global_j], %lds[%lds_i, %lds_j]
+ : memref<64x128xf16, #gpu_global_addrspace>, memref<64x64xf16, #gpu_lds_addrspace> -> !amdgpu.tdm_gather_base<f16, i16>
+ func.return
+}
More information about the Mlir-commits
mailing list