[Mlir-commits] [mlir] [mlir][amdgpu] Lower amdgpu.make_dma_base (PR #169817)
Erick Ochoa Lopez
llvmlistbot@llvm.org
Thu Nov 27 06:46:05 PST 2025
https://github.com/amd-eochoalo created https://github.com/llvm/llvm-project/pull/169817
* Adds lowering for `amdgpu.make_dma_base`.
* Adds definition for `amdgpu.tensor_load_to_lds`.
* Adds definition for `amdgpu.tensor_store_from_lds`.
Builds on top of #169407
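For reference, a minimal end-to-end sketch of the intended flow, using the syntax added in this PR (only `amdgpu.make_dma_base` is lowered so far; the descriptor and tensor load/store ops are defined but not yet lowered):

```mlir
%base = amdgpu.make_dma_base %src[0, 0], %dst[0, 0] : memref<64x64xi32>, memref<64x64xi32, #gpu.address_space<workgroup>> -> !amdgpu.tdm_base<i32>
%desc = amdgpu.make_dma_descriptor %base globalSize [64, 64] globalStride [64, 1] sharedSize [64, 64] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
amdgpu.tensor_load_to_lds %desc : !amdgpu.tdm_descriptor
```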
From b766215221d453a97002d6faabc1c387dac3f10b Mon Sep 17 00:00:00 2001
From: Erick Ochoa <erick.ochoalopez@amd.com>
Date: Wed, 26 Nov 2025 16:42:44 -0500
Subject: [PATCH 1/4] [mlir][amdgpu] Add make_dma_descriptor op
---
mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 117 ++++++++++++++++--
mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp | 28 +++++
mlir/test/Dialect/AMDGPU/invalid.mlir | 40 ++++++
mlir/test/Dialect/AMDGPU/ops.mlir | 59 ++++++++-
4 files changed, 232 insertions(+), 12 deletions(-)
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index e07c72b839e7c..3581b07dc4e3e 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -80,15 +80,15 @@ def AMDGPU_AddressSpaceAttr : EnumAttr<AMDGPU_Dialect, AMDGPU_AddressSpace,
let assemblyFormat = "`<` $value `>`";
}
+//===----------------------------------------------------------------------===//
+// AMDGPU Type definitions
+//===----------------------------------------------------------------------===//
+
class AMDGPU_Type<string name, string typeMnemonic, list<Trait> traits = []>
: TypeDef<AMDGPU_Dialect, name, traits> {
let mnemonic = typeMnemonic;
}
-//===----------------------------------------------------------------------===//
-// AMDGPU Type definitions
-//===----------------------------------------------------------------------===//
-
def AMDGPU_TDMBaseType : AMDGPU_Type<"TDMBase", "tdm_base"> {
let summary = "Pair of base addresses that move data between LDS and global storage.";
let description = [{
@@ -104,6 +104,15 @@ def AMDGPU_TDMBaseType : AMDGPU_Type<"TDMBase", "tdm_base"> {
let assemblyFormat = "`<` $elementType `>`";
}
+def AMDGPU_TDMDescriptorType : AMDGPU_Type<"TDMDescriptor", "tdm_descriptor"> {
+ let summary = "Descriptors used in tensor store/load operations.";
+ let description = [{
+ This type is opaque and corresponds to the two or four descriptor groups
+ used in tensor_load_to_lds or tensor_store_from_lds.
+ }];
+
+}
+
//===----------------------------------------------------------------------===//
// AMDGPU Op definitions
//===----------------------------------------------------------------------===//
@@ -1222,14 +1231,13 @@ def AMDGPU_MakeDmaBaseOp :
AMDGPU_Op<"make_dma_base", [Pure, AttrSizedOperandSegments]>,
Arguments<(ins
Arg<AnyMemRef, "buffer to read from">:$src,
- Variadic<Index>:$srcIndices,
+ Variadic<Index>:$src_indices,
Arg<AnyMemRef, "buffer to write to">:$dst,
- Variadic<Index>:$dstIndices)>,
+ Variadic<Index>:$dst_indices)>,
Results<(outs AMDGPU_TDMBaseType: $base)> {
// TODO:
// * Add verifiers such that one of the memrefs is from LDS and the other global.
- // * Add verifiers to make sure that the type is in the correct direction.
// * Add verifiers to make sure that the number of indices does not exceed the number of dimensions.
let summary = "Pair of base addresses used when moving tiles between LDS and global memory.";
@@ -1240,12 +1248,105 @@ def AMDGPU_MakeDmaBaseOp :
This operation creates a value corresponding to the tensor descriptor (D#) group 0
found in TensorLoadToLDSOp and TensorStoreFromLDSOp in the rocdl dialect.
+ For example:
+
+ ```mlir
+ %base = amdgpu.make_dma_base %src[%idx0], %dst[%idx1] : memref<8xi32>, memref<8xi32, #gpu.address_space<workgroup>> -> !amdgpu.tdm_base<i32>
+ %descriptor = amdgpu.make_dma_descriptor %base globalSize [2, 2] globalStride [2, 1] sharedSize [2, 2] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+ amdgpu.tensor_load_to_lds %descriptor : !amdgpu.tdm_descriptor
+ ```
+
+ to
+
+ ```mlir
+ // pseudocode
+ %base_0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr)>
+ %base_1 = llvm.insertvalue %global_addr, %base_0[0] : !llvm.struct<(ptr, ptr)>
+ %base_2 = llvm.insertvalue %lds_addr, %base_1[1] : !llvm.struct<(ptr, ptr)>
+ // type(%base_2) = !llvm.struct<(ptr, ptr)> roughly corresponds to !amdgpu.tdm_base<i32>
+
+ // The base will be used when constructing dgroup0
+ // when lowering amdgpu.make_dma_descriptor
+ %dgroup0_0 = llvm.mlir.undef : !llvm.struct<(....)>
+ %dgroup0_1 = llvm.insertvalue %base_2, %dgroup0_0 : ....
+
+ // When lowering amdgpu.tensor_load_to_lds
+ rocdl.tensor.load.to.lds %dgroup0_1, %dgroup1, %dgroup2, %dgroup3 cachepolicy 0 : vector<4xi32>, vector<8xi32>
+ ```
+
These tensor DMA operations were introduced in gfx1250.
}];
let assemblyFormat = [{
- $src `[` $srcIndices `]` `,` $dst `[` $dstIndices `]` attr-dict `:` type($src) `,` type($dst) `to` type(results)
+ $src `[` $src_indices `]` `,` $dst `[` $dst_indices `]` attr-dict `:` type($src) `,` type($dst) `->` type(results)
+ }];
+}
+
+def AMDGPU_MakeDmaDescriptorOp :
+ AMDGPU_Op<"make_dma_descriptor", [Pure, AttrSizedOperandSegments]>,
+ Arguments<(ins
+ AMDGPU_TDMBaseType: $base,
+ Variadic<Index>: $global_dynamic_sizes,
+ DenseI64ArrayAttr: $global_static_sizes,
+ Variadic<Index>: $global_dynamic_strides,
+ DenseI64ArrayAttr: $global_static_strides,
+ Variadic<Index>: $shared_dynamic_sizes,
+ DenseI64ArrayAttr: $shared_static_sizes,
+ Optional<Index>: $pad,
+ Optional<Index>: $pad_every,
+ Optional<AnyMemRef>: $atomic_barrier_address,
+ Variadic<Index>: $atomic_barrier_indices,
+ Optional<Index>: $global_increment,
+ Optional<Index>: $lds_increment,
+ Optional<Index>: $iteration_count)>,
+ Results<(outs AMDGPU_TDMDescriptorType: $desc)> {
+
+ let summary = "Make all descriptor groups needed by TensorLoadToLDS/TensorStoreFromLDS.";
+ let description = [{
+ Make all descriptor groups needed by tensor memory operations.
+
+ The $base operand corresponds to the pair of base addresses: one must be an address in LDS
+ while the other must be a global memory address.
+
+ $global_{static/dynamic}_sizes determine the size of the tensor.
+ $global_{static/dynamic}_strides determine the strides of the tensor.
+ $shared_{static/dynamic}_sizes determine the size of the tile.
+
+ Padding can be applied to the LDS address when copying from global memory to LDS,
+ but not when copying from LDS to global memory.
+ Padded LDS locations keep the values they held before the operation.
+
+ 2D and 3D tensors may be iterated over by setting $global_increment, $lds_increment,
+ and $iteration_count; see the third example below.
+ $global_increment determines how much to increment the starting global memory address per iteration, in units of the $base's element type.
+ $lds_increment determines how much to increment the starting LDS address per iteration, in units of the $base's element type.
+ $iteration_count determines how many times to iterate.
+
+ ```mlir
+ // Example of moving a two-dimensional tensor to LDS.
+ %base = amdgpu.make_dma_base %src[0, 0], %dst[0, 0] : memref<64x64xi32>, memref<64x64xi32, #gpu.address_space<workgroup>> -> !amdgpu.tdm_base<i32>
+ %descriptor = amdgpu.make_dma_descriptor %base globalSize [64, 64] globalStride [64, 1] sharedSize [64, 64] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+ amdgpu.tensor_load_to_lds %descriptor : !amdgpu.tdm_descriptor
+
+ // Example of moving a two-dimensional tensor to LDS where padding is applied after every %pad_every elements.
+ %base = amdgpu.make_dma_base %src[0, 0], %dst[0, 0] : memref<32x32xi32>, memref<64x64xi32, #gpu.address_space<workgroup>> -> !amdgpu.tdm_base<i32>
+ %descriptor = amdgpu.make_dma_descriptor %base globalSize [32, 32] globalStride [32, 1] sharedSize [64, 64] padShared(%pad every %pad_every) : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+ amdgpu.tensor_load_to_lds %descriptor : !amdgpu.tdm_descriptor
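+ // A third, hypothetical example: iterating the copy, advancing the global
+ // and LDS base addresses by %ginc and %linc elements for %count iterations.
+ %base = amdgpu.make_dma_base %src[0, 0], %dst[0, 0] : memref<64x64xi32>, memref<64x64xi32, #gpu.address_space<workgroup>> -> !amdgpu.tdm_base<i32>
+ %descriptor = amdgpu.make_dma_descriptor %base globalSize [64, 64] globalStride [64, 1] sharedSize [64, 64] iterate %ginc, %linc, %count : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+ amdgpu.tensor_load_to_lds %descriptor : !amdgpu.tdm_descriptor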
+ ```
+ }];
+
+ let assemblyFormat = [{
+ $base
+ `globalSize` custom<DynamicIndexList>($global_dynamic_sizes, $global_static_sizes)
+ `globalStride` custom<DynamicIndexList>($global_dynamic_strides, $global_static_strides)
+ `sharedSize` custom<DynamicIndexList>($shared_dynamic_sizes, $shared_static_sizes)
+ ( `padShared` `(` $pad^ `every` $pad_every `)` )?
+ ( `atomicBarrier` `(` $atomic_barrier_address^ `[` $atomic_barrier_indices `]`
+ `:` type($atomic_barrier_address) `)`)?
+ ( `iterate` $global_increment^ `,` $lds_increment `,` $iteration_count )?
+ attr-dict `:` qualified(type($base)) `->` type(results)
}];
+
+ let hasVerifier = 1;
}
#endif // AMDGPU
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
index cdc10c60a42ae..5ff640b5d1596 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
@@ -705,6 +705,34 @@ LogicalResult TransposeLoadOp::verify() {
return success();
}
+//===----------------------------------------------------------------------===//
+// MakeDmaDescriptorOp
+//===----------------------------------------------------------------------===//
+
+LogicalResult MakeDmaDescriptorOp::verify() {
+ ArrayRef<int64_t> globalStaticStrides = getGlobalStaticStrides();
+
+ if (globalStaticStrides.empty()) {
+ return emitOpError("strides must not be empty.");
+ }
+ if (globalStaticStrides.back() != 1) {
+ return emitOpError("strides for the innermost dimension must be 1.");
+ }
+
+ ArrayRef<int64_t> globalStaticSizes = getGlobalStaticSizes();
+ size_t rank = globalStaticSizes.size();
+ if (rank != globalStaticStrides.size()) {
+ return emitOpError("strides and sizes must have same rank.");
+ }
+
+ ArrayRef<int64_t> sharedStaticSizes = getSharedStaticSizes();
+ if (rank != sharedStaticSizes.size()) {
+ return emitOpError("tensor must have same rank as tile.");
+ }
+
+ return success();
+}
+
//===----------------------------------------------------------------------===//
// ScaledMFMAOp
//===----------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/AMDGPU/invalid.mlir b/mlir/test/Dialect/AMDGPU/invalid.mlir
index 61fdf29a78cbd..066f46060f62f 100644
--- a/mlir/test/Dialect/AMDGPU/invalid.mlir
+++ b/mlir/test/Dialect/AMDGPU/invalid.mlir
@@ -354,3 +354,43 @@ func.func @scaled_mfma_invalid_k(%arg0 : vector<4xf8E8M0FNU>, %arg1 : vector<32x
%0 = amdgpu.scaled_mfma 32x32x32 (%arg0[0] * %arg1) * (%arg0[1] * %arg1) + %arg2 : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<16xf32>
func.return %0 : vector<16xf32>
}
+
+// -----
+
+// CHECK-LABEL: func @make_dma_descriptor_invalid_empty_strides
+// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>)
+func.func @make_dma_descriptor_invalid_empty_strides(%base: !amdgpu.tdm_base<i32>) {
+ // expected-error@+1 {{'amdgpu.make_dma_descriptor' op strides must not be empty.}}
+ amdgpu.make_dma_descriptor %base globalSize [0] globalStride [] sharedSize [0] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+ func.return
+}
+
+// -----
+
+// CHECK-LABEL: func @make_dma_descriptor_invalid_innermost_stride
+// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>)
+func.func @make_dma_descriptor_invalid_innermost_stride(%base: !amdgpu.tdm_base<i32>) {
+ // expected-error@+1 {{'amdgpu.make_dma_descriptor' op strides for the innermost dimension must be 1.}}
+ amdgpu.make_dma_descriptor %base globalSize [2, 2] globalStride [1, 2] sharedSize [0] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+ func.return
+}
+
+// -----
+
+// CHECK-LABEL: func @make_dma_descriptor_invalid_size_and_stride_sizes
+// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>)
+func.func @make_dma_descriptor_invalid_size_and_stride_sizes(%base: !amdgpu.tdm_base<i32>) {
+ // expected-error@+1 {{'amdgpu.make_dma_descriptor' op strides and sizes must have same rank.}}
+ amdgpu.make_dma_descriptor %base globalSize [1] globalStride [1, 1] sharedSize [0] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+ func.return
+}
+
+// -----
+
+// CHECK-LABEL: func @make_dma_descriptor_invalid_shared_and_global_rank
+// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>)
+func.func @make_dma_descriptor_invalid_shared_and_global_rank(%base: !amdgpu.tdm_base<i32>) {
+ // expected-error@+1 {{'amdgpu.make_dma_descriptor' op tensor must have same rank as tile.}}
+ amdgpu.make_dma_descriptor %base globalSize [4, 4] globalStride [1, 1] sharedSize [2] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+ func.return
+}
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index 653f9f64d24f4..a8af06dc5ff0a 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -689,11 +689,62 @@ func.func @memory_counter_wait() {
// CHECK-LABEL: func @make_dma_base
// CHECK-SAME: (%[[IDX:.+]]: index, %[[MEM:.+]]: memref<8xi32>, %[[SMEM:.+]]: memref<8xi32, #gpu.address_space<workgroup>>)
func.func @make_dma_base(%idx: index, %mem: memref<8xi32>, %smem: memref<8xi32, #gpu.address_space<workgroup>>) {
- // CHECK: amdgpu.make_dma_base %[[MEM]][%[[IDX]]], %[[SMEM]][%[[IDX]]] : memref<8xi32>, memref<8xi32, #gpu.address_space<workgroup>> to !amdgpu.tdm_base<i32>
- amdgpu.make_dma_base %mem[%idx], %smem[%idx] : memref<8xi32>, memref<8xi32, #gpu.address_space<workgroup>> to !amdgpu.tdm_base<i32>
+ // CHECK: amdgpu.make_dma_base %[[MEM]][%[[IDX]]], %[[SMEM]][%[[IDX]]] : memref<8xi32>, memref<8xi32, #gpu.address_space<workgroup>> -> !amdgpu.tdm_base<i32>
+ amdgpu.make_dma_base %mem[%idx], %smem[%idx] : memref<8xi32>, memref<8xi32, #gpu.address_space<workgroup>> -> !amdgpu.tdm_base<i32>
- // CHECK: amdgpu.make_dma_base %[[SMEM]][%[[IDX]]], %[[MEM]][%[[IDX]]] : memref<8xi32, #gpu.address_space<workgroup>>, memref<8xi32> to !amdgpu.tdm_base<i32>
- amdgpu.make_dma_base %smem[%idx], %mem[%idx] : memref<8xi32, #gpu.address_space<workgroup>>, memref<8xi32> to !amdgpu.tdm_base<i32>
+ // CHECK: amdgpu.make_dma_base %[[SMEM]][%[[IDX]]], %[[MEM]][%[[IDX]]] : memref<8xi32, #gpu.address_space<workgroup>>, memref<8xi32> -> !amdgpu.tdm_base<i32>
+ amdgpu.make_dma_base %smem[%idx], %mem[%idx] : memref<8xi32, #gpu.address_space<workgroup>>, memref<8xi32> -> !amdgpu.tdm_base<i32>
func.return
}
+// CHECK-LABEL: func @make_dma_descriptor
+// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>, %[[BARRIER:.+]]: memref<8xi32>, %[[IDX:.+]]: index)
+func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>, %barrier: memref<8xi32>, %idx: index) {
+
+ // CHECK: amdgpu.make_dma_descriptor %[[BASE]]
+ amdgpu.make_dma_descriptor %base
+ // CHECK-SAME: globalSize [0]
+ globalSize [0]
+ // CHECK-SAME: globalStride [1]
+ globalStride [1]
+ // CHECK-SAME: sharedSize [0] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+ sharedSize [0] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+
+ // CHECK: amdgpu.make_dma_descriptor %[[BASE]]
+ amdgpu.make_dma_descriptor %base
+ // CHECK-SAME: globalSize [0]
+ globalSize [0]
+ // CHECK-SAME: globalStride [1]
+ globalStride [1]
+ // CHECK-SAME: sharedSize [0]
+ sharedSize [0]
+ // CHECK-SAME: padShared(%[[IDX]] every %[[IDX]])
+ padShared(%idx every %idx)
+ : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+
+ // CHECK: amdgpu.make_dma_descriptor %[[BASE]]
+ amdgpu.make_dma_descriptor %base
+ // CHECK-SAME: globalSize [0]
+ globalSize [0]
+ // CHECK-SAME: globalStride [1]
+ globalStride [1]
+ // CHECK-SAME: sharedSize [0]
+ sharedSize [0]
+ // CHECK-SAME: atomicBarrier(%[[BARRIER]][%[[IDX]]] : memref<8xi32>)
+ atomicBarrier(%barrier[%idx] : memref<8xi32>)
+ : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+
+ // CHECK: amdgpu.make_dma_descriptor %[[BASE]]
+ amdgpu.make_dma_descriptor %base
+ // CHECK-SAME: globalSize [0]
+ globalSize [0]
+ // CHECK-SAME: globalStride [1]
+ globalStride [1]
+ // CHECK-SAME: sharedSize [0]
+ sharedSize [0]
+ // CHECK-SAME: iterate %[[IDX]], %[[IDX]], %[[IDX]]
+ iterate %idx, %idx, %idx
+ : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+
+ func.return
+}
From 3c31d68dab4254b01b727bee1bd15e4a2c6fc15e Mon Sep 17 00:00:00 2001
From: Erick Ochoa <erick.ochoalopez@amd.com>
Date: Tue, 25 Nov 2025 11:28:04 -0500
Subject: [PATCH 2/4] [mlir][amdgpu] Add tensor load store operation
---
mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 30 +++++++++++++++++++
mlir/test/Dialect/AMDGPU/ops.mlir | 10 +++++++
2 files changed, 40 insertions(+)
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 3581b07dc4e3e..12ef5337296a2 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -1349,4 +1349,34 @@ def AMDGPU_MakeDmaDescriptorOp :
let hasVerifier = 1;
}
+def AMDGPU_TensorLoadToLDSOp :
+ AMDGPU_Op<"tensor_load_to_lds", [MemoryEffects<[MemWrite]>, MemoryEffects<[MemRead]>]>,
+ Arguments<(ins AMDGPU_TDMDescriptorType: $desc)> {
+ let summary = "Load tensors from global memory to LDS.";
+ let description = [{
+ Load tensors of up to five dimensions from global memory to LDS.
+
+ The operation is fully described by the descriptor operand.
+ }];
+
+ let assemblyFormat = [{
+ $desc attr-dict `:` qualified(type($desc))
+ }];
+}
+
+def AMDGPU_TensorStoreFromLDSOp :
+ AMDGPU_Op<"tensor_store_from_lds", [MemoryEffects<[MemWrite]>, MemoryEffects<[MemRead]>]>,
+ Arguments<(ins AMDGPU_TDMDescriptorType: $desc)> {
+ let summary = "Store tensors from LDS to global memory.";
+ let description = [{
+ Store tensors of up to five dimensions from LDS to global memory.
+
+ The operation is fully described by the descriptor operand.
+ }];
+
+ let assemblyFormat = [{
+ $desc attr-dict `:` qualified(type($desc))
+ }];
+}
+
#endif // AMDGPU
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index a8af06dc5ff0a..aa6bedc0e1135 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -748,3 +748,13 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>, %barrier: memref<8x
func.return
}
+
+// CHECK-LABEL: @tensor_load_store
+// CHECK-SAME: (%[[DESC:.+]]: !amdgpu.tdm_descriptor)
+func.func @tensor_load_store(%desc: !amdgpu.tdm_descriptor) {
+ // CHECK: amdgpu.tensor_load_to_lds %[[DESC]]
+ amdgpu.tensor_load_to_lds %desc : !amdgpu.tdm_descriptor
+ // CHECK: amdgpu.tensor_store_from_lds %[[DESC]]
+ amdgpu.tensor_store_from_lds %desc : !amdgpu.tdm_descriptor
+ return
+}
From cb116ea0b0444eb72c26e38bdb6572cdeef97e61 Mon Sep 17 00:00:00 2001
From: Erick Ochoa <erick.ochoalopez@amd.com>
Date: Tue, 25 Nov 2025 14:24:50 -0500
Subject: [PATCH 3/4] [mlir][amdgpu] Lower amdgpu.make_dma_base.
---
mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 3 +-
.../mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h | 5 ++
.../AMDGPUToROCDL/AMDGPUToROCDL.cpp | 77 +++++++++++++++++-
mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp | 79 ++++++++++++-------
...cvt_scale_pk-gfx1250.mlir => gfx1250.mlir} | 74 +++++++++++++++++
5 files changed, 206 insertions(+), 32 deletions(-)
rename mlir/test/Conversion/AMDGPUToROCDL/{cvt_scale_pk-gfx1250.mlir => gfx1250.mlir} (73%)
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 12ef5337296a2..9cb0752fba48b 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -1237,7 +1237,6 @@ def AMDGPU_MakeDmaBaseOp :
Results<(outs AMDGPU_TDMBaseType: $base)> {
// TODO:
- // * Add verifiers such that one of the memrefs is from LDS and the other global.
// * Add verifiers to make sure that the number of indices does not exceed the number of dimensions.
let summary = "Pair of base addresses used when moving tiles between LDS and global memory.";
@@ -1280,6 +1279,8 @@ def AMDGPU_MakeDmaBaseOp :
let assemblyFormat = [{
$src `[` $src_indices `]` `,` $dst `[` $dst_indices `]` attr-dict `:` type($src) `,` type($dst) `->` type(results)
}];
+
+ let hasVerifier = 1;
}
def AMDGPU_MakeDmaDescriptorOp :
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h
index a7680fb5c3191..958757da0933e 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h
@@ -48,6 +48,11 @@ inline void printMNKDimensionList(OpAsmPrinter &printer, Operation *,
IntegerAttr m, IntegerAttr n, IntegerAttr k) {
printMNKDimensionList(printer, m, n, k);
}
+
+// Utility functions for querying the address space.
+bool hasGlobalMemorySpace(Attribute memorySpace);
+bool hasWorkgroupMemorySpace(Attribute memorySpace);
+bool hasFatRawBufferMemorySpace(Attribute memorySpace);
} // namespace mlir::amdgpu
#define GET_ATTRDEF_CLASSES
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index b9a5e7d7f6eac..3316e16a05d5c 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -2264,6 +2264,76 @@ struct AMDGPUPermlaneLowering : public ConvertOpToLLVMPattern<PermlaneSwapOp> {
}
};
+struct AMDGPUMakeDmaBaseLowering
+ : public ConvertOpToLLVMPattern<MakeDmaBaseOp> {
+ using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;
+
+ AMDGPUMakeDmaBaseLowering(const LLVMTypeConverter &converter, Chipset chipset)
+ : ConvertOpToLLVMPattern<MakeDmaBaseOp>(converter), chipset(chipset) {}
+ Chipset chipset;
+
+ LogicalResult
+ matchAndRewrite(MakeDmaBaseOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ if (chipset < kGfx1250)
+ return op->emitOpError("make_dma_base is only supported on gfx1250");
+
+ Location loc = op.getLoc();
+
+ ValueRange srcIndices = adaptor.getSrcIndices();
+ Value src = adaptor.getSrc();
+ auto srcMemRefType = cast<MemRefType>(op.getSrc().getType());
+
+ Value srcPtr =
+ getStridedElementPtr(rewriter, loc, srcMemRefType, src, srcIndices);
+
+ ValueRange dstIndices = adaptor.getDstIndices();
+ Value dst = adaptor.getDst();
+ auto dstMemRefType = cast<MemRefType>(op.getDst().getType());
+
+ Value dstPtr =
+ getStridedElementPtr(rewriter, loc, dstMemRefType, dst, dstIndices);
+
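+ // Pick the LDS-side pointer: if the source is in workgroup memory, this
+ // base is for a store from LDS; otherwise the destination is the LDS side.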
+ bool storeFrom = hasWorkgroupMemorySpace(srcMemRefType.getMemorySpace());
+ Value ldsAddr = storeFrom ? srcPtr : dstPtr;
+ Value globalAddr = storeFrom ? dstPtr : srcPtr;
+
+ Type i32 = rewriter.getI32Type();
+ Type i64 = rewriter.getI64Type();
+
+ Value castForLdsAddr =
+ LLVM::PtrToIntOp::create(rewriter, loc, i32, ldsAddr);
+ Value castForGlobalAddr =
+ LLVM::PtrToIntOp::create(rewriter, loc, i64, globalAddr);
+
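+ // Only the low 57 bits of the global address are significant: bits [31:0]
+ // end up in lane 2 and bits [56:32] in lane 3 of the result vector below.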
+ Value mask = createI64Constant(rewriter, loc, 0x1FFFFFFFFFFFFFF);
+ Value first57BitsOfGlobalAddr =
+ LLVM::AndOp::create(rewriter, loc, castForGlobalAddr, mask);
+ Value shift = LLVM::LShrOp::create(rewriter, loc, first57BitsOfGlobalAddr,
+ createI64Constant(rewriter, loc, 32));
+
+ Value lowHalf =
+ LLVM::TruncOp::create(rewriter, loc, i32, first57BitsOfGlobalAddr);
+ Value highHalf = LLVM::TruncOp::create(rewriter, loc, i32, shift);
+
+ Value c0 = createI32Constant(rewriter, loc, 0);
+ Value c1 = createI32Constant(rewriter, loc, 1);
+ Value c2 = createI32Constant(rewriter, loc, 2);
+ Value c3 = createI32Constant(rewriter, loc, 3);
+
+ Type v4i32 = this->typeConverter->convertType(VectorType::get(4, i32));
+ Value result = LLVM::UndefOp::create(rewriter, loc, v4i32);
+ result = LLVM::InsertElementOp::create(rewriter, loc, result, c0, c0);
+ result = LLVM::InsertElementOp::create(rewriter, loc, result,
+ castForLdsAddr, c1);
+ result = LLVM::InsertElementOp::create(rewriter, loc, result, lowHalf, c2);
+ result = LLVM::InsertElementOp::create(rewriter, loc, result, highHalf, c3);
+
+ rewriter.replaceOp(op, result);
+ return success();
+ }
+};
+
struct ConvertAMDGPUToROCDLPass
: public impl::ConvertAMDGPUToROCDLPassBase<ConvertAMDGPUToROCDLPass> {
using Base::Base;
@@ -2278,6 +2348,10 @@ struct ConvertAMDGPUToROCDLPass
RewritePatternSet patterns(ctx);
LLVMTypeConverter converter(ctx);
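+ // amdgpu.tdm_base lowers to a vector<4xi32>; see AMDGPUMakeDmaBaseLowering
+ // for the lane layout {0, lds_addr, global_addr_low, global_addr_high}.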
+ converter.addConversion([&](TDMBaseType type) -> Type {
+ Type i32 = IntegerType::get(type.getContext(), 32);
+ return converter.convertType(VectorType::get(4, i32));
+ });
populateAMDGPUToROCDLConversionPatterns(converter, patterns, *maybeChipset);
LLVMConversionTarget target(getContext());
target.addIllegalDialect<::mlir::amdgpu::AMDGPUDialect>();
@@ -2333,6 +2407,7 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
ScaledExtPackedOpLowering, PackedScaledTruncOpLowering,
PackedTrunc2xFp8OpLowering, PackedStochRoundFp8OpLowering,
GatherToLDSOpLowering, TransposeLoadOpLowering,
- AMDGPUPermlaneLowering>(converter, chipset);
+ AMDGPUPermlaneLowering, AMDGPUMakeDmaBaseLowering>(converter,
+ chipset);
patterns.add<AMDGPUSwizzleBitModeLowering>(converter);
}
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
index 5ff640b5d1596..8fc6220efc6ad 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
@@ -41,6 +41,38 @@ using namespace mlir::amdgpu;
#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.cpp.inc"
+namespace mlir::amdgpu {
+bool hasGlobalMemorySpace(Attribute memorySpace) {
+ if (!memorySpace)
+ return true;
+ if (auto intMemorySpace = dyn_cast<IntegerAttr>(memorySpace))
+ return intMemorySpace.getInt() == 0 || intMemorySpace.getInt() == 1;
+ if (auto gpuMemorySpace = dyn_cast<gpu::AddressSpaceAttr>(memorySpace))
+ return gpuMemorySpace.getValue() == gpu::AddressSpace::Global;
+ return false;
+}
+
+bool hasWorkgroupMemorySpace(Attribute memorySpace) {
+ if (!memorySpace)
+ return false;
+ if (auto intMemorySpace = dyn_cast<IntegerAttr>(memorySpace))
+ return intMemorySpace.getInt() == 3;
+ if (auto gpuMemorySpace = dyn_cast<gpu::AddressSpaceAttr>(memorySpace))
+ return gpuMemorySpace.getValue() == gpu::AddressSpace::Workgroup;
+ return false;
+}
+
+bool hasFatRawBufferMemorySpace(Attribute memorySpace) {
+ if (!memorySpace)
+ return false;
+ if (auto intMemorySpace = dyn_cast<IntegerAttr>(memorySpace))
+ return intMemorySpace.getInt() == 7;
+ if (auto gpuMemorySpace = dyn_cast<amdgpu::AddressSpaceAttr>(memorySpace))
+ return gpuMemorySpace.getValue() == amdgpu::AddressSpace::FatRawBuffer;
+ return false;
+}
+} // namespace mlir::amdgpu
+
namespace {
struct AMDGPUInlinerInterface final : DialectInlinerInterface {
using DialectInlinerInterface::DialectInlinerInterface;
@@ -158,36 +190,6 @@ LogicalResult FatRawBufferCastOp::verify() {
return success();
}
-static bool hasGlobalMemorySpace(Attribute memorySpace) {
- if (!memorySpace)
- return true;
- if (auto intMemorySpace = dyn_cast<IntegerAttr>(memorySpace))
- return intMemorySpace.getInt() == 0 || intMemorySpace.getInt() == 1;
- if (auto gpuMemorySpace = dyn_cast<gpu::AddressSpaceAttr>(memorySpace))
- return gpuMemorySpace.getValue() == gpu::AddressSpace::Global;
- return false;
-}
-
-static bool hasWorkgroupMemorySpace(Attribute memorySpace) {
- if (!memorySpace)
- return false;
- if (auto intMemorySpace = dyn_cast<IntegerAttr>(memorySpace))
- return intMemorySpace.getInt() == 3;
- if (auto gpuMemorySpace = dyn_cast<gpu::AddressSpaceAttr>(memorySpace))
- return gpuMemorySpace.getValue() == gpu::AddressSpace::Workgroup;
- return false;
-}
-
-static bool hasFatRawBufferMemorySpace(Attribute memorySpace) {
- if (!memorySpace)
- return false;
- if (auto intMemorySpace = dyn_cast<IntegerAttr>(memorySpace))
- return intMemorySpace.getInt() == 7;
- if (auto gpuMemorySpace = dyn_cast<amdgpu::AddressSpaceAttr>(memorySpace))
- return gpuMemorySpace.getValue() == amdgpu::AddressSpace::FatRawBuffer;
- return false;
-}
-
//===----------------------------------------------------------------------===//
// RawBuffer*Op
//===----------------------------------------------------------------------===//
@@ -705,6 +707,23 @@ LogicalResult TransposeLoadOp::verify() {
return success();
}
+//===----------------------------------------------------------------------===//
+// MakeDmaBaseOp
+//===----------------------------------------------------------------------===//
+
+LogicalResult MakeDmaBaseOp::verify() {
+ MemRefType srcType = cast<MemRefType>(getSrc().getType());
+ MemRefType dstType = cast<MemRefType>(getDst().getType());
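+  // Exactly one direction must hold: either src is in LDS and dst is global
+  // (a store from LDS), or src is global and dst is in LDS (a load to LDS).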
+  bool storeFromLds = hasWorkgroupMemorySpace(srcType.getMemorySpace()) &&
+                      hasGlobalMemorySpace(dstType.getMemorySpace());
+  bool loadToLds = hasGlobalMemorySpace(srcType.getMemorySpace()) &&
+                   hasWorkgroupMemorySpace(dstType.getMemorySpace());
+  if (storeFromLds == loadToLds)
+    return emitOpError("invalid combination of address spaces.");
+ return success();
+}
+
//===----------------------------------------------------------------------===//
// MakeDmaDescriptorOp
//===----------------------------------------------------------------------===//
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/cvt_scale_pk-gfx1250.mlir b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
similarity index 73%
rename from mlir/test/Conversion/AMDGPUToROCDL/cvt_scale_pk-gfx1250.mlir
rename to mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
index d2391140ce056..96d03a427215f 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/cvt_scale_pk-gfx1250.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
@@ -162,3 +162,77 @@ func.func @amdgpu.scaled_ext_packed816_invalid_dst_elem_type(%v: vector<16xf6E3M
%ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E3M2FN>, vector<4xf8E8M0FNU> -> vector<16xf64>
return %ret0: vector<16xf64>
}
+
+// -----
+
+#gpu_global_addrspace = 1
+#gpu_lds_addrspace = 3
+#amdgpu_fat_buffer_addrspace = 7
+
+// CHECK-LABEL: func @make_dma_base
+// CHECK-SAME: (%[[IDX:.+]]: index, %[[MEM:.+]]: memref<8xi32, 1>, %[[SMEM:.+]]: memref<8xi32, 3>)
+func.func @make_dma_base(%idx: index, %mem: memref<8xi32, #gpu_global_addrspace>, %smem: memref<8xi32, #gpu_lds_addrspace>) -> (!amdgpu.tdm_base<i32>, !amdgpu.tdm_base<i32>) {
+ // CHECK-DAG: %[[INT:.+]] = builtin.unrealized_conversion_cast %[[IDX]] : index to i64
+ // CHECK-DAG: %[[MEMREF_DESC_MEM:.+]] = builtin.unrealized_conversion_cast %[[MEM]] : memref<8xi32, 1>
+ // CHECK-DAG: %[[MEMREF_DESC_SMEM:.+]] = builtin.unrealized_conversion_cast %[[SMEM]] : memref<8xi32, 3>
+
+ // CHECK-DAG: %[[MEM_BASE_PTR:.+]] = llvm.extractvalue %[[MEMREF_DESC_MEM]][1] : !llvm.struct<(ptr<1>
+ // CHECK-DAG: %[[SMEM_BASE_PTR:.+]] = llvm.extractvalue %[[MEMREF_DESC_SMEM]][1] : !llvm.struct<(ptr<3>
+
+ // CHECK-DAG: %[[MEM_BASE_OFFSET:.+]] = llvm.getelementptr %[[MEM_BASE_PTR]][%[[INT]]]
+ // CHECK-DAG: %[[SMEM_BASE_OFFSET:.+]] = llvm.getelementptr %[[SMEM_BASE_PTR]][%[[INT]]]
+
+ // CHECK-DAG: %[[MEM_INT:.+]] = llvm.ptrtoint %[[MEM_BASE_OFFSET]] : !llvm.ptr<1> to i64
+ // CHECK-DAG: %[[SMEM_INT:.+]] = llvm.ptrtoint %[[SMEM_BASE_OFFSET]] : !llvm.ptr<3> to i32
+
+ // CHECK-DAG: %[[MASK:.+]] = llvm.mlir.constant(144115188075855871 : i64) : i64
+ // CHECK: %[[MEM_INT_LOW_57:.+]] = llvm.and %[[MEM_INT]], %[[MASK]]
+ // CHECK: %[[C32:.+]] = llvm.mlir.constant(32 : i64) : i64
+ // CHECK: %[[SHIFT:.+]] = llvm.lshr %[[MEM_INT_LOW_57]], %[[C32]]
+ // CHECK-DAG: %[[MEM_INT_LOW:.+]] = llvm.trunc %[[MEM_INT_LOW_57]] : i64 to i32
+ // CHECK-DAG: %[[MEM_INT_HIGH:.+]] = llvm.trunc %[[SHIFT]] : i64 to i32
+
+ // CHECK-DAG: %[[C0:.+]] = llvm.mlir.constant(0 : i32) : i32
+ // CHECK-DAG: %[[C1:.+]] = llvm.mlir.constant(1 : i32) : i32
+ // CHECK-DAG: %[[C2:.+]] = llvm.mlir.constant(2 : i32) : i32
+ // CHECK-DAG: %[[C3:.+]] = llvm.mlir.constant(3 : i32) : i32
+
+ // CHECK: %[[V4I32_0_0:.+]] = llvm.mlir.undef : vector<4xi32>
+ // CHECK: %[[V4I32_0_1:.+]] = llvm.insertelement %[[C0]], %[[V4I32_0_0]][%[[C0]] : i32]
+ // CHECK: %[[V4I32_0_2:.+]] = llvm.insertelement %[[SMEM_INT]], %[[V4I32_0_1]][%[[C1]] : i32]
+ // CHECK: %[[V4I32_0_3:.+]] = llvm.insertelement %[[MEM_INT_LOW]], %[[V4I32_0_2]][%[[C2]] : i32]
+ // CHECK: %[[V4I32_0_4:.+]] = llvm.insertelement %[[MEM_INT_HIGH]], %[[V4I32_0_3]][%[[C3]] : i32]
+
+ %0 = amdgpu.make_dma_base %mem[%idx], %smem[%idx] : memref<8xi32, #gpu_global_addrspace>, memref<8xi32, #gpu_lds_addrspace> -> !amdgpu.tdm_base<i32>
+
+ // CHECK-DAG: %[[MEM_BASE_PTR:.+]] = llvm.extractvalue %[[MEMREF_DESC_MEM]][1] : !llvm.struct<(ptr<1>
+ // CHECK-DAG: %[[SMEM_BASE_PTR:.+]] = llvm.extractvalue %[[MEMREF_DESC_SMEM]][1] : !llvm.struct<(ptr<3>
+
+ // CHECK-DAG: %[[MEM_BASE_OFFSET:.+]] = llvm.getelementptr %[[MEM_BASE_PTR]][%[[INT]]]
+ // CHECK-DAG: %[[SMEM_BASE_OFFSET:.+]] = llvm.getelementptr %[[SMEM_BASE_PTR]][%[[INT]]]
+
+ // CHECK-DAG: %[[MEM_INT:.+]] = llvm.ptrtoint %[[MEM_BASE_OFFSET]] : !llvm.ptr<1> to i64
+ // CHECK-DAG: %[[SMEM_INT:.+]] = llvm.ptrtoint %[[SMEM_BASE_OFFSET]] : !llvm.ptr<3> to i32
+
+ // CHECK-DAG: %[[MASK:.+]] = llvm.mlir.constant(144115188075855871 : i64) : i64
+ // CHECK: %[[MEM_INT_LOW_57:.+]] = llvm.and %[[MEM_INT]], %[[MASK]]
+ // CHECK: %[[C32:.+]] = llvm.mlir.constant(32 : i64) : i64
+ // CHECK: %[[SHIFT:.+]] = llvm.lshr %[[MEM_INT_LOW_57]], %[[C32]]
+ // CHECK-DAG: %[[MEM_INT_LOW:.+]] = llvm.trunc %[[MEM_INT_LOW_57]] : i64 to i32
+ // CHECK-DAG: %[[MEM_INT_HIGH:.+]] = llvm.trunc %[[SHIFT]] : i64 to i32
+
+ // CHECK-DAG: %[[C0:.+]] = llvm.mlir.constant(0 : i32) : i32
+ // CHECK-DAG: %[[C1:.+]] = llvm.mlir.constant(1 : i32) : i32
+ // CHECK-DAG: %[[C2:.+]] = llvm.mlir.constant(2 : i32) : i32
+ // CHECK-DAG: %[[C3:.+]] = llvm.mlir.constant(3 : i32) : i32
+
+ // CHECK: %[[V4I32_1_0:.+]] = llvm.mlir.undef : vector<4xi32>
+ // CHECK: %[[V4I32_1_1:.+]] = llvm.insertelement %[[C0]], %[[V4I32_1_0]][%[[C0]] : i32]
+ // CHECK: %[[V4I32_1_2:.+]] = llvm.insertelement %[[SMEM_INT]], %[[V4I32_1_1]][%[[C1]] : i32]
+ // CHECK: %[[V4I32_1_3:.+]] = llvm.insertelement %[[MEM_INT_LOW]], %[[V4I32_1_2]][%[[C2]] : i32]
+ // CHECK: %[[V4I32_1_4:.+]] = llvm.insertelement %[[MEM_INT_HIGH]], %[[V4I32_1_3]][%[[C3]] : i32]
+
+ %1 = amdgpu.make_dma_base %smem[%idx], %mem[%idx] : memref<8xi32, #gpu_lds_addrspace>, memref<8xi32, #gpu_global_addrspace> -> !amdgpu.tdm_base<i32>
+
+ func.return %0, %1 : !amdgpu.tdm_base<i32>, !amdgpu.tdm_base<i32>
+}
From 3ee5464060d2bae3e071b4910ef09e0b0d4f6728 Mon Sep 17 00:00:00 2001
From: Erick Ochoa <erick.ochoalopez@amd.com>
Date: Thu, 27 Nov 2025 09:41:38 -0500
Subject: [PATCH 4/4] Update documentation
---
mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 28 ++++++++++---------
1 file changed, 15 insertions(+), 13 deletions(-)
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 9cb0752fba48b..1806c747046b8 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -1258,19 +1258,21 @@ def AMDGPU_MakeDmaBaseOp :
to
```mlir
- // pseudocode
- %base_0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr)>
- %base_1 = llvm.insertvalue %global_addr, %base_0[0] : !llvm.struct<(ptr, ptr)>
- %base_2 = llvm.insertvalue %lds_addr, %base_1[1] : !llvm.struct<(ptr, ptr)>
- // type(%base_2) = !llvm.struct<(ptr, ptr)> roughly corresponds to !amdgpu.tdm_base<i32>
-
- // The base will be used when constructing dgroup0
- // when lowering amdgpu.make_dma_descriptor
- %dgroup0_0 = llvm.mlir.undef : !llvm.struct<(....)>
- %dgroup0_1 = llvm.insertvalue %base_2, %dgroup0_0 : ....
-
- // When lowering amdgpu.tensor_load_to_lds
- rocdl.tensor.load.to.lds %dgroup0_1, %dgroup1, %dgroup2, %dgroup3 cachepolicy 0 : vector<4xi32>, vector<8xi32>
+ // pseudo-code
+ %global_base = llvm.extractvalue %global_memref[1]
+ %global_address = llvm.getelementptr ...
+
+ %lds_base = llvm.extractvalue %lds_memref[1]
+ %lds_address = llvm.getelementptr ...
+
+ // Definition of %base
+ %undef = llvm.mlir.undef : vector<4xi32>
+ %v0 = llvm.insertelement %c0, %undef[0] : vector<4xi32> // %c0 is the i32 constant 0
+ %v1 = llvm.insertelement %lds_address, %v0[1] : vector<4xi32>
+ %v2 = llvm.insertelement %global_address_low, %v1[2] : vector<4xi32>
+ %base = llvm.insertelement %global_address_high, %v2[3] : vector<4xi32>
+
+ rocdl.tensor.load.to.lds %base, %dgroup1, %dgroup2, %dgroup3 cachepolicy 0 : vector<4xi32>, vector<8xi32>
```
These tensor DMA operations were introduced in gfx1250.