[Mlir-commits] [mlir] [mlir][amdgpu] Add tensor load store operations (PR #170918)
Erick Ochoa Lopez
llvmlistbot at llvm.org
Tue Dec 9 06:46:56 PST 2025
https://github.com/amd-eochoalo updated https://github.com/llvm/llvm-project/pull/170918
>From 14eb785fdf78c3d8e9bca7c3b2a332f0da762efa Mon Sep 17 00:00:00 2001
From: Erick Ochoa <erick.ochoalopez at amd.com>
Date: Fri, 5 Dec 2025 12:04:46 -0500
Subject: [PATCH 01/10] [mlir][amdgpu] Make tdm_descriptor parametric
---
mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 5 +++++
mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp | 9 +++++++++
mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir | 14 +++++++-------
.../AMDGPU/amdgpu-make-dma-descriptor-fold.mlir | 6 +++---
mlir/test/Dialect/AMDGPU/invalid.mlir | 10 +++++-----
mlir/test/Dialect/AMDGPU/ops.mlir | 10 +++++-----
6 files changed, 34 insertions(+), 20 deletions(-)
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 56160d3e8fe85..27f9a1f774b18 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -111,6 +111,11 @@ def AMDGPU_TDMDescriptorType : AMDGPU_Type<"TDMDescriptor", "tdm_descriptor"> {
used in tensor_load_to_lds or tensor_store_from_lds.
}];
+ let parameters = (ins "unsigned": $size);
+
+ let assemblyFormat = "`<` $size `>`";
+ let genVerifyDecl = 1;
+
}
//===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
index b7a665b0f5367..bfd7c165807fa 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
@@ -781,6 +781,15 @@ LogicalResult MakeDmaBaseOp::verify() {
// MakeDmaDescriptorOp
//===----------------------------------------------------------------------===//
+LogicalResult
+TDMDescriptorType::verify(function_ref<InFlightDiagnostic()> emitError,
+ unsigned size) {
+ if (!llvm::is_contained<unsigned>({2, 4}, size))
+ return emitError() << "only groups of size 2 or 4 are valid but got "
+ << size;
+ return success();
+}
+
LogicalResult MakeDmaDescriptorOp::verify() {
ArrayRef<int64_t> globalStaticStrides = getGlobalStaticStrides();
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
index a94e17ab5b9a5..b9ae279d22cb1 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
@@ -239,7 +239,7 @@ func.func @make_dma_base(%idx: index, %mem: memref<8xi32, #gpu_global_addrspace>
// CHECK-LABEL: func @make_dma_descriptor
// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>)
-func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>) -> !amdgpu.tdm_descriptor {
+func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>) -> !amdgpu.tdm_descriptor<2> {
// CHECK-DAG: %[[DGROUP0:.+]] = builtin.unrealized_conversion_cast %[[BASE]]
// CHECK-DAG: %[[C0:.+]] = llvm.mlir.constant(0 : i32)
@@ -304,9 +304,9 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>) -> !amdgpu.tdm_desc
// CHECK: %[[DGROUP1_6:.+]] = llvm.insertelement %[[SGPR6]], %[[DGROUP1_5]][%[[C6]] : i32]
// CHECK: %[[DGROUP1:.+]] = llvm.insertelement %[[SGPR7]], %[[DGROUP1_6]][%[[C7]] : i32]
- // CHECK: %[[DGROUPS:.+]] = builtin.unrealized_conversion_cast %[[DGROUP0]], %[[DGROUP1]] : vector<4xi32>, vector<8xi32> to !amdgpu.tdm_descriptor
- %descriptor = amdgpu.make_dma_descriptor %base globalSize [128, 64] globalStride [64, 1] sharedSize [128, 64] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
- func.return %descriptor : !amdgpu.tdm_descriptor
+ // CHECK: %[[DGROUPS:.+]] = builtin.unrealized_conversion_cast %[[DGROUP0]], %[[DGROUP1]] : vector<4xi32>, vector<8xi32> to !amdgpu.tdm_descriptor<2>
+ %descriptor = amdgpu.make_dma_descriptor %base globalSize [128, 64] globalStride [64, 1] sharedSize [128, 64] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor<2>
+ func.return %descriptor : !amdgpu.tdm_descriptor<2>
}
// -----
@@ -317,7 +317,7 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>) -> !amdgpu.tdm_desc
// CHECK-LABEL: func @make_dma_descriptor_atomic_barrier
// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>, %[[BARRIER:.+]]: {{.*}}, %[[IDX:.+]]: index)
-func.func @make_dma_descriptor_atomic_barrier(%base: !amdgpu.tdm_base<i32>, %barrier : memref<8xi32, #gpu_lds_addrspace>, %idx: index) -> !amdgpu.tdm_descriptor {
+func.func @make_dma_descriptor_atomic_barrier(%base: !amdgpu.tdm_base<i32>, %barrier : memref<8xi32, #gpu_lds_addrspace>, %idx: index) -> !amdgpu.tdm_descriptor<2> {
// CHECK-DAG: %[[INDEX:.+]] = builtin.unrealized_conversion_cast %[[IDX]] : index to i64
// CHECK-DAG: %[[BARRIER_MEMREF_DESC:.+]] = builtin.unrealized_conversion_cast %[[BARRIER]]
// CHECK-DAG: %[[DGROUP0:.+]] = builtin.unrealized_conversion_cast %[[BASE]]
@@ -361,8 +361,8 @@ func.func @make_dma_descriptor_atomic_barrier(%base: !amdgpu.tdm_base<i32>, %bar
globalStride [64, 1]
sharedSize [128, 64]
atomicBarrier(%barrier[%idx] : memref<8xi32, #gpu_lds_addrspace>)
- : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
- func.return %descriptor : !amdgpu.tdm_descriptor
+ : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor<2>
+ func.return %descriptor : !amdgpu.tdm_descriptor<2>
}
// -----
diff --git a/mlir/test/Dialect/AMDGPU/amdgpu-make-dma-descriptor-fold.mlir b/mlir/test/Dialect/AMDGPU/amdgpu-make-dma-descriptor-fold.mlir
index 9d43c9940f8e0..7dfd67407d78a 100644
--- a/mlir/test/Dialect/AMDGPU/amdgpu-make-dma-descriptor-fold.mlir
+++ b/mlir/test/Dialect/AMDGPU/amdgpu-make-dma-descriptor-fold.mlir
@@ -2,7 +2,7 @@
// CHECK-LABEL: @make_dma_descriptor_fold
// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>, %[[IDX:.+]]: index)
-func.func @make_dma_descriptor_fold(%base: !amdgpu.tdm_base<i32>, %idx: index) -> !amdgpu.tdm_descriptor {
+func.func @make_dma_descriptor_fold(%base: !amdgpu.tdm_base<i32>, %idx: index) -> !amdgpu.tdm_descriptor<2> {
%c64 = arith.constant 64 : index
// CHECK: amdgpu.make_dma_descriptor %[[BASE]]
@@ -14,6 +14,6 @@ func.func @make_dma_descriptor_fold(%base: !amdgpu.tdm_base<i32>, %idx: index) -
// CHECK-SAME: sharedSize [64, 64]
sharedSize [%c64, %c64]
iterate %idx, %idx, %idx
- : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
- func.return %0 : !amdgpu.tdm_descriptor
+ : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor<2>
+ func.return %0 : !amdgpu.tdm_descriptor<2>
}
diff --git a/mlir/test/Dialect/AMDGPU/invalid.mlir b/mlir/test/Dialect/AMDGPU/invalid.mlir
index 6308ea9a6a096..98fd2756810b4 100644
--- a/mlir/test/Dialect/AMDGPU/invalid.mlir
+++ b/mlir/test/Dialect/AMDGPU/invalid.mlir
@@ -373,7 +373,7 @@ func.func @make_dma_base_invalid_addressspace(%idx: index, %smem : memref<8xi32,
func.func @make_dma_base_invalid_barrier(%base: !amdgpu.tdm_base<i32>, %barrier: memref<8xi32>, %idx: index) {
// expected-error at +1 {{'amdgpu.make_dma_descriptor' op atomic barrier address must be in LDS.}}
- amdgpu.make_dma_descriptor %base globalSize [64, 64] globalStride [64, 1] sharedSize [64, 64] atomicBarrier(%barrier[%idx] : memref<8xi32>) : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+ amdgpu.make_dma_descriptor %base globalSize [64, 64] globalStride [64, 1] sharedSize [64, 64] atomicBarrier(%barrier[%idx] : memref<8xi32>) : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor<2>
}
// -----
@@ -382,7 +382,7 @@ func.func @make_dma_base_invalid_barrier(%base: !amdgpu.tdm_base<i32>, %barrier:
// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>)
func.func @make_dma_descriptor_invalid_empty_strides(%base: !amdgpu.tdm_base<i32>) {
// expected-error at +1 {{'amdgpu.make_dma_descriptor' op strides must not be empty.}}
- amdgpu.make_dma_descriptor %base globalSize [0, 1] globalStride [] sharedSize [1, 0] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+ amdgpu.make_dma_descriptor %base globalSize [0, 1] globalStride [] sharedSize [1, 0] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor<2>
func.return
}
@@ -392,7 +392,7 @@ func.func @make_dma_descriptor_invalid_empty_strides(%base: !amdgpu.tdm_base<i32
// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>)
func.func @make_dma_descriptor_invalid_innermost_stride(%base: !amdgpu.tdm_base<i32>) {
// expected-error at +1 {{'amdgpu.make_dma_descriptor' op strides for the innermost dimension must be 1.}}
- amdgpu.make_dma_descriptor %base globalSize [2, 2] globalStride [1, 2] sharedSize [1, 0] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+ amdgpu.make_dma_descriptor %base globalSize [2, 2] globalStride [1, 2] sharedSize [1, 0] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor<2>
func.return
}
@@ -402,7 +402,7 @@ func.func @make_dma_descriptor_invalid_innermost_stride(%base: !amdgpu.tdm_base<
// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>)
func.func @make_dma_descriptor_invalid_size_and_stride_sizes(%base: !amdgpu.tdm_base<i32>) {
// expected-error at +1 {{'amdgpu.make_dma_descriptor' op strides and sizes must have same rank.}}
- amdgpu.make_dma_descriptor %base globalSize [1, 1, 1] globalStride [1, 1] sharedSize [1, 0] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+ amdgpu.make_dma_descriptor %base globalSize [1, 1, 1] globalStride [1, 1] sharedSize [1, 0] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor<2>
func.return
}
@@ -412,7 +412,7 @@ func.func @make_dma_descriptor_invalid_size_and_stride_sizes(%base: !amdgpu.tdm_
// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>)
func.func @make_dma_descriptor_invalid_shared_and_global_rank(%base: !amdgpu.tdm_base<i32>) {
// expected-error at +1 {{'amdgpu.make_dma_descriptor' op tensor must have same rank as tile.}}
- amdgpu.make_dma_descriptor %base globalSize [4, 4] globalStride [1, 1] sharedSize [1, 2, 3] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+ amdgpu.make_dma_descriptor %base globalSize [4, 4] globalStride [1, 1] sharedSize [1, 2, 3] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor<2>
func.return
}
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index 651aff4a0d22a..7b1dfe82d889e 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -706,8 +706,8 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>, %wg_mask: i16, %tim
globalSize [64, 64]
// CHECK-SAME: globalStride [64, 1]
globalStride [64, 1]
- // CHECK-SAME: sharedSize [64, 64] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
- sharedSize [64, 64] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+ // CHECK-SAME: sharedSize [64, 64] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor<2>
+ sharedSize [64, 64] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor<2>
// CHECK: amdgpu.make_dma_descriptor %[[BASE]]
amdgpu.make_dma_descriptor %base
@@ -719,7 +719,7 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>, %wg_mask: i16, %tim
sharedSize [64, 64]
// CHECK-SAME: padShared(%[[IDX]] every %[[IDX]])
padShared(%idx every %idx)
- : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+ : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor<2>
// CHECK: amdgpu.make_dma_descriptor %[[BASE]]
amdgpu.make_dma_descriptor %base
@@ -757,7 +757,7 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>, %wg_mask: i16, %tim
sharedSize [64, 64]
// CHECK-SAME: atomicBarrier(%[[BARRIER]][%[[IDX]]] : memref<8xi32, #gpu.address_space<workgroup>>)
atomicBarrier(%barrier[%idx] : memref<8xi32, #gpu.address_space<workgroup>>)
- : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+ : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor<2>
// CHECK: amdgpu.make_dma_descriptor %[[BASE]]
amdgpu.make_dma_descriptor %base
@@ -769,7 +769,7 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>, %wg_mask: i16, %tim
sharedSize [64, 64]
// CHECK-SAME: iterate %[[IDX]], %[[IDX]], %[[IDX]]
iterate %idx, %idx, %idx
- : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+ : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor<4>
func.return
}
>From b8cec0aec30ace0f4e6edd75a6ac718767ac9f30 Mon Sep 17 00:00:00 2001
From: Erick Ochoa <erick.ochoalopez at amd.com>
Date: Fri, 5 Dec 2025 14:34:05 -0500
Subject: [PATCH 02/10] Removed unused code
---
mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp | 5 -----
1 file changed, 5 deletions(-)
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 7584b17075225..0a1fa091e490e 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -2692,11 +2692,6 @@ struct AMDGPUMakeDmaDescriptorLowering
Location loc = op.getLoc();
- IntegerType i32 = rewriter.getI32Type();
- [[maybe_unused]] Type v4i32 =
- this->typeConverter->convertType(VectorType::get(4, i32));
- assert(v4i32 && "expected type conversion to succeed");
-
SmallVector<Value> consts;
for (int64_t i = 0; i < 8; i++)
consts.push_back(createI32Constant(rewriter, loc, i));
>From e49b8f22104954fd1098fba4a6570fa60fbafa05 Mon Sep 17 00:00:00 2001
From: Erick Ochoa <erick.ochoalopez at amd.com>
Date: Wed, 3 Dec 2025 10:50:36 -0500
Subject: [PATCH 03/10] [mlir][amdgpu] Lower tensor load store ops.
* Adds attributes for cache scopes and temporal hints.
* Makes tdm_descriptor parametric.
* Lowers tensor_load_to_lds and tensor_store_from_lds.
---
mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 128 ++++++++++++++++++
.../AMDGPUToROCDL/AMDGPUToROCDL.cpp | 75 +++++++++-
.../Conversion/AMDGPUToROCDL/gfx1250.mlir | 126 ++++++++++++++++-
mlir/test/Dialect/AMDGPU/ops.mlir | 4 +-
4 files changed, 326 insertions(+), 7 deletions(-)
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 27f9a1f774b18..95cefda110511 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -80,6 +80,97 @@ def AMDGPU_AddressSpaceAttr : EnumAttr<AMDGPU_Dialect, AMDGPU_AddressSpace,
let assemblyFormat = "`<` $value `>`";
}
+def AMDGPU_TemporalLoadHints : I32EnumAttr<"TemporalLoadHints",
+ "AMDGPU-specific temporal load hints",
+ [
+ I32EnumAttrCase<"RegularTemporal", 0, "regular">,
+ I32EnumAttrCase<"NonTemporal", 1, "nontemporal">,
+ I32EnumAttrCase<"HighPriorityTemporal", 2, "highpriority">,
+ I32EnumAttrCase<"LastUse", 3, "lastuse">,
+ I32EnumAttrCase<"NT_RT", 4, "nontemporal_regular">,
+ I32EnumAttrCase<"RT_NT", 5, "regular_nontemporal">,
+ I32EnumAttrCase<"NT_HT", 6, "nontemporal_highpriority">,
+ ]> {
+ let genSpecializedAttr = 0;
+ let cppNamespace = "::mlir::amdgpu";
+}
+
+def AMDGPU_TemporalLoadHintsAttr : EnumAttr<AMDGPU_Dialect, AMDGPU_TemporalLoadHints,
+ "temporal_load_hint"> {
+ let description = [{
+ AMDGPU-specific temporal load hints.
+
+    - `regular` (default).
+    - `nontemporal`: re-use is not expected.
+    - `highpriority`: precedence over `regular`.
+ - `lastuse`: last-use.
+ - `nontemporal_regular`: non-temporal for near cache(s) and regular for far caches.
+ - `regular_nontemporal`: regular for near cache(s) and non-temporal for far caches.
+ - `nontemporal_highpriority`: non-temporal for near cache(s) and high priority for far caches.
+ }];
+ let assemblyFormat = "`<` $value `>`";
+}
+
+def AMDGPU_TemporalStoreHints : I32EnumAttr<"TemporalStoreHints",
+ "AMDGPU-specific temporal store hints",
+ [
+ I32EnumAttrCase<"RegularTemporal", 0, "regular">,
+ I32EnumAttrCase<"NonTemporal", 1, "nontemporal">,
+ I32EnumAttrCase<"HighPriorityTemporal", 2, "highpriority">,
+ I32EnumAttrCase<"WriteBack", 3, "writeback">,
+ I32EnumAttrCase<"NT_RT", 4, "nontemporal_regular">,
+ I32EnumAttrCase<"RT_NT", 5, "regular_nontemporal">,
+ I32EnumAttrCase<"NT_HT", 6, "nontemporal_highpriority">,
+ I32EnumAttrCase<"NT_WB", 7, "nontemporal_writeback">,
+ ]> {
+ let genSpecializedAttr = 0;
+ let cppNamespace = "::mlir::amdgpu";
+}
+
+def AMDGPU_TemporalStoreHintsAttr : EnumAttr<AMDGPU_Dialect, AMDGPU_TemporalStoreHints,
+ "temporal_store_hint"> {
+ let description = [{
+    AMDGPU-specific temporal store hints.
+
+    - `regular` (default).
+    - `nontemporal`: re-use is not expected.
+    - `highpriority`: precedence over `regular`.
+    - `writeback`: same as `highpriority` but also overrides wr-rinse in the far cache, forcing the data to stay dirty in the cache.
+ - `nontemporal_regular`: non-temporal for near cache(s) and regular for far caches.
+ - `regular_nontemporal`: regular for near cache(s) and non-temporal for far caches.
+ - `nontemporal_highpriority`: non-temporal for near cache(s) and high priority for far caches.
+ - `nontemporal_writeback`: non-temporal for near cache(s) and WB for far cache.
+ }];
+
+ let assemblyFormat = "`<` $value `>`";
+}
+
+def AMDGPU_CacheScope : I32EnumAttr<"CacheScope",
+ "Cache scope control enums.",
+ [
+ I32EnumAttrCase<"Workgroup", 0, "workgroup">,
+ I32EnumAttrCase<"ShaderEngine", 1, "shader_engine">,
+ I32EnumAttrCase<"Device", 2, "device">,
+ I32EnumAttrCase<"System", 3, "system">,
+ ]> {
+ let genSpecializedAttr = 0;
+ let cppNamespace = "::mlir::amdgpu";
+}
+
+def AMDGPU_CacheScopeAttr : EnumAttr<AMDGPU_Dialect, AMDGPU_CacheScope,
+ "cache_scope"> {
+ let description = [{
+ AMDGPU cache scope control enums.
+
+    - `workgroup`: coherent among all VMEM threads in a workgroup.
+    - `shader_engine`: coherent among all clients (threads) sharing an SE-cache.
+ - `device`: coherent among all threads on the same device.
+ - `system`: system
+ }];
+
+ let assemblyFormat = "`<` $value `>`";
+}
+
//===----------------------------------------------------------------------===//
// AMDGPU Type definitions
//===----------------------------------------------------------------------===//
@@ -1394,4 +1485,41 @@ def AMDGPU_MakeDmaDescriptorOp :
let hasFolder = 1;
}
+def AMDGPU_TensorLoadToLDSOp :
+ AMDGPU_Op<"tensor_load_to_lds", [MemoryEffects<[MemWrite]>, MemoryEffects<[MemRead]>]>,
+ Arguments<(ins AMDGPU_TDMDescriptorType: $desc,
+ DefaultValuedOptionalAttr<AMDGPU_CacheScopeAttr, "CacheScope::Workgroup">: $cache_scope,
+ DefaultValuedOptionalAttr<AMDGPU_TemporalLoadHintsAttr, "TemporalLoadHints::RegularTemporal">: $temporal_hint,
+ DefaultValuedOptionalAttr<BoolAttr, "false">: $non_volatile)> {
+ let summary = "Load tensors from global memory to LDS.";
+ let description = [{
+ Load tensors of up to five dimensions from global memory to LDS.
+
+ This operation was introduced in gfx1250.
+ }];
+
+ let assemblyFormat = [{
+ $desc attr-dict `:` qualified(type($desc))
+ }];
+}
+
+def AMDGPU_TensorStoreFromLDSOp :
+ AMDGPU_Op<"tensor_store_from_lds", [MemoryEffects<[MemWrite]>, MemoryEffects<[MemRead]>]>,
+ Arguments<(ins AMDGPU_TDMDescriptorType: $desc,
+ DefaultValuedOptionalAttr<AMDGPU_CacheScopeAttr, "CacheScope::Workgroup">: $cache_scope,
+ DefaultValuedOptionalAttr<AMDGPU_TemporalStoreHintsAttr, "TemporalStoreHints::RegularTemporal">: $temporal_hint,
+ DefaultValuedOptionalAttr<BoolAttr, "false">: $non_volatile)> {
+
+ let summary = "Store tensors from LDS to global memory.";
+ let description = [{
+ Store tensors of up to five dimensions from LDS to global memory.
+
+ This operation was introduced in gfx1250.
+ }];
+
+ let assemblyFormat = [{
+ $desc attr-dict `:` qualified(type($desc))
+ }];
+}
+
#endif // AMDGPU
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 0a1fa091e490e..4da038b844ab5 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -2705,6 +2705,47 @@ struct AMDGPUMakeDmaDescriptorLowering
}
};
+template <typename SourceOp, typename TargetD2Op, typename TargetOp>
+struct AMDGPUTensorLoadStoreOpLowering
+ : public ConvertOpToLLVMPattern<SourceOp> {
+ using ConvertOpToLLVMPattern<SourceOp>::ConvertOpToLLVMPattern;
+ using Adaptor = typename ConvertOpToLLVMPattern<SourceOp>::OneToNOpAdaptor;
+ AMDGPUTensorLoadStoreOpLowering(const LLVMTypeConverter &converter,
+ Chipset chipset)
+ : ConvertOpToLLVMPattern<SourceOp>(converter), chipset(chipset) {}
+ Chipset chipset;
+
+ LogicalResult
+ matchAndRewrite(SourceOp op, Adaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ if (chipset < kGfx1250)
+ return op->emitOpError("is only supported on gfx1250");
+
+ ValueRange desc = adaptor.getDesc();
+ uint32_t temporalHint = static_cast<uint32_t>(op.getTemporalHint());
+ bool nonVolatile = static_cast<bool>(op.getNonVolatile());
+ uint32_t cacheScope = static_cast<uint32_t>(op.getCacheScope());
+ int32_t cachePolicy = cacheScope | temporalHint << 2 | nonVolatile << 5;
+
+ if (op.getDesc().getType().getSize() == 2) {
+ rewriter.replaceOpWithNewOp<TargetD2Op>(op, desc[0], desc[1],
+ cachePolicy,
+ /*alias_scopes=*/nullptr,
+ /*noalias_scopes=*/nullptr,
+ /*tbaa=*/nullptr);
+ return success();
+ }
+
+ rewriter.replaceOpWithNewOp<TargetOp>(op, desc[0], desc[1], desc[2],
+ desc[3], cachePolicy,
+ /*alias_scopes=*/nullptr,
+ /*noalias_scopes=*/nullptr,
+ /*tbaa=*/nullptr);
+
+ return success();
+ }
+};
+
struct ConvertAMDGPUToROCDLPass
: public impl::ConvertAMDGPUToROCDLPassBase<ConvertAMDGPUToROCDLPass> {
using Base::Base;
@@ -2723,6 +2764,30 @@ struct ConvertAMDGPUToROCDLPass
Type i32 = IntegerType::get(type.getContext(), 32);
return converter.convertType(VectorType::get(4, i32));
});
+ converter.addConversion(
+ [&](TDMDescriptorType type,
+ SmallVectorImpl<Type> &result) -> std::optional<LogicalResult> {
+ Type i32 = IntegerType::get(type.getContext(), 32);
+ Type v4i32 = converter.convertType(VectorType::get(4, i32));
+ Type v8i32 = converter.convertType(VectorType::get(8, i32));
+ result.push_back(v4i32);
+ result.push_back(v8i32);
+ if (type.getSize() != 2) {
+ result.push_back(v4i32);
+ result.push_back(v4i32);
+ }
+ return success();
+ });
+
+ auto addUnrealizedCast = [](OpBuilder &builder, TypeRange types,
+ ValueRange inputs,
+ Location loc) -> SmallVector<Value> {
+ auto cast =
+ UnrealizedConversionCastOp::create(builder, loc, types, inputs);
+ return cast.getResults();
+ };
+
+ converter.addTargetMaterialization(addUnrealizedCast);
populateAMDGPUToROCDLConversionPatterns(converter, patterns, *maybeChipset);
LLVMConversionTarget target(getContext());
@@ -2779,7 +2844,13 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
ScaledExtPackedOpLowering, PackedScaledTruncOpLowering,
PackedTrunc2xFp8OpLowering, PackedStochRoundFp8OpLowering,
GatherToLDSOpLowering, TransposeLoadOpLowering, AMDGPUPermlaneLowering,
- AMDGPUMakeDmaBaseLowering, AMDGPUMakeDmaDescriptorLowering>(converter,
- chipset);
+ AMDGPUMakeDmaBaseLowering, AMDGPUMakeDmaDescriptorLowering,
+ AMDGPUTensorLoadStoreOpLowering<TensorLoadToLDSOp,
+ ROCDL::TensorLoadToLDSD2Op,
+ ROCDL::TensorLoadToLDSOp>,
+ AMDGPUTensorLoadStoreOpLowering<TensorStoreFromLDSOp,
+ ROCDL::TensorStoreFromLDSD2Op,
+ ROCDL::TensorStoreFromLDSOp>>(converter,
+ chipset);
patterns.add<AMDGPUSwizzleBitModeLowering>(converter);
}
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
index b9ae279d22cb1..ee7d6a76fd5d4 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
@@ -369,7 +369,7 @@ func.func @make_dma_descriptor_atomic_barrier(%base: !amdgpu.tdm_base<i32>, %bar
// CHECK-LABEL: func @make_dma_descriptor_workgroup_mask
// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>, %[[WG_MASK:.+]]: i16, %[[TIMEOUT:.+]]: i1)
-func.func @make_dma_descriptor_workgroup_mask(%base: !amdgpu.tdm_base<i32>, %wg_mask: i16, %timeout: i1) -> !amdgpu.tdm_descriptor {
+func.func @make_dma_descriptor_workgroup_mask(%base: !amdgpu.tdm_base<i32>, %wg_mask: i16, %timeout: i1) -> !amdgpu.tdm_descriptor<2> {
// CHECK-DAG: %[[DGROUP0:.+]] = builtin.unrealized_conversion_cast %[[BASE]]
// CHECK-DAG: %[[C0:.+]] = llvm.mlir.constant(0 : i32)
@@ -440,6 +440,126 @@ func.func @make_dma_descriptor_workgroup_mask(%base: !amdgpu.tdm_base<i32>, %wg_
// CHECK: %[[DGROUP1:.+]] = llvm.insertelement %[[SGPR7]], %[[DGROUP1_6]][%[[C7]] : i32]
// CHECK: %[[DGROUPS:.+]] = builtin.unrealized_conversion_cast %[[DGROUP0]], %[[DGROUP1]] : vector<4xi32>, vector<8xi32> to !amdgpu.tdm_descriptor
- %descriptor = amdgpu.make_dma_descriptor %base globalSize [128, 64] globalStride [64, 1] sharedSize [128, 64] workgroupMask %wg_mask earlyTimeout %timeout : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
- func.return %descriptor : !amdgpu.tdm_descriptor
+ %descriptor = amdgpu.make_dma_descriptor %base globalSize [128, 64] globalStride [64, 1] sharedSize [128, 64] workgroupMask %wg_mask earlyTimeout %timeout : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor<2>
+ func.return %descriptor : !amdgpu.tdm_descriptor<2>
+}
+
+// CHECK-LABEL: func @tensor_load_to_lds_d2
+// CHECK-SAME: (%[[DESC:.+]]: !amdgpu.tdm_descriptor<2>)
+func.func @tensor_load_to_lds_d2(%desc: !amdgpu.tdm_descriptor<2>) {
+ // CHECK: %[[DGROUPS:.+]]:2 = builtin.unrealized_conversion_cast %[[DESC]]
+ // CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 0 : vector<4xi32>, vector<8xi32>
+ amdgpu.tensor_load_to_lds %desc : !amdgpu.tdm_descriptor<2>
+
+ // CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 0 : vector<4xi32>, vector<8xi32>
+ amdgpu.tensor_load_to_lds %desc { cache_scope = #amdgpu.cache_scope<workgroup> } : !amdgpu.tdm_descriptor<2>
+
+ // CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 1 : vector<4xi32>, vector<8xi32>
+ amdgpu.tensor_load_to_lds %desc { cache_scope = #amdgpu.cache_scope<shader_engine> } : !amdgpu.tdm_descriptor<2>
+
+ // CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 2 : vector<4xi32>, vector<8xi32>
+ amdgpu.tensor_load_to_lds %desc { cache_scope = #amdgpu.cache_scope<device> } : !amdgpu.tdm_descriptor<2>
+
+ // CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 3 : vector<4xi32>, vector<8xi32>
+ amdgpu.tensor_load_to_lds %desc { cache_scope = #amdgpu.cache_scope<system> } : !amdgpu.tdm_descriptor<2>
+
+ // CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 0 : vector<4xi32>, vector<8xi32>
+ amdgpu.tensor_load_to_lds %desc { temporal_hint = #amdgpu.temporal_load_hint<regular> } : !amdgpu.tdm_descriptor<2>
+
+ // CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 4 : vector<4xi32>, vector<8xi32>
+ amdgpu.tensor_load_to_lds %desc { temporal_hint = #amdgpu.temporal_load_hint<nontemporal> } : !amdgpu.tdm_descriptor<2>
+
+ // CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 8 : vector<4xi32>, vector<8xi32>
+ amdgpu.tensor_load_to_lds %desc { temporal_hint = #amdgpu.temporal_load_hint<highpriority> } : !amdgpu.tdm_descriptor<2>
+
+ // CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 12 : vector<4xi32>, vector<8xi32>
+ amdgpu.tensor_load_to_lds %desc { temporal_hint = #amdgpu.temporal_load_hint<lastuse> } : !amdgpu.tdm_descriptor<2>
+
+ // CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 16 : vector<4xi32>, vector<8xi32>
+ amdgpu.tensor_load_to_lds %desc { temporal_hint = #amdgpu.temporal_load_hint<nontemporal_regular> } : !amdgpu.tdm_descriptor<2>
+
+ // CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 20 : vector<4xi32>, vector<8xi32>
+ amdgpu.tensor_load_to_lds %desc { temporal_hint = #amdgpu.temporal_load_hint<regular_nontemporal> } : !amdgpu.tdm_descriptor<2>
+
+ // CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 24 : vector<4xi32>, vector<8xi32>
+ amdgpu.tensor_load_to_lds %desc { temporal_hint = #amdgpu.temporal_load_hint<nontemporal_highpriority> } : !amdgpu.tdm_descriptor<2>
+
+ // CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 0 : vector<4xi32>, vector<8xi32>
+ amdgpu.tensor_load_to_lds %desc { non_volatile = false } : !amdgpu.tdm_descriptor<2>
+
+ // CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 32 : vector<4xi32>, vector<8xi32>
+ amdgpu.tensor_load_to_lds %desc { non_volatile = true } : !amdgpu.tdm_descriptor<2>
+
+ func.return
}
+
+// CHECK-LABEL: func @tensor_load_to_lds
+// CHECK-SAME: (%[[DESC:.+]]: !amdgpu.tdm_descriptor<4>)
+func.func @tensor_load_to_lds(%desc: !amdgpu.tdm_descriptor<4>) {
+ // CHECK: %[[DGROUPS:.+]]:4 = builtin.unrealized_conversion_cast %[[DESC]]
+ // CHECK: rocdl.tensor.load.to.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 0 : vector<4xi32>, vector<8xi32>
+ amdgpu.tensor_load_to_lds %desc : !amdgpu.tdm_descriptor<4>
+ func.return
+}
+
+// CHECK-LABEL: func @tensor_store_from_lds_d2
+// CHECK-SAME: (%[[DESC:.+]]: !amdgpu.tdm_descriptor<2>)
+func.func @tensor_store_from_lds_d2(%desc: !amdgpu.tdm_descriptor<2>) {
+ // CHECK: %[[DGROUPS:.+]]:2 = builtin.unrealized_conversion_cast %[[DESC]]
+ // CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 0 : vector<4xi32>, vector<8xi32>
+ amdgpu.tensor_store_from_lds %desc : !amdgpu.tdm_descriptor<2>
+
+ // CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 0 : vector<4xi32>, vector<8xi32>
+ amdgpu.tensor_store_from_lds %desc { cache_scope = #amdgpu.cache_scope<workgroup> } : !amdgpu.tdm_descriptor<2>
+
+ // CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 1 : vector<4xi32>, vector<8xi32>
+ amdgpu.tensor_store_from_lds %desc { cache_scope = #amdgpu.cache_scope<shader_engine> } : !amdgpu.tdm_descriptor<2>
+
+ // CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 2 : vector<4xi32>, vector<8xi32>
+ amdgpu.tensor_store_from_lds %desc { cache_scope = #amdgpu.cache_scope<device> } : !amdgpu.tdm_descriptor<2>
+
+ // CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 3 : vector<4xi32>, vector<8xi32>
+ amdgpu.tensor_store_from_lds %desc { cache_scope = #amdgpu.cache_scope<system> } : !amdgpu.tdm_descriptor<2>
+
+ // CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 0 : vector<4xi32>, vector<8xi32>
+ amdgpu.tensor_store_from_lds %desc { temporal_hint = #amdgpu.temporal_store_hint<regular> } : !amdgpu.tdm_descriptor<2>
+
+ // CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 4 : vector<4xi32>, vector<8xi32>
+ amdgpu.tensor_store_from_lds %desc { temporal_hint = #amdgpu.temporal_store_hint<nontemporal> } : !amdgpu.tdm_descriptor<2>
+
+ // CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 8 : vector<4xi32>, vector<8xi32>
+ amdgpu.tensor_store_from_lds %desc { temporal_hint = #amdgpu.temporal_store_hint<highpriority> } : !amdgpu.tdm_descriptor<2>
+
+ // CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 12 : vector<4xi32>, vector<8xi32>
+ amdgpu.tensor_store_from_lds %desc { temporal_hint = #amdgpu.temporal_store_hint<writeback> } : !amdgpu.tdm_descriptor<2>
+
+ // CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 16 : vector<4xi32>, vector<8xi32>
+ amdgpu.tensor_store_from_lds %desc { temporal_hint = #amdgpu.temporal_store_hint<nontemporal_regular> } : !amdgpu.tdm_descriptor<2>
+
+ // CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 20 : vector<4xi32>, vector<8xi32>
+ amdgpu.tensor_store_from_lds %desc { temporal_hint = #amdgpu.temporal_store_hint<regular_nontemporal> } : !amdgpu.tdm_descriptor<2>
+
+ // CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 24 : vector<4xi32>, vector<8xi32>
+ amdgpu.tensor_store_from_lds %desc { temporal_hint = #amdgpu.temporal_store_hint<nontemporal_highpriority> } : !amdgpu.tdm_descriptor<2>
+
+ // CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 28 : vector<4xi32>, vector<8xi32>
+ amdgpu.tensor_store_from_lds %desc { temporal_hint = #amdgpu.temporal_store_hint<nontemporal_writeback> } : !amdgpu.tdm_descriptor<2>
+
+ // CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 0 : vector<4xi32>, vector<8xi32>
+ amdgpu.tensor_store_from_lds %desc { non_volatile = false } : !amdgpu.tdm_descriptor<2>
+
+ // CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 32 : vector<4xi32>, vector<8xi32>
+ amdgpu.tensor_store_from_lds %desc { non_volatile = true } : !amdgpu.tdm_descriptor<2>
+ func.return
+}
+
+
+// CHECK-LABEL: func @tensor_store_from_lds
+// CHECK-SAME: (%[[DESC:.+]]: !amdgpu.tdm_descriptor<4>)
+func.func @tensor_store_from_lds(%desc: !amdgpu.tdm_descriptor<4>) {
+ // CHECK: %[[DGROUPS:.+]]:4 = builtin.unrealized_conversion_cast %[[DESC]]
+ // CHECK: rocdl.tensor.store.from.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 0 : vector<4xi32>, vector<8xi32>
+ amdgpu.tensor_store_from_lds %desc : !amdgpu.tdm_descriptor<4>
+ func.return
+}
+
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index 7b1dfe82d889e..b199f15c24a33 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -731,7 +731,7 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>, %wg_mask: i16, %tim
sharedSize [64, 64]
// CHECK-SAME: workgroupMask %[[WG_MASK]]
workgroupMask %wg_mask
- : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+ : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor<2>
// CHECK: amdgpu.make_dma_descriptor %[[BASE]]
amdgpu.make_dma_descriptor %base
@@ -745,7 +745,7 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>, %wg_mask: i16, %tim
workgroupMask %wg_mask
// CHECK-SAME: earlyTimeout %[[TIMEOUT]]
earlyTimeout %timeout
- : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+ : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor<2>
// CHECK: amdgpu.make_dma_descriptor %[[BASE]]
amdgpu.make_dma_descriptor %base
>From 3a9dbb9937900c44581cf47b465b0323089ca717 Mon Sep 17 00:00:00 2001
From: Erick Ochoa <erick.ochoalopez at amd.com>
Date: Mon, 8 Dec 2025 14:31:56 -0500
Subject: [PATCH 04/10] reflow
---
mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 4da038b844ab5..e1523bcb01a85 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -2728,8 +2728,7 @@ struct AMDGPUTensorLoadStoreOpLowering
int32_t cachePolicy = cacheScope | temporalHint << 2 | nonVolatile << 5;
if (op.getDesc().getType().getSize() == 2) {
- rewriter.replaceOpWithNewOp<TargetD2Op>(op, desc[0], desc[1],
- cachePolicy,
+ rewriter.replaceOpWithNewOp<TargetD2Op>(op, desc[0], desc[1], cachePolicy,
/*alias_scopes=*/nullptr,
/*noalias_scopes=*/nullptr,
/*tbaa=*/nullptr);
>From 0b42ea0acdbaa3b0e346d223c7d65a9c0ad8764f Mon Sep 17 00:00:00 2001
From: Erick Ochoa <erick.ochoalopez at amd.com>
Date: Mon, 8 Dec 2025 14:56:29 -0500
Subject: [PATCH 05/10] Revert "[mlir][amdgpu] Make tdm_descriptor parametric"
This reverts commit 14eb785fdf78c3d8e9bca7c3b2a332f0da762efa.
---
mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 5 -----
mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp | 9 ---------
mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir | 14 +++++++-------
.../AMDGPU/amdgpu-make-dma-descriptor-fold.mlir | 6 +++---
mlir/test/Dialect/AMDGPU/invalid.mlir | 10 +++++-----
mlir/test/Dialect/AMDGPU/ops.mlir | 10 +++++-----
6 files changed, 20 insertions(+), 34 deletions(-)
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 95cefda110511..7b6056f7acb1e 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -202,11 +202,6 @@ def AMDGPU_TDMDescriptorType : AMDGPU_Type<"TDMDescriptor", "tdm_descriptor"> {
used in tensor_load_to_lds or tensor_store_from_lds.
}];
- let parameters = (ins "unsigned": $size);
-
- let assemblyFormat = "`<` $size `>`";
- let genVerifyDecl = 1;
-
}
//===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
index bfd7c165807fa..b7a665b0f5367 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
@@ -781,15 +781,6 @@ LogicalResult MakeDmaBaseOp::verify() {
// MakeDmaDescriptorOp
//===----------------------------------------------------------------------===//
-LogicalResult
-TDMDescriptorType::verify(function_ref<InFlightDiagnostic()> emitError,
- unsigned size) {
- if (!llvm::is_contained<unsigned>({2, 4}, size))
- return emitError() << "only groups of size 2 or 4 are valid but got "
- << size;
- return success();
-}
-
LogicalResult MakeDmaDescriptorOp::verify() {
ArrayRef<int64_t> globalStaticStrides = getGlobalStaticStrides();
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
index ee7d6a76fd5d4..f64097c76a5f4 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
@@ -239,7 +239,7 @@ func.func @make_dma_base(%idx: index, %mem: memref<8xi32, #gpu_global_addrspace>
// CHECK-LABEL: func @make_dma_descriptor
// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>)
-func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>) -> !amdgpu.tdm_descriptor<2> {
+func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>) -> !amdgpu.tdm_descriptor {
// CHECK-DAG: %[[DGROUP0:.+]] = builtin.unrealized_conversion_cast %[[BASE]]
// CHECK-DAG: %[[C0:.+]] = llvm.mlir.constant(0 : i32)
@@ -304,9 +304,9 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>) -> !amdgpu.tdm_desc
// CHECK: %[[DGROUP1_6:.+]] = llvm.insertelement %[[SGPR6]], %[[DGROUP1_5]][%[[C6]] : i32]
// CHECK: %[[DGROUP1:.+]] = llvm.insertelement %[[SGPR7]], %[[DGROUP1_6]][%[[C7]] : i32]
- // CHECK: %[[DGROUPS:.+]] = builtin.unrealized_conversion_cast %[[DGROUP0]], %[[DGROUP1]] : vector<4xi32>, vector<8xi32> to !amdgpu.tdm_descriptor<2>
- %descriptor = amdgpu.make_dma_descriptor %base globalSize [128, 64] globalStride [64, 1] sharedSize [128, 64] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor<2>
- func.return %descriptor : !amdgpu.tdm_descriptor<2>
+ // CHECK: %[[DGROUPS:.+]] = builtin.unrealized_conversion_cast %[[DGROUP0]], %[[DGROUP1]] : vector<4xi32>, vector<8xi32> to !amdgpu.tdm_descriptor
+ %descriptor = amdgpu.make_dma_descriptor %base globalSize [128, 64] globalStride [64, 1] sharedSize [128, 64] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+ func.return %descriptor : !amdgpu.tdm_descriptor
}
// -----
@@ -317,7 +317,7 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>) -> !amdgpu.tdm_desc
// CHECK-LABEL: func @make_dma_descriptor_atomic_barrier
// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>, %[[BARRIER:.+]]: {{.*}}, %[[IDX:.+]]: index)
-func.func @make_dma_descriptor_atomic_barrier(%base: !amdgpu.tdm_base<i32>, %barrier : memref<8xi32, #gpu_lds_addrspace>, %idx: index) -> !amdgpu.tdm_descriptor<2> {
+func.func @make_dma_descriptor_atomic_barrier(%base: !amdgpu.tdm_base<i32>, %barrier : memref<8xi32, #gpu_lds_addrspace>, %idx: index) -> !amdgpu.tdm_descriptor {
// CHECK-DAG: %[[INDEX:.+]] = builtin.unrealized_conversion_cast %[[IDX]] : index to i64
// CHECK-DAG: %[[BARRIER_MEMREF_DESC:.+]] = builtin.unrealized_conversion_cast %[[BARRIER]]
// CHECK-DAG: %[[DGROUP0:.+]] = builtin.unrealized_conversion_cast %[[BASE]]
@@ -361,8 +361,8 @@ func.func @make_dma_descriptor_atomic_barrier(%base: !amdgpu.tdm_base<i32>, %bar
globalStride [64, 1]
sharedSize [128, 64]
atomicBarrier(%barrier[%idx] : memref<8xi32, #gpu_lds_addrspace>)
- : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor<2>
- func.return %descriptor : !amdgpu.tdm_descriptor<2>
+ : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+ func.return %descriptor : !amdgpu.tdm_descriptor
}
// -----
diff --git a/mlir/test/Dialect/AMDGPU/amdgpu-make-dma-descriptor-fold.mlir b/mlir/test/Dialect/AMDGPU/amdgpu-make-dma-descriptor-fold.mlir
index 7dfd67407d78a..9d43c9940f8e0 100644
--- a/mlir/test/Dialect/AMDGPU/amdgpu-make-dma-descriptor-fold.mlir
+++ b/mlir/test/Dialect/AMDGPU/amdgpu-make-dma-descriptor-fold.mlir
@@ -2,7 +2,7 @@
// CHECK-LABEL: @make_dma_descriptor_fold
// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>, %[[IDX:.+]]: index)
-func.func @make_dma_descriptor_fold(%base: !amdgpu.tdm_base<i32>, %idx: index) -> !amdgpu.tdm_descriptor<2> {
+func.func @make_dma_descriptor_fold(%base: !amdgpu.tdm_base<i32>, %idx: index) -> !amdgpu.tdm_descriptor {
%c64 = arith.constant 64 : index
// CHECK: amdgpu.make_dma_descriptor %[[BASE]]
@@ -14,6 +14,6 @@ func.func @make_dma_descriptor_fold(%base: !amdgpu.tdm_base<i32>, %idx: index) -
// CHECK-SAME: sharedSize [64, 64]
sharedSize [%c64, %c64]
iterate %idx, %idx, %idx
- : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor<2>
- func.return %0 : !amdgpu.tdm_descriptor<2>
+ : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+ func.return %0 : !amdgpu.tdm_descriptor
}
diff --git a/mlir/test/Dialect/AMDGPU/invalid.mlir b/mlir/test/Dialect/AMDGPU/invalid.mlir
index 98fd2756810b4..6308ea9a6a096 100644
--- a/mlir/test/Dialect/AMDGPU/invalid.mlir
+++ b/mlir/test/Dialect/AMDGPU/invalid.mlir
@@ -373,7 +373,7 @@ func.func @make_dma_base_invalid_addressspace(%idx: index, %smem : memref<8xi32,
func.func @make_dma_base_invalid_barrier(%base: !amdgpu.tdm_base<i32>, %barrier: memref<8xi32>, %idx: index) {
// expected-error@+1 {{'amdgpu.make_dma_descriptor' op atomic barrier address must be in LDS.}}
- amdgpu.make_dma_descriptor %base globalSize [64, 64] globalStride [64, 1] sharedSize [64, 64] atomicBarrier(%barrier[%idx] : memref<8xi32>) : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor<2>
+ amdgpu.make_dma_descriptor %base globalSize [64, 64] globalStride [64, 1] sharedSize [64, 64] atomicBarrier(%barrier[%idx] : memref<8xi32>) : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
}
// -----
@@ -382,7 +382,7 @@ func.func @make_dma_base_invalid_barrier(%base: !amdgpu.tdm_base<i32>, %barrier:
// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>)
func.func @make_dma_descriptor_invalid_empty_strides(%base: !amdgpu.tdm_base<i32>) {
// expected-error@+1 {{'amdgpu.make_dma_descriptor' op strides must not be empty.}}
- amdgpu.make_dma_descriptor %base globalSize [0, 1] globalStride [] sharedSize [1, 0] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor<2>
+ amdgpu.make_dma_descriptor %base globalSize [0, 1] globalStride [] sharedSize [1, 0] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
func.return
}
@@ -392,7 +392,7 @@ func.func @make_dma_descriptor_invalid_empty_strides(%base: !amdgpu.tdm_base<i32
// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>)
func.func @make_dma_descriptor_invalid_innermost_stride(%base: !amdgpu.tdm_base<i32>) {
// expected-error@+1 {{'amdgpu.make_dma_descriptor' op strides for the innermost dimension must be 1.}}
- amdgpu.make_dma_descriptor %base globalSize [2, 2] globalStride [1, 2] sharedSize [1, 0] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor<2>
+ amdgpu.make_dma_descriptor %base globalSize [2, 2] globalStride [1, 2] sharedSize [1, 0] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
func.return
}
@@ -402,7 +402,7 @@ func.func @make_dma_descriptor_invalid_innermost_stride(%base: !amdgpu.tdm_base<
// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>)
func.func @make_dma_descriptor_invalid_size_and_stride_sizes(%base: !amdgpu.tdm_base<i32>) {
// expected-error@+1 {{'amdgpu.make_dma_descriptor' op strides and sizes must have same rank.}}
- amdgpu.make_dma_descriptor %base globalSize [1, 1, 1] globalStride [1, 1] sharedSize [1, 0] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor<2>
+ amdgpu.make_dma_descriptor %base globalSize [1, 1, 1] globalStride [1, 1] sharedSize [1, 0] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
func.return
}
@@ -412,7 +412,7 @@ func.func @make_dma_descriptor_invalid_size_and_stride_sizes(%base: !amdgpu.tdm_
// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>)
func.func @make_dma_descriptor_invalid_shared_and_global_rank(%base: !amdgpu.tdm_base<i32>) {
// expected-error@+1 {{'amdgpu.make_dma_descriptor' op tensor must have same rank as tile.}}
- amdgpu.make_dma_descriptor %base globalSize [4, 4] globalStride [1, 1] sharedSize [1, 2, 3] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor<2>
+ amdgpu.make_dma_descriptor %base globalSize [4, 4] globalStride [1, 1] sharedSize [1, 2, 3] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
func.return
}
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index b199f15c24a33..23b60f49a21d0 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -706,8 +706,8 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>, %wg_mask: i16, %tim
globalSize [64, 64]
// CHECK-SAME: globalStride [64, 1]
globalStride [64, 1]
- // CHECK-SAME: sharedSize [64, 64] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor<2>
- sharedSize [64, 64] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor<2>
+ // CHECK-SAME: sharedSize [64, 64] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+ sharedSize [64, 64] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
// CHECK: amdgpu.make_dma_descriptor %[[BASE]]
amdgpu.make_dma_descriptor %base
@@ -719,7 +719,7 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>, %wg_mask: i16, %tim
sharedSize [64, 64]
// CHECK-SAME: padShared(%[[IDX]] every %[[IDX]])
padShared(%idx every %idx)
- : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor<2>
+ : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
// CHECK: amdgpu.make_dma_descriptor %[[BASE]]
amdgpu.make_dma_descriptor %base
@@ -757,7 +757,7 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>, %wg_mask: i16, %tim
sharedSize [64, 64]
// CHECK-SAME: atomicBarrier(%[[BARRIER]][%[[IDX]]] : memref<8xi32, #gpu.address_space<workgroup>>)
atomicBarrier(%barrier[%idx] : memref<8xi32, #gpu.address_space<workgroup>>)
- : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor<2>
+ : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
// CHECK: amdgpu.make_dma_descriptor %[[BASE]]
amdgpu.make_dma_descriptor %base
@@ -769,7 +769,7 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>, %wg_mask: i16, %tim
sharedSize [64, 64]
// CHECK-SAME: iterate %[[IDX]], %[[IDX]], %[[IDX]]
iterate %idx, %idx, %idx
- : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor<4>
+ : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
func.return
}
>From 1aa368f976d039847da380f6947e5578cf91c521 Mon Sep 17 00:00:00 2001
From: Erick Ochoa <erick.ochoalopez at amd.com>
Date: Mon, 8 Dec 2025 15:11:00 -0500
Subject: [PATCH 06/10] Do not make tdm_descriptor parametric
---
.../AMDGPUToROCDL/AMDGPUToROCDL.cpp | 19 +--
.../Conversion/AMDGPUToROCDL/gfx1250.mlir | 155 ++++++++----------
mlir/test/Dialect/AMDGPU/ops.mlir | 4 +-
3 files changed, 73 insertions(+), 105 deletions(-)
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index e1523bcb01a85..a661ed6c6980c 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -2705,7 +2705,7 @@ struct AMDGPUMakeDmaDescriptorLowering
}
};
-template <typename SourceOp, typename TargetD2Op, typename TargetOp>
+template <typename SourceOp, typename TargetOp>
struct AMDGPUTensorLoadStoreOpLowering
: public ConvertOpToLLVMPattern<SourceOp> {
using ConvertOpToLLVMPattern<SourceOp>::ConvertOpToLLVMPattern;
@@ -2727,20 +2727,11 @@ struct AMDGPUTensorLoadStoreOpLowering
uint32_t cacheScope = static_cast<uint32_t>(op.getCacheScope());
int32_t cachePolicy = cacheScope | temporalHint << 2 | nonVolatile << 5;
- if (op.getDesc().getType().getSize() == 2) {
- rewriter.replaceOpWithNewOp<TargetD2Op>(op, desc[0], desc[1], cachePolicy,
- /*alias_scopes=*/nullptr,
- /*noalias_scopes=*/nullptr,
- /*tbaa=*/nullptr);
- return success();
- }
-
rewriter.replaceOpWithNewOp<TargetOp>(op, desc[0], desc[1], desc[2],
desc[3], cachePolicy,
/*alias_scopes=*/nullptr,
/*noalias_scopes=*/nullptr,
/*tbaa=*/nullptr);
-
return success();
}
};
@@ -2771,10 +2762,8 @@ struct ConvertAMDGPUToROCDLPass
Type v8i32 = converter.convertType(VectorType::get(8, i32));
result.push_back(v4i32);
result.push_back(v8i32);
- if (type.getSize() != 2) {
- result.push_back(v4i32);
- result.push_back(v4i32);
- }
+ result.push_back(v4i32);
+ result.push_back(v4i32);
return success();
});
@@ -2845,10 +2834,8 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
GatherToLDSOpLowering, TransposeLoadOpLowering, AMDGPUPermlaneLowering,
AMDGPUMakeDmaBaseLowering, AMDGPUMakeDmaDescriptorLowering,
AMDGPUTensorLoadStoreOpLowering<TensorLoadToLDSOp,
- ROCDL::TensorLoadToLDSD2Op,
ROCDL::TensorLoadToLDSOp>,
AMDGPUTensorLoadStoreOpLowering<TensorStoreFromLDSOp,
- ROCDL::TensorStoreFromLDSD2Op,
ROCDL::TensorStoreFromLDSOp>>(converter,
chipset);
patterns.add<AMDGPUSwizzleBitModeLowering>(converter);
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
index f64097c76a5f4..31a9306cffb95 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
@@ -369,7 +369,7 @@ func.func @make_dma_descriptor_atomic_barrier(%base: !amdgpu.tdm_base<i32>, %bar
// CHECK-LABEL: func @make_dma_descriptor_workgroup_mask
// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>, %[[WG_MASK:.+]]: i16, %[[TIMEOUT:.+]]: i1)
-func.func @make_dma_descriptor_workgroup_mask(%base: !amdgpu.tdm_base<i32>, %wg_mask: i16, %timeout: i1) -> !amdgpu.tdm_descriptor<2> {
+func.func @make_dma_descriptor_workgroup_mask(%base: !amdgpu.tdm_base<i32>, %wg_mask: i16, %timeout: i1) -> !amdgpu.tdm_descriptor {
// CHECK-DAG: %[[DGROUP0:.+]] = builtin.unrealized_conversion_cast %[[BASE]]
// CHECK-DAG: %[[C0:.+]] = llvm.mlir.constant(0 : i32)
@@ -440,126 +440,107 @@ func.func @make_dma_descriptor_workgroup_mask(%base: !amdgpu.tdm_base<i32>, %wg_
// CHECK: %[[DGROUP1:.+]] = llvm.insertelement %[[SGPR7]], %[[DGROUP1_6]][%[[C7]] : i32]
// CHECK: %[[DGROUPS:.+]] = builtin.unrealized_conversion_cast %[[DGROUP0]], %[[DGROUP1]] : vector<4xi32>, vector<8xi32> to !amdgpu.tdm_descriptor
- %descriptor = amdgpu.make_dma_descriptor %base globalSize [128, 64] globalStride [64, 1] sharedSize [128, 64] workgroupMask %wg_mask earlyTimeout %timeout : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor<2>
- func.return %descriptor : !amdgpu.tdm_descriptor<2>
+ %descriptor = amdgpu.make_dma_descriptor %base globalSize [128, 64] globalStride [64, 1] sharedSize [128, 64] workgroupMask %wg_mask earlyTimeout %timeout : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+ func.return %descriptor : !amdgpu.tdm_descriptor
}
-// CHECK-LABEL: func @tensor_load_to_lds_d2
-// CHECK-SAME: (%[[DESC:.+]]: !amdgpu.tdm_descriptor<2>)
-func.func @tensor_load_to_lds_d2(%desc: !amdgpu.tdm_descriptor<2>) {
- // CHECK: %[[DGROUPS:.+]]:2 = builtin.unrealized_conversion_cast %[[DESC]]
- // CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 0 : vector<4xi32>, vector<8xi32>
- amdgpu.tensor_load_to_lds %desc : !amdgpu.tdm_descriptor<2>
+// CHECK-LABEL: func @tensor_load_to_lds
+// CHECK-SAME: (%[[DESC:.+]]: !amdgpu.tdm_descriptor)
+func.func @tensor_load_to_lds(%desc: !amdgpu.tdm_descriptor) {
+ // CHECK: %[[DGROUPS:.+]]:4 = builtin.unrealized_conversion_cast %[[DESC]]
+ // CHECK: rocdl.tensor.load.to.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 0 : vector<4xi32>, vector<8xi32>
+ amdgpu.tensor_load_to_lds %desc : !amdgpu.tdm_descriptor
- // CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 0 : vector<4xi32>, vector<8xi32>
- amdgpu.tensor_load_to_lds %desc { cache_scope = #amdgpu.cache_scope<workgroup> } : !amdgpu.tdm_descriptor<2>
+ // CHECK: rocdl.tensor.load.to.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 0 : vector<4xi32>, vector<8xi32>
+ amdgpu.tensor_load_to_lds %desc { cache_scope = #amdgpu.cache_scope<workgroup> } : !amdgpu.tdm_descriptor
- // CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 1 : vector<4xi32>, vector<8xi32>
- amdgpu.tensor_load_to_lds %desc { cache_scope = #amdgpu.cache_scope<shader_engine> } : !amdgpu.tdm_descriptor<2>
+ // CHECK: rocdl.tensor.load.to.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 1 : vector<4xi32>, vector<8xi32>
+ amdgpu.tensor_load_to_lds %desc { cache_scope = #amdgpu.cache_scope<shader_engine> } : !amdgpu.tdm_descriptor
- // CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 2 : vector<4xi32>, vector<8xi32>
- amdgpu.tensor_load_to_lds %desc { cache_scope = #amdgpu.cache_scope<device> } : !amdgpu.tdm_descriptor<2>
+ // CHECK: rocdl.tensor.load.to.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 2 : vector<4xi32>, vector<8xi32>
+ amdgpu.tensor_load_to_lds %desc { cache_scope = #amdgpu.cache_scope<device> } : !amdgpu.tdm_descriptor
- // CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 3 : vector<4xi32>, vector<8xi32>
- amdgpu.tensor_load_to_lds %desc { cache_scope = #amdgpu.cache_scope<system> } : !amdgpu.tdm_descriptor<2>
+ // CHECK: rocdl.tensor.load.to.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 3 : vector<4xi32>, vector<8xi32>
+ amdgpu.tensor_load_to_lds %desc { cache_scope = #amdgpu.cache_scope<system> } : !amdgpu.tdm_descriptor
- // CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 0 : vector<4xi32>, vector<8xi32>
- amdgpu.tensor_load_to_lds %desc { temporal_hint = #amdgpu.temporal_load_hint<regular> } : !amdgpu.tdm_descriptor<2>
+ // CHECK: rocdl.tensor.load.to.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 0 : vector<4xi32>, vector<8xi32>
+ amdgpu.tensor_load_to_lds %desc { temporal_hint = #amdgpu.temporal_load_hint<regular> } : !amdgpu.tdm_descriptor
- // CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 4 : vector<4xi32>, vector<8xi32>
- amdgpu.tensor_load_to_lds %desc { temporal_hint = #amdgpu.temporal_load_hint<nontemporal> } : !amdgpu.tdm_descriptor<2>
+ // CHECK: rocdl.tensor.load.to.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 4 : vector<4xi32>, vector<8xi32>
+ amdgpu.tensor_load_to_lds %desc { temporal_hint = #amdgpu.temporal_load_hint<nontemporal> } : !amdgpu.tdm_descriptor
- // CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 8 : vector<4xi32>, vector<8xi32>
- amdgpu.tensor_load_to_lds %desc { temporal_hint = #amdgpu.temporal_load_hint<highpriority> } : !amdgpu.tdm_descriptor<2>
+ // CHECK: rocdl.tensor.load.to.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 8 : vector<4xi32>, vector<8xi32>
+ amdgpu.tensor_load_to_lds %desc { temporal_hint = #amdgpu.temporal_load_hint<highpriority> } : !amdgpu.tdm_descriptor
- // CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 12 : vector<4xi32>, vector<8xi32>
- amdgpu.tensor_load_to_lds %desc { temporal_hint = #amdgpu.temporal_load_hint<lastuse> } : !amdgpu.tdm_descriptor<2>
+ // CHECK: rocdl.tensor.load.to.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 12 : vector<4xi32>, vector<8xi32>
+ amdgpu.tensor_load_to_lds %desc { temporal_hint = #amdgpu.temporal_load_hint<lastuse> } : !amdgpu.tdm_descriptor
- // CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 16 : vector<4xi32>, vector<8xi32>
- amdgpu.tensor_load_to_lds %desc { temporal_hint = #amdgpu.temporal_load_hint<nontemporal_regular> } : !amdgpu.tdm_descriptor<2>
+ // CHECK: rocdl.tensor.load.to.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 16 : vector<4xi32>, vector<8xi32>
+ amdgpu.tensor_load_to_lds %desc { temporal_hint = #amdgpu.temporal_load_hint<nontemporal_regular> } : !amdgpu.tdm_descriptor
- // CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 20 : vector<4xi32>, vector<8xi32>
- amdgpu.tensor_load_to_lds %desc { temporal_hint = #amdgpu.temporal_load_hint<regular_nontemporal> } : !amdgpu.tdm_descriptor<2>
+ // CHECK: rocdl.tensor.load.to.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 20 : vector<4xi32>, vector<8xi32>
+ amdgpu.tensor_load_to_lds %desc { temporal_hint = #amdgpu.temporal_load_hint<regular_nontemporal> } : !amdgpu.tdm_descriptor
- // CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 24 : vector<4xi32>, vector<8xi32>
- amdgpu.tensor_load_to_lds %desc { temporal_hint = #amdgpu.temporal_load_hint<nontemporal_highpriority> } : !amdgpu.tdm_descriptor<2>
+ // CHECK: rocdl.tensor.load.to.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 24 : vector<4xi32>, vector<8xi32>
+ amdgpu.tensor_load_to_lds %desc { temporal_hint = #amdgpu.temporal_load_hint<nontemporal_highpriority> } : !amdgpu.tdm_descriptor
- // CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 0 : vector<4xi32>, vector<8xi32>
- amdgpu.tensor_load_to_lds %desc { non_volatile = false } : !amdgpu.tdm_descriptor<2>
+ // CHECK: rocdl.tensor.load.to.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 0 : vector<4xi32>, vector<8xi32>
+ amdgpu.tensor_load_to_lds %desc { non_volatile = false } : !amdgpu.tdm_descriptor
- // CHECK: rocdl.tensor.load.to.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 32 : vector<4xi32>, vector<8xi32>
- amdgpu.tensor_load_to_lds %desc { non_volatile = true } : !amdgpu.tdm_descriptor<2>
+ // CHECK: rocdl.tensor.load.to.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 32 : vector<4xi32>, vector<8xi32>
+ amdgpu.tensor_load_to_lds %desc { non_volatile = true } : !amdgpu.tdm_descriptor
func.return
}
-// CHECK-LABEL: func @tensor_load_to_lds
-// CHECK-SAME: (%[[DESC:.+]]: !amdgpu.tdm_descriptor<4>)
-func.func @tensor_load_to_lds(%desc: !amdgpu.tdm_descriptor<4>) {
+// CHECK-LABEL: func @tensor_store_from_lds
+// CHECK-SAME: (%[[DESC:.+]]: !amdgpu.tdm_descriptor)
+func.func @tensor_store_from_lds(%desc: !amdgpu.tdm_descriptor) {
// CHECK: %[[DGROUPS:.+]]:4 = builtin.unrealized_conversion_cast %[[DESC]]
- // CHECK: rocdl.tensor.load.to.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 0 : vector<4xi32>, vector<8xi32>
- amdgpu.tensor_load_to_lds %desc : !amdgpu.tdm_descriptor<4>
- func.return
-}
-
-// CHECK-LABEL: func @tensor_store_from_lds_d2
-// CHECK-SAME: (%[[DESC:.+]]: !amdgpu.tdm_descriptor<2>)
-func.func @tensor_store_from_lds_d2(%desc: !amdgpu.tdm_descriptor<2>) {
- // CHECK: %[[DGROUPS:.+]]:2 = builtin.unrealized_conversion_cast %[[DESC]]
- // CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 0 : vector<4xi32>, vector<8xi32>
- amdgpu.tensor_store_from_lds %desc : !amdgpu.tdm_descriptor<2>
-
- // CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 0 : vector<4xi32>, vector<8xi32>
- amdgpu.tensor_store_from_lds %desc { cache_scope = #amdgpu.cache_scope<workgroup> } : !amdgpu.tdm_descriptor<2>
-
- // CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 1 : vector<4xi32>, vector<8xi32>
- amdgpu.tensor_store_from_lds %desc { cache_scope = #amdgpu.cache_scope<shader_engine> } : !amdgpu.tdm_descriptor<2>
-
- // CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 2 : vector<4xi32>, vector<8xi32>
- amdgpu.tensor_store_from_lds %desc { cache_scope = #amdgpu.cache_scope<device> } : !amdgpu.tdm_descriptor<2>
+ // CHECK: rocdl.tensor.store.from.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 0 : vector<4xi32>, vector<8xi32>
+ amdgpu.tensor_store_from_lds %desc : !amdgpu.tdm_descriptor
- // CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 3 : vector<4xi32>, vector<8xi32>
- amdgpu.tensor_store_from_lds %desc { cache_scope = #amdgpu.cache_scope<system> } : !amdgpu.tdm_descriptor<2>
+ // CHECK: rocdl.tensor.store.from.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 0 : vector<4xi32>, vector<8xi32>
+ amdgpu.tensor_store_from_lds %desc { cache_scope = #amdgpu.cache_scope<workgroup> } : !amdgpu.tdm_descriptor
- // CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 0 : vector<4xi32>, vector<8xi32>
- amdgpu.tensor_store_from_lds %desc { temporal_hint = #amdgpu.temporal_store_hint<regular> } : !amdgpu.tdm_descriptor<2>
+ // CHECK: rocdl.tensor.store.from.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 1 : vector<4xi32>, vector<8xi32>
+ amdgpu.tensor_store_from_lds %desc { cache_scope = #amdgpu.cache_scope<shader_engine> } : !amdgpu.tdm_descriptor
- // CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 4 : vector<4xi32>, vector<8xi32>
- amdgpu.tensor_store_from_lds %desc { temporal_hint = #amdgpu.temporal_store_hint<nontemporal> } : !amdgpu.tdm_descriptor<2>
+ // CHECK: rocdl.tensor.store.from.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 2 : vector<4xi32>, vector<8xi32>
+ amdgpu.tensor_store_from_lds %desc { cache_scope = #amdgpu.cache_scope<device> } : !amdgpu.tdm_descriptor
- // CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 8 : vector<4xi32>, vector<8xi32>
- amdgpu.tensor_store_from_lds %desc { temporal_hint = #amdgpu.temporal_store_hint<highpriority> } : !amdgpu.tdm_descriptor<2>
+ // CHECK: rocdl.tensor.store.from.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 3 : vector<4xi32>, vector<8xi32>
+ amdgpu.tensor_store_from_lds %desc { cache_scope = #amdgpu.cache_scope<system> } : !amdgpu.tdm_descriptor
- // CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 12 : vector<4xi32>, vector<8xi32>
- amdgpu.tensor_store_from_lds %desc { temporal_hint = #amdgpu.temporal_store_hint<writeback> } : !amdgpu.tdm_descriptor<2>
+ // CHECK: rocdl.tensor.store.from.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 0 : vector<4xi32>, vector<8xi32>
+ amdgpu.tensor_store_from_lds %desc { temporal_hint = #amdgpu.temporal_store_hint<regular> } : !amdgpu.tdm_descriptor
- // CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 16 : vector<4xi32>, vector<8xi32>
- amdgpu.tensor_store_from_lds %desc { temporal_hint = #amdgpu.temporal_store_hint<nontemporal_regular> } : !amdgpu.tdm_descriptor<2>
+ // CHECK: rocdl.tensor.store.from.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 4 : vector<4xi32>, vector<8xi32>
+ amdgpu.tensor_store_from_lds %desc { temporal_hint = #amdgpu.temporal_store_hint<nontemporal> } : !amdgpu.tdm_descriptor
- // CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 20 : vector<4xi32>, vector<8xi32>
- amdgpu.tensor_store_from_lds %desc { temporal_hint = #amdgpu.temporal_store_hint<regular_nontemporal> } : !amdgpu.tdm_descriptor<2>
+ // CHECK: rocdl.tensor.store.from.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 8 : vector<4xi32>, vector<8xi32>
+ amdgpu.tensor_store_from_lds %desc { temporal_hint = #amdgpu.temporal_store_hint<highpriority> } : !amdgpu.tdm_descriptor
- // CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 24 : vector<4xi32>, vector<8xi32>
- amdgpu.tensor_store_from_lds %desc { temporal_hint = #amdgpu.temporal_store_hint<nontemporal_highpriority> } : !amdgpu.tdm_descriptor<2>
+ // CHECK: rocdl.tensor.store.from.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 12 : vector<4xi32>, vector<8xi32>
+ amdgpu.tensor_store_from_lds %desc { temporal_hint = #amdgpu.temporal_store_hint<writeback> } : !amdgpu.tdm_descriptor
- // CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 28 : vector<4xi32>, vector<8xi32>
- amdgpu.tensor_store_from_lds %desc { temporal_hint = #amdgpu.temporal_store_hint<nontemporal_writeback> } : !amdgpu.tdm_descriptor<2>
+ // CHECK: rocdl.tensor.store.from.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 16 : vector<4xi32>, vector<8xi32>
+ amdgpu.tensor_store_from_lds %desc { temporal_hint = #amdgpu.temporal_store_hint<nontemporal_regular> } : !amdgpu.tdm_descriptor
- // CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 0 : vector<4xi32>, vector<8xi32>
- amdgpu.tensor_store_from_lds %desc { non_volatile = false } : !amdgpu.tdm_descriptor<2>
+ // CHECK: rocdl.tensor.store.from.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 20 : vector<4xi32>, vector<8xi32>
+ amdgpu.tensor_store_from_lds %desc { temporal_hint = #amdgpu.temporal_store_hint<regular_nontemporal> } : !amdgpu.tdm_descriptor
- // CHECK: rocdl.tensor.store.from.lds.d2 %[[DGROUPS]]#0, %[[DGROUPS]]#1 cachepolicy 32 : vector<4xi32>, vector<8xi32>
- amdgpu.tensor_store_from_lds %desc { non_volatile = true } : !amdgpu.tdm_descriptor<2>
- func.return
-}
+ // CHECK: rocdl.tensor.store.from.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 24 : vector<4xi32>, vector<8xi32>
+ amdgpu.tensor_store_from_lds %desc { temporal_hint = #amdgpu.temporal_store_hint<nontemporal_highpriority> } : !amdgpu.tdm_descriptor
+ // CHECK: rocdl.tensor.store.from.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 28 : vector<4xi32>, vector<8xi32>
+ amdgpu.tensor_store_from_lds %desc { temporal_hint = #amdgpu.temporal_store_hint<nontemporal_writeback> } : !amdgpu.tdm_descriptor
-// CHECK-LABEL: func @tensor_store_from_lds
-// CHECK-SAME: (%[[DESC:.+]]: !amdgpu.tdm_descriptor<4>)
-func.func @tensor_store_from_lds(%desc: !amdgpu.tdm_descriptor<4>) {
- // CHECK: %[[DGROUPS:.+]]:4 = builtin.unrealized_conversion_cast %[[DESC]]
// CHECK: rocdl.tensor.store.from.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 0 : vector<4xi32>, vector<8xi32>
- amdgpu.tensor_store_from_lds %desc : !amdgpu.tdm_descriptor<4>
+ amdgpu.tensor_store_from_lds %desc { non_volatile = false } : !amdgpu.tdm_descriptor
+
+ // CHECK: rocdl.tensor.store.from.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 32 : vector<4xi32>, vector<8xi32>
+ amdgpu.tensor_store_from_lds %desc { non_volatile = true } : !amdgpu.tdm_descriptor
func.return
}
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index 23b60f49a21d0..651aff4a0d22a 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -731,7 +731,7 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>, %wg_mask: i16, %tim
sharedSize [64, 64]
// CHECK-SAME: workgroupMask %[[WG_MASK]]
workgroupMask %wg_mask
- : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor<2>
+ : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
// CHECK: amdgpu.make_dma_descriptor %[[BASE]]
amdgpu.make_dma_descriptor %base
@@ -745,7 +745,7 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>, %wg_mask: i16, %tim
workgroupMask %wg_mask
// CHECK-SAME: earlyTimeout %[[TIMEOUT]]
earlyTimeout %timeout
- : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor<2>
+ : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
// CHECK: amdgpu.make_dma_descriptor %[[BASE]]
amdgpu.make_dma_descriptor %base
>From 3dcdea7b82e62604eb85ab1484b9922cda93e1b8 Mon Sep 17 00:00:00 2001
From: Erick Ochoa <erick.ochoalopez at amd.com>
Date: Mon, 8 Dec 2025 15:18:42 -0500
Subject: [PATCH 07/10] Replace non_volatile attribute with its inverse, is_volatile
---
mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 4 ++--
mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp | 4 ++--
mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir | 8 ++++----
3 files changed, 8 insertions(+), 8 deletions(-)
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 7b6056f7acb1e..1254425a4bba6 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -1485,7 +1485,7 @@ def AMDGPU_TensorLoadToLDSOp :
Arguments<(ins AMDGPU_TDMDescriptorType: $desc,
DefaultValuedOptionalAttr<AMDGPU_CacheScopeAttr, "CacheScope::Workgroup">: $cache_scope,
DefaultValuedOptionalAttr<AMDGPU_TemporalLoadHintsAttr, "TemporalLoadHints::RegularTemporal">: $temporal_hint,
- DefaultValuedOptionalAttr<BoolAttr, "false">: $non_volatile)> {
+ DefaultValuedOptionalAttr<BoolAttr, "true">: $is_volatile)> {
let summary = "Load tensors from global memory to LDS.";
let description = [{
Load tensors of up to five dimensions from global memory to LDS.
@@ -1503,7 +1503,7 @@ def AMDGPU_TensorStoreFromLDSOp :
Arguments<(ins AMDGPU_TDMDescriptorType: $desc,
DefaultValuedOptionalAttr<AMDGPU_CacheScopeAttr, "CacheScope::Workgroup">: $cache_scope,
DefaultValuedOptionalAttr<AMDGPU_TemporalStoreHintsAttr, "TemporalStoreHints::RegularTemporal">: $temporal_hint,
- DefaultValuedOptionalAttr<BoolAttr, "false">: $non_volatile)> {
+ DefaultValuedOptionalAttr<BoolAttr, "true">: $is_volatile)> {
let summary = "Store tensors from LDS to global memory.";
let description = [{
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index a661ed6c6980c..0b9efae89fe75 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -2723,9 +2723,9 @@ struct AMDGPUTensorLoadStoreOpLowering
ValueRange desc = adaptor.getDesc();
uint32_t temporalHint = static_cast<uint32_t>(op.getTemporalHint());
- bool nonVolatile = static_cast<bool>(op.getNonVolatile());
+ bool isVolatile = !static_cast<bool>(op.getIsVolatile());
uint32_t cacheScope = static_cast<uint32_t>(op.getCacheScope());
- int32_t cachePolicy = cacheScope | temporalHint << 2 | nonVolatile << 5;
+ int32_t cachePolicy = cacheScope | temporalHint << 2 | isVolatile << 5;
rewriter.replaceOpWithNewOp<TargetOp>(op, desc[0], desc[1], desc[2],
desc[3], cachePolicy,
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
index 31a9306cffb95..cc5d330e399d0 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
@@ -485,10 +485,10 @@ func.func @tensor_load_to_lds(%desc: !amdgpu.tdm_descriptor) {
amdgpu.tensor_load_to_lds %desc { temporal_hint = #amdgpu.temporal_load_hint<nontemporal_highpriority> } : !amdgpu.tdm_descriptor
// CHECK: rocdl.tensor.load.to.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 0 : vector<4xi32>, vector<8xi32>
- amdgpu.tensor_load_to_lds %desc { non_volatile = false } : !amdgpu.tdm_descriptor
+ amdgpu.tensor_load_to_lds %desc { is_volatile = true } : !amdgpu.tdm_descriptor
// CHECK: rocdl.tensor.load.to.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 32 : vector<4xi32>, vector<8xi32>
- amdgpu.tensor_load_to_lds %desc { non_volatile = true } : !amdgpu.tdm_descriptor
+ amdgpu.tensor_load_to_lds %desc { is_volatile = false } : !amdgpu.tdm_descriptor
func.return
}
@@ -537,10 +537,10 @@ func.func @tensor_store_from_lds(%desc: !amdgpu.tdm_descriptor) {
amdgpu.tensor_store_from_lds %desc { temporal_hint = #amdgpu.temporal_store_hint<nontemporal_writeback> } : !amdgpu.tdm_descriptor
// CHECK: rocdl.tensor.store.from.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 0 : vector<4xi32>, vector<8xi32>
- amdgpu.tensor_store_from_lds %desc { non_volatile = false } : !amdgpu.tdm_descriptor
+ amdgpu.tensor_store_from_lds %desc { is_volatile = true } : !amdgpu.tdm_descriptor
// CHECK: rocdl.tensor.store.from.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 32 : vector<4xi32>, vector<8xi32>
- amdgpu.tensor_store_from_lds %desc { non_volatile = true } : !amdgpu.tdm_descriptor
+ amdgpu.tensor_store_from_lds %desc { is_volatile = false } : !amdgpu.tdm_descriptor
func.return
}
>From 455a399166a5cbfe4b83ba4bc85143f2f7d82c9f Mon Sep 17 00:00:00 2001
From: Erick Ochoa <erick.ochoalopez at amd.com>
Date: Tue, 9 Dec 2025 09:09:18 -0500
Subject: [PATCH 08/10] Fix identifier: rename local isVolatile to nonVolatile
---
mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 0b9efae89fe75..794d50286e11d 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -2723,9 +2723,9 @@ struct AMDGPUTensorLoadStoreOpLowering
ValueRange desc = adaptor.getDesc();
uint32_t temporalHint = static_cast<uint32_t>(op.getTemporalHint());
- bool isVolatile = !static_cast<bool>(op.getIsVolatile());
+ bool nonVolatile = !static_cast<bool>(op.getIsVolatile());
uint32_t cacheScope = static_cast<uint32_t>(op.getCacheScope());
- int32_t cachePolicy = cacheScope | temporalHint << 2 | isVolatile << 5;
+ int32_t cachePolicy = cacheScope | temporalHint << 2 | nonVolatile << 5;
rewriter.replaceOpWithNewOp<TargetOp>(op, desc[0], desc[1], desc[2],
desc[3], cachePolicy,
>From 3eaa7e10af06b0f62eb9f2f9e6bad83d9f295eee Mon Sep 17 00:00:00 2001
From: Erick Ochoa <erick.ochoalopez at amd.com>
Date: Tue, 9 Dec 2025 09:32:47 -0500
Subject: [PATCH 09/10] Use llvm::append_range
---
mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp | 6 ++----
1 file changed, 2 insertions(+), 4 deletions(-)
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 794d50286e11d..3b49aabb23c59 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -2760,10 +2760,8 @@ struct ConvertAMDGPUToROCDLPass
Type i32 = IntegerType::get(type.getContext(), 32);
Type v4i32 = converter.convertType(VectorType::get(4, i32));
Type v8i32 = converter.convertType(VectorType::get(8, i32));
- result.push_back(v4i32);
- result.push_back(v8i32);
- result.push_back(v4i32);
- result.push_back(v4i32);
+ llvm::append_range(result,
+ ArrayRef<Type>{v4i32, v8i32, v4i32, v4i32});
return success();
});
>From bcc301ce1ab0ecfc4816659401dc1efd7f05e5cb Mon Sep 17 00:00:00 2001
From: Erick Ochoa <erick.ochoalopez at amd.com>
Date: Tue, 9 Dec 2025 09:45:40 -0500
Subject: [PATCH 10/10] Remove a static_cast<bool> on getIsVolatile()
---
mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 3b49aabb23c59..860713482a8c1 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -2723,7 +2723,7 @@ struct AMDGPUTensorLoadStoreOpLowering
ValueRange desc = adaptor.getDesc();
uint32_t temporalHint = static_cast<uint32_t>(op.getTemporalHint());
- bool nonVolatile = !static_cast<bool>(op.getIsVolatile());
+ bool nonVolatile = !op.getIsVolatile();
uint32_t cacheScope = static_cast<uint32_t>(op.getCacheScope());
int32_t cachePolicy = cacheScope | temporalHint << 2 | nonVolatile << 5;
More information about the Mlir-commits mailing list