[Mlir-commits] [mlir] Update TDM ops to use the new barrier type, improve docs (PR #180572)
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Mon Feb 9 09:48:23 PST 2026
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
@llvm/pr-subscribers-mlir
Author: Krzysztof Drewniak (krzysz00)
<details>
<summary>Changes</summary>
Now that we have an AMDGPU dialect type for the in-LDS barriers that the tensor data mover can automatically visit, update the definition of the tensor descriptor operations to use said type and document the behavior of the barrier.
---
Full diff: https://github.com/llvm/llvm-project/pull/180572.diff
4 Files Affected:
- (modified) mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td (+3-2)
- (modified) mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir (+2-2)
- (modified) mlir/test/Dialect/AMDGPU/invalid.mlir (+2-2)
- (modified) mlir/test/Dialect/AMDGPU/ops.mlir (+4-4)
``````````diff
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
index 6042a958a2b3b..55eeea98dcd65 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUOps.td
@@ -1379,7 +1379,7 @@ class AMDGPU_MakeDescriptorOp<string mnemonic> :
Optional<I1>: $early_timeout,
Optional<I32>: $pad_amount,
Optional<I32>: $pad_interval,
- Optional<AnyMemRef>: $atomic_barrier_address,
+ Optional<MemRefOf<[AMDGPU_DsBarrierStateType]>>: $atomic_barrier_address,
Variadic<Index>: $atomic_barrier_indices,
Optional<Index>: $global_increment,
Optional<I32>: $lds_increment,
@@ -1466,7 +1466,8 @@ def AMDGPU_MakeDmaDescriptorOp : AMDGPU_MakeDescriptorOp<"make_dma_descriptor">
$pad_interval must be a power of two contained in [2, 256].
$pad_amount must be a value contained in [1, 128].
- $atomic_barrier_address must be aligned to 8 bytes.
+ If an atomic barrier is provided, it will be arrived at **once** after
+ each load/store using this descriptor is completed.
2D and 3D tensors may be iterated over by setting $global_increment, $lds_increment, and $iteration_count.
$global_increment determines how much to increment the starting global memory address per iteration in units of the $base's element type.
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
index af12cfdd9a633..ca87e2a519805 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
@@ -360,7 +360,7 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>) -> !amdgpu.tdm_desc
// CHECK-LABEL: func @make_dma_descriptor_atomic_barrier
// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>, %[[BARRIER:.+]]: {{.*}}, %[[IDX:.+]]: index)
-func.func @make_dma_descriptor_atomic_barrier(%base: !amdgpu.tdm_base<i32>, %barrier : memref<8xi32, #gpu_lds_addrspace>, %idx: index) -> !amdgpu.tdm_descriptor {
+func.func @make_dma_descriptor_atomic_barrier(%base: !amdgpu.tdm_base<i32>, %barrier : memref<2x!amdgpu.ds_barrier_state, #gpu_lds_addrspace>, %idx: index) -> !amdgpu.tdm_descriptor {
// CHECK-DAG: %[[INDEX:.+]] = builtin.unrealized_conversion_cast %[[IDX]] : index to i64
// CHECK-DAG: %[[BARRIER_MEMREF_DESC:.+]] = builtin.unrealized_conversion_cast %[[BARRIER]]
// CHECK-DAG: %[[DGROUP0:.+]] = builtin.unrealized_conversion_cast %[[BASE]]
@@ -403,7 +403,7 @@ func.func @make_dma_descriptor_atomic_barrier(%base: !amdgpu.tdm_base<i32>, %bar
%descriptor = amdgpu.make_dma_descriptor %base globalSize [128, 64]
globalStride [64, 1]
sharedSize [128, 64]
- atomicBarrier(%barrier[%idx] : memref<8xi32, #gpu_lds_addrspace>)
+ atomicBarrier(%barrier[%idx] : memref<2x!amdgpu.ds_barrier_state, #gpu_lds_addrspace>)
: !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
func.return %descriptor : !amdgpu.tdm_descriptor
}
diff --git a/mlir/test/Dialect/AMDGPU/invalid.mlir b/mlir/test/Dialect/AMDGPU/invalid.mlir
index 474fca1157118..f725d61b75e02 100644
--- a/mlir/test/Dialect/AMDGPU/invalid.mlir
+++ b/mlir/test/Dialect/AMDGPU/invalid.mlir
@@ -396,9 +396,9 @@ func.func @make_gather_dma_base_invalid_addressspace(%idx: index, %smem : memref
// -----
-func.func @make_dma_base_invalid_barrier(%base: !amdgpu.tdm_base<i32>, %barrier: memref<8xi32>, %idx: index) {
+func.func @make_dma_base_invalid_barrier(%base: !amdgpu.tdm_base<i32>, %barrier: memref<8x!amdgpu.ds_barrier_state>, %idx: index) {
// expected-error @+1 {{'amdgpu.make_dma_descriptor' op atomic barrier address must be in LDS.}}
- amdgpu.make_dma_descriptor %base globalSize [64, 64] globalStride [64, 1] sharedSize [64, 64] atomicBarrier(%barrier[%idx] : memref<8xi32>) : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+ amdgpu.make_dma_descriptor %base globalSize [64, 64] globalStride [64, 1] sharedSize [64, 64] atomicBarrier(%barrier[%idx] : memref<8x!amdgpu.ds_barrier_state>) : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
return
}
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index c3e7c8b70f4ee..1bc95bd741944 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -704,8 +704,8 @@ func.func @make_dma_base(%idx: index, %mem: memref<8xi32>, %smem: memref<8xi32,
}
// CHECK-LABEL: func @make_dma_descriptor
-// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>, %[[WG_MASK:.+]]: vector<16xi1>, %[[TIMEOUT:.+]]: i1, %[[BARRIER:.+]]: memref<8xi32, #gpu.address_space<workgroup>>, %[[IDX:.+]]: index, %[[I32:.+]]: i32)
-func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>, %wg_mask: vector<16xi1>, %timeout: i1, %barrier: memref<8xi32, #gpu.address_space<workgroup>>, %idx: index, %i32: i32) {
+// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>, %[[WG_MASK:.+]]: vector<16xi1>, %[[TIMEOUT:.+]]: i1, %[[BARRIER:.+]]: memref<2x!amdgpu.ds_barrier_state, #gpu.address_space<workgroup>>, %[[IDX:.+]]: index, %[[I32:.+]]: i32)
+func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>, %wg_mask: vector<16xi1>, %timeout: i1, %barrier: memref<2x!amdgpu.ds_barrier_state, #gpu.address_space<workgroup>>, %idx: index, %i32: i32) {
// CHECK: amdgpu.make_dma_descriptor %[[BASE]]
amdgpu.make_dma_descriptor %base
@@ -762,8 +762,8 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>, %wg_mask: vector<16
globalStride [64, 1]
// CHECK-SAME: sharedSize [64, 64]
sharedSize [64, 64]
- // CHECK-SAME: atomicBarrier(%[[BARRIER]][%[[IDX]]] : memref<8xi32, #gpu.address_space<workgroup>>)
- atomicBarrier(%barrier[%idx] : memref<8xi32, #gpu.address_space<workgroup>>)
+ // CHECK-SAME: atomicBarrier(%[[BARRIER]][%[[IDX]]] : memref<2x!amdgpu.ds_barrier_state, #gpu.address_space<workgroup>>)
+ atomicBarrier(%barrier[%idx] : memref<2x!amdgpu.ds_barrier_state, #gpu.address_space<workgroup>>)
: !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
// CHECK: amdgpu.make_dma_descriptor %[[BASE]]
``````````
</details>
https://github.com/llvm/llvm-project/pull/180572
More information about the Mlir-commits mailing list