[Mlir-commits] [mlir] [mlir][amdgpu] Add workgroup_mask to MakeDmaDescriptorOp (PR #171103)
Tim Gymnich
llvmlistbot at llvm.org
Mon Dec 8 01:58:57 PST 2025
https://github.com/tgymnich created https://github.com/llvm/llvm-project/pull/171103
None
>From 8fe5fae9d8e34eae9f54e9f362c1282311c23176 Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at gymni.ch>
Date: Sun, 7 Dec 2025 14:53:38 +0000
Subject: [PATCH 1/2] [mlir][amdgpu] Add workgroup_mask to MakeDmaDescriptorOp
---
mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 4 ++
.../AMDGPUToROCDL/AMDGPUToROCDL.cpp | 17 ++++-
.../Conversion/AMDGPUToROCDL/gfx1250.mlir | 72 +++++++++++++++++++
mlir/test/Dialect/AMDGPU/ops.mlir | 15 +++-
4 files changed, 105 insertions(+), 3 deletions(-)
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index ec9f449c35dc4..03a81a03e16af 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -1295,6 +1295,7 @@ def AMDGPU_MakeDmaDescriptorOp :
DenseI64ArrayAttr: $shared_static_sizes,
Optional<Index>: $pad_amount,
Optional<Index>: $pad_interval,
+ Optional<I16>: $workgroup_mask,
Optional<AnyMemRef>: $atomic_barrier_address,
Variadic<Index>: $atomic_barrier_indices,
Optional<Index>: $global_increment,
@@ -1319,6 +1320,8 @@ def AMDGPU_MakeDmaDescriptorOp :
$pad_interval must be a power of two contained in [2, 256].
$pad_amount must be a value contained in [1, 128].
+ $workgroup_mask determines which workgroups inside of a workgroup cluster receive the load or store.
+
$atomic_barrier_address must be aligned to 8 bytes.
2D and 3D tensors may be iterated over by setting $global_increment, $lds_increment, and $iteration_count.
@@ -1345,6 +1348,7 @@ def AMDGPU_MakeDmaDescriptorOp :
`globalStride` custom<DynamicIndexList>($global_dynamic_strides, $global_static_strides)
`sharedSize` custom<DynamicIndexList>($shared_dynamic_sizes, $shared_static_sizes)
( `padShared` `(` $pad_amount^ `every` $pad_interval `)` )?
+ ( `workgroupMask` $workgroup_mask^)?
( `atomicBarrier` `(` $atomic_barrier_address^ `[` $atomic_barrier_indices `]`
`:` type($atomic_barrier_address) `)`)?
( `iterate` $global_increment^ `,` $lds_increment `,` $iteration_count )?
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 2fcc24ca1c1c3..354069bf3c97a 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -2368,6 +2368,19 @@ struct AMDGPUMakeDmaDescriptorLowering
return LLVM::OrOp::create(rewriter, loc, accumulator, value);
}
+ Value setWorkgroupMask(MakeDmaDescriptorOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter, Location loc,
+ Value sgpr0) const {
+ Value mask = op.getWorkgroupMask();
+
+ if (!mask)
+ return sgpr0;
+
+ Type i32 = rewriter.getI32Type();
+ Value extendedMask = LLVM::ZExtOp::create(rewriter, loc, i32, mask);
+ return setValueAtOffset(rewriter, loc, sgpr0, extendedMask, 0);
+ }
+
Value setDataSize(MakeDmaDescriptorOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter, Location loc,
Value sgpr0, ArrayRef<Value> consts) const {
@@ -2377,7 +2390,8 @@ struct AMDGPUMakeDmaDescriptorLowering
llvm::is_contained<unsigned>({8, 16, 32, 64}, elementTypeWidthInBits) &&
"expected type width to be 8, 16, 32, or 64.");
int64_t dataSize = llvm::Log2_32(elementTypeWidthInBits / 8);
- return createI32Constant(rewriter, loc, dataSize << 16);
+ Value size = createI32Constant(rewriter, loc, dataSize);
+ return setValueAtOffset(rewriter, loc, sgpr0, size, 16);
}
Value setAtomicBarrier(MakeDmaDescriptorOp op, OpAdaptor adaptor,
@@ -2615,6 +2629,7 @@ struct AMDGPUMakeDmaDescriptorLowering
sgprs[i] = consts[0];
}
+ sgprs[0] = setWorkgroupMask(op, adaptor, rewriter, loc, sgprs[0]);
sgprs[0] = setDataSize(op, adaptor, rewriter, loc, sgprs[0], consts);
sgprs[0] = setAtomicBarrier(op, adaptor, rewriter, loc, sgprs[0], consts);
sgprs[0] = setIterateEnable(op, adaptor, rewriter, loc, sgprs[0], consts);
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
index 5a65689ec1f93..9464f9a000053 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
@@ -360,3 +360,75 @@ func.func @make_dma_descriptor_atomic_barrier(%base: !amdgpu.tdm_base<i32>, %bar
: !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
func.return %descriptor : !amdgpu.tdm_descriptor
}
+
+// -----
+
+// CHECK-LABEL: func @make_dma_descriptor_workgroup_mask
+// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>)
+func.func @make_dma_descriptor_workgroup_mask(%base: !amdgpu.tdm_base<i32>, %wg_mask: i16) -> !amdgpu.tdm_descriptor {
+ // CHECK-DAG: %[[DGROUP0:.+]] = builtin.unrealized_conversion_cast %[[BASE]]
+
+ // CHECK-DAG: %[[C0:.+]] = llvm.mlir.constant(0 : i32)
+ // CHECK-DAG: %[[C1:.+]] = llvm.mlir.constant(1 : i32)
+ // CHECK-DAG: %[[C2:.+]] = llvm.mlir.constant(2 : i32)
+ // CHECK-DAG: %[[C3:.+]] = llvm.mlir.constant(3 : i32)
+ // CHECK-DAG: %[[C4:.+]] = llvm.mlir.constant(4 : i32)
+ // CHECK-DAG: %[[C5:.+]] = llvm.mlir.constant(5 : i32)
+ // CHECK-DAG: %[[C6:.+]] = llvm.mlir.constant(6 : i32)
+ // CHECK-DAG: %[[C7:.+]] = llvm.mlir.constant(7 : i32)
+
+ // CHECK-DAG: %[[SGPR0:.+]] = llvm.mlir.constant(131072 : i32)
+
+ // CHECK-DAG: %[[TENSOR_DIM_0:.+]] = llvm.mlir.constant(64 : i32)
+ // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
+ // CHECK: %[[SGPR2_0:.+]] = llvm.lshr %[[TENSOR_DIM_0]], %[[C16]]
+ // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
+ // CHECK: %[[SGPR1:.+]] = llvm.shl %[[TENSOR_DIM_0]], %[[C16]]
+
+ // CHECK-DAG: %[[TENSOR_DIM_1:.+]] = llvm.mlir.constant(128 : i32)
+ // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
+ // CHECK: %[[SGPR3_0:.+]] = llvm.lshr %[[TENSOR_DIM_1]], %[[C16]]
+ // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
+ // CHECK: %[[TENSOR_DIM_1_SHIFTED:.+]] = llvm.shl %[[TENSOR_DIM_1]], %[[C16]]
+ // CHECK: %[[SGPR2:.+]] = llvm.or %[[SGPR2_0]], %[[TENSOR_DIM_1_SHIFTED]]
+
+ // CHECK-DAG: %[[TILE_DIM_0:.+]] = llvm.mlir.constant(64 : i32)
+ // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
+ // CHECK: %[[TILE_DIM_0_SHIFTED:.+]] = llvm.shl %[[TILE_DIM_0:.+]], %[[C16]]
+ // CHECK: %[[SGPR3:.+]] = llvm.or %[[SGPR3_0]], %[[TILE_DIM_0_SHIFTED]]
+
+ // CHECK-DAG: %[[SGPR4:.+]] = llvm.mlir.constant(128 : i32)
+
+ // CHECK-DAG: %[[TENSOR_DIM_0_STRIDE:.+]] = llvm.mlir.constant(1 : i64) : i64
+ // CHECK-DAG: %[[MASK:.+]] = llvm.mlir.constant(281474976710655 : i64) : i64
+ // CHECK: %[[TENSOR_DIM_0_STRIDE_MASKED:.+]] = llvm.and %[[MASK]], %[[TENSOR_DIM_0_STRIDE]]
+ // CHECK-DAG: %[[SGPR5:.+]] = llvm.trunc %[[TENSOR_DIM_0_STRIDE_MASKED]] : i64 to i32
+ // CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(32 : i64) : i64
+ // CHECK: %[[TENSOR_DIM_0_STRIDE_HIGH_64:.+]] = llvm.lshr %[[TENSOR_DIM_0_STRIDE_MASKED]], %[[SHIFT]]
+ // CHECK: %[[SGPR6_0:.+]] = llvm.trunc %[[TENSOR_DIM_0_STRIDE_HIGH_64]] : i64 to i32
+
+ // CHECK-DAG: %[[TENSOR_DIM_1_STRIDE:.+]] = llvm.mlir.constant(64 : i64)
+ // CHECK-DAG: %[[MASK:.+]] = llvm.mlir.constant(281474976710655 : i64) : i64
+ // CHECK: %[[TENSOR_DIM_1_STRIDE_MASKED:.+]] = llvm.and %[[MASK]], %[[TENSOR_DIM_1_STRIDE]]
+ // CHECK-DAG: %[[TENSOR_DIM_1_STRIDE_LOW:.+]] = llvm.trunc %[[TENSOR_DIM_1_STRIDE_MASKED]]
+ // CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(16 : i64) : i64
+ // CHECK: %[[TENSOR_DIM_1_STRIDE_SHIFTED:.+]] = llvm.lshr %[[TENSOR_DIM_1_STRIDE_MASKED]], %[[SHIFT]]
+ // CHECK: %[[SGPR7:.+]] = llvm.trunc %[[TENSOR_DIM_1_STRIDE_SHIFTED]] : i64 to i32
+ // CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(16 : i32) : i32
+ // CHECK: %[[TENSOR_DIM_1_STRIDE_LOW_SHIFTED:.+]] = llvm.shl %[[TENSOR_DIM_1_STRIDE_LOW]], %[[SHIFT]]
+ // CHECK-DAG: %[[SGPR6:.+]] = llvm.or %[[SGPR6_0]], %[[TENSOR_DIM_1_STRIDE_LOW_SHIFTED]]
+
+ // CHECK: %[[V8I32:.+]] = llvm.mlir.poison : vector<8xi32>
+ // CHECK: %[[DGROUP1_0:.+]] = llvm.insertelement %[[SGPR0]], %[[V8I32]][%[[C0]] : i32]
+ // CHECK: %[[DGROUP1_1:.+]] = llvm.insertelement %[[SGPR1]], %[[DGROUP1_0]][%[[C1]] : i32]
+ // CHECK: %[[DGROUP1_2:.+]] = llvm.insertelement %[[SGPR2]], %[[DGROUP1_1]][%[[C2]] : i32]
+ // CHECK: %[[DGROUP1_3:.+]] = llvm.insertelement %[[SGPR3]], %[[DGROUP1_2]][%[[C3]] : i32]
+ // CHECK: %[[DGROUP1_4:.+]] = llvm.insertelement %[[SGPR4]], %[[DGROUP1_3]][%[[C4]] : i32]
+ // CHECK: %[[DGROUP1_5:.+]] = llvm.insertelement %[[SGPR5]], %[[DGROUP1_4]][%[[C5]] : i32]
+ // CHECK: %[[DGROUP1_6:.+]] = llvm.insertelement %[[SGPR6]], %[[DGROUP1_5]][%[[C6]] : i32]
+ // CHECK: %[[DGROUP1:.+]] = llvm.insertelement %[[SGPR7]], %[[DGROUP1_6]][%[[C7]] : i32]
+
+ // CHECK: %[[DGROUPS:.+]] = builtin.unrealized_conversion_cast %[[DGROUP0]], %[[DGROUP1]] : vector<4xi32>, vector<8xi32> to !amdgpu.tdm_descriptor
+ %descriptor = amdgpu.make_dma_descriptor %base globalSize [128, 64] globalStride [64, 1] sharedSize [128, 64] workgroupMask %wg_mask : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+ func.return %descriptor : !amdgpu.tdm_descriptor
+}
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index e32fd41f91dc8..539afc5e820c9 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -696,7 +696,7 @@ func.func @make_dma_base(%idx: index, %mem: memref<8xi32>, %smem: memref<8xi32,
// CHECK-LABEL: func @make_dma_descriptor
// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>, %[[BARRIER:.+]]: memref<8xi32, #gpu.address_space<workgroup>>, %[[IDX:.+]]: index)
-func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>, %barrier: memref<8xi32, #gpu.address_space<workgroup>>, %idx: index) {
+func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>, %wg_mask: i16, %barrier: memref<8xi32, #gpu.address_space<workgroup>>, %idx: index) {
// CHECK: amdgpu.make_dma_descriptor %[[BASE]]
amdgpu.make_dma_descriptor %base
@@ -719,6 +719,18 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>, %barrier: memref<8x
padShared(%idx every %idx)
: !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+ // CHECK: amdgpu.make_dma_descriptor %[[BASE]]
+ amdgpu.make_dma_descriptor %base
+ // CHECK-SAME: globalSize [64, 64]
+ globalSize [64, 64]
+ // CHECK-SAME: globalStride [64, 1]
+ globalStride [64, 1]
+ // CHECK-SAME: sharedSize [64, 64]
+ sharedSize [64, 64]
+ // CHECK-SAME: workgroupMask %wg_mask
+ workgroupMask %wg_mask
+ : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+
// CHECK: amdgpu.make_dma_descriptor %[[BASE]]
amdgpu.make_dma_descriptor %base
// CHECK-SAME: globalSize [64, 64]
@@ -745,4 +757,3 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>, %barrier: memref<8x
func.return
}
-
>From 1f472b98627897cf08c5fda08e2b89c8108ec895 Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim at gymni.ch>
Date: Sun, 7 Dec 2025 18:16:47 +0000
Subject: [PATCH 2/2] fix tests
---
mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir | 16 ++++++++++++----
mlir/test/Dialect/AMDGPU/ops.mlir | 4 ++--
2 files changed, 14 insertions(+), 6 deletions(-)
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
index 9464f9a000053..8a840e2c41f19 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
@@ -251,7 +251,9 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>) -> !amdgpu.tdm_desc
// CHECK-DAG: %[[C6:.+]] = llvm.mlir.constant(6 : i32)
// CHECK-DAG: %[[C7:.+]] = llvm.mlir.constant(7 : i32)
- // CHECK-DAG: %[[SGPR0:.+]] = llvm.mlir.constant(131072 : i32)
+ // CHECK-DAG: %[[DATA_SIZE:.+]] = llvm.mlir.constant(2 : i32)
+ // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
+ // CHECK: %[[SGPR0:.+]] = llvm.shl %[[DATA_SIZE]], %[[C16]]
// CHECK-DAG: %[[TENSOR_DIM_0:.+]] = llvm.mlir.constant(64 : i32)
// CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
@@ -329,7 +331,9 @@ func.func @make_dma_descriptor_atomic_barrier(%base: !amdgpu.tdm_base<i32>, %bar
// CHECK-DAG: %[[C6:.+]] = llvm.mlir.constant(6 : i32)
// CHECK-DAG: %[[C7:.+]] = llvm.mlir.constant(7 : i32)
- // CHECK-DAG: %[[SGPR0_0:.+]] = llvm.mlir.constant(131072 : i32)
+ // CHECK-DAG: %[[DATA_SIZE:.+]] = llvm.mlir.constant(2 : i32)
+ // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
+ // CHECK: %[[SGPR0_0:.+]] = llvm.shl %[[DATA_SIZE]], %[[C16]]
// CHECK-DAG: %[[ATOMIC_BARRIER_ENABLE_OFFSET:.+]] = llvm.mlir.constant(18 : i32)
// CHECK: %[[ATOMIC_BARRIER_ENABLE_FIELD:.+]] = llvm.shl %[[C1]], %[[ATOMIC_BARRIER_ENABLE_OFFSET]]
@@ -364,7 +368,7 @@ func.func @make_dma_descriptor_atomic_barrier(%base: !amdgpu.tdm_base<i32>, %bar
// -----
// CHECK-LABEL: func @make_dma_descriptor_workgroup_mask
-// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>)
+// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>, %[[WG_MASK:.+]]: i16)
func.func @make_dma_descriptor_workgroup_mask(%base: !amdgpu.tdm_base<i32>, %wg_mask: i16) -> !amdgpu.tdm_descriptor {
// CHECK-DAG: %[[DGROUP0:.+]] = builtin.unrealized_conversion_cast %[[BASE]]
@@ -377,7 +381,11 @@ func.func @make_dma_descriptor_workgroup_mask(%base: !amdgpu.tdm_base<i32>, %wg_
// CHECK-DAG: %[[C6:.+]] = llvm.mlir.constant(6 : i32)
// CHECK-DAG: %[[C7:.+]] = llvm.mlir.constant(7 : i32)
- // CHECK-DAG: %[[SGPR0:.+]] = llvm.mlir.constant(131072 : i32)
+ // CHECK-DAG: %[[WG_MASK_EXT:.+]] = llvm.zext %[[WG_MASK]]
+ // CHECK-DAG: %[[DATA_SIZE:.+]] = llvm.mlir.constant(2 : i32)
+ // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
+ // CHECK: %[[DATA_SIZE_SHIFTED:.+]] = llvm.shl %[[DATA_SIZE]], %[[C16]]
+ // CHECK: %[[SGPR0:.+]] = llvm.or %[[WG_MASK_EXT]], %[[DATA_SIZE_SHIFTED]]
// CHECK-DAG: %[[TENSOR_DIM_0:.+]] = llvm.mlir.constant(64 : i32)
// CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index 539afc5e820c9..265939eb99e24 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -695,7 +695,7 @@ func.func @make_dma_base(%idx: index, %mem: memref<8xi32>, %smem: memref<8xi32,
}
// CHECK-LABEL: func @make_dma_descriptor
-// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>, %[[BARRIER:.+]]: memref<8xi32, #gpu.address_space<workgroup>>, %[[IDX:.+]]: index)
+// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>, %[[WG_MASK:.+]]: i16, %[[BARRIER:.+]]: memref<8xi32, #gpu.address_space<workgroup>>, %[[IDX:.+]]: index)
func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>, %wg_mask: i16, %barrier: memref<8xi32, #gpu.address_space<workgroup>>, %idx: index) {
// CHECK: amdgpu.make_dma_descriptor %[[BASE]]
@@ -727,7 +727,7 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>, %wg_mask: i16, %bar
globalStride [64, 1]
// CHECK-SAME: sharedSize [64, 64]
sharedSize [64, 64]
- // CHECK-SAME: workgroupMask %wg_mask
+ // CHECK-SAME: workgroupMask %[[WG_MASK]]
workgroupMask %wg_mask
: !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
More information about the Mlir-commits
mailing list