[Mlir-commits] [mlir] 0487154 - [mlir][amdgpu] Add workgroup_mask to MakeDmaDescriptorOp (#171103)
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Mon Dec 8 07:02:22 PST 2025
Author: Tim Gymnich
Date: 2025-12-08T16:02:18+01:00
New Revision: 048715458855222919cd2632d740e9221ef3cb50
URL: https://github.com/llvm/llvm-project/commit/048715458855222919cd2632d740e9221ef3cb50
DIFF: https://github.com/llvm/llvm-project/commit/048715458855222919cd2632d740e9221ef3cb50.diff
LOG: [mlir][amdgpu] Add workgroup_mask to MakeDmaDescriptorOp (#171103)
- add `workgroup_mask` and `early_timeout`
Added:
Modified:
mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
mlir/test/Dialect/AMDGPU/ops.mlir
Removed:
################################################################################
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index ec9f449c35dc4..5ff052f16d126 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -1293,6 +1293,8 @@ def AMDGPU_MakeDmaDescriptorOp :
DenseI64ArrayAttr: $global_static_strides,
Variadic<Index>: $shared_dynamic_sizes,
DenseI64ArrayAttr: $shared_static_sizes,
+ Optional<I16>: $workgroup_mask,
+ Optional<I1>: $early_timeout,
Optional<Index>: $pad_amount,
Optional<Index>: $pad_interval,
Optional<AnyMemRef>: $atomic_barrier_address,
@@ -1313,6 +1315,12 @@ def AMDGPU_MakeDmaDescriptorOp :
$global_{static/dynamic}_strides determine the strides of the tensor.
$shared_{static/dynamic}_sizes determines the size of the tile.
+ $workgroup_mask broadcasts the load to workgroups inside of a workgroup cluster
+ (0 = do not broadcast result to workgroup, 1 = broadcast result to workgroup). Ignored for stores.
+ An all-zeros mask is interpreted as a non-broadcasted load.
+
+ $early_timeout returns data to requesters as soon as the cache supplies it.
+
Padding can be applied to the LDS address when copying from memory to LDS,
but not when copying from LDS to memory.
The values in the padded target addresses remain the same as before the operation was applied.
@@ -1345,6 +1353,7 @@ def AMDGPU_MakeDmaDescriptorOp :
`globalStride` custom<DynamicIndexList>($global_dynamic_strides, $global_static_strides)
`sharedSize` custom<DynamicIndexList>($shared_dynamic_sizes, $shared_static_sizes)
( `padShared` `(` $pad_amount^ `every` $pad_interval `)` )?
+ ( `workgroupMask` $workgroup_mask^ ( `earlyTimeout` $early_timeout^)?)?
( `atomicBarrier` `(` $atomic_barrier_address^ `[` $atomic_barrier_indices `]`
`:` type($atomic_barrier_address) `)`)?
( `iterate` $global_increment^ `,` $lds_increment `,` $iteration_count )?
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 2fcc24ca1c1c3..f3b0da0120998 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -2368,6 +2368,18 @@ struct AMDGPUMakeDmaDescriptorLowering
return LLVM::OrOp::create(rewriter, loc, accumulator, value);
}
+ /// Packs the optional `workgroup_mask` operand into `sgpr0`.
+ ///
+ /// Returns `sgpr0` unchanged when the op has no workgroup mask. Otherwise
+ /// the mask is zero-extended to i32 and combined into `sgpr0` at bit
+ /// offset 0 via `setValueAtOffset`.
+ Value setWorkgroupMask(MakeDmaDescriptorOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter, Location loc,
+ Value sgpr0) const {
+ // Read the operand through the adaptor so the lowering consumes the value
+ // as remapped by the conversion driver rather than the original operand.
+ Value mask = adaptor.getWorkgroupMask();
+ if (!mask)
+ return sgpr0;
+
+ Type i32 = rewriter.getI32Type();
+ Value extendedMask = LLVM::ZExtOp::create(rewriter, loc, i32, mask);
+ return setValueAtOffset(rewriter, loc, sgpr0, extendedMask, 0);
+ }
+
Value setDataSize(MakeDmaDescriptorOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter, Location loc,
Value sgpr0, ArrayRef<Value> consts) const {
@@ -2377,7 +2389,8 @@ struct AMDGPUMakeDmaDescriptorLowering
llvm::is_contained<unsigned>({8, 16, 32, 64}, elementTypeWidthInBits) &&
"expected type width to be 8, 16, 32, or 64.");
int64_t dataSize = llvm::Log2_32(elementTypeWidthInBits / 8);
- return createI32Constant(rewriter, loc, dataSize << 16);
+ Value size = createI32Constant(rewriter, loc, dataSize);
+ return setValueAtOffset(rewriter, loc, sgpr0, size, 16);
}
Value setAtomicBarrier(MakeDmaDescriptorOp op, OpAdaptor adaptor,
@@ -2411,6 +2424,15 @@ struct AMDGPUMakeDmaDescriptorLowering
return setValueAtOffset(rewriter, loc, sgpr0, consts[1], 20);
}
+ /// Sets the early-timeout bit (bit 21) in `sgpr0` when the op carries an
+ /// `early_timeout` operand; otherwise returns `sgpr0` unchanged.
+ Value setEarlyTimeout(MakeDmaDescriptorOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter, Location loc,
+ Value sgpr0, ArrayRef<Value> consts) const {
+ // Key off `early_timeout` itself. Checking `workgroup_mask` here would set
+ // the bit for every descriptor that has a mask, even without `earlyTimeout`.
+ if (!adaptor.getEarlyTimeout())
+ return sgpr0;
+
+ // NOTE(review): only the operand's presence is consulted; its runtime i1
+ // value is not folded into the descriptor. Confirm this is intended.
+ return setValueAtOffset(rewriter, loc, sgpr0, consts[1], 21);
+ }
+
Value setPadInterval(MakeDmaDescriptorOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter, Location loc,
Value sgpr0, ArrayRef<Value> consts) const {
@@ -2615,10 +2637,12 @@ struct AMDGPUMakeDmaDescriptorLowering
sgprs[i] = consts[0];
}
+ sgprs[0] = setWorkgroupMask(op, adaptor, rewriter, loc, sgprs[0]);
sgprs[0] = setDataSize(op, adaptor, rewriter, loc, sgprs[0], consts);
sgprs[0] = setAtomicBarrier(op, adaptor, rewriter, loc, sgprs[0], consts);
sgprs[0] = setIterateEnable(op, adaptor, rewriter, loc, sgprs[0], consts);
sgprs[0] = setPadEnable(op, adaptor, rewriter, loc, sgprs[0], consts);
+ sgprs[0] = setEarlyTimeout(op, adaptor, rewriter, loc, sgprs[0], consts);
sgprs[0] = setPadInterval(op, adaptor, rewriter, loc, sgprs[0], consts);
sgprs[0] = setPadAmount(op, adaptor, rewriter, loc, sgprs[0], consts);
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
index cf74f671db216..7b1cd7fe96fe1 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
@@ -768,6 +768,9 @@ LogicalResult MakeDmaDescriptorOp::verify() {
return emitOpError("atomic barrier address must be in LDS.");
}
+ if (getEarlyTimeout() && !getWorkgroupMask())
+ return emitOpError(
+ "early timeout does not apply when workgroup_mask is not set.");
return success();
}
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
index 5a65689ec1f93..a94e17ab5b9a5 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
@@ -251,7 +251,9 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>) -> !amdgpu.tdm_desc
// CHECK-DAG: %[[C6:.+]] = llvm.mlir.constant(6 : i32)
// CHECK-DAG: %[[C7:.+]] = llvm.mlir.constant(7 : i32)
- // CHECK-DAG: %[[SGPR0:.+]] = llvm.mlir.constant(131072 : i32)
+ // CHECK-DAG: %[[DATA_SIZE:.+]] = llvm.mlir.constant(2 : i32)
+ // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
+ // CHECK: %[[SGPR0:.+]] = llvm.shl %[[DATA_SIZE]], %[[C16]]
// CHECK-DAG: %[[TENSOR_DIM_0:.+]] = llvm.mlir.constant(64 : i32)
// CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
@@ -329,7 +331,9 @@ func.func @make_dma_descriptor_atomic_barrier(%base: !amdgpu.tdm_base<i32>, %bar
// CHECK-DAG: %[[C6:.+]] = llvm.mlir.constant(6 : i32)
// CHECK-DAG: %[[C7:.+]] = llvm.mlir.constant(7 : i32)
- // CHECK-DAG: %[[SGPR0_0:.+]] = llvm.mlir.constant(131072 : i32)
+ // CHECK-DAG: %[[DATA_SIZE:.+]] = llvm.mlir.constant(2 : i32)
+ // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
+ // CHECK: %[[SGPR0_0:.+]] = llvm.shl %[[DATA_SIZE]], %[[C16]]
// CHECK-DAG: %[[ATOMIC_BARRIER_ENABLE_OFFSET:.+]] = llvm.mlir.constant(18 : i32)
// CHECK: %[[ATOMIC_BARRIER_ENABLE_FIELD:.+]] = llvm.shl %[[C1]], %[[ATOMIC_BARRIER_ENABLE_OFFSET]]
@@ -360,3 +364,82 @@ func.func @make_dma_descriptor_atomic_barrier(%base: !amdgpu.tdm_base<i32>, %bar
: !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
func.return %descriptor : !amdgpu.tdm_descriptor
}
+
+// -----
+
+// Verifies the lowering of `workgroupMask` combined with `earlyTimeout`: the
+// i16 mask is zero-extended and OR'ed with the data-size field (shifted left
+// by 16), then a constant 1 shifted left by 21 is OR'ed in for the timeout.
+// CHECK-LABEL: func @make_dma_descriptor_workgroup_mask
+// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>, %[[WG_MASK:.+]]: i16, %[[TIMEOUT:.+]]: i1)
+func.func @make_dma_descriptor_workgroup_mask(%base: !amdgpu.tdm_base<i32>, %wg_mask: i16, %timeout: i1) -> !amdgpu.tdm_descriptor {
+ // CHECK-DAG: %[[DGROUP0:.+]] = builtin.unrealized_conversion_cast %[[BASE]]
+
+ // CHECK-DAG: %[[C0:.+]] = llvm.mlir.constant(0 : i32)
+ // CHECK-DAG: %[[C1:.+]] = llvm.mlir.constant(1 : i32)
+ // CHECK-DAG: %[[C2:.+]] = llvm.mlir.constant(2 : i32)
+ // CHECK-DAG: %[[C3:.+]] = llvm.mlir.constant(3 : i32)
+ // CHECK-DAG: %[[C4:.+]] = llvm.mlir.constant(4 : i32)
+ // CHECK-DAG: %[[C5:.+]] = llvm.mlir.constant(5 : i32)
+ // CHECK-DAG: %[[C6:.+]] = llvm.mlir.constant(6 : i32)
+ // CHECK-DAG: %[[C7:.+]] = llvm.mlir.constant(7 : i32)
+
+ // CHECK-DAG: %[[WG_MASK_EXT:.+]] = llvm.zext %[[WG_MASK]]
+ // CHECK-DAG: %[[DATA_SIZE:.+]] = llvm.mlir.constant(2 : i32)
+ // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
+ // CHECK: %[[DATA_SIZE_SHIFTED:.+]] = llvm.shl %[[DATA_SIZE]], %[[C16]]
+ // CHECK: %[[SGPR0_BASE:.+]] = llvm.or %[[WG_MASK_EXT]], %[[DATA_SIZE_SHIFTED]]
+ // CHECK-DAG: %[[C21:.+]] = llvm.mlir.constant(21 : i32)
+ // CHECK: %[[TIMEOUT_SHIFTED:.+]] = llvm.shl %[[C1]], %[[C21]]
+ // CHECK: %[[SGPR0:.+]] = llvm.or %[[SGPR0_BASE]], %[[TIMEOUT_SHIFTED]]
+
+ // CHECK-DAG: %[[TENSOR_DIM_0:.+]] = llvm.mlir.constant(64 : i32)
+ // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
+ // CHECK: %[[SGPR2_0:.+]] = llvm.lshr %[[TENSOR_DIM_0]], %[[C16]]
+ // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
+ // CHECK: %[[SGPR1:.+]] = llvm.shl %[[TENSOR_DIM_0]], %[[C16]]
+
+ // CHECK-DAG: %[[TENSOR_DIM_1:.+]] = llvm.mlir.constant(128 : i32)
+ // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
+ // CHECK: %[[SGPR3_0:.+]] = llvm.lshr %[[TENSOR_DIM_1]], %[[C16]]
+ // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
+ // CHECK: %[[TENSOR_DIM_1_SHIFTED:.+]] = llvm.shl %[[TENSOR_DIM_1]], %[[C16]]
+ // CHECK: %[[SGPR2:.+]] = llvm.or %[[SGPR2_0]], %[[TENSOR_DIM_1_SHIFTED]]
+
+ // CHECK-DAG: %[[TILE_DIM_0:.+]] = llvm.mlir.constant(64 : i32)
+ // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
+ // CHECK: %[[TILE_DIM_0_SHIFTED:.+]] = llvm.shl %[[TILE_DIM_0:.+]], %[[C16]]
+ // CHECK: %[[SGPR3:.+]] = llvm.or %[[SGPR3_0]], %[[TILE_DIM_0_SHIFTED]]
+
+ // CHECK-DAG: %[[SGPR4:.+]] = llvm.mlir.constant(128 : i32)
+
+ // CHECK-DAG: %[[TENSOR_DIM_0_STRIDE:.+]] = llvm.mlir.constant(1 : i64) : i64
+ // CHECK-DAG: %[[MASK:.+]] = llvm.mlir.constant(281474976710655 : i64) : i64
+ // CHECK: %[[TENSOR_DIM_0_STRIDE_MASKED:.+]] = llvm.and %[[MASK]], %[[TENSOR_DIM_0_STRIDE]]
+ // CHECK-DAG: %[[SGPR5:.+]] = llvm.trunc %[[TENSOR_DIM_0_STRIDE_MASKED]] : i64 to i32
+ // CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(32 : i64) : i64
+ // CHECK: %[[TENSOR_DIM_0_STRIDE_HIGH_64:.+]] = llvm.lshr %[[TENSOR_DIM_0_STRIDE_MASKED]], %[[SHIFT]]
+ // CHECK: %[[SGPR6_0:.+]] = llvm.trunc %[[TENSOR_DIM_0_STRIDE_HIGH_64]] : i64 to i32
+
+ // CHECK-DAG: %[[TENSOR_DIM_1_STRIDE:.+]] = llvm.mlir.constant(64 : i64)
+ // CHECK-DAG: %[[MASK:.+]] = llvm.mlir.constant(281474976710655 : i64) : i64
+ // CHECK: %[[TENSOR_DIM_1_STRIDE_MASKED:.+]] = llvm.and %[[MASK]], %[[TENSOR_DIM_1_STRIDE]]
+ // CHECK-DAG: %[[TENSOR_DIM_1_STRIDE_LOW:.+]] = llvm.trunc %[[TENSOR_DIM_1_STRIDE_MASKED]]
+ // CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(16 : i64) : i64
+ // CHECK: %[[TENSOR_DIM_1_STRIDE_SHIFTED:.+]] = llvm.lshr %[[TENSOR_DIM_1_STRIDE_MASKED]], %[[SHIFT]]
+ // CHECK: %[[SGPR7:.+]] = llvm.trunc %[[TENSOR_DIM_1_STRIDE_SHIFTED]] : i64 to i32
+ // CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(16 : i32) : i32
+ // CHECK: %[[TENSOR_DIM_1_STRIDE_LOW_SHIFTED:.+]] = llvm.shl %[[TENSOR_DIM_1_STRIDE_LOW]], %[[SHIFT]]
+ // CHECK-DAG: %[[SGPR6:.+]] = llvm.or %[[SGPR6_0]], %[[TENSOR_DIM_1_STRIDE_LOW_SHIFTED]]
+
+ // CHECK: %[[V8I32:.+]] = llvm.mlir.poison : vector<8xi32>
+ // CHECK: %[[DGROUP1_0:.+]] = llvm.insertelement %[[SGPR0]], %[[V8I32]][%[[C0]] : i32]
+ // CHECK: %[[DGROUP1_1:.+]] = llvm.insertelement %[[SGPR1]], %[[DGROUP1_0]][%[[C1]] : i32]
+ // CHECK: %[[DGROUP1_2:.+]] = llvm.insertelement %[[SGPR2]], %[[DGROUP1_1]][%[[C2]] : i32]
+ // CHECK: %[[DGROUP1_3:.+]] = llvm.insertelement %[[SGPR3]], %[[DGROUP1_2]][%[[C3]] : i32]
+ // CHECK: %[[DGROUP1_4:.+]] = llvm.insertelement %[[SGPR4]], %[[DGROUP1_3]][%[[C4]] : i32]
+ // CHECK: %[[DGROUP1_5:.+]] = llvm.insertelement %[[SGPR5]], %[[DGROUP1_4]][%[[C5]] : i32]
+ // CHECK: %[[DGROUP1_6:.+]] = llvm.insertelement %[[SGPR6]], %[[DGROUP1_5]][%[[C6]] : i32]
+ // CHECK: %[[DGROUP1:.+]] = llvm.insertelement %[[SGPR7]], %[[DGROUP1_6]][%[[C7]] : i32]
+
+ // CHECK: %[[DGROUPS:.+]] = builtin.unrealized_conversion_cast %[[DGROUP0]], %[[DGROUP1]] : vector<4xi32>, vector<8xi32> to !amdgpu.tdm_descriptor
+ %descriptor = amdgpu.make_dma_descriptor %base globalSize [128, 64] globalStride [64, 1] sharedSize [128, 64] workgroupMask %wg_mask earlyTimeout %timeout : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+ func.return %descriptor : !amdgpu.tdm_descriptor
+}
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index e32fd41f91dc8..cf3f7a9cb08a2 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -695,8 +695,8 @@ func.func @make_dma_base(%idx: index, %mem: memref<8xi32>, %smem: memref<8xi32,
}
// CHECK-LABEL: func @make_dma_descriptor
-// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>, %[[BARRIER:.+]]: memref<8xi32, #gpu.address_space<workgroup>>, %[[IDX:.+]]: index)
-func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>, %barrier: memref<8xi32, #gpu.address_space<workgroup>>, %idx: index) {
+// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>, %[[WG_MASK:.+]]: i16, %[[TIMEOUT:.+]]: i1, %[[BARRIER:.+]]: memref<8xi32, #gpu.address_space<workgroup>>, %[[IDX:.+]]: index)
+func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>, %wg_mask: i16, %timeout: i1, %barrier: memref<8xi32, #gpu.address_space<workgroup>>, %idx: index) {
// CHECK: amdgpu.make_dma_descriptor %[[BASE]]
amdgpu.make_dma_descriptor %base
@@ -719,6 +719,32 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>, %barrier: memref<8x
padShared(%idx every %idx)
: !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+ // CHECK: amdgpu.make_dma_descriptor %[[BASE]]
+ amdgpu.make_dma_descriptor %base
+ // CHECK-SAME: globalSize [64, 64]
+ globalSize [64, 64]
+ // CHECK-SAME: globalStride [64, 1]
+ globalStride [64, 1]
+ // CHECK-SAME: sharedSize [64, 64]
+ sharedSize [64, 64]
+ // CHECK-SAME: workgroupMask %[[WG_MASK]]
+ workgroupMask %wg_mask
+ : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+
+ // CHECK: amdgpu.make_dma_descriptor %[[BASE]]
+ amdgpu.make_dma_descriptor %base
+ // CHECK-SAME: globalSize [64, 64]
+ globalSize [64, 64]
+ // CHECK-SAME: globalStride [64, 1]
+ globalStride [64, 1]
+ // CHECK-SAME: sharedSize [64, 64]
+ sharedSize [64, 64]
+ // CHECK-SAME: workgroupMask %[[WG_MASK]]
+ workgroupMask %wg_mask
+ // CHECK-SAME: earlyTimeout %[[TIMEOUT]]
+ earlyTimeout %timeout
+ : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+
// CHECK: amdgpu.make_dma_descriptor %[[BASE]]
amdgpu.make_dma_descriptor %base
// CHECK-SAME: globalSize [64, 64]
@@ -745,4 +771,3 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>, %barrier: memref<8x
func.return
}
-
More information about the Mlir-commits
mailing list