[Mlir-commits] [mlir] [mlir][amdgpu] Continue lowering make_tdm_descriptor. (PR #171498)
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Wed Dec 10 09:32:06 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-mlir-amdgpu
Author: Erick Ochoa Lopez (amd-eochoalo)
<details>
<summary>Changes</summary>
* changes workgroup mask's type from i16 to vector<16xi1>
* changes pad_amount, pad_interval, and lds_increment from Index to I32
* adds lit tests for padEnable, iteration and dynamic cases
* adds TODO for a future instrumentation pass to validate inputs
* adds descriptor groups 2 and 3
---
Patch is 59.59 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/171498.diff
5 Files Affected:
- (modified) mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td (+12-7)
- (modified) mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp (+306-56)
- (modified) mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir (+310-27)
- (modified) mlir/test/Dialect/AMDGPU/amdgpu-make-dma-descriptor-fold.mlir (+3-3)
- (modified) mlir/test/Dialect/AMDGPU/ops.mlir (+6-6)
``````````diff
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 56160d3e8fe85..6fbc90ded5824 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -110,9 +110,14 @@ def AMDGPU_TDMDescriptorType : AMDGPU_Type<"TDMDescriptor", "tdm_descriptor"> {
This type is opaque and corresponds to the two or four descriptor groups
used in tensor_load_to_lds or tensor_store_from_lds.
}];
-
}
+class AMDGPU_ConcreteVector<Type elem, int length> :
+ FixedVectorOfLengthAndType<[length], [elem]>,
+ BuildableType<
+ "::mlir::VectorType::get({" # length # "} ,"
+ # elem.builderCall # ")">;
+
//===----------------------------------------------------------------------===//
// AMDGPU Op definitions
//===----------------------------------------------------------------------===//
@@ -1296,14 +1301,14 @@ def AMDGPU_MakeDmaDescriptorOp :
DenseI64ArrayAttr: $global_static_strides,
Variadic<Index>: $shared_dynamic_sizes,
DenseI64ArrayAttr: $shared_static_sizes,
- Optional<I16>: $workgroup_mask,
+ Optional<AMDGPU_ConcreteVector<I1, 16>>: $workgroup_mask,
Optional<I1>: $early_timeout,
- Optional<Index>: $pad_amount,
- Optional<Index>: $pad_interval,
+ Optional<I32>: $pad_amount,
+ Optional<I32>: $pad_interval,
Optional<AnyMemRef>: $atomic_barrier_address,
Variadic<Index>: $atomic_barrier_indices,
Optional<Index>: $global_increment,
- Optional<Index>: $lds_increment,
+ Optional<I32>: $lds_increment,
Optional<Index>: $iteration_count)>,
Results<(outs AMDGPU_TDMDescriptorType: $desc)> {
@@ -1335,7 +1340,7 @@ def AMDGPU_MakeDmaDescriptorOp :
2D and 3D tensors may be iterated over by setting $global_increment, $lds_increment, and $iteration_count.
$global_increment determines how much to increment the starting global memory address per iteration in units of the $base's element type.
$lds_increment determines how much to increment the starting LDS address per iteration in units of the $base's element type.
- $iterate_count determines how many times to iterate.
+ $iteration_count determines how many times to iterate; it must be a value in the inclusive interval [1, 256].
```mlir
// Example of moving a two-dimensional tensor to LDS.
@@ -1345,7 +1350,7 @@ def AMDGPU_MakeDmaDescriptorOp :
// Example of moving a two dimension tensor to LDS where padding is applied after every integer.
%base = amdgpu.make_dma_base %global[0, 0], %lds[0, 0] : memref<32x32xi32>, memref<64x64xi32, #gpu.address_space<workgroup>> -> !amdgpu.tdm_base<i32>
- %descriptor = amdgpu.make_dma_descriptor %base globalSize [32, 32] globalStride [32, 1] sharedSize [64, 64] padding(%pad_amount pad_every %pad_interval) : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+ %descriptor = amdgpu.make_dma_descriptor %base globalSize [32, 32] globalStride [32, 1] sharedSize [64, 64] padShared(%pad_amount every %pad_interval) : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
amdgpu.tensor_load_to_lds %descriptor : !amdgpu.tdm_descriptor
```
}];
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 7584b17075225..592731778a8e4 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -2381,6 +2381,8 @@ struct AMDGPUMakeDmaDescriptorLowering
if (!mask)
return sgpr0;
+ Type i16 = rewriter.getI16Type();
+ mask = LLVM::BitcastOp::create(rewriter, loc, i16, mask);
Type i32 = rewriter.getI32Type();
Value extendedMask = LLVM::ZExtOp::create(rewriter, loc, i32, mask);
return setValueAtOffset(rewriter, loc, sgpr0, extendedMask, 0);
@@ -2389,21 +2391,19 @@ struct AMDGPUMakeDmaDescriptorLowering
Value setDataSize(MakeDmaDescriptorOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter, Location loc,
Value sgpr0, ArrayRef<Value> consts) const {
- // Compute data_size.
unsigned elementTypeWidthInBits = op.getElementTypeWidth();
assert(
llvm::is_contained<unsigned>({8, 16, 32, 64}, elementTypeWidthInBits) &&
"expected type width to be 8, 16, 32, or 64.");
- int64_t dataSize = llvm::Log2_32(elementTypeWidthInBits / 8);
- Value size = createI32Constant(rewriter, loc, dataSize);
+ int64_t idx = llvm::Log2_32(elementTypeWidthInBits / 8);
+ Value size = consts[idx];
return setValueAtOffset(rewriter, loc, sgpr0, size, 16);
}
Value setAtomicBarrier(MakeDmaDescriptorOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter, Location loc,
Value sgpr0, ArrayRef<Value> consts) const {
- bool atomic_barrier_enable = adaptor.getAtomicBarrierAddress() != nullptr;
- if (!atomic_barrier_enable)
+ if (!adaptor.getAtomicBarrierAddress())
return sgpr0;
return setValueAtOffset(rewriter, loc, sgpr0, consts[1], 18);
@@ -2412,19 +2412,16 @@ struct AMDGPUMakeDmaDescriptorLowering
Value setIterateEnable(MakeDmaDescriptorOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter, Location loc,
Value sgpr0, ArrayRef<Value> consts) const {
- bool iterate_enable = adaptor.getGlobalIncrement() != nullptr;
- if (!iterate_enable)
+ if (!adaptor.getGlobalIncrement())
return sgpr0;
- // TODO: In future PR, add other required fields for iteration.
return setValueAtOffset(rewriter, loc, sgpr0, consts[1], 19);
}
Value setPadEnable(MakeDmaDescriptorOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter, Location loc,
Value sgpr0, ArrayRef<Value> consts) const {
- bool pad_enable = op.getPadAmount() != nullptr;
- if (!pad_enable)
+ if (!op.getPadAmount())
return sgpr0;
return setValueAtOffset(rewriter, loc, sgpr0, consts[1], 20);
@@ -2442,13 +2439,16 @@ struct AMDGPUMakeDmaDescriptorLowering
Value setPadInterval(MakeDmaDescriptorOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter, Location loc,
Value sgpr0, ArrayRef<Value> consts) const {
- bool pad_enable = op.getPadAmount() != nullptr;
- if (!pad_enable)
+ if (!op.getPadAmount())
return sgpr0;
+ // pre-condition: padInterval can be a power of two between 2 and 256.
+ // TODO: Validation if the value breaks the pre-condition.
+ // If the pre-condition fails, there is a possibility of
+ // affecting the higher bits. In a following PR add a flag
+ // that instruments conditions that need to be checked at runtime.
IntegerType i32 = rewriter.getI32Type();
Value padInterval = adaptor.getPadInterval();
- // pre-condition: padInterval can be a power of two between 2 and 256.
padInterval = LLVM::CountTrailingZerosOp::create(rewriter, loc, i32,
padInterval, false);
padInterval = LLVM::SubOp::create(rewriter, loc, padInterval, consts[1]);
@@ -2459,12 +2459,15 @@ struct AMDGPUMakeDmaDescriptorLowering
Value setPadAmount(MakeDmaDescriptorOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter, Location loc,
Value sgpr0, ArrayRef<Value> consts) const {
- bool pad_enable = op.getPadAmount() != nullptr;
- if (!pad_enable)
+ if (!op.getPadAmount())
return sgpr0;
- Value padAmount = adaptor.getPadAmount();
// pre-condition: padAmount is a value between 1-128.
+ // TODO: Validation if the value breaks the pre-condition.
+ // If the pre-condition fails, there is a possibility of
+ // affecting the higher bits. In a following PR add a flag
+ // that instruments conditions that need to be checked at runtime.
+ Value padAmount = adaptor.getPadAmount();
padAmount = LLVM::SubOp::create(rewriter, loc, padAmount, consts[1]);
// post-condition: padAmount is a value between 0-127.
return setValueAtOffset(rewriter, loc, sgpr0, padAmount, 25);
@@ -2474,8 +2477,7 @@ struct AMDGPUMakeDmaDescriptorLowering
ConversionPatternRewriter &rewriter,
Location loc, Value sgpr1,
ArrayRef<Value> consts) const {
- bool atomic_barrier_enable = adaptor.getAtomicBarrierAddress() != nullptr;
- if (!atomic_barrier_enable)
+ if (!adaptor.getAtomicBarrierAddress())
return sgpr1;
Value atomicBarrierAddress = adaptor.getAtomicBarrierAddress();
@@ -2488,6 +2490,9 @@ struct AMDGPUMakeDmaDescriptorLowering
IntegerType i32 = rewriter.getI32Type();
// pre-condition: atomicBarrierAddress is aligned to 8 bytes which implies
// that the 3 LSBs are zero.
+ // TODO: Validation if the value breaks the pre-condition.
+ // In a following PR add a flag that instruments conditions that need to be
+ // checked at runtime.
atomicBarrierAddress =
LLVM::PtrToIntOp::create(rewriter, loc, i32, atomicBarrierAddress);
atomicBarrierAddress =
@@ -2498,65 +2503,91 @@ struct AMDGPUMakeDmaDescriptorLowering
return setValueAtOffset(rewriter, loc, sgpr1, atomicBarrierAddress, 32);
}
- std::pair<Value, Value> setTensorDim0(MakeDmaDescriptorOp op,
+ std::pair<Value, Value> setTensorDimX(MakeDmaDescriptorOp op,
OpAdaptor adaptor,
ConversionPatternRewriter &rewriter,
Location loc, Value sgpr1, Value sgpr2,
- ArrayRef<Value> consts) const {
- SmallVector<OpFoldResult> mixedGlobalSizes = op.getMixedGlobalSizes();
- OpFoldResult tensorDim0OpFoldResult = mixedGlobalSizes.back();
- Value tensorDim0;
- if (auto attr = dyn_cast<Attribute>(tensorDim0OpFoldResult))
- tensorDim0 =
+ ArrayRef<Value> consts, uint64_t dimX,
+ uint32_t offset) const {
+ ArrayRef<int64_t> globalStaticSizes = adaptor.getGlobalStaticSizes();
+ ValueRange globalDynamicSizes = adaptor.getGlobalDynamicSizes();
+ SmallVector<OpFoldResult> mixedGlobalSizes =
+ getMixedValues(globalStaticSizes, globalDynamicSizes, rewriter);
+ if (mixedGlobalSizes.size() <= dimX)
+ return {sgpr1, sgpr2};
+
+ OpFoldResult tensorDimXOpFoldResult = *(mixedGlobalSizes.rbegin() + dimX);
+ // pre-condition: tensorDimX is less than 2^48-1
+ // TODO: Validation if the value breaks the pre-condition.
+ // If the pre-condition fails, there is a possibility of
+ // affecting the higher bits. In a following PR add a flag
+ // that instruments conditions that need to be checked at runtime.
+ // This could also be fixed by saying that mixedGlobalSizes is a
+ // DynamicI48List.
+ Value tensorDimX;
+ if (auto attr = dyn_cast<Attribute>(tensorDimXOpFoldResult))
+ tensorDimX =
createI32Constant(rewriter, loc, cast<IntegerAttr>(attr).getInt());
- else
- tensorDim0 = cast<Value>(tensorDim0OpFoldResult);
+ else {
+ IntegerType i32 = rewriter.getI32Type();
+ tensorDimX = cast<Value>(tensorDimXOpFoldResult);
+ tensorDimX = LLVM::TruncOp::create(rewriter, loc, i32, tensorDimX);
+ }
+
+ sgpr1 = setValueAtOffset(rewriter, loc, sgpr1, tensorDimX, offset);
Value c16 = createI32Constant(rewriter, loc, 16);
- Value tensorDim0High = LLVM::LShrOp::create(rewriter, loc, tensorDim0, c16);
- sgpr1 = setValueAtOffset(rewriter, loc, sgpr1, tensorDim0, 48);
- sgpr2 = setValueAtOffset(rewriter, loc, sgpr2, tensorDim0High, 48 + 16);
+ Value tensorDimXHigh = LLVM::LShrOp::create(rewriter, loc, tensorDimX, c16);
+ sgpr2 = setValueAtOffset(rewriter, loc, sgpr2, tensorDimXHigh, offset + 16);
return {sgpr1, sgpr2};
}
+ std::pair<Value, Value> setTensorDim0(MakeDmaDescriptorOp op,
+ OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter,
+ Location loc, Value sgpr1, Value sgpr2,
+ ArrayRef<Value> consts) const {
+ return setTensorDimX(op, adaptor, rewriter, loc, sgpr1, sgpr2, consts, 0,
+ 48);
+ }
+
std::pair<Value, Value> setTensorDim1(MakeDmaDescriptorOp op,
OpAdaptor adaptor,
ConversionPatternRewriter &rewriter,
Location loc, Value sgpr2, Value sgpr3,
ArrayRef<Value> consts) const {
- // TODO: Generalize to setTensorDimX.
- SmallVector<OpFoldResult> mixedGlobalSizes = op.getMixedGlobalSizes();
- OpFoldResult tensorDim1OpFoldResult = *(mixedGlobalSizes.rbegin() + 1);
- Value tensorDim1;
- if (auto attr = dyn_cast<Attribute>(tensorDim1OpFoldResult))
- tensorDim1 =
- createI32Constant(rewriter, loc, cast<IntegerAttr>(attr).getInt());
- else
- tensorDim1 = cast<Value>(tensorDim1OpFoldResult);
-
- Value c16 = createI32Constant(rewriter, loc, 16);
- Value tensorDim1High = LLVM::LShrOp::create(rewriter, loc, tensorDim1, c16);
- sgpr2 = setValueAtOffset(rewriter, loc, sgpr2, tensorDim1, 80);
- sgpr3 = setValueAtOffset(rewriter, loc, sgpr3, tensorDim1High, 80 + 16);
- return {sgpr2, sgpr3};
+ return setTensorDimX(op, adaptor, rewriter, loc, sgpr2, sgpr3, consts, 1,
+ 80);
}
Value setTileDimX(MakeDmaDescriptorOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter, Location loc,
Value sgpr, ArrayRef<Value> consts, size_t dimX,
int64_t offset) const {
- SmallVector<OpFoldResult> mixedSharedSizes = op.getMixedSharedSizes();
-
+ ArrayRef<int64_t> sharedStaticSizes = adaptor.getSharedStaticSizes();
+ ValueRange sharedDynamicSizes = adaptor.getSharedDynamicSizes();
+ SmallVector<OpFoldResult> mixedSharedSizes =
+ getMixedValues(sharedStaticSizes, sharedDynamicSizes, rewriter);
if (mixedSharedSizes.size() <= dimX)
return sgpr;
OpFoldResult tileDimXOpFoldResult = *(mixedSharedSizes.rbegin() + dimX);
+ // pre-condition: tileDimX is less than 2^16-1
+ // TODO: Validation if the value breaks the pre-condition.
+ // If the pre-condition fails, there is a possibility of
+ // affecting the higher bits. In a following PR add a flag
+ // that instruments conditions that need to be checked at runtime.
+ // This could also be fixed by saying that mixedSharedSizes is a
+ // DynamicI16List.
Value tileDimX;
if (auto attr = dyn_cast<Attribute>(tileDimXOpFoldResult))
tileDimX =
createI32Constant(rewriter, loc, cast<IntegerAttr>(attr).getInt());
- else
+ else {
+ IntegerType i32 = rewriter.getI32Type();
tileDimX = cast<Value>(tileDimXOpFoldResult);
+ tileDimX = LLVM::TruncOp::create(rewriter, loc, i32, tileDimX);
+ }
return setValueAtOffset(rewriter, loc, sgpr, tileDimX, offset);
}
@@ -2584,13 +2615,20 @@ struct AMDGPUMakeDmaDescriptorLowering
ConversionPatternRewriter &rewriter, Location loc,
Value sgprY, Value sgprZ, ArrayRef<Value> consts,
size_t dimX, int64_t offset) const {
- SmallVector<OpFoldResult> mixedGlobalStrides = op.getMixedGlobalStrides();
+ ArrayRef<int64_t> globalStaticStrides = adaptor.getGlobalStaticStrides();
+ ValueRange globalDynamicStrides = adaptor.getGlobalDynamicStrides();
+ SmallVector<OpFoldResult> mixedGlobalStrides =
+ getMixedValues(globalStaticStrides, globalDynamicStrides, rewriter);
if (mixedGlobalStrides.size() <= dimX)
return {sgprY, sgprZ};
OpFoldResult tensorDimXStrideOpFoldResult =
*(mixedGlobalStrides.rbegin() + dimX);
+ // pre-condition: tensorDimXStride is less than 2^48-1
+ // TODO: Validation if the value breaks the pre-condition.
+ // In a following PR add a flag that instruments conditions that need to be
+ // checked at runtime.
Value tensorDimXStride;
if (auto attr = dyn_cast<Attribute>(tensorDimXStrideOpFoldResult))
tensorDimXStride =
@@ -2605,6 +2643,7 @@ struct AMDGPUMakeDmaDescriptorLowering
IntegerType i32 = rewriter.getI32Type();
Value tensorDimXStrideLow =
LLVM::TruncOp::create(rewriter, loc, i32, tensorDimXStride);
+ sgprY = setValueAtOffset(rewriter, loc, sgprY, tensorDimXStrideLow, offset);
int64_t shift = (offset % 32) == 0 ? 32 : offset % 32;
Value shiftVal = createI64Constant(rewriter, loc, shift);
@@ -2612,8 +2651,6 @@ struct AMDGPUMakeDmaDescriptorLowering
LLVM::LShrOp::create(rewriter, loc, tensorDimXStride, shiftVal);
tensorDimXStrideHigh =
LLVM::TruncOp::create(rewriter, loc, i32, tensorDimXStrideHigh);
-
- sgprY = setValueAtOffset(rewriter, loc, sgprY, tensorDimXStrideLow, offset);
sgprZ = setValueAtOffset(rewriter, loc, sgprZ, tensorDimXStrideHigh,
offset + shift);
return {sgprY, sgprZ};
@@ -2680,6 +2717,221 @@ struct AMDGPUMakeDmaDescriptorLowering
return dgroup1;
}
+ Value setTensorDimX(MakeDmaDescriptorOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter, Location loc,
+ Value sgpr0, ArrayRef<Value> consts, int64_t dimX,
+ int64_t offset) const {
+ ArrayRef<int64_t> globalStaticSizes = adaptor.getGlobalStaticSizes();
+ ValueRange globalDynamicSizes = adaptor.getGlobalDynamicSizes();
+ SmallVector<OpFoldResult> mixedGlobalSizes =
+ getMixedValues(globalStaticSizes, globalDynamicSizes, rewriter);
+ if (mixedGlobalSizes.size() <= static_cast<unsigned long>(dimX))
+ return sgpr0;
+
+ OpFoldResult tensorDimXOpFoldResult = *(mixedGlobalSizes.rbegin() + dimX);
+ Value tensorDimX;
+ if (auto attr = dyn_cast<Attribute>(tensorDimXOpFoldResult))
+ tensorDimX =
+ createI32Constant(rewriter, loc, cast<IntegerAttr>(attr).getInt());
+ else {
+ IntegerType i32 = rewriter.getI32Type();
+ tensorDimX = cast<Value>(tensorDimXOpFoldResult);
+ tensorDimX = LLVM::TruncOp::create(rewriter, loc, i32, tensorDimX);
+ }
+
+ return setValueAtOffset(rewriter, loc, sgpr0, tensorDimX, offset);
+ }
+
+ Value setTensorDim2(MakeDmaDescriptorOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter, Location loc,
+ Value sgpr0, ArrayRef<Value> consts) const {
+ return setTensorDimX(op, adaptor, rewriter, loc, sgpr0, consts, 2, 0);
+ }
+
+ Value truncateAndSetValueAtOffset(ConversionPatternRewriter &rewriter,
+ Location loc, Value accumulator,
+ Value value, int64_t shift) const {
+
+ IntegerType i32 = rewriter.getI32Type();
+ value = LLVM::TruncOp::create(rewriter, loc, i32, value);
+ return setValueAtOffset(rewriter, loc, accumulator, value, shift);
+ }
+
+ Value setLDSAddrIncrement(MakeDmaDescriptorOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter, Location loc,
+ Value sgpr1, ArrayRef<Value> consts,
+ int64_t offset) const {
+ Value ldsAddrIncrement = adaptor.getLdsIncrement();
+ return setValueAtOffset(rewriter, loc, sgpr1, ldsAddrIncrement, offset);
+ }
+
+ std::pair<Value, Value>
+ setGlobalAddrIncrement(MakeDmaDescriptorOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter, Location loc,
+ Value sgpr2, Value sgpr3, ArrayRef<Value> consts,
+ int64_t offset) const {
+ Value globalAddrIncrement = adaptor.getGlobalIncrement();
+ sgpr2 = truncateAndSetValueAtOffset(rewriter, loc, sgpr2,
+ globalAddrIncrement, offset);
+ Value shift = createI64Constant(rewriter, loc, 32);
+ globalAddrIncrement =
+ LLVM::LShrOp::create(rewriter, loc, glo...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/171498
More information about the Mlir-commits
mailing list