[Mlir-commits] [mlir] 2f9b8b7 - [mlir][amdgpu] Continue lowering make_tdm_descriptor. (#171498)
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Thu Dec 11 12:49:54 PST 2025
Author: Erick Ochoa Lopez
Date: 2025-12-11T15:49:50-05:00
New Revision: 2f9b8b74280343fb1bc1918059c808811b4f1e90
URL: https://github.com/llvm/llvm-project/commit/2f9b8b74280343fb1bc1918059c808811b4f1e90
DIFF: https://github.com/llvm/llvm-project/commit/2f9b8b74280343fb1bc1918059c808811b4f1e90.diff
LOG: [mlir][amdgpu] Continue lowering make_tdm_descriptor. (#171498)
* changes workgroup mask's type from i16 to vector<16xi1>
* changes pad_amount and pad_interval from Index to I32
* adds lit tests for padEnable, iteration and dynamic cases
* adds TODO for a future instrumentation pass to validate inputs
* adds descriptor groups 2 and 3
Added:
Modified:
mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
mlir/test/Dialect/AMDGPU/amdgpu-make-dma-descriptor-fold.mlir
mlir/test/Dialect/AMDGPU/ops.mlir
Removed:
################################################################################
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 56160d3e8fe85..6fbc90ded5824 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -110,9 +110,14 @@ def AMDGPU_TDMDescriptorType : AMDGPU_Type<"TDMDescriptor", "tdm_descriptor"> {
This type is opaque and corresponds to the two or four descriptor groups
used in tensor_load_to_lds or tensor_store_from_lds.
}];
-
}
+class AMDGPU_ConcreteVector<Type elem, int length> :
+ FixedVectorOfLengthAndType<[length], [elem]>,
+ BuildableType<
+ "::mlir::VectorType::get({" # length # "} ,"
+ # elem.builderCall # ")">;
+
//===----------------------------------------------------------------------===//
// AMDGPU Op definitions
//===----------------------------------------------------------------------===//
@@ -1296,14 +1301,14 @@ def AMDGPU_MakeDmaDescriptorOp :
DenseI64ArrayAttr: $global_static_strides,
Variadic<Index>: $shared_dynamic_sizes,
DenseI64ArrayAttr: $shared_static_sizes,
- Optional<I16>: $workgroup_mask,
+ Optional<AMDGPU_ConcreteVector<I1, 16>>: $workgroup_mask,
Optional<I1>: $early_timeout,
- Optional<Index>: $pad_amount,
- Optional<Index>: $pad_interval,
+ Optional<I32>: $pad_amount,
+ Optional<I32>: $pad_interval,
Optional<AnyMemRef>: $atomic_barrier_address,
Variadic<Index>: $atomic_barrier_indices,
Optional<Index>: $global_increment,
- Optional<Index>: $lds_increment,
+ Optional<I32>: $lds_increment,
Optional<Index>: $iteration_count)>,
Results<(outs AMDGPU_TDMDescriptorType: $desc)> {
@@ -1335,7 +1340,7 @@ def AMDGPU_MakeDmaDescriptorOp :
2D and 3D tensors may be iterated over by setting $global_increment, $lds_increment, and $iteration_count.
$global_increment determines how much to increment the starting global memory address per iteration in units of the $base's element type.
$lds_increment determines how much to increment the starting LDS address per iteration in units of the $base's element type.
- $iterate_count determines how many times to iterate.
+ $iteration_count determines how many times to iterate; it must be a value in the inclusive interval [1, 256].
```mlir
// Example of moving a two-dimensional tensor to LDS.
@@ -1345,7 +1350,7 @@ def AMDGPU_MakeDmaDescriptorOp :
// Example of moving a two dimension tensor to LDS where padding is applied after every integer.
%base = amdgpu.make_dma_base %global[0, 0], %lds[0, 0] : memref<32x32xi32>, memref<64x64xi32, #gpu.address_space<workgroup>> -> !amdgpu.tdm_base<i32>
- %descriptor = amdgpu.make_dma_descriptor %base globalSize [32, 32] globalStride [32, 1] sharedSize [64, 64] padding(%pad_amount pad_every %pad_interval) : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+ %descriptor = amdgpu.make_dma_descriptor %base globalSize [32, 32] globalStride [32, 1] sharedSize [64, 64] padShared(%pad_amount every %pad_interval) : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
amdgpu.tensor_load_to_lds %descriptor : !amdgpu.tdm_descriptor
```
}];
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 4b1509392aa6f..bceff12158e8c 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -2382,6 +2382,8 @@ struct AMDGPUMakeDmaDescriptorLowering
if (!mask)
return sgpr0;
+ Type i16 = rewriter.getI16Type();
+ mask = LLVM::BitcastOp::create(rewriter, loc, i16, mask);
Type i32 = rewriter.getI32Type();
Value extendedMask = LLVM::ZExtOp::create(rewriter, loc, i32, mask);
return setValueAtOffset(rewriter, loc, sgpr0, extendedMask, 0);
@@ -2390,21 +2392,19 @@ struct AMDGPUMakeDmaDescriptorLowering
Value setDataSize(MakeDmaDescriptorOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter, Location loc,
Value sgpr0, ArrayRef<Value> consts) const {
- // Compute data_size.
unsigned elementTypeWidthInBits = op.getElementTypeWidth();
assert(
llvm::is_contained<unsigned>({8, 16, 32, 64}, elementTypeWidthInBits) &&
"expected type width to be 8, 16, 32, or 64.");
- int64_t dataSize = llvm::Log2_32(elementTypeWidthInBits / 8);
- Value size = createI32Constant(rewriter, loc, dataSize);
+ int64_t idx = llvm::Log2_32(elementTypeWidthInBits / 8);
+ Value size = consts[idx];
return setValueAtOffset(rewriter, loc, sgpr0, size, 16);
}
Value setAtomicBarrier(MakeDmaDescriptorOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter, Location loc,
Value sgpr0, ArrayRef<Value> consts) const {
- bool atomic_barrier_enable = adaptor.getAtomicBarrierAddress() != nullptr;
- if (!atomic_barrier_enable)
+ if (!adaptor.getAtomicBarrierAddress())
return sgpr0;
return setValueAtOffset(rewriter, loc, sgpr0, consts[1], 18);
@@ -2413,19 +2413,16 @@ struct AMDGPUMakeDmaDescriptorLowering
Value setIterateEnable(MakeDmaDescriptorOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter, Location loc,
Value sgpr0, ArrayRef<Value> consts) const {
- bool iterate_enable = adaptor.getGlobalIncrement() != nullptr;
- if (!iterate_enable)
+ if (!adaptor.getGlobalIncrement())
return sgpr0;
- // TODO: In future PR, add other required fields for iteration.
return setValueAtOffset(rewriter, loc, sgpr0, consts[1], 19);
}
Value setPadEnable(MakeDmaDescriptorOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter, Location loc,
Value sgpr0, ArrayRef<Value> consts) const {
- bool pad_enable = op.getPadAmount() != nullptr;
- if (!pad_enable)
+ if (!op.getPadAmount())
return sgpr0;
return setValueAtOffset(rewriter, loc, sgpr0, consts[1], 20);
@@ -2443,13 +2440,17 @@ struct AMDGPUMakeDmaDescriptorLowering
Value setPadInterval(MakeDmaDescriptorOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter, Location loc,
Value sgpr0, ArrayRef<Value> consts) const {
- bool pad_enable = op.getPadAmount() != nullptr;
- if (!pad_enable)
+ if (!op.getPadAmount())
return sgpr0;
+ // pre-condition: padInterval can be a power of two between 2 and 256.
+ // TODO: Validation if the value breaks the pre-condition.
+ // If the pre-condition fails, there is a possibility of
+ // affecting the higher bits. In a following PR implement
+ // RuntimeVerifiableOpInterface that instruments conditions that need to be
+ // checked at runtime.
IntegerType i32 = rewriter.getI32Type();
Value padInterval = adaptor.getPadInterval();
- // pre-condition: padInterval can be a power of two between 2 and 256.
padInterval = LLVM::CountTrailingZerosOp::create(rewriter, loc, i32,
padInterval, false);
padInterval = LLVM::SubOp::create(rewriter, loc, padInterval, consts[1]);
@@ -2460,12 +2461,16 @@ struct AMDGPUMakeDmaDescriptorLowering
Value setPadAmount(MakeDmaDescriptorOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter, Location loc,
Value sgpr0, ArrayRef<Value> consts) const {
- bool pad_enable = op.getPadAmount() != nullptr;
- if (!pad_enable)
+ if (!op.getPadAmount())
return sgpr0;
- Value padAmount = adaptor.getPadAmount();
// pre-condition: padAmount is a value between 1-128.
+ // TODO: Validation if the value breaks the pre-condition.
+ // If the pre-condition fails, there is a possibility of
+ // affecting the higher bits. In a following PR implement
+ // RuntimeVerifiableOpInterface that instruments conditions that need to be
+ // checked at runtime.
+ Value padAmount = adaptor.getPadAmount();
padAmount = LLVM::SubOp::create(rewriter, loc, padAmount, consts[1]);
// post-condition: padAmount is a value between 0-127.
return setValueAtOffset(rewriter, loc, sgpr0, padAmount, 25);
@@ -2475,8 +2480,7 @@ struct AMDGPUMakeDmaDescriptorLowering
ConversionPatternRewriter &rewriter,
Location loc, Value sgpr1,
ArrayRef<Value> consts) const {
- bool atomic_barrier_enable = adaptor.getAtomicBarrierAddress() != nullptr;
- if (!atomic_barrier_enable)
+ if (!adaptor.getAtomicBarrierAddress())
return sgpr1;
Value atomicBarrierAddress = adaptor.getAtomicBarrierAddress();
@@ -2489,6 +2493,9 @@ struct AMDGPUMakeDmaDescriptorLowering
IntegerType i32 = rewriter.getI32Type();
// pre-condition: atomicBarrierAddress is aligned to 8 bytes which implies
// that the 3 LSBs are zero.
+ // TODO: Validation if the value breaks the pre-condition.
+ // In a following PR implement RuntimeVerifiableOpInterface
+ // that instruments conditions that need to be checked at runtime.
atomicBarrierAddress =
LLVM::PtrToIntOp::create(rewriter, loc, i32, atomicBarrierAddress);
atomicBarrierAddress =
@@ -2499,65 +2506,89 @@ struct AMDGPUMakeDmaDescriptorLowering
return setValueAtOffset(rewriter, loc, sgpr1, atomicBarrierAddress, 32);
}
- std::pair<Value, Value> setTensorDim0(MakeDmaDescriptorOp op,
+ std::pair<Value, Value> setTensorDimX(MakeDmaDescriptorOp op,
OpAdaptor adaptor,
ConversionPatternRewriter &rewriter,
Location loc, Value sgpr1, Value sgpr2,
- ArrayRef<Value> consts) const {
- SmallVector<OpFoldResult> mixedGlobalSizes = op.getMixedGlobalSizes();
- OpFoldResult tensorDim0OpFoldResult = mixedGlobalSizes.back();
- Value tensorDim0;
- if (auto attr = dyn_cast<Attribute>(tensorDim0OpFoldResult))
- tensorDim0 =
+ ArrayRef<Value> consts, uint64_t dimX,
+ uint32_t offset) const {
+ ArrayRef<int64_t> globalStaticSizes = adaptor.getGlobalStaticSizes();
+ ValueRange globalDynamicSizes = adaptor.getGlobalDynamicSizes();
+ SmallVector<OpFoldResult> mixedGlobalSizes =
+ getMixedValues(globalStaticSizes, globalDynamicSizes, rewriter);
+ if (mixedGlobalSizes.size() <= dimX)
+ return {sgpr1, sgpr2};
+
+ OpFoldResult tensorDimXOpFoldResult = *(mixedGlobalSizes.rbegin() + dimX);
+ // pre-condition: tensorDimX is less than 2^32-1
+ // TODO: Validation if the value breaks the pre-condition.
+ // In a following PR implement RuntimeVerifiableOpInterface that instruments
+ // conditions that need to be checked at runtime. This could also be fixed
+ // by saying that mixedGlobalSizes is a DynamicI32List.
+ Value tensorDimX;
+ if (auto attr = dyn_cast<Attribute>(tensorDimXOpFoldResult))
+ tensorDimX =
createI32Constant(rewriter, loc, cast<IntegerAttr>(attr).getInt());
- else
- tensorDim0 = cast<Value>(tensorDim0OpFoldResult);
+ else {
+ IntegerType i32 = rewriter.getI32Type();
+ tensorDimX = cast<Value>(tensorDimXOpFoldResult);
+ tensorDimX = LLVM::TruncOp::create(rewriter, loc, i32, tensorDimX);
+ }
+
+ sgpr1 = setValueAtOffset(rewriter, loc, sgpr1, tensorDimX, offset);
Value c16 = createI32Constant(rewriter, loc, 16);
- Value tensorDim0High = LLVM::LShrOp::create(rewriter, loc, tensorDim0, c16);
- sgpr1 = setValueAtOffset(rewriter, loc, sgpr1, tensorDim0, 48);
- sgpr2 = setValueAtOffset(rewriter, loc, sgpr2, tensorDim0High, 48 + 16);
+ Value tensorDimXHigh = LLVM::LShrOp::create(rewriter, loc, tensorDimX, c16);
+ sgpr2 = setValueAtOffset(rewriter, loc, sgpr2, tensorDimXHigh, offset + 16);
return {sgpr1, sgpr2};
}
+ /// Encodes the innermost global size (dimension index 0, i.e. the last
+ /// entry of the global sizes) starting at descriptor bit offset 48.
+ /// Delegates to the two-register setTensorDimX overload, which writes the
+ /// value into `sgpr1` and the value shifted right by 16 into `sgpr2`.
+ /// Returns the updated {sgpr1, sgpr2} pair.
+ std::pair<Value, Value>
+ setTensorDim0(MakeDmaDescriptorOp op, OpAdaptor adaptor,
+               ConversionPatternRewriter &rewriter, Location loc, Value sgpr1,
+               Value sgpr2, ArrayRef<Value> consts) const {
+   constexpr uint64_t dim = 0;
+   constexpr uint32_t offset = 48;
+   return setTensorDimX(op, adaptor, rewriter, loc, sgpr1, sgpr2, consts, dim,
+                        offset);
+ }
+
std::pair<Value, Value> setTensorDim1(MakeDmaDescriptorOp op,
OpAdaptor adaptor,
ConversionPatternRewriter &rewriter,
Location loc, Value sgpr2, Value sgpr3,
ArrayRef<Value> consts) const {
- // TODO: Generalize to setTensorDimX.
- SmallVector<OpFoldResult> mixedGlobalSizes = op.getMixedGlobalSizes();
- OpFoldResult tensorDim1OpFoldResult = *(mixedGlobalSizes.rbegin() + 1);
- Value tensorDim1;
- if (auto attr = dyn_cast<Attribute>(tensorDim1OpFoldResult))
- tensorDim1 =
- createI32Constant(rewriter, loc, cast<IntegerAttr>(attr).getInt());
- else
- tensorDim1 = cast<Value>(tensorDim1OpFoldResult);
-
- Value c16 = createI32Constant(rewriter, loc, 16);
- Value tensorDim1High = LLVM::LShrOp::create(rewriter, loc, tensorDim1, c16);
- sgpr2 = setValueAtOffset(rewriter, loc, sgpr2, tensorDim1, 80);
- sgpr3 = setValueAtOffset(rewriter, loc, sgpr3, tensorDim1High, 80 + 16);
- return {sgpr2, sgpr3};
+ return setTensorDimX(op, adaptor, rewriter, loc, sgpr2, sgpr3, consts, 1,
+ 80);
}
Value setTileDimX(MakeDmaDescriptorOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter, Location loc,
Value sgpr, ArrayRef<Value> consts, size_t dimX,
int64_t offset) const {
- SmallVector<OpFoldResult> mixedSharedSizes = op.getMixedSharedSizes();
-
+ ArrayRef<int64_t> sharedStaticSizes = adaptor.getSharedStaticSizes();
+ ValueRange sharedDynamicSizes = adaptor.getSharedDynamicSizes();
+ SmallVector<OpFoldResult> mixedSharedSizes =
+ getMixedValues(sharedStaticSizes, sharedDynamicSizes, rewriter);
if (mixedSharedSizes.size() <= dimX)
return sgpr;
OpFoldResult tileDimXOpFoldResult = *(mixedSharedSizes.rbegin() + dimX);
+ // pre-condition: tileDimX is less than 2^16-1
+ // TODO: Validation if the value breaks the pre-condition.
+ // If the pre-condition fails, there is a possibility of
+ // affecting the higher bits. In a following PR implement
+ // RuntimeVerifiableOpInterface that instruments conditions that need to be
+ // checked at runtime. This could also be fixed by saying that
+ // mixedSharedSizes is a DynamicI16List.
Value tileDimX;
if (auto attr = dyn_cast<Attribute>(tileDimXOpFoldResult))
tileDimX =
createI32Constant(rewriter, loc, cast<IntegerAttr>(attr).getInt());
- else
+ else {
+ IntegerType i32 = rewriter.getI32Type();
tileDimX = cast<Value>(tileDimXOpFoldResult);
+ tileDimX = LLVM::TruncOp::create(rewriter, loc, i32, tileDimX);
+ }
return setValueAtOffset(rewriter, loc, sgpr, tileDimX, offset);
}
@@ -2585,13 +2616,20 @@ struct AMDGPUMakeDmaDescriptorLowering
ConversionPatternRewriter &rewriter, Location loc,
Value sgprY, Value sgprZ, ArrayRef<Value> consts,
size_t dimX, int64_t offset) const {
- SmallVector<OpFoldResult> mixedGlobalStrides = op.getMixedGlobalStrides();
+ ArrayRef<int64_t> globalStaticStrides = adaptor.getGlobalStaticStrides();
+ ValueRange globalDynamicStrides = adaptor.getGlobalDynamicStrides();
+ SmallVector<OpFoldResult> mixedGlobalStrides =
+ getMixedValues(globalStaticStrides, globalDynamicStrides, rewriter);
if (mixedGlobalStrides.size() <= dimX)
return {sgprY, sgprZ};
OpFoldResult tensorDimXStrideOpFoldResult =
*(mixedGlobalStrides.rbegin() + dimX);
+ // pre-condition: tensorDimXStride is less than 2^48-1
+ // TODO: Validation if the value breaks the pre-condition.
+ // In a following PR implement RuntimeVerifiableOpInterface that instruments
+ // conditions that need to be checked at runtime.
Value tensorDimXStride;
if (auto attr = dyn_cast<Attribute>(tensorDimXStrideOpFoldResult))
tensorDimXStride =
@@ -2606,6 +2644,7 @@ struct AMDGPUMakeDmaDescriptorLowering
IntegerType i32 = rewriter.getI32Type();
Value tensorDimXStrideLow =
LLVM::TruncOp::create(rewriter, loc, i32, tensorDimXStride);
+ sgprY = setValueAtOffset(rewriter, loc, sgprY, tensorDimXStrideLow, offset);
int64_t shift = (offset % 32) == 0 ? 32 : offset % 32;
Value shiftVal = createI64Constant(rewriter, loc, shift);
@@ -2613,8 +2652,6 @@ struct AMDGPUMakeDmaDescriptorLowering
LLVM::LShrOp::create(rewriter, loc, tensorDimXStride, shiftVal);
tensorDimXStrideHigh =
LLVM::TruncOp::create(rewriter, loc, i32, tensorDimXStrideHigh);
-
- sgprY = setValueAtOffset(rewriter, loc, sgprY, tensorDimXStrideLow, offset);
sgprZ = setValueAtOffset(rewriter, loc, sgprZ, tensorDimXStrideHigh,
offset + shift);
return {sgprY, sgprZ};
@@ -2681,6 +2718,224 @@ struct AMDGPUMakeDmaDescriptorLowering
return dgroup1;
}
+ /// Packs one global tensor size into a single 32-bit descriptor word.
+ ///
+ /// `dimX` counts from the innermost dimension: the size is read from
+ /// `rbegin() + dimX` of the op's mixed (static + dynamic) global sizes.
+ /// When the op has `dimX` or fewer global sizes there is nothing to encode
+ /// and `sgpr0` is returned untouched. Static sizes are materialized as i32
+ /// constants; dynamic sizes are truncated to i32. The result of
+ /// setValueAtOffset(sgpr0, size, offset) is returned.
+ Value setTensorDimX(MakeDmaDescriptorOp op, OpAdaptor adaptor,
+                     ConversionPatternRewriter &rewriter, Location loc,
+                     Value sgpr0, ArrayRef<Value> consts, int64_t dimX,
+                     int64_t offset) const {
+   ArrayRef<int64_t> globalStaticSizes = adaptor.getGlobalStaticSizes();
+   ValueRange globalDynamicSizes = adaptor.getGlobalDynamicSizes();
+   SmallVector<OpFoldResult> mixedGlobalSizes =
+       getMixedValues(globalStaticSizes, globalDynamicSizes, rewriter);
+   // Compare in size_t, the container's own size type; `unsigned long` is
+   // only 32 bits wide on LLP64 targets.
+   if (mixedGlobalSizes.size() <= static_cast<size_t>(dimX))
+     return sgpr0;
+
+   OpFoldResult tensorDimXOpFoldResult = *(mixedGlobalSizes.rbegin() + dimX);
+   // pre-condition: tensorDimX fits in 32 bits.
+   // TODO: Validation if the value breaks the pre-condition. The truncation
+   // below silently drops any higher bits of a dynamic size.
+   Value tensorDimX;
+   if (auto attr = dyn_cast<Attribute>(tensorDimXOpFoldResult))
+     tensorDimX =
+         createI32Constant(rewriter, loc, cast<IntegerAttr>(attr).getInt());
+   else {
+     IntegerType i32 = rewriter.getI32Type();
+     tensorDimX = cast<Value>(tensorDimXOpFoldResult);
+     tensorDimX = LLVM::TruncOp::create(rewriter, loc, i32, tensorDimX);
+   }
+
+   return setValueAtOffset(rewriter, loc, sgpr0, tensorDimX, offset);
+ }
+
+ /// Encodes global size index 2 (third from the innermost) at bit offset 0
+ /// of `sgpr0` via the single-register setTensorDimX overload. `sgpr0` comes
+ /// back unchanged when the op has fewer than three global sizes.
+ Value setTensorDim2(MakeDmaDescriptorOp op, OpAdaptor adaptor,
+                     ConversionPatternRewriter &rewriter, Location loc,
+                     Value sgpr0, ArrayRef<Value> consts) const {
+   constexpr int64_t dim = 2;
+   constexpr int64_t offset = 0;
+   return setTensorDimX(op, adaptor, rewriter, loc, sgpr0, consts, dim,
+                        offset);
+ }
+
+ /// Narrows `value` to i32 with an llvm.trunc and forwards the narrowed
+ /// value to setValueAtOffset together with `accumulator` and `shift`.
+ /// Returns whatever setValueAtOffset produces.
+ Value truncateAndSetValueAtOffset(ConversionPatternRewriter &rewriter,
+                                   Location loc, Value accumulator,
+                                   Value value, int64_t shift) const {
+   Value narrowed =
+       LLVM::TruncOp::create(rewriter, loc, rewriter.getI32Type(), value);
+   return setValueAtOffset(rewriter, loc, accumulator, narrowed, shift);
+ }
+
+ /// Places the converted `lds_increment` operand into `sgpr1` at bit
+ /// position `offset`. The operand is passed to setValueAtOffset as-is, with
+ /// no truncation or range check.
+ Value setLDSAddrIncrement(MakeDmaDescriptorOp op, OpAdaptor adaptor,
+                           ConversionPatternRewriter &rewriter, Location loc,
+                           Value sgpr1, ArrayRef<Value> consts,
+                           int64_t offset) const {
+   return setValueAtOffset(rewriter, loc, sgpr1, adaptor.getLdsIncrement(),
+                           offset);
+ }
+
+ /// Splits the converted `global_increment` operand across two descriptor
+ /// words. The truncated low 32 bits go into `sgpr2` at `offset`; the value
+ /// shifted right by 32 is truncated and placed into `sgpr3` at
+ /// `offset + 32`, after which the whole of `sgpr3` is AND-ed with 0xFFFF.
+ /// Returns the updated {sgpr2, sgpr3} pair.
+ std::pair<Value, Value>
+ setGlobalAddrIncrement(MakeDmaDescriptorOp op, OpAdaptor adaptor,
+                        ConversionPatternRewriter &rewriter, Location loc,
+                        Value sgpr2, Value sgpr3, ArrayRef<Value> consts,
+                        int64_t offset) const {
+   constexpr int64_t low16BitsMask = 0xFFFF;
+
+   // Low half: bits [31:0] of the increment.
+   Value increment = adaptor.getGlobalIncrement();
+   sgpr2 = truncateAndSetValueAtOffset(rewriter, loc, sgpr2, increment, offset);
+
+   // High half: bits [63:32] of the increment, then keep only 16 bits.
+   Value c32 = createI64Constant(rewriter, loc, 32);
+   Value incrementHigh = LLVM::LShrOp::create(rewriter, loc, increment, c32);
+   sgpr3 = truncateAndSetValueAtOffset(rewriter, loc, sgpr3, incrementHigh,
+                                       offset + 32);
+   Value keepLow16 = createI32Constant(rewriter, loc, low16BitsMask);
+   sgpr3 = LLVM::AndOp::create(rewriter, loc, sgpr3, keepLow16);
+   return {sgpr2, sgpr3};
+ }
+
+ /// Fills the field at bit offset 32, which has two uses. When the op
+ /// carries an `lds_increment` operand, that increment is stored there;
+ /// otherwise global size index 3 is stored there (or `sgpr1` is returned
+ /// unchanged if the op has fewer than four global sizes).
+ Value setTensorDim3OrLDSAddrIncrement(MakeDmaDescriptorOp op,
+                                       OpAdaptor adaptor,
+                                       ConversionPatternRewriter &rewriter,
+                                       Location loc, Value sgpr1,
+                                       ArrayRef<Value> consts) const {
+   constexpr int64_t dim = 3;
+   constexpr int64_t offset = 32;
+   if (op.getLdsIncrement())
+     return setLDSAddrIncrement(op, adaptor, rewriter, loc, sgpr1, consts,
+                                offset);
+   return setTensorDimX(op, adaptor, rewriter, loc, sgpr1, consts, dim,
+                        offset);
+ }
+
+ /// Fills the field starting at bit offset 64, which has two uses. When
+ /// the op carries a `global_increment` operand, that increment is stored
+ /// there; otherwise global stride index 2 is stored there through
+ /// setTensorDimXStride. Returns the updated {sgpr2, sgpr3} pair.
+ std::pair<Value, Value> setTensorDim2StrideOrGlobalAddrIncrement(
+     MakeDmaDescriptorOp op, OpAdaptor adaptor,
+     ConversionPatternRewriter &rewriter, Location loc, Value sgpr2,
+     Value sgpr3, ArrayRef<Value> consts) const {
+   constexpr int32_t dim = 2;
+   constexpr int32_t offset = 64;
+   if (op.getGlobalIncrement())
+     return setGlobalAddrIncrement(op, adaptor, rewriter, loc, sgpr2, sgpr3,
+                                   consts, offset);
+   return setTensorDimXStride(op, adaptor, rewriter, loc, sgpr2, sgpr3, consts,
+                              dim, offset);
+ }
+
+ /// Stores the iteration count into `sgpr3` at bit position `offset`. The
+ /// converted `iteration_count` operand is truncated to i32 and `consts[1]`
+ /// is subtracted from it before it is handed to setValueAtOffset.
+ Value setIterateCount(MakeDmaDescriptorOp op, OpAdaptor adaptor,
+                       ConversionPatternRewriter &rewriter, Location loc,
+                       Value sgpr3, ArrayRef<Value> consts,
+                       int32_t offset) const {
+   // pre-condition: iterationCount is in the inclusive interval [1, 256].
+   // TODO: validation if the value breaks the pre-condition.
+   // An out-of-range count can spill into the higher bits of the word. A
+   // following PR should implement RuntimeVerifiableOpInterface to
+   // instrument the conditions that have to be checked at runtime.
+   Value count = LLVM::TruncOp::create(rewriter, loc, rewriter.getI32Type(),
+                                       adaptor.getIterationCount());
+   Value countMinusOne = LLVM::SubOp::create(rewriter, loc, count, consts[1]);
+   return setValueAtOffset(rewriter, loc, sgpr3, countMinusOne, offset);
+ }
+
+ /// Fills the field at bit offset 112, which has two uses. When the op
+ /// carries an `iteration_count` operand, the encoded iteration count is
+ /// stored there (see setIterateCount); otherwise tile dim 3, i.e. the
+ /// shared size at `rbegin() + 3`, is stored there. In the latter case
+ /// `sgpr3` is returned unchanged if the op has fewer than four shared
+ /// sizes.
+ Value setTileDim3OrIterateCount(MakeDmaDescriptorOp op, OpAdaptor adaptor,
+                                 ConversionPatternRewriter &rewriter,
+                                 Location loc, Value sgpr3,
+                                 ArrayRef<Value> consts) const {
+   Value iterateCount = op.getIterationCount();
+   // Tile dim 3 is shared-size index 3. The index equals the number in the
+   // field name, as in setTensorDim2 and setTileDim4; index 2 would encode
+   // tile dim 2 here and leave the fourth shared size out of the descriptor.
+   constexpr int32_t dim = 3;
+   constexpr int32_t offset = 112;
+   if (!iterateCount)
+     return setTileDimX(op, adaptor, rewriter, loc, sgpr3, consts, dim,
+                        offset);
+
+   return setIterateCount(op, adaptor, rewriter, loc, sgpr3, consts, offset);
+ }
+
+ /// Builds descriptor group 2 as a 4 x i32 vector.
+ ///
+ /// When the op has no `lds_increment` operand and its rank is at most 2,
+ /// an all-zero vector is returned. Otherwise four words are assembled,
+ /// each starting from `consts[0]`:
+ ///   word 0    : tensor dim 2
+ ///   word 1    : tensor dim 3, or the LDS address increment
+ ///   words 2-3 : tensor dim 2 stride, or the global address increment
+ ///   word 3    : additionally tile dim 3, or the iteration count
+ /// The words are inserted into a poison vector, using `consts[i]` as the
+ /// element index for word i.
+ Value getDGroup2(MakeDmaDescriptorOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter, Location loc,
+                  ArrayRef<Value> consts) const {
+   Type v4i32 = this->typeConverter->convertType(
+       VectorType::get(4, rewriter.getI32Type()));
+   assert(v4i32 && "expected type conversion to succeed.");
+
+   // Groups 0 and 1 already describe the transfer completely in this case.
+   if (!op.getLdsIncrement() && op.getRank() <= 2)
+     return LLVM::ZeroOp::create(rewriter, loc, v4i32);
+
+   Value sgprs[4] = {consts[0], consts[0], consts[0], consts[0]};
+
+   sgprs[0] = setTensorDim2(op, adaptor, rewriter, loc, sgprs[0], consts);
+   sgprs[1] = setTensorDim3OrLDSAddrIncrement(op, adaptor, rewriter, loc,
+                                              sgprs[1], consts);
+   std::tie(sgprs[2], sgprs[3]) = setTensorDim2StrideOrGlobalAddrIncrement(
+       op, adaptor, rewriter, loc, sgprs[2], sgprs[3], consts);
+   sgprs[3] =
+       setTileDim3OrIterateCount(op, adaptor, rewriter, loc, sgprs[3], consts);
+
+   Value dgroup2 = LLVM::PoisonOp::create(rewriter, loc, v4i32);
+   for (auto [word, index] : llvm::zip(sgprs, consts))
+     dgroup2 =
+         LLVM::InsertElementOp::create(rewriter, loc, dgroup2, word, index);
+
+   return dgroup2;
+ }
+
+ /// Encodes global stride index 3 starting at bit offset 0 through
+ /// setTensorDimXStride. Returns the updated {sgpr0, sgpr1} pair.
+ std::pair<Value, Value>
+ setTensorDim3Stride(MakeDmaDescriptorOp op, OpAdaptor adaptor,
+                     ConversionPatternRewriter &rewriter, Location loc,
+                     Value sgpr0, Value sgpr1, ArrayRef<Value> consts) const {
+   return setTensorDimXStride(op, adaptor, rewriter, loc, sgpr0, sgpr1, consts,
+                              /*dimX=*/3, /*offset=*/0);
+ }
+
+ /// Encodes global size index 4 starting at bit offset 48 through the
+ /// two-register setTensorDimX overload. Returns the updated {sgpr1, sgpr2}
+ /// pair, which is the input pair when the op has fewer than five global
+ /// sizes.
+ std::pair<Value, Value>
+ setTensorDim4(MakeDmaDescriptorOp op, OpAdaptor adaptor,
+               ConversionPatternRewriter &rewriter, Location loc, Value sgpr1,
+               Value sgpr2, ArrayRef<Value> consts) const {
+   return setTensorDimX(op, adaptor, rewriter, loc, sgpr1, sgpr2, consts,
+                        /*dimX=*/4, /*offset=*/48);
+ }
+
+ /// Encodes shared size index 4 at bit offset 80 through setTileDimX.
+ /// `sgpr2` comes back unchanged when the op has fewer than five shared
+ /// sizes.
+ Value setTileDim4(MakeDmaDescriptorOp op, OpAdaptor adaptor,
+                   ConversionPatternRewriter &rewriter, Location loc,
+                   Value sgpr2, ArrayRef<Value> consts) const {
+   return setTileDimX(op, adaptor, rewriter, loc, sgpr2, consts, /*dimX=*/4,
+                      /*offset=*/80);
+ }
+
+ /// Builds descriptor group 3 as a 4 x i32 vector.
+ ///
+ /// When the op has no `lds_increment` operand and its rank is at most 2,
+ /// an all-zero vector is returned. Otherwise four words are assembled,
+ /// each starting from `consts[0]`:
+ ///   words 0-1 : tensor dim 3 stride
+ ///   words 1-2 : tensor dim 4
+ ///   word 2    : additionally tile dim 4
+ ///   word 3    : left at `consts[0]`
+ /// The words are inserted into a poison vector, using `consts[i]` as the
+ /// element index for word i.
+ Value getDGroup3(MakeDmaDescriptorOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter, Location loc,
+                  ArrayRef<Value> consts) const {
+   Type v4i32 = this->typeConverter->convertType(
+       VectorType::get(4, rewriter.getI32Type()));
+   assert(v4i32 && "expected type conversion to succeed.");
+
+   // Groups 0 and 1 already describe the transfer completely in this case.
+   if (!op.getLdsIncrement() && op.getRank() <= 2)
+     return LLVM::ZeroOp::create(rewriter, loc, v4i32);
+
+   Value sgprs[4] = {consts[0], consts[0], consts[0], consts[0]};
+
+   std::tie(sgprs[0], sgprs[1]) = setTensorDim3Stride(
+       op, adaptor, rewriter, loc, sgprs[0], sgprs[1], consts);
+   std::tie(sgprs[1], sgprs[2]) =
+       setTensorDim4(op, adaptor, rewriter, loc, sgprs[1], sgprs[2], consts);
+   sgprs[2] = setTileDim4(op, adaptor, rewriter, loc, sgprs[2], consts);
+
+   Value dgroup3 = LLVM::PoisonOp::create(rewriter, loc, v4i32);
+   for (auto [word, index] : llvm::zip(sgprs, consts))
+     dgroup3 =
+         LLVM::InsertElementOp::create(rewriter, loc, dgroup3, word, index);
+
+   return dgroup3;
+ }
+
LogicalResult
matchAndRewrite(MakeDmaDescriptorOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
@@ -2688,9 +2943,6 @@ struct AMDGPUMakeDmaDescriptorLowering
return op->emitOpError(
"make_dma_descriptor is only supported on gfx1250");
- if (op.getRank() > 2)
- return op->emitOpError("unimplemented");
-
Location loc = op.getLoc();
IntegerType i32 = rewriter.getI32Type();
@@ -2704,8 +2956,9 @@ struct AMDGPUMakeDmaDescriptorLowering
Value dgroup0 = this->getDGroup0(adaptor);
Value dgroup1 = this->getDGroup1(op, adaptor, rewriter, loc, consts);
-
- SmallVector<Value> results = {dgroup0, dgroup1};
+ Value dgroup2 = this->getDGroup2(op, adaptor, rewriter, loc, consts);
+ Value dgroup3 = this->getDGroup3(op, adaptor, rewriter, loc, consts);
+ SmallVector<Value> results = {dgroup0, dgroup1, dgroup2, dgroup3};
rewriter.replaceOpWithMultiple(op, {results});
return success();
}
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
index a94e17ab5b9a5..c3665dca35837 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
@@ -237,6 +237,9 @@ func.func @make_dma_base(%idx: index, %mem: memref<8xi32, #gpu_global_addrspace>
// -----
+// This test exercises the lowering for operations that only require two
+// descriptor groups to be fully described.
+
// CHECK-LABEL: func @make_dma_descriptor
// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>)
func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>) -> !amdgpu.tdm_descriptor {
@@ -251,23 +254,24 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>) -> !amdgpu.tdm_desc
// CHECK-DAG: %[[C6:.+]] = llvm.mlir.constant(6 : i32)
// CHECK-DAG: %[[C7:.+]] = llvm.mlir.constant(7 : i32)
- // CHECK-DAG: %[[DATA_SIZE:.+]] = llvm.mlir.constant(2 : i32)
// CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
- // CHECK: %[[SGPR0:.+]] = llvm.shl %[[DATA_SIZE]], %[[C16]]
+ // CHECK: %[[SGPR0:.+]] = llvm.shl %[[C2]], %[[C16]]
// CHECK-DAG: %[[TENSOR_DIM_0:.+]] = llvm.mlir.constant(64 : i32)
// CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
- // CHECK: %[[SGPR2_0:.+]] = llvm.lshr %[[TENSOR_DIM_0]], %[[C16]]
- // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
// CHECK: %[[SGPR1:.+]] = llvm.shl %[[TENSOR_DIM_0]], %[[C16]]
- // CHECK-DAG: %[[TENSOR_DIM_1:.+]] = llvm.mlir.constant(128 : i32)
// CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
- // CHECK: %[[SGPR3_0:.+]] = llvm.lshr %[[TENSOR_DIM_1]], %[[C16]]
+ // CHECK: %[[SGPR2_0:.+]] = llvm.lshr %[[TENSOR_DIM_0]], %[[C16]]
+
+ // CHECK-DAG: %[[TENSOR_DIM_1:.+]] = llvm.mlir.constant(128 : i32)
// CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
// CHECK: %[[TENSOR_DIM_1_SHIFTED:.+]] = llvm.shl %[[TENSOR_DIM_1]], %[[C16]]
// CHECK: %[[SGPR2:.+]] = llvm.or %[[SGPR2_0]], %[[TENSOR_DIM_1_SHIFTED]]
+ // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
+ // CHECK: %[[SGPR3_0:.+]] = llvm.lshr %[[TENSOR_DIM_1]], %[[C16]]
+
// CHECK-DAG: %[[TILE_DIM_0:.+]] = llvm.mlir.constant(64 : i32)
// CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
// CHECK: %[[TILE_DIM_0_SHIFTED:.+]] = llvm.shl %[[TILE_DIM_0:.+]], %[[C16]]
@@ -279,6 +283,7 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>) -> !amdgpu.tdm_desc
// CHECK-DAG: %[[MASK:.+]] = llvm.mlir.constant(281474976710655 : i64) : i64
// CHECK: %[[TENSOR_DIM_0_STRIDE_MASKED:.+]] = llvm.and %[[MASK]], %[[TENSOR_DIM_0_STRIDE]]
// CHECK-DAG: %[[SGPR5:.+]] = llvm.trunc %[[TENSOR_DIM_0_STRIDE_MASKED]] : i64 to i32
+
// CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(32 : i64) : i64
// CHECK: %[[TENSOR_DIM_0_STRIDE_HIGH_64:.+]] = llvm.lshr %[[TENSOR_DIM_0_STRIDE_MASKED]], %[[SHIFT]]
// CHECK: %[[SGPR6_0:.+]] = llvm.trunc %[[TENSOR_DIM_0_STRIDE_HIGH_64]] : i64 to i32
@@ -287,13 +292,14 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>) -> !amdgpu.tdm_desc
// CHECK-DAG: %[[MASK:.+]] = llvm.mlir.constant(281474976710655 : i64) : i64
// CHECK: %[[TENSOR_DIM_1_STRIDE_MASKED:.+]] = llvm.and %[[MASK]], %[[TENSOR_DIM_1_STRIDE]]
// CHECK-DAG: %[[TENSOR_DIM_1_STRIDE_LOW:.+]] = llvm.trunc %[[TENSOR_DIM_1_STRIDE_MASKED]]
- // CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(16 : i64) : i64
- // CHECK: %[[TENSOR_DIM_1_STRIDE_SHIFTED:.+]] = llvm.lshr %[[TENSOR_DIM_1_STRIDE_MASKED]], %[[SHIFT]]
- // CHECK: %[[SGPR7:.+]] = llvm.trunc %[[TENSOR_DIM_1_STRIDE_SHIFTED]] : i64 to i32
// CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(16 : i32) : i32
// CHECK: %[[TENSOR_DIM_1_STRIDE_LOW_SHIFTED:.+]] = llvm.shl %[[TENSOR_DIM_1_STRIDE_LOW]], %[[SHIFT]]
// CHECK-DAG: %[[SGPR6:.+]] = llvm.or %[[SGPR6_0]], %[[TENSOR_DIM_1_STRIDE_LOW_SHIFTED]]
+ // CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(16 : i64) : i64
+ // CHECK: %[[TENSOR_DIM_1_STRIDE_SHIFTED:.+]] = llvm.lshr %[[TENSOR_DIM_1_STRIDE_MASKED]], %[[SHIFT]]
+ // CHECK: %[[SGPR7:.+]] = llvm.trunc %[[TENSOR_DIM_1_STRIDE_SHIFTED]] : i64 to i32
+
// CHECK: %[[V8I32:.+]] = llvm.mlir.poison : vector<8xi32>
// CHECK: %[[DGROUP1_0:.+]] = llvm.insertelement %[[SGPR0]], %[[V8I32]][%[[C0]] : i32]
// CHECK: %[[DGROUP1_1:.+]] = llvm.insertelement %[[SGPR1]], %[[DGROUP1_0]][%[[C1]] : i32]
@@ -304,7 +310,10 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>) -> !amdgpu.tdm_desc
// CHECK: %[[DGROUP1_6:.+]] = llvm.insertelement %[[SGPR6]], %[[DGROUP1_5]][%[[C6]] : i32]
// CHECK: %[[DGROUP1:.+]] = llvm.insertelement %[[SGPR7]], %[[DGROUP1_6]][%[[C7]] : i32]
- // CHECK: %[[DGROUPS:.+]] = builtin.unrealized_conversion_cast %[[DGROUP0]], %[[DGROUP1]] : vector<4xi32>, vector<8xi32> to !amdgpu.tdm_descriptor
+ // CHECK: %[[DGROUP2:.+]] = llvm.mlir.zero : vector<4xi32>
+ // CHECK: %[[DGROUP3:.+]] = llvm.mlir.zero : vector<4xi32>
+
+ // CHECK: %[[DGROUPS:.+]] = builtin.unrealized_conversion_cast %[[DGROUP0]], %[[DGROUP1]], %[[DGROUP2]], %[[DGROUP3]] : vector<4xi32>, vector<8xi32>, vector<4xi32>, vector<4xi32> to !amdgpu.tdm_descriptor
%descriptor = amdgpu.make_dma_descriptor %base globalSize [128, 64] globalStride [64, 1] sharedSize [128, 64] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
func.return %descriptor : !amdgpu.tdm_descriptor
}
@@ -331,9 +340,8 @@ func.func @make_dma_descriptor_atomic_barrier(%base: !amdgpu.tdm_base<i32>, %bar
// CHECK-DAG: %[[C6:.+]] = llvm.mlir.constant(6 : i32)
// CHECK-DAG: %[[C7:.+]] = llvm.mlir.constant(7 : i32)
- // CHECK-DAG: %[[DATA_SIZE:.+]] = llvm.mlir.constant(2 : i32)
// CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
- // CHECK: %[[SGPR0_0:.+]] = llvm.shl %[[DATA_SIZE]], %[[C16]]
+ // CHECK: %[[SGPR0_0:.+]] = llvm.shl %[[C2]], %[[C16]]
// CHECK-DAG: %[[ATOMIC_BARRIER_ENABLE_OFFSET:.+]] = llvm.mlir.constant(18 : i32)
// CHECK: %[[ATOMIC_BARRIER_ENABLE_FIELD:.+]] = llvm.shl %[[C1]], %[[ATOMIC_BARRIER_ENABLE_OFFSET]]
@@ -348,11 +356,12 @@ func.func @make_dma_descriptor_atomic_barrier(%base: !amdgpu.tdm_base<i32>, %bar
// CHECK-DAG: %[[TENSOR_DIM_0:.+]] = llvm.mlir.constant(64 : i32)
// CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
- // CHECK: %[[SGPR2_0:.+]] = llvm.lshr %[[TENSOR_DIM_0]], %[[C16]]
- // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
// CHECK: %[[SGPR1_0:.+]] = llvm.shl %[[TENSOR_DIM_0]], %[[C16]]
// CHECK: %[[SGPR1:.+]] = llvm.or %[[ATOMIC_BARRIER]], %[[SGPR1_0]]
+ // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
+ // CHECK: %[[SGPR2_0:.+]] = llvm.lshr %[[TENSOR_DIM_0]], %[[C16]]
+
// CHECK: %[[V8I32:.+]] = llvm.mlir.poison : vector<8xi32>
// CHECK: %[[DGROUP1_0:.+]] = llvm.insertelement %[[SGPR0]], %[[V8I32]][%[[C0]] : i32]
// CHECK: %[[DGROUP1_1:.+]] = llvm.insertelement %[[SGPR1]], %[[DGROUP1_0]][%[[C1]] : i32]
@@ -365,11 +374,276 @@ func.func @make_dma_descriptor_atomic_barrier(%base: !amdgpu.tdm_base<i32>, %bar
func.return %descriptor : !amdgpu.tdm_descriptor
}
+
+// -----
+
+// CHECK-LABEL: func @make_dma_descriptor_iterate
+// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>, %[[IDX:.+]]: index, %[[I32:.+]]: i32)
+func.func @make_dma_descriptor_iterate(%base: !amdgpu.tdm_base<i32>, %idx : index, %i32: i32) -> !amdgpu.tdm_descriptor {
+ // CHECK-DAG: %[[DGROUP0:.+]] = builtin.unrealized_conversion_cast %[[BASE]]
+ // CHECK-DAG: %[[INDEX:.+]] = builtin.unrealized_conversion_cast %[[IDX]] : index to i64
+
+ // CHECK: %[[C0:.+]] = llvm.mlir.constant(0 : i32)
+ // CHECK: %[[C1:.+]] = llvm.mlir.constant(1 : i32)
+ // CHECK: %[[C2:.+]] = llvm.mlir.constant(2 : i32)
+ // CHECK: %[[C3:.+]] = llvm.mlir.constant(3 : i32)
+
+ // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
+ // CHECK: %[[SGPR0_0:.+]] = llvm.shl %[[C2]], %[[C16]]
+
+ // CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(19 : i32)
+ // CHECK: %[[ITERATE_ENABLE:.+]] = llvm.shl %[[C1]], %[[SHIFT]]
+ // CHECK: %[[SGPR0:.+]] = llvm.or %[[SGPR0_0]], %[[ITERATE_ENABLE]]
+
+ // CHECK: %[[V8I32:.+]] = llvm.mlir.poison : vector<8xi32>
+ // CHECK: %[[DGROUP1_0:.+]] = llvm.insertelement %[[SGPR0]], %[[V8I32]][%[[C0]] : i32]
+
+ // CHECK: %[[SGPR2:.+]] = llvm.trunc %[[INDEX]] : i64 to i32
+
+ // CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(32 : i64) : i64
+ // CHECK: %[[GLOBAL_ADDR_INC_HIGH:.+]] = llvm.lshr %[[INDEX]], %[[SHIFT]]
+ // CHECK: %[[GLOBAL_ADDR_INC_HIGH_2:.+]] = llvm.trunc %[[GLOBAL_ADDR_INC_HIGH]] : i64 to i32
+ // CHECK-DAG: %[[MASK:.+]] = llvm.mlir.constant([[FIRST_16_HIGH:65535]] : i32) : i32
+ // CHECK: %[[SGPR3_LOW:.+]] = llvm.and %[[GLOBAL_ADDR_INC_HIGH_2]], %[[MASK]]
+
+ // CHECK: %[[ITERATE_COUNT:.+]] = llvm.trunc %[[INDEX]] : i64 to i32
+ // CHECK: %[[ITERATE_COUNT_M1:.+]] = llvm.sub %[[ITERATE_COUNT]], %[[C1]]
+ // CHECK: %[[SHIFT:.+]] = llvm.mlir.constant(16 : i32)
+ // CHECK: %[[ITERATE_COUNT_SHIFTED:.+]] = llvm.shl %[[ITERATE_COUNT_M1]], %[[SHIFT]]
+ // CHECK: %[[SGPR3:.+]] = llvm.or %[[SGPR3_LOW]], %[[ITERATE_COUNT_SHIFTED]]
+
+ // CHECK: %[[V4I32:.+]] = llvm.mlir.poison : vector<4xi32>
+ // CHECK: %[[DGROUP2_0:.+]] = llvm.insertelement %[[C0]], %[[V4I32]][%[[C0]]
+ // CHECK: %[[DGROUP2_1:.+]] = llvm.insertelement %[[I32]], %[[DGROUP2_0]][%[[C1]]
+ // CHECK: %[[DGROUP2_2:.+]] = llvm.insertelement %[[SGPR2]], %[[DGROUP2_1]][%[[C2]]
+ // CHECK: %[[DGROUP2:.+]] = llvm.insertelement %[[SGPR3]], %[[DGROUP2_2]][%[[C3]]
+
+ %descriptor = amdgpu.make_dma_descriptor %base globalSize [128, 64] globalStride [64, 1] sharedSize [128, 64] iterate %idx, %i32, %idx : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+ func.return %descriptor : !amdgpu.tdm_descriptor
+}
+
+// -----
+
+// CHECK-LABEL: func @make_dma_descriptor_pad_enable
+// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>, %[[PAD_AMOUNT:.+]]: i32, %[[PAD_INTERVAL:.+]]: i32)
+func.func @make_dma_descriptor_pad_enable(%base: !amdgpu.tdm_base<i32>, %pad_amount: i32, %pad_interval: i32) -> !amdgpu.tdm_descriptor {
+
+ // CHECK-DAG: %[[DGROUP0:.+]] = builtin.unrealized_conversion_cast %[[BASE]]
+
+ // CHECK-DAG: %[[C0:.+]] = llvm.mlir.constant(0 : i32)
+ // CHECK-DAG: %[[C1:.+]] = llvm.mlir.constant(1 : i32)
+ // CHECK-DAG: %[[C2:.+]] = llvm.mlir.constant(2 : i32)
+ // CHECK-DAG: %[[C3:.+]] = llvm.mlir.constant(3 : i32)
+ // CHECK-DAG: %[[C4:.+]] = llvm.mlir.constant(4 : i32)
+ // CHECK-DAG: %[[C5:.+]] = llvm.mlir.constant(5 : i32)
+ // CHECK-DAG: %[[C6:.+]] = llvm.mlir.constant(6 : i32)
+ // CHECK-DAG: %[[C7:.+]] = llvm.mlir.constant(7 : i32)
+
+ // CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(20 : i32)
+ // CHECK: %[[PAD_ENABLE:.+]] = llvm.shl %[[C1]], %[[SHIFT]]
+ // CHECK: %[[SGPR0:.+]] = llvm.or %[[SGPR0_BASE:.+]], %[[PAD_ENABLE]]
+
+ // CHECK: %[[PAD_INTERVAL_CTTZ:.+]] = "llvm.intr.cttz"(%[[PAD_INTERVAL]]) <{is_zero_poison = false}> : (i32) -> i32
+ // CHECK: %[[PAD_INTERVAL_M1:.+]] = llvm.sub %[[PAD_INTERVAL_CTTZ]], %[[C1]]
+ // CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(22 : i32)
+ // CHECK: %[[PAD_INTERVAL:.+]] = llvm.shl %[[PAD_INTERVAL_M1]], %[[SHIFT]]
+ // CHECK: %[[SGPR0:.+]] = llvm.or %[[SGPR0_BASE:.+]], %[[PAD_INTERVAL]]
+
+ // CHECK: %[[PAD_AMOUNT_M1:.+]] = llvm.sub %[[PAD_AMOUNT]], %[[C1]]
+ // CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(25 : i32)
+ // CHECK: %[[PAD_AMOUNT_SHIFTED:.+]] = llvm.shl %[[PAD_AMOUNT_M1]], %[[SHIFT]]
+ // CHECK: llvm.or %[[SGPR0:.+]], %[[PAD_AMOUNT_SHIFTED]]
+
+ %descriptor = amdgpu.make_dma_descriptor %base globalSize [128, 64] globalStride [64, 1] sharedSize [128, 64] padShared(%pad_amount every %pad_interval) : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+ func.return %descriptor : !amdgpu.tdm_descriptor
+}
+
+// -----
+
+// CHECK-LABEL: func @make_dma_descriptor_dynamic
+// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>, %[[GS0:.+]]: index, %[[GS1:.+]]: index, %[[GST1:.+]]: index, %[[SHS0:.+]]: index, %[[SHS1:.+]]: index)
+func.func @make_dma_descriptor_dynamic(%base: !amdgpu.tdm_base<i32>, %gs0: index, %gs1: index, %gst1: index, %shs0: index, %shs1: index) -> !amdgpu.tdm_descriptor {
+ // CHECK-DAG: %[[DGROUP0:.+]] = builtin.unrealized_conversion_cast %[[BASE]]
+ // CHECK-DAG: %[[GS0I:.+]] = builtin.unrealized_conversion_cast %[[GS0]]
+ // CHECK-DAG: %[[GS1I:.+]] = builtin.unrealized_conversion_cast %[[GS1]]
+ // CHECK-DAG: %[[GST1I:.+]] = builtin.unrealized_conversion_cast %[[GST1]]
+ // CHECK-DAG: %[[SHS0I:.+]] = builtin.unrealized_conversion_cast %[[SHS0]]
+ // CHECK-DAG: %[[SHS1I:.+]] = builtin.unrealized_conversion_cast %[[SHS1]]
+
+ // CHECK-DAG: %[[C0:.+]] = llvm.mlir.constant(0 : i32)
+ // CHECK-DAG: %[[C1:.+]] = llvm.mlir.constant(1 : i32)
+ // CHECK-DAG: %[[C2:.+]] = llvm.mlir.constant(2 : i32)
+ // CHECK-DAG: %[[C3:.+]] = llvm.mlir.constant(3 : i32)
+ // CHECK-DAG: %[[C4:.+]] = llvm.mlir.constant(4 : i32)
+ // CHECK-DAG: %[[C5:.+]] = llvm.mlir.constant(5 : i32)
+ // CHECK-DAG: %[[C6:.+]] = llvm.mlir.constant(6 : i32)
+ // CHECK-DAG: %[[C7:.+]] = llvm.mlir.constant(7 : i32)
+
+ // CHECK: %[[TENSOR_DIM_0:.+]] = llvm.trunc %[[GS0I]] : i64 to i32
+ // CHECK: %[[SHIFT:.+]] = llvm.mlir.constant(16 : i32)
+ // CHECK: %[[SGPR1:.+]] = llvm.shl %[[TENSOR_DIM_0]], %[[SHIFT]]
+
+ // CHECK: %[[TENSOR_DIM_1:.+]] = llvm.trunc %[[GS1I]] : i64 to i32
+ // CHECK: %[[SHIFT:.+]] = llvm.mlir.constant(16 : i32)
+ // CHECK: %[[SGPR2:.+]] = llvm.shl %[[TENSOR_DIM_1]], %[[SHIFT]]
+
+ // CHECK: %[[TILE_DIM_0:.+]] = llvm.trunc %[[SHS0I]] : i64 to i32
+ // CHECK: %[[SHIFT:.+]] = llvm.mlir.constant(16 : i32)
+ // CHECK: %[[TILE_DIM_0_SHIFTED:.+]] = llvm.shl %[[TILE_DIM_0:.+]], %[[SHIFT]]
+
+ // CHECK: %[[TILE_DIM_1:.+]] = llvm.trunc %[[SHS1I]] : i64 to i32
+
+ // CHECK-DAG: %[[TENSOR_DIM_0_STRIDE:.+]] = llvm.mlir.constant(1 : i64) : i64
+ // CHECK: %[[MASK:.+]] = llvm.mlir.constant(281474976710655 : i64) : i64
+
+ // CHECK-DAG: %[[MASK:.+]] = llvm.mlir.constant(281474976710655 : i64) : i64
+ // CHECK: %[[TENSOR_DIM_1_STRIDE_MASKED:.+]] = llvm.and %[[MASK]], %[[GST1I]]
+ // CHECK-DAG: %[[TENSOR_DIM_1_STRIDE_LOW:.+]] = llvm.trunc %[[TENSOR_DIM_1_STRIDE_MASKED]]
+ // CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(16 : i32) : i32
+ // CHECK: %[[TENSOR_DIM_1_STRIDE_LOW_SHIFTED:.+]] = llvm.shl %[[TENSOR_DIM_1_STRIDE_LOW]], %[[SHIFT]]
+
+
+ %descriptor = amdgpu.make_dma_descriptor %base globalSize [%gs1, %gs0] globalStride [%gst1, 1] sharedSize [%shs1, %shs0] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+ func.return %descriptor : !amdgpu.tdm_descriptor
+}
+
+// -----
+
+// CHECK-LABEL: func @make_dma_descriptor
+// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>)
+func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>) -> !amdgpu.tdm_descriptor {
+ // CHECK-DAG: %[[DGROUP0:.+]] = builtin.unrealized_conversion_cast %[[BASE]]
+
+ // CHECK-DAG: %[[C0:.+]] = llvm.mlir.constant(0 : i32)
+ // CHECK-DAG: %[[C1:.+]] = llvm.mlir.constant(1 : i32)
+ // CHECK-DAG: %[[C2:.+]] = llvm.mlir.constant(2 : i32)
+ // CHECK-DAG: %[[C3:.+]] = llvm.mlir.constant(3 : i32)
+ // CHECK-DAG: %[[C4:.+]] = llvm.mlir.constant(4 : i32)
+ // CHECK-DAG: %[[C5:.+]] = llvm.mlir.constant(5 : i32)
+ // CHECK-DAG: %[[C6:.+]] = llvm.mlir.constant(6 : i32)
+ // CHECK-DAG: %[[C7:.+]] = llvm.mlir.constant(7 : i32)
+
+
+ // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
+ // CHECK: %[[SGPR0:.+]] = llvm.shl %[[C2]], %[[C16]]
+
+ // CHECK-DAG: %[[TENSOR_DIM_0:.+]] = llvm.mlir.constant(64 : i32)
+ // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
+ // CHECK: %[[SGPR1:.+]] = llvm.shl %[[TENSOR_DIM_0]], %[[C16]]
+
+ // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
+ // CHECK: %[[SGPR2_0:.+]] = llvm.lshr %[[TENSOR_DIM_0]], %[[C16]]
+
+ // CHECK-DAG: %[[TENSOR_DIM_1:.+]] = llvm.mlir.constant(128 : i32)
+ // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
+ // CHECK: %[[TENSOR_DIM_1_SHIFTED:.+]] = llvm.shl %[[TENSOR_DIM_1]], %[[C16]]
+ // CHECK: %[[SGPR2:.+]] = llvm.or %[[SGPR2_0]], %[[TENSOR_DIM_1_SHIFTED]]
+
+ // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
+ // CHECK: %[[SGPR3_0:.+]] = llvm.lshr %[[TENSOR_DIM_1]], %[[C16]]
+
+ // CHECK-DAG: %[[TILE_DIM_0:.+]] = llvm.mlir.constant(64 : i32)
+ // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
+ // CHECK: %[[TILE_DIM_0_SHIFTED:.+]] = llvm.shl %[[TILE_DIM_0]], %[[C16]]
+ // CHECK: %[[SGPR3:.+]] = llvm.or %[[SGPR3_0]], %[[TILE_DIM_0_SHIFTED]]
+
+ // CHECK-DAG: %[[TILE_DIM_1:.+]] = llvm.mlir.constant(128 : i32)
+ // CHECK-DAG: %[[TILE_DIM_2:.+]] = llvm.mlir.constant(64 : i32)
+ // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
+ // CHECK: %[[TILE_DIM_2_SHIFTED:.+]] = llvm.shl %[[TILE_DIM_2]], %[[C16]]
+ // CHECK: %[[SGPR4:.+]] = llvm.or %[[TILE_DIM_1]], %[[TILE_DIM_2_SHIFTED]]
+
+ // CHECK-DAG: %[[TENSOR_DIM_0_STRIDE:.+]] = llvm.mlir.constant(1 : i64) : i64
+ // CHECK-DAG: %[[MASK:.+]] = llvm.mlir.constant(281474976710655 : i64) : i64
+ // CHECK: %[[TENSOR_DIM_0_STRIDE_MASKED:.+]] = llvm.and %[[MASK]], %[[TENSOR_DIM_0_STRIDE]]
+ // CHECK-DAG: %[[SGPR5:.+]] = llvm.trunc %[[TENSOR_DIM_0_STRIDE_MASKED]] : i64 to i32
+ // CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(32 : i64) : i64
+ // CHECK: %[[TENSOR_DIM_0_STRIDE_HIGH_64:.+]] = llvm.lshr %[[TENSOR_DIM_0_STRIDE_MASKED]], %[[SHIFT]]
+ // CHECK: %[[SGPR6_0:.+]] = llvm.trunc %[[TENSOR_DIM_0_STRIDE_HIGH_64]] : i64 to i32
+
+ // CHECK-DAG: %[[TENSOR_DIM_1_STRIDE:.+]] = llvm.mlir.constant(64 : i64)
+ // CHECK-DAG: %[[MASK:.+]] = llvm.mlir.constant(281474976710655 : i64) : i64
+ // CHECK: %[[TENSOR_DIM_1_STRIDE_MASKED:.+]] = llvm.and %[[MASK]], %[[TENSOR_DIM_1_STRIDE]]
+ // CHECK-DAG: %[[TENSOR_DIM_1_STRIDE_LOW:.+]] = llvm.trunc %[[TENSOR_DIM_1_STRIDE_MASKED]]
+ // CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(16 : i32) : i32
+ // CHECK: %[[TENSOR_DIM_1_STRIDE_LOW_SHIFTED:.+]] = llvm.shl %[[TENSOR_DIM_1_STRIDE_LOW]], %[[SHIFT]]
+ // CHECK-DAG: %[[SGPR6:.+]] = llvm.or %[[SGPR6_0]], %[[TENSOR_DIM_1_STRIDE_LOW_SHIFTED]]
+
+ // CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(16 : i64) : i64
+ // CHECK: %[[TENSOR_DIM_1_STRIDE_SHIFTED:.+]] = llvm.lshr %[[TENSOR_DIM_1_STRIDE_MASKED]], %[[SHIFT]]
+ // CHECK: %[[SGPR7:.+]] = llvm.trunc %[[TENSOR_DIM_1_STRIDE_SHIFTED]] : i64 to i32
+
+ // CHECK: %[[V8I32:.+]] = llvm.mlir.poison : vector<8xi32>
+ // CHECK: %[[DGROUP1_0:.+]] = llvm.insertelement %[[SGPR0]], %[[V8I32]][%[[C0]] : i32]
+ // CHECK: %[[DGROUP1_1:.+]] = llvm.insertelement %[[SGPR1]], %[[DGROUP1_0]][%[[C1]] : i32]
+ // CHECK: %[[DGROUP1_2:.+]] = llvm.insertelement %[[SGPR2]], %[[DGROUP1_1]][%[[C2]] : i32]
+ // CHECK: %[[DGROUP1_3:.+]] = llvm.insertelement %[[SGPR3]], %[[DGROUP1_2]][%[[C3]] : i32]
+ // CHECK: %[[DGROUP1_4:.+]] = llvm.insertelement %[[SGPR4]], %[[DGROUP1_3]][%[[C4]] : i32]
+ // CHECK: %[[DGROUP1_5:.+]] = llvm.insertelement %[[SGPR5]], %[[DGROUP1_4]][%[[C5]] : i32]
+ // CHECK: %[[DGROUP1_6:.+]] = llvm.insertelement %[[SGPR6]], %[[DGROUP1_5]][%[[C6]] : i32]
+ // CHECK: %[[DGROUP1:.+]] = llvm.insertelement %[[SGPR7]], %[[DGROUP1_6]][%[[C7]] : i32]
+
+ // CHECK-DAG: %[[SGPR0:.+]] = llvm.mlir.constant([[TENSOR_DIM_2:64]] : i32)
+
+ // CHECK-DAG: %[[SGPR1:.+]] = llvm.mlir.constant([[TENSOR_DIM_3:64]] : i32)
+
+ // CHECK-DAG: %[[TENSOR_DIM_1_STRIDE:.+]] = llvm.mlir.constant(64 : i64)
+ // CHECK-DAG: %[[MASK:.+]] = llvm.mlir.constant(281474976710655 : i64) : i64
+ // CHECK: %[[TENSOR_DIM_2_STRIDE_MASKED:.+]] = llvm.and %[[MASK]], %[[TENSOR_DIM_1_STRIDE]]
+ // CHECK-DAG: %[[SGPR2:.+]] = llvm.trunc %[[TENSOR_DIM_2_STRIDE_MASKED]]
+
+ // CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(32 : i64) : i64
+ // CHECK: %[[TENSOR_DIM_2_STRIDE_HIGH_64:.+]] = llvm.lshr %[[TENSOR_DIM_2_STRIDE_MASKED]], %[[SHIFT]]
+ // CHECK: %[[SGPR3_0:.+]] = llvm.trunc %[[TENSOR_DIM_2_STRIDE_HIGH_64]] : i64 to i32
+
+ // CHECK-DAG: %[[TILE_DIM_3:.+]] = llvm.mlir.constant(64 : i32) : i32
+ // CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(16 : i32) : i32
+ // CHECK: %[[TILE_DIM_3_SHIFTED:.+]] = llvm.shl %[[TILE_DIM_3]], %[[SHIFT]]
+ // CHECK: %[[SGPR3:.+]] = llvm.or %[[SGPR3_0]], %[[TILE_DIM_3_SHIFTED]]
+
+ // CHECK-DAG: %[[V4I32:.+]] = llvm.mlir.poison : vector<4xi32>
+ // CHECK: %[[DGROUP2_0:.+]] = llvm.insertelement %[[SGPR0]], %[[V4I32]][%[[C0]] : i32]
+ // CHECK: %[[DGROUP2_1:.+]] = llvm.insertelement %[[SGPR1]], %[[DGROUP2_0]][%[[C1]] : i32]
+ // CHECK: %[[DGROUP2_2:.+]] = llvm.insertelement %[[SGPR2]], %[[DGROUP2_1]][%[[C2]] : i32]
+ // CHECK: %[[DGROUP2:.+]] = llvm.insertelement %[[SGPR3]], %[[DGROUP2_2]][%[[C3]] : i32]
+
+ // CHECK-DAG: %[[TENSOR_DIM3_STRIDE:.+]] = llvm.mlir.constant(64 : i64)
+ // CHECK-DAG: %[[MASK:.+]] = llvm.mlir.constant(281474976710655 : i64) : i64
+ // CHECK: %[[TENSOR_DIM3_STRIDE_MASKED:.+]] = llvm.and %[[MASK]], %[[TENSOR_DIM3_STRIDE]]
+ // CHECK: %[[TENSOR_DIM3_STRIDE_LOW:.+]] = llvm.trunc %[[TENSOR_DIM3_STRIDE_MASKED]] : i64 to i32
+ // CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(32 : i64)
+ // CHECK: %[[TENSOR_DIM3_STRIDE_SHIFTED:.+]] = llvm.lshr %[[TENSOR_DIM3_STRIDE_MASKED]], %[[SHIFT]]
+ // CHECK: %[[TENSOR_DIM3_STRIDE_HIGH:.+]] = llvm.trunc %[[TENSOR_DIM3_STRIDE_SHIFTED]] : i64 to i32
+
+ // CHECK-DAG: %[[TENSOR_DIM_4:.+]] = llvm.mlir.constant(64 : i32)
+ // CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(16 : i32)
+ // CHECK-DAG: %[[TENSOR_DIM_4_LOW:.+]] = llvm.shl %[[TENSOR_DIM_4]], %[[SHIFT]]
+ // CHECK: %[[SGPR1:.+]] = llvm.or %[[TENSOR_DIM3_STRIDE_HIGH]], %[[TENSOR_DIM_4_LOW]]
+
+ // CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(16 : i32)
+ // CHECK: %[[SGPR2_0:.+]] = llvm.lshr %[[TENSOR_DIM_4]], %[[SHIFT]]
+
+ // CHECK-DAG: %[[TILE_DIM_4:.+]] = llvm.mlir.constant(64 : i32) : i32
+ // CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(16 : i32) : i32
+ // CHECK: %[[TILE_DIM_4_SHIFTED:.+]] = llvm.shl %[[TILE_DIM_4]], %[[SHIFT]]
+ // CHECK: %[[SGPR2:.+]] = llvm.or %[[SGPR2_0]], %[[TILE_DIM_4_SHIFTED]]
+
+ // CHECK: %[[V4I32:.+]] = llvm.mlir.poison : vector<4xi32>
+ // CHECK: %[[DGROUP3_0:.+]] = llvm.insertelement %[[TENSOR_DIM3_STRIDE_LOW]], %[[V4I32]][%[[C0]] : i32]
+ // CHECK: %[[DGROUP3_1:.+]] = llvm.insertelement %[[SGPR1]], %[[DGROUP3_0]][%[[C1]] : i32]
+ // CHECK: %[[DGROUP3_2:.+]] = llvm.insertelement %[[SGPR2]], %[[DGROUP3_1]][%[[C2]] : i32]
+ // CHECK: %[[DGROUP3:.+]] = llvm.insertelement %[[C0]], %[[DGROUP3_2]][%[[C3]] : i32]
+
+ // CHECK: %[[DGROUPS:.+]] = builtin.unrealized_conversion_cast %[[DGROUP0]], %[[DGROUP1]], %[[DGROUP2]], %[[DGROUP3]] : vector<4xi32>, vector<8xi32>, vector<4xi32>, vector<4xi32> to !amdgpu.tdm_descriptor
+ %descriptor = amdgpu.make_dma_descriptor %base globalSize [64, 64, 64, 128, 64] globalStride [64, 64, 64, 64, 1] sharedSize [64, 64, 64, 128, 64] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+ func.return %descriptor : !amdgpu.tdm_descriptor
+}
+
// -----
// CHECK-LABEL: func @make_dma_descriptor_workgroup_mask
-// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>, %[[WG_MASK:.+]]: i16, %[[TIMEOUT:.+]]: i1)
-func.func @make_dma_descriptor_workgroup_mask(%base: !amdgpu.tdm_base<i32>, %wg_mask: i16, %timeout: i1) -> !amdgpu.tdm_descriptor {
+// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>, %[[WG_MASK:.+]]: vector<16xi1>, %[[TIMEOUT:.+]]: i1)
+func.func @make_dma_descriptor_workgroup_mask(%base: !amdgpu.tdm_base<i32>, %wg_mask: vector<16xi1>, %timeout: i1) -> !amdgpu.tdm_descriptor {
// CHECK-DAG: %[[DGROUP0:.+]] = builtin.unrealized_conversion_cast %[[BASE]]
// CHECK-DAG: %[[C0:.+]] = llvm.mlir.constant(0 : i32)
@@ -381,28 +655,31 @@ func.func @make_dma_descriptor_workgroup_mask(%base: !amdgpu.tdm_base<i32>, %wg_
// CHECK-DAG: %[[C6:.+]] = llvm.mlir.constant(6 : i32)
// CHECK-DAG: %[[C7:.+]] = llvm.mlir.constant(7 : i32)
- // CHECK-DAG: %[[WG_MASK_EXT:.+]] = llvm.zext %[[WG_MASK]]
- // CHECK-DAG: %[[DATA_SIZE:.+]] = llvm.mlir.constant(2 : i32)
+ // CHECK: %[[WG_MASK_CAST:.+]] = llvm.bitcast %[[WG_MASK]] : vector<16xi1> to i16
+ // CHECK-DAG: %[[WG_MASK_EXT:.+]] = llvm.zext %[[WG_MASK_CAST]]
// CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
- // CHECK: %[[DATA_SIZE_SHIFTED:.+]] = llvm.shl %[[DATA_SIZE]], %[[C16]]
+ // CHECK: %[[DATA_SIZE_SHIFTED:.+]] = llvm.shl %[[C2]], %[[C16]]
// CHECK: %[[SGPR0_BASE:.+]] = llvm.or %[[WG_MASK_EXT]], %[[DATA_SIZE_SHIFTED]]
+
// CHECK-DAG: %[[C21:.+]] = llvm.mlir.constant(21 : i32)
// CHECK: %[[TIMEOUT_SHIFTED:.+]] = llvm.shl %[[C1]], %[[C21]]
// CHECK: %[[SGPR0:.+]] = llvm.or %[[SGPR0_BASE]], %[[TIMEOUT_SHIFTED]]
// CHECK-DAG: %[[TENSOR_DIM_0:.+]] = llvm.mlir.constant(64 : i32)
// CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
- // CHECK: %[[SGPR2_0:.+]] = llvm.lshr %[[TENSOR_DIM_0]], %[[C16]]
- // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
// CHECK: %[[SGPR1:.+]] = llvm.shl %[[TENSOR_DIM_0]], %[[C16]]
- // CHECK-DAG: %[[TENSOR_DIM_1:.+]] = llvm.mlir.constant(128 : i32)
// CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
- // CHECK: %[[SGPR3_0:.+]] = llvm.lshr %[[TENSOR_DIM_1]], %[[C16]]
+ // CHECK: %[[SGPR2_0:.+]] = llvm.lshr %[[TENSOR_DIM_0]], %[[C16]]
+
+ // CHECK-DAG: %[[TENSOR_DIM_1:.+]] = llvm.mlir.constant(128 : i32)
// CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
// CHECK: %[[TENSOR_DIM_1_SHIFTED:.+]] = llvm.shl %[[TENSOR_DIM_1]], %[[C16]]
// CHECK: %[[SGPR2:.+]] = llvm.or %[[SGPR2_0]], %[[TENSOR_DIM_1_SHIFTED]]
+ // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
+ // CHECK: %[[SGPR3_0:.+]] = llvm.lshr %[[TENSOR_DIM_1]], %[[C16]]
+
// CHECK-DAG: %[[TILE_DIM_0:.+]] = llvm.mlir.constant(64 : i32)
// CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
// CHECK: %[[TILE_DIM_0_SHIFTED:.+]] = llvm.shl %[[TILE_DIM_0:.+]], %[[C16]]
@@ -414,6 +691,7 @@ func.func @make_dma_descriptor_workgroup_mask(%base: !amdgpu.tdm_base<i32>, %wg_
// CHECK-DAG: %[[MASK:.+]] = llvm.mlir.constant(281474976710655 : i64) : i64
// CHECK: %[[TENSOR_DIM_0_STRIDE_MASKED:.+]] = llvm.and %[[MASK]], %[[TENSOR_DIM_0_STRIDE]]
// CHECK-DAG: %[[SGPR5:.+]] = llvm.trunc %[[TENSOR_DIM_0_STRIDE_MASKED]] : i64 to i32
+
// CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(32 : i64) : i64
// CHECK: %[[TENSOR_DIM_0_STRIDE_HIGH_64:.+]] = llvm.lshr %[[TENSOR_DIM_0_STRIDE_MASKED]], %[[SHIFT]]
// CHECK: %[[SGPR6_0:.+]] = llvm.trunc %[[TENSOR_DIM_0_STRIDE_HIGH_64]] : i64 to i32
@@ -422,13 +700,15 @@ func.func @make_dma_descriptor_workgroup_mask(%base: !amdgpu.tdm_base<i32>, %wg_
// CHECK-DAG: %[[MASK:.+]] = llvm.mlir.constant(281474976710655 : i64) : i64
// CHECK: %[[TENSOR_DIM_1_STRIDE_MASKED:.+]] = llvm.and %[[MASK]], %[[TENSOR_DIM_1_STRIDE]]
// CHECK-DAG: %[[TENSOR_DIM_1_STRIDE_LOW:.+]] = llvm.trunc %[[TENSOR_DIM_1_STRIDE_MASKED]]
- // CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(16 : i64) : i64
- // CHECK: %[[TENSOR_DIM_1_STRIDE_SHIFTED:.+]] = llvm.lshr %[[TENSOR_DIM_1_STRIDE_MASKED]], %[[SHIFT]]
- // CHECK: %[[SGPR7:.+]] = llvm.trunc %[[TENSOR_DIM_1_STRIDE_SHIFTED]] : i64 to i32
+
// CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(16 : i32) : i32
// CHECK: %[[TENSOR_DIM_1_STRIDE_LOW_SHIFTED:.+]] = llvm.shl %[[TENSOR_DIM_1_STRIDE_LOW]], %[[SHIFT]]
// CHECK-DAG: %[[SGPR6:.+]] = llvm.or %[[SGPR6_0]], %[[TENSOR_DIM_1_STRIDE_LOW_SHIFTED]]
+ // CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(16 : i64) : i64
+ // CHECK: %[[TENSOR_DIM_1_STRIDE_SHIFTED:.+]] = llvm.lshr %[[TENSOR_DIM_1_STRIDE_MASKED]], %[[SHIFT]]
+ // CHECK: %[[SGPR7:.+]] = llvm.trunc %[[TENSOR_DIM_1_STRIDE_SHIFTED]] : i64 to i32
+
// CHECK: %[[V8I32:.+]] = llvm.mlir.poison : vector<8xi32>
// CHECK: %[[DGROUP1_0:.+]] = llvm.insertelement %[[SGPR0]], %[[V8I32]][%[[C0]] : i32]
// CHECK: %[[DGROUP1_1:.+]] = llvm.insertelement %[[SGPR1]], %[[DGROUP1_0]][%[[C1]] : i32]
@@ -439,7 +719,10 @@ func.func @make_dma_descriptor_workgroup_mask(%base: !amdgpu.tdm_base<i32>, %wg_
// CHECK: %[[DGROUP1_6:.+]] = llvm.insertelement %[[SGPR6]], %[[DGROUP1_5]][%[[C6]] : i32]
// CHECK: %[[DGROUP1:.+]] = llvm.insertelement %[[SGPR7]], %[[DGROUP1_6]][%[[C7]] : i32]
- // CHECK: %[[DGROUPS:.+]] = builtin.unrealized_conversion_cast %[[DGROUP0]], %[[DGROUP1]] : vector<4xi32>, vector<8xi32> to !amdgpu.tdm_descriptor
+ // CHECK: %[[DGROUP2:.+]] = llvm.mlir.zero : vector<4xi32>
+ // CHECK: %[[DGROUP3:.+]] = llvm.mlir.zero : vector<4xi32>
+
+ // CHECK: %[[DGROUPS:.+]] = builtin.unrealized_conversion_cast %[[DGROUP0]], %[[DGROUP1]], %[[DGROUP2]], %[[DGROUP3]] : vector<4xi32>, vector<8xi32>, vector<4xi32>, vector<4xi32> to !amdgpu.tdm_descriptor
%descriptor = amdgpu.make_dma_descriptor %base globalSize [128, 64] globalStride [64, 1] sharedSize [128, 64] workgroupMask %wg_mask earlyTimeout %timeout : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
func.return %descriptor : !amdgpu.tdm_descriptor
}
diff --git a/mlir/test/Dialect/AMDGPU/amdgpu-make-dma-descriptor-fold.mlir b/mlir/test/Dialect/AMDGPU/amdgpu-make-dma-descriptor-fold.mlir
index 9d43c9940f8e0..dcb385384a2b8 100644
--- a/mlir/test/Dialect/AMDGPU/amdgpu-make-dma-descriptor-fold.mlir
+++ b/mlir/test/Dialect/AMDGPU/amdgpu-make-dma-descriptor-fold.mlir
@@ -1,8 +1,8 @@
// RUN: mlir-opt --canonicalize %s | FileCheck %s
// CHECK-LABEL: @make_dma_descriptor_fold
-// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>, %[[IDX:.+]]: index)
-func.func @make_dma_descriptor_fold(%base: !amdgpu.tdm_base<i32>, %idx: index) -> !amdgpu.tdm_descriptor {
+// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>, %[[IDX:.+]]: index, %[[I32:.+]]: i32)
+func.func @make_dma_descriptor_fold(%base: !amdgpu.tdm_base<i32>, %idx: index, %i32: i32) -> !amdgpu.tdm_descriptor {
%c64 = arith.constant 64 : index
// CHECK: amdgpu.make_dma_descriptor %[[BASE]]
@@ -13,7 +13,7 @@ func.func @make_dma_descriptor_fold(%base: !amdgpu.tdm_base<i32>, %idx: index) -
globalStride [%c64, 1]
// CHECK-SAME: sharedSize [64, 64]
sharedSize [%c64, %c64]
- iterate %idx, %idx, %idx
+ iterate %idx, %i32, %idx
: !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
func.return %0 : !amdgpu.tdm_descriptor
}
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index 651aff4a0d22a..6a054fcc2ba71 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -697,8 +697,8 @@ func.func @make_dma_base(%idx: index, %mem: memref<8xi32>, %smem: memref<8xi32,
}
// CHECK-LABEL: func @make_dma_descriptor
-// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>, %[[WG_MASK:.+]]: i16, %[[TIMEOUT:.+]]: i1, %[[BARRIER:.+]]: memref<8xi32, #gpu.address_space<workgroup>>, %[[IDX:.+]]: index)
-func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>, %wg_mask: i16, %timeout: i1, %barrier: memref<8xi32, #gpu.address_space<workgroup>>, %idx: index) {
+// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>, %[[WG_MASK:.+]]: vector<16xi1>, %[[TIMEOUT:.+]]: i1, %[[BARRIER:.+]]: memref<8xi32, #gpu.address_space<workgroup>>, %[[IDX:.+]]: index, %[[I32:.+]]: i32)
+func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>, %wg_mask: vector<16xi1>, %timeout: i1, %barrier: memref<8xi32, #gpu.address_space<workgroup>>, %idx: index, %i32: i32) {
// CHECK: amdgpu.make_dma_descriptor %[[BASE]]
amdgpu.make_dma_descriptor %base
@@ -717,8 +717,8 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>, %wg_mask: i16, %tim
globalStride [64, 1]
// CHECK-SAME: sharedSize [64, 64]
sharedSize [64, 64]
- // CHECK-SAME: padShared(%[[IDX]] every %[[IDX]])
- padShared(%idx every %idx)
+ // CHECK-SAME: padShared(%[[I32]] every %[[I32]])
+ padShared(%i32 every %i32)
: !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
// CHECK: amdgpu.make_dma_descriptor %[[BASE]]
@@ -767,8 +767,8 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>, %wg_mask: i16, %tim
globalStride [64, 1]
// CHECK-SAME: sharedSize [64, 64]
sharedSize [64, 64]
- // CHECK-SAME: iterate %[[IDX]], %[[IDX]], %[[IDX]]
- iterate %idx, %idx, %idx
+ // CHECK-SAME: iterate %[[IDX]], %[[I32]], %[[IDX]]
+ iterate %idx, %i32, %idx
: !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
func.return
More information about the Mlir-commits mailing list