[Mlir-commits] [mlir] [mlir][amdgpu] Continue lowering make_tdm_descriptor. (PR #171498)
Erick Ochoa Lopez
llvmlistbot at llvm.org
Wed Dec 10 11:08:01 PST 2025
https://github.com/amd-eochoalo updated https://github.com/llvm/llvm-project/pull/171498
>From 360a927fa268a48848499b56c5cfe51f32fc2819 Mon Sep 17 00:00:00 2001
From: Erick Ochoa <erick.ochoalopez at amd.com>
Date: Thu, 4 Dec 2025 10:25:39 -0500
Subject: [PATCH 01/10] [mlir][amdgpu] Lowering for make_tdm_descriptor.
Continues the lowering of make_tdm_descriptor to support
load and store operations that require 4 descriptors.
---
mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 2 +-
.../AMDGPUToROCDL/AMDGPUToROCDL.cpp | 262 ++++++++++++++++--
.../Conversion/AMDGPUToROCDL/gfx1250.mlir | 188 ++++++++++++-
3 files changed, 419 insertions(+), 33 deletions(-)
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 56160d3e8fe85..e9a8684dc61fe 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -1335,7 +1335,7 @@ def AMDGPU_MakeDmaDescriptorOp :
2D and 3D tensors may be iterated over by setting $global_increment, $lds_increment, and $iteration_count.
$global_increment determines how much to increment the starting global memory address per iteration in units of the $base's element type.
$lds_increment determines how much to increment the starting LDS address per iteration in units of the $base's element type.
- $iterate_count determines how many times to iterate.
+ $iteration_count determines how many times to iterate; it must be a value in the inclusive interval [1, 256].
```mlir
// Example of moving a two-dimensional tensor to LDS.
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 7584b17075225..a39191796493f 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -2498,47 +2498,47 @@ struct AMDGPUMakeDmaDescriptorLowering
return setValueAtOffset(rewriter, loc, sgpr1, atomicBarrierAddress, 32);
}
- std::pair<Value, Value> setTensorDim0(MakeDmaDescriptorOp op,
+ std::pair<Value, Value> setTensorDimX(MakeDmaDescriptorOp op,
OpAdaptor adaptor,
ConversionPatternRewriter &rewriter,
Location loc, Value sgpr1, Value sgpr2,
- ArrayRef<Value> consts) const {
+ ArrayRef<Value> consts, uint64_t dimX,
+ uint32_t offset) const {
SmallVector<OpFoldResult> mixedGlobalSizes = op.getMixedGlobalSizes();
- OpFoldResult tensorDim0OpFoldResult = mixedGlobalSizes.back();
- Value tensorDim0;
- if (auto attr = dyn_cast<Attribute>(tensorDim0OpFoldResult))
- tensorDim0 =
+ if (mixedGlobalSizes.size() <= dimX)
+ return {sgpr1, sgpr2};
+
+ OpFoldResult tensorDimXOpFoldResult = *(mixedGlobalSizes.rbegin() + dimX);
+ Value tensorDimX;
+ if (auto attr = dyn_cast<Attribute>(tensorDimXOpFoldResult))
+ tensorDimX =
createI32Constant(rewriter, loc, cast<IntegerAttr>(attr).getInt());
else
- tensorDim0 = cast<Value>(tensorDim0OpFoldResult);
+ tensorDimX = cast<Value>(tensorDimXOpFoldResult);
Value c16 = createI32Constant(rewriter, loc, 16);
- Value tensorDim0High = LLVM::LShrOp::create(rewriter, loc, tensorDim0, c16);
- sgpr1 = setValueAtOffset(rewriter, loc, sgpr1, tensorDim0, 48);
- sgpr2 = setValueAtOffset(rewriter, loc, sgpr2, tensorDim0High, 48 + 16);
+ Value tensorDimXHigh = LLVM::LShrOp::create(rewriter, loc, tensorDimX, c16);
+ sgpr1 = setValueAtOffset(rewriter, loc, sgpr1, tensorDimX, offset);
+ sgpr2 = setValueAtOffset(rewriter, loc, sgpr2, tensorDimXHigh, offset + 16);
return {sgpr1, sgpr2};
}
+ std::pair<Value, Value> setTensorDim0(MakeDmaDescriptorOp op,
+ OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter,
+ Location loc, Value sgpr1, Value sgpr2,
+ ArrayRef<Value> consts) const {
+ return setTensorDimX(op, adaptor, rewriter, loc, sgpr1, sgpr2, consts, 0,
+ 48);
+ }
+
std::pair<Value, Value> setTensorDim1(MakeDmaDescriptorOp op,
OpAdaptor adaptor,
ConversionPatternRewriter &rewriter,
Location loc, Value sgpr2, Value sgpr3,
ArrayRef<Value> consts) const {
- // TODO: Generalize to setTensorDimX.
- SmallVector<OpFoldResult> mixedGlobalSizes = op.getMixedGlobalSizes();
- OpFoldResult tensorDim1OpFoldResult = *(mixedGlobalSizes.rbegin() + 1);
- Value tensorDim1;
- if (auto attr = dyn_cast<Attribute>(tensorDim1OpFoldResult))
- tensorDim1 =
- createI32Constant(rewriter, loc, cast<IntegerAttr>(attr).getInt());
- else
- tensorDim1 = cast<Value>(tensorDim1OpFoldResult);
-
- Value c16 = createI32Constant(rewriter, loc, 16);
- Value tensorDim1High = LLVM::LShrOp::create(rewriter, loc, tensorDim1, c16);
- sgpr2 = setValueAtOffset(rewriter, loc, sgpr2, tensorDim1, 80);
- sgpr3 = setValueAtOffset(rewriter, loc, sgpr3, tensorDim1High, 80 + 16);
- return {sgpr2, sgpr3};
+ return setTensorDimX(op, adaptor, rewriter, loc, sgpr2, sgpr3, consts, 1,
+ 80);
}
Value setTileDimX(MakeDmaDescriptorOp op, OpAdaptor adaptor,
@@ -2680,6 +2680,210 @@ struct AMDGPUMakeDmaDescriptorLowering
return dgroup1;
}
+ Value setTensorDimX(MakeDmaDescriptorOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter, Location loc,
+ Value sgpr0, ArrayRef<Value> consts, int64_t dimX,
+ int64_t offset) const {
+ SmallVector<OpFoldResult> mixedGlobalSizes = op.getMixedGlobalSizes();
+ if (mixedGlobalSizes.size() <= static_cast<unsigned long>(dimX))
+ return sgpr0;
+
+ OpFoldResult tensorDimXOpFoldResult = *(mixedGlobalSizes.rbegin() + dimX);
+ Value tensorDimX;
+ if (auto attr = dyn_cast<Attribute>(tensorDimXOpFoldResult))
+ tensorDimX =
+ createI32Constant(rewriter, loc, cast<IntegerAttr>(attr).getInt());
+ else
+ tensorDimX = cast<Value>(tensorDimXOpFoldResult);
+ return setValueAtOffset(rewriter, loc, sgpr0, tensorDimX, offset);
+ }
+
+ Value setTensorDim2(MakeDmaDescriptorOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter, Location loc,
+ Value sgpr0, ArrayRef<Value> consts) const {
+ return setTensorDimX(op, adaptor, rewriter, loc, sgpr0, consts, 2, 0);
+ }
+
+ Value truncateAndSetValueAtOffset(ConversionPatternRewriter &rewriter,
+ Location loc, Value accumulator,
+ Value value, int64_t shift) const {
+
+ IntegerType i32 = rewriter.getI32Type();
+ value = LLVM::TruncOp::create(rewriter, loc, i32, value);
+ return setValueAtOffset(rewriter, loc, accumulator, value, shift);
+ }
+
+ Value setLDSAddrIncrement(MakeDmaDescriptorOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter, Location loc,
+ Value sgpr1, ArrayRef<Value> consts,
+ int64_t offset) const {
+ Value ldsAddrIncrement = adaptor.getLdsIncrement();
+ return truncateAndSetValueAtOffset(rewriter, loc, sgpr1, ldsAddrIncrement,
+ offset);
+ }
+
+ std::pair<Value, Value>
+ setGlobalAddrIncrement(MakeDmaDescriptorOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter, Location loc,
+ Value sgpr2, Value sgpr3, ArrayRef<Value> consts,
+ int64_t offset) const {
+ Value globalAddrIncrement = adaptor.getGlobalIncrement();
+ sgpr2 = truncateAndSetValueAtOffset(rewriter, loc, sgpr2,
+ globalAddrIncrement, offset);
+ Value shift = createI64Constant(rewriter, loc, 32);
+ globalAddrIncrement =
+ LLVM::LShrOp::create(rewriter, loc, globalAddrIncrement, shift);
+ constexpr int64_t first16BitsHigh = (1ll << 16) - 1;
+ sgpr3 = truncateAndSetValueAtOffset(rewriter, loc, sgpr3,
+ globalAddrIncrement, offset + 32);
+ Value mask = createI32Constant(rewriter, loc, first16BitsHigh);
+ sgpr3 = LLVM::AndOp::create(rewriter, loc, sgpr3, mask);
+ return {sgpr2, sgpr3};
+ }
+
+ Value setTensorDim3OrLDSAddrIncrement(MakeDmaDescriptorOp op,
+ OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter,
+ Location loc, Value sgpr1,
+ ArrayRef<Value> consts) const {
+ Value ldsIncrement = op.getLdsIncrement();
+ constexpr int64_t dim = 3;
+ constexpr int64_t offset = 32;
+ if (!ldsIncrement)
+ return setTensorDimX(op, adaptor, rewriter, loc, sgpr1, consts, dim,
+ offset);
+ return setLDSAddrIncrement(op, adaptor, rewriter, loc, sgpr1, consts,
+ offset);
+ }
+
+ std::pair<Value, Value> setTensorDim2StrideOrGlobalAddrIncrement(
+ MakeDmaDescriptorOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter, Location loc, Value sgpr2,
+ Value sgpr3, ArrayRef<Value> consts) const {
+ Value globalIncrement = op.getGlobalIncrement();
+ constexpr int32_t dim = 2;
+ constexpr int32_t offset = 64;
+ if (!globalIncrement)
+ return setTensorDimXStride(op, adaptor, rewriter, loc, sgpr2, sgpr3,
+ consts, dim, offset);
+ return setGlobalAddrIncrement(op, adaptor, rewriter, loc, sgpr2, sgpr3,
+ consts, offset);
+ }
+
+ Value setIterateCount(MakeDmaDescriptorOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter, Location loc,
+ Value sgpr3, ArrayRef<Value> consts,
+ int32_t offset) const {
+ Value iterationCount = adaptor.getIterationCount();
+ IntegerType i32 = rewriter.getI32Type();
+ // pre-condition: iterationCount is in the inclusive interval [1, 256].
+ iterationCount = LLVM::TruncOp::create(rewriter, loc, i32, iterationCount);
+ iterationCount =
+ LLVM::SubOp::create(rewriter, loc, iterationCount, consts[1]);
+ return setValueAtOffset(rewriter, loc, sgpr3, iterationCount, offset);
+ }
+
+ Value setTileDim3OrIterateCount(MakeDmaDescriptorOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter,
+ Location loc, Value sgpr3,
+ ArrayRef<Value> consts) const {
+ Value iterateCount = op.getIterationCount();
+ constexpr int32_t dim = 2;
+ constexpr int32_t offset = 112;
+ if (!iterateCount)
+ return setTileDimX(op, adaptor, rewriter, loc, sgpr3, consts, dim,
+ offset);
+
+ return setIterateCount(op, adaptor, rewriter, loc, sgpr3, consts, offset);
+ }
+
+ Value getDGroup2(MakeDmaDescriptorOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter, Location loc,
+ ArrayRef<Value> consts) const {
+ IntegerType i32 = rewriter.getI32Type();
+ Type v4i32 = this->typeConverter->convertType(VectorType::get(4, i32));
+ assert(v4i32 && "expected type conversion to succeed.");
+
+ if (!op.getLdsIncrement() && op.getRank() == 2)
+ return LLVM::PoisonOp::create(rewriter, loc, v4i32);
+
+ constexpr int64_t sgprlen = 4;
+ Value sgprs[sgprlen];
+ for (int i = 0; i < sgprlen; i++)
+ sgprs[i] = consts[0];
+
+ sgprs[0] = setTensorDim2(op, adaptor, rewriter, loc, sgprs[0], consts);
+ sgprs[1] = setTensorDim3OrLDSAddrIncrement(op, adaptor, rewriter, loc,
+ sgprs[1], consts);
+ std::tie(sgprs[2], sgprs[3]) = setTensorDim2StrideOrGlobalAddrIncrement(
+ op, adaptor, rewriter, loc, sgprs[2], sgprs[3], consts);
+ sgprs[3] =
+ setTileDim3OrIterateCount(op, adaptor, rewriter, loc, sgprs[3], consts);
+
+ Value dgroup2 = LLVM::PoisonOp::create(rewriter, loc, v4i32);
+ for (auto [sgpr, constant] : llvm::zip(sgprs, consts))
+ dgroup2 =
+ LLVM::InsertElementOp::create(rewriter, loc, dgroup2, sgpr, constant);
+
+ return dgroup2;
+ }
+
+ std::pair<Value, Value>
+ setTensorDim3Stride(MakeDmaDescriptorOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter, Location loc,
+ Value sgpr0, Value sgpr1, ArrayRef<Value> consts) const {
+ constexpr int32_t dim = 3;
+ constexpr int32_t offset = 0;
+ return setTensorDimXStride(op, adaptor, rewriter, loc, sgpr0, sgpr1, consts,
+ dim, offset);
+ }
+
+ std::pair<Value, Value> setTensorDim4(MakeDmaDescriptorOp op,
+ OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter,
+ Location loc, Value sgpr1, Value sgpr2,
+ ArrayRef<Value> consts) const {
+ constexpr int32_t dim = 4;
+ constexpr int32_t offset = 48;
+ return setTensorDimX(op, adaptor, rewriter, loc, sgpr1, sgpr2, consts, dim, offset);
+ }
+
+ Value setTileDim4(MakeDmaDescriptorOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter, Location loc,
+ Value sgpr2, ArrayRef<Value> consts) const {
+ constexpr int32_t dim = 4;
+ constexpr int32_t offset = 80;
+ return setTileDimX(op, adaptor, rewriter, loc, sgpr2, consts, dim, offset);
+ }
+
+ Value getDGroup3(MakeDmaDescriptorOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter, Location loc,
+ ArrayRef<Value> consts) const {
+ IntegerType i32 = rewriter.getI32Type();
+ Type v4i32 = this->typeConverter->convertType(VectorType::get(4, i32));
+ assert(v4i32 && "expected type conversion to succeed.");
+ if (!op.getLdsIncrement() && op.getRank() == 2)
+ return LLVM::PoisonOp::create(rewriter, loc, v4i32);
+
+ constexpr int32_t sgprlen = 4;
+ Value sgprs[sgprlen];
+ for (int i = 0; i < sgprlen; i++)
+ sgprs[i] = consts[0];
+
+ std::tie(sgprs[0], sgprs[1]) = setTensorDim3Stride(
+ op, adaptor, rewriter, loc, sgprs[0], sgprs[1], consts);
+ std::tie(sgprs[1], sgprs[2]) =
+ setTensorDim4(op, adaptor, rewriter, loc, sgprs[1], sgprs[2], consts);
+ sgprs[2] = setTileDim4(op, adaptor, rewriter, loc, sgprs[2], consts);
+
+ Value dgroup3 = LLVM::PoisonOp::create(rewriter, loc, v4i32);
+ for (auto [sgpr, constant] : llvm::zip(sgprs, consts))
+ dgroup3 =
+ LLVM::InsertElementOp::create(rewriter, loc, dgroup3, sgpr, constant);
+
+ return dgroup3;
+ }
+
LogicalResult
matchAndRewrite(MakeDmaDescriptorOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
@@ -2687,9 +2891,6 @@ struct AMDGPUMakeDmaDescriptorLowering
return op->emitOpError(
"make_dma_descriptor is only supported on gfx1250");
- if (op.getRank() > 2)
- return op->emitOpError("unimplemented");
-
Location loc = op.getLoc();
IntegerType i32 = rewriter.getI32Type();
@@ -2703,8 +2904,9 @@ struct AMDGPUMakeDmaDescriptorLowering
Value dgroup0 = this->getDGroup0(adaptor);
Value dgroup1 = this->getDGroup1(op, adaptor, rewriter, loc, consts);
-
- SmallVector<Value> results = {dgroup0, dgroup1};
+ Value dgroup2 = this->getDGroup2(op, adaptor, rewriter, loc, consts);
+ Value dgroup3 = this->getDGroup3(op, adaptor, rewriter, loc, consts);
+ SmallVector<Value> results = {dgroup0, dgroup1, dgroup2, dgroup3};
rewriter.replaceOpWithMultiple(op, {results});
return success();
}
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
index a94e17ab5b9a5..d15221c3368ee 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
@@ -304,7 +304,10 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>) -> !amdgpu.tdm_desc
// CHECK: %[[DGROUP1_6:.+]] = llvm.insertelement %[[SGPR6]], %[[DGROUP1_5]][%[[C6]] : i32]
// CHECK: %[[DGROUP1:.+]] = llvm.insertelement %[[SGPR7]], %[[DGROUP1_6]][%[[C7]] : i32]
- // CHECK: %[[DGROUPS:.+]] = builtin.unrealized_conversion_cast %[[DGROUP0]], %[[DGROUP1]] : vector<4xi32>, vector<8xi32> to !amdgpu.tdm_descriptor
+ // CHECK: %[[DGROUP2:.+]] = llvm.mlir.poison : vector<4xi32>
+ // CHECK: %[[DGROUP3:.+]] = llvm.mlir.poison : vector<4xi32>
+
+ // CHECK: %[[DGROUPS:.+]] = builtin.unrealized_conversion_cast %[[DGROUP0]], %[[DGROUP1]], %[[DGROUP2]], %[[DGROUP3]] : vector<4xi32>, vector<8xi32>, vector<4xi32>, vector<4xi32> to !amdgpu.tdm_descriptor
%descriptor = amdgpu.make_dma_descriptor %base globalSize [128, 64] globalStride [64, 1] sharedSize [128, 64] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
func.return %descriptor : !amdgpu.tdm_descriptor
}
@@ -365,6 +368,184 @@ func.func @make_dma_descriptor_atomic_barrier(%base: !amdgpu.tdm_base<i32>, %bar
func.return %descriptor : !amdgpu.tdm_descriptor
}
+
+// -----
+
+// CHECK-LABEL: func @make_dma_descriptor_iterate
+// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>, %[[IDX:.+]]: index)
+func.func @make_dma_descriptor_iterate(%base: !amdgpu.tdm_base<i32>, %idx : index) -> !amdgpu.tdm_descriptor {
+ // CHECK-DAG: %[[DGROUP0:.+]] = builtin.unrealized_conversion_cast %[[BASE]]
+ // CHECK-DAG: %[[INDEX:.+]] = builtin.unrealized_conversion_cast %[[IDX]] : index to i64
+
+ // CHECK: %[[C0:.+]] = llvm.mlir.constant(0 : i32)
+ // CHECK: %[[C1:.+]] = llvm.mlir.constant(1 : i32)
+ // CHECK: %[[C2:.+]] = llvm.mlir.constant(2 : i32)
+ // CHECK: %[[C3:.+]] = llvm.mlir.constant(3 : i32)
+
+ // CHECK-DAG: %[[DATA_SIZE:.+]] = llvm.mlir.constant(2 : i32)
+ // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
+ // CHECK: %[[SGPR0_0:.+]] = llvm.shl %[[DATA_SIZE]], %[[C16]]
+
+ // CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(19 : i32)
+ // CHECK: %[[ITERATE_ENABLE:.+]] = llvm.shl %[[C1]], %[[SHIFT]]
+ // CHECK: %[[SGPR0:.+]] = llvm.or %[[SGPR0_0]], %[[ITERATE_ENABLE]]
+
+ // CHECK: %[[V8I32:.+]] = llvm.mlir.poison : vector<8xi32>
+ // CHECK: %[[DGROUP1_0:.+]] = llvm.insertelement %[[SGPR0]], %[[V8I32]][%[[C0]] : i32]
+
+
+ // CHECK: %[[SGPR1:.+]] = llvm.trunc %[[INDEX]] : i64 to i32
+ // CHECK: %[[SGPR2:.+]] = llvm.trunc %[[INDEX]] : i64 to i32
+ // CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(32 : i64) : i64
+ // CHECK: %[[GLOBAL_ADDR_INC_HIGH:.+]] = llvm.lshr %[[INDEX]], %[[SHIFT]]
+ // CHECK: %[[GLOBAL_ADDR_INC_HIGH_2:.+]] = llvm.trunc %[[GLOBAL_ADDR_INC_HIGH]] : i64 to i32
+ // CHECK-DAG: %[[MASK:.+]] = llvm.mlir.constant([[FIRST_16_HIGH:65535]] : i32) : i32
+ // CHECK: %[[SGPR3_LOW:.+]] = llvm.and %[[GLOBAL_ADDR_INC_HIGH_2]], %[[MASK]]
+
+ // CHECK: %[[ITERATE_COUNT:.+]] = llvm.trunc %[[INDEX]] : i64 to i32
+ // CHECK: %[[ITERATE_COUNT_M1:.+]] = llvm.sub %[[ITERATE_COUNT]], %[[C1]]
+ // CHECK: %[[SHIFT:.+]] = llvm.mlir.constant(16 : i32)
+ // CHECK: %[[ITERATE_COUNT_SHIFTED:.+]] = llvm.shl %[[ITERATE_COUNT_M1]], %[[SHIFT]]
+ // CHECK: %[[SGPR3:.+]] = llvm.or %[[SGPR3_LOW]], %[[ITERATE_COUNT_SHIFTED]]
+
+ // CHECK: %[[V4I32:.+]] = llvm.mlir.poison : vector<4xi32>
+ // CHECK: %[[DGROUP2_0:.+]] = llvm.insertelement %[[C0]], %[[V4I32]][%[[C0]]
+ // CHECK: %[[DGROUP2_1:.+]] = llvm.insertelement %[[SGPR1]], %[[DGROUP2_0]][%[[C1]]
+ // CHECK: %[[DGROUP2_2:.+]] = llvm.insertelement %[[SGPR2]], %[[DGROUP2_1]][%[[C2]]
+ // CHECK: %[[DGROUP2:.+]] = llvm.insertelement %[[SGPR3]], %[[DGROUP2_2]][%[[C3]]
+
+ %descriptor = amdgpu.make_dma_descriptor %base globalSize [128, 64] globalStride [64, 1] sharedSize [128, 64] iterate %idx, %idx, %idx : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+ func.return %descriptor : !amdgpu.tdm_descriptor
+}
+
+// -----
+
+// CHECK-LABEL: func @make_dma_descriptor
+// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>)
+func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>) -> !amdgpu.tdm_descriptor {
+
+ // CHECK-DAG: %[[DGROUP0:.+]] = builtin.unrealized_conversion_cast %[[BASE]]
+
+ // CHECK-DAG: %[[C0:.+]] = llvm.mlir.constant(0 : i32)
+ // CHECK-DAG: %[[C1:.+]] = llvm.mlir.constant(1 : i32)
+ // CHECK-DAG: %[[C2:.+]] = llvm.mlir.constant(2 : i32)
+ // CHECK-DAG: %[[C3:.+]] = llvm.mlir.constant(3 : i32)
+ // CHECK-DAG: %[[C4:.+]] = llvm.mlir.constant(4 : i32)
+ // CHECK-DAG: %[[C5:.+]] = llvm.mlir.constant(5 : i32)
+ // CHECK-DAG: %[[C6:.+]] = llvm.mlir.constant(6 : i32)
+ // CHECK-DAG: %[[C7:.+]] = llvm.mlir.constant(7 : i32)
+
+
+ // CHECK-DAG: %[[DATA_SIZE:.+]] = llvm.mlir.constant(2 : i32)
+ // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
+ // CHECK: %[[SGPR0:.+]] = llvm.shl %[[DATA_SIZE]], %[[C16]]
+
+ // CHECK-DAG: %[[TENSOR_DIM_0:.+]] = llvm.mlir.constant(64 : i32)
+ // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
+ // CHECK: %[[SGPR2_0:.+]] = llvm.lshr %[[TENSOR_DIM_0]], %[[C16]]
+ // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
+ // CHECK: %[[SGPR1:.+]] = llvm.shl %[[TENSOR_DIM_0]], %[[C16]]
+
+ // CHECK-DAG: %[[TENSOR_DIM_1:.+]] = llvm.mlir.constant(128 : i32)
+ // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
+ // CHECK: %[[SGPR3_0:.+]] = llvm.lshr %[[TENSOR_DIM_1]], %[[C16]]
+ // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
+ // CHECK: %[[TENSOR_DIM_1_SHIFTED:.+]] = llvm.shl %[[TENSOR_DIM_1]], %[[C16]]
+ // CHECK: %[[SGPR2:.+]] = llvm.or %[[SGPR2_0]], %[[TENSOR_DIM_1_SHIFTED]]
+
+ // CHECK-DAG: %[[TILE_DIM_0:.+]] = llvm.mlir.constant(64 : i32)
+ // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
+ // CHECK: %[[TILE_DIM_0_SHIFTED:.+]] = llvm.shl %[[TILE_DIM_0]], %[[C16]]
+ // CHECK: %[[SGPR3:.+]] = llvm.or %[[SGPR3_0]], %[[TILE_DIM_0_SHIFTED]]
+
+ // CHECK-DAG: %[[TILE_DIM_1:.+]] = llvm.mlir.constant(128 : i32)
+ // CHECK-DAG: %[[TILE_DIM_2:.+]] = llvm.mlir.constant(64 : i32)
+ // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
+ // CHECK: %[[TILE_DIM_2_SHIFTED:.+]] = llvm.shl %[[TILE_DIM_2]], %[[C16]]
+ // CHECK: %[[SGPR4:.+]] = llvm.or %[[TILE_DIM_1]], %[[TILE_DIM_2_SHIFTED]]
+
+ // CHECK-DAG: %[[TENSOR_DIM_0_STRIDE:.+]] = llvm.mlir.constant(1 : i64) : i64
+ // CHECK-DAG: %[[MASK:.+]] = llvm.mlir.constant(281474976710655 : i64) : i64
+ // CHECK: %[[TENSOR_DIM_0_STRIDE_MASKED:.+]] = llvm.and %[[MASK]], %[[TENSOR_DIM_0_STRIDE]]
+ // CHECK-DAG: %[[SGPR5:.+]] = llvm.trunc %[[TENSOR_DIM_0_STRIDE_MASKED]] : i64 to i32
+ // CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(32 : i64) : i64
+ // CHECK: %[[TENSOR_DIM_0_STRIDE_HIGH_64:.+]] = llvm.lshr %[[TENSOR_DIM_0_STRIDE_MASKED]], %[[SHIFT]]
+ // CHECK: %[[SGPR6_0:.+]] = llvm.trunc %[[TENSOR_DIM_0_STRIDE_HIGH_64]] : i64 to i32
+
+ // CHECK-DAG: %[[TENSOR_DIM_1_STRIDE:.+]] = llvm.mlir.constant(64 : i64)
+ // CHECK-DAG: %[[MASK:.+]] = llvm.mlir.constant(281474976710655 : i64) : i64
+ // CHECK: %[[TENSOR_DIM_1_STRIDE_MASKED:.+]] = llvm.and %[[MASK]], %[[TENSOR_DIM_1_STRIDE]]
+ // CHECK-DAG: %[[TENSOR_DIM_1_STRIDE_LOW:.+]] = llvm.trunc %[[TENSOR_DIM_1_STRIDE_MASKED]]
+ // CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(16 : i64) : i64
+ // CHECK: %[[TENSOR_DIM_1_STRIDE_SHIFTED:.+]] = llvm.lshr %[[TENSOR_DIM_1_STRIDE_MASKED]], %[[SHIFT]]
+ // CHECK: %[[SGPR7:.+]] = llvm.trunc %[[TENSOR_DIM_1_STRIDE_SHIFTED]] : i64 to i32
+ // CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(16 : i32) : i32
+ // CHECK: %[[TENSOR_DIM_1_STRIDE_LOW_SHIFTED:.+]] = llvm.shl %[[TENSOR_DIM_1_STRIDE_LOW]], %[[SHIFT]]
+ // CHECK-DAG: %[[SGPR6:.+]] = llvm.or %[[SGPR6_0]], %[[TENSOR_DIM_1_STRIDE_LOW_SHIFTED]]
+
+ // CHECK: %[[V8I32:.+]] = llvm.mlir.poison : vector<8xi32>
+ // CHECK: %[[DGROUP1_0:.+]] = llvm.insertelement %[[SGPR0]], %[[V8I32]][%[[C0]] : i32]
+ // CHECK: %[[DGROUP1_1:.+]] = llvm.insertelement %[[SGPR1]], %[[DGROUP1_0]][%[[C1]] : i32]
+ // CHECK: %[[DGROUP1_2:.+]] = llvm.insertelement %[[SGPR2]], %[[DGROUP1_1]][%[[C2]] : i32]
+ // CHECK: %[[DGROUP1_3:.+]] = llvm.insertelement %[[SGPR3]], %[[DGROUP1_2]][%[[C3]] : i32]
+ // CHECK: %[[DGROUP1_4:.+]] = llvm.insertelement %[[SGPR4]], %[[DGROUP1_3]][%[[C4]] : i32]
+ // CHECK: %[[DGROUP1_5:.+]] = llvm.insertelement %[[SGPR5]], %[[DGROUP1_4]][%[[C5]] : i32]
+ // CHECK: %[[DGROUP1_6:.+]] = llvm.insertelement %[[SGPR6]], %[[DGROUP1_5]][%[[C6]] : i32]
+ // CHECK: %[[DGROUP1:.+]] = llvm.insertelement %[[SGPR7]], %[[DGROUP1_6]][%[[C7]] : i32]
+
+ // CHECK-DAG: %[[SGPR0:.+]] = llvm.mlir.constant([[TENSOR_DIM_2:64]] : i32)
+ // CHECK-DAG: %[[SGPR1:.+]] = llvm.mlir.constant([[TENSOR_DIM_3:64]] : i32)
+ // CHECK-DAG: %[[TENSOR_DIM_1_STRIDE:.+]] = llvm.mlir.constant(64 : i64)
+ // CHECK-DAG: %[[MASK:.+]] = llvm.mlir.constant(281474976710655 : i64) : i64
+ // CHECK: %[[TENSOR_DIM_2_STRIDE_MASKED:.+]] = llvm.and %[[MASK]], %[[TENSOR_DIM_1_STRIDE]]
+
+ // CHECK-DAG: %[[SGPR2:.+]] = llvm.trunc %[[TENSOR_DIM_2_STRIDE_MASKED]]
+ // CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(32 : i64) : i64
+ // CHECK: %[[TENSOR_DIM_2_STRIDE_HIGH_64:.+]] = llvm.lshr %[[TENSOR_DIM_2_STRIDE_MASKED]], %[[SHIFT]]
+ // CHECK: %[[SGPR3_0:.+]] = llvm.trunc %[[TENSOR_DIM_2_STRIDE_HIGH_64]] : i64 to i32
+
+ // CHECK-DAG: %[[TILE_DIM_3:.+]] = llvm.mlir.constant(64 : i32) : i32
+ // CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(16 : i32) : i32
+ // CHECK: %[[TILE_DIM_3_SHIFTED:.+]] = llvm.shl %[[TILE_DIM_3]], %[[SHIFT]]
+ // CHECK: %[[SGPR3:.+]] = llvm.or %[[SGPR3_0]], %[[TILE_DIM_3_SHIFTED]]
+
+ // CHECK-DAG: %[[V4I32:.+]] = llvm.mlir.poison : vector<4xi32>
+ // CHECK: %[[DGROUP2_0:.+]] = llvm.insertelement %[[SGPR0]], %[[V4I32]][%[[C0]] : i32]
+ // CHECK: %[[DGROUP2_1:.+]] = llvm.insertelement %[[SGPR1]], %[[DGROUP2_0]][%[[C1]] : i32]
+ // CHECK: %[[DGROUP2_2:.+]] = llvm.insertelement %[[SGPR2]], %[[DGROUP2_1]][%[[C2]] : i32]
+ // CHECK: %[[DGROUP2:.+]] = llvm.insertelement %[[SGPR3]], %[[DGROUP2_2]][%[[C3]] : i32]
+
+ // CHECK-DAG: %[[TENSOR_DIM3_STRIDE:.+]] = llvm.mlir.constant(64 : i64)
+ // CHECK-DAG: %[[MASK:.+]] = llvm.mlir.constant(281474976710655 : i64) : i64
+ // CHECK: %[[TENSOR_DIM3_STRIDE_MASKED:.+]] = llvm.and %[[MASK]], %[[TENSOR_DIM3_STRIDE]]
+ // CHECK: %[[TENSOR_DIM3_STRIDE_LOW:.+]] = llvm.trunc %[[TENSOR_DIM3_STRIDE_MASKED]] : i64 to i32
+ // CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(32 : i64)
+ // CHECK: %[[TENSOR_DIM3_STRIDE_SHIFTED:.+]] = llvm.lshr %[[TENSOR_DIM3_STRIDE_MASKED]], %[[SHIFT]]
+ // CHECK: %[[TENSOR_DIM3_STRIDE_HIGH:.+]] = llvm.trunc %[[TENSOR_DIM3_STRIDE_SHIFTED]] : i64 to i32
+
+ // CHECK-DAG: %[[TENSOR_DIM_4:.+]] = llvm.mlir.constant(64 : i32)
+ // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
+ // CHECK: %[[SGPR2_0:.+]] = llvm.lshr %[[TENSOR_DIM_4]], %[[C16]]
+ // CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(16 : i32)
+ // CHECK-DAG: %[[TENSOR_DIM_4_LOW:.+]] = llvm.shl %[[TENSOR_DIM_4]], %[[SHIFT]]
+ // CHECK: %[[SGPR1:.+]] = llvm.or %[[TENSOR_DIM3_STRIDE_HIGH]], %[[TENSOR_DIM_4_LOW]]
+
+ // CHECK-DAG: %[[TILE_DIM_4:.+]] = llvm.mlir.constant(64 : i32) : i32
+ // CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(16 : i32) : i32
+ // CHECK: %[[TILE_DIM_4_SHIFTED:.+]] = llvm.shl %[[TILE_DIM_4]], %[[SHIFT]]
+ // CHECK: %[[SGPR2:.+]] = llvm.or %[[SGPR2_0]], %[[TILE_DIM_4_SHIFTED]]
+
+ // CHECK: %[[V4I32:.+]] = llvm.mlir.poison : vector<4xi32>
+ // CHECK: %[[DGROUP3_0:.+]] = llvm.insertelement %[[TENSOR_DIM3_STRIDE_LOW]], %[[V4I32]][%[[C0]] : i32]
+ // CHECK: %[[DGROUP3_1:.+]] = llvm.insertelement %[[SGPR1]], %[[DGROUP3_0]][%[[C1]] : i32]
+ // CHECK: %[[DGROUP3_2:.+]] = llvm.insertelement %[[SGPR2]], %[[DGROUP3_1]][%[[C2]] : i32]
+ // CHECK: %[[DGROUP3:.+]] = llvm.insertelement %[[C0]], %[[DGROUP3_2]][%[[C3]] : i32]
+
+ // CHECK: %[[DGROUPS:.+]] = builtin.unrealized_conversion_cast %[[DGROUP0]], %[[DGROUP1]], %[[DGROUP2]], %[[DGROUP3]] : vector<4xi32>, vector<8xi32>, vector<4xi32>, vector<4xi32> to !amdgpu.tdm_descriptor
+ %descriptor = amdgpu.make_dma_descriptor %base globalSize [64, 64, 64, 128, 64] globalStride [64, 64, 64, 64, 1] sharedSize [64, 64, 64, 128, 64] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+ func.return %descriptor : !amdgpu.tdm_descriptor
+}
+
// -----
// CHECK-LABEL: func @make_dma_descriptor_workgroup_mask
@@ -439,7 +620,10 @@ func.func @make_dma_descriptor_workgroup_mask(%base: !amdgpu.tdm_base<i32>, %wg_
// CHECK: %[[DGROUP1_6:.+]] = llvm.insertelement %[[SGPR6]], %[[DGROUP1_5]][%[[C6]] : i32]
// CHECK: %[[DGROUP1:.+]] = llvm.insertelement %[[SGPR7]], %[[DGROUP1_6]][%[[C7]] : i32]
- // CHECK: %[[DGROUPS:.+]] = builtin.unrealized_conversion_cast %[[DGROUP0]], %[[DGROUP1]] : vector<4xi32>, vector<8xi32> to !amdgpu.tdm_descriptor
+ // CHECK: %[[DGROUP2:.+]] = llvm.mlir.poison : vector<4xi32>
+ // CHECK: %[[DGROUP3:.+]] = llvm.mlir.poison : vector<4xi32>
+
+ // CHECK: %[[DGROUPS:.+]] = builtin.unrealized_conversion_cast %[[DGROUP0]], %[[DGROUP1]], %[[DGROUP2]], %[[DGROUP3]] : vector<4xi32>, vector<8xi32>, vector<4xi32>, vector<4xi32> to !amdgpu.tdm_descriptor
%descriptor = amdgpu.make_dma_descriptor %base globalSize [128, 64] globalStride [64, 1] sharedSize [128, 64] workgroupMask %wg_mask earlyTimeout %timeout : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
func.return %descriptor : !amdgpu.tdm_descriptor
}
>From 69d150442af289438ed9b03d9c384ed5e45a1f77 Mon Sep 17 00:00:00 2001
From: Erick Ochoa <erick.ochoalopez at amd.com>
Date: Tue, 9 Dec 2025 13:34:35 -0500
Subject: [PATCH 02/10] [mlir][amdgpu] Refactoring.
Changing the order of operations to make lit tests
more readable.
---
.../AMDGPUToROCDL/AMDGPUToROCDL.cpp | 30 +++----
.../Conversion/AMDGPUToROCDL/gfx1250.mlir | 89 +++++++++++--------
2 files changed, 63 insertions(+), 56 deletions(-)
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index a39191796493f..61a85147cf842 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -2389,21 +2389,19 @@ struct AMDGPUMakeDmaDescriptorLowering
Value setDataSize(MakeDmaDescriptorOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter, Location loc,
Value sgpr0, ArrayRef<Value> consts) const {
- // Compute data_size.
unsigned elementTypeWidthInBits = op.getElementTypeWidth();
assert(
llvm::is_contained<unsigned>({8, 16, 32, 64}, elementTypeWidthInBits) &&
"expected type width to be 8, 16, 32, or 64.");
- int64_t dataSize = llvm::Log2_32(elementTypeWidthInBits / 8);
- Value size = createI32Constant(rewriter, loc, dataSize);
+ int64_t idx = llvm::Log2_32(elementTypeWidthInBits / 8);
+ Value size = consts[idx];
return setValueAtOffset(rewriter, loc, sgpr0, size, 16);
}
Value setAtomicBarrier(MakeDmaDescriptorOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter, Location loc,
Value sgpr0, ArrayRef<Value> consts) const {
- bool atomic_barrier_enable = adaptor.getAtomicBarrierAddress() != nullptr;
- if (!atomic_barrier_enable)
+ if (!adaptor.getAtomicBarrierAddress())
return sgpr0;
return setValueAtOffset(rewriter, loc, sgpr0, consts[1], 18);
@@ -2412,19 +2410,16 @@ struct AMDGPUMakeDmaDescriptorLowering
Value setIterateEnable(MakeDmaDescriptorOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter, Location loc,
Value sgpr0, ArrayRef<Value> consts) const {
- bool iterate_enable = adaptor.getGlobalIncrement() != nullptr;
- if (!iterate_enable)
+ if (!adaptor.getGlobalIncrement())
return sgpr0;
- // TODO: In future PR, add other required fields for iteration.
return setValueAtOffset(rewriter, loc, sgpr0, consts[1], 19);
}
Value setPadEnable(MakeDmaDescriptorOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter, Location loc,
Value sgpr0, ArrayRef<Value> consts) const {
- bool pad_enable = op.getPadAmount() != nullptr;
- if (!pad_enable)
+ if (!op.getPadAmount())
return sgpr0;
return setValueAtOffset(rewriter, loc, sgpr0, consts[1], 20);
@@ -2442,8 +2437,7 @@ struct AMDGPUMakeDmaDescriptorLowering
Value setPadInterval(MakeDmaDescriptorOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter, Location loc,
Value sgpr0, ArrayRef<Value> consts) const {
- bool pad_enable = op.getPadAmount() != nullptr;
- if (!pad_enable)
+ if (!op.getPadAmount())
return sgpr0;
IntegerType i32 = rewriter.getI32Type();
@@ -2459,8 +2453,7 @@ struct AMDGPUMakeDmaDescriptorLowering
Value setPadAmount(MakeDmaDescriptorOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter, Location loc,
Value sgpr0, ArrayRef<Value> consts) const {
- bool pad_enable = op.getPadAmount() != nullptr;
- if (!pad_enable)
+ if (!op.getPadAmount())
return sgpr0;
Value padAmount = adaptor.getPadAmount();
@@ -2474,8 +2467,7 @@ struct AMDGPUMakeDmaDescriptorLowering
ConversionPatternRewriter &rewriter,
Location loc, Value sgpr1,
ArrayRef<Value> consts) const {
- bool atomic_barrier_enable = adaptor.getAtomicBarrierAddress() != nullptr;
- if (!atomic_barrier_enable)
+ if (!adaptor.getAtomicBarrierAddress())
return sgpr1;
Value atomicBarrierAddress = adaptor.getAtomicBarrierAddress();
@@ -2516,9 +2508,10 @@ struct AMDGPUMakeDmaDescriptorLowering
else
tensorDimX = cast<Value>(tensorDimXOpFoldResult);
+ sgpr1 = setValueAtOffset(rewriter, loc, sgpr1, tensorDimX, offset);
+
Value c16 = createI32Constant(rewriter, loc, 16);
Value tensorDimXHigh = LLVM::LShrOp::create(rewriter, loc, tensorDimX, c16);
- sgpr1 = setValueAtOffset(rewriter, loc, sgpr1, tensorDimX, offset);
sgpr2 = setValueAtOffset(rewriter, loc, sgpr2, tensorDimXHigh, offset + 16);
return {sgpr1, sgpr2};
}
@@ -2605,6 +2598,7 @@ struct AMDGPUMakeDmaDescriptorLowering
IntegerType i32 = rewriter.getI32Type();
Value tensorDimXStrideLow =
LLVM::TruncOp::create(rewriter, loc, i32, tensorDimXStride);
+ sgprY = setValueAtOffset(rewriter, loc, sgprY, tensorDimXStrideLow, offset);
int64_t shift = (offset % 32) == 0 ? 32 : offset % 32;
Value shiftVal = createI64Constant(rewriter, loc, shift);
@@ -2612,8 +2606,6 @@ struct AMDGPUMakeDmaDescriptorLowering
LLVM::LShrOp::create(rewriter, loc, tensorDimXStride, shiftVal);
tensorDimXStrideHigh =
LLVM::TruncOp::create(rewriter, loc, i32, tensorDimXStrideHigh);
-
- sgprY = setValueAtOffset(rewriter, loc, sgprY, tensorDimXStrideLow, offset);
sgprZ = setValueAtOffset(rewriter, loc, sgprZ, tensorDimXStrideHigh,
offset + shift);
return {sgprY, sgprZ};
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
index d15221c3368ee..6d1a765d2ea02 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
@@ -237,6 +237,9 @@ func.func @make_dma_base(%idx: index, %mem: memref<8xi32, #gpu_global_addrspace>
// -----
+// This test exercises the lowering for operations that only require two
+// descriptors to be fully described.
+
// CHECK-LABEL: func @make_dma_descriptor
// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>)
func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>) -> !amdgpu.tdm_descriptor {
@@ -251,23 +254,24 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>) -> !amdgpu.tdm_desc
// CHECK-DAG: %[[C6:.+]] = llvm.mlir.constant(6 : i32)
// CHECK-DAG: %[[C7:.+]] = llvm.mlir.constant(7 : i32)
- // CHECK-DAG: %[[DATA_SIZE:.+]] = llvm.mlir.constant(2 : i32)
// CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
- // CHECK: %[[SGPR0:.+]] = llvm.shl %[[DATA_SIZE]], %[[C16]]
+ // CHECK: %[[SGPR0:.+]] = llvm.shl %[[C2]], %[[C16]]
// CHECK-DAG: %[[TENSOR_DIM_0:.+]] = llvm.mlir.constant(64 : i32)
// CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
- // CHECK: %[[SGPR2_0:.+]] = llvm.lshr %[[TENSOR_DIM_0]], %[[C16]]
- // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
// CHECK: %[[SGPR1:.+]] = llvm.shl %[[TENSOR_DIM_0]], %[[C16]]
- // CHECK-DAG: %[[TENSOR_DIM_1:.+]] = llvm.mlir.constant(128 : i32)
// CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
- // CHECK: %[[SGPR3_0:.+]] = llvm.lshr %[[TENSOR_DIM_1]], %[[C16]]
+ // CHECK: %[[SGPR2_0:.+]] = llvm.lshr %[[TENSOR_DIM_0]], %[[C16]]
+
+ // CHECK-DAG: %[[TENSOR_DIM_1:.+]] = llvm.mlir.constant(128 : i32)
// CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
// CHECK: %[[TENSOR_DIM_1_SHIFTED:.+]] = llvm.shl %[[TENSOR_DIM_1]], %[[C16]]
// CHECK: %[[SGPR2:.+]] = llvm.or %[[SGPR2_0]], %[[TENSOR_DIM_1_SHIFTED]]
+ // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
+ // CHECK: %[[SGPR3_0:.+]] = llvm.lshr %[[TENSOR_DIM_1]], %[[C16]]
+
// CHECK-DAG: %[[TILE_DIM_0:.+]] = llvm.mlir.constant(64 : i32)
// CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
// CHECK: %[[TILE_DIM_0_SHIFTED:.+]] = llvm.shl %[[TILE_DIM_0:.+]], %[[C16]]
@@ -279,6 +283,7 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>) -> !amdgpu.tdm_desc
// CHECK-DAG: %[[MASK:.+]] = llvm.mlir.constant(281474976710655 : i64) : i64
// CHECK: %[[TENSOR_DIM_0_STRIDE_MASKED:.+]] = llvm.and %[[MASK]], %[[TENSOR_DIM_0_STRIDE]]
// CHECK-DAG: %[[SGPR5:.+]] = llvm.trunc %[[TENSOR_DIM_0_STRIDE_MASKED]] : i64 to i32
+
// CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(32 : i64) : i64
// CHECK: %[[TENSOR_DIM_0_STRIDE_HIGH_64:.+]] = llvm.lshr %[[TENSOR_DIM_0_STRIDE_MASKED]], %[[SHIFT]]
// CHECK: %[[SGPR6_0:.+]] = llvm.trunc %[[TENSOR_DIM_0_STRIDE_HIGH_64]] : i64 to i32
@@ -287,13 +292,14 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>) -> !amdgpu.tdm_desc
// CHECK-DAG: %[[MASK:.+]] = llvm.mlir.constant(281474976710655 : i64) : i64
// CHECK: %[[TENSOR_DIM_1_STRIDE_MASKED:.+]] = llvm.and %[[MASK]], %[[TENSOR_DIM_1_STRIDE]]
// CHECK-DAG: %[[TENSOR_DIM_1_STRIDE_LOW:.+]] = llvm.trunc %[[TENSOR_DIM_1_STRIDE_MASKED]]
- // CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(16 : i64) : i64
- // CHECK: %[[TENSOR_DIM_1_STRIDE_SHIFTED:.+]] = llvm.lshr %[[TENSOR_DIM_1_STRIDE_MASKED]], %[[SHIFT]]
- // CHECK: %[[SGPR7:.+]] = llvm.trunc %[[TENSOR_DIM_1_STRIDE_SHIFTED]] : i64 to i32
// CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(16 : i32) : i32
// CHECK: %[[TENSOR_DIM_1_STRIDE_LOW_SHIFTED:.+]] = llvm.shl %[[TENSOR_DIM_1_STRIDE_LOW]], %[[SHIFT]]
// CHECK-DAG: %[[SGPR6:.+]] = llvm.or %[[SGPR6_0]], %[[TENSOR_DIM_1_STRIDE_LOW_SHIFTED]]
+ // CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(16 : i64) : i64
+ // CHECK: %[[TENSOR_DIM_1_STRIDE_SHIFTED:.+]] = llvm.lshr %[[TENSOR_DIM_1_STRIDE_MASKED]], %[[SHIFT]]
+ // CHECK: %[[SGPR7:.+]] = llvm.trunc %[[TENSOR_DIM_1_STRIDE_SHIFTED]] : i64 to i32
+
// CHECK: %[[V8I32:.+]] = llvm.mlir.poison : vector<8xi32>
// CHECK: %[[DGROUP1_0:.+]] = llvm.insertelement %[[SGPR0]], %[[V8I32]][%[[C0]] : i32]
// CHECK: %[[DGROUP1_1:.+]] = llvm.insertelement %[[SGPR1]], %[[DGROUP1_0]][%[[C1]] : i32]
@@ -334,9 +340,8 @@ func.func @make_dma_descriptor_atomic_barrier(%base: !amdgpu.tdm_base<i32>, %bar
// CHECK-DAG: %[[C6:.+]] = llvm.mlir.constant(6 : i32)
// CHECK-DAG: %[[C7:.+]] = llvm.mlir.constant(7 : i32)
- // CHECK-DAG: %[[DATA_SIZE:.+]] = llvm.mlir.constant(2 : i32)
// CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
- // CHECK: %[[SGPR0_0:.+]] = llvm.shl %[[DATA_SIZE]], %[[C16]]
+ // CHECK: %[[SGPR0_0:.+]] = llvm.shl %[[C2]], %[[C16]]
// CHECK-DAG: %[[ATOMIC_BARRIER_ENABLE_OFFSET:.+]] = llvm.mlir.constant(18 : i32)
// CHECK: %[[ATOMIC_BARRIER_ENABLE_FIELD:.+]] = llvm.shl %[[C1]], %[[ATOMIC_BARRIER_ENABLE_OFFSET]]
@@ -351,11 +356,12 @@ func.func @make_dma_descriptor_atomic_barrier(%base: !amdgpu.tdm_base<i32>, %bar
// CHECK-DAG: %[[TENSOR_DIM_0:.+]] = llvm.mlir.constant(64 : i32)
// CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
- // CHECK: %[[SGPR2_0:.+]] = llvm.lshr %[[TENSOR_DIM_0]], %[[C16]]
- // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
// CHECK: %[[SGPR1_0:.+]] = llvm.shl %[[TENSOR_DIM_0]], %[[C16]]
// CHECK: %[[SGPR1:.+]] = llvm.or %[[ATOMIC_BARRIER]], %[[SGPR1_0]]
+ // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
+ // CHECK: %[[SGPR2_0:.+]] = llvm.lshr %[[TENSOR_DIM_0]], %[[C16]]
+
// CHECK: %[[V8I32:.+]] = llvm.mlir.poison : vector<8xi32>
// CHECK: %[[DGROUP1_0:.+]] = llvm.insertelement %[[SGPR0]], %[[V8I32]][%[[C0]] : i32]
// CHECK: %[[DGROUP1_1:.+]] = llvm.insertelement %[[SGPR1]], %[[DGROUP1_0]][%[[C1]] : i32]
@@ -382,9 +388,8 @@ func.func @make_dma_descriptor_iterate(%base: !amdgpu.tdm_base<i32>, %idx : inde
// CHECK: %[[C2:.+]] = llvm.mlir.constant(2 : i32)
// CHECK: %[[C3:.+]] = llvm.mlir.constant(3 : i32)
- // CHECK-DAG: %[[DATA_SIZE:.+]] = llvm.mlir.constant(2 : i32)
// CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
- // CHECK: %[[SGPR0_0:.+]] = llvm.shl %[[DATA_SIZE]], %[[C16]]
+ // CHECK: %[[SGPR0_0:.+]] = llvm.shl %[[C2]], %[[C16]]
// CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(19 : i32)
// CHECK: %[[ITERATE_ENABLE:.+]] = llvm.shl %[[C1]], %[[SHIFT]]
@@ -393,8 +398,8 @@ func.func @make_dma_descriptor_iterate(%base: !amdgpu.tdm_base<i32>, %idx : inde
// CHECK: %[[V8I32:.+]] = llvm.mlir.poison : vector<8xi32>
// CHECK: %[[DGROUP1_0:.+]] = llvm.insertelement %[[SGPR0]], %[[V8I32]][%[[C0]] : i32]
-
// CHECK: %[[SGPR1:.+]] = llvm.trunc %[[INDEX]] : i64 to i32
+
// CHECK: %[[SGPR2:.+]] = llvm.trunc %[[INDEX]] : i64 to i32
// CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(32 : i64) : i64
// CHECK: %[[GLOBAL_ADDR_INC_HIGH:.+]] = llvm.lshr %[[INDEX]], %[[SHIFT]]
@@ -436,23 +441,24 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>) -> !amdgpu.tdm_desc
// CHECK-DAG: %[[C7:.+]] = llvm.mlir.constant(7 : i32)
- // CHECK-DAG: %[[DATA_SIZE:.+]] = llvm.mlir.constant(2 : i32)
// CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
- // CHECK: %[[SGPR0:.+]] = llvm.shl %[[DATA_SIZE]], %[[C16]]
+ // CHECK: %[[SGPR0:.+]] = llvm.shl %[[C2]], %[[C16]]
// CHECK-DAG: %[[TENSOR_DIM_0:.+]] = llvm.mlir.constant(64 : i32)
// CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
- // CHECK: %[[SGPR2_0:.+]] = llvm.lshr %[[TENSOR_DIM_0]], %[[C16]]
- // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
// CHECK: %[[SGPR1:.+]] = llvm.shl %[[TENSOR_DIM_0]], %[[C16]]
- // CHECK-DAG: %[[TENSOR_DIM_1:.+]] = llvm.mlir.constant(128 : i32)
// CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
- // CHECK: %[[SGPR3_0:.+]] = llvm.lshr %[[TENSOR_DIM_1]], %[[C16]]
+ // CHECK: %[[SGPR2_0:.+]] = llvm.lshr %[[TENSOR_DIM_0]], %[[C16]]
+
+ // CHECK-DAG: %[[TENSOR_DIM_1:.+]] = llvm.mlir.constant(128 : i32)
// CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
// CHECK: %[[TENSOR_DIM_1_SHIFTED:.+]] = llvm.shl %[[TENSOR_DIM_1]], %[[C16]]
// CHECK: %[[SGPR2:.+]] = llvm.or %[[SGPR2_0]], %[[TENSOR_DIM_1_SHIFTED]]
+ // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
+ // CHECK: %[[SGPR3_0:.+]] = llvm.lshr %[[TENSOR_DIM_1]], %[[C16]]
+
// CHECK-DAG: %[[TILE_DIM_0:.+]] = llvm.mlir.constant(64 : i32)
// CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
// CHECK: %[[TILE_DIM_0_SHIFTED:.+]] = llvm.shl %[[TILE_DIM_0]], %[[C16]]
@@ -476,13 +482,14 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>) -> !amdgpu.tdm_desc
// CHECK-DAG: %[[MASK:.+]] = llvm.mlir.constant(281474976710655 : i64) : i64
// CHECK: %[[TENSOR_DIM_1_STRIDE_MASKED:.+]] = llvm.and %[[MASK]], %[[TENSOR_DIM_1_STRIDE]]
// CHECK-DAG: %[[TENSOR_DIM_1_STRIDE_LOW:.+]] = llvm.trunc %[[TENSOR_DIM_1_STRIDE_MASKED]]
- // CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(16 : i64) : i64
- // CHECK: %[[TENSOR_DIM_1_STRIDE_SHIFTED:.+]] = llvm.lshr %[[TENSOR_DIM_1_STRIDE_MASKED]], %[[SHIFT]]
- // CHECK: %[[SGPR7:.+]] = llvm.trunc %[[TENSOR_DIM_1_STRIDE_SHIFTED]] : i64 to i32
// CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(16 : i32) : i32
// CHECK: %[[TENSOR_DIM_1_STRIDE_LOW_SHIFTED:.+]] = llvm.shl %[[TENSOR_DIM_1_STRIDE_LOW]], %[[SHIFT]]
// CHECK-DAG: %[[SGPR6:.+]] = llvm.or %[[SGPR6_0]], %[[TENSOR_DIM_1_STRIDE_LOW_SHIFTED]]
+ // CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(16 : i64) : i64
+ // CHECK: %[[TENSOR_DIM_1_STRIDE_SHIFTED:.+]] = llvm.lshr %[[TENSOR_DIM_1_STRIDE_MASKED]], %[[SHIFT]]
+ // CHECK: %[[SGPR7:.+]] = llvm.trunc %[[TENSOR_DIM_1_STRIDE_SHIFTED]] : i64 to i32
+
// CHECK: %[[V8I32:.+]] = llvm.mlir.poison : vector<8xi32>
// CHECK: %[[DGROUP1_0:.+]] = llvm.insertelement %[[SGPR0]], %[[V8I32]][%[[C0]] : i32]
// CHECK: %[[DGROUP1_1:.+]] = llvm.insertelement %[[SGPR1]], %[[DGROUP1_0]][%[[C1]] : i32]
@@ -494,12 +501,14 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>) -> !amdgpu.tdm_desc
// CHECK: %[[DGROUP1:.+]] = llvm.insertelement %[[SGPR7]], %[[DGROUP1_6]][%[[C7]] : i32]
// CHECK-DAG: %[[SGPR0:.+]] = llvm.mlir.constant([[TENSOR_DIM_2:64]] : i32)
+
// CHECK-DAG: %[[SGPR1:.+]] = llvm.mlir.constant([[TENSOR_DIM_3:64]] : i32)
+
// CHECK-DAG: %[[TENSOR_DIM_1_STRIDE:.+]] = llvm.mlir.constant(64 : i64)
// CHECK-DAG: %[[MASK:.+]] = llvm.mlir.constant(281474976710655 : i64) : i64
// CHECK: %[[TENSOR_DIM_2_STRIDE_MASKED:.+]] = llvm.and %[[MASK]], %[[TENSOR_DIM_1_STRIDE]]
-
// CHECK-DAG: %[[SGPR2:.+]] = llvm.trunc %[[TENSOR_DIM_2_STRIDE_MASKED]]
+
// CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(32 : i64) : i64
// CHECK: %[[TENSOR_DIM_2_STRIDE_HIGH_64:.+]] = llvm.lshr %[[TENSOR_DIM_2_STRIDE_MASKED]], %[[SHIFT]]
// CHECK: %[[SGPR3_0:.+]] = llvm.trunc %[[TENSOR_DIM_2_STRIDE_HIGH_64]] : i64 to i32
@@ -524,12 +533,13 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>) -> !amdgpu.tdm_desc
// CHECK: %[[TENSOR_DIM3_STRIDE_HIGH:.+]] = llvm.trunc %[[TENSOR_DIM3_STRIDE_SHIFTED]] : i64 to i32
// CHECK-DAG: %[[TENSOR_DIM_4:.+]] = llvm.mlir.constant(64 : i32)
- // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
- // CHECK: %[[SGPR2_0:.+]] = llvm.lshr %[[TENSOR_DIM_4]], %[[C16]]
// CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(16 : i32)
// CHECK-DAG: %[[TENSOR_DIM_4_LOW:.+]] = llvm.shl %[[TENSOR_DIM_4]], %[[SHIFT]]
// CHECK: %[[SGPR1:.+]] = llvm.or %[[TENSOR_DIM3_STRIDE_HIGH]], %[[TENSOR_DIM_4_LOW]]
+ // CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(16 : i32)
+ // CHECK: %[[SGPR2_0:.+]] = llvm.lshr %[[TENSOR_DIM_4]], %[[SHIFT]]
+
// CHECK-DAG: %[[TILE_DIM_4:.+]] = llvm.mlir.constant(64 : i32) : i32
// CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(16 : i32) : i32
// CHECK: %[[TILE_DIM_4_SHIFTED:.+]] = llvm.shl %[[TILE_DIM_4]], %[[SHIFT]]
@@ -563,27 +573,29 @@ func.func @make_dma_descriptor_workgroup_mask(%base: !amdgpu.tdm_base<i32>, %wg_
// CHECK-DAG: %[[C7:.+]] = llvm.mlir.constant(7 : i32)
// CHECK-DAG: %[[WG_MASK_EXT:.+]] = llvm.zext %[[WG_MASK]]
- // CHECK-DAG: %[[DATA_SIZE:.+]] = llvm.mlir.constant(2 : i32)
// CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
- // CHECK: %[[DATA_SIZE_SHIFTED:.+]] = llvm.shl %[[DATA_SIZE]], %[[C16]]
+ // CHECK: %[[DATA_SIZE_SHIFTED:.+]] = llvm.shl %[[C2]], %[[C16]]
// CHECK: %[[SGPR0_BASE:.+]] = llvm.or %[[WG_MASK_EXT]], %[[DATA_SIZE_SHIFTED]]
+
// CHECK-DAG: %[[C21:.+]] = llvm.mlir.constant(21 : i32)
// CHECK: %[[TIMEOUT_SHIFTED:.+]] = llvm.shl %[[C1]], %[[C21]]
// CHECK: %[[SGPR0:.+]] = llvm.or %[[SGPR0_BASE]], %[[TIMEOUT_SHIFTED]]
// CHECK-DAG: %[[TENSOR_DIM_0:.+]] = llvm.mlir.constant(64 : i32)
// CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
- // CHECK: %[[SGPR2_0:.+]] = llvm.lshr %[[TENSOR_DIM_0]], %[[C16]]
- // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
// CHECK: %[[SGPR1:.+]] = llvm.shl %[[TENSOR_DIM_0]], %[[C16]]
- // CHECK-DAG: %[[TENSOR_DIM_1:.+]] = llvm.mlir.constant(128 : i32)
// CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
- // CHECK: %[[SGPR3_0:.+]] = llvm.lshr %[[TENSOR_DIM_1]], %[[C16]]
+ // CHECK: %[[SGPR2_0:.+]] = llvm.lshr %[[TENSOR_DIM_0]], %[[C16]]
+
+ // CHECK-DAG: %[[TENSOR_DIM_1:.+]] = llvm.mlir.constant(128 : i32)
// CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
// CHECK: %[[TENSOR_DIM_1_SHIFTED:.+]] = llvm.shl %[[TENSOR_DIM_1]], %[[C16]]
// CHECK: %[[SGPR2:.+]] = llvm.or %[[SGPR2_0]], %[[TENSOR_DIM_1_SHIFTED]]
+ // CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
+ // CHECK: %[[SGPR3_0:.+]] = llvm.lshr %[[TENSOR_DIM_1]], %[[C16]]
+
// CHECK-DAG: %[[TILE_DIM_0:.+]] = llvm.mlir.constant(64 : i32)
// CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
// CHECK: %[[TILE_DIM_0_SHIFTED:.+]] = llvm.shl %[[TILE_DIM_0:.+]], %[[C16]]
@@ -595,6 +607,7 @@ func.func @make_dma_descriptor_workgroup_mask(%base: !amdgpu.tdm_base<i32>, %wg_
// CHECK-DAG: %[[MASK:.+]] = llvm.mlir.constant(281474976710655 : i64) : i64
// CHECK: %[[TENSOR_DIM_0_STRIDE_MASKED:.+]] = llvm.and %[[MASK]], %[[TENSOR_DIM_0_STRIDE]]
// CHECK-DAG: %[[SGPR5:.+]] = llvm.trunc %[[TENSOR_DIM_0_STRIDE_MASKED]] : i64 to i32
+
// CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(32 : i64) : i64
// CHECK: %[[TENSOR_DIM_0_STRIDE_HIGH_64:.+]] = llvm.lshr %[[TENSOR_DIM_0_STRIDE_MASKED]], %[[SHIFT]]
// CHECK: %[[SGPR6_0:.+]] = llvm.trunc %[[TENSOR_DIM_0_STRIDE_HIGH_64]] : i64 to i32
@@ -603,13 +616,15 @@ func.func @make_dma_descriptor_workgroup_mask(%base: !amdgpu.tdm_base<i32>, %wg_
// CHECK-DAG: %[[MASK:.+]] = llvm.mlir.constant(281474976710655 : i64) : i64
// CHECK: %[[TENSOR_DIM_1_STRIDE_MASKED:.+]] = llvm.and %[[MASK]], %[[TENSOR_DIM_1_STRIDE]]
// CHECK-DAG: %[[TENSOR_DIM_1_STRIDE_LOW:.+]] = llvm.trunc %[[TENSOR_DIM_1_STRIDE_MASKED]]
- // CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(16 : i64) : i64
- // CHECK: %[[TENSOR_DIM_1_STRIDE_SHIFTED:.+]] = llvm.lshr %[[TENSOR_DIM_1_STRIDE_MASKED]], %[[SHIFT]]
- // CHECK: %[[SGPR7:.+]] = llvm.trunc %[[TENSOR_DIM_1_STRIDE_SHIFTED]] : i64 to i32
+
// CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(16 : i32) : i32
// CHECK: %[[TENSOR_DIM_1_STRIDE_LOW_SHIFTED:.+]] = llvm.shl %[[TENSOR_DIM_1_STRIDE_LOW]], %[[SHIFT]]
// CHECK-DAG: %[[SGPR6:.+]] = llvm.or %[[SGPR6_0]], %[[TENSOR_DIM_1_STRIDE_LOW_SHIFTED]]
+ // CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(16 : i64) : i64
+ // CHECK: %[[TENSOR_DIM_1_STRIDE_SHIFTED:.+]] = llvm.lshr %[[TENSOR_DIM_1_STRIDE_MASKED]], %[[SHIFT]]
+ // CHECK: %[[SGPR7:.+]] = llvm.trunc %[[TENSOR_DIM_1_STRIDE_SHIFTED]] : i64 to i32
+
// CHECK: %[[V8I32:.+]] = llvm.mlir.poison : vector<8xi32>
// CHECK: %[[DGROUP1_0:.+]] = llvm.insertelement %[[SGPR0]], %[[V8I32]][%[[C0]] : i32]
// CHECK: %[[DGROUP1_1:.+]] = llvm.insertelement %[[SGPR1]], %[[DGROUP1_0]][%[[C1]] : i32]
>From d18611860227a66dd4798e45ecd97c51cc447de4 Mon Sep 17 00:00:00 2001
From: Erick Ochoa <erick.ochoalopez at amd.com>
Date: Tue, 9 Dec 2025 15:03:20 -0500
Subject: [PATCH 03/10] Add lit test for pad_enable
---
mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 6 ++--
.../AMDGPUToROCDL/AMDGPUToROCDL.cpp | 2 +-
.../Conversion/AMDGPUToROCDL/gfx1250.mlir | 36 +++++++++++++++++++
mlir/test/Dialect/AMDGPU/ops.mlir | 8 ++---
4 files changed, 44 insertions(+), 8 deletions(-)
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index e9a8684dc61fe..eebd45306d2c3 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -1298,8 +1298,8 @@ def AMDGPU_MakeDmaDescriptorOp :
DenseI64ArrayAttr: $shared_static_sizes,
Optional<I16>: $workgroup_mask,
Optional<I1>: $early_timeout,
- Optional<Index>: $pad_amount,
- Optional<Index>: $pad_interval,
+ Optional<I32>: $pad_amount,
+ Optional<I32>: $pad_interval,
Optional<AnyMemRef>: $atomic_barrier_address,
Variadic<Index>: $atomic_barrier_indices,
Optional<Index>: $global_increment,
@@ -1345,7 +1345,7 @@ def AMDGPU_MakeDmaDescriptorOp :
// Example of moving a two dimension tensor to LDS where padding is applied after every integer.
%base = amdgpu.make_dma_base %global[0, 0], %lds[0, 0] : memref<32x32xi32>, memref<64x64xi32, #gpu.address_space<workgroup>> -> !amdgpu.tdm_base<i32>
- %descriptor = amdgpu.make_dma_descriptor %base globalSize [32, 32] globalStride [32, 1] sharedSize [64, 64] padding(%pad_amount pad_every %pad_interval) : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+ %descriptor = amdgpu.make_dma_descriptor %base globalSize [32, 32] globalStride [32, 1] sharedSize [64, 64] padShared(%pad_amount every %pad_interval) : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
amdgpu.tensor_load_to_lds %descriptor : !amdgpu.tdm_descriptor
```
}];
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 61a85147cf842..896f8381ecccd 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -2456,8 +2456,8 @@ struct AMDGPUMakeDmaDescriptorLowering
if (!op.getPadAmount())
return sgpr0;
- Value padAmount = adaptor.getPadAmount();
// pre-condition: padAmount is a value between 1-128.
+ Value padAmount = adaptor.getPadAmount();
padAmount = LLVM::SubOp::create(rewriter, loc, padAmount, consts[1]);
// post-condition: padAmount is a value between 0-127.
return setValueAtOffset(rewriter, loc, sgpr0, padAmount, 25);
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
index 6d1a765d2ea02..b831114260d98 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
@@ -425,6 +425,42 @@ func.func @make_dma_descriptor_iterate(%base: !amdgpu.tdm_base<i32>, %idx : inde
// -----
+// CHECK-LABEL: func @make_dma_descriptor_pad_enable
+// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>, %[[PAD_AMOUNT:.+]]: i32, %[[PAD_INTERVAL:.+]]: i32)
+func.func @make_dma_descriptor_pad_enable(%base: !amdgpu.tdm_base<i32>, %pad_amount: i32, %pad_interval: i32) -> !amdgpu.tdm_descriptor {
+
+ // CHECK-DAG: %[[DGROUP0:.+]] = builtin.unrealized_conversion_cast %[[BASE]]
+
+ // CHECK-DAG: %[[C0:.+]] = llvm.mlir.constant(0 : i32)
+ // CHECK-DAG: %[[C1:.+]] = llvm.mlir.constant(1 : i32)
+ // CHECK-DAG: %[[C2:.+]] = llvm.mlir.constant(2 : i32)
+ // CHECK-DAG: %[[C3:.+]] = llvm.mlir.constant(3 : i32)
+ // CHECK-DAG: %[[C4:.+]] = llvm.mlir.constant(4 : i32)
+ // CHECK-DAG: %[[C5:.+]] = llvm.mlir.constant(5 : i32)
+ // CHECK-DAG: %[[C6:.+]] = llvm.mlir.constant(6 : i32)
+ // CHECK-DAG: %[[C7:.+]] = llvm.mlir.constant(7 : i32)
+
+ // CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(20 : i32)
+ // CHECK: %[[PAD_ENABLE:.+]] = llvm.shl %[[C1]], %[[SHIFT]]
+ // CHECK: %[[SGPR0:.+]] = llvm.or %[[SGPR0_BASE:.+]], %[[PAD_ENABLE]]
+
+ // CHECK: %[[PAD_INTERVAL_CTTZ:.+]] = "llvm.intr.cttz"(%[[PAD_INTERVAL]]) <{is_zero_poison = false}> : (i32) -> i32
+ // CHECK: %[[PAD_INTERVAL_M1:.+]] = llvm.sub %[[PAD_INTERVAL_CTTZ]], %[[C1]]
+ // CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(22 : i32)
+ // CHECK: %[[PAD_INTERVAL:.+]] = llvm.shl %[[PAD_INTERVAL_M1]], %[[SHIFT]]
+ // CHECK: %[[SGPR0:.+]] = llvm.or %[[SGPR0_BASE:.+]], %[[PAD_INTERVAL]]
+
+ // CHECK: %[[PAD_AMOUNT_M1:.+]] = llvm.sub %[[PAD_AMOUNT]], %[[C1]]
+ // CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(25 : i32)
+ // CHECK: %[[PAD_AMOUNT_SHIFTED:.+]] = llvm.shl %[[PAD_AMOUNT_M1]], %[[SHIFT]]
+ // CHECK: llvm.or %[[SGPR0:.+]], %[[PAD_AMOUNT_SHIFTED]]
+
+ %descriptor = amdgpu.make_dma_descriptor %base globalSize [128, 64] globalStride [64, 1] sharedSize [128, 64] padShared(%pad_amount every %pad_interval) : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+ func.return %descriptor : !amdgpu.tdm_descriptor
+}
+
+// -----
+
// CHECK-LABEL: func @make_dma_descriptor
// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>)
func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>) -> !amdgpu.tdm_descriptor {
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index 651aff4a0d22a..2c87aa31b4fec 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -697,8 +697,8 @@ func.func @make_dma_base(%idx: index, %mem: memref<8xi32>, %smem: memref<8xi32,
}
// CHECK-LABEL: func @make_dma_descriptor
-// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>, %[[WG_MASK:.+]]: i16, %[[TIMEOUT:.+]]: i1, %[[BARRIER:.+]]: memref<8xi32, #gpu.address_space<workgroup>>, %[[IDX:.+]]: index)
-func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>, %wg_mask: i16, %timeout: i1, %barrier: memref<8xi32, #gpu.address_space<workgroup>>, %idx: index) {
+// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>, %[[WG_MASK:.+]]: i16, %[[TIMEOUT:.+]]: i1, %[[BARRIER:.+]]: memref<8xi32, #gpu.address_space<workgroup>>, %[[IDX:.+]]: index, %[[I32:.+]]: i32)
+func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>, %wg_mask: i16, %timeout: i1, %barrier: memref<8xi32, #gpu.address_space<workgroup>>, %idx: index, %i32: i32) {
// CHECK: amdgpu.make_dma_descriptor %[[BASE]]
amdgpu.make_dma_descriptor %base
@@ -717,8 +717,8 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>, %wg_mask: i16, %tim
globalStride [64, 1]
// CHECK-SAME: sharedSize [64, 64]
sharedSize [64, 64]
- // CHECK-SAME: padShared(%[[IDX]] every %[[IDX]])
- padShared(%idx every %idx)
+ // CHECK-SAME: padShared(%[[I32]] every %[[I32]])
+ padShared(%i32 every %i32)
: !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
// CHECK: amdgpu.make_dma_descriptor %[[BASE]]
>From 715879d209d8b11e48d24a57ebda23062ee705ac Mon Sep 17 00:00:00 2001
From: Erick Ochoa <erick.ochoalopez at amd.com>
Date: Tue, 9 Dec 2025 15:25:03 -0500
Subject: [PATCH 04/10] style
---
mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 896f8381ecccd..89b909bc6c71a 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -2837,7 +2837,8 @@ struct AMDGPUMakeDmaDescriptorLowering
ArrayRef<Value> consts) const {
constexpr int32_t dim = 4;
constexpr int32_t offset = 48;
- return setTensorDimX(op, adaptor, rewriter, loc, sgpr1, sgpr2, consts, dim, offset);
+ return setTensorDimX(op, adaptor, rewriter, loc, sgpr1, sgpr2, consts, dim,
+ offset);
}
Value setTileDim4(MakeDmaDescriptorOp op, OpAdaptor adaptor,
>From 31f1bec636f4b5dde1b3e691a9a102c1c1febbba Mon Sep 17 00:00:00 2001
From: Erick Ochoa <erick.ochoalopez at amd.com>
Date: Tue, 9 Dec 2025 15:47:28 -0500
Subject: [PATCH 05/10] Add truncations where necessary
---
.../Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp | 16 +++++++++++++---
1 file changed, 13 insertions(+), 3 deletions(-)
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 89b909bc6c71a..7b4b8e59b2fbc 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -2505,8 +2505,11 @@ struct AMDGPUMakeDmaDescriptorLowering
if (auto attr = dyn_cast<Attribute>(tensorDimXOpFoldResult))
tensorDimX =
createI32Constant(rewriter, loc, cast<IntegerAttr>(attr).getInt());
- else
+ else {
+ IntegerType i32 = rewriter.getI32Type();
tensorDimX = cast<Value>(tensorDimXOpFoldResult);
+ tensorDimX = LLVM::TruncOp::create(rewriter, loc, i32, tensorDimX);
+ }
sgpr1 = setValueAtOffset(rewriter, loc, sgpr1, tensorDimX, offset);
@@ -2548,8 +2551,11 @@ struct AMDGPUMakeDmaDescriptorLowering
if (auto attr = dyn_cast<Attribute>(tileDimXOpFoldResult))
tileDimX =
createI32Constant(rewriter, loc, cast<IntegerAttr>(attr).getInt());
- else
+ else {
+ IntegerType i32 = rewriter.getI32Type();
tileDimX = cast<Value>(tileDimXOpFoldResult);
+ tileDimX = LLVM::TruncOp::create(rewriter, loc, i32, tileDimX);
+ }
return setValueAtOffset(rewriter, loc, sgpr, tileDimX, offset);
}
@@ -2685,8 +2691,12 @@ struct AMDGPUMakeDmaDescriptorLowering
if (auto attr = dyn_cast<Attribute>(tensorDimXOpFoldResult))
tensorDimX =
createI32Constant(rewriter, loc, cast<IntegerAttr>(attr).getInt());
- else
+ else {
+ IntegerType i32 = rewriter.getI32Type();
tensorDimX = cast<Value>(tensorDimXOpFoldResult);
+ tensorDimX = LLVM::TruncOp::create(rewriter, loc, i32, tensorDimX);
+ }
+
return setValueAtOffset(rewriter, loc, sgpr0, tensorDimX, offset);
}
>From 47c5b7bd95cc6f183e750d9ec6197546a4e20180 Mon Sep 17 00:00:00 2001
From: Erick Ochoa <erick.ochoalopez at amd.com>
Date: Tue, 9 Dec 2025 15:56:09 -0500
Subject: [PATCH 06/10] Change lds_increment to I32
---
mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 2 +-
mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp | 3 +--
mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir | 11 +++++------
.../AMDGPU/amdgpu-make-dma-descriptor-fold.mlir | 6 +++---
mlir/test/Dialect/AMDGPU/ops.mlir | 4 ++--
5 files changed, 12 insertions(+), 14 deletions(-)
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index eebd45306d2c3..11fc6da504a95 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -1303,7 +1303,7 @@ def AMDGPU_MakeDmaDescriptorOp :
Optional<AnyMemRef>: $atomic_barrier_address,
Variadic<Index>: $atomic_barrier_indices,
Optional<Index>: $global_increment,
- Optional<Index>: $lds_increment,
+ Optional<I32>: $lds_increment,
Optional<Index>: $iteration_count)>,
Results<(outs AMDGPU_TDMDescriptorType: $desc)> {
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 7b4b8e59b2fbc..5b40576c5c2bd 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -2720,8 +2720,7 @@ struct AMDGPUMakeDmaDescriptorLowering
Value sgpr1, ArrayRef<Value> consts,
int64_t offset) const {
Value ldsAddrIncrement = adaptor.getLdsIncrement();
- return truncateAndSetValueAtOffset(rewriter, loc, sgpr1, ldsAddrIncrement,
- offset);
+ return setValueAtOffset(rewriter, loc, sgpr1, ldsAddrIncrement, offset);
}
std::pair<Value, Value>
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
index b831114260d98..6a80a7661438a 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
@@ -378,8 +378,8 @@ func.func @make_dma_descriptor_atomic_barrier(%base: !amdgpu.tdm_base<i32>, %bar
// -----
// CHECK-LABEL: func @make_dma_descriptor_iterate
-// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>, %[[IDX:.+]]: index)
-func.func @make_dma_descriptor_iterate(%base: !amdgpu.tdm_base<i32>, %idx : index) -> !amdgpu.tdm_descriptor {
+// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>, %[[IDX:.+]]: index, %[[I32:.+]]: i32)
+func.func @make_dma_descriptor_iterate(%base: !amdgpu.tdm_base<i32>, %idx : index, %i32: i32) -> !amdgpu.tdm_descriptor {
// CHECK-DAG: %[[DGROUP0:.+]] = builtin.unrealized_conversion_cast %[[BASE]]
// CHECK-DAG: %[[INDEX:.+]] = builtin.unrealized_conversion_cast %[[IDX]] : index to i64
@@ -398,9 +398,8 @@ func.func @make_dma_descriptor_iterate(%base: !amdgpu.tdm_base<i32>, %idx : inde
// CHECK: %[[V8I32:.+]] = llvm.mlir.poison : vector<8xi32>
// CHECK: %[[DGROUP1_0:.+]] = llvm.insertelement %[[SGPR0]], %[[V8I32]][%[[C0]] : i32]
- // CHECK: %[[SGPR1:.+]] = llvm.trunc %[[INDEX]] : i64 to i32
-
// CHECK: %[[SGPR2:.+]] = llvm.trunc %[[INDEX]] : i64 to i32
+
// CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(32 : i64) : i64
// CHECK: %[[GLOBAL_ADDR_INC_HIGH:.+]] = llvm.lshr %[[INDEX]], %[[SHIFT]]
// CHECK: %[[GLOBAL_ADDR_INC_HIGH_2:.+]] = llvm.trunc %[[GLOBAL_ADDR_INC_HIGH]] : i64 to i32
@@ -415,11 +414,11 @@ func.func @make_dma_descriptor_iterate(%base: !amdgpu.tdm_base<i32>, %idx : inde
// CHECK: %[[V4I32:.+]] = llvm.mlir.poison : vector<4xi32>
// CHECK: %[[DGROUP2_0:.+]] = llvm.insertelement %[[C0]], %[[V4I32]][%[[C0]]
- // CHECK: %[[DGROUP2_1:.+]] = llvm.insertelement %[[SGPR1]], %[[DGROUP2_0]][%[[C1]]
+ // CHECK: %[[DGROUP2_1:.+]] = llvm.insertelement %[[I32]], %[[DGROUP2_0]][%[[C1]]
// CHECK: %[[DGROUP2_2:.+]] = llvm.insertelement %[[SGPR2]], %[[DGROUP2_1]][%[[C2]]
// CHECK: %[[DGROUP2:.+]] = llvm.insertelement %[[SGPR3]], %[[DGROUP2_2]][%[[C3]]
- %descriptor = amdgpu.make_dma_descriptor %base globalSize [128, 64] globalStride [64, 1] sharedSize [128, 64] iterate %idx, %idx, %idx : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+ %descriptor = amdgpu.make_dma_descriptor %base globalSize [128, 64] globalStride [64, 1] sharedSize [128, 64] iterate %idx, %i32, %idx : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
func.return %descriptor : !amdgpu.tdm_descriptor
}
diff --git a/mlir/test/Dialect/AMDGPU/amdgpu-make-dma-descriptor-fold.mlir b/mlir/test/Dialect/AMDGPU/amdgpu-make-dma-descriptor-fold.mlir
index 9d43c9940f8e0..dcb385384a2b8 100644
--- a/mlir/test/Dialect/AMDGPU/amdgpu-make-dma-descriptor-fold.mlir
+++ b/mlir/test/Dialect/AMDGPU/amdgpu-make-dma-descriptor-fold.mlir
@@ -1,8 +1,8 @@
// RUN: mlir-opt --canonicalize %s | FileCheck %s
// CHECK-LABEL: @make_dma_descriptor_fold
-// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>, %[[IDX:.+]]: index)
-func.func @make_dma_descriptor_fold(%base: !amdgpu.tdm_base<i32>, %idx: index) -> !amdgpu.tdm_descriptor {
+// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>, %[[IDX:.+]]: index, %[[I32:.+]]: i32)
+func.func @make_dma_descriptor_fold(%base: !amdgpu.tdm_base<i32>, %idx: index, %i32: i32) -> !amdgpu.tdm_descriptor {
%c64 = arith.constant 64 : index
// CHECK: amdgpu.make_dma_descriptor %[[BASE]]
@@ -13,7 +13,7 @@ func.func @make_dma_descriptor_fold(%base: !amdgpu.tdm_base<i32>, %idx: index) -
globalStride [%c64, 1]
// CHECK-SAME: sharedSize [64, 64]
sharedSize [%c64, %c64]
- iterate %idx, %idx, %idx
+ iterate %idx, %i32, %idx
: !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
func.return %0 : !amdgpu.tdm_descriptor
}
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index 2c87aa31b4fec..9dd746827958f 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -767,8 +767,8 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>, %wg_mask: i16, %tim
globalStride [64, 1]
// CHECK-SAME: sharedSize [64, 64]
sharedSize [64, 64]
- // CHECK-SAME: iterate %[[IDX]], %[[IDX]], %[[IDX]]
- iterate %idx, %idx, %idx
+ // CHECK-SAME: iterate %[[IDX]], %[[I32]], %[[IDX]]
+ iterate %idx, %i32, %idx
: !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
func.return
>From ccb6ed4c6d875ac125d5c1790d86b5a45ef95a97 Mon Sep 17 00:00:00 2001
From: Erick Ochoa <erick.ochoalopez at amd.com>
Date: Tue, 9 Dec 2025 16:24:20 -0500
Subject: [PATCH 07/10] Adds TODOs where needed.
---
.../AMDGPUToROCDL/AMDGPUToROCDL.cpp | 35 ++++++++++++++++++-
1 file changed, 34 insertions(+), 1 deletion(-)
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 5b40576c5c2bd..1c329ef4fadcd 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -2440,9 +2440,13 @@ struct AMDGPUMakeDmaDescriptorLowering
if (!op.getPadAmount())
return sgpr0;
+ // pre-condition: padInterval can be a power of two between 2 and 256.
+ // TODO: Validation if the value breaks the pre-condition.
+ // If the pre-condition fails, there is a possibility of
+ // affecting the higher bits. In a following PR add a flag
+ // that instruments conditions that need to be checked at runtime.
IntegerType i32 = rewriter.getI32Type();
Value padInterval = adaptor.getPadInterval();
- // pre-condition: padInterval can be a power of two between 2 and 256.
padInterval = LLVM::CountTrailingZerosOp::create(rewriter, loc, i32,
padInterval, false);
padInterval = LLVM::SubOp::create(rewriter, loc, padInterval, consts[1]);
@@ -2457,6 +2461,10 @@ struct AMDGPUMakeDmaDescriptorLowering
return sgpr0;
// pre-condition: padAmount is a value between 1-128.
+ // TODO: Validation if the value breaks the pre-condition.
+ // If the pre-condition fails, there is a possibility of
+ // affecting the higher bits. In a following PR add a flag
+ // that instruments conditions that need to be checked at runtime.
Value padAmount = adaptor.getPadAmount();
padAmount = LLVM::SubOp::create(rewriter, loc, padAmount, consts[1]);
// post-condition: padAmount is a value between 0-127.
@@ -2480,6 +2488,9 @@ struct AMDGPUMakeDmaDescriptorLowering
IntegerType i32 = rewriter.getI32Type();
// pre-condition: atomicBarrierAddress is aligned to 8 bytes which implies
// that the 3 LSBs are zero.
+ // TODO: Validation if the value breaks the pre-condition.
+ // In a following PR add a flag that instruments conditions that need to be
+ // checked at runtime.
atomicBarrierAddress =
LLVM::PtrToIntOp::create(rewriter, loc, i32, atomicBarrierAddress);
atomicBarrierAddress =
@@ -2501,6 +2512,13 @@ struct AMDGPUMakeDmaDescriptorLowering
return {sgpr1, sgpr2};
OpFoldResult tensorDimXOpFoldResult = *(mixedGlobalSizes.rbegin() + dimX);
+ // pre-condition: tensorDimX is less than 2^48-1
+ // TODO: Validation if the value breaks the pre-condition.
+ // If the pre-condition fails, there is a possibility of
+ // affecting the higher bits. In a following PR add a flag
+ // that instruments conditions that need to be checked at runtime.
+ // This could also be fixed by saying that mixedGlobalSizes is a
+ // DynamicI48List.
Value tensorDimX;
if (auto attr = dyn_cast<Attribute>(tensorDimXOpFoldResult))
tensorDimX =
@@ -2547,6 +2565,13 @@ struct AMDGPUMakeDmaDescriptorLowering
return sgpr;
OpFoldResult tileDimXOpFoldResult = *(mixedSharedSizes.rbegin() + dimX);
+ // pre-condition: tileDimX is less than 2^16-1
+ // TODO: Validation if the value breaks the pre-condition.
+ // If the pre-condition fails, there is a possibility of
+ // affecting the higher bits. In a following PR add a flag
+ // that instruments conditions that need to be checked at runtime.
+ // This could also be fixed by saying that mixedSharedSizes is a
+ // DynamicI16List.
Value tileDimX;
if (auto attr = dyn_cast<Attribute>(tileDimXOpFoldResult))
tileDimX =
@@ -2590,6 +2615,10 @@ struct AMDGPUMakeDmaDescriptorLowering
OpFoldResult tensorDimXStrideOpFoldResult =
*(mixedGlobalStrides.rbegin() + dimX);
+ // pre-condition: tensorDimXStride is less than 2^48-1
+ // TODO: Validation if the value breaks the pre-condition.
+ // In a following PR add a flag that instruments conditions that need to be
+ // checked at runtime.
Value tensorDimXStride;
if (auto attr = dyn_cast<Attribute>(tensorDimXStrideOpFoldResult))
tensorDimXStride =
@@ -2778,6 +2807,10 @@ struct AMDGPUMakeDmaDescriptorLowering
Value iterationCount = adaptor.getIterationCount();
IntegerType i32 = rewriter.getI32Type();
// pre-condition: iterationCount is in the inclusive interval [1, 256].
+ // TODO: validation if the value breaks the pre-condition.
+ // If the pre-condition fails, there is a possibility of
+ // affecting the higher bits. In a following PR add a flag that instruments
+ // conditions that need to be checked at runtime.
iterationCount = LLVM::TruncOp::create(rewriter, loc, i32, iterationCount);
iterationCount =
LLVM::SubOp::create(rewriter, loc, iterationCount, consts[1]);
>From c9dc3a7efbf4288d506f73b18b1f305d9d228bb6 Mon Sep 17 00:00:00 2001
From: Erick Ochoa <erick.ochoalopez at amd.com>
Date: Wed, 10 Dec 2025 11:06:38 -0500
Subject: [PATCH 08/10] Add test for dynamic case
---
.../AMDGPUToROCDL/AMDGPUToROCDL.cpp | 21 ++++++--
.../Conversion/AMDGPUToROCDL/gfx1250.mlir | 50 ++++++++++++++++++-
2 files changed, 65 insertions(+), 6 deletions(-)
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 1c329ef4fadcd..f074a0e528b90 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -2507,7 +2507,10 @@ struct AMDGPUMakeDmaDescriptorLowering
Location loc, Value sgpr1, Value sgpr2,
ArrayRef<Value> consts, uint64_t dimX,
uint32_t offset) const {
- SmallVector<OpFoldResult> mixedGlobalSizes = op.getMixedGlobalSizes();
+ ArrayRef<int64_t> globalStaticSizes = adaptor.getGlobalStaticSizes();
+ ValueRange globalDynamicSizes = adaptor.getGlobalDynamicSizes();
+ SmallVector<OpFoldResult> mixedGlobalSizes =
+ getMixedValues(globalStaticSizes, globalDynamicSizes, rewriter);
if (mixedGlobalSizes.size() <= dimX)
return {sgpr1, sgpr2};
@@ -2559,8 +2562,10 @@ struct AMDGPUMakeDmaDescriptorLowering
ConversionPatternRewriter &rewriter, Location loc,
Value sgpr, ArrayRef<Value> consts, size_t dimX,
int64_t offset) const {
- SmallVector<OpFoldResult> mixedSharedSizes = op.getMixedSharedSizes();
-
+ ArrayRef<int64_t> sharedStaticSizes = adaptor.getSharedStaticSizes();
+ ValueRange sharedDynamicSizes = adaptor.getSharedDynamicSizes();
+ SmallVector<OpFoldResult> mixedSharedSizes =
+ getMixedValues(sharedStaticSizes, sharedDynamicSizes, rewriter);
if (mixedSharedSizes.size() <= dimX)
return sgpr;
@@ -2608,7 +2613,10 @@ struct AMDGPUMakeDmaDescriptorLowering
ConversionPatternRewriter &rewriter, Location loc,
Value sgprY, Value sgprZ, ArrayRef<Value> consts,
size_t dimX, int64_t offset) const {
- SmallVector<OpFoldResult> mixedGlobalStrides = op.getMixedGlobalStrides();
+ ArrayRef<int64_t> globalStaticStrides = adaptor.getGlobalStaticStrides();
+ ValueRange globalDynamicStrides = adaptor.getGlobalDynamicStrides();
+ SmallVector<OpFoldResult> mixedGlobalStrides =
+ getMixedValues(globalStaticStrides, globalDynamicStrides, rewriter);
if (mixedGlobalStrides.size() <= dimX)
return {sgprY, sgprZ};
@@ -2711,7 +2719,10 @@ struct AMDGPUMakeDmaDescriptorLowering
ConversionPatternRewriter &rewriter, Location loc,
Value sgpr0, ArrayRef<Value> consts, int64_t dimX,
int64_t offset) const {
- SmallVector<OpFoldResult> mixedGlobalSizes = op.getMixedGlobalSizes();
+ ArrayRef<int64_t> globalStaticSizes = adaptor.getGlobalStaticSizes();
+ ValueRange globalDynamicSizes = adaptor.getGlobalDynamicSizes();
+ SmallVector<OpFoldResult> mixedGlobalSizes =
+ getMixedValues(globalStaticSizes, globalDynamicSizes, rewriter);
if (mixedGlobalSizes.size() <= static_cast<unsigned long>(dimX))
return sgpr0;
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
index 6a80a7661438a..9d97ddbb1bec9 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
@@ -460,10 +460,58 @@ func.func @make_dma_descriptor_pad_enable(%base: !amdgpu.tdm_base<i32>, %pad_amo
// -----
+// CHECK-LABEL: func @make_dma_descriptor_dynamic
+// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>, %[[GS0:.+]]: index, %[[GS1:.+]]: index, %[[GST1:.+]]: index, %[[SHS0:.+]]: index, %[[SHS1:.+]]: index)
+func.func @make_dma_descriptor_dynamic(%base: !amdgpu.tdm_base<i32>, %gs0: index, %gs1: index, %gst1: index, %shs0: index, %shs1: index) -> !amdgpu.tdm_descriptor {
+ // CHECK-DAG: %[[DGROUP0:.+]] = builtin.unrealized_conversion_cast %[[BASE]]
+ // CHECK-DAG: %[[GS0I:.+]] = builtin.unrealized_conversion_cast %[[GS0]]
+ // CHECK-DAG: %[[GS1I:.+]] = builtin.unrealized_conversion_cast %[[GS1]]
+ // CHECK-DAG: %[[GST1I:.+]] = builtin.unrealized_conversion_cast %[[GST1]]
+ // CHECK-DAG: %[[SHS0I:.+]] = builtin.unrealized_conversion_cast %[[SHS0]]
+ // CHECK-DAG: %[[SHS1I:.+]] = builtin.unrealized_conversion_cast %[[SHS1]]
+
+ // CHECK-DAG: %[[C0:.+]] = llvm.mlir.constant(0 : i32)
+ // CHECK-DAG: %[[C1:.+]] = llvm.mlir.constant(1 : i32)
+ // CHECK-DAG: %[[C2:.+]] = llvm.mlir.constant(2 : i32)
+ // CHECK-DAG: %[[C3:.+]] = llvm.mlir.constant(3 : i32)
+ // CHECK-DAG: %[[C4:.+]] = llvm.mlir.constant(4 : i32)
+ // CHECK-DAG: %[[C5:.+]] = llvm.mlir.constant(5 : i32)
+ // CHECK-DAG: %[[C6:.+]] = llvm.mlir.constant(6 : i32)
+ // CHECK-DAG: %[[C7:.+]] = llvm.mlir.constant(7 : i32)
+
+ // CHECK: %[[TENSOR_DIM_0:.+]] = llvm.trunc %[[GS0I]] : i64 to i32
+ // CHECK: %[[SHIFT:.+]] = llvm.mlir.constant(16 : i32)
+ // CHECK: %[[SGPR1:.+]] = llvm.shl %[[TENSOR_DIM_0]], %[[SHIFT]]
+
+ // CHECK: %[[TENSOR_DIM_1:.+]] = llvm.trunc %[[GS1I]] : i64 to i32
+ // CHECK: %[[SHIFT:.+]] = llvm.mlir.constant(16 : i32)
+ // CHECK: %[[SGPR2:.+]] = llvm.shl %[[TENSOR_DIM_1]], %[[SHIFT]]
+
+ // CHECK: %[[TILE_DIM_0:.+]] = llvm.trunc %[[SHS0I]] : i64 to i32
+ // CHECK: %[[SHIFT:.+]] = llvm.mlir.constant(16 : i32)
+ // CHECK: %[[TILE_DIM_0_SHIFTED:.+]] = llvm.shl %[[TILE_DIM_0]], %[[SHIFT]]
+
+ // CHECK: %[[TILE_DIM_1:.+]] = llvm.trunc %[[SHS1I]] : i64 to i32
+
+ // CHECK-DAG: %[[TENSOR_DIM_0_STRIDE:.+]] = llvm.mlir.constant(1 : i64) : i64
+ // CHECK: %[[MASK:.+]] = llvm.mlir.constant(281474976710655 : i64) : i64
+
+ // CHECK-DAG: %[[MASK:.+]] = llvm.mlir.constant(281474976710655 : i64) : i64
+ // CHECK: %[[TENSOR_DIM_1_STRIDE_MASKED:.+]] = llvm.and %[[MASK]], %[[GST1I]]
+ // CHECK-DAG: %[[TENSOR_DIM_1_STRIDE_LOW:.+]] = llvm.trunc %[[TENSOR_DIM_1_STRIDE_MASKED]]
+ // CHECK-DAG: %[[SHIFT:.+]] = llvm.mlir.constant(16 : i32) : i32
+ // CHECK: %[[TENSOR_DIM_1_STRIDE_LOW_SHIFTED:.+]] = llvm.shl %[[TENSOR_DIM_1_STRIDE_LOW]], %[[SHIFT]]
+
+
+ %descriptor = amdgpu.make_dma_descriptor %base globalSize [%gs1, %gs0] globalStride [%gst1, 1] sharedSize [%shs1, %shs0] : !amdgpu.tdm_base<i32> -> !amdgpu.tdm_descriptor
+ func.return %descriptor : !amdgpu.tdm_descriptor
+}
+
+// -----
+
// CHECK-LABEL: func @make_dma_descriptor
// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>)
func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>) -> !amdgpu.tdm_descriptor {
-
// CHECK-DAG: %[[DGROUP0:.+]] = builtin.unrealized_conversion_cast %[[BASE]]
// CHECK-DAG: %[[C0:.+]] = llvm.mlir.constant(0 : i32)
>From 68b167d772e91de23fe2a8eb650aecf949ed1b0c Mon Sep 17 00:00:00 2001
From: Erick Ochoa <erick.ochoalopez at amd.com>
Date: Wed, 10 Dec 2025 11:20:39 -0500
Subject: [PATCH 09/10] i16 to vector<16xi1>
---
mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 9 +++++++--
mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp | 2 ++
mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir | 7 ++++---
mlir/test/Dialect/AMDGPU/ops.mlir | 4 ++--
4 files changed, 15 insertions(+), 7 deletions(-)
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 11fc6da504a95..6fbc90ded5824 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -110,9 +110,14 @@ def AMDGPU_TDMDescriptorType : AMDGPU_Type<"TDMDescriptor", "tdm_descriptor"> {
This type is opaque and corresponds to the two or four descriptor groups
used in tensor_load_to_lds or tensor_store_from_lds.
}];
-
}
+class AMDGPU_ConcreteVector<Type elem, int length> :
+ FixedVectorOfLengthAndType<[length], [elem]>,
+ BuildableType<
+ "::mlir::VectorType::get({" # length # "} ,"
+ # elem.builderCall # ")">;
+
//===----------------------------------------------------------------------===//
// AMDGPU Op definitions
//===----------------------------------------------------------------------===//
@@ -1296,7 +1301,7 @@ def AMDGPU_MakeDmaDescriptorOp :
DenseI64ArrayAttr: $global_static_strides,
Variadic<Index>: $shared_dynamic_sizes,
DenseI64ArrayAttr: $shared_static_sizes,
- Optional<I16>: $workgroup_mask,
+ Optional<AMDGPU_ConcreteVector<I1, 16>>: $workgroup_mask,
Optional<I1>: $early_timeout,
Optional<I32>: $pad_amount,
Optional<I32>: $pad_interval,
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index f074a0e528b90..592731778a8e4 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -2381,6 +2381,8 @@ struct AMDGPUMakeDmaDescriptorLowering
if (!mask)
return sgpr0;
+ Type i16 = rewriter.getI16Type();
+ mask = LLVM::BitcastOp::create(rewriter, loc, i16, mask);
Type i32 = rewriter.getI32Type();
Value extendedMask = LLVM::ZExtOp::create(rewriter, loc, i32, mask);
return setValueAtOffset(rewriter, loc, sgpr0, extendedMask, 0);
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
index 9d97ddbb1bec9..9fae56fa3ab3b 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
@@ -642,8 +642,8 @@ func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>) -> !amdgpu.tdm_desc
// -----
// CHECK-LABEL: func @make_dma_descriptor_workgroup_mask
-// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>, %[[WG_MASK:.+]]: i16, %[[TIMEOUT:.+]]: i1)
-func.func @make_dma_descriptor_workgroup_mask(%base: !amdgpu.tdm_base<i32>, %wg_mask: i16, %timeout: i1) -> !amdgpu.tdm_descriptor {
+// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>, %[[WG_MASK:.+]]: vector<16xi1>, %[[TIMEOUT:.+]]: i1)
+func.func @make_dma_descriptor_workgroup_mask(%base: !amdgpu.tdm_base<i32>, %wg_mask: vector<16xi1>, %timeout: i1) -> !amdgpu.tdm_descriptor {
// CHECK-DAG: %[[DGROUP0:.+]] = builtin.unrealized_conversion_cast %[[BASE]]
// CHECK-DAG: %[[C0:.+]] = llvm.mlir.constant(0 : i32)
@@ -655,7 +655,8 @@ func.func @make_dma_descriptor_workgroup_mask(%base: !amdgpu.tdm_base<i32>, %wg_
// CHECK-DAG: %[[C6:.+]] = llvm.mlir.constant(6 : i32)
// CHECK-DAG: %[[C7:.+]] = llvm.mlir.constant(7 : i32)
- // CHECK-DAG: %[[WG_MASK_EXT:.+]] = llvm.zext %[[WG_MASK]]
+ // CHECK: %[[WG_MASK_CAST:.+]] = llvm.bitcast %[[WG_MASK]] : vector<16xi1> to i16
+ // CHECK-DAG: %[[WG_MASK_EXT:.+]] = llvm.zext %[[WG_MASK_CAST]]
// CHECK-DAG: %[[C16:.+]] = llvm.mlir.constant(16 : i32)
// CHECK: %[[DATA_SIZE_SHIFTED:.+]] = llvm.shl %[[C2]], %[[C16]]
// CHECK: %[[SGPR0_BASE:.+]] = llvm.or %[[WG_MASK_EXT]], %[[DATA_SIZE_SHIFTED]]
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index 9dd746827958f..6a054fcc2ba71 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -697,8 +697,8 @@ func.func @make_dma_base(%idx: index, %mem: memref<8xi32>, %smem: memref<8xi32,
}
// CHECK-LABEL: func @make_dma_descriptor
-// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>, %[[WG_MASK:.+]]: i16, %[[TIMEOUT:.+]]: i1, %[[BARRIER:.+]]: memref<8xi32, #gpu.address_space<workgroup>>, %[[IDX:.+]]: index, %[[I32:.+]]: i32)
-func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>, %wg_mask: i16, %timeout: i1, %barrier: memref<8xi32, #gpu.address_space<workgroup>>, %idx: index, %i32: i32) {
+// CHECK-SAME: (%[[BASE:.+]]: !amdgpu.tdm_base<i32>, %[[WG_MASK:.+]]: vector<16xi1>, %[[TIMEOUT:.+]]: i1, %[[BARRIER:.+]]: memref<8xi32, #gpu.address_space<workgroup>>, %[[IDX:.+]]: index, %[[I32:.+]]: i32)
+func.func @make_dma_descriptor(%base: !amdgpu.tdm_base<i32>, %wg_mask: vector<16xi1>, %timeout: i1, %barrier: memref<8xi32, #gpu.address_space<workgroup>>, %idx: index, %i32: i32) {
// CHECK: amdgpu.make_dma_descriptor %[[BASE]]
amdgpu.make_dma_descriptor %base
>From c9f3558235e8194734fe4d8d985233060d7c0ce1 Mon Sep 17 00:00:00 2001
From: Erick Ochoa <erick.ochoalopez at amd.com>
Date: Wed, 10 Dec 2025 14:07:42 -0500
Subject: [PATCH 10/10] review comments
---
.../AMDGPUToROCDL/AMDGPUToROCDL.cpp | 45 ++++++++++---------
1 file changed, 25 insertions(+), 20 deletions(-)
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 592731778a8e4..a3989ccfb9ed4 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -2445,8 +2445,9 @@ struct AMDGPUMakeDmaDescriptorLowering
// pre-condition: padInterval can be a power of two between 2 and 256.
// TODO: Validation if the value breaks the pre-condition.
// If the pre-condition fails, there is a possibility of
- // affecting the higher bits. In a following PR add a flag
- // that instruments conditions that need to be checked at runtime.
+ // affecting the higher bits. In a following PR implement
+ // RuntimeVerifiableOpInterface that instruments conditions that need to be
+ // checked at runtime.
IntegerType i32 = rewriter.getI32Type();
Value padInterval = adaptor.getPadInterval();
padInterval = LLVM::CountTrailingZerosOp::create(rewriter, loc, i32,
@@ -2465,8 +2466,9 @@ struct AMDGPUMakeDmaDescriptorLowering
// pre-condition: padAmount is a value between 1-128.
// TODO: Validation if the value breaks the pre-condition.
// If the pre-condition fails, there is a possibility of
- // affecting the higher bits. In a following PR add a flag
- // that instruments conditions that need to be checked at runtime.
+ // affecting the higher bits. In a following PR implement
+ // RuntimeVerifiableOpInterface that instruments conditions that need to be
+ // checked at runtime.
Value padAmount = adaptor.getPadAmount();
padAmount = LLVM::SubOp::create(rewriter, loc, padAmount, consts[1]);
// post-condition: padAmount is a value between 0-127.
@@ -2491,8 +2493,8 @@ struct AMDGPUMakeDmaDescriptorLowering
// pre-condition: atomicBarrierAddress is aligned to 8 bytes which implies
// that the 3 LSBs are zero.
// TODO: Validation if the value breaks the pre-condition.
- // In a following PR add a flag that instruments conditions that need to be
- // checked at runtime.
+ // In a following PR implement RuntimeVerifiableOpInterface
+ // that instruments conditions that need to be checked at runtime.
atomicBarrierAddress =
LLVM::PtrToIntOp::create(rewriter, loc, i32, atomicBarrierAddress);
atomicBarrierAddress =
@@ -2520,10 +2522,10 @@ struct AMDGPUMakeDmaDescriptorLowering
// pre-condition: tensorDimX is less than 2^48-1
// TODO: Validation if the value breaks the pre-condition.
// If the pre-condition fails, there is a possibility of
- // affecting the higher bits. In a following PR add a flag
- // that instruments conditions that need to be checked at runtime.
- // This could also be fixed by saying that mixedGlobalSizes is a
- // DynamicI48List.
+ // affecting the higher bits. In a following PR implement
+ // RuntimeVerifiableOpInterface that instruments conditions that need to be
+ // checked at runtime. This could also be fixed by saying that
+ // mixedGlobalSizes is a DynamicI48List.
Value tensorDimX;
if (auto attr = dyn_cast<Attribute>(tensorDimXOpFoldResult))
tensorDimX =
@@ -2575,10 +2577,10 @@ struct AMDGPUMakeDmaDescriptorLowering
// pre-condition: tileDimX is less than 2^16-1
// TODO: Validation if the value breaks the pre-condition.
// If the pre-condition fails, there is a possibility of
- // affecting the higher bits. In a following PR add a flag
- // that instruments conditions that need to be checked at runtime.
- // This could also be fixed by saying that mixedSharedSizes is a
- // DynamicI16List.
+ // affecting the higher bits. In a following PR implement
+ // RuntimeVerifiableOpInterface that instruments conditions that need to be
+ // checked at runtime. This could also be fixed by saying that
+ // mixedSharedSizes is a DynamicI16List.
Value tileDimX;
if (auto attr = dyn_cast<Attribute>(tileDimXOpFoldResult))
tileDimX =
@@ -2627,8 +2629,8 @@ struct AMDGPUMakeDmaDescriptorLowering
*(mixedGlobalStrides.rbegin() + dimX);
// pre-condition: tensorDimXStride is less than 2^48-1
// TODO: Validation if the value breaks the pre-condition.
- // In a following PR add a flag that instruments conditions that need to be
- // checked at runtime.
+ // In a following PR implement RuntimeVerifiableOpInterface that instruments
+ // conditions that need to be checked at runtime.
Value tensorDimXStride;
if (auto attr = dyn_cast<Attribute>(tensorDimXStrideOpFoldResult))
tensorDimXStride =
@@ -2822,8 +2824,9 @@ struct AMDGPUMakeDmaDescriptorLowering
// pre-condition: iterationCount is in the inclusive interval [1, 256].
// TODO: validation if the value breaks the pre-condition.
// If the pre-condition fails, there is a possibility of
- // affecting the higher bits. In a following PR add a flag that instruments
- // conditions that need to be checked at runtime.
+ // affecting the higher bits. In a following PR implement
+ // RuntimeVerifiableOpInterface that instruments conditions that need to be
+ // checked at runtime.
iterationCount = LLVM::TruncOp::create(rewriter, loc, i32, iterationCount);
iterationCount =
LLVM::SubOp::create(rewriter, loc, iterationCount, consts[1]);
@@ -2851,7 +2854,8 @@ struct AMDGPUMakeDmaDescriptorLowering
Type v4i32 = this->typeConverter->convertType(VectorType::get(4, i32));
assert(v4i32 && "expected type conversion to succeed.");
- if (!op.getLdsIncrement() && op.getRank() == 2)
+ bool onlyNeedsTwoDescriptors = !op.getLdsIncrement() && op.getRank() <= 2;
+ if (onlyNeedsTwoDescriptors)
return LLVM::PoisonOp::create(rewriter, loc, v4i32);
constexpr int64_t sgprlen = 4;
@@ -2910,7 +2914,8 @@ struct AMDGPUMakeDmaDescriptorLowering
IntegerType i32 = rewriter.getI32Type();
Type v4i32 = this->typeConverter->convertType(VectorType::get(4, i32));
assert(v4i32 && "expected type conversion to succeed.");
- if (!op.getLdsIncrement() && op.getRank() == 2)
+ bool onlyNeedsTwoDescriptors = !op.getLdsIncrement() && op.getRank() <= 2;
+ if (onlyNeedsTwoDescriptors)
return LLVM::PoisonOp::create(rewriter, loc, v4i32);
constexpr int32_t sgprlen = 4;
More information about the Mlir-commits
mailing list