[Mlir-commits] [mlir] [mlir][affine][gpu] support unroll dynamic value and apply it to gpu.thread_id op (PR #128113)
llvmlistbot at llvm.org
Thu Feb 20 18:48:52 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-mlir
Author: lonely eagle (linuxlonelyeagle)
<details>
<summary>Changes</summary>
I think this is a great improvement; it contains the following changes:
* Added support for GPU unroll.
Although the thread_id op is a dynamic Value, the trip count of a loop can be determined from its range. Even though the work is divided across threads, the AffineMap only performs arithmetic on thread_id, so further calculations on it can be regarded as calculations on thread_id inside the loop. A short sketch illustrating this is shown after this list.
* Added logic for dynamic value inference.
This PR only implements the inference for thread_id, but the same approach applies to other dynamic values.
* Removed invalid loops (minor change).
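
To make the idea concrete, here is a minimal sketch (not part of the diff; the function name and constants are made up, modeled on the tests below) of a loop whose lower bound is a thread id. With a block size of 2, every thread runs the loop exactly 3 times, so it can be fully unrolled even though the bound is a dynamic value:

```mlir
// Hypothetical example: %tid is in [0, blockDim.x) = [0, 2), so the loop runs
// ceil((6 - %tid) / 2) = 3 times for every thread and can be fully unrolled.
func.func @sketch() {
  %c2 = arith.constant 2 : index
  gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %c2, %sz_by = %c2, %sz_bz = %c2)
             threads(%tx, %ty, %tz) in (%sz_tx = %c2, %sz_ty = %c2, %sz_tz = %c2) {
    %tid = gpu.thread_id x
    affine.for %iv = %tid to 6 step 2 {
      // loop body
    }
    gpu.terminator
  }
  return
}
```

Conceptually, `getConstantTripCount` substitutes the maximum thread id (block size - 1) and `getMaxConstantTripCount` substitutes 0, and the loop is only unrolled or promoted when both results allow it.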
---
Full diff: https://github.com/llvm/llvm-project/pull/128113.diff
7 Files Affected:
- (modified) mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h (+4)
- (modified) mlir/include/mlir/Dialect/Affine/LoopUtils.h (+3)
- (modified) mlir/include/mlir/Dialect/GPU/IR/GPUOps.td (+6)
- (modified) mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp (+96-14)
- (modified) mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp (+46-11)
- (modified) mlir/lib/Dialect/GPU/IR/GPUDialect.cpp (+20)
- (modified) mlir/test/Dialect/Affine/unroll.mlir (+110)
``````````diff
diff --git a/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h b/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h
index ed3c21d952a01..2bd540b9af2eb 100644
--- a/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h
+++ b/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h
@@ -43,6 +43,10 @@ void getTripCountMapAndOperands(AffineForOp forOp, AffineMap *map,
/// constant trip count in non-trivial cases.
std::optional<uint64_t> getConstantTripCount(AffineForOp forOp);
+/// On GPUs, the trip count of a loop can differ between threads. This
+/// function returns the maximum trip count across all threads.
+std::optional<uint64_t> getMaxConstantTripCount(AffineForOp forOp);
+
/// Returns the greatest known integral divisor of the trip count. Affine
/// expression analysis is used (indirectly through getTripCount), and
/// this method is thus able to determine non-trivial divisors.
diff --git a/mlir/include/mlir/Dialect/Affine/LoopUtils.h b/mlir/include/mlir/Dialect/Affine/LoopUtils.h
index 7fe1f6d48ceeb..1d1d6d94d2382 100644
--- a/mlir/include/mlir/Dialect/Affine/LoopUtils.h
+++ b/mlir/include/mlir/Dialect/Affine/LoopUtils.h
@@ -86,6 +86,9 @@ LogicalResult loopUnrollJamUpToFactor(AffineForOp forOp,
/// was known to have a single iteration.
LogicalResult promoteIfSingleIteration(AffineForOp forOp);
+/// Eliminate loops that will never actually execute.
+LogicalResult removeInvalidLoop(AffineForOp forOp);
+
/// Promotes all single iteration AffineForOp's in the Function, i.e., moves
/// their body into the containing Block.
void promoteSingleIterationLoops(func::FuncOp f);
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index 2b1ce573effd0..940d47c5ef2c8 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -1035,6 +1035,12 @@ def GPU_LaunchOp : GPU_Op<"launch", [
static StringRef getNumWorkgroupAttributionsAttrName() {
return "workgroup_attributions";
}
+
+  /// Find the block size along the axis of the given thread-id BlockArgument of gpu.launch.
+  Value getBlockSizeOnAxis(Value threadId);
+
+  /// Find the block size along the given dimension.
+  Value getBlockSizeOnAxis(Dimension dimension);
}];
let hasCanonicalizer = 1;
diff --git a/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp b/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp
index 0d4b0ea1668e0..15a5376fa922e 100644
--- a/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp
+++ b/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp
@@ -18,6 +18,7 @@
#include "mlir/Dialect/Affine/Analysis/NestedMatcher.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/IR/AffineValueMap.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/ADT/DenseSet.h"
@@ -84,6 +85,67 @@ void mlir::affine::getTripCountMapAndOperands(
tripCountValueMap.getOperands().end());
}
+/// Replace thread_id with its maximum value; if `replaceWithZero` is true,
+/// thread_id is replaced by its minimum value, 0.
+static void replaceGPUOperands(AffineForOp forOp,
+ SmallVectorImpl<Value> &operands,
+ SmallVectorImpl<AffineExpr> &symReplacements,
+ unsigned numDim, bool replaceWithZero = false) {
+ auto launchOp = forOp->getParentOfType<gpu::LaunchOp>();
+ if (!launchOp)
+ return;
+
+ // `b` is only used to create `AffineExpr`.
+ Builder b(forOp.getContext());
+ unsigned idx = 0;
+
+ for (unsigned i = numDim, e = operands.size(); i < e; ++i) {
+ Value operand = operands[i];
+ if (Value blockSize = launchOp.getBlockSizeOnAxis(operand)) {
+ operands[i] = blockSize;
+ if (!replaceWithZero)
+ symReplacements.push_back(b.getAffineSymbolExpr(idx++) - 1);
+ else
+ symReplacements.push_back(b.getAffineConstantExpr(0));
+ continue;
+ }
+
+ Operation *defOp = operand.getDefiningOp();
+ if (!defOp) {
+ ++idx;
+ continue;
+ }
+
+ if (auto threadIdOp = mlir::dyn_cast<gpu::ThreadIdOp>(defOp)) {
+ gpu::Dimension dimension = threadIdOp.getDimension();
+ operands[i] = launchOp.getBlockSizeOnAxis(dimension);
+ if (!replaceWithZero)
+ symReplacements.push_back(b.getAffineSymbolExpr(idx++) - 1);
+ else
+ symReplacements.push_back(b.getAffineConstantExpr(0));
+ continue;
+ }
+ ++idx;
+ }
+}
+
+/// Take the min if all trip counts are constant.
+static std::optional<uint64_t>
+getConstantTripCountFromAffineMap(AffineMap map) {
+ std::optional<uint64_t> tripCount;
+ for (auto resultExpr : map.getResults()) {
+ auto constExpr = dyn_cast<AffineConstantExpr>(resultExpr);
+ if (!constExpr)
+ return std::nullopt;
+ if (tripCount.has_value())
+ tripCount =
+ std::min(*tripCount, static_cast<uint64_t>(constExpr.getValue()));
+ else
+ tripCount = constExpr.getValue();
+ }
+ return tripCount;
+}
+
/// Returns the trip count of the loop if it's a constant, std::nullopt
/// otherwise. This method uses affine expression analysis (in turn using
/// getTripCount) and is able to determine constant trip count in non-trivial
@@ -95,20 +157,34 @@ std::optional<uint64_t> mlir::affine::getConstantTripCount(AffineForOp forOp) {
if (!map)
return std::nullopt;
+ SmallVector<AffineExpr, 4> symReplacements;
+ replaceGPUOperands(forOp, operands, symReplacements, map.getNumDims());
+ map = map.replaceDimsAndSymbols({}, symReplacements, map.getNumDims(),
+ map.getNumSymbols());
+ affine::AffineValueMap valueMap(map, operands);
+ (void)valueMap.canonicalize();
+ map = valueMap.getAffineMap();
+ return getConstantTripCountFromAffineMap(map);
+}
- // Take the min if all trip counts are constant.
- std::optional<uint64_t> tripCount;
- for (auto resultExpr : map.getResults()) {
- if (auto constExpr = dyn_cast<AffineConstantExpr>(resultExpr)) {
- if (tripCount.has_value())
- tripCount =
- std::min(*tripCount, static_cast<uint64_t>(constExpr.getValue()));
- else
- tripCount = constExpr.getValue();
- } else
- return std::nullopt;
- }
- return tripCount;
+/// In some scenarios, such as on GPUs, the trip count of a loop can differ
+/// between threads. This function returns the maximum trip count.
+std::optional<uint64_t>
+mlir::affine::getMaxConstantTripCount(AffineForOp forOp) {
+ SmallVector<Value, 4> operands;
+ AffineMap map;
+ getTripCountMapAndOperands(forOp, &map, &operands);
+
+ if (!map)
+ return std::nullopt;
+ SmallVector<AffineExpr, 4> symReplacements;
+ replaceGPUOperands(forOp, operands, symReplacements, map.getNumDims(), true);
+ map = map.replaceDimsAndSymbols({}, symReplacements, map.getNumDims(),
+ map.getNumSymbols());
+ affine::AffineValueMap valueMap(map, operands);
+ (void)valueMap.canonicalize();
+ map = valueMap.getAffineMap();
+ return getConstantTripCountFromAffineMap(map);
}
/// Returns the greatest known integral divisor of the trip count. Affine
@@ -121,7 +197,13 @@ uint64_t mlir::affine::getLargestDivisorOfTripCount(AffineForOp forOp) {
if (!map)
return 1;
-
+ SmallVector<AffineExpr, 4> symReplacements;
+ replaceGPUOperands(forOp, operands, symReplacements, map.getNumDims());
+ map = map.replaceDimsAndSymbols({}, symReplacements, map.getNumDims(),
+ map.getNumSymbols());
+ affine::AffineValueMap valueMap(map, operands);
+ (void)valueMap.canonicalize();
+ map = valueMap.getAffineMap();
// The largest divisor of the trip count is the GCD of the individual largest
// divisors.
assert(map.getNumResults() >= 1 && "expected one or more results");
diff --git a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
index 4e02559a08949..69ceb0f80095b 100644
--- a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
+++ b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
@@ -17,6 +17,7 @@
#include "mlir/Dialect/Affine/IR/AffineValueMap.h"
#include "mlir/Dialect/Affine/Utils.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/IR/IRMapping.h"
@@ -113,11 +114,29 @@ static void replaceIterArgsAndYieldResults(AffineForOp forOp) {
std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
}
+/// Eliminate loops that will never actually execute.
+LogicalResult mlir::affine::removeInvalidLoop(AffineForOp forOp) {
+ std::optional<uint64_t> tripCount = getConstantTripCount(forOp);
+ std::optional<uint64_t> maxTripCount = getMaxConstantTripCount(forOp);
+ if (!tripCount || *tripCount > 0 || !maxTripCount || *maxTripCount > 0)
+ return failure();
+
+ auto iterOperands = forOp.getInits();
+ auto results = forOp.getResults();
+ for (auto [result, operand] : llvm::zip(results, iterOperands))
+ result.replaceAllUsesWith(operand);
+
+ IRRewriter b(forOp);
+ b.eraseOp(forOp);
+ return success();
+}
+
/// Promotes the loop body of a forOp to its containing block if the forOp
/// was known to have a single iteration.
LogicalResult mlir::affine::promoteIfSingleIteration(AffineForOp forOp) {
std::optional<uint64_t> tripCount = getConstantTripCount(forOp);
- if (!tripCount || *tripCount != 1)
+ std::optional<uint64_t> maxTripCount = getMaxConstantTripCount(forOp);
+ if (!tripCount || *tripCount != 1 || !maxTripCount || *maxTripCount != 1)
return failure();
// TODO: extend this for arbitrary affine bounds.
@@ -160,7 +179,8 @@ LogicalResult mlir::affine::promoteIfSingleIteration(AffineForOp forOp) {
forOp.getBody()->back().erase();
parentBlock->getOperations().splice(Block::iterator(forOp),
forOp.getBody()->getOperations());
- forOp.erase();
+ IRRewriter b(forOp.getContext());
+ b.eraseOp(forOp);
return success();
}
@@ -884,15 +904,27 @@ void mlir::affine::getTileableBands(
/// Unrolls this loop completely.
LogicalResult mlir::affine::loopUnrollFull(AffineForOp forOp) {
std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
- if (mayBeConstantTripCount.has_value()) {
- uint64_t tripCount = *mayBeConstantTripCount;
- if (tripCount == 0)
- return success();
- if (tripCount == 1)
- return promoteIfSingleIteration(forOp);
- return loopUnrollByFactor(forOp, tripCount);
- }
- return failure();
+ std::optional<uint64_t> maxMayBeConstantTripCount =
+ getMaxConstantTripCount(forOp);
+
+ if (!mayBeConstantTripCount.has_value() &&
+ !maxMayBeConstantTripCount.has_value())
+ return failure();
+
+ uint64_t tripCount = *mayBeConstantTripCount;
+ uint64_t maxTripCount = *maxMayBeConstantTripCount;
+
+  // Both trip counts are 0, so the loop never executes and can be deleted.
+ if (tripCount <= 0 && maxTripCount <= 0)
+ return removeInvalidLoop(forOp);
+
+  // In special cases, such as on a GPU, only some threads execute this loop.
+ if (tripCount == 0 && maxTripCount == 1)
+ return success();
+
+ if (tripCount == 1 && maxTripCount == 1)
+ return promoteIfSingleIteration(forOp);
+ return loopUnrollByFactor(forOp, tripCount);
}
/// Unrolls this loop by the specified factor or by the trip count (if constant)
@@ -1013,8 +1045,11 @@ LogicalResult mlir::affine::loopUnrollByFactor(
assert(unrollFactor > 0 && "unroll factor should be positive");
std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
+ std::optional<uint64_t> maxMayBeConstantTripCount =
+ getMaxConstantTripCount(forOp);
if (unrollFactor == 1) {
if (mayBeConstantTripCount && *mayBeConstantTripCount == 1 &&
+ maxMayBeConstantTripCount && *maxMayBeConstantTripCount == 1 &&
failed(promoteIfSingleIteration(forOp)))
return failure();
return success();
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index d06f10d3137a1..31051ed7e55a2 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -799,6 +799,26 @@ std::optional<KernelDim3> LaunchOp::getClusterSizeOperandValues() {
return KernelDim3{operands[6], operands[7], operands[8]};
}
+Value LaunchOp::getBlockSizeOnAxis(Dimension dimension) {
+ if (dimension == Dimension::x)
+ return getBlockSizeX();
+ else if (dimension == Dimension::y)
+ return getBlockSizeY();
+ else
+ return getBlockSizeZ();
+}
+
+Value LaunchOp::getBlockSizeOnAxis(Value threadId) {
+ KernelDim3 threadIds = getThreadIds();
+ if (threadIds.x == threadId)
+ return getBlockSizeX();
+ else if (threadIds.y == threadId)
+ return getBlockSizeY();
+ else if (threadIds.z == threadId)
+ return getBlockSizeZ();
+ return {};
+}
+
LogicalResult LaunchOp::verify() {
if (!(hasClusterSize()) &&
(getClusterSizeX() || getClusterSizeY() || getClusterSizeZ()))
diff --git a/mlir/test/Dialect/Affine/unroll.mlir b/mlir/test/Dialect/Affine/unroll.mlir
index 574e9f41494af..a2bb0b2cac4e3 100644
--- a/mlir/test/Dialect/Affine/unroll.mlir
+++ b/mlir/test/Dialect/Affine/unroll.mlir
@@ -23,6 +23,7 @@
// UNROLL-BY-4-DAG: [[$MAP5:#map[0-9]*]] = affine_map<(d0)[s0] -> (d0 + s0 + 1)>
// UNROLL-BY-4-DAG: [[$MAP6:#map[0-9]*]] = affine_map<(d0, d1) -> (d0 * 16 + d1)>
// UNROLL-BY-4-DAG: [[$MAP11:#map[0-9]*]] = affine_map<(d0) -> (d0)>
+// UNROLL-BY-4-DAG: [[$MAP7:#map[0-9]*]] = affine_map<()[s0] -> (s0 + (((-s0 + 11) ceildiv 2) floordiv 4) * 8)>
// UNROLL-FULL-LABEL: func @loop_nest_simplest() {
func.func @loop_nest_simplest() {
@@ -258,6 +259,89 @@ gpu.module @unroll_full {
}
}
+// UNROLL-FULL-LABEL: func @thread_partial_execution
+func.func @thread_partial_execution() {
+ %0 = arith.constant 0 :index
+ %1 = arith.constant 2 : index
+ // UNROLL-FULL: %[[C0:.*]] = arith.constant 0 : index
+ gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %1, %sz_by = %1, %sz_bz = %1)
+ threads(%tx, %ty, %tz) in (%sz_tx = %1, %sz_ty = %1, %sz_tz = %1) {
+ affine.for %iv = %tx to 3 step 2 iter_args(%arg = %0) -> index {
+ %3 = arith.addi %arg, %0 : index
+ affine.yield %3 : index
+ }
+ // UNROLL-FULL: %{{.*}} = affine.for %{{.*}} = %{{.*}} to 3 step 2 iter_args(%[[ARG:.*]] = %[[C0]]) -> (index) {
+ // UNROLL-FULL: %[[SUM:.*]] = arith.addi %[[ARG]], %[[C0]] : index
+ // UNROLL-FULL: affine.yield %[[SUM]] : index
+ // UNROLL-FULL: }
+ gpu.terminator
+ }
+ return
+}
+
+// UNROLL-FULL-LABEL: func @invalid_loop
+func.func @invalid_loop() {
+ %0 = arith.constant 0 :index
+ %1 = arith.constant 2 : index
+ gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %1, %sz_by = %1, %sz_bz = %1)
+ threads(%tx, %ty, %tz) in (%sz_tx = %1, %sz_ty = %1, %sz_tz = %1) {
+ %threadid = gpu.thread_id x
+ affine.for %iv = %tx to 0 step 2 iter_args(%arg = %0) -> index {
+ %3 = arith.addi %arg, %0 : index
+ affine.yield %3 : index
+ }
+ gpu.terminator
+ // UNROLL-FULL-CHECK: %{{.*}} = gpu.thread_id x
+ // UNROLL-FULL-CHECK: gpu.terminator
+ }
+ return
+}
+
+// UNROLL-FULL-LABEL: func @unroll_all_thread
+func.func @unroll_all_thread() {
+ %0 = arith.constant 0 :index
+ %1 = arith.constant 2 : index
+ // UNROLL-FULL-CHECK: %[[C0:.*]] = arith.constant 0 : index
+ gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %1, %sz_by = %1, %sz_bz = %1)
+ threads(%tx, %ty, %tz) in (%sz_tx = %1, %sz_ty = %1, %sz_tz = %1) {
+ %threadid = gpu.thread_id x
+ %4 = affine.for %iv = %threadid to 6 step 2 iter_args(%arg = %0) -> index {
+ %3 = arith.addi %arg, %0 : index
+ affine.yield %3 : index
+ }
+ // UNROLL-FULL-CHECK: %[[SUM_0:.*]] = arith.addi %[[C0]], %[[C0]] : index
+ // UNROLL-FULL-CHECK: %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index
+ // UNROLL-FULL-CHECK: %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index
+ gpu.terminator
+ }
+ return
+}
+
+// UNROLL-FULL-LABEL: func.func @partial_unroll_factor_4
+func.func @partial_unroll_factor_4() {
+ %0 = arith.constant 0 :index
+ %1 = arith.constant 2 : index
+ // UNROLL-FULL: %[[C0:.*]] = arith.constant 0 : index
+ gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %1, %sz_by = %1, %sz_bz = %1)
+ threads(%tx, %ty, %tz) in (%sz_tx = %1, %sz_ty = %1, %sz_tz = %1) {
+ %threadid = gpu.thread_id x
+ affine.for %iv = %threadid to 9 step 2 iter_args(%arg = %0) -> index {
+ %3 = arith.addi %arg, %0 : index
+ affine.yield %3 : index
+ }
+ gpu.terminator
+ }
+ // UNROLL-FULL: %[[ID:.*]] = gpu.thread_id x
+ // UNROLL-FULL: affine.for %{{.*}} = %[[ID]] to 9 step 8 iter_args(%[[ARG:.*]] = %[[C0]]) -> (index) {
+ // UNROLL-FULL: %[[SUM_0:.*]] = arith.addi %[[ARG]], %[[C0]] : index
+ // UNROLL-FULL: %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index
+ // UNROLL-FULL: %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index
+ // UNROLL-FULL: %[[SUM_3:.*]] = arith.addi %[[SUM_2]], %[[C0]] : index
+ // UNROLL-FULL: affine.yield %[[SUM_3]] : index
+ // UNROLL-FULL: }
+ return
+}
+
// SHORT-LABEL: func @loop_nest_outer_unroll() {
func.func @loop_nest_outer_unroll() {
// SHORT: affine.for %arg0 = 0 to 4 {
@@ -701,6 +785,32 @@ func.func @unroll_with_iter_args_and_promotion(%arg0 : f32, %arg1 : f32) -> f32
return %sum : f32
}
+// UNROLL-BY-4-LABEL: func @gpu_launch_unroll_by_factor_4
+func.func @gpu_launch_unroll_by_factor_4() {
+ %0 = arith.constant 0 :index
+ %1 = arith.constant 2 : index
+ // UNROLL-BY-4: %[[C0:.*]] = arith.constant 0 : index
+ gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %1, %sz_by = %1, %sz_bz = %1)
+ threads(%tx, %ty, %tz) in (%sz_tx = %1, %sz_ty = %1, %sz_tz = %1) {
+ %threadid = gpu.thread_id x
+ affine.for %iv = %threadid to 11 step 2 iter_args(%arg = %0) -> index {
+ %3 = arith.addi %arg, %0 : index
+ affine.yield %3 : index
+ }
+ gpu.terminator
+ }
+ // UNROLL-BY-4: %[[ID:.*]] = gpu.thread_id x
+ // UNROLL-BY-4: %[[SUM_0:.*]] = arith.addi %[[C0]], %[[C0]] : index
+ // UNROLL-BY-4: %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index
+ // UNROLL-BY-4: %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index
+ // UNROLL-BY-4: %[[SUM_3:.*]] = arith.addi %[[SUM_2]], %[[C0]] : index
+ // UNROLL-BY-4: affine.for %[[VAL_20:.*]] = [[$MAP7]](){{\[}}%[[ID]]] to 11 step 2 iter_args(%[[ARG:.*]] = %[[SUM_3]]) -> (index) {
+ // UNROLL-BY-4: %[[SUM_4:.*]] = arith.addi %[[ARG]], %[[C0]] : index
+ // UNROLL-BY-4: affine.yield %[[SUM_4]] : index
+ // UNROLL-BY-4: }
+ return
+}
+
// UNROLL-FULL: func @unroll_zero_trip_count_case
func.func @unroll_zero_trip_count_case() {
// CHECK-NEXT: affine.for %{{.*}} = 0 to 0
``````````
</details>
https://github.com/llvm/llvm-project/pull/128113
More information about the Mlir-commits mailing list