[Mlir-commits] [mlir] [mlir][affine] Use value bound inference to determine minimum/maximum trip counts in loop analysis (PR #128113)
lonely eagle
llvmlistbot at llvm.org
Mon Mar 17 19:54:42 PDT 2025
https://github.com/linuxlonelyeagle updated https://github.com/llvm/llvm-project/pull/128113
>From 23b3a7fe966996aa5c6dd9bb3b9e18c840a33075 Mon Sep 17 00:00:00 2001
From: linuxlonelyeagle <2020382038 at qq.com>
Date: Mon, 17 Feb 2025 16:58:22 +0800
Subject: [PATCH 1/5] support unroll by the gpu.launchOp.
---
.../Dialect/Affine/Analysis/LoopAnalysis.h | 4 +
mlir/include/mlir/Dialect/Affine/LoopUtils.h | 3 +
mlir/include/mlir/Dialect/GPU/IR/GPUOps.td | 6 +
.../Dialect/Affine/Analysis/LoopAnalysis.cpp | 110 +++++++++++++++---
mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp | 57 +++++++--
mlir/lib/Dialect/GPU/IR/GPUDialect.cpp | 20 ++++
mlir/test/Dialect/Affine/unroll.mlir | 110 ++++++++++++++++++
7 files changed, 285 insertions(+), 25 deletions(-)
diff --git a/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h b/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h
index ed3c21d952a01..2bd540b9af2eb 100644
--- a/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h
+++ b/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h
@@ -43,6 +43,10 @@ void getTripCountMapAndOperands(AffineForOp forOp, AffineMap *map,
/// constant trip count in non-trivial cases.
std::optional<uint64_t> getConstantTripCount(AffineForOp forOp);
+/// In the GPU, the number of trip of each thread in the loop is inconsistent.
+/// This function returns the maximum number of trip.
+std::optional<uint64_t> getMaxConstantTripCount(AffineForOp forOp);
+
/// Returns the greatest known integral divisor of the trip count. Affine
/// expression analysis is used (indirectly through getTripCount), and
/// this method is thus able to determine non-trivial divisors.
diff --git a/mlir/include/mlir/Dialect/Affine/LoopUtils.h b/mlir/include/mlir/Dialect/Affine/LoopUtils.h
index 7fe1f6d48ceeb..1d1d6d94d2382 100644
--- a/mlir/include/mlir/Dialect/Affine/LoopUtils.h
+++ b/mlir/include/mlir/Dialect/Affine/LoopUtils.h
@@ -86,6 +86,9 @@ LogicalResult loopUnrollJamUpToFactor(AffineForOp forOp,
/// was known to have a single iteration.
LogicalResult promoteIfSingleIteration(AffineForOp forOp);
+/// Eliminate loops that will never actually execute.
+LogicalResult removeInvalidLoop(AffineForOp forOp);
+
/// Promotes all single iteration AffineForOp's in the Function, i.e., moves
/// their body into the containing Block.
void promoteSingleIterationLoops(func::FuncOp f);
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index 2b1ce573effd0..940d47c5ef2c8 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -1035,6 +1035,12 @@ def GPU_LaunchOp : GPU_Op<"launch", [
static StringRef getNumWorkgroupAttributionsAttrName() {
return "workgroup_attributions";
}
+
+ /// Find BlockSize via the BlockArgument of gpu.launch.
+ Value getBlockSizeOnAxis(Value threadId);
+
+ /// Find BlockSize via the Dimension Information.
+ Value getBlockSizeOnAxis(Dimension dimension);
}];
let hasCanonicalizer = 1;
diff --git a/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp b/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp
index 0d4b0ea1668e0..15a5376fa922e 100644
--- a/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp
+++ b/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp
@@ -18,6 +18,7 @@
#include "mlir/Dialect/Affine/Analysis/NestedMatcher.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/IR/AffineValueMap.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/ADT/DenseSet.h"
@@ -84,6 +85,67 @@ void mlir::affine::getTripCountMapAndOperands(
tripCountValueMap.getOperands().end());
}
+/// Replace thread_id with its maximum value, if `replaceWithZero` is true,
+/// thread_id will be replaced by its minimum value 0.
+static void replaceGPUOperands(AffineForOp forOp,
+ SmallVectorImpl<Value> &operands,
+ SmallVectorImpl<AffineExpr> &symReplacements,
+ unsigned numDim, bool replaceWithZero = false) {
+ auto launchOp = forOp->getParentOfType<gpu::LaunchOp>();
+ if (!launchOp)
+ return;
+
+ // `b` is only used to create `AffineExpr`.
+ Builder b(forOp.getContext());
+ unsigned idx = 0;
+
+ for (unsigned i = numDim, e = operands.size(); i < e; ++i) {
+ Value operand = operands[i];
+ if (Value blockSize = launchOp.getBlockSizeOnAxis(operand)) {
+ operands[i] = blockSize;
+ if (!replaceWithZero)
+ symReplacements.push_back(b.getAffineSymbolExpr(idx++) - 1);
+ else
+ symReplacements.push_back(b.getAffineConstantExpr(0));
+ continue;
+ }
+
+ Operation *defOp = operand.getDefiningOp();
+ if (!defOp) {
+ ++idx;
+ continue;
+ }
+
+ if (auto threadIdOp = mlir::dyn_cast<gpu::ThreadIdOp>(defOp)) {
+ gpu::Dimension dimension = threadIdOp.getDimension();
+ operands[i] = launchOp.getBlockSizeOnAxis(dimension);
+ if (!replaceWithZero)
+ symReplacements.push_back(b.getAffineSymbolExpr(idx++) - 1);
+ else
+ symReplacements.push_back(b.getAffineConstantExpr(0));
+ continue;
+ }
+ ++idx;
+ }
+}
+
+/// Take the min if all trip counts are constant.
+static std::optional<uint64_t>
+getConstantTripCountFromAffineMap(AffineMap map) {
+ std::optional<uint64_t> tripCount;
+ for (auto resultExpr : map.getResults()) {
+ auto constExpr = dyn_cast<AffineConstantExpr>(resultExpr);
+ if (!constExpr)
+ return std::nullopt;
+ if (tripCount.has_value())
+ tripCount =
+ std::min(*tripCount, static_cast<uint64_t>(constExpr.getValue()));
+ else
+ tripCount = constExpr.getValue();
+ }
+ return tripCount;
+}
+
/// Returns the trip count of the loop if it's a constant, std::nullopt
/// otherwise. This method uses affine expression analysis (in turn using
/// getTripCount) and is able to determine constant trip count in non-trivial
@@ -95,20 +157,34 @@ std::optional<uint64_t> mlir::affine::getConstantTripCount(AffineForOp forOp) {
if (!map)
return std::nullopt;
+ SmallVector<AffineExpr, 4> symReplacements;
+ replaceGPUOperands(forOp, operands, symReplacements, map.getNumDims());
+ map = map.replaceDimsAndSymbols({}, symReplacements, map.getNumDims(),
+ map.getNumSymbols());
+ affine::AffineValueMap valueMap(map, operands);
+ (void)valueMap.canonicalize();
+ map = valueMap.getAffineMap();
+ return getConstantTripCountFromAffineMap(map);
+}
- // Take the min if all trip counts are constant.
- std::optional<uint64_t> tripCount;
- for (auto resultExpr : map.getResults()) {
- if (auto constExpr = dyn_cast<AffineConstantExpr>(resultExpr)) {
- if (tripCount.has_value())
- tripCount =
- std::min(*tripCount, static_cast<uint64_t>(constExpr.getValue()));
- else
- tripCount = constExpr.getValue();
- } else
- return std::nullopt;
- }
- return tripCount;
+/// In some scenarios, such as GPU, the number of trip of each thread in the
+/// loop is inconsistent. This function returns the maximum number of trip.
+std::optional<uint64_t>
+mlir::affine::getMaxConstantTripCount(AffineForOp forOp) {
+ SmallVector<Value, 4> operands;
+ AffineMap map;
+ getTripCountMapAndOperands(forOp, &map, &operands);
+
+ if (!map)
+ return std::nullopt;
+ SmallVector<AffineExpr, 4> symReplacements;
+ replaceGPUOperands(forOp, operands, symReplacements, map.getNumDims(), true);
+ map = map.replaceDimsAndSymbols({}, symReplacements, map.getNumDims(),
+ map.getNumSymbols());
+ affine::AffineValueMap valueMap(map, operands);
+ (void)valueMap.canonicalize();
+ map = valueMap.getAffineMap();
+ return getConstantTripCountFromAffineMap(map);
}
/// Returns the greatest known integral divisor of the trip count. Affine
@@ -121,7 +197,13 @@ uint64_t mlir::affine::getLargestDivisorOfTripCount(AffineForOp forOp) {
if (!map)
return 1;
-
+ SmallVector<AffineExpr, 4> symReplacements;
+ replaceGPUOperands(forOp, operands, symReplacements, map.getNumDims());
+ map = map.replaceDimsAndSymbols({}, symReplacements, map.getNumDims(),
+ map.getNumSymbols());
+ affine::AffineValueMap valueMap(map, operands);
+ (void)valueMap.canonicalize();
+ map = valueMap.getAffineMap();
// The largest divisor of the trip count is the GCD of the individual largest
// divisors.
assert(map.getNumResults() >= 1 && "expected one or more results");
diff --git a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
index 4e02559a08949..69ceb0f80095b 100644
--- a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
+++ b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
@@ -17,6 +17,7 @@
#include "mlir/Dialect/Affine/IR/AffineValueMap.h"
#include "mlir/Dialect/Affine/Utils.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/IR/IRMapping.h"
@@ -113,11 +114,29 @@ static void replaceIterArgsAndYieldResults(AffineForOp forOp) {
std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
}
+/// Eliminate loops that will never actually execute
+LogicalResult mlir::affine::removeInvalidLoop(AffineForOp forOp) {
+ std::optional<uint64_t> tripCount = getConstantTripCount(forOp);
+ std::optional<uint64_t> maxTripCount = getMaxConstantTripCount(forOp);
+ if (!tripCount || *tripCount > 0 || !maxTripCount || *maxTripCount > 0)
+ return failure();
+
+ auto iterOperands = forOp.getInits();
+ auto results = forOp.getResults();
+ for (auto [result, operand] : llvm::zip(results, iterOperands))
+ result.replaceAllUsesWith(operand);
+
+ IRRewriter b(forOp);
+ b.eraseOp(forOp);
+ return success();
+}
+
/// Promotes the loop body of a forOp to its containing block if the forOp
/// was known to have a single iteration.
LogicalResult mlir::affine::promoteIfSingleIteration(AffineForOp forOp) {
std::optional<uint64_t> tripCount = getConstantTripCount(forOp);
- if (!tripCount || *tripCount != 1)
+ std::optional<uint64_t> maxTripCount = getMaxConstantTripCount(forOp);
+ if (!tripCount || *tripCount != 1 || !maxTripCount || *maxTripCount != 1)
return failure();
// TODO: extend this for arbitrary affine bounds.
@@ -160,7 +179,8 @@ LogicalResult mlir::affine::promoteIfSingleIteration(AffineForOp forOp) {
forOp.getBody()->back().erase();
parentBlock->getOperations().splice(Block::iterator(forOp),
forOp.getBody()->getOperations());
- forOp.erase();
+ IRRewriter b(forOp.getContext());
+ b.eraseOp(forOp);
return success();
}
@@ -884,15 +904,27 @@ void mlir::affine::getTileableBands(
/// Unrolls this loop completely.
LogicalResult mlir::affine::loopUnrollFull(AffineForOp forOp) {
std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
- if (mayBeConstantTripCount.has_value()) {
- uint64_t tripCount = *mayBeConstantTripCount;
- if (tripCount == 0)
- return success();
- if (tripCount == 1)
- return promoteIfSingleIteration(forOp);
- return loopUnrollByFactor(forOp, tripCount);
- }
- return failure();
+ std::optional<uint64_t> maxMayBeConstantTripCount =
+ getMaxConstantTripCount(forOp);
+
+ if (!mayBeConstantTripCount.has_value() &&
+ !maxMayBeConstantTripCount.has_value())
+ return failure();
+
+ uint64_t tripCount = *mayBeConstantTripCount;
+ uint64_t maxTripCount = *maxMayBeConstantTripCount;
+
+ // The values of Trip are all 0, and the invalid loop is deleted.
+ if (tripCount <= 0 && maxTripCount <= 0)
+ return removeInvalidLoop(forOp);
+
+ // In special cases, such as in a GPU, only some threads execute this loop.
+ if (tripCount == 0 && maxTripCount == 1)
+ return success();
+
+ if (tripCount == 1 && maxTripCount == 1)
+ return promoteIfSingleIteration(forOp);
+ return loopUnrollByFactor(forOp, tripCount);
}
/// Unrolls this loop by the specified factor or by the trip count (if constant)
@@ -1013,8 +1045,11 @@ LogicalResult mlir::affine::loopUnrollByFactor(
assert(unrollFactor > 0 && "unroll factor should be positive");
std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
+ std::optional<uint64_t> maxMayBeConstantTripCount =
+ getMaxConstantTripCount(forOp);
if (unrollFactor == 1) {
if (mayBeConstantTripCount && *mayBeConstantTripCount == 1 &&
+ maxMayBeConstantTripCount && *maxMayBeConstantTripCount == 1 &&
failed(promoteIfSingleIteration(forOp)))
return failure();
return success();
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index d06f10d3137a1..31051ed7e55a2 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -799,6 +799,26 @@ std::optional<KernelDim3> LaunchOp::getClusterSizeOperandValues() {
return KernelDim3{operands[6], operands[7], operands[8]};
}
+Value LaunchOp::getBlockSizeOnAxis(Dimension dimension) {
+ if (dimension == Dimension::x)
+ return getBlockSizeX();
+ else if (dimension == Dimension::y)
+ return getBlockSizeY();
+ else
+ return getBlockSizeZ();
+}
+
+Value LaunchOp::getBlockSizeOnAxis(Value threadId) {
+ KernelDim3 threadIds = getThreadIds();
+ if (threadIds.x == threadId)
+ return getBlockSizeX();
+ else if (threadIds.y == threadId)
+ return getBlockSizeY();
+ else if (threadIds.z == threadId)
+ return getBlockSizeZ();
+ return {};
+}
+
LogicalResult LaunchOp::verify() {
if (!(hasClusterSize()) &&
(getClusterSizeX() || getClusterSizeY() || getClusterSizeZ()))
diff --git a/mlir/test/Dialect/Affine/unroll.mlir b/mlir/test/Dialect/Affine/unroll.mlir
index 574e9f41494af..a2bb0b2cac4e3 100644
--- a/mlir/test/Dialect/Affine/unroll.mlir
+++ b/mlir/test/Dialect/Affine/unroll.mlir
@@ -23,6 +23,7 @@
// UNROLL-BY-4-DAG: [[$MAP5:#map[0-9]*]] = affine_map<(d0)[s0] -> (d0 + s0 + 1)>
// UNROLL-BY-4-DAG: [[$MAP6:#map[0-9]*]] = affine_map<(d0, d1) -> (d0 * 16 + d1)>
// UNROLL-BY-4-DAG: [[$MAP11:#map[0-9]*]] = affine_map<(d0) -> (d0)>
+// UNROLL-BY-4-DAG: [[$MAP7:#map[0-9]*]] = affine_map<()[s0] -> (s0 + (((-s0 + 11) ceildiv 2) floordiv 4) * 8)>
// UNROLL-FULL-LABEL: func @loop_nest_simplest() {
func.func @loop_nest_simplest() {
@@ -258,6 +259,89 @@ gpu.module @unroll_full {
}
}
+// UNROLL-FULL-LABEL: func @thread_partial_execution
+func.func @thread_partial_execution() {
+ %0 = arith.constant 0 :index
+ %1 = arith.constant 2 : index
+ // UNROLL-FULL: %[[C0:.*]] = arith.constant 0 : index
+ gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %1, %sz_by = %1, %sz_bz = %1)
+ threads(%tx, %ty, %tz) in (%sz_tx = %1, %sz_ty = %1, %sz_tz = %1) {
+ affine.for %iv = %tx to 3 step 2 iter_args(%arg = %0) -> index {
+ %3 = arith.addi %arg, %0 : index
+ affine.yield %3 : index
+ }
+ // UNROLL-FULL: %{{.*}} = affine.for %{{.*}} = %{{.*}} to 3 step 2 iter_args(%[[ARG:.*]] = %[[C0]]) -> (index) {
+ // UNROLL-FULL: %[[SUM:.*]] = arith.addi %[[ARG]], %[[C0]] : index
+ // UNROLL-FULL: affine.yield %[[SUM]] : index
+ // UNROLL-FULL: }
+ gpu.terminator
+ }
+ return
+}
+
+// UNROLL-FULL-LABEL: func @invalid_loop
+func.func @invalid_loop() {
+ %0 = arith.constant 0 :index
+ %1 = arith.constant 2 : index
+ gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %1, %sz_by = %1, %sz_bz = %1)
+ threads(%tx, %ty, %tz) in (%sz_tx = %1, %sz_ty = %1, %sz_tz = %1) {
+ %threadid = gpu.thread_id x
+ affine.for %iv = %tx to 0 step 2 iter_args(%arg = %0) -> index {
+ %3 = arith.addi %arg, %0 : index
+ affine.yield %3 : index
+ }
+ gpu.terminator
+ // UNROLL-FULL-CHECK: %{{.*}} = gpu.thread_id x
+ // UNROLL-FULL-CHECK: gpu.terminator
+ }
+ return
+}
+
+// UNROLL-FULL-LABEL: func @unroll_all_thread
+func.func @unroll_all_thread() {
+ %0 = arith.constant 0 :index
+ %1 = arith.constant 2 : index
+ // UNROLL-FULL-CHECK: %[[C0:.*]] = arith.constant 0 : index
+ gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %1, %sz_by = %1, %sz_bz = %1)
+ threads(%tx, %ty, %tz) in (%sz_tx = %1, %sz_ty = %1, %sz_tz = %1) {
+ %threadid = gpu.thread_id x
+ %4 = affine.for %iv = %threadid to 6 step 2 iter_args(%arg = %0) -> index {
+ %3 = arith.addi %arg, %0 : index
+ affine.yield %3 : index
+ }
+ // UNROLL-FULL-CHECK: %[[SUM_0:.*]] = arith.addi %[[C0]], %[[C0]] : index
+ // UNROLL-FULL-CHECK: %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index
+ // UNROLL-FULL-CHECK: %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index
+ gpu.terminator
+ }
+ return
+}
+
+// UNROLL-FULL-LABEL: func.func @partial_unroll_factor_4
+func.func @partial_unroll_factor_4() {
+ %0 = arith.constant 0 :index
+ %1 = arith.constant 2 : index
+ // UNROLL-FULL: %[[C0:.*]] = arith.constant 0 : index
+ gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %1, %sz_by = %1, %sz_bz = %1)
+ threads(%tx, %ty, %tz) in (%sz_tx = %1, %sz_ty = %1, %sz_tz = %1) {
+ %threadid = gpu.thread_id x
+ affine.for %iv = %threadid to 9 step 2 iter_args(%arg = %0) -> index {
+ %3 = arith.addi %arg, %0 : index
+ affine.yield %3 : index
+ }
+ gpu.terminator
+ }
+ // UNROLL-FULL: %[[ID:.*]] = gpu.thread_id x
+ // UNROLL-FULL: affine.for %{{.*}} = %[[ID]] to 9 step 8 iter_args(%[[ARG:.*]] = %[[C0]]) -> (index) {
+ // UNROLL-FULL: %[[SUM_0:.*]] = arith.addi %[[ARG]], %[[C0]] : index
+ // UNROLL-FULL: %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index
+ // UNROLL-FULL: %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index
+ // UNROLL-FULL: %[[SUM_3:.*]] = arith.addi %[[SUM_2]], %[[C0]] : index
+ // UNROLL-FULL: affine.yield %[[SUM_3]] : index
+ // UNROLL-FULL: }
+ return
+}
+
// SHORT-LABEL: func @loop_nest_outer_unroll() {
func.func @loop_nest_outer_unroll() {
// SHORT: affine.for %arg0 = 0 to 4 {
@@ -701,6 +785,32 @@ func.func @unroll_with_iter_args_and_promotion(%arg0 : f32, %arg1 : f32) -> f32
return %sum : f32
}
+// UNROLL-BY-4-LABEL: func @gpu_launch_unroll_by_factor_4
+func.func @gpu_launch_unroll_by_factor_4() {
+ %0 = arith.constant 0 :index
+ %1 = arith.constant 2 : index
+ // UNROLL-BY-4: %[[C0:.*]] = arith.constant 0 : index
+ gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %1, %sz_by = %1, %sz_bz = %1)
+ threads(%tx, %ty, %tz) in (%sz_tx = %1, %sz_ty = %1, %sz_tz = %1) {
+ %threadid = gpu.thread_id x
+ affine.for %iv = %threadid to 11 step 2 iter_args(%arg = %0) -> index {
+ %3 = arith.addi %arg, %0 : index
+ affine.yield %3 : index
+ }
+ gpu.terminator
+ }
+ // UNROLL-BY-4: %[[ID:.*]] = gpu.thread_id x
+ // UNROLL-BY-4: %[[SUM_0:.*]] = arith.addi %[[C0]], %[[C0]] : index
+ // UNROLL-BY-4: %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index
+ // UNROLL-BY-4: %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index
+ // UNROLL-BY-4: %[[SUM_3:.*]] = arith.addi %[[SUM_2]], %[[C0]] : index
+ // UNROLL-BY-4: affine.for %[[VAL_20:.*]] = [[$MAP7]](){{\[}}%[[ID]]] to 11 step 2 iter_args(%[[ARG:.*]] = %[[SUM_3]]) -> (index) {
+ // UNROLL-BY-4: %[[SUM_4:.*]] = arith.addi %[[ARG]], %[[C0]] : index
+ // UNROLL-BY-4: affine.yield %[[SUM_4]] : index
+ // UNROLL-BY-4: }
+ return
+}
+
// UNROLL-FULL: func @unroll_zero_trip_count_case
func.func @unroll_zero_trip_count_case() {
// CHECK-NEXT: affine.for %{{.*}} = 0 to 0
>From c834f4d70494daa5abac945447b6d307d3900bac Mon Sep 17 00:00:00 2001
From: linuxlonelyeagle <2020382038 at qq.com>
Date: Sat, 22 Feb 2025 17:53:19 +0800
Subject: [PATCH 2/5] delete the feature of remove invalid loops.
---
.../Dialect/Affine/Analysis/LoopAnalysis.h | 4 +-
mlir/include/mlir/Dialect/Affine/LoopUtils.h | 3 -
mlir/include/mlir/Dialect/GPU/IR/GPUOps.td | 2 +-
mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp | 25 +------
mlir/test/Dialect/Affine/unroll.mlir | 68 +++++++------------
5 files changed, 30 insertions(+), 72 deletions(-)
diff --git a/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h b/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h
index 2bd540b9af2eb..591533d17c960 100644
--- a/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h
+++ b/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h
@@ -43,8 +43,8 @@ void getTripCountMapAndOperands(AffineForOp forOp, AffineMap *map,
/// constant trip count in non-trivial cases.
std::optional<uint64_t> getConstantTripCount(AffineForOp forOp);
-/// In the GPU, the number of trip of each thread in the loop is inconsistent.
-/// This function returns the maximum number of trip.
+/// In some scenarios, such as GPU, the number of trip of each thread in the
+/// loop is inconsistent. This function returns the maximum number of trip.
std::optional<uint64_t> getMaxConstantTripCount(AffineForOp forOp);
/// Returns the greatest known integral divisor of the trip count. Affine
diff --git a/mlir/include/mlir/Dialect/Affine/LoopUtils.h b/mlir/include/mlir/Dialect/Affine/LoopUtils.h
index 1d1d6d94d2382..7fe1f6d48ceeb 100644
--- a/mlir/include/mlir/Dialect/Affine/LoopUtils.h
+++ b/mlir/include/mlir/Dialect/Affine/LoopUtils.h
@@ -86,9 +86,6 @@ LogicalResult loopUnrollJamUpToFactor(AffineForOp forOp,
/// was known to have a single iteration.
LogicalResult promoteIfSingleIteration(AffineForOp forOp);
-/// Eliminate loops that will never actually execute.
-LogicalResult removeInvalidLoop(AffineForOp forOp);
-
/// Promotes all single iteration AffineForOp's in the Function, i.e., moves
/// their body into the containing Block.
void promoteSingleIterationLoops(func::FuncOp f);
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index 940d47c5ef2c8..fde1ad482ae2d 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -1039,7 +1039,7 @@ def GPU_LaunchOp : GPU_Op<"launch", [
/// Find BlockSize via the BlockArgument of gpu.launch.
Value getBlockSizeOnAxis(Value threadId);
- /// Find BlockSize via the Dimension Information.
+ /// Find BlockSize via the Dimension Information.
Value getBlockSizeOnAxis(Dimension dimension);
}];
diff --git a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
index 69ceb0f80095b..b6471ac179b22 100644
--- a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
+++ b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
@@ -114,23 +114,6 @@ static void replaceIterArgsAndYieldResults(AffineForOp forOp) {
std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
}
-/// Eliminate loops that will never actually execute
-LogicalResult mlir::affine::removeInvalidLoop(AffineForOp forOp) {
- std::optional<uint64_t> tripCount = getConstantTripCount(forOp);
- std::optional<uint64_t> maxTripCount = getMaxConstantTripCount(forOp);
- if (!tripCount || *tripCount > 0 || !maxTripCount || *maxTripCount > 0)
- return failure();
-
- auto iterOperands = forOp.getInits();
- auto results = forOp.getResults();
- for (auto [result, operand] : llvm::zip(results, iterOperands))
- result.replaceAllUsesWith(operand);
-
- IRRewriter b(forOp);
- b.eraseOp(forOp);
- return success();
-}
-
/// Promotes the loop body of a forOp to its containing block if the forOp
/// was known to have a single iteration.
LogicalResult mlir::affine::promoteIfSingleIteration(AffineForOp forOp) {
@@ -914,12 +897,8 @@ LogicalResult mlir::affine::loopUnrollFull(AffineForOp forOp) {
uint64_t tripCount = *mayBeConstantTripCount;
uint64_t maxTripCount = *maxMayBeConstantTripCount;
- // The values of Trip are all 0, and the invalid loop is deleted.
- if (tripCount <= 0 && maxTripCount <= 0)
- return removeInvalidLoop(forOp);
-
- // In special cases, such as in a GPU, only some threads execute this loop.
- if (tripCount == 0 && maxTripCount == 1)
+ // Trip equals 0, this loop cannot unroll.
+ if (tripCount <= 0)
return success();
if (tripCount == 1 && maxTripCount == 1)
diff --git a/mlir/test/Dialect/Affine/unroll.mlir b/mlir/test/Dialect/Affine/unroll.mlir
index a2bb0b2cac4e3..ab73c5ac7e9c4 100644
--- a/mlir/test/Dialect/Affine/unroll.mlir
+++ b/mlir/test/Dialect/Affine/unroll.mlir
@@ -270,38 +270,20 @@ func.func @thread_partial_execution() {
%3 = arith.addi %arg, %0 : index
affine.yield %3 : index
}
- // UNROLL-FULL: %{{.*}} = affine.for %{{.*}} = %{{.*}} to 3 step 2 iter_args(%[[ARG:.*]] = %[[C0]]) -> (index) {
- // UNROLL-FULL: %[[SUM:.*]] = arith.addi %[[ARG]], %[[C0]] : index
- // UNROLL-FULL: affine.yield %[[SUM]] : index
- // UNROLL-FULL: }
+ // UNROLL-FULL: affine.for %{{.*}} = %{{.*}} to 3 step 2 iter_args(%[[ARG:.*]] = %[[C0]]) -> (index) {
+ // UNROLL-FULL-NEXT: %[[SUM:.*]] = arith.addi %[[ARG]], %[[C0]] : index
+ // UNROLL-FULL-NEXT: affine.yield %[[SUM]] : index
+ // UNROLL-FULL-NEXT: }
gpu.terminator
}
return
}
-// UNROLL-FULL-LABEL: func @invalid_loop
-func.func @invalid_loop() {
- %0 = arith.constant 0 :index
- %1 = arith.constant 2 : index
- gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %1, %sz_by = %1, %sz_bz = %1)
- threads(%tx, %ty, %tz) in (%sz_tx = %1, %sz_ty = %1, %sz_tz = %1) {
- %threadid = gpu.thread_id x
- affine.for %iv = %tx to 0 step 2 iter_args(%arg = %0) -> index {
- %3 = arith.addi %arg, %0 : index
- affine.yield %3 : index
- }
- gpu.terminator
- // UNROLL-FULL-CHECK: %{{.*}} = gpu.thread_id x
- // UNROLL-FULL-CHECK: gpu.terminator
- }
- return
-}
-
// UNROLL-FULL-LABEL: func @unroll_all_thread
func.func @unroll_all_thread() {
%0 = arith.constant 0 :index
%1 = arith.constant 2 : index
- // UNROLL-FULL-CHECK: %[[C0:.*]] = arith.constant 0 : index
+ // UNROLL-FULL: %[[C0:.*]] = arith.constant 0 : index
gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %1, %sz_by = %1, %sz_bz = %1)
threads(%tx, %ty, %tz) in (%sz_tx = %1, %sz_ty = %1, %sz_tz = %1) {
%threadid = gpu.thread_id x
@@ -309,19 +291,19 @@ func.func @unroll_all_thread() {
%3 = arith.addi %arg, %0 : index
affine.yield %3 : index
}
- // UNROLL-FULL-CHECK: %[[SUM_0:.*]] = arith.addi %[[C0]], %[[C0]] : index
- // UNROLL-FULL-CHECK: %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index
- // UNROLL-FULL-CHECK: %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index
+ // UNROLL-FULL: %[[SUM_0:.*]] = arith.addi %[[C0]], %[[C0]] : index
+ // UNROLL-FULL-NEXT: %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index
+ // UNROLL-FULL-NEXT: %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index
gpu.terminator
}
return
}
-// UNROLL-FULL-LABEL: func.func @partial_unroll_factor_4
+// UNROLL-FULL-LABEL: func.func @partial_unroll_factor_4
func.func @partial_unroll_factor_4() {
%0 = arith.constant 0 :index
%1 = arith.constant 2 : index
- // UNROLL-FULL: %[[C0:.*]] = arith.constant 0 : index
+ // UNROLL-FULL: %[[C0:.*]] = arith.constant 0 : index
gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %1, %sz_by = %1, %sz_bz = %1)
threads(%tx, %ty, %tz) in (%sz_tx = %1, %sz_ty = %1, %sz_tz = %1) {
%threadid = gpu.thread_id x
@@ -332,13 +314,13 @@ func.func @partial_unroll_factor_4() {
gpu.terminator
}
// UNROLL-FULL: %[[ID:.*]] = gpu.thread_id x
- // UNROLL-FULL: affine.for %{{.*}} = %[[ID]] to 9 step 8 iter_args(%[[ARG:.*]] = %[[C0]]) -> (index) {
- // UNROLL-FULL: %[[SUM_0:.*]] = arith.addi %[[ARG]], %[[C0]] : index
- // UNROLL-FULL: %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index
- // UNROLL-FULL: %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index
- // UNROLL-FULL: %[[SUM_3:.*]] = arith.addi %[[SUM_2]], %[[C0]] : index
- // UNROLL-FULL: affine.yield %[[SUM_3]] : index
- // UNROLL-FULL: }
+ // UNROLL-FULL-NEXT: affine.for %{{.*}} = %[[ID]] to 9 step 8 iter_args(%[[ARG:.*]] = %[[C0]]) -> (index) {
+ // UNROLL-FULL-NEXT: %[[SUM_0:.*]] = arith.addi %[[ARG]], %[[C0]] : index
+ // UNROLL-FULL-NEXT: %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index
+ // UNROLL-FULL-NEXT: %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index
+ // UNROLL-FULL-NEXT: %[[SUM_3:.*]] = arith.addi %[[SUM_2]], %[[C0]] : index
+ // UNROLL-FULL-NEXT: affine.yield %[[SUM_3]] : index
+ // UNROLL-FULL-NEXT: }
return
}
@@ -800,14 +782,14 @@ func.func @gpu_launch_unroll_by_factor_4() {
gpu.terminator
}
// UNROLL-BY-4: %[[ID:.*]] = gpu.thread_id x
- // UNROLL-BY-4: %[[SUM_0:.*]] = arith.addi %[[C0]], %[[C0]] : index
- // UNROLL-BY-4: %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index
- // UNROLL-BY-4: %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index
- // UNROLL-BY-4: %[[SUM_3:.*]] = arith.addi %[[SUM_2]], %[[C0]] : index
- // UNROLL-BY-4: affine.for %[[VAL_20:.*]] = [[$MAP7]](){{\[}}%[[ID]]] to 11 step 2 iter_args(%[[ARG:.*]] = %[[SUM_3]]) -> (index) {
- // UNROLL-BY-4: %[[SUM_4:.*]] = arith.addi %[[ARG]], %[[C0]] : index
- // UNROLL-BY-4: affine.yield %[[SUM_4]] : index
- // UNROLL-BY-4: }
+ // UNROLL-BY-4-NEXT: %[[SUM_0:.*]] = arith.addi %[[C0]], %[[C0]] : index
+ // UNROLL-BY-4-NEXT: %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index
+ // UNROLL-BY-4-NEXT: %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index
+ // UNROLL-BY-4-NEXT: %[[SUM_3:.*]] = arith.addi %[[SUM_2]], %[[C0]] : index
+ // UNROLL-BY-4-NEXT: affine.for %[[VAL_20:.*]] = [[$MAP7]](){{\[}}%[[ID]]] to 11 step 2 iter_args(%[[ARG:.*]] = %[[SUM_3]]) -> (index) {
+ // UNROLL-BY-4-NEXT: %[[SUM_4:.*]] = arith.addi %[[ARG]], %[[C0]] : index
+ // UNROLL-BY-4-NEXT: affine.yield %[[SUM_4]] : index
+ // UNROLL-BY-4-NEXT: }
return
}
>From e865351424ee36285133ee14ceccd924ea21dda3 Mon Sep 17 00:00:00 2001
From: linuxlonelyeagle <2020382038 at qq.com>
Date: Wed, 26 Feb 2025 10:55:07 +0800
Subject: [PATCH 3/5] use IntegerRangeAnalysis and update
launchOp::inferResultRanges.
---
.../Dialect/Affine/Analysis/LoopAnalysis.h | 7 +-
mlir/include/mlir/Dialect/GPU/IR/GPUOps.td | 6 --
.../Dialect/Affine/Analysis/LoopAnalysis.cpp | 90 ++++++++++---------
mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp | 7 +-
mlir/lib/Dialect/GPU/IR/GPUDialect.cpp | 20 -----
.../GPU/IR/InferIntRangeInterfaceImpls.cpp | 34 ++++---
6 files changed, 74 insertions(+), 90 deletions(-)
diff --git a/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h b/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h
index 591533d17c960..f5b6794d42794 100644
--- a/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h
+++ b/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h
@@ -43,9 +43,10 @@ void getTripCountMapAndOperands(AffineForOp forOp, AffineMap *map,
/// constant trip count in non-trivial cases.
std::optional<uint64_t> getConstantTripCount(AffineForOp forOp);
-/// In some scenarios, such as GPU, the number of trip of each thread in the
-/// loop is inconsistent. This function returns the maximum number of trip.
-std::optional<uint64_t> getMaxConstantTripCount(AffineForOp forOp);
+/// Returns the maximum trip count when the operand of forOp has a range. If the
+/// operand of forOp is a constant, the return value is the same as
+/// `getConstantTripCount`.
+std::optional<uint64_t> getUpperBoundOnTripCount(AffineForOp forOp);
/// Returns the greatest known integral divisor of the trip count. Affine
/// expression analysis is used (indirectly through getTripCount), and
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index fde1ad482ae2d..2b1ce573effd0 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -1035,12 +1035,6 @@ def GPU_LaunchOp : GPU_Op<"launch", [
static StringRef getNumWorkgroupAttributionsAttrName() {
return "workgroup_attributions";
}
-
- /// Find BlockSize via the BlockArgument of gpu.launch.
- Value getBlockSizeOnAxis(Value threadId);
-
- /// Find BlockSize via the Dimension Information.
- Value getBlockSizeOnAxis(Dimension dimension);
}];
let hasCanonicalizer = 1;
diff --git a/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp b/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp
index 15a5376fa922e..5ed11d8bde029 100644
--- a/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp
+++ b/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp
@@ -12,13 +12,15 @@
#include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h"
+#include "mlir/Analysis/DataFlow/DeadCodeAnalysis.h"
+#include "mlir/Analysis/DataFlow/IntegerRangeAnalysis.h"
#include "mlir/Analysis/SliceAnalysis.h"
#include "mlir/Dialect/Affine/Analysis/AffineAnalysis.h"
#include "mlir/Dialect/Affine/Analysis/AffineStructures.h"
#include "mlir/Dialect/Affine/Analysis/NestedMatcher.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/IR/AffineValueMap.h"
-#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Interfaces/FunctionInterfaces.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/ADT/DenseSet.h"
@@ -31,6 +33,7 @@
using namespace mlir;
using namespace mlir::affine;
+using namespace mlir::dataflow;
#define DEBUG_TYPE "affine-loop-analysis"
@@ -85,48 +88,54 @@ void mlir::affine::getTripCountMapAndOperands(
tripCountValueMap.getOperands().end());
}
-/// Replace thread_id with its maximum value, if `replaceWithZero` is true,
-/// thread_id will be replaced by its minimum value 0.
-static void replaceGPUOperands(AffineForOp forOp,
- SmallVectorImpl<Value> &operands,
- SmallVectorImpl<AffineExpr> &symReplacements,
- unsigned numDim, bool replaceWithZero = false) {
- auto launchOp = forOp->getParentOfType<gpu::LaunchOp>();
- if (!launchOp)
+/// By running `IntegerRangeAnalysis` to get the ranges of operand, then fill
+/// the `symReplacements` with range. If `replaceByMin` is set to true,
+/// construct `replacement` using the smallest value.By default, the largest
+/// value will be used for constructing `replacement`.
+static void replaceOperandByRange(AffineForOp forOp,
+ SmallVectorImpl<Value> &operands,
+ SmallVectorImpl<AffineExpr> &symReplacements,
+ unsigned numDim, bool replaceByMin = false) {
+ DataFlowSolver solver;
+ solver.load<DeadCodeAnalysis>();
+ solver.load<IntegerRangeAnalysis>();
+ if (failed(solver.initializeAndRun(
+ forOp->getParentOfType<FunctionOpInterface>())))
return;
- // `b` is only used to create `AffineExpr`.
+ // `b` is used to create affineExpr
Builder b(forOp.getContext());
- unsigned idx = 0;
-
for (unsigned i = numDim, e = operands.size(); i < e; ++i) {
Value operand = operands[i];
- if (Value blockSize = launchOp.getBlockSizeOnAxis(operand)) {
- operands[i] = blockSize;
- if (!replaceWithZero)
- symReplacements.push_back(b.getAffineSymbolExpr(idx++) - 1);
- else
- symReplacements.push_back(b.getAffineConstantExpr(0));
+ auto lattice =
+ solver.lookupState<dataflow::IntegerValueRangeLattice>(operand);
+ if (!lattice) {
+ symReplacements.push_back(b.getAffineSymbolExpr(i - numDim));
continue;
}
- Operation *defOp = operand.getDefiningOp();
- if (!defOp) {
- ++idx;
+ if (lattice->getValue().isUninitialized()) {
+ symReplacements.push_back(b.getAffineSymbolExpr(i - numDim));
continue;
}
- if (auto threadIdOp = mlir::dyn_cast<gpu::ThreadIdOp>(defOp)) {
- gpu::Dimension dimension = threadIdOp.getDimension();
- operands[i] = launchOp.getBlockSizeOnAxis(dimension);
- if (!replaceWithZero)
- symReplacements.push_back(b.getAffineSymbolExpr(idx++) - 1);
- else
- symReplacements.push_back(b.getAffineConstantExpr(0));
+ ConstantIntRanges range = lattice->getValue().getValue();
+ APInt max = range.smax();
+ APInt min = range.smin();
+ unsigned bitNums = max.getBitWidth();
+
+ if (APInt::getSignedMaxValue(bitNums) == max &&
+ APInt::getSignedMinValue(bitNums) == min) {
+ symReplacements.push_back(b.getAffineSymbolExpr(i - numDim));
continue;
}
- ++idx;
+
+ if (!replaceByMin)
+ symReplacements.push_back(b.getAffineConstantExpr(max.getZExtValue()));
+ else
+ symReplacements.push_back(b.getAffineConstantExpr(min.getZExtValue()));
}
+ return;
}
/// Take the min if all trip counts are constant.
@@ -158,19 +167,17 @@ std::optional<uint64_t> mlir::affine::getConstantTripCount(AffineForOp forOp) {
if (!map)
return std::nullopt;
SmallVector<AffineExpr, 4> symReplacements;
- replaceGPUOperands(forOp, operands, symReplacements, map.getNumDims());
+ replaceOperandByRange(forOp, operands, symReplacements, map.getNumDims());
map = map.replaceDimsAndSymbols({}, symReplacements, map.getNumDims(),
map.getNumSymbols());
- affine::AffineValueMap valueMap(map, operands);
- (void)valueMap.canonicalize();
- map = valueMap.getAffineMap();
return getConstantTripCountFromAffineMap(map);
}
-/// In some scenarios, such as GPU, the number of trip of each thread in the
-/// loop is inconsistent. This function returns the maximum number of trip.
+/// Returns the maximum trip count when the operand of forOp has a range. If the
+/// operand of forOp is a constant, the return value is the same as
+/// `getConstantTripCount`.
std::optional<uint64_t>
-mlir::affine::getMaxConstantTripCount(AffineForOp forOp) {
+mlir::affine::getUpperBoundOnTripCount(AffineForOp forOp) {
SmallVector<Value, 4> operands;
AffineMap map;
getTripCountMapAndOperands(forOp, &map, &operands);
@@ -178,12 +185,10 @@ mlir::affine::getMaxConstantTripCount(AffineForOp forOp) {
if (!map)
return std::nullopt;
SmallVector<AffineExpr, 4> symReplacements;
- replaceGPUOperands(forOp, operands, symReplacements, map.getNumDims(), true);
+ replaceOperandByRange(forOp, operands, symReplacements, map.getNumDims(),
+ true);
map = map.replaceDimsAndSymbols({}, symReplacements, map.getNumDims(),
map.getNumSymbols());
- affine::AffineValueMap valueMap(map, operands);
- (void)valueMap.canonicalize();
- map = valueMap.getAffineMap();
return getConstantTripCountFromAffineMap(map);
}
@@ -198,12 +203,9 @@ uint64_t mlir::affine::getLargestDivisorOfTripCount(AffineForOp forOp) {
if (!map)
return 1;
SmallVector<AffineExpr, 4> symReplacements;
- replaceGPUOperands(forOp, operands, symReplacements, map.getNumDims());
+ replaceOperandByRange(forOp, operands, symReplacements, map.getNumDims());
map = map.replaceDimsAndSymbols({}, symReplacements, map.getNumDims(),
map.getNumSymbols());
- affine::AffineValueMap valueMap(map, operands);
- (void)valueMap.canonicalize();
- map = valueMap.getAffineMap();
// The largest divisor of the trip count is the GCD of the individual largest
// divisors.
assert(map.getNumResults() >= 1 && "expected one or more results");
diff --git a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
index b6471ac179b22..a344bc8f9bffe 100644
--- a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
+++ b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
@@ -17,7 +17,6 @@
#include "mlir/Dialect/Affine/IR/AffineValueMap.h"
#include "mlir/Dialect/Affine/Utils.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
-#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/IR/IRMapping.h"
@@ -118,7 +117,7 @@ static void replaceIterArgsAndYieldResults(AffineForOp forOp) {
/// was known to have a single iteration.
LogicalResult mlir::affine::promoteIfSingleIteration(AffineForOp forOp) {
std::optional<uint64_t> tripCount = getConstantTripCount(forOp);
- std::optional<uint64_t> maxTripCount = getMaxConstantTripCount(forOp);
+ std::optional<uint64_t> maxTripCount = getUpperBoundOnTripCount(forOp);
if (!tripCount || *tripCount != 1 || !maxTripCount || *maxTripCount != 1)
return failure();
@@ -888,7 +887,7 @@ void mlir::affine::getTileableBands(
LogicalResult mlir::affine::loopUnrollFull(AffineForOp forOp) {
std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
std::optional<uint64_t> maxMayBeConstantTripCount =
- getMaxConstantTripCount(forOp);
+ getUpperBoundOnTripCount(forOp);
if (!mayBeConstantTripCount.has_value() &&
!maxMayBeConstantTripCount.has_value())
@@ -1025,7 +1024,7 @@ LogicalResult mlir::affine::loopUnrollByFactor(
std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
std::optional<uint64_t> maxMayBeConstantTripCount =
- getMaxConstantTripCount(forOp);
+ getUpperBoundOnTripCount(forOp);
if (unrollFactor == 1) {
if (mayBeConstantTripCount && *mayBeConstantTripCount == 1 &&
maxMayBeConstantTripCount && *maxMayBeConstantTripCount == 1 &&
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index 31051ed7e55a2..d06f10d3137a1 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -799,26 +799,6 @@ std::optional<KernelDim3> LaunchOp::getClusterSizeOperandValues() {
return KernelDim3{operands[6], operands[7], operands[8]};
}
-Value LaunchOp::getBlockSizeOnAxis(Dimension dimension) {
- if (dimension == Dimension::x)
- return getBlockSizeX();
- else if (dimension == Dimension::y)
- return getBlockSizeY();
- else
- return getBlockSizeZ();
-}
-
-Value LaunchOp::getBlockSizeOnAxis(Value threadId) {
- KernelDim3 threadIds = getThreadIds();
- if (threadIds.x == threadId)
- return getBlockSizeX();
- else if (threadIds.y == threadId)
- return getBlockSizeY();
- else if (threadIds.z == threadId)
- return getBlockSizeZ();
- return {};
-}
-
LogicalResult LaunchOp::verify() {
if (!(hasClusterSize()) &&
(getClusterSizeX() || getClusterSizeY() || getClusterSizeZ()))
diff --git a/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp b/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp
index f5e30a278f06b..f62d01d719633 100644
--- a/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp
+++ b/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp
@@ -250,26 +250,34 @@ void SubgroupSizeOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
void LaunchOp::inferResultRanges(ArrayRef<ConstantIntRanges> argRanges,
SetIntRangeFn setResultRange) {
auto setRange = [&](const ConstantIntRanges &argRange, Value dimResult,
- Value idxResult) {
+ Value idxResult, Value size) {
if (argRange.umin().getBitWidth() != IndexType::kInternalStorageBitWidth)
return;
- ConstantIntRanges dimRange =
- argRange.intersection(getIndexRange(1, kMaxDim));
- setResultRange(dimResult, dimRange);
- ConstantIntRanges idxRange =
- getIndexRange(0, dimRange.umax().getZExtValue() - 1);
- setResultRange(idxResult, idxRange);
+ APInt sizeInt;
+ if (matchPattern(size, m_ConstantInt(&sizeInt))) {
+ ConstantIntRanges dimRange = ConstantIntRanges::constant(sizeInt);
+ setResultRange(dimResult, dimRange);
+ ConstantIntRanges idxRange = getIndexRange(0, sizeInt.getZExtValue() - 1);
+ setResultRange(idxResult, idxRange);
+ } else {
+ ConstantIntRanges dimRange =
+ argRange.intersection(getIndexRange(1, kMaxDim));
+ setResultRange(dimResult, dimRange);
+ ConstantIntRanges idxRange =
+ getIndexRange(0, dimRange.umax().getZExtValue() - 1);
+ setResultRange(idxResult, idxRange);
+ }
};
argRanges = argRanges.drop_front(getAsyncDependencies().size());
KernelDim3 gridDims = getGridSize();
KernelDim3 blockIds = getBlockIds();
- setRange(argRanges[0], gridDims.x, blockIds.x);
- setRange(argRanges[1], gridDims.y, blockIds.y);
- setRange(argRanges[2], gridDims.z, blockIds.z);
+ setRange(argRanges[0], gridDims.x, blockIds.x, getGridSizeX());
+ setRange(argRanges[1], gridDims.y, blockIds.y, getGridSizeY());
+ setRange(argRanges[2], gridDims.z, blockIds.z, getGridSizeZ());
KernelDim3 blockDims = getBlockSize();
KernelDim3 threadIds = getThreadIds();
- setRange(argRanges[3], blockDims.x, threadIds.x);
- setRange(argRanges[4], blockDims.y, threadIds.y);
- setRange(argRanges[5], blockDims.z, threadIds.z);
+ setRange(argRanges[3], blockDims.x, threadIds.x, getBlockSizeX());
+ setRange(argRanges[4], blockDims.y, threadIds.y, getBlockSizeY());
+ setRange(argRanges[5], blockDims.z, threadIds.z, getBlockSizeZ());
}
>From 0b30c4e9d9747dd1040280a471df17389eed00cb Mon Sep 17 00:00:00 2001
From: linuxlonelyeagle <2020382038 at qq.com>
Date: Fri, 28 Feb 2025 20:45:38 +0800
Subject: [PATCH 4/5] use ValueBoundsOpInterface.
---
.../Dialect/Affine/Analysis/LoopAnalysis.cpp | 117 ++++++------------
.../GPU/IR/InferIntRangeInterfaceImpls.cpp | 34 ++---
.../lib/Interfaces/ValueBoundsOpInterface.cpp | 3 +-
3 files changed, 54 insertions(+), 100 deletions(-)
diff --git a/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp b/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp
index 5ed11d8bde029..bcb31db6b1a93 100644
--- a/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp
+++ b/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp
@@ -12,8 +12,6 @@
#include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h"
-#include "mlir/Analysis/DataFlow/DeadCodeAnalysis.h"
-#include "mlir/Analysis/DataFlow/IntegerRangeAnalysis.h"
#include "mlir/Analysis/SliceAnalysis.h"
#include "mlir/Dialect/Affine/Analysis/AffineAnalysis.h"
#include "mlir/Dialect/Affine/Analysis/AffineStructures.h"
@@ -21,6 +19,7 @@
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/IR/AffineValueMap.h"
#include "mlir/Interfaces/FunctionInterfaces.h"
+#include "mlir/Interfaces/ValueBoundsOpInterface.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/ADT/DenseSet.h"
@@ -33,7 +32,6 @@
using namespace mlir;
using namespace mlir::affine;
-using namespace mlir::dataflow;
#define DEBUG_TYPE "affine-loop-analysis"
@@ -88,69 +86,37 @@ void mlir::affine::getTripCountMapAndOperands(
tripCountValueMap.getOperands().end());
}
-/// By running `IntegerRangeAnalysis` to get the ranges of operand, then fill
-/// the `symReplacements` with range. If `replaceByMin` is set to true,
-/// construct `replacement` using the smallest value.By default, the largest
-/// value will be used for constructing `replacement`.
-static void replaceOperandByRange(AffineForOp forOp,
- SmallVectorImpl<Value> &operands,
- SmallVectorImpl<AffineExpr> &symReplacements,
- unsigned numDim, bool replaceByMin = false) {
- DataFlowSolver solver;
- solver.load<DeadCodeAnalysis>();
- solver.load<IntegerRangeAnalysis>();
- if (failed(solver.initializeAndRun(
- forOp->getParentOfType<FunctionOpInterface>())))
- return;
-
- // `b` is used to create affineExpr
- Builder b(forOp.getContext());
- for (unsigned i = numDim, e = operands.size(); i < e; ++i) {
- Value operand = operands[i];
- auto lattice =
- solver.lookupState<dataflow::IntegerValueRangeLattice>(operand);
- if (!lattice) {
- symReplacements.push_back(b.getAffineSymbolExpr(i - numDim));
- continue;
- }
-
- if (lattice->getValue().isUninitialized()) {
- symReplacements.push_back(b.getAffineSymbolExpr(i - numDim));
- continue;
- }
-
- ConstantIntRanges range = lattice->getValue().getValue();
- APInt max = range.smax();
- APInt min = range.smin();
- unsigned bitNums = max.getBitWidth();
-
- if (APInt::getSignedMaxValue(bitNums) == max &&
- APInt::getSignedMinValue(bitNums) == min) {
- symReplacements.push_back(b.getAffineSymbolExpr(i - numDim));
- continue;
- }
-
- if (!replaceByMin)
- symReplacements.push_back(b.getAffineConstantExpr(max.getZExtValue()));
- else
- symReplacements.push_back(b.getAffineConstantExpr(min.getZExtValue()));
- }
- return;
-}
-
/// Take the min if all trip counts are constant.
static std::optional<uint64_t>
-getConstantTripCountFromAffineMap(AffineMap map) {
+getConstantTripCountFromAffineMap(AffineMap map,
+ SmallVectorImpl<Value> &operands,
+ presburger::BoundType type) {
std::optional<uint64_t> tripCount;
for (auto resultExpr : map.getResults()) {
- auto constExpr = dyn_cast<AffineConstantExpr>(resultExpr);
- if (!constExpr)
+ AffineMap subMap =
+ AffineMap::get(map.getNumDims(), map.getNumSymbols(), resultExpr);
+ ValueBoundsConstraintSet::Variable var(subMap, operands);
+ auto lbBound = ValueBoundsConstraintSet::computeConstantBound(
+ mlir::presburger::BoundType::LB, var);
+ auto ubBound = ValueBoundsConstraintSet::computeConstantBound(
+ mlir::presburger::BoundType::UB, var, nullptr, true);
+ if (failed(lbBound) || failed(ubBound))
return std::nullopt;
- if (tripCount.has_value())
- tripCount =
- std::min(*tripCount, static_cast<uint64_t>(constExpr.getValue()));
- else
- tripCount = constExpr.getValue();
+ if (type == presburger::BoundType::LB) {
+ if (tripCount.has_value())
+ tripCount =
+ std::min(*tripCount, static_cast<uint64_t>(lbBound.value()));
+ else
+ tripCount = lbBound.value();
+ } else if (type == presburger::BoundType::UB) {
+ if (tripCount.has_value())
+ tripCount =
+ std::min(*tripCount, static_cast<uint64_t>(ubBound.value()));
+ else
+ tripCount = ubBound.value();
+ } else {
+ return std::nullopt;
+ }
}
return tripCount;
}
@@ -166,11 +132,8 @@ std::optional<uint64_t> mlir::affine::getConstantTripCount(AffineForOp forOp) {
if (!map)
return std::nullopt;
- SmallVector<AffineExpr, 4> symReplacements;
- replaceOperandByRange(forOp, operands, symReplacements, map.getNumDims());
- map = map.replaceDimsAndSymbols({}, symReplacements, map.getNumDims(),
- map.getNumSymbols());
- return getConstantTripCountFromAffineMap(map);
+ return getConstantTripCountFromAffineMap(map, operands,
+ presburger::BoundType::LB);
}
/// Returns the maximum trip count when the operand of forOp has a range. If the
@@ -184,12 +147,8 @@ mlir::affine::getUpperBoundOnTripCount(AffineForOp forOp) {
if (!map)
return std::nullopt;
- SmallVector<AffineExpr, 4> symReplacements;
- replaceOperandByRange(forOp, operands, symReplacements, map.getNumDims(),
- true);
- map = map.replaceDimsAndSymbols({}, symReplacements, map.getNumDims(),
- map.getNumSymbols());
- return getConstantTripCountFromAffineMap(map);
+ return getConstantTripCountFromAffineMap(map, operands,
+ presburger::BoundType::UB);
}
/// Returns the greatest known integral divisor of the trip count. Affine
@@ -202,18 +161,20 @@ uint64_t mlir::affine::getLargestDivisorOfTripCount(AffineForOp forOp) {
if (!map)
return 1;
- SmallVector<AffineExpr, 4> symReplacements;
- replaceOperandByRange(forOp, operands, symReplacements, map.getNumDims());
- map = map.replaceDimsAndSymbols({}, symReplacements, map.getNumDims(),
- map.getNumSymbols());
+
// The largest divisor of the trip count is the GCD of the individual largest
// divisors.
assert(map.getNumResults() >= 1 && "expected one or more results");
std::optional<uint64_t> gcd;
for (auto resultExpr : map.getResults()) {
uint64_t thisGcd;
- if (auto constExpr = dyn_cast<AffineConstantExpr>(resultExpr)) {
- uint64_t tripCount = constExpr.getValue();
+ AffineMap subMap =
+ AffineMap::get(map.getNumDims(), map.getNumSymbols(), resultExpr);
+ ValueBoundsConstraintSet::Variable var(subMap, operands);
+ auto lbBound = ValueBoundsConstraintSet::computeConstantBound(
+ mlir::presburger::BoundType::LB, var);
+ if (!failed(lbBound)) {
+ uint64_t tripCount = lbBound.value();
// 0 iteration loops (greatest divisor is 2^64 - 1).
if (tripCount == 0)
thisGcd = std::numeric_limits<uint64_t>::max();
diff --git a/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp b/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp
index f62d01d719633..f5e30a278f06b 100644
--- a/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp
+++ b/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp
@@ -250,34 +250,26 @@ void SubgroupSizeOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
void LaunchOp::inferResultRanges(ArrayRef<ConstantIntRanges> argRanges,
SetIntRangeFn setResultRange) {
auto setRange = [&](const ConstantIntRanges &argRange, Value dimResult,
- Value idxResult, Value size) {
+ Value idxResult) {
if (argRange.umin().getBitWidth() != IndexType::kInternalStorageBitWidth)
return;
- APInt sizeInt;
- if (matchPattern(size, m_ConstantInt(&sizeInt))) {
- ConstantIntRanges dimRange = ConstantIntRanges::constant(sizeInt);
- setResultRange(dimResult, dimRange);
- ConstantIntRanges idxRange = getIndexRange(0, sizeInt.getZExtValue() - 1);
- setResultRange(idxResult, idxRange);
- } else {
- ConstantIntRanges dimRange =
- argRange.intersection(getIndexRange(1, kMaxDim));
- setResultRange(dimResult, dimRange);
- ConstantIntRanges idxRange =
- getIndexRange(0, dimRange.umax().getZExtValue() - 1);
- setResultRange(idxResult, idxRange);
- }
+ ConstantIntRanges dimRange =
+ argRange.intersection(getIndexRange(1, kMaxDim));
+ setResultRange(dimResult, dimRange);
+ ConstantIntRanges idxRange =
+ getIndexRange(0, dimRange.umax().getZExtValue() - 1);
+ setResultRange(idxResult, idxRange);
};
argRanges = argRanges.drop_front(getAsyncDependencies().size());
KernelDim3 gridDims = getGridSize();
KernelDim3 blockIds = getBlockIds();
- setRange(argRanges[0], gridDims.x, blockIds.x, getGridSizeX());
- setRange(argRanges[1], gridDims.y, blockIds.y, getGridSizeY());
- setRange(argRanges[2], gridDims.z, blockIds.z, getGridSizeZ());
+ setRange(argRanges[0], gridDims.x, blockIds.x);
+ setRange(argRanges[1], gridDims.y, blockIds.y);
+ setRange(argRanges[2], gridDims.z, blockIds.z);
KernelDim3 blockDims = getBlockSize();
KernelDim3 threadIds = getThreadIds();
- setRange(argRanges[3], blockDims.x, threadIds.x, getBlockSizeX());
- setRange(argRanges[4], blockDims.y, threadIds.y, getBlockSizeY());
- setRange(argRanges[5], blockDims.z, threadIds.z, getBlockSizeZ());
+ setRange(argRanges[3], blockDims.x, threadIds.x);
+ setRange(argRanges[4], blockDims.y, threadIds.y);
+ setRange(argRanges[5], blockDims.z, threadIds.z);
}
diff --git a/mlir/lib/Interfaces/ValueBoundsOpInterface.cpp b/mlir/lib/Interfaces/ValueBoundsOpInterface.cpp
index 87f883c2e6485..f4408fa9417b5 100644
--- a/mlir/lib/Interfaces/ValueBoundsOpInterface.cpp
+++ b/mlir/lib/Interfaces/ValueBoundsOpInterface.cpp
@@ -646,7 +646,8 @@ FailureOr<int64_t> ValueBoundsConstraintSet::computeConstantBound(
// Compute constant bound for `valueDim`.
int64_t ubAdjustment = closedUB ? 0 : 1;
if (auto bound = cstr.cstr.getConstantBound64(type, pos))
- return type == BoundType::UB ? *bound + ubAdjustment : *bound;
+ if (bound.has_value())
+ return type == BoundType::UB ? *bound + ubAdjustment : *bound;
return failure();
}
>From 82e48ee7ed45cda6b0d410e59097c02bead73a58 Mon Sep 17 00:00:00 2001
From: linuxlonelyeagle <2020382038 at qq.com>
Date: Tue, 18 Mar 2025 10:54:27 +0800
Subject: [PATCH 5/5] rename getConstantTripCountFromAffineMap to
getKnownTripCountBound and rename tripCount to minTripCount.
---
mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp | 11 ++++-------
mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp | 5 +++--
2 files changed, 7 insertions(+), 9 deletions(-)
diff --git a/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp b/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp
index f1a723c919f7e..c8f38cfd8c328 100644
--- a/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp
+++ b/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp
@@ -216,9 +216,8 @@ void mlir::affine::getTripCountMapAndOperands(
/// Take the min if all trip counts are constant.
static std::optional<uint64_t>
-getConstantTripCountFromAffineMap(AffineMap map,
- SmallVectorImpl<Value> &operands,
- presburger::BoundType type) {
+getKnownTripCountBound(AffineMap map, SmallVectorImpl<Value> &operands,
+ presburger::BoundType type) {
std::optional<uint64_t> tripCount;
for (auto resultExpr : map.getResults()) {
AffineMap subMap =
@@ -260,8 +259,7 @@ std::optional<uint64_t> mlir::affine::getConstantTripCount(AffineForOp forOp) {
if (!map)
return std::nullopt;
- return getConstantTripCountFromAffineMap(map, operands,
- presburger::BoundType::LB);
+ return getKnownTripCountBound(map, operands, presburger::BoundType::LB);
}
/// Returns the maximum trip count when the operand of forOp has a range. If the
@@ -275,8 +273,7 @@ mlir::affine::getUpperBoundOnTripCount(AffineForOp forOp) {
if (!map)
return std::nullopt;
- return getConstantTripCountFromAffineMap(map, operands,
- presburger::BoundType::UB);
+ return getKnownTripCountBound(map, operands, presburger::BoundType::UB);
}
/// Returns the greatest known integral divisor of the trip count. Affine
diff --git a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
index 37e58b1332712..efbc87ec740bb 100644
--- a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
+++ b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
@@ -116,9 +116,10 @@ static void replaceIterArgsAndYieldResults(AffineForOp forOp) {
/// Promotes the loop body of a forOp to its containing block if the forOp
/// was known to have a single iteration.
LogicalResult mlir::affine::promoteIfSingleIteration(AffineForOp forOp) {
- std::optional<uint64_t> tripCount = getConstantTripCount(forOp);
+ std::optional<uint64_t> minTripCount = getConstantTripCount(forOp);
std::optional<uint64_t> maxTripCount = getUpperBoundOnTripCount(forOp);
- if (!tripCount || *tripCount != 1 || !maxTripCount || *maxTripCount != 1)
+ if (!minTripCount || *minTripCount != 1 || !maxTripCount ||
+ *maxTripCount != 1)
return failure();
// TODO: extend this for arbitrary affine bounds.
More information about the Mlir-commits
mailing list