[llvm] [LoopUtils] Don't wrap in getLoopEstimatedTripCount (PR #129080)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Feb 27 08:22:56 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-transforms
Author: Ramkumar Ramachandra (artagnon)
<details>
<summary>Changes</summary>
getLoopEstimatedTripCount returns the trip count based on profiling data, and its documentation says that it could return 0 when the trip count is zero, but this is not the case: a valid trip count can never be zero, and it returns 0 when the unsigned ExitCount is incremented by 1 and wraps. Some callers are careful about checking for this zero value in an std::optional, but it makes for an API with footguns, as a std::optional return value indicates that a non-nullopt value would be a valid trip count. Fix this by explicitly returning std::nullopt when the return value would wrap, and strip additional checks in callers. This also fixes a minor bug in LoopVectorize.
---
Full diff: https://github.com/llvm/llvm-project/pull/129080.diff
6 Files Affected:
- (modified) llvm/include/llvm/Transforms/Utils/LoopUtils.h (+2-2)
- (modified) llvm/lib/Transforms/Utils/LoopPeel.cpp (+11-13)
- (modified) llvm/lib/Transforms/Utils/LoopUtils.cpp (+6-1)
- (modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+4-4)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll (+43-18)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll (+2-2)
``````````diff
diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
index b4cd52fef70fd..ab58fca6732bb 100644
--- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
@@ -318,8 +318,8 @@ void addStringMetadataToLoop(Loop *TheLoop, const char *MDString,
/// Returns a loop's estimated trip count based on branch weight metadata.
/// In addition if \p EstimatedLoopInvocationWeight is not null it is
/// initialized with weight of loop's latch leading to the exit.
-/// Returns 0 when the count is estimated to be 0, or std::nullopt when a
-/// meaningful estimate can not be made.
+/// Returns a valid positive trip count, or std::nullopt when a meaningful
+/// estimate cannot be made.
std::optional<unsigned>
getLoopEstimatedTripCount(Loop *L,
unsigned *EstimatedLoopInvocationWeight = nullptr);
diff --git a/llvm/lib/Transforms/Utils/LoopPeel.cpp b/llvm/lib/Transforms/Utils/LoopPeel.cpp
index 9a24c1b0d03de..0f3a92b65686c 100644
--- a/llvm/lib/Transforms/Utils/LoopPeel.cpp
+++ b/llvm/lib/Transforms/Utils/LoopPeel.cpp
@@ -640,20 +640,18 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,
LLVM_DEBUG(dbgs() << "Profile-based estimated trip count is "
<< *EstimatedTripCount << "\n");
- if (*EstimatedTripCount) {
- if (*EstimatedTripCount + AlreadyPeeled <= MaxPeelCount) {
- unsigned PeelCount = *EstimatedTripCount;
- LLVM_DEBUG(dbgs() << "Peeling first " << PeelCount << " iterations.\n");
- PP.PeelCount = PeelCount;
- return;
- }
- LLVM_DEBUG(dbgs() << "Already peel count: " << AlreadyPeeled << "\n");
- LLVM_DEBUG(dbgs() << "Max peel count: " << UnrollPeelMaxCount << "\n");
- LLVM_DEBUG(dbgs() << "Loop cost: " << LoopSize << "\n");
- LLVM_DEBUG(dbgs() << "Max peel cost: " << Threshold << "\n");
- LLVM_DEBUG(dbgs() << "Max peel count by cost: "
- << (Threshold / LoopSize - 1) << "\n");
+ if (*EstimatedTripCount + AlreadyPeeled <= MaxPeelCount) {
+ unsigned PeelCount = *EstimatedTripCount;
+ LLVM_DEBUG(dbgs() << "Peeling first " << PeelCount << " iterations.\n");
+ PP.PeelCount = PeelCount;
+ return;
}
+ LLVM_DEBUG(dbgs() << "Already peel count: " << AlreadyPeeled << "\n");
+ LLVM_DEBUG(dbgs() << "Max peel count: " << UnrollPeelMaxCount << "\n");
+ LLVM_DEBUG(dbgs() << "Loop cost: " << LoopSize << "\n");
+ LLVM_DEBUG(dbgs() << "Max peel cost: " << Threshold << "\n");
+ LLVM_DEBUG(dbgs() << "Max peel count by cost: "
+ << (Threshold / LoopSize - 1) << "\n");
}
}
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index 45915c10107b2..aa7f84c7a83b5 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -820,7 +820,7 @@ static BranchInst *getExpectedExitLoopLatchBranch(Loop *L) {
/// Return the estimated trip count for any exiting branch which dominates
/// the loop latch.
-static std::optional<uint64_t> getEstimatedTripCount(BranchInst *ExitingBranch,
+static std::optional<unsigned> getEstimatedTripCount(BranchInst *ExitingBranch,
Loop *L,
uint64_t &OrigExitWeight) {
// To estimate the number of times the loop body was executed, we want to
@@ -842,6 +842,11 @@ static std::optional<uint64_t> getEstimatedTripCount(BranchInst *ExitingBranch,
// Estimated exit count is a ratio of the loop weight by the weight of the
// edge exiting the loop, rounded to nearest.
uint64_t ExitCount = llvm::divideNearest(LoopWeight, ExitWeight);
+
+ // When ExitCount + 1 would wrap in unsigned, return std::nullopt instead.
+ if (ExitCount >= std::numeric_limits<unsigned>::max())
+ return std::nullopt;
+
// Estimated trip count is one plus estimated exit count.
return ExitCount + 1;
}
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e8a5db28ea0a4..b84ae78a9a56a 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -421,8 +421,9 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}
-/// Returns "best known" trip count for the specified loop \p L as defined by
-/// the following procedure:
+/// Returns "best known" trip count, which is either a valid positive trip count
+/// or std::nullopt, for the specified loop \p L as defined by the following
+/// procedure:
/// 1) Returns exact trip count if it is known.
/// 2) Returns expected trip count according to profile data if any.
/// 3) Returns upper bound estimate if known, and if \p CanUseConstantMax.
@@ -2032,7 +2033,6 @@ class GeneratedRTChecks {
PSE, OuterLoop, /* CanUseConstantMax = */ false))
BestTripCount = *EstimatedTC;
- BestTripCount = std::max(BestTripCount, 1U);
InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;
// Let's ensure the cost is always at least 1.
@@ -5029,7 +5029,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
if (TailTripCountUB == TailTripCountLB)
MaxInterleaveCount = InterleaveCountUB;
}
- } else if (BestKnownTC && *BestKnownTC > 0) {
+ } else if (BestKnownTC) {
// At least one iteration must be scalar when this constraint holds. So the
// maximum available iterations for interleaving is one less.
unsigned AvailableTC = requiresScalarEpilogue(VF.isVector())
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll
index 95cbf121377f9..64faaf3947471 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll
@@ -5,7 +5,7 @@ target triple = "aarch64-linux-gnu"
%pair = type { i8, i8 }
-; For a loop with a profile-guided estimated TC of 32, when the auto-vectorizer chooses VF 16,
+; For a loop with a profile-guided estimated TC of 32, when the auto-vectorizer chooses VF 16,
; it should conservatively choose IC 1 so that the vector loop runs twice at least
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
define void @loop_with_profile_tc_32(ptr noalias %p, ptr noalias %q, i64 %n) {
@@ -29,7 +29,7 @@ for.end:
ret void
}
-; For a loop with a profile-guided estimated TC of 33, when the auto-vectorizer chooses VF 16,
+; For a loop with a profile-guided estimated TC of 33, when the auto-vectorizer chooses VF 16,
; it should conservatively choose IC 1 so that the vector loop runs twice at least
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
define void @loop_with_profile_tc_33(ptr noalias %p, ptr noalias %q, i64 %n) {
@@ -53,7 +53,7 @@ for.end:
ret void
}
-; For a loop with a profile-guided estimated TC of 48, when the auto-vectorizer chooses VF 16,
+; For a loop with a profile-guided estimated TC of 48, when the auto-vectorizer chooses VF 16,
; it should conservatively choose IC 1 so that the vector loop runs twice at least
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
define void @loop_with_profile_tc_48(ptr noalias %p, ptr noalias %q, i64 %n) {
@@ -77,7 +77,7 @@ for.end:
ret void
}
-; For a loop with a profile-guided estimated TC of 63, when the auto-vectorizer chooses VF 16,
+; For a loop with a profile-guided estimated TC of 63, when the auto-vectorizer chooses VF 16,
; it should conservatively choose IC 1 so that the vector loop runs twice at least
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
define void @loop_with_profile_tc_63(ptr noalias %p, ptr noalias %q, i64 %n) {
@@ -101,7 +101,7 @@ for.end:
ret void
}
-; For a loop with a profile-guided estimated TC of 64, when the auto-vectorizer chooses VF 16,
+; For a loop with a profile-guided estimated TC of 64, when the auto-vectorizer chooses VF 16,
; it should choose conservatively IC 2 so that the vector loop runs twice at least
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
define void @loop_with_profile_tc_64(ptr noalias %p, ptr noalias %q, i64 %n) {
@@ -125,10 +125,10 @@ for.end:
ret void
}
-; This has the same profile-guided estimated trip count as loop_with_profile_tc_64 but since the
-; resulting interleaved group in this case may access memory out-of-bounds, it requires a scalar
+; This has the same profile-guided estimated trip count as loop_with_profile_tc_64 but since the
+; resulting interleaved group in this case may access memory out-of-bounds, it requires a scalar
; epilogue iteration for correctness, making at most 63 iterations available for interleaving.
-; When the auto-vectorizer chooses VF 16, it should choose IC 1 to leave a smaller scalar
+; When the auto-vectorizer chooses VF 16, it should choose IC 1 to leave a smaller scalar
; remainder than IC 2
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
define void @loop_with_profile_tc_64_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q, i64 %n) {
@@ -149,7 +149,7 @@ for.end:
ret void
}
-; For a loop with a profile-guided estimated TC of 100, when the auto-vectorizer chooses VF 16,
+; For a loop with a profile-guided estimated TC of 100, when the auto-vectorizer chooses VF 16,
; it should choose conservatively IC 2 so that the vector loop runs twice at least
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
define void @loop_with_profile_tc_100(ptr noalias %p, ptr noalias %q, i64 %n) {
@@ -173,7 +173,7 @@ for.end:
ret void
}
-; For a loop with a profile-guided estimated TC of 128, when the auto-vectorizer chooses VF 16,
+; For a loop with a profile-guided estimated TC of 128, when the auto-vectorizer chooses VF 16,
; it should choose conservatively IC 4 so that the vector loop runs twice at least
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
define void @loop_with_profile_tc_128(ptr noalias %p, ptr noalias %q, i64 %n) {
@@ -206,11 +206,11 @@ for.end:
ret void
}
-; This has the same profile-guided estimated trip count as loop_with_profile_tc_128 but since
-; the resulting interleaved group in this case may access memory out-of-bounds, it requires
-; a scalar epilogue iteration for correctness, making at most 127 iterations available for
+; This has the same profile-guided estimated trip count as loop_with_profile_tc_128 but since
+; the resulting interleaved group in this case may access memory out-of-bounds, it requires
+; a scalar epilogue iteration for correctness, making at most 127 iterations available for
; interleaving.
-; When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar
+; When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar
; remainder than IC 4
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
define void @loop_with_profile_tc_128_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q, i64 %n) {
@@ -240,7 +240,7 @@ for.end:
ret void
}
-; For a loop with a profile-guided estimated TC of 129, when the auto-vectorizer chooses VF 16,
+; For a loop with a profile-guided estimated TC of 129, when the auto-vectorizer chooses VF 16,
; it should choose conservatively IC 4 so that the vector loop runs twice at least
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
define void @loop_with_profile_tc_129(ptr noalias %p, ptr noalias %q, i64 %n) {
@@ -264,7 +264,7 @@ for.end:
ret void
}
-; For a loop with a profile-guided estimated TC of 180, when the auto-vectorizer chooses VF 16,
+; For a loop with a profile-guided estimated TC of 180, when the auto-vectorizer chooses VF 16,
; it should choose conservatively IC 4 so that the vector loop runs twice at least
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
define void @loop_with_profile_tc_180(ptr noalias %p, ptr noalias %q, i64 %n) {
@@ -288,7 +288,7 @@ for.end:
ret void
}
-; For a loop with a profile-guided estimated TC of 193, when the auto-vectorizer chooses VF 16,
+; For a loop with a profile-guided estimated TC of 193, when the auto-vectorizer chooses VF 16,
; it should choose conservatively IC 4 so that the vector loop runs twice at least
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
define void @loop_with_profile_tc_193(ptr noalias %p, ptr noalias %q, i64 %n) {
@@ -312,7 +312,7 @@ for.end:
ret void
}
-; For a loop with a profile-guided estimated TC of 1000, when the auto-vectorizer chooses VF 16,
+; For a loop with a profile-guided estimated TC of 1000, when the auto-vectorizer chooses VF 16,
; the IC will be capped by the target-specific maximum interleave count
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
define void @loop_with_profile_tc_1000(ptr noalias %p, ptr noalias %q, i64 %n) {
@@ -336,6 +336,30 @@ for.end:
ret void
}
+; When the loop weight is UINT_MAX, and the exit count is 1, the trip count
+; computation could wrap.
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
+define void @loop_with_profile_wrap(ptr noalias %p, ptr noalias %q, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+ %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
+ %tmp1 = load i8, ptr %tmp0, align 1
+ %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
+ %tmp3 = load i8, ptr %tmp2, align 1
+ %add = add i8 %tmp1, %tmp3
+ %qi = getelementptr i8, ptr %q, i64 %i
+ store i8 %add, ptr %qi, align 1
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp eq i64 %i.next, %n
+ br i1 %cond, label %for.end, label %for.body, !prof !11
+
+for.end:
+ ret void
+}
+
!0 = !{!"branch_weights", i32 1, i32 31}
!1 = !{!"branch_weights", i32 1, i32 32}
!2 = !{!"branch_weights", i32 1, i32 47}
@@ -347,3 +371,4 @@ for.end:
!8 = !{!"branch_weights", i32 1, i32 179}
!9 = !{!"branch_weights", i32 1, i32 192}
!10 = !{!"branch_weights", i32 1, i32 999}
+!11 = !{!"branch_weights", i32 1, i32 -1}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll
index 800c55d6740bc..c685655f67a6b 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll
@@ -180,8 +180,8 @@ outer.exit:
define void @outer_pgo_minus1(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %m, i64 noundef %n) {
; CHECK-LABEL: LV: Checking a loop in 'outer_pgo_minus1'
; CHECK: Calculating cost of runtime checks:
-; CHECK-NOT: We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced
-; CHECK: Total cost of runtime checks: 6
+; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced from 6 to 3
+; CHECK: Total cost of runtime checks: 3
; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
entry:
br label %outer.loop
``````````
</details>
https://github.com/llvm/llvm-project/pull/129080
More information about the llvm-commits
mailing list