[llvm] [LV] Scale block costs using BranchFrequencyInfo (PR #71793)
Simeon K via llvm-commits
llvm-commits at lists.llvm.org
Fri Nov 17 08:27:49 PST 2023
https://github.com/simeonkr updated https://github.com/llvm/llvm-project/pull/71793
>From 3c1c49b20083d3b3ffa645b7e26a795ecbf37b3d Mon Sep 17 00:00:00 2001
From: Simeon Krastnikov <simeon.krastnikov at imgtec.com>
Date: Fri, 3 Nov 2023 15:33:35 +0000
Subject: [PATCH 1/2] [LV] Scale block costs using BranchFrequencyInfo
The probability of a given predicated block executing can be estimated
from its frequency relative to that of the loop header. Use this
estimate to obtain more accurate costs for scalar loops.
---
.../Transforms/Vectorize/LoopVectorize.cpp | 36 ++++++++++++-------
1 file changed, 24 insertions(+), 12 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 1873bb9931320ac..bfc91b7dd95c9b7 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -401,14 +401,23 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}
-/// A helper function that returns the reciprocal of the block probability of
-/// predicated blocks. If we return X, we are assuming the predicated block
-/// will execute once for every X iterations of the loop header.
-///
-/// TODO: We should use actual block probability here, if available. Currently,
-/// we always assume predicated blocks have a 50% chance of executing.
+/// Returns the reciprocal of the probability that a predicated block executes.
+/// Without further information, we assume a probability of 50%.
static unsigned getReciprocalPredBlockProb() { return 2; }
+/// Returns the reciprocal probability of predicated block \p BB in \p L, i.e.
+/// header freq / block freq clamped to >= 1; 2 if \p BFI is null or a freq is 0.
+static unsigned getReciprocalPredBlockProb(BlockFrequencyInfo *BFI,
+                                           Loop *L, BasicBlock *BB) {
+  if (BFI == nullptr)
+    return getReciprocalPredBlockProb();
+  auto HeaderFreq = BFI->getBlockFreq(L->getHeader()).getFrequency();
+  auto BlockFreq = BFI->getBlockFreq(BB).getFrequency();
+  if (HeaderFreq == 0 || BlockFreq == 0)
+    return getReciprocalPredBlockProb();
+  return HeaderFreq < BlockFreq ? 1 : HeaderFreq / BlockFreq;
+}
+
/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
/// 1) Returns exact trip count if it is known.
@@ -1205,10 +1214,11 @@ class LoopVectorizationCostModel {
AssumptionCache *AC,
OptimizationRemarkEmitter *ORE, const Function *F,
const LoopVectorizeHints *Hints,
- InterleavedAccessInfo &IAI)
+ InterleavedAccessInfo &IAI,
+ BlockFrequencyInfo *BFI)
: ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
- Hints(Hints), InterleaveInfo(IAI) {}
+ Hints(Hints), InterleaveInfo(IAI), BFInfo(BFI) {}
/// \return An upper bound for the vectorization factors (both fixed and
/// scalable). If the factors are 0, vectorization and interleaving should be
@@ -1915,6 +1925,8 @@ class LoopVectorizationCostModel {
/// with the same stride and close to each other.
InterleavedAccessInfo &InterleaveInfo;
+ BlockFrequencyInfo *BFInfo;
+
/// Values to ignore in the cost model.
SmallPtrSet<const Value *, 16> ValuesToIgnore;
@@ -4326,7 +4338,7 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
// Scale the cost by the probability of executing the predicated blocks.
// This assumes the predicated block for each vector lane is equally
// likely.
- ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb();
+ ScalarizationCost /= getReciprocalPredBlockProb();
}
InstructionCost SafeDivisorCost = 0;
@@ -6245,7 +6257,7 @@ LoopVectorizationCostModel::expectedCost(
// cost by the probability of executing it. blockNeedsPredication from
// Legal is used so as to not include all blocks in tail folded loops.
if (VF.isScalar() && Legal->blockNeedsPredication(BB))
- BlockCost.first /= getReciprocalPredBlockProb();
+ BlockCost.first /= getReciprocalPredBlockProb(BFInfo, TheLoop, BB);
Cost.first += BlockCost.first;
Cost.second |= BlockCost.second;
@@ -9697,7 +9709,7 @@ static bool processLoopInVPlanNativePath(
getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI);
LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
- &Hints, IAI);
+ &Hints, IAI, BFI);
// Use the planner for outer loop vectorization.
// TODO: CM is not used at this point inside the planner. Turn CM into an
// optional argument if we don't need it in the future.
@@ -10052,7 +10064,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// Use the cost model.
LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
- F, &Hints, IAI);
+ F, &Hints, IAI, BFI);
// Use the planner for vectorization.
LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
ORE);
>From 2fe6c4390748ad297c6249ef804f3742d8967d82 Mon Sep 17 00:00:00 2001
From: Simeon Krastnikov <simeon.krastnikov at imgtec.com>
Date: Fri, 17 Nov 2023 16:17:59 +0000
Subject: [PATCH 2/2] [LV] Add test
---
.../RISCV/branch-frequency-cost.ll | 103 ++++++++++++++++++
1 file changed, 103 insertions(+)
create mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/branch-frequency-cost.ll
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/branch-frequency-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/branch-frequency-cost.ll
new file mode 100644
index 000000000000000..ce33a609251c9d1
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/branch-frequency-cost.ll
@@ -0,0 +1,103 @@
+; RUN: opt -mtriple=riscv64 -mattr=+v -passes='require<profile-summary>,loop-vectorize' -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s
+
+; Check that branch weights affect the computed cost of the scalar loop.
+
+define void @foo_with_wts(ptr %A, ptr %B, i32 %n) {
+; CHECK: LV: Checking a loop in 'foo_with_wts'
+; CHECK: LV: Scalar loop costs: [[COST:[0-9]+]].
+entry:
+ %cmp8 = icmp sgt i32 %n, 0
+ br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %wide.trip.count = zext i32 %n to i64
+ br label %for.body
+
+for.cond.cleanup.loopexit:
+ br label %for.cond.cleanup
+
+for.cond.cleanup:
+ ret void
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+ %0 = trunc i64 %indvars.iv to i32
+ %rem = urem i32 %0, 100
+ %cmp1 = icmp eq i32 %rem, 0
+ br i1 %cmp1, label %if.then, label %for.inc, !prof !0
+
+if.then:
+ %arrayidx = getelementptr inbounds i32, ptr %B, i64 %indvars.iv
+ %1 = load i32, ptr %arrayidx, align 4
+ %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
+ %2 = load i32, ptr %arrayidx3, align 4
+ %udiv = udiv i32 %2, %1
+ store i32 %udiv, ptr %arrayidx3, align 4
+ br label %for.inc
+
+for.inc:
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+
+define void @foo_no_wts(ptr %A, ptr %B, i32 %n) {
+; CHECK: LV: Checking a loop in 'foo_no_wts'
+; CHECK-NOT: LV: Scalar loop costs: [[COST]].
+entry:
+ %cmp8 = icmp sgt i32 %n, 0
+ br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %wide.trip.count = zext i32 %n to i64
+ br label %for.body
+
+for.cond.cleanup.loopexit:
+ br label %for.cond.cleanup
+
+for.cond.cleanup:
+ ret void
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+ %0 = trunc i64 %indvars.iv to i32
+ %rem = urem i32 %0, 100
+ %cmp1 = icmp eq i32 %rem, 0
+ br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+ %arrayidx = getelementptr inbounds i32, ptr %B, i64 %indvars.iv
+ %1 = load i32, ptr %arrayidx, align 4
+ %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
+ %2 = load i32, ptr %arrayidx3, align 4
+ %udiv = udiv i32 %2, %1
+ store i32 %udiv, ptr %arrayidx3, align 4
+ br label %for.inc
+
+for.inc:
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+!0 = !{!"branch_weights", i32 1, i32 100}
+
+; Currently, the loop vectorizer only utilizes BranchFrequencyInfo in the
+; presence of ProfileSummaryInfo (https://reviews.llvm.org/D144953)
+; Fabricate a summary which won't be used:
+!llvm.module.flags = !{!1}
+!1 = !{i32 1, !"ProfileSummary", !2}
+!2 = !{!3, !4, !5, !6, !7, !8, !9, !10}
+!3 = !{!"ProfileFormat", !"InstrProf"}
+!4 = !{!"TotalCount", i64 1000}
+!5 = !{!"MaxCount", i64 10}
+!6 = !{!"MaxInternalCount", i64 1}
+!7 = !{!"MaxFunctionCount", i64 100}
+!8 = !{!"NumCounts", i64 200}
+!9 = !{!"NumFunctions", i64 3}
+!10 = !{!"DetailedSummary", !11}
+!11 = !{!12, !13, !14}
+!12 = !{i32 10000, i64 1000, i32 1}
+!13 = !{i32 990000, i64 300, i32 10}
+!14 = !{i32 999999, i64 5, i32 100}
More information about the llvm-commits
mailing list