[llvm] [LV] Scale block costs using BranchFrequencyInfo (PR #71793)

Fri Dec 15 08:45:45 PST 2023

https://github.com/simeonkr updated https://github.com/llvm/llvm-project/pull/71793

>From b67a24f7300c83a041df84fd5a9d50f4a80d4543 Mon Sep 17 00:00:00 2001
From: Simeon Krastnikov <simeon.krastnikov at imgtec.com>
Date: Fri, 15 Dec 2023 16:42:24 +0000
Subject: [PATCH 1/2] [LV] Pre-commit test for: Scale block costs using
 BranchFrequencyInfo

---
 .../RISCV/branch-frequency-cost.ll            | 114 ++++++++++++++++++
 1 file changed, 114 insertions(+)
 create mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/branch-frequency-cost.ll

diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/branch-frequency-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/branch-frequency-cost.ll
new file mode 100644
index 00000000000000..d3035362b772c2
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/branch-frequency-cost.ll
@@ -0,0 +1,114 @@
+; RUN: opt -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -riscv-v-register-bit-width-lmul=1 -passes='require<profile-summary>,loop-vectorize' -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s
+
+; FIXME: This test will fail as branch weights are not considered when computing 
+; cost of scalar loop
+
+define void @foo_with_wts(ptr %A, ptr %B, i32 %n) {
+; CHECK: LV: Checking a loop in 'foo_with_wts'
+; CHECK: LV: Scalar loop costs: [[COST:[0-9]+]].
+; CHECK-NOT: vector.body
+entry:
+  %cmp8 = icmp sgt i32 %n, 0
+  br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:
+  br label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+  %0 = trunc i64 %indvars.iv to i32
+  %rem = urem i32 %0, 100
+  %cmp1 = icmp eq i32 %rem, 0
+  br i1 %cmp1, label %if.then, label %for.inc, !prof !0
+
+if.then:
+  %arrayidx = getelementptr inbounds i32, ptr %B, i64 %indvars.iv
+  %1 = load i32, ptr %arrayidx, align 4
+  %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
+  %2 = load i32, ptr %arrayidx3, align 4
+  %udiv1 = udiv i32 %2, %1
+  %udiv2 = udiv i32 %2, %1
+  %udiv3 = udiv i32 %2, %1
+  %udiv4 = udiv i32 %2, %1
+  %udiv5 = udiv i32 %2, %1
+  %udiv6 = udiv i32 %2, %1
+  br label %for.inc
+
+for.inc:
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+
+define void @foo_no_wts(ptr %A, ptr %B, i32 %n) {
+; CHECK: LV: Checking a loop in 'foo_no_wts'
+; CHECK-NOT: LV: Scalar loop costs: [[COST]].
+; CHECK: vector.body
+entry:
+  %cmp8 = icmp sgt i32 %n, 0
+  br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:
+  br label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+  %0 = trunc i64 %indvars.iv to i32
+  %rem = urem i32 %0, 100
+  %cmp1 = icmp eq i32 %rem, 0
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+  %arrayidx = getelementptr inbounds i32, ptr %B, i64 %indvars.iv
+  %1 = load i32, ptr %arrayidx, align 4
+  %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
+  %2 = load i32, ptr %arrayidx3, align 4
+  %udiv1 = udiv i32 %2, %1
+  %udiv2 = udiv i32 %2, %1
+  %udiv3 = udiv i32 %2, %1
+  %udiv4 = udiv i32 %2, %1
+  %udiv5 = udiv i32 %2, %1
+  %udiv6 = udiv i32 %2, %1
+  br label %for.inc
+
+for.inc:
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+!0 = !{!"branch_weights", i32 1, i32 100}
+
+; Currently, the loop vectorizer only utilizes BranchFrequencyInfo in the
+; presence of ProfileSummaryInfo (https://reviews.llvm.org/D144953)
+; Fabricate a summary which won't be used:
+!llvm.module.flags = !{!1}
+!1 = !{i32 1, !"ProfileSummary", !2}
+!2 = !{!3, !4, !5, !6, !7, !8, !9, !10}
+!3 = !{!"ProfileFormat", !"InstrProf"}
+!4 = !{!"TotalCount", i64 1000}
+!5 = !{!"MaxCount", i64 10}
+!6 = !{!"MaxInternalCount", i64 1}
+!7 = !{!"MaxFunctionCount", i64 100}
+!8 = !{!"NumCounts", i64 200}
+!9 = !{!"NumFunctions", i64 3}
+!10 = !{!"DetailedSummary", !11}
+!11 = !{!12, !13, !14}
+!12 = !{i32 10000, i64 1000, i32 1}
+!13 = !{i32 990000, i64 300, i32 10}
+!14 = !{i32 999999, i64 5, i32 100}

>From 9728f546dec1307d724bd6f1bfd980188e157f79 Mon Sep 17 00:00:00 2001
From: Simeon Krastnikov <simeon.krastnikov at imgtec.com>
Date: Fri, 3 Nov 2023 15:33:35 +0000
Subject: [PATCH 2/2] [LV] Scale block costs using BranchFrequencyInfo

The probability of a given predicated block executing can be estimated
from its frequency relative to that of the loop header. Use this
estimate to obtain more accurate costs for scalar loops.
---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 36 ++++++++++++-------
 .../RISCV/branch-frequency-cost.ll            |  3 +-
 2 files changed, 25 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 1873bb9931320a..bfc91b7dd95c9b 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -401,14 +401,23 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
 }
 
-/// A helper function that returns the reciprocal of the block probability of
-/// predicated blocks. If we return X, we are assuming the predicated block
-/// will execute once for every X iterations of the loop header.
-///
-/// TODO: We should use actual block probability here, if available. Currently,
-///       we always assume predicated blocks have a 50% chance of executing.
+/// A helper function that returns the reciprocal of the block probability of a
+/// predicated block. Without further information, we assume a 50% probability.
 static unsigned getReciprocalPredBlockProb() { return 2; }
 
+/// Reciprocal of the probability that \p BB runs per header iteration of \p L.
+static unsigned getReciprocalPredBlockProb(BlockFrequencyInfo *BFI,
+                                           Loop *L, BasicBlock *BB) {
+  if (BFI == nullptr) // No frequency info: assume the default 50%.
+    return getReciprocalPredBlockProb();
+  auto HeaderFreq = BFI->getBlockFreq(L->getHeader()).getFrequency();
+  auto BlockFreq = BFI->getBlockFreq(BB).getFrequency();
+  if (HeaderFreq == 0 || BlockFreq == 0) // Unknown: assume the default 50%.
+    return getReciprocalPredBlockProb();
+  auto Ratio = HeaderFreq / BlockFreq; // Used as a divisor by callers.
+  return Ratio == 0 ? 1 : (Ratio > ~0U ? ~0U : Ratio);
+}
+
 /// Returns "best known" trip count for the specified loop \p L as defined by
 /// the following procedure:
 ///   1) Returns exact trip count if it is known.
@@ -1205,10 +1214,11 @@ class LoopVectorizationCostModel {
                              AssumptionCache *AC,
                              OptimizationRemarkEmitter *ORE, const Function *F,
                              const LoopVectorizeHints *Hints,
-                             InterleavedAccessInfo &IAI)
+                             InterleavedAccessInfo &IAI,
+                             BlockFrequencyInfo *BFI)
       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
-        Hints(Hints), InterleaveInfo(IAI) {}
+        Hints(Hints), InterleaveInfo(IAI), BFInfo(BFI) {}
 
   /// \return An upper bound for the vectorization factors (both fixed and
   /// scalable). If the factors are 0, vectorization and interleaving should be
@@ -1915,6 +1925,8 @@ class LoopVectorizationCostModel {
   /// with the same stride and close to each other.
   InterleavedAccessInfo &InterleaveInfo;
 
+  BlockFrequencyInfo *BFInfo;
+
   /// Values to ignore in the cost model.
   SmallPtrSet<const Value *, 16> ValuesToIgnore;
 
@@ -4326,7 +4338,7 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
     // Scale the cost by the probability of executing the predicated blocks.
     // This assumes the predicated block for each vector lane is equally
     // likely.
-    ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb();
+    ScalarizationCost /= getReciprocalPredBlockProb();
   }
   InstructionCost SafeDivisorCost = 0;
 
@@ -6245,7 +6257,7 @@ LoopVectorizationCostModel::expectedCost(
     // cost by the probability of executing it. blockNeedsPredication from
     // Legal is used so as to not include all blocks in tail folded loops.
     if (VF.isScalar() && Legal->blockNeedsPredication(BB))
-      BlockCost.first /= getReciprocalPredBlockProb();
+      BlockCost.first /= getReciprocalPredBlockProb(BFInfo, TheLoop, BB);
 
     Cost.first += BlockCost.first;
     Cost.second |= BlockCost.second;
@@ -9697,7 +9709,7 @@ static bool processLoopInVPlanNativePath(
       getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI);
 
   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
-                                &Hints, IAI);
+                                &Hints, IAI, BFI);
   // Use the planner for outer loop vectorization.
   // TODO: CM is not used at this point inside the planner. Turn CM into an
   // optional argument if we don't need it in the future.
@@ -10052,7 +10064,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
 
   // Use the cost model.
   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
-                                F, &Hints, IAI);
+                                F, &Hints, IAI, BFI);
   // Use the planner for vectorization.
   LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
                                ORE);
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/branch-frequency-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/branch-frequency-cost.ll
index d3035362b772c2..c59058ffd86ccd 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/branch-frequency-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/branch-frequency-cost.ll
@@ -1,7 +1,6 @@
 ; RUN: opt -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -riscv-v-register-bit-width-lmul=1 -passes='require<profile-summary>,loop-vectorize' -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s
 
-; FIXME: This test will fail as branch weights are not considered when computing 
-; cost of scalar loop
+; Check that branch weights make a difference when computing cost of scalar loop
 
 define void @foo_with_wts(ptr %A, ptr %B, i32 %n) {
 ; CHECK: LV: Checking a loop in 'foo_with_wts'