[llvm] [BlockPlacement] Add flag to disable profile usage (PR #102956)

via llvm-commits llvm-commits at lists.llvm.org
Mon Aug 12 12:07:18 PDT 2024


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-llvm-transforms

Author: Ellis Hoag (ellishg)

<details>
<summary>Changes</summary>

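Create the `-block-placement-use-profile` LLVM flag to enable or disable the use of profile data when MachineBlockPlacement makes layout and tail-duplication decisions. The diff below also adds a `-loop-unroll-use-branch-weights` flag that controls whether branch weight metadata is used to estimate loop trip counts for peeling and unrolling.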

When building with `-Oz`, consuming profiles can drastically increase binary size. We found `-block-placement-use-profile=false` gives a slight text size win, which mitigates some of this regression.

---
Full diff: https://github.com/llvm/llvm-project/pull/102956.diff


5 Files Affected:

- (modified) llvm/include/llvm/Transforms/Utils/LoopPeel.h (+2-1) 
- (modified) llvm/lib/CodeGen/MachineBlockPlacement.cpp (+14-8) 
- (modified) llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp (+9-2) 
- (modified) llvm/lib/Transforms/Utils/LoopPeel.cpp (+3-3) 
- (modified) llvm/test/Transforms/LoopUnroll/peel-loop-conditions-pgo-1.ll (+4-3) 


``````````diff
diff --git a/llvm/include/llvm/Transforms/Utils/LoopPeel.h b/llvm/include/llvm/Transforms/Utils/LoopPeel.h
index 0b78700ca71bb..987c21b7ca561 100644
--- a/llvm/include/llvm/Transforms/Utils/LoopPeel.h
+++ b/llvm/include/llvm/Transforms/Utils/LoopPeel.h
@@ -37,7 +37,8 @@ gatherPeelingPreferences(Loop *L, ScalarEvolution &SE,
 void computePeelCount(Loop *L, unsigned LoopSize,
                       TargetTransformInfo::PeelingPreferences &PP,
                       unsigned TripCount, DominatorTree &DT,
-                      ScalarEvolution &SE, AssumptionCache *AC = nullptr,
+                      ScalarEvolution &SE, bool UseBranchWeights,
+                      AssumptionCache *AC = nullptr,
                       unsigned Threshold = UINT_MAX);
 
 } // end namespace llvm
diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
index be783bc4e2973..8d5cdc9c08b7f 100644
--- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
@@ -219,6 +219,10 @@ static cl::opt<unsigned> ExtTspBlockPlacementMaxBlocks(
              "block placement."),
     cl::init(UINT_MAX), cl::Hidden);
 
+static cl::opt<bool>
+    UseProfileData("block-placement-use-profile", cl::init(true), cl::Hidden,
+                   cl::desc("Use profile data to do precise benefit analysis"));
+
 namespace llvm {
 extern cl::opt<bool> EnableExtTspBlockPlacement;
 extern cl::opt<bool> ApplyExtTspWithoutProfile;
@@ -1220,7 +1224,7 @@ bool MachineBlockPlacement::canTailDuplicateUnplacedPreds(
 
   // If profile information is available, findDuplicateCandidates can do more
   // precise benefit analysis.
-  if (F->getFunction().hasProfileData())
+  if (UseProfileData && F->getFunction().hasProfileData())
     return true;
 
   // This is mainly for function exit BB.
@@ -1388,7 +1392,7 @@ void MachineBlockPlacement::precomputeTriangleChains() {
 // When profile is available, we need to handle the triangle-shape CFG.
 static BranchProbability getLayoutSuccessorProbThreshold(
       const MachineBasicBlock *BB) {
-  if (!BB->getParent()->getFunction().hasProfileData())
+  if (!UseProfileData || !BB->getParent()->getFunction().hasProfileData())
     return BranchProbability(StaticLikelyProb, 100);
   if (BB->succ_size() == 2) {
     const MachineBasicBlock *Succ1 = *BB->succ_begin();
@@ -2621,7 +2625,8 @@ MachineBlockPlacement::collectLoopBlockSet(const MachineLoop &L) {
   // will be merged into the first outer loop chain for which this block is not
   // cold anymore. This needs precise profile data and we only do this when
   // profile data is available.
-  if (F->getFunction().hasProfileData() || ForceLoopColdBlock) {
+  if ((UseProfileData && F->getFunction().hasProfileData()) ||
+      ForceLoopColdBlock) {
     BlockFrequency LoopFreq(0);
     for (auto *LoopPred : L.getHeader()->predecessors())
       if (!L.contains(LoopPred))
@@ -2670,8 +2675,8 @@ void MachineBlockPlacement::buildLoopChains(const MachineLoop &L) {
   // this loop by modeling costs more precisely which requires the profile data
   // for better layout.
   bool RotateLoopWithProfile =
-      ForcePreciseRotationCost ||
-      (PreciseRotationCost && F->getFunction().hasProfileData());
+      ForcePreciseRotationCost || (PreciseRotationCost && UseProfileData &&
+                                   F->getFunction().hasProfileData());
 
   // First check to see if there is an obviously preferable top block for the
   // loop. This will default to the header, but may end up as one of the
@@ -3208,7 +3213,7 @@ bool MachineBlockPlacement::maybeTailDuplicateBlock(
   bool IsSimple = TailDup.isSimpleBB(BB);
   SmallVector<MachineBasicBlock *, 8> CandidatePreds;
   SmallVectorImpl<MachineBasicBlock *> *CandidatePtr = nullptr;
-  if (F->getFunction().hasProfileData()) {
+  if (UseProfileData && F->getFunction().hasProfileData()) {
     // We can do partial duplication with precise profile information.
     findDuplicateCandidates(CandidatePreds, BB, BlockFilter);
     if (CandidatePreds.size() == 0)
@@ -3409,7 +3414,7 @@ void MachineBlockPlacement::findDuplicateCandidates(
 
 void MachineBlockPlacement::initDupThreshold() {
   DupThreshold = BlockFrequency(0);
-  if (!F->getFunction().hasProfileData())
+  if (!UseProfileData || !F->getFunction().hasProfileData())
     return;
 
   // We prefer to use prifile count.
@@ -3529,7 +3534,8 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
 
   // Apply a post-processing optimizing block placement.
   if (MF.size() >= 3 && EnableExtTspBlockPlacement &&
-      (ApplyExtTspWithoutProfile || MF.getFunction().hasProfileData()) &&
+      (ApplyExtTspWithoutProfile ||
+       (UseProfileData && MF.getFunction().hasProfileData())) &&
       MF.size() <= ExtTspBlockPlacementMaxBlocks) {
     // Find a new placement and modify the layout of the blocks in the function.
     applyExtTsp();
diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
index cbc35b6dd4292..0a446851acf2d 100644
--- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -179,6 +179,12 @@ static cl::opt<unsigned> PragmaUnrollFullMaxIterations(
     "pragma-unroll-full-max-iterations", cl::init(1'000'000), cl::Hidden,
     cl::desc("Maximum allowed iterations to unroll under pragma unroll full."));
 
+static cl::opt<bool>
+    UseBranchWeights("loop-unroll-use-branch-weights", cl::init(true),
+                     cl::Hidden,
+                     cl::desc("Estimate loop trip counts with branch weight "
+                              "metadata to help determine the peel count"));
+
 /// A magic value for use with the Threshold parameter to indicate
 /// that the loop unroll should be performed regardless of how much
 /// code expansion would result.
@@ -1012,7 +1018,8 @@ bool llvm::computeUnrollCount(
   }
 
   // 5th priority is loop peeling.
-  computePeelCount(L, LoopSize, PP, TripCount, DT, SE, AC, UP.Threshold);
+  computePeelCount(L, LoopSize, PP, TripCount, DT, SE, UseBranchWeights, AC,
+                   UP.Threshold);
   if (PP.PeelCount) {
     UP.Runtime = false;
     UP.Count = 1;
@@ -1081,7 +1088,7 @@ bool llvm::computeUnrollCount(
   }
 
   // Check if the runtime trip count is too small when profile is available.
-  if (L->getHeader()->getParent()->hasProfileData()) {
+  if (UseBranchWeights && L->getHeader()->getParent()->hasProfileData()) {
     if (auto ProfileTripCount = getLoopEstimatedTripCount(L)) {
       if (*ProfileTripCount < FlatLoopTripCountThreshold)
         return false;
diff --git a/llvm/lib/Transforms/Utils/LoopPeel.cpp b/llvm/lib/Transforms/Utils/LoopPeel.cpp
index 5d7c0d947facc..9557d31a122a6 100644
--- a/llvm/lib/Transforms/Utils/LoopPeel.cpp
+++ b/llvm/lib/Transforms/Utils/LoopPeel.cpp
@@ -538,8 +538,8 @@ static bool violatesLegacyMultiExitLoopCheck(Loop *L) {
 void llvm::computePeelCount(Loop *L, unsigned LoopSize,
                             TargetTransformInfo::PeelingPreferences &PP,
                             unsigned TripCount, DominatorTree &DT,
-                            ScalarEvolution &SE, AssumptionCache *AC,
-                            unsigned Threshold) {
+                            ScalarEvolution &SE, bool UseBranchWeights,
+                            AssumptionCache *AC, unsigned Threshold) {
   assert(LoopSize > 0 && "Zero loop size is not allowed!");
   // Save the PP.PeelCount value set by the target in
   // TTI.getPeelingPreferences or by the flag -unroll-peel-count.
@@ -632,7 +632,7 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,
   // hit the peeled section.
   // We only do this in the presence of profile information, since otherwise
   // our estimates of the trip count are not reliable enough.
-  if (L->getHeader()->getParent()->hasProfileData()) {
+  if (UseBranchWeights && L->getHeader()->getParent()->hasProfileData()) {
     if (violatesLegacyMultiExitLoopCheck(L))
       return;
     std::optional<unsigned> EstimatedTripCount = getLoopEstimatedTripCount(L);
diff --git a/llvm/test/Transforms/LoopUnroll/peel-loop-conditions-pgo-1.ll b/llvm/test/Transforms/LoopUnroll/peel-loop-conditions-pgo-1.ll
index e3cfe53950f57..c7fb389c63595 100644
--- a/llvm/test/Transforms/LoopUnroll/peel-loop-conditions-pgo-1.ll
+++ b/llvm/test/Transforms/LoopUnroll/peel-loop-conditions-pgo-1.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -S -passes=loop-unroll,loop-unroll -verify-dom-info -debug-only=loop-unroll -unroll-peel-max-count=7 2>&1 | FileCheck %s
+; RUN: opt < %s -S -passes=loop-unroll,loop-unroll -verify-dom-info -debug-only=loop-unroll -unroll-peel-max-count=7 2>&1 | FileCheck %s --check-prefixes=CHECK,PGO
+; RUN: opt < %s -S -passes=loop-unroll,loop-unroll -verify-dom-info -debug-only=loop-unroll -unroll-peel-max-count=7 -loop-unroll-use-branch-weights=false 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 
 declare void @f1()
@@ -11,8 +12,8 @@ declare void @f2()
 define void @test1(i32 %k) !prof !4 {
 ; CHECK: Loop Unroll: F[test1] Loop %for.body
 ; CHECK: PEELING loop %for.body with iteration count 2!
-; CHECK: PEELING loop %for.body with iteration count 5!
-; CHECK: llvm.loop.unroll.disable
+; PGO: PEELING loop %for.body with iteration count 5!
+; PGO: llvm.loop.unroll.disable
 for.body.lr.ph:
   br label %for.body
 

``````````

</details>


https://github.com/llvm/llvm-project/pull/102956


More information about the llvm-commits mailing list