[llvm] [BlockPlacement] Add flag to disable profile usage (PR #102956)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Aug 12 12:07:18 PDT 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-transforms
Author: Ellis Hoag (ellishg)
<details>
<summary>Changes</summary>
Add the hidden `-block-placement-use-profile` LLVM flag (default: `true`) to enable or disable the use of profile data when MachineBlockPlacement makes its block layout and tail-duplication decisions.
When building with `-Oz`, consuming profiles can drastically increase binary size. We found `-block-placement-use-profile=false` gives a slight text size win, which mitigates some of this regression.
---
Full diff: https://github.com/llvm/llvm-project/pull/102956.diff
5 Files Affected:
- (modified) llvm/include/llvm/Transforms/Utils/LoopPeel.h (+2-1)
- (modified) llvm/lib/CodeGen/MachineBlockPlacement.cpp (+14-8)
- (modified) llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp (+9-2)
- (modified) llvm/lib/Transforms/Utils/LoopPeel.cpp (+3-3)
- (modified) llvm/test/Transforms/LoopUnroll/peel-loop-conditions-pgo-1.ll (+4-3)
``````````diff
diff --git a/llvm/include/llvm/Transforms/Utils/LoopPeel.h b/llvm/include/llvm/Transforms/Utils/LoopPeel.h
index 0b78700ca71bb..987c21b7ca561 100644
--- a/llvm/include/llvm/Transforms/Utils/LoopPeel.h
+++ b/llvm/include/llvm/Transforms/Utils/LoopPeel.h
@@ -37,7 +37,8 @@ gatherPeelingPreferences(Loop *L, ScalarEvolution &SE,
void computePeelCount(Loop *L, unsigned LoopSize,
TargetTransformInfo::PeelingPreferences &PP,
unsigned TripCount, DominatorTree &DT,
- ScalarEvolution &SE, AssumptionCache *AC = nullptr,
+ ScalarEvolution &SE, bool UseBranchWeights,
+ AssumptionCache *AC = nullptr,
unsigned Threshold = UINT_MAX);
} // end namespace llvm
diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
index be783bc4e2973..8d5cdc9c08b7f 100644
--- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
@@ -219,6 +219,10 @@ static cl::opt<unsigned> ExtTspBlockPlacementMaxBlocks(
"block placement."),
cl::init(UINT_MAX), cl::Hidden);
+static cl::opt<bool>
+ UseProfileData("block-placement-use-profile", cl::init(true), cl::Hidden,
+ cl::desc("Use profile data to do precise benefit analysis"));
+
namespace llvm {
extern cl::opt<bool> EnableExtTspBlockPlacement;
extern cl::opt<bool> ApplyExtTspWithoutProfile;
@@ -1220,7 +1224,7 @@ bool MachineBlockPlacement::canTailDuplicateUnplacedPreds(
// If profile information is available, findDuplicateCandidates can do more
// precise benefit analysis.
- if (F->getFunction().hasProfileData())
+ if (UseProfileData && F->getFunction().hasProfileData())
return true;
// This is mainly for function exit BB.
@@ -1388,7 +1392,7 @@ void MachineBlockPlacement::precomputeTriangleChains() {
// When profile is available, we need to handle the triangle-shape CFG.
static BranchProbability getLayoutSuccessorProbThreshold(
const MachineBasicBlock *BB) {
- if (!BB->getParent()->getFunction().hasProfileData())
+ if (!UseProfileData || !BB->getParent()->getFunction().hasProfileData())
return BranchProbability(StaticLikelyProb, 100);
if (BB->succ_size() == 2) {
const MachineBasicBlock *Succ1 = *BB->succ_begin();
@@ -2621,7 +2625,8 @@ MachineBlockPlacement::collectLoopBlockSet(const MachineLoop &L) {
// will be merged into the first outer loop chain for which this block is not
// cold anymore. This needs precise profile data and we only do this when
// profile data is available.
- if (F->getFunction().hasProfileData() || ForceLoopColdBlock) {
+ if ((UseProfileData && F->getFunction().hasProfileData()) ||
+ ForceLoopColdBlock) {
BlockFrequency LoopFreq(0);
for (auto *LoopPred : L.getHeader()->predecessors())
if (!L.contains(LoopPred))
@@ -2670,8 +2675,8 @@ void MachineBlockPlacement::buildLoopChains(const MachineLoop &L) {
// this loop by modeling costs more precisely which requires the profile data
// for better layout.
bool RotateLoopWithProfile =
- ForcePreciseRotationCost ||
- (PreciseRotationCost && F->getFunction().hasProfileData());
+ ForcePreciseRotationCost || (PreciseRotationCost && UseProfileData &&
+ F->getFunction().hasProfileData());
// First check to see if there is an obviously preferable top block for the
// loop. This will default to the header, but may end up as one of the
@@ -3208,7 +3213,7 @@ bool MachineBlockPlacement::maybeTailDuplicateBlock(
bool IsSimple = TailDup.isSimpleBB(BB);
SmallVector<MachineBasicBlock *, 8> CandidatePreds;
SmallVectorImpl<MachineBasicBlock *> *CandidatePtr = nullptr;
- if (F->getFunction().hasProfileData()) {
+ if (UseProfileData && F->getFunction().hasProfileData()) {
// We can do partial duplication with precise profile information.
findDuplicateCandidates(CandidatePreds, BB, BlockFilter);
if (CandidatePreds.size() == 0)
@@ -3409,7 +3414,7 @@ void MachineBlockPlacement::findDuplicateCandidates(
void MachineBlockPlacement::initDupThreshold() {
DupThreshold = BlockFrequency(0);
- if (!F->getFunction().hasProfileData())
+ if (!UseProfileData || !F->getFunction().hasProfileData())
return;
// We prefer to use prifile count.
@@ -3529,7 +3534,8 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
// Apply a post-processing optimizing block placement.
if (MF.size() >= 3 && EnableExtTspBlockPlacement &&
- (ApplyExtTspWithoutProfile || MF.getFunction().hasProfileData()) &&
+ (ApplyExtTspWithoutProfile ||
+ (UseProfileData && MF.getFunction().hasProfileData())) &&
MF.size() <= ExtTspBlockPlacementMaxBlocks) {
// Find a new placement and modify the layout of the blocks in the function.
applyExtTsp();
diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
index cbc35b6dd4292..0a446851acf2d 100644
--- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -179,6 +179,12 @@ static cl::opt<unsigned> PragmaUnrollFullMaxIterations(
"pragma-unroll-full-max-iterations", cl::init(1'000'000), cl::Hidden,
cl::desc("Maximum allowed iterations to unroll under pragma unroll full."));
+static cl::opt<bool>
+ UseBranchWeights("loop-unroll-use-branch-weights", cl::init(true),
+ cl::Hidden,
+ cl::desc("Estimate loop trip counts with branch weight "
+ "metadata to help determine the peel count"));
+
/// A magic value for use with the Threshold parameter to indicate
/// that the loop unroll should be performed regardless of how much
/// code expansion would result.
@@ -1012,7 +1018,8 @@ bool llvm::computeUnrollCount(
}
// 5th priority is loop peeling.
- computePeelCount(L, LoopSize, PP, TripCount, DT, SE, AC, UP.Threshold);
+ computePeelCount(L, LoopSize, PP, TripCount, DT, SE, UseBranchWeights, AC,
+ UP.Threshold);
if (PP.PeelCount) {
UP.Runtime = false;
UP.Count = 1;
@@ -1081,7 +1088,7 @@ bool llvm::computeUnrollCount(
}
// Check if the runtime trip count is too small when profile is available.
- if (L->getHeader()->getParent()->hasProfileData()) {
+ if (UseBranchWeights && L->getHeader()->getParent()->hasProfileData()) {
if (auto ProfileTripCount = getLoopEstimatedTripCount(L)) {
if (*ProfileTripCount < FlatLoopTripCountThreshold)
return false;
diff --git a/llvm/lib/Transforms/Utils/LoopPeel.cpp b/llvm/lib/Transforms/Utils/LoopPeel.cpp
index 5d7c0d947facc..9557d31a122a6 100644
--- a/llvm/lib/Transforms/Utils/LoopPeel.cpp
+++ b/llvm/lib/Transforms/Utils/LoopPeel.cpp
@@ -538,8 +538,8 @@ static bool violatesLegacyMultiExitLoopCheck(Loop *L) {
void llvm::computePeelCount(Loop *L, unsigned LoopSize,
TargetTransformInfo::PeelingPreferences &PP,
unsigned TripCount, DominatorTree &DT,
- ScalarEvolution &SE, AssumptionCache *AC,
- unsigned Threshold) {
+ ScalarEvolution &SE, bool UseBranchWeights,
+ AssumptionCache *AC, unsigned Threshold) {
assert(LoopSize > 0 && "Zero loop size is not allowed!");
// Save the PP.PeelCount value set by the target in
// TTI.getPeelingPreferences or by the flag -unroll-peel-count.
@@ -632,7 +632,7 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,
// hit the peeled section.
// We only do this in the presence of profile information, since otherwise
// our estimates of the trip count are not reliable enough.
- if (L->getHeader()->getParent()->hasProfileData()) {
+ if (UseBranchWeights && L->getHeader()->getParent()->hasProfileData()) {
if (violatesLegacyMultiExitLoopCheck(L))
return;
std::optional<unsigned> EstimatedTripCount = getLoopEstimatedTripCount(L);
diff --git a/llvm/test/Transforms/LoopUnroll/peel-loop-conditions-pgo-1.ll b/llvm/test/Transforms/LoopUnroll/peel-loop-conditions-pgo-1.ll
index e3cfe53950f57..c7fb389c63595 100644
--- a/llvm/test/Transforms/LoopUnroll/peel-loop-conditions-pgo-1.ll
+++ b/llvm/test/Transforms/LoopUnroll/peel-loop-conditions-pgo-1.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -S -passes=loop-unroll,loop-unroll -verify-dom-info -debug-only=loop-unroll -unroll-peel-max-count=7 2>&1 | FileCheck %s
+; RUN: opt < %s -S -passes=loop-unroll,loop-unroll -verify-dom-info -debug-only=loop-unroll -unroll-peel-max-count=7 2>&1 | FileCheck %s --check-prefixes=CHECK,PGO
+; RUN: opt < %s -S -passes=loop-unroll,loop-unroll -verify-dom-info -debug-only=loop-unroll -unroll-peel-max-count=7 -loop-unroll-use-branch-weights=false 2>&1 | FileCheck %s
; REQUIRES: asserts
declare void @f1()
@@ -11,8 +12,8 @@ declare void @f2()
define void @test1(i32 %k) !prof !4 {
; CHECK: Loop Unroll: F[test1] Loop %for.body
; CHECK: PEELING loop %for.body with iteration count 2!
-; CHECK: PEELING loop %for.body with iteration count 5!
-; CHECK: llvm.loop.unroll.disable
+; PGO: PEELING loop %for.body with iteration count 5!
+; PGO: llvm.loop.unroll.disable
for.body.lr.ph:
br label %for.body
``````````
</details>
https://github.com/llvm/llvm-project/pull/102956
More information about the llvm-commits
mailing list