[llvm] [CodeLayout] Size-aware machine block placement (PR #109711)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Sep 23 13:31:20 PDT 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-x86
Author: None (spupyrev)
<details>
<summary>Changes</summary>
This is an implementation of a new "size-aware" machine block placement. The
idea is to reorder blocks so that the number of fall-through jumps is maximized.
Note that profile data is ignored by the optimization, and the optimization is
applied only for instances with hasOptSize()=true.
This strategy has two benefits:
(i) it eliminates jump instructions and hence produces smaller binaries;
(ii) we avoid using profile data while reordering blocks, which yields more
"uniform" functions, thus helping ICF and machine outliner/merger.
For large (mobile) apps, the size benefits of (i) and (ii) are roughly the same,
each providing up to 0.5% uncompressed and up to 1% compressed size savings on
top of the current solution.
The optimization is turned off by default.
---
Patch is 53.21 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/109711.diff
2 Files Affected:
- (modified) llvm/lib/CodeGen/MachineBlockPlacement.cpp (+354-314)
- (added) llvm/test/CodeGen/X86/code_placement_ext_tsp_size.ll (+134)
``````````diff
diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
index be783bc4e29738..3677818c8f08df 100644
--- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
@@ -119,10 +119,10 @@ static cl::opt<unsigned> LoopToColdBlockRatio(
"(frequency of block) is greater than this ratio"),
cl::init(5), cl::Hidden);
-static cl::opt<bool> ForceLoopColdBlock(
- "force-loop-cold-block",
- cl::desc("Force outlining cold blocks from loops."),
- cl::init(false), cl::Hidden);
+static cl::opt<bool>
+ ForceLoopColdBlock("force-loop-cold-block",
+ cl::desc("Force outlining cold blocks from loops."),
+ cl::init(false), cl::Hidden);
static cl::opt<bool>
PreciseRotationCost("precise-rotation-cost",
@@ -147,43 +147,43 @@ static cl::opt<unsigned> JumpInstCost("jump-inst-cost",
cl::desc("Cost of jump instructions."),
cl::init(1), cl::Hidden);
static cl::opt<bool>
-TailDupPlacement("tail-dup-placement",
- cl::desc("Perform tail duplication during placement. "
- "Creates more fallthrough opportunites in "
- "outline branches."),
- cl::init(true), cl::Hidden);
+ TailDupPlacement("tail-dup-placement",
+ cl::desc("Perform tail duplication during placement. "
+ "Creates more fallthrough opportunites in "
+ "outline branches."),
+ cl::init(true), cl::Hidden);
static cl::opt<bool>
-BranchFoldPlacement("branch-fold-placement",
- cl::desc("Perform branch folding during placement. "
- "Reduces code size."),
- cl::init(true), cl::Hidden);
+ BranchFoldPlacement("branch-fold-placement",
+ cl::desc("Perform branch folding during placement. "
+ "Reduces code size."),
+ cl::init(true), cl::Hidden);
// Heuristic for tail duplication.
static cl::opt<unsigned> TailDupPlacementThreshold(
"tail-dup-placement-threshold",
cl::desc("Instruction cutoff for tail duplication during layout. "
"Tail merging during layout is forced to have a threshold "
- "that won't conflict."), cl::init(2),
- cl::Hidden);
+ "that won't conflict."),
+ cl::init(2), cl::Hidden);
// Heuristic for aggressive tail duplication.
static cl::opt<unsigned> TailDupPlacementAggressiveThreshold(
"tail-dup-placement-aggressive-threshold",
cl::desc("Instruction cutoff for aggressive tail duplication during "
"layout. Used at -O3. Tail merging during layout is forced to "
- "have a threshold that won't conflict."), cl::init(4),
- cl::Hidden);
+ "have a threshold that won't conflict."),
+ cl::init(4), cl::Hidden);
// Heuristic for tail duplication.
static cl::opt<unsigned> TailDupPlacementPenalty(
"tail-dup-placement-penalty",
- cl::desc("Cost penalty for blocks that can avoid breaking CFG by copying. "
- "Copying can increase fallthrough, but it also increases icache "
- "pressure. This parameter controls the penalty to account for that. "
- "Percent as integer."),
- cl::init(2),
- cl::Hidden);
+ cl::desc(
+ "Cost penalty for blocks that can avoid breaking CFG by copying. "
+ "Copying can increase fallthrough, but it also increases icache "
+ "pressure. This parameter controls the penalty to account for that. "
+ "Percent as integer."),
+ cl::init(2), cl::Hidden);
// Heuristic for tail duplication if profile count is used in cost model.
static cl::opt<unsigned> TailDupProfilePercentThreshold(
@@ -198,8 +198,7 @@ static cl::opt<unsigned> TriangleChainCount(
"triangle-chain-count",
cl::desc("Number of triangle-shaped-CFG's that need to be in a row for the "
"triangle tail duplication heuristic to kick in. 0 to disable."),
- cl::init(2),
- cl::Hidden);
+ cl::init(2), cl::Hidden);
// Use case: When block layout is visualized after MBP pass, the basic blocks
// are labeled in layout order; meanwhile blocks could be numbered in a
@@ -219,6 +218,11 @@ static cl::opt<unsigned> ExtTspBlockPlacementMaxBlocks(
"block placement."),
cl::init(UINT_MAX), cl::Hidden);
+// Apply the ext-tsp algorithm minimizing the size of a binary.
+static cl::opt<bool>
+ ApplyExtTspForSize("apply-ext-tsp-for-size", cl::init(false), cl::Hidden,
+ cl::desc("Use ext-tsp for size-aware block placement."));
+
namespace llvm {
extern cl::opt<bool> EnableExtTspBlockPlacement;
extern cl::opt<bool> ApplyExtTspWithoutProfile;
@@ -292,8 +296,8 @@ class BlockChain {
iterator end() { return Blocks.end(); }
const_iterator end() const { return Blocks.end(); }
- bool remove(MachineBasicBlock* BB) {
- for(iterator i = begin(); i != end(); ++i) {
+ bool remove(MachineBasicBlock *BB) {
+ for (iterator i = begin(); i != end(); ++i) {
if (*i == BB) {
Blocks.erase(i);
return true;
@@ -405,6 +409,8 @@ class MachineBlockPlacement : public MachineFunctionPass {
ProfileSummaryInfo *PSI = nullptr;
+ TargetPassConfig *PassConfig = nullptr;
+
/// Duplicator used to duplicate tails during placement.
///
/// Placement decisions can open up new tail duplication opportunities, but
@@ -415,6 +421,8 @@ class MachineBlockPlacement : public MachineFunctionPass {
/// Partial tail duplication threshold.
BlockFrequency DupThreshold;
+ unsigned TailDupSize;
+
/// True: use block profile count to compute tail duplication cost.
/// False: use block frequency to compute tail duplication cost.
bool UseProfileCount = false;
@@ -459,26 +467,24 @@ class MachineBlockPlacement : public MachineFunctionPass {
/// Scale the DupThreshold according to basic block size.
BlockFrequency scaleThreshold(MachineBasicBlock *BB);
- void initDupThreshold();
+ void initTailDupThreshold();
/// Decrease the UnscheduledPredecessors count for all blocks in chain, and
/// if the count goes to 0, add them to the appropriate work list.
- void markChainSuccessors(
- const BlockChain &Chain, const MachineBasicBlock *LoopHeaderBB,
- const BlockFilterSet *BlockFilter = nullptr);
+ void markChainSuccessors(const BlockChain &Chain,
+ const MachineBasicBlock *LoopHeaderBB,
+ const BlockFilterSet *BlockFilter = nullptr);
/// Decrease the UnscheduledPredecessors count for a single block, and
/// if the count goes to 0, add them to the appropriate work list.
- void markBlockSuccessors(
- const BlockChain &Chain, const MachineBasicBlock *BB,
- const MachineBasicBlock *LoopHeaderBB,
- const BlockFilterSet *BlockFilter = nullptr);
+ void markBlockSuccessors(const BlockChain &Chain, const MachineBasicBlock *BB,
+ const MachineBasicBlock *LoopHeaderBB,
+ const BlockFilterSet *BlockFilter = nullptr);
BranchProbability
- collectViableSuccessors(
- const MachineBasicBlock *BB, const BlockChain &Chain,
- const BlockFilterSet *BlockFilter,
- SmallVector<MachineBasicBlock *, 4> &Successors);
+ collectViableSuccessors(const MachineBasicBlock *BB, const BlockChain &Chain,
+ const BlockFilterSet *BlockFilter,
+ SmallVector<MachineBasicBlock *, 4> &Successors);
bool isBestSuccessor(MachineBasicBlock *BB, MachineBasicBlock *Pred,
BlockFilterSet *BlockFilter);
void findDuplicateCandidates(SmallVectorImpl<MachineBasicBlock *> &Candidates,
@@ -496,16 +502,19 @@ class MachineBlockPlacement : public MachineFunctionPass {
MachineFunction::iterator &PrevUnplacedBlockIt,
BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt,
bool &DuplicatedToLPred);
- bool hasBetterLayoutPredecessor(
- const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
- const BlockChain &SuccChain, BranchProbability SuccProb,
- BranchProbability RealSuccProb, const BlockChain &Chain,
- const BlockFilterSet *BlockFilter);
- BlockAndTailDupResult selectBestSuccessor(
- const MachineBasicBlock *BB, const BlockChain &Chain,
- const BlockFilterSet *BlockFilter);
- MachineBasicBlock *selectBestCandidateBlock(
- const BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList);
+ bool hasBetterLayoutPredecessor(const MachineBasicBlock *BB,
+ const MachineBasicBlock *Succ,
+ const BlockChain &SuccChain,
+ BranchProbability SuccProb,
+ BranchProbability RealSuccProb,
+ const BlockChain &Chain,
+ const BlockFilterSet *BlockFilter);
+ BlockAndTailDupResult selectBestSuccessor(const MachineBasicBlock *BB,
+ const BlockChain &Chain,
+ const BlockFilterSet *BlockFilter);
+ MachineBasicBlock *
+ selectBestCandidateBlock(const BlockChain &Chain,
+ SmallVectorImpl<MachineBasicBlock *> &WorkList);
MachineBasicBlock *
getFirstUnplacedBlock(const BlockChain &PlacedChain,
MachineFunction::iterator &PrevUnplacedBlockIt);
@@ -536,20 +545,19 @@ class MachineBlockPlacement : public MachineFunctionPass {
const MachineBasicBlock *ExitBB,
const BlockFilterSet &LoopBlockSet);
MachineBasicBlock *findBestLoopTopHelper(MachineBasicBlock *OldTop,
- const MachineLoop &L, const BlockFilterSet &LoopBlockSet);
- MachineBasicBlock *findBestLoopTop(
- const MachineLoop &L, const BlockFilterSet &LoopBlockSet);
- MachineBasicBlock *findBestLoopExit(
- const MachineLoop &L, const BlockFilterSet &LoopBlockSet,
- BlockFrequency &ExitFreq);
+ const MachineLoop &L,
+ const BlockFilterSet &LoopBlockSet);
+ MachineBasicBlock *findBestLoopTop(const MachineLoop &L,
+ const BlockFilterSet &LoopBlockSet);
+ MachineBasicBlock *findBestLoopExit(const MachineLoop &L,
+ const BlockFilterSet &LoopBlockSet,
+ BlockFrequency &ExitFreq);
BlockFilterSet collectLoopBlockSet(const MachineLoop &L);
void buildLoopChains(const MachineLoop &L);
- void rotateLoop(
- BlockChain &LoopChain, const MachineBasicBlock *ExitingBB,
- BlockFrequency ExitFreq, const BlockFilterSet &LoopBlockSet);
- void rotateLoopWithProfile(
- BlockChain &LoopChain, const MachineLoop &L,
- const BlockFilterSet &LoopBlockSet);
+ void rotateLoop(BlockChain &LoopChain, const MachineBasicBlock *ExitingBB,
+ BlockFrequency ExitFreq, const BlockFilterSet &LoopBlockSet);
+ void rotateLoopWithProfile(BlockChain &LoopChain, const MachineLoop &L,
+ const BlockFilterSet &LoopBlockSet);
void buildCFGChains();
void optimizeBranches();
void alignBlocks();
@@ -558,10 +566,10 @@ class MachineBlockPlacement : public MachineFunctionPass {
bool shouldTailDuplicate(MachineBasicBlock *BB);
/// Check the edge frequencies to see if tail duplication will increase
/// fallthroughs.
- bool isProfitableToTailDup(
- const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
- BranchProbability QProb,
- const BlockChain &Chain, const BlockFilterSet *BlockFilter);
+ bool isProfitableToTailDup(const MachineBasicBlock *BB,
+ const MachineBasicBlock *Succ,
+ BranchProbability QProb, const BlockChain &Chain,
+ const BlockFilterSet *BlockFilter);
/// Check for a trellis layout.
bool isTrellis(const MachineBasicBlock *BB,
@@ -582,16 +590,17 @@ class MachineBlockPlacement : public MachineFunctionPass {
/// Returns true if a block can tail duplicate into all unplaced
/// predecessors. Filters based on loop.
- bool canTailDuplicateUnplacedPreds(
- const MachineBasicBlock *BB, MachineBasicBlock *Succ,
- const BlockChain &Chain, const BlockFilterSet *BlockFilter);
+ bool canTailDuplicateUnplacedPreds(const MachineBasicBlock *BB,
+ MachineBasicBlock *Succ,
+ const BlockChain &Chain,
+ const BlockFilterSet *BlockFilter);
/// Find chains of triangles to tail-duplicate where a global analysis works,
/// but a local analysis would not find them.
void precomputeTriangleChains();
/// Apply a post-processing step optimizing block placement.
- void applyExtTsp();
+ void applyExtTsp(bool OptForSize);
/// Modify the existing block placement in the function and adjust all jumps.
void assignBlockOrder(const std::vector<const MachineBasicBlock *> &NewOrder);
@@ -802,8 +811,8 @@ bool MachineBlockPlacement::shouldTailDuplicate(MachineBasicBlock *BB) {
/// Compare 2 BlockFrequency's with a small penalty for \p A.
/// In order to be conservative, we apply a X% penalty to account for
/// increased icache pressure and static heuristics. For small frequencies
-/// we use only the numerators to improve accuracy. For simplicity, we assume the
-/// penalty is less than 100%
+/// we use only the numerators to improve accuracy. For simplicity, we assume
+/// the penalty is less than 100%
/// TODO(iteratee): Use 64-bit fixed point edge frequencies everywhere.
static bool greaterWithBias(BlockFrequency A, BlockFrequency B,
BlockFrequency EntryFreq) {
@@ -819,8 +828,8 @@ static bool greaterWithBias(BlockFrequency A, BlockFrequency B,
/// considering duplication.
bool MachineBlockPlacement::isProfitableToTailDup(
const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
- BranchProbability QProb,
- const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
+ BranchProbability QProb, const BlockChain &Chain,
+ const BlockFilterSet *BlockFilter) {
// We need to do a probability calculation to make sure this is profitable.
// First: does succ have a successor that post-dominates? This affects the
// calculation. The 2 relevant cases are:
@@ -876,12 +885,12 @@ bool MachineBlockPlacement::isProfitableToTailDup(
// from BB.
auto SuccBestPred = BlockFrequency(0);
for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
- if (SuccPred == Succ || SuccPred == BB
- || BlockToChain[SuccPred] == &Chain
- || (BlockFilter && !BlockFilter->count(SuccPred)))
+ if (SuccPred == Succ || SuccPred == BB ||
+ BlockToChain[SuccPred] == &Chain ||
+ (BlockFilter && !BlockFilter->count(SuccPred)))
continue;
- auto Freq = MBFI->getBlockFreq(SuccPred)
- * MBPI->getEdgeProbability(SuccPred, Succ);
+ auto Freq =
+ MBFI->getBlockFreq(SuccPred) * MBPI->getEdgeProbability(SuccPred, Succ);
if (Freq > SuccBestPred)
SuccBestPred = Freq;
}
@@ -1137,7 +1146,7 @@ MachineBlockPlacement::getBestTrellisSuccessor(
}
// We have already computed the optimal edge for the other side of the
// trellis.
- ComputedEdges[BestB.Src] = { BestB.Dest, false };
+ ComputedEdges[BestB.Src] = {BestB.Dest, false};
auto TrellisSucc = BestA.Dest;
LLVM_DEBUG(BranchProbability SuccProb = getAdjustedProbability(
@@ -1169,8 +1178,8 @@ bool MachineBlockPlacement::canTailDuplicateUnplacedPreds(
// Make sure all unplaced and unfiltered predecessors can be
// tail-duplicated into.
// Skip any blocks that are already placed or not in this loop.
- if (Pred == BB || (BlockFilter && !BlockFilter->count(Pred))
- || (BlockToChain[Pred] == &Chain && !Succ->succ_empty()))
+ if (Pred == BB || (BlockFilter && !BlockFilter->count(Pred)) ||
+ (BlockToChain[Pred] == &Chain && !Succ->succ_empty()))
continue;
if (!TailDup.canTailDuplicate(Succ, Pred)) {
if (Successors.size() > 1 && hasSameSuccessors(*Pred, Successors))
@@ -1289,9 +1298,7 @@ void MachineBlockPlacement::precomputeTriangleChains() {
unsigned count() const { return Edges.size() - 1; }
- MachineBasicBlock *getKey() const {
- return Edges.back();
- }
+ MachineBasicBlock *getKey() const { return Edges.back(); }
};
if (TriangleChainCount == 0)
@@ -1326,7 +1333,7 @@ void MachineBlockPlacement::precomputeTriangleChains() {
bool CanTailDuplicate = true;
// If PDom can't tail-duplicate into it's non-BB predecessors, then this
// isn't the kind of triangle we're looking for.
- for (MachineBasicBlock* Pred : PDom->predecessors()) {
+ for (MachineBasicBlock *Pred : PDom->predecessors()) {
if (Pred == &BB)
continue;
if (!TailDup.canTailDuplicate(PDom, Pred)) {
@@ -1386,8 +1393,8 @@ void MachineBlockPlacement::precomputeTriangleChains() {
// When profile is not present, return the StaticLikelyProb.
// When profile is available, we need to handle the triangle-shape CFG.
-static BranchProbability getLayoutSuccessorProbThreshold(
- const MachineBasicBlock *BB) {
+static BranchProbability
+getLayoutSuccessorProbThreshold(const MachineBasicBlock *BB) {
if (!BB->getParent()->getFunction().hasProfileData())
return BranchProbability(StaticLikelyProb, 100);
if (BB->succ_size() == 2) {
@@ -1551,8 +1558,8 @@ bool MachineBlockPlacement::hasBetterLayoutPredecessor(
for (MachineBasicBlock *Pred : Succ->predecessors()) {
BlockChain *PredChain = BlockToChain[Pred];
if (Pred == Succ || PredChain == &SuccChain ||
- (BlockFilter && !BlockFilter->count(Pred)) ||
- PredChain == &Chain || Pred != *std::prev(PredChain->end()) ||
+ (BlockFilter && !BlockFilter->count(Pred)) || PredChain == &Chain ||
+ Pred != *std::prev(PredChain->end()) ||
// This check is redundant except for look ahead. This function is
// called for lookahead by isProfitableToTailDup when BB hasn't been
// placed yet.
@@ -1599,12 +1606,12 @@ bool MachineBlockPlacement::hasBetterLayoutPredecessor(
/// \returns The best successor block found, or null if none are viable, along
/// with a boolean indicating if tail duplication is necessary.
MachineBlockPlacement::BlockAndTailDupResult
-MachineBlockPlacement::selectBestSuccessor(
- const MachineBasicBlock *BB, const BlockChain &Chain,
- const BlockFilterSet *BlockFilter) {
+MachineBlockPlacement::selectBestSuccessor(const MachineBasicBlock *BB,
+ const BlockChain &Chain,
+ const BlockFilterSet *BlockFilter) {
const BranchProbability HotProb(StaticLikelyProb, 100);
- BlockAndTailDupResult BestSucc = { nullptr, false };
+ BlockAndTailDupResult BestSucc = {nullptr, false};
auto BestProb = BranchProbability::getZero();
SmallVector<MachineBasicBlock *, 4> Successors;
@@ -1684,8 +1691,8 @@ MachineBlockPlacement::selectBestSuccessor(
std::tie(DupProb, Succ) = Tup;
if (DupProb < BestProb)
break;
- if (canTailDuplicateUnplacedPreds(BB, Succ, Chain, BlockFilter)
- && (isProfitableToTailDup(BB, Succ, BestProb, Chain, BlockFilter))) {
+ if (canTailDuplicateUnplacedPreds(BB, Succ, Chain, BlockFilter) &&
+ (isProfitableToTailDup(BB, Succ, BestProb, Chain, BlockFilter))) {
LLVM_DEBUG(dbgs() << " Candidate: " << getBlockName(Succ)
<< ", probability: " << DupProb
<< " (Tail Duplicate)\n");
@@ -1822,8 +1829,7 @@ MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock(
}
void MachineBlockPlacement::fillWorkLists(
- const MachineBasicBlock *MBB,
- SmallPtrSetImpl<BlockChain *> &UpdatedPreds,
+ const MachineBasicBlock *MBB, SmallPtrSetImpl<BlockChain *> &UpdatedPreds,
const BlockFilterSet *Block...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/109711
More information about the llvm-commits
mailing list