[llvm] r239843 - Fix PR 23525 - Separate header mass propagation in irregular loops.
Diego Novillo
dnovillo at google.com
Tue Jun 16 12:10:58 PDT 2015
Author: dnovillo
Date: Tue Jun 16 14:10:58 2015
New Revision: 239843
URL: http://llvm.org/viewvc/llvm-project?rev=239843&view=rev
Log:
Fix PR 23525 - Separate header mass propagation in irregular loops.
Summary:
When propagating mass through irregular loops, the mass flowing through
each loop header may not be equal. This was causing wrong frequencies
to be computed for irregular loop headers.
Fixed by keeping track of masses flowing through each of the headers in
an irregular loop. To do this, we now keep track of per-header backedge
weights. After the loop mass is distributed through the loop, the
backedge weights are used to re-distribute the loop mass to the loop
headers.
Since each backedge will have a mass proportional to the different
branch weights, the loop headers will end up with a more approximate
weight distribution (as opposed to the current distribution that assumes
that every loop header is the same).
Reviewers: dexonsmith
Subscribers: llvm-commits
Differential Revision: http://reviews.llvm.org/D10348
Added:
llvm/trunk/test/Analysis/BlockFrequencyInfo/PR23525.ll
Modified:
llvm/trunk/include/llvm/Analysis/BlockFrequencyInfoImpl.h
llvm/trunk/lib/Analysis/BlockFrequencyInfoImpl.cpp
llvm/trunk/test/Analysis/BlockFrequencyInfo/irreducible.ll
Modified: llvm/trunk/include/llvm/Analysis/BlockFrequencyInfoImpl.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Analysis/BlockFrequencyInfoImpl.h?rev=239843&r1=239842&r2=239843&view=diff
==============================================================================
--- llvm/trunk/include/llvm/Analysis/BlockFrequencyInfoImpl.h (original)
+++ llvm/trunk/include/llvm/Analysis/BlockFrequencyInfoImpl.h Tue Jun 16 14:10:58 2015
@@ -196,23 +196,26 @@ public:
struct LoopData {
typedef SmallVector<std::pair<BlockNode, BlockMass>, 4> ExitMap;
typedef SmallVector<BlockNode, 4> NodeList;
- LoopData *Parent; ///< The parent loop.
- bool IsPackaged; ///< Whether this has been packaged.
- uint32_t NumHeaders; ///< Number of headers.
- ExitMap Exits; ///< Successor edges (and weights).
- NodeList Nodes; ///< Header and the members of the loop.
- BlockMass BackedgeMass; ///< Mass returned to loop header.
+ typedef SmallVector<BlockMass, 1> HeaderMassList;
+ LoopData *Parent; ///< The parent loop.
+ bool IsPackaged; ///< Whether this has been packaged.
+ uint32_t NumHeaders; ///< Number of headers.
+ ExitMap Exits; ///< Successor edges (and weights).
+ NodeList Nodes; ///< Header and the members of the loop.
+ HeaderMassList BackedgeMass; ///< Mass returned to each loop header.
BlockMass Mass;
Scaled64 Scale;
LoopData(LoopData *Parent, const BlockNode &Header)
- : Parent(Parent), IsPackaged(false), NumHeaders(1), Nodes(1, Header) {}
+ : Parent(Parent), IsPackaged(false), NumHeaders(1), Nodes(1, Header),
+ BackedgeMass(1) {}
template <class It1, class It2>
LoopData(LoopData *Parent, It1 FirstHeader, It1 LastHeader, It2 FirstOther,
It2 LastOther)
: Parent(Parent), IsPackaged(false), Nodes(FirstHeader, LastHeader) {
NumHeaders = Nodes.size();
Nodes.insert(Nodes.end(), FirstOther, LastOther);
+ BackedgeMass.resize(NumHeaders);
}
bool isHeader(const BlockNode &Node) const {
if (isIrreducible())
@@ -223,6 +226,14 @@ public:
BlockNode getHeader() const { return Nodes[0]; }
bool isIrreducible() const { return NumHeaders > 1; }
+ HeaderMassList::difference_type headerIndexFor(const BlockNode &B) {
+ assert(isHeader(B) && "this is only valid on loop header blocks");
+ if (isIrreducible())
+ return std::lower_bound(Nodes.begin(), Nodes.begin() + NumHeaders, B) -
+ Nodes.begin();
+ return 0;
+ }
+
NodeList::const_iterator members_begin() const {
return Nodes.begin() + NumHeaders;
}
@@ -431,6 +442,16 @@ public:
/// \brief Compute the loop scale for a loop.
void computeLoopScale(LoopData &Loop);
+ /// Adjust the mass of all headers in an irreducible loop.
+ ///
+ /// Initially, irreducible loops are assumed to distribute their mass
+ /// equally among its headers. This can lead to wrong frequency estimates
+ /// since some headers may be executed more frequently than others.
+ ///
+ /// This adjusts header mass distribution so it matches the weights of
+ /// the backedges going into each of the loop headers.
+ void adjustLoopHeaderMass(LoopData &Loop);
+
/// \brief Package up a loop.
void packageLoop(LoopData &Loop);
@@ -735,11 +756,6 @@ void IrreducibleGraph::addEdges(const Bl
/// as sub-loops, rather than arbitrarily shoving the problematic
/// blocks into the headers of the main irreducible SCC.
///
-/// - Backedge frequencies are assumed to be evenly split between the
-/// headers of a given irreducible SCC. Instead, we could track the
-/// backedge mass separately for each header, and adjust their relative
-/// frequencies.
-///
/// - Entry frequencies are assumed to be evenly split between the
/// headers of a given irreducible SCC, which is the only option if we
/// need to compute mass in the SCC before its parent loop. Instead,
@@ -1042,6 +1058,8 @@ bool BlockFrequencyInfoImpl<BT>::compute
for (const BlockNode &M : Loop.Nodes)
if (!propagateMassToSuccessors(&Loop, M))
llvm_unreachable("unhandled irreducible control flow");
+
+ adjustLoopHeaderMass(Loop);
} else {
Working[Loop.getHeader().Index].getMass() = BlockMass::getFull();
if (!propagateMassToSuccessors(&Loop, Loop.getHeader()))
Modified: llvm/trunk/lib/Analysis/BlockFrequencyInfoImpl.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Analysis/BlockFrequencyInfoImpl.cpp?rev=239843&r1=239842&r2=239843&view=diff
==============================================================================
--- llvm/trunk/lib/Analysis/BlockFrequencyInfoImpl.cpp (original)
+++ llvm/trunk/lib/Analysis/BlockFrequencyInfoImpl.cpp Tue Jun 16 14:10:58 2015
@@ -286,7 +286,7 @@ bool BlockFrequencyInfoImplBase::addToDi
if (isLoopHeader(Resolved)) {
DEBUG(debugSuccessor("backedge"));
- Dist.addBackedge(OuterLoop->getHeader(), Weight);
+ Dist.addBackedge(Resolved, Weight);
return true;
}
@@ -349,7 +349,10 @@ void BlockFrequencyInfoImplBase::compute
// LoopScale == 1 / ExitMass
// ExitMass == HeadMass - BackedgeMass
- BlockMass ExitMass = BlockMass::getFull() - Loop.BackedgeMass;
+ BlockMass TotalBackedgeMass;
+ for (auto &Mass : Loop.BackedgeMass)
+ TotalBackedgeMass += Mass;
+ BlockMass ExitMass = BlockMass::getFull() - TotalBackedgeMass;
// Block scale stores the inverse of the scale. If this is an infinite loop,
// its exit mass will be zero. In this case, use an arbitrary scale for the
@@ -358,7 +361,7 @@ void BlockFrequencyInfoImplBase::compute
ExitMass.isEmpty() ? InifiniteLoopScale : ExitMass.toScaled().inverse();
DEBUG(dbgs() << " - exit-mass = " << ExitMass << " (" << BlockMass::getFull()
- << " - " << Loop.BackedgeMass << ")\n"
+ << " - " << TotalBackedgeMass << ")\n"
<< " - scale = " << Loop.Scale << "\n");
}
@@ -375,6 +378,19 @@ void BlockFrequencyInfoImplBase::package
Loop.IsPackaged = true;
}
+#ifndef NDEBUG
+static void debugAssign(const BlockFrequencyInfoImplBase &BFI,
+ const DitheringDistributer &D, const BlockNode &T,
+ const BlockMass &M, const char *Desc) {
+ dbgs() << " => assign " << M << " (" << D.RemMass << ")";
+ if (Desc)
+ dbgs() << " [" << Desc << "]";
+ if (T.isValid())
+ dbgs() << " to " << BFI.getBlockName(T);
+ dbgs() << "\n";
+}
+#endif
+
void BlockFrequencyInfoImplBase::distributeMass(const BlockNode &Source,
LoopData *OuterLoop,
Distribution &Dist) {
@@ -384,25 +400,12 @@ void BlockFrequencyInfoImplBase::distrib
// Distribute mass to successors as laid out in Dist.
DitheringDistributer D(Dist, Mass);
-#ifndef NDEBUG
- auto debugAssign = [&](const BlockNode &T, const BlockMass &M,
- const char *Desc) {
- dbgs() << " => assign " << M << " (" << D.RemMass << ")";
- if (Desc)
- dbgs() << " [" << Desc << "]";
- if (T.isValid())
- dbgs() << " to " << getBlockName(T);
- dbgs() << "\n";
- };
- (void)debugAssign;
-#endif
-
for (const Weight &W : Dist.Weights) {
// Check for a local edge (non-backedge and non-exit).
BlockMass Taken = D.takeMass(W.Amount);
if (W.Type == Weight::Local) {
Working[W.TargetNode.Index].getMass() += Taken;
- DEBUG(debugAssign(W.TargetNode, Taken, nullptr));
+ DEBUG(debugAssign(*this, D, W.TargetNode, Taken, nullptr));
continue;
}
@@ -411,15 +414,16 @@ void BlockFrequencyInfoImplBase::distrib
// Check for a backedge.
if (W.Type == Weight::Backedge) {
- OuterLoop->BackedgeMass += Taken;
- DEBUG(debugAssign(BlockNode(), Taken, "back"));
+ auto ix = OuterLoop->headerIndexFor(W.TargetNode);
+ OuterLoop->BackedgeMass[ix] += Taken;
+ DEBUG(debugAssign(*this, D, W.TargetNode, Taken, "back"));
continue;
}
// This must be an exit.
assert(W.Type == Weight::Exit);
OuterLoop->Exits.push_back(std::make_pair(W.TargetNode, Taken));
- DEBUG(debugAssign(W.TargetNode, Taken, "exit"));
+ DEBUG(debugAssign(*this, D, W.TargetNode, Taken, "exit"));
}
}
@@ -713,10 +717,44 @@ BlockFrequencyInfoImplBase::analyzeIrred
void
BlockFrequencyInfoImplBase::updateLoopWithIrreducible(LoopData &OuterLoop) {
OuterLoop.Exits.clear();
- OuterLoop.BackedgeMass = BlockMass::getEmpty();
+ for (auto &Mass : OuterLoop.BackedgeMass)
+ Mass = BlockMass::getEmpty();
auto O = OuterLoop.Nodes.begin() + 1;
for (auto I = O, E = OuterLoop.Nodes.end(); I != E; ++I)
if (!Working[I->Index].isPackaged())
*O++ = *I;
OuterLoop.Nodes.erase(O, OuterLoop.Nodes.end());
}
+
+void BlockFrequencyInfoImplBase::adjustLoopHeaderMass(LoopData &Loop) {
+ assert(Loop.isIrreducible() && "this only makes sense on irreducible loops");
+
+ // Since the loop has more than one header block, the mass flowing back into
+ // each header will be different. Adjust the mass in each header loop to
+ // reflect the masses flowing through back edges.
+ //
+ // To do this, we distribute the initial mass using the backedge masses
+ // as weights for the distribution.
+ BlockMass LoopMass = BlockMass::getFull();
+ Distribution Dist;
+
+ DEBUG(dbgs() << "adjust-loop-header-mass:\n");
+ for (uint32_t H = 0; H < Loop.NumHeaders; ++H) {
+ auto &HeaderNode = Loop.Nodes[H];
+ auto &BackedgeMass = Loop.BackedgeMass[Loop.headerIndexFor(HeaderNode)];
+ DEBUG(dbgs() << " - Add back edge mass for node "
+ << getBlockName(HeaderNode) << ": " << BackedgeMass << "\n");
+ Dist.addLocal(HeaderNode, BackedgeMass.getMass());
+ }
+
+ DitheringDistributer D(Dist, LoopMass);
+
+ DEBUG(dbgs() << " Distribute loop mass " << LoopMass
+ << " to headers using above weights\n");
+ for (const Weight &W : Dist.Weights) {
+ BlockMass Taken = D.takeMass(W.Amount);
+ assert(W.Type == Weight::Local && "all weights should be local");
+ Working[W.TargetNode.Index].getMass() = Taken;
+ DEBUG(debugAssign(*this, D, W.TargetNode, Taken, nullptr));
+ }
+}
Added: llvm/trunk/test/Analysis/BlockFrequencyInfo/PR23525.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Analysis/BlockFrequencyInfo/PR23525.ll?rev=239843&view=auto
==============================================================================
--- llvm/trunk/test/Analysis/BlockFrequencyInfo/PR23525.ll (added)
+++ llvm/trunk/test/Analysis/BlockFrequencyInfo/PR23525.ll Tue Jun 16 14:10:58 2015
@@ -0,0 +1,80 @@
+; RUN: opt < %s -analyze -block-freq | FileCheck %s
+
+ at g = global i32 0, align 4
+
+; Function Attrs: inlinehint noinline nounwind uwtable
+define i32 @_Z8hot_loopi(i32 %n) !prof !1 {
+entry:
+ %div = sdiv i32 %n, 2
+ %rem12 = and i32 %n, 1
+ %cmp = icmp eq i32 %rem12, 0
+ br i1 %cmp, label %Next, label %for.cond, !prof !2
+
+; CHECK: - for.cond: float = 25.85{{[0-9]*}}, int = 206
+for.cond: ; preds = %entry, %for.inc
+ %i.0 = phi i32 [ %inc, %for.inc ], [ %div, %entry ]
+ %cmp1 = icmp slt i32 %i.0, %n
+ br i1 %cmp1, label %for.body, label %for.end, !prof !3, !llvm.loop !4
+
+; CHECK: - for.body: float = 24.52, int = 196
+for.body: ; preds = %for.cond
+ %rem213 = and i32 %i.0, 1
+ %cmp3 = icmp eq i32 %rem213, 0
+ br i1 %cmp3, label %if.then.4, label %Next, !prof !6
+
+; CHECK: - if.then.4: float = 12.26{{[0-9]*}}, int = 98
+if.then.4: ; preds = %for.body
+ %0 = load i32, i32* @g, align 4, !tbaa !7
+ %mul = shl nsw i32 %0, 1
+ br label %for.inc
+
+; CHECK: - Next: float = 12.41{{[0-9]*}}, int = 99
+Next: ; preds = %for.body, %entry
+ %i.1 = phi i32 [ %div, %entry ], [ %i.0, %for.body ]
+ %1 = load i32, i32* @g, align 4, !tbaa !7
+ %add = add nsw i32 %1, %n
+ br label %for.inc
+
+; CHECK: - for.inc: float = 38.28{{[0-9]*}}, int = 306
+for.inc: ; preds = %if.then.4, %Next
+ %storemerge = phi i32 [ %add, %Next ], [ %mul, %if.then.4 ]
+ %i.2 = phi i32 [ %i.1, %Next ], [ %i.0, %if.then.4 ]
+ store i32 %storemerge, i32* @g, align 4, !tbaa !7
+ %inc = add nsw i32 %i.2, 1
+ br label %for.cond
+
+; CHECK: - for.end: float = 1.0, int = 8
+for.end: ; preds = %for.cond
+ %2 = load i32, i32* @g, align 4, !tbaa !7
+ ret i32 %2
+}
+
+; Function Attrs: nounwind uwtable
+define i32 @main() !prof !11 {
+entry:
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body
+ ret i32 0
+
+for.body: ; preds = %for.body, %entry
+ %i.04 = phi i32 [ 1, %entry ], [ %inc, %for.body ]
+ %call = tail call i32 @_Z8hot_loopi(i32 %i.04)
+ %inc = add nuw nsw i32 %i.04, 1
+ %exitcond = icmp eq i32 %inc, 100
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body, !prof !12
+}
+
+
+!1 = !{!"function_entry_count", i64 99}
+!2 = !{!"branch_weights", i32 50, i32 51}
+!3 = !{!"branch_weights", i32 2452, i32 100}
+!4 = distinct !{!4, !5}
+!5 = !{!"llvm.loop.unroll.disable"}
+!6 = !{!"branch_weights", i32 1227, i32 1226}
+!7 = !{!8, !8, i64 0}
+!8 = !{!"int", !9, i64 0}
+!9 = !{!"omnipotent char", !10, i64 0}
+!10 = !{!"Simple C/C++ TBAA"}
+!11 = !{!"function_entry_count", i64 1}
+!12 = !{!"branch_weights", i32 2, i32 100}
Modified: llvm/trunk/test/Analysis/BlockFrequencyInfo/irreducible.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Analysis/BlockFrequencyInfo/irreducible.ll?rev=239843&r1=239842&r2=239843&view=diff
==============================================================================
--- llvm/trunk/test/Analysis/BlockFrequencyInfo/irreducible.ll (original)
+++ llvm/trunk/test/Analysis/BlockFrequencyInfo/irreducible.ll Tue Jun 16 14:10:58 2015
@@ -130,9 +130,6 @@ exit:
; At the first step, c1 and c2 each get 1/3 of the entry. At each subsequent
; step, c1 and c2 each get 1/3 of what's left in c1 and c2 combined. This
; infinite series sums to 1.
-;
-; Since the currently algorithm *always* assumes entry blocks are equal,
-; -block-freq gets the right answers here.
define void @crossloops(i2 %x) {
; CHECK-LABEL: Printing analysis {{.*}} for function 'crossloops':
; CHECK-NEXT: block-frequency-info: crossloops
@@ -386,7 +383,7 @@ exit:
;
; This testcases uses non-trivial branch weights. The CHECK statements here
; will start to fail if we change -block-freq to be more accurate. Currently,
-; we expect left, right and top to be treated as equal headers.
+; loop headers are affected by the weight of their corresponding back edges.
define void @nonentry_header(i1 %x, i2 %y) {
; CHECK-LABEL: Printing analysis {{.*}} for function 'nonentry_header':
; CHECK-NEXT: block-frequency-info: nonentry_header
@@ -395,15 +392,15 @@ entry:
br i1 %x, label %left, label %right, !prof !21
left:
-; CHECK-NEXT: left: float = 3.0,
+; CHECK-NEXT: left: float = 0.14{{[0-9]*}},
br i1 %x, label %top, label %bottom, !prof !22
right:
-; CHECK-NEXT: right: float = 3.0,
+; CHECK-NEXT: right: float = 0.42{{[0-9]*}},
br i1 %x, label %top, label %bottom, !prof !22
top:
-; CHECK-NEXT: top: float = 3.0,
+; CHECK-NEXT: top: float = 8.43{{[0-9]*}},
switch i2 %y, label %exit [ i2 0, label %left
i2 1, label %right
i2 2, label %bottom ], !prof !23
More information about the llvm-commits
mailing list