[llvm] r251833 - In MachineBlockPlacement, filter cold blocks off the loop chain when profile data is available.

Mon Nov 2 13:24:00 PST 2015

Author: conghou
Date: Mon Nov  2 15:24:00 2015
New Revision: 251833

URL: http://llvm.org/viewvc/llvm-project?rev=251833&view=rev
Log:
In MachineBlockPlacement, filter cold blocks off the loop chain when profile data is available.

In the current BB placement algorithm, a loop chain always contains all loop blocks. This has a drawback that cold blocks in the loop may be inserted on a hot function path, hence increasing branch cost and also reducing icache locality.

Consider a simple example shown below:

A
|
B⇆C
|
D

When B->C is quite cold, the best BB-layout should be A,B,D,C. But the current implementation produces A,C,B,D.

This patch filters those cold blocks off from the loop chain by comparing the ratio:

LoopBBFreq / LoopFreq

to 20%: if it is less than 20%, we don't include this BB to the loop chain. Here LoopFreq is the frequency of the loop when we reduce the loop into a single node. In general we have more cold blocks when the loop has few iterations. And vice versa.


Differential revision: http://reviews.llvm.org/D11662



Added:
    llvm/trunk/test/CodeGen/X86/code_placement_cold_loop_blocks.ll
Modified:
    llvm/trunk/lib/CodeGen/MachineBlockPlacement.cpp

Modified: llvm/trunk/lib/CodeGen/MachineBlockPlacement.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/MachineBlockPlacement.cpp?rev=251833&r1=251832&r2=251833&view=diff
==============================================================================

--- llvm/trunk/lib/CodeGen/MachineBlockPlacement.cpp (original)
+++ llvm/trunk/lib/CodeGen/MachineBlockPlacement.cpp Mon Nov  2 15:24:00 2015
@@ -81,6 +81,12 @@ static cl::opt<unsigned> OutlineOptional
              "instruction count below this threshold"),
     cl::init(4), cl::Hidden);
 
+static cl::opt<unsigned> LoopToColdBlockRatio(
+    "loop-to-cold-block-ratio",
+    cl::desc("Outline loop blocks from loop chain if (frequency of loop) / "
+             "(frequency of block) is greater than this ratio"),
+    cl::init(5), cl::Hidden);
+
 static cl::opt<bool>
     PreciseRotationCost("precise-rotation-cost",
                         cl::desc("Model the cost of loop rotation more "
@@ -263,6 +269,7 @@ class MachineBlockPlacement : public Mac
                                      const BlockFilterSet &LoopBlockSet);
   MachineBasicBlock *findBestLoopExit(MachineFunction &F, MachineLoop &L,
                                       const BlockFilterSet &LoopBlockSet);
+  BlockFilterSet collectLoopBlockSet(MachineFunction &F, MachineLoop &L);
   void buildLoopChains(MachineFunction &F, MachineLoop &L);
   void rotateLoop(BlockChain &LoopChain, MachineBasicBlock *ExitingBB,
                   const BlockFilterSet &LoopBlockSet);
@@ -960,6 +967,42 @@ void MachineBlockPlacement::rotateLoopWi
   }
 }
 
+/// \brief Collect blocks in the given loop that are to be placed.
+///
+/// When profile data is available, exclude cold blocks from the returned set;
+/// otherwise, collect all blocks in the loop.
+MachineBlockPlacement::BlockFilterSet
+MachineBlockPlacement::collectLoopBlockSet(MachineFunction &F, MachineLoop &L) {
+  BlockFilterSet LoopBlockSet;
+
+  // Filter cold blocks off from LoopBlockSet when profile data is available.
+  // Collect the sum of frequencies of incoming edges to the loop header from
+  // outside. If we treat the loop as a super block, this is the frequency of
+  // the loop. Then for each block in the loop, we calculate the ratio between
+  // its frequency and the frequency of the loop block. When it is too small,
+  // don't add it to the loop chain. If there are outer loops, then this block
+  // will be merged into the first outer loop chain for which this block is not
+  // cold anymore. This needs precise profile data and we only do this when
+  // profile data is available.
+  if (F.getFunction()->getEntryCount()) {
+    BlockFrequency LoopFreq(0);
+    for (auto LoopPred : L.getHeader()->predecessors())
+      if (!L.contains(LoopPred))
+        LoopFreq += MBFI->getBlockFreq(LoopPred) *
+                    MBPI->getEdgeProbability(LoopPred, L.getHeader());
+
+    for (MachineBasicBlock *LoopBB : L.getBlocks()) {
+      auto Freq = MBFI->getBlockFreq(LoopBB).getFrequency();
+      if (Freq == 0 || LoopFreq.getFrequency() / Freq > LoopToColdBlockRatio)
+        continue;
+      LoopBlockSet.insert(LoopBB);
+    }
+  } else
+    LoopBlockSet.insert(L.block_begin(), L.block_end());
+
+  return LoopBlockSet;
+}
+
 /// \brief Forms basic block chains from the natural loop structures.
 ///
 /// These chains are designed to preserve the existing *structure* of the code
@@ -974,7 +1017,7 @@ void MachineBlockPlacement::buildLoopCha
     buildLoopChains(F, *InnerLoop);
 
   SmallVector<MachineBasicBlock *, 16> BlockWorkList;
-  BlockFilterSet LoopBlockSet(L.block_begin(), L.block_end());
+  BlockFilterSet LoopBlockSet = collectLoopBlockSet(F, L);
 
   // Check if we have profile data for this function. If yes, we will rotate
   // this loop by modeling costs more precisely which requires the profile data
@@ -1005,7 +1048,8 @@ void MachineBlockPlacement::buildLoopCha
   SmallPtrSet<BlockChain *, 4> UpdatedPreds;
   assert(LoopChain.LoopPredecessors == 0);
   UpdatedPreds.insert(&LoopChain);
-  for (MachineBasicBlock *LoopBB : L.getBlocks()) {
+
+  for (MachineBasicBlock *LoopBB : LoopBlockSet) {
     BlockChain &Chain = *BlockToChain[LoopBB];
     if (!UpdatedPreds.insert(&Chain).second)
       continue;

Added: llvm/trunk/test/CodeGen/X86/code_placement_cold_loop_blocks.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/code_placement_cold_loop_blocks.ll?rev=251833&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/code_placement_cold_loop_blocks.ll (added)
+++ llvm/trunk/test/CodeGen/X86/code_placement_cold_loop_blocks.ll Mon Nov  2 15:24:00 2015
@@ -0,0 +1,122 @@
+; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux < %s | FileCheck %s -check-prefix=CHECK
+
+define void @foo() !prof !1 {
+; Test if a cold block in a loop will be placed at the end of the function
+; chain.
+;
+; CHECK-LABEL: foo:
+; CHECK: callq b
+; CHECK: callq c
+; CHECK: callq e
+; CHECK: callq f
+; CHECK: callq d
+
+entry:
+  br label %header
+
+header:
+  call void @b()
+  %call = call zeroext i1 @a()
+  br i1 %call, label %if.then, label %if.else, !prof !4
+
+if.then:
+  call void @c()
+  br label %if.end
+
+if.else:
+  call void @d()
+  br label %if.end
+
+if.end:
+  call void @e()
+  %call2 = call zeroext i1 @a()
+  br i1 %call2, label %header, label %end, !prof !5
+
+end:
+  call void @f()
+  ret void
+}
+
+define void @nested_loop_0() !prof !1 {
+; Test if a block that is cold in the inner loop but not cold in the outer loop
+; will merged to the outer loop chain.
+;
+; CHECK-LABEL: nested_loop_0:
+; CHECK: callq c
+; CHECK: callq d
+; CHECK: callq e
+; CHECK: callq b
+; CHECK: callq f
+
+entry:
+  br label %header
+
+header:
+  call void @b()
+  %call4 = call zeroext i1 @a()
+  br i1 %call4, label %header2, label %end
+
+header2:
+  call void @c()
+  %call = call zeroext i1 @a()
+  br i1 %call, label %if.then, label %if.else, !prof !2
+
+if.then:
+  call void @d()
+  %call3 = call zeroext i1 @a()
+  br i1 %call3, label %header2, label %header, !prof !3
+
+if.else:
+  call void @e()
+  %call2 = call zeroext i1 @a()
+  br i1 %call2, label %header2, label %header, !prof !3
+
+end:
+  call void @f()
+  ret void
+}
+
+define void @nested_loop_1() !prof !1 {
+; Test if a cold block in an inner loop will be placed at the end of the
+; function chain.
+;
+; CHECK-LABEL: nested_loop_1:
+; CHECK: callq b
+; CHECK: callq c
+; CHECK: callq e
+; CHECK: callq d
+
+entry:
+  br label %header
+
+header:
+  call void @b()
+  br label %header2
+
+header2:
+  call void @c()
+  %call = call zeroext i1 @a()
+  br i1 %call, label %end, label %if.else, !prof !4
+
+if.else:
+  call void @d()
+  %call2 = call zeroext i1 @a()
+  br i1 %call2, label %header2, label %header, !prof !5
+
+end:
+  call void @e()
+  ret void
+}
+
+declare zeroext i1 @a()
+declare void @b()
+declare void @c()
+declare void @d()
+declare void @e()
+declare void @f()
+
+!1 = !{!"function_entry_count", i64 1}
+!2 = !{!"branch_weights", i32 100, i32 1}
+!3 = !{!"branch_weights", i32 1, i32 10}
+!4 = !{!"branch_weights", i32 1000, i32 1}
+!5 = !{!"branch_weights", i32 100, i32 1}