[llvm] r312719 - [MachineCombiner] Update instruction depths incrementally for large BBs.

Florian Hahn via llvm-commits llvm-commits at lists.llvm.org
Thu Sep 7 05:49:40 PDT 2017


Author: fhahn
Date: Thu Sep  7 05:49:39 2017
New Revision: 312719

URL: http://llvm.org/viewvc/llvm-project?rev=312719&view=rev
Log:
[MachineCombiner] Update instruction depths incrementally for large BBs.

Summary:
For large basic blocks with lots of combinable instructions, the
MachineTraceMetrics computations in MachineCombiner can dominate the compile
time, as computing the trace information is quadratic in the number of
instructions in a BB and it's relevant successors/predecessors.

In most cases, knowing the instruction depth should be enough to make
combination decisions. As we already iterate over all instructions in a basic
block, the instruction depth can be computed incrementally. This reduces the
cost of machine-combine drastically in cases where lots of instructions
are combined. The major drawback is that AFAIK, computing the critical path
length cannot be done incrementally. Therefore we only compute
instruction depths incrementally, for basic blocks with more
instructions than inc_threshold. The -machine-combiner-inc-threshold
option can be used to set the threshold and allows for easier
experimenting and checking if using incremental updates for all basic
blocks has any impact on the performance.

Reviewers: sanjoy, Gerolf, MatzeB, efriedma, fhahn

Reviewed By: fhahn

Subscribers: kiranchandramohan, javed.absar, efriedma, llvm-commits

Differential Revision: https://reviews.llvm.org/D36619

Modified:
    llvm/trunk/include/llvm/CodeGen/MachineTraceMetrics.h
    llvm/trunk/lib/CodeGen/MachineCombiner.cpp
    llvm/trunk/lib/CodeGen/MachineTraceMetrics.cpp
    llvm/trunk/test/CodeGen/AArch64/machine-combiner.ll
    llvm/trunk/test/CodeGen/X86/machine-combiner.ll
    llvm/trunk/test/CodeGen/X86/mul-constant-result.ll

Modified: llvm/trunk/include/llvm/CodeGen/MachineTraceMetrics.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/CodeGen/MachineTraceMetrics.h?rev=312719&r1=312718&r2=312719&view=diff
==============================================================================
--- llvm/trunk/include/llvm/CodeGen/MachineTraceMetrics.h (original)
+++ llvm/trunk/include/llvm/CodeGen/MachineTraceMetrics.h Thu Sep  7 05:49:39 2017
@@ -366,6 +366,12 @@ public:
                      SparseSet<LiveRegUnit> &RegUnits);
     void updateDepth(const MachineBasicBlock *, const MachineInstr&,
                      SparseSet<LiveRegUnit> &RegUnits);
+
+    /// Updates the depth of the instructions from Start to End.
+    void updateDepths(MachineBasicBlock::iterator Start,
+                      MachineBasicBlock::iterator End,
+                      SparseSet<LiveRegUnit> &RegUnits);
+
   };
 
   /// Strategies for selecting traces.

Modified: llvm/trunk/lib/CodeGen/MachineCombiner.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/MachineCombiner.cpp?rev=312719&r1=312718&r2=312719&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/MachineCombiner.cpp (original)
+++ llvm/trunk/lib/CodeGen/MachineCombiner.cpp Thu Sep  7 05:49:39 2017
@@ -22,6 +22,7 @@
 #include "llvm/CodeGen/MachineTraceMetrics.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetInstrInfo.h"
@@ -34,6 +35,11 @@ using namespace llvm;
 
 STATISTIC(NumInstCombined, "Number of machineinst combined");
 
+static cl::opt<unsigned>
+inc_threshold("machine-combiner-inc-threshold", cl::Hidden,
+              cl::desc("Incremental depth computation will be used for basic "
+                       "blocks with more instructions."), cl::init(500));
+
 namespace {
 class MachineCombiner : public MachineFunctionPass {
   const TargetInstrInfo *TII;
@@ -73,7 +79,7 @@ private:
                           SmallVectorImpl<MachineInstr *> &InsInstrs,
                           SmallVectorImpl<MachineInstr *> &DelInstrs,
                           DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
-                          MachineCombinerPattern Pattern);
+                          MachineCombinerPattern Pattern, bool SlackIsAccurate);
   bool preservesResourceLen(MachineBasicBlock *MBB,
                             MachineTraceMetrics::Trace BlockTrace,
                             SmallVectorImpl<MachineInstr *> &InsInstrs,
@@ -247,7 +253,8 @@ bool MachineCombiner::improvesCriticalPa
     SmallVectorImpl<MachineInstr *> &InsInstrs,
     SmallVectorImpl<MachineInstr *> &DelInstrs,
     DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
-    MachineCombinerPattern Pattern) {
+    MachineCombinerPattern Pattern,
+    bool SlackIsAccurate) {
   assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
          "Missing machine model\n");
   // NewRoot is the last instruction in the \p InsInstrs vector.
@@ -258,7 +265,7 @@ bool MachineCombiner::improvesCriticalPa
   unsigned NewRootDepth = getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace);
   unsigned RootDepth = BlockTrace.getInstrCycles(*Root).Depth;
 
-  DEBUG(dbgs() << "DEPENDENCE DATA FOR " << Root << "\n";
+  DEBUG(dbgs() << "DEPENDENCE DATA FOR " << *Root << "\n";
         dbgs() << " NewRootDepth: " << NewRootDepth << "\n";
         dbgs() << " RootDepth: " << RootDepth << "\n");
 
@@ -281,17 +288,18 @@ bool MachineCombiner::improvesCriticalPa
     RootLatency += TSchedModel.computeInstrLatency(I);
 
   unsigned RootSlack = BlockTrace.getInstrSlack(*Root);
-
+  unsigned NewCycleCount = NewRootDepth + NewRootLatency;
+  unsigned OldCycleCount = RootDepth + RootLatency +
+                           (SlackIsAccurate ? RootSlack : 0);
   DEBUG(dbgs() << " NewRootLatency: " << NewRootLatency << "\n";
         dbgs() << " RootLatency: " << RootLatency << "\n";
-        dbgs() << " RootSlack: " << RootSlack << "\n";
+        dbgs() << " RootSlack: " << RootSlack << " SlackIsAccurate="
+               << SlackIsAccurate << "\n";
         dbgs() << " NewRootDepth + NewRootLatency = "
-               << NewRootDepth + NewRootLatency << "\n";
+               << NewCycleCount << "\n";
         dbgs() << " RootDepth + RootLatency + RootSlack = "
-               << RootDepth + RootLatency + RootSlack << "\n";);
-
-  unsigned NewCycleCount = NewRootDepth + NewRootLatency;
-  unsigned OldCycleCount = RootDepth + RootLatency + RootSlack;
+               << OldCycleCount << "\n";
+        );
 
   return NewCycleCount <= OldCycleCount;
 }
@@ -354,17 +362,44 @@ bool MachineCombiner::doSubstitute(unsig
   return false;
 }
 
+/// Inserts InsInstrs and deletes DelInstrs. Incrementally updates instruction
+/// depths if requested.
+///
+/// \param MBB basic block to insert instructions in
+/// \param MI current machine instruction
+/// \param InsInstrs new instructions to insert in \p MBB
+/// \param DelInstrs instruction to delete from \p MBB
+/// \param MinInstr is a pointer to the machine trace information
+/// \param RegUnits set of live registers, needed to compute instruction depths
+/// \param IncrementalUpdate if true, compute instruction depths incrementally,
+///                          otherwise invalidate the trace
 static void insertDeleteInstructions(MachineBasicBlock *MBB, MachineInstr &MI,
                                      SmallVector<MachineInstr *, 16> InsInstrs,
                                      SmallVector<MachineInstr *, 16> DelInstrs,
-                                     MachineTraceMetrics *Traces) {
+                                     MachineTraceMetrics::Ensemble *MinInstr,
+                                     SparseSet<LiveRegUnit> &RegUnits,
+                                     bool IncrementalUpdate) {
   for (auto *InstrPtr : InsInstrs)
     MBB->insert((MachineBasicBlock::iterator)&MI, InstrPtr);
-  for (auto *InstrPtr : DelInstrs)
+
+  for (auto *InstrPtr : DelInstrs) {
     InstrPtr->eraseFromParentAndMarkDBGValuesForRemoval();
-  ++NumInstCombined;
-  Traces->invalidate(MBB);
-  Traces->verifyAnalysis();
+    // Erase all LiveRegs defined by the removed instruction
+    for (auto I = RegUnits.begin(); I != RegUnits.end(); ) {
+      if (I->MI == InstrPtr)
+        I = RegUnits.erase(I);
+      else
+        I++;
+    }
+  }
+
+  if (IncrementalUpdate)
+    for (auto *InstrPtr : InsInstrs)
+      MinInstr->updateDepth(MBB, *InstrPtr, RegUnits);
+  else
+    MinInstr->invalidate(MBB);
+
+  NumInstCombined++;
 }
 
 /// Substitute a slow code sequence with a faster one by
@@ -378,9 +413,16 @@ bool MachineCombiner::combineInstruction
   bool Changed = false;
   DEBUG(dbgs() << "Combining MBB " << MBB->getName() << "\n");
 
+  bool IncrementalUpdate = false;
   auto BlockIter = MBB->begin();
+  auto LastUpdate = BlockIter;
   // Check if the block is in a loop.
   const MachineLoop *ML = MLI->getLoopFor(MBB);
+  if (!MinInstr)
+    MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount);
+
+  SparseSet<LiveRegUnit> RegUnits;
+  RegUnits.setUniverse(TRI->getNumRegUnits());
 
   while (BlockIter != MBB->end()) {
     auto &MI = *BlockIter++;
@@ -419,9 +461,6 @@ bool MachineCombiner::combineInstruction
       SmallVector<MachineInstr *, 16> InsInstrs;
       SmallVector<MachineInstr *, 16> DelInstrs;
       DenseMap<unsigned, unsigned> InstrIdxForVirtReg;
-      if (!MinInstr)
-        MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount);
-      Traces->verifyAnalysis();
       TII->genAlternativeCodeSequence(MI, P, InsInstrs, DelInstrs,
                                       InstrIdxForVirtReg);
       unsigned NewInstCount = InsInstrs.size();
@@ -436,23 +475,41 @@ bool MachineCombiner::combineInstruction
       if (ML && TII->isThroughputPattern(P))
         SubstituteAlways = true;
 
+      if (IncrementalUpdate) {
+        // Update depths since the last incremental update.
+        MinInstr->updateDepths(LastUpdate, std::next(BlockIter), RegUnits);
+        LastUpdate = BlockIter;
+      }
+
       // Substitute when we optimize for codesize and the new sequence has
       // fewer instructions OR
       // the new sequence neither lengthens the critical path nor increases
       // resource pressure.
       if (SubstituteAlways || doSubstitute(NewInstCount, OldInstCount)) {
-        insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, Traces);
+        insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, MinInstr,
+                                 RegUnits, IncrementalUpdate);
         // Eagerly stop after the first pattern fires.
         Changed = true;
         break;
       } else {
-        // Calculating the trace metrics may be expensive,
-        // so only do this when necessary.
+        // For big basic blocks, we only compute the full trace the first time
+        // we hit this. We do not invalidate the trace, but instead update the
+        // instruction depths incrementally.
+        // NOTE: Only the instruction depths up to MI are accurate. All other
+        // trace information is not updated.
         MachineTraceMetrics::Trace BlockTrace = MinInstr->getTrace(MBB);
+        Traces->verifyAnalysis();
         if (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs, DelInstrs,
-                                    InstrIdxForVirtReg, P) &&
+                                    InstrIdxForVirtReg, P,
+                                    !IncrementalUpdate) &&
             preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs)) {
-          insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, Traces);
+          if (MBB->size() > inc_threshold)
+            // Use incremental depth updates for basic blocks above treshold
+            IncrementalUpdate = true;
+
+          insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, MinInstr,
+                                   RegUnits, IncrementalUpdate);
+
           // Eagerly stop after the first pattern fires.
           Changed = true;
           break;
@@ -467,6 +524,8 @@ bool MachineCombiner::combineInstruction
     }
   }
 
+  if (Changed && IncrementalUpdate)
+    Traces->invalidate(MBB);
   return Changed;
 }
 

Modified: llvm/trunk/lib/CodeGen/MachineTraceMetrics.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/MachineTraceMetrics.cpp?rev=312719&r1=312718&r2=312719&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/MachineTraceMetrics.cpp (original)
+++ llvm/trunk/lib/CodeGen/MachineTraceMetrics.cpp Thu Sep  7 05:49:39 2017
@@ -823,6 +823,14 @@ updateDepth(const MachineBasicBlock *MBB
   updateDepth(BlockInfo[MBB->getNumber()], UseMI, RegUnits);
 }
 
+void MachineTraceMetrics::Ensemble::
+updateDepths(MachineBasicBlock::iterator Start,
+             MachineBasicBlock::iterator End,
+             SparseSet<LiveRegUnit> &RegUnits) {
+    for (; Start != End; Start++)
+      updateDepth(Start->getParent(), *Start, RegUnits);
+}
+
 /// Compute instruction depths for all instructions above or in MBB in its
 /// trace. This assumes that the trace through MBB has already been computed.
 void MachineTraceMetrics::Ensemble::

Modified: llvm/trunk/test/CodeGen/AArch64/machine-combiner.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/machine-combiner.ll?rev=312719&r1=312718&r2=312719&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/machine-combiner.ll (original)
+++ llvm/trunk/test/CodeGen/AArch64/machine-combiner.ll Thu Sep  7 05:49:39 2017
@@ -1,5 +1,10 @@
 ; RUN: llc -mtriple=aarch64-gnu-linux -mcpu=cortex-a57 -enable-unsafe-fp-math -disable-post-ra < %s | FileCheck %s
 
+; Incremental updates of the instruction depths should be enough for this test
+; case.
+; RUN: llc -mtriple=aarch64-gnu-linux -mcpu=cortex-a57 -enable-unsafe-fp-math \
+; RUN:     -disable-post-ra -machine-combiner-inc-threshold=0 < %s | FileCheck %s
+
 ; Verify that the first two adds are independent regardless of how the inputs are
 ; commuted. The destination registers are used as source registers for the third add.
 

Modified: llvm/trunk/test/CodeGen/X86/machine-combiner.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/machine-combiner.ll?rev=312719&r1=312718&r2=312719&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/machine-combiner.ll (original)
+++ llvm/trunk/test/CodeGen/X86/machine-combiner.ll Thu Sep  7 05:49:39 2017
@@ -1,6 +1,11 @@
 ; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=sse -enable-unsafe-fp-math < %s | FileCheck %s --check-prefix=SSE
 ; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx -enable-unsafe-fp-math < %s | FileCheck %s --check-prefix=AVX
 
+; Incremental updates of the instruction depths should be enough for this test
+; case.
+; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=sse -enable-unsafe-fp-math -machine-combiner-inc-threshold=0 < %s | FileCheck %s --check-prefix=SSE
+; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx -enable-unsafe-fp-math -machine-combiner-inc-threshold=0 < %s | FileCheck %s --check-prefix=AVX
+
 ; Verify that the first two adds are independent regardless of how the inputs are
 ; commuted. The destination registers are used as source registers for the third add.
 

Modified: llvm/trunk/test/CodeGen/X86/mul-constant-result.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/mul-constant-result.ll?rev=312719&r1=312718&r2=312719&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/mul-constant-result.ll (original)
+++ llvm/trunk/test/CodeGen/X86/mul-constant-result.ll Thu Sep  7 05:49:39 2017
@@ -1,6 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s --check-prefix=X86
-; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=haswell| FileCheck %s --check-prefix=X64-HSW
+
+; Incremental updates of the instruction depths should be enough for this test
+; case.
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=haswell -machine-combiner-inc-threshold=0| FileCheck %s --check-prefix=X64-HSW
 
 ; Function Attrs: norecurse nounwind readnone uwtable
 define i32 @mult(i32, i32) local_unnamed_addr #0 {




More information about the llvm-commits mailing list