[llvm] r323873 - [MachineCombiner] Add check for optimal pattern order.

Wed Jan 31 05:54:30 PST 2018

Author: fhahn
Date: Wed Jan 31 05:54:30 2018
New Revision: 323873

URL: http://llvm.org/viewvc/llvm-project?rev=323873&view=rev
Log:
[MachineCombiner] Add check for optimal pattern order.

In D41587, @mssimpso discovered that the order of some patterns for
AArch64 was sub-optimal. I thought a bit about how we could avoid that
case in the future. I do not think there is a need for evaluating all
patterns for now. But this patch adds an extra (expensive) check, that
evaluates the latencies of all patterns, and ensures that the latency
saved decreases for subsequent patterns.

This catches the sub-optimal order fixed in D41587, but I am not
entirely happy with the check, as it only applies to sub-optimal
patterns seen while building with EXPENSIVE_CHECKS on. It did not
discover any other sub-optimal pattern ordering.

Reviewers: Gerolf, spatel, mssimpso

Reviewed By: Gerolf, mssimpso

Differential Revision: https://reviews.llvm.org/D41766

Modified:
    llvm/trunk/lib/CodeGen/MachineCombiner.cpp
    llvm/trunk/test/CodeGen/AArch64/aarch64-combine-fmul-fsub.mir
    llvm/trunk/test/CodeGen/AArch64/machine-combiner.ll
    llvm/trunk/test/CodeGen/AArch64/machine-combiner.mir
    llvm/trunk/test/CodeGen/X86/machine-combiner-int-vec.ll
    llvm/trunk/test/CodeGen/X86/machine-combiner-int.ll
    llvm/trunk/test/CodeGen/X86/machine-combiner.ll

Modified: llvm/trunk/lib/CodeGen/MachineCombiner.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/MachineCombiner.cpp?rev=323873&r1=323872&r2=323873&view=diff
==============================================================================

--- llvm/trunk/lib/CodeGen/MachineCombiner.cpp (original)
+++ llvm/trunk/lib/CodeGen/MachineCombiner.cpp Wed Jan 31 05:54:30 2018
@@ -39,6 +39,20 @@ inc_threshold("machine-combiner-inc-thre
               cl::desc("Incremental depth computation will be used for basic "
                        "blocks with more instructions."), cl::init(500));
 
+#ifdef EXPENSIVE_CHECKS
+static cl::opt<bool> VerifyPatternOrder(
+    "machine-combiner-verify-pattern-order", cl::Hidden,
+    cl::desc(
+        "Verify that the generated patterns are ordered by increasing latency"),
+    cl::init(true));
+#else
+static cl::opt<bool> VerifyPatternOrder(
+    "machine-combiner-verify-pattern-order", cl::Hidden,
+    cl::desc(
+        "Verify that the generated patterns are ordered by increasing latency"),
+    cl::init(false));
+#endif
+
 namespace {
 class MachineCombiner : public MachineFunctionPass {
   const TargetInstrInfo *TII;
@@ -85,6 +99,14 @@ private:
                             SmallVectorImpl<MachineInstr *> &DelInstrs);
   void instr2instrSC(SmallVectorImpl<MachineInstr *> &Instrs,
                      SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC);
+  std::pair<unsigned, unsigned>
+  getLatenciesForInstrSequences(MachineInstr &MI,
+                                SmallVectorImpl<MachineInstr *> &InsInstrs,
+                                SmallVectorImpl<MachineInstr *> &DelInstrs,
+                                MachineTraceMetrics::Trace BlockTrace);
+
+  void verifyPatternOrder(MachineBasicBlock *MBB, MachineInstr &Root,
+                          SmallVector<MachineCombinerPattern, 16> &Patterns);
 };
 }
 
@@ -242,6 +264,29 @@ static CombinerObjective getCombinerObje
   }
 }
 
+/// Estimate the latency of the new and original instruction sequence by summing
+/// up the latencies of the inserted and deleted instructions. This assumes
+/// that the inserted and deleted instructions are dependent instruction chains,
+/// which might not hold in all cases.
+std::pair<unsigned, unsigned> MachineCombiner::getLatenciesForInstrSequences(
+    MachineInstr &MI, SmallVectorImpl<MachineInstr *> &InsInstrs,
+    SmallVectorImpl<MachineInstr *> &DelInstrs,
+    MachineTraceMetrics::Trace BlockTrace) {
+  assert(!InsInstrs.empty() && "Only support sequences that insert instrs.");
+  unsigned NewRootLatency = 0;
+  // NewRoot is the last instruction in the \p InsInstrs vector.
+  MachineInstr *NewRoot = InsInstrs.back();
+  for (unsigned i = 0; i < InsInstrs.size() - 1; i++)
+    NewRootLatency += TSchedModel.computeInstrLatency(InsInstrs[i]);
+  NewRootLatency += getLatency(&MI, NewRoot, BlockTrace);
+
+  unsigned RootLatency = 0;
+  for (auto I : DelInstrs)
+    RootLatency += TSchedModel.computeInstrLatency(I);
+
+  return {NewRootLatency, RootLatency};
+}
+
 /// The DAGCombine code sequence ends in MI (Machine Instruction) Root.
 /// The new code sequence ends in MI NewRoot. A necessary condition for the new
 /// sequence to replace the old sequence is that it cannot lengthen the critical
@@ -257,10 +302,6 @@ bool MachineCombiner::improvesCriticalPa
     bool SlackIsAccurate) {
   assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
          "Missing machine model\n");
-  // NewRoot is the last instruction in the \p InsInstrs vector.
-  unsigned NewRootIdx = InsInstrs.size() - 1;
-  MachineInstr *NewRoot = InsInstrs[NewRootIdx];
-
   // Get depth and latency of NewRoot and Root.
   unsigned NewRootDepth = getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace);
   unsigned RootDepth = BlockTrace.getInstrCycles(*Root).Depth;
@@ -282,17 +323,9 @@ bool MachineCombiner::improvesCriticalPa
   // even if the instruction depths (data dependency cycles) become worse.
 
   // Account for the latency of the inserted and deleted instructions by
-  // adding up their latencies. This assumes that the inserted and deleted
-  // instructions are dependent instruction chains, which might not hold
-  // in all cases.
-  unsigned NewRootLatency = 0;
-  for (unsigned i = 0; i < InsInstrs.size() - 1; i++)
-    NewRootLatency += TSchedModel.computeInstrLatency(InsInstrs[i]);
-  NewRootLatency += getLatency(Root, NewRoot, BlockTrace);
-
-  unsigned RootLatency = 0;
-  for (auto I : DelInstrs)
-    RootLatency += TSchedModel.computeInstrLatency(I);
+  unsigned NewRootLatency, RootLatency;
+  std::tie(NewRootLatency, RootLatency) =
+      getLatenciesForInstrSequences(*Root, InsInstrs, DelInstrs, BlockTrace);
 
   unsigned RootSlack = BlockTrace.getInstrSlack(*Root);
   unsigned NewCycleCount = NewRootDepth + NewRootLatency;
@@ -409,6 +442,34 @@ static void insertDeleteInstructions(Mac
   NumInstCombined++;
 }
 
+// Check that the difference between original and new latency is decreasing for
+// later patterns. This helps to discover sub-optimal pattern orderings.
+void MachineCombiner::verifyPatternOrder(
+    MachineBasicBlock *MBB, MachineInstr &Root,
+    SmallVector<MachineCombinerPattern, 16> &Patterns) {
+  long PrevLatencyDiff = std::numeric_limits<long>::max();
+  for (auto P : Patterns) {
+    SmallVector<MachineInstr *, 16> InsInstrs;
+    SmallVector<MachineInstr *, 16> DelInstrs;
+    DenseMap<unsigned, unsigned> InstrIdxForVirtReg;
+    TII->genAlternativeCodeSequence(Root, P, InsInstrs, DelInstrs,
+                                    InstrIdxForVirtReg);
+    // Found pattern, but did not generate alternative sequence.
+    // This can happen e.g. when an immediate could not be materialized
+    // in a single instruction.
+    if (InsInstrs.empty() || !TSchedModel.hasInstrSchedModelOrItineraries())
+      continue;
+
+    unsigned NewRootLatency, RootLatency;
+    std::tie(NewRootLatency, RootLatency) = getLatenciesForInstrSequences(
+        Root, InsInstrs, DelInstrs, MinInstr->getTrace(MBB));
+    long CurrentLatencyDiff = ((long)RootLatency) - ((long)NewRootLatency);
+    assert(CurrentLatencyDiff <= PrevLatencyDiff &&
+           "Current pattern is better than previous pattern.");
+    PrevLatencyDiff = CurrentLatencyDiff;
+  }
+}
+
 /// Substitute a slow code sequence with a faster one by
 /// evaluating instruction combining pattern.
 /// The prototype of such a pattern is MUl + ADD -> MADD. Performs instruction
@@ -459,11 +520,16 @@ bool MachineCombiner::combineInstruction
     // The algorithm does not try to evaluate all patterns and pick the best.
     // This is only an artificial restriction though. In practice there is
     // mostly one pattern, and getMachineCombinerPatterns() can order patterns
-    // based on an internal cost heuristic.
+    // based on an internal cost heuristic. If
+    // machine-combiner-verify-pattern-order is enabled, all patterns are
+    // checked to ensure later patterns do not provide better latency savings.
 
     if (!TII->getMachineCombinerPatterns(MI, Patterns))
       continue;
 
+    if (VerifyPatternOrder)
+      verifyPatternOrder(MBB, MI, Patterns);
+
     for (auto P : Patterns) {
       SmallVector<MachineInstr *, 16> InsInstrs;
       SmallVector<MachineInstr *, 16> DelInstrs;

Modified: llvm/trunk/test/CodeGen/AArch64/aarch64-combine-fmul-fsub.mir
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/aarch64-combine-fmul-fsub.mir?rev=323873&r1=323872&r2=323873&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/aarch64-combine-fmul-fsub.mir (original)
+++ llvm/trunk/test/CodeGen/AArch64/aarch64-combine-fmul-fsub.mir Wed Jan 31 05:54:30 2018
@@ -1,7 +1,7 @@
-# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=cortex-a57 -enable-unsafe-fp-math %s | FileCheck --check-prefixes=UNPROFITABLE,ALL %s
-# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=falkor -enable-unsafe-fp-math %s | FileCheck --check-prefixes=PROFITABLE,ALL %s
-# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=exynos-m1 -enable-unsafe-fp-math %s | FileCheck --check-prefixes=PROFITABLE,ALL %s
-# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=thunderx2t99 -enable-unsafe-fp-math %s | FileCheck --check-prefixes=PROFITABLE,ALL %s
+# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=cortex-a57 -enable-unsafe-fp-math -machine-combiner-verify-pattern-order=true %s | FileCheck --check-prefixes=UNPROFITABLE,ALL %s
+# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=falkor -enable-unsafe-fp-math %s -machine-combiner-verify-pattern-order=true | FileCheck --check-prefixes=PROFITABLE,ALL %s
+# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=exynos-m1 -enable-unsafe-fp-math -machine-combiner-verify-pattern-order=true %s | FileCheck --check-prefixes=PROFITABLE,ALL %s
+# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=thunderx2t99 -enable-unsafe-fp-math -machine-combiner-verify-pattern-order=true %s | FileCheck --check-prefixes=PROFITABLE,ALL %s
 #
 name:            f1_2s
 registers:
@@ -26,8 +26,8 @@ body:             |
 # UNPROFITABLE-NEXT: FSUBv2f32 killed %3, %2
 #
 # PROFITABLE-LABEL: name: f1_2s
-# PROFITABLE: %5:fpr64 = FNEGv2f32 %2
-# PROFITABLE-NEXT: FMLAv2f32 killed %5, %0, %1
+# PROFITABLE: [[R1:%[0-9]+]]:fpr64 = FNEGv2f32 %2
+# PROFITABLE-NEXT: FMLAv2f32 killed [[R1]], %0, %1
 ---
 name:            f1_4s
 registers:
@@ -52,8 +52,8 @@ body:             |
 # UNPROFITABLE-NEXT: FSUBv4f32 killed %3, %2
 #
 # PROFITABLE-LABEL: name: f1_4s
-# PROFITABLE: %5:fpr128 = FNEGv4f32 %2
-# PROFITABLE-NEXT: FMLAv4f32 killed %5, %0, %1
+# PROFITABLE: [[R1:%[0-9]+]]:fpr128 = FNEGv4f32 %2
+# PROFITABLE-NEXT: FMLAv4f32 killed [[R1]], %0, %1
 ---
 name:            f1_2d
 registers:
@@ -78,8 +78,8 @@ body:             |
 # UNPROFITABLE-NEXT: FSUBv2f64 killed %3, %2
 #
 # PROFITABLE-LABEL: name: f1_2d
-# PROFITABLE: %5:fpr128 = FNEGv2f64 %2
-# PROFITABLE-NEXT: FMLAv2f64 killed %5, %0, %1
+# PROFITABLE: [[R1:%[0-9]+]]:fpr128 = FNEGv2f64 %2
+# PROFITABLE-NEXT: FMLAv2f64 killed [[R1]], %0, %1
 ---
 name:            f1_both_fmul_2s
 registers:

Modified: llvm/trunk/test/CodeGen/AArch64/machine-combiner.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/machine-combiner.ll?rev=323873&r1=323872&r2=323873&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/machine-combiner.ll (original)
+++ llvm/trunk/test/CodeGen/AArch64/machine-combiner.ll Wed Jan 31 05:54:30 2018
@@ -3,7 +3,7 @@
 ; Incremental updates of the instruction depths should be enough for this test
 ; case.
 ; RUN: llc -mtriple=aarch64-gnu-linux -mcpu=cortex-a57 -enable-unsafe-fp-math \
-; RUN:     -disable-post-ra -machine-combiner-inc-threshold=0 < %s | FileCheck %s
+; RUN:     -disable-post-ra -machine-combiner-inc-threshold=0 -machine-combiner-verify-pattern-order=true < %s | FileCheck %s
 
 ; Verify that the first two adds are independent regardless of how the inputs are
 ; commuted. The destination registers are used as source registers for the third add.

Modified: llvm/trunk/test/CodeGen/AArch64/machine-combiner.mir
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/machine-combiner.mir?rev=323873&r1=323872&r2=323873&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/machine-combiner.mir (original)
+++ llvm/trunk/test/CodeGen/AArch64/machine-combiner.mir Wed Jan 31 05:54:30 2018
@@ -1,6 +1,6 @@
 # RUN: llc -mtriple=aarch64-none-linux-gnu -mcpu=cortex-a57 -enable-unsafe-fp-math \
 # RUN:     -run-pass machine-combiner -machine-combiner-inc-threshold=0 \
-# RUN:     -verify-machineinstrs  -o - %s | FileCheck %s
+# RUN:     -machine-combiner-verify-pattern-order=true -verify-machineinstrs  -o - %s | FileCheck %s
 ---
 # Test incremental depth updates succeed when triggered after the removal of
 # the first instruction in a basic block.

Modified: llvm/trunk/test/CodeGen/X86/machine-combiner-int-vec.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/machine-combiner-int-vec.ll?rev=323873&r1=323872&r2=323873&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/machine-combiner-int-vec.ll (original)
+++ llvm/trunk/test/CodeGen/X86/machine-combiner-int-vec.ll Wed Jan 31 05:54:30 2018
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=sse2 < %s | FileCheck %s --check-prefix=SSE
-; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx2 < %s | FileCheck %s --check-prefix=AVX
+; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=sse2 -machine-combiner-verify-pattern-order=true < %s | FileCheck %s --check-prefix=SSE
+; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx2 -machine-combiner-verify-pattern-order=true < %s | FileCheck %s --check-prefix=AVX
 
 ; Verify that 128-bit vector logical ops are reassociated.
 

Modified: llvm/trunk/test/CodeGen/X86/machine-combiner-int.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/machine-combiner-int.ll?rev=323873&r1=323872&r2=323873&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/machine-combiner-int.ll (original)
+++ llvm/trunk/test/CodeGen/X86/machine-combiner-int.ll Wed Jan 31 05:54:30 2018
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -stop-after machine-combiner -o - | FileCheck %s --check-prefix=DEAD
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -machine-combiner-verify-pattern-order=true | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -stop-after machine-combiner -machine-combiner-verify-pattern-order=true -o - | FileCheck %s --check-prefix=DEAD
 
 ; Verify that integer multiplies are reassociated. The first multiply in 
 ; each test should be independent of the result of the preceding add (lea).

Modified: llvm/trunk/test/CodeGen/X86/machine-combiner.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/machine-combiner.ll?rev=323873&r1=323872&r2=323873&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/machine-combiner.ll (original)
+++ llvm/trunk/test/CodeGen/X86/machine-combiner.ll Wed Jan 31 05:54:30 2018
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=sse -enable-unsafe-fp-math < %s | FileCheck %s --check-prefix=SSE
-; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx -enable-unsafe-fp-math < %s | FileCheck %s --check-prefix=AVX
+; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=sse -enable-unsafe-fp-math -machine-combiner-verify-pattern-order=true < %s | FileCheck %s --check-prefix=SSE
+; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx -enable-unsafe-fp-math -machine-combiner-verify-pattern-order=true < %s | FileCheck %s --check-prefix=AVX
 
 ; Incremental updates of the instruction depths should be enough for this test
 ; case.