[llvm] r289399 - instr-combiner: sum up all latencies of the transformed instructions

Sebastian Pop via llvm-commits <llvm-commits@lists.llvm.org>
Sun Dec 11 11:39:33 PST 2016


Author: spop
Date: Sun Dec 11 13:39:32 2016
New Revision: 289399

URL: http://llvm.org/viewvc/llvm-project?rev=289399&view=rev
Log:
instr-combiner: sum up all latencies of the transformed instructions

We have found that -- when the selected subarchitecture has a scheduling model
and we are not optimizing for size -- the machine-instruction combiner uses an
overly simple algorithm to compute the cost of one of the two alternatives
[the code before and after a combine]: it charges the original sequence only
the latency of its root instruction, rather than the latencies of all the
instructions being replaced, and therefore it throws away profitable
combinations too often.

This fix can help any ISA whose targets are able to combine instructions and
for which at least one subarchitecture has a scheduling model.
As of now, this is only known to definitely affect AArch64 subarchitectures with
a scheduling model.
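
For context, the profitability check in improvesCriticalPathLen() boils down
to a cycle-count comparison along the trace's critical path. A simplified C++
sketch of the change (abbreviated; not the exact LLVM code):

  // Old cost of the replaced sequence: latency of the root instruction only.
  //   unsigned RootLatency = TSchedModel.computeInstrLatency(Root);

  // New cost: sum the latencies of every instruction the combine deletes,
  // since the new sequence replaces all of them, not just the root.
  unsigned RootLatency = 0;
  for (MachineInstr *MI : DelInstrs)
    RootLatency += TSchedModel.computeInstrLatency(MI);

  // Either way, the combine is kept when, roughly:
  //   NewRootDepth + NewRootLatency <= RootDepth + RootLatency + RootSlack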

Regression tested on AMD64/GNU-Linux; the new test case fails with an
unpatched compiler and passes with a patched one.

Patch by Abe Skolnik and Sebastian Pop.

Added:
    llvm/trunk/test/CodeGen/AArch64/machine-combiner-madd.ll
Modified:
    llvm/trunk/lib/CodeGen/MachineCombiner.cpp
    llvm/trunk/test/CodeGen/AArch64/arm64-fma-combines.ll
    llvm/trunk/test/CodeGen/AArch64/mul-lohi.ll

Modified: llvm/trunk/lib/CodeGen/MachineCombiner.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/MachineCombiner.cpp?rev=289399&r1=289398&r2=289399&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/MachineCombiner.cpp (original)
+++ llvm/trunk/lib/CodeGen/MachineCombiner.cpp Sun Dec 11 13:39:32 2016
@@ -71,6 +71,7 @@ private:
   improvesCriticalPathLen(MachineBasicBlock *MBB, MachineInstr *Root,
                           MachineTraceMetrics::Trace BlockTrace,
                           SmallVectorImpl<MachineInstr *> &InsInstrs,
+                          SmallVectorImpl<MachineInstr *> &DelInstrs,
                           DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                           MachineCombinerPattern Pattern);
   bool preservesResourceLen(MachineBasicBlock *MBB,
@@ -242,6 +243,7 @@ bool MachineCombiner::improvesCriticalPa
     MachineBasicBlock *MBB, MachineInstr *Root,
     MachineTraceMetrics::Trace BlockTrace,
     SmallVectorImpl<MachineInstr *> &InsInstrs,
+    SmallVectorImpl<MachineInstr *> &DelInstrs,
     DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
     MachineCombinerPattern Pattern) {
   assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
@@ -269,8 +271,13 @@ bool MachineCombiner::improvesCriticalPa
   // A more flexible cost calculation for the critical path includes the slack
   // of the original code sequence. This may allow the transform to proceed
   // even if the instruction depths (data dependency cycles) become worse.
+
   unsigned NewRootLatency = getLatency(Root, NewRoot, BlockTrace);
-  unsigned RootLatency = TSchedModel.computeInstrLatency(Root);
+  unsigned RootLatency = 0;
+
+  for (auto I : DelInstrs)
+    RootLatency += TSchedModel.computeInstrLatency(I);
+
   unsigned RootSlack = BlockTrace.getInstrSlack(*Root);
 
   DEBUG(dbgs() << " NewRootLatency: " << NewRootLatency << "\n";
@@ -421,7 +428,7 @@ bool MachineCombiner::combineInstruction
       // resource pressure.
       if (SubstituteAlways || doSubstitute(NewInstCount, OldInstCount) ||
           (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs,
-                                   InstrIdxForVirtReg, P) &&
+                                   DelInstrs, InstrIdxForVirtReg, P) &&
            preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs))) {
         for (auto *InstrPtr : InsInstrs)
           MBB->insert((MachineBasicBlock::iterator) &MI, InstrPtr);
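
To see the effect, suppose fmul has latency 5, fadd latency 4, and fmadd
latency 6 (hypothetical numbers, not taken from any real scheduling model),
and the combiner considers replacing a dependent fmul+fadd pair with a
single fmadd:

  old model: RootLatency = latency(fadd)                 = 4
             NewRootLatency = 6 >  4  -> combine rejected (at zero slack)
  new model: RootLatency = latency(fmul) + latency(fadd) = 9
             NewRootLatency = 6 <= 9  -> combine accepted

The arm64-fma-combines.ll update below reflects exactly this: the second
fmul is now expected to be fused into an fmadd.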

Modified: llvm/trunk/test/CodeGen/AArch64/arm64-fma-combines.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/arm64-fma-combines.ll?rev=289399&r1=289398&r2=289399&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/arm64-fma-combines.ll (original)
+++ llvm/trunk/test/CodeGen/AArch64/arm64-fma-combines.ll Sun Dec 11 13:39:32 2016
@@ -2,7 +2,7 @@
 define void @foo_2d(double* %src) {
 ; CHECK-LABEL: %entry
 ; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
-; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK: fmadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 entry:
   %arrayidx1 = getelementptr inbounds double, double* %src, i64 5
   %arrayidx2 = getelementptr inbounds double, double* %src, i64 11

Added: llvm/trunk/test/CodeGen/AArch64/machine-combiner-madd.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/machine-combiner-madd.ll?rev=289399&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/machine-combiner-madd.ll (added)
+++ llvm/trunk/test/CodeGen/AArch64/machine-combiner-madd.ll Sun Dec 11 13:39:32 2016
@@ -0,0 +1,40 @@
+; Test all AArch64 subarches with scheduling models.
+; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=cortex-a57 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=cortex-a72 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=cortex-a73 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=cyclone    < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=exynos-m1  < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=exynos-m2  < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=kryo       < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=vulcan     < %s | FileCheck %s
+
+; Make sure the machine combiner fuses the multiply and add that feed the
+; load's address computation into a single madd.
+
+; CHECK-LABEL: fun:
+; CHECK-NOT: mul
+; CHECK:     madd
+; CHECK-NOT: mul
+
+%class.D = type { %class.basic_string.base, [4 x i8] }
+%class.basic_string.base = type <{ i64, i64, i32 }>
+@a = global %class.D* zeroinitializer, align 8
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1)
+define internal void @fun() section ".text.startup" {
+entry:
+  %tmp.i.i = alloca %class.D, align 8
+  %y = bitcast %class.D* %tmp.i.i to i8*
+  br label %loop
+loop:
+  %conv11.i.i = phi i64 [ 0, %entry ], [ %inc.i.i, %loop ]
+  %i = phi i64 [ undef, %entry ], [ %inc.i.i, %loop ]
+  %x = load %class.D*, %class.D** getelementptr inbounds (%class.D*, %class.D** @a, i64 0), align 8
+  %arrayidx.i.i.i = getelementptr inbounds %class.D, %class.D* %x, i64 %conv11.i.i
+  %d = bitcast %class.D* %arrayidx.i.i.i to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull %y, i8* %d, i64 24, i32 8, i1 false)
+  %inc.i.i = add i64 %i, 1
+  %cmp.i.i = icmp slt i64 %inc.i.i, 0
+  br i1 %cmp.i.i, label %loop, label %exit
+exit:
+  ret void
+}
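
The loop above scales the induction variable by the 24-byte size of %class.D
to form the memcpy source address, which lowers to a multiply feeding an add
of the base pointer. Schematically, with illustrative register assignments:

  before:  mul  x8, x9, x10       ; index * sizeof(%class.D)
           add  x8, x11, x8       ; base + scaled index
  after:   madd x8, x9, x10, x11  ; fused by the machine combiner

The CHECK-NOT/CHECK/CHECK-NOT sandwich asserts that no plain mul survives on
any of the listed subtargets.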

Modified: llvm/trunk/test/CodeGen/AArch64/mul-lohi.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/mul-lohi.ll?rev=289399&r1=289398&r2=289399&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/mul-lohi.ll (original)
+++ llvm/trunk/test/CodeGen/AArch64/mul-lohi.ll Sun Dec 11 13:39:32 2016
@@ -3,16 +3,18 @@
 
 define i128 @test_128bitmul(i128 %lhs, i128 %rhs) {
 ; CHECK-LABEL: test_128bitmul:
-; CHECK-DAG: mul [[PART1:x[0-9]+]], x0, x3
-; CHECK-DAG: umulh [[CARRY:x[0-9]+]], x0, x2
-; CHECK: mul [[PART2:x[0-9]+]], x1, x2
-; CHECK: mul x0, x0, x2
+; CHECK:       umulh [[HI:x[0-9]+]], x0, x2
+; CHECK:       madd  [[TEMP1:x[0-9]+]], x0, x3, [[HI]]
+; CHECK-DAG:   madd  x1, x1, x2, [[TEMP1]]
+; CHECK-DAG:   mul   x0, x0, x2
+; CHECK-NEXT:  ret
 
 ; CHECK-BE-LABEL: test_128bitmul:
-; CHECK-BE-DAG: mul [[PART1:x[0-9]+]], x1, x2
-; CHECK-BE-DAG: umulh [[CARRY:x[0-9]+]], x1, x3
-; CHECK-BE: mul [[PART2:x[0-9]+]], x0, x3
-; CHECK-BE: mul x1, x1, x3
+; CHECK-BE:       umulh [[HI:x[0-9]+]], x1, x3
+; CHECK-BE:       madd  [[TEMP1:x[0-9]+]], x1, x2, [[HI]]
+; CHECK-BE-DAG:   madd  x0, x0, x3, [[TEMP1]]
+; CHECK-BE-DAG:   mul   x1, x1, x3
+; CHECK-BE-NEXT:  ret
 
   %prod = mul i128 %lhs, %rhs
   ret i128 %prod
@@ -25,8 +27,8 @@ define i128 @test_128bitmul_optsize(i128
 ; CHECK-LABEL: test_128bitmul_optsize:
 ; CHECK:       umulh [[HI:x[0-9]+]], x0, x2
 ; CHECK-NEXT:  madd  [[TEMP1:x[0-9]+]], x0, x3, [[HI]]
-; CHECK-NEXT:  madd  x1, x1, x2, [[TEMP1]]
-; CHECK-NEXT:  mul   x0, x0, x2
+; CHECK-DAG:   madd  x1, x1, x2, [[TEMP1]]
+; CHECK-DAG:   mul   x0, x0, x2
 ; CHECK-NEXT:  ret
 
   %prod = mul i128 %lhs, %rhs
@@ -37,8 +39,8 @@ define i128 @test_128bitmul_minsize(i128
 ; CHECK-LABEL: test_128bitmul_minsize:
 ; CHECK:       umulh [[HI:x[0-9]+]], x0, x2
 ; CHECK-NEXT:  madd  [[TEMP1:x[0-9]+]], x0, x3, [[HI]]
-; CHECK-NEXT:  madd  x1, x1, x2, [[TEMP1]]
-; CHECK-NEXT:  mul   x0, x0, x2
+; CHECK-DAG:   madd  x1, x1, x2, [[TEMP1]]
+; CHECK-DAG:   mul   x0, x0, x2
 ; CHECK-NEXT:  ret
 
   %prod = mul i128 %lhs, %rhs
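
For reference, the expected lowering of the 128-bit multiply decomposes as
(assuming the little-endian register assignment x1:x0 = lhs, x3:x2 = rhs):

  lo(lhs * rhs) = lo(x0 * x2)                        -> mul
  hi(lhs * rhs) = umulh(x0, x2) + x0*x3 + x1*x2      -> umulh + two madd

The final madd and the mul are independent of each other, so the updated
checks use CHECK-DAG to tolerate either schedule across subtargets.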



