[llvm] r241515 - [x86] extend machine combiner reassociation optimization to SSE scalar adds

Sanjay Patel spatel at rotateright.com
Mon Jul 6 15:35:29 PDT 2015


Author: spatel
Date: Mon Jul  6 17:35:29 2015
New Revision: 241515

URL: http://llvm.org/viewvc/llvm-project?rev=241515&view=rev
Log:
[x86] extend machine combiner reassociation optimization to SSE scalar adds

Extend the reassociation optimization of http://reviews.llvm.org/rL240361 (D10460)
to SSE scalar single-precision FP adds in addition to AVX scalar single-precision FP adds.

With the 'switch' in place, we can trivially add other opcodes and test cases in
future patches.

Differential Revision: http://reviews.llvm.org/D10975


Modified:
    llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
    llvm/trunk/test/CodeGen/X86/machine-combiner.ll

Modified: llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrInfo.cpp?rev=241515&r1=241514&r2=241515&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrInfo.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86InstrInfo.cpp Mon Jul  6 17:35:29 2015
@@ -6417,18 +6417,30 @@ static bool hasReassocSibling(const Mach
   return false;
 }
 
+// TODO: There are many more machine instruction opcodes to match:
+//       1. Other data types (double, integer, vectors)
+//       2. Other math / logic operations (mul, and, or)
+static bool isAssociativeAndCommutative(unsigned Opcode) {
+  switch (Opcode) {
+  case X86::VADDSSrr:
+  case X86::ADDSSrr:
+    return true;
+  default:
+    return false;
+  }
+}
+
 /// Return true if the input instruction is part of a chain of dependent ops
 /// that are suitable for reassociation, otherwise return false.
 /// If the instruction's operands must be commuted to have a previous
 /// instruction of the same type define the first source operand, Commuted will
 /// be set to true.
-static bool isReassocCandidate(const MachineInstr &Inst, unsigned AssocOpcode,
-                               bool &Commuted) {
-  // 1. The instruction must have the correct type.
+static bool isReassocCandidate(const MachineInstr &Inst, bool &Commuted) {
+  // 1. The operation must be associative and commutative.
   // 2. The instruction must have virtual register definitions for its
   //    operands in the same basic block.
-  // 3. The instruction must have a reassociatable sibling.
-  if (Inst.getOpcode() == AssocOpcode &&
+  // 3. The instruction must have a reassociable sibling.
+  if (isAssociativeAndCommutative(Inst.getOpcode()) &&
       hasVirtualRegDefsInBasicBlock(Inst, Inst.getParent()) &&
       hasReassocSibling(Inst, Commuted))
     return true;
@@ -6455,14 +6467,8 @@ bool X86InstrInfo::getMachineCombinerPat
   //   B = A op X (Prev)
   //   C = B op Y (Root)
 
-  // TODO: There are many more associative instruction types to match:
-  //       1. Other forms of scalar FP add (non-AVX)
-  //       2. Other data types (double, integer, vectors)
-  //       3. Other math / logic operations (mul, and, or)
-  unsigned AssocOpcode = X86::VADDSSrr;
-
-  bool Commute = false;
-  if (isReassocCandidate(Root, AssocOpcode, Commute)) {
+  bool Commute;
+  if (isReassocCandidate(Root, Commute)) {
     // We found a sequence of instructions that may be suitable for a
     // reassociation of operands to increase ILP. Specify each commutation
     // possibility for the Prev instruction in the sequence and let the

Modified: llvm/trunk/test/CodeGen/X86/machine-combiner.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/machine-combiner.ll?rev=241515&r1=241514&r2=241515&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/machine-combiner.ll (original)
+++ llvm/trunk/test/CodeGen/X86/machine-combiner.ll Mon Jul  6 17:35:29 2015
@@ -1,15 +1,23 @@
-; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx -enable-unsafe-fp-math < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=sse -enable-unsafe-fp-math < %s | FileCheck %s --check-prefix=SSE
+; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx -enable-unsafe-fp-math < %s | FileCheck %s --check-prefix=AVX
 
 ; Verify that the first two adds are independent regardless of how the inputs are
 ; commuted. The destination registers are used as source registers for the third add.
 
 define float @reassociate_adds1(float %x0, float %x1, float %x2, float %x3) {
-; CHECK-LABEL: reassociate_adds1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vaddss %xmm3, %xmm2, %xmm1
-; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    retq
+; SSE-LABEL: reassociate_adds1:
+; SSE:       # BB#0:
+; SSE-NEXT:    addss %xmm1, %xmm0
+; SSE-NEXT:    addss %xmm3, %xmm2
+; SSE-NEXT:    addss %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: reassociate_adds1:
+; AVX:       # BB#0:
+; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vaddss %xmm3, %xmm2, %xmm1
+; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %t0 = fadd float %x0, %x1
   %t1 = fadd float %t0, %x2
   %t2 = fadd float %t1, %x3
@@ -17,12 +25,19 @@ define float @reassociate_adds1(float %x
 }
 
 define float @reassociate_adds2(float %x0, float %x1, float %x2, float %x3) {
-; CHECK-LABEL: reassociate_adds2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vaddss %xmm3, %xmm2, %xmm1
-; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    retq
+; SSE-LABEL: reassociate_adds2:
+; SSE:       # BB#0:
+; SSE-NEXT:    addss %xmm1, %xmm0
+; SSE-NEXT:    addss %xmm3, %xmm2
+; SSE-NEXT:    addss %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: reassociate_adds2:
+; AVX:       # BB#0:
+; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vaddss %xmm3, %xmm2, %xmm1
+; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %t0 = fadd float %x0, %x1
   %t1 = fadd float %x2, %t0
   %t2 = fadd float %t1, %x3
@@ -30,12 +45,19 @@ define float @reassociate_adds2(float %x
 }
 
 define float @reassociate_adds3(float %x0, float %x1, float %x2, float %x3) {
-; CHECK-LABEL: reassociate_adds3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vaddss %xmm3, %xmm2, %xmm1
-; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    retq
+; SSE-LABEL: reassociate_adds3:
+; SSE:       # BB#0:
+; SSE-NEXT:    addss %xmm1, %xmm0
+; SSE-NEXT:    addss %xmm3, %xmm2
+; SSE-NEXT:    addss %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: reassociate_adds3:
+; AVX:       # BB#0:
+; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vaddss %xmm3, %xmm2, %xmm1
+; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %t0 = fadd float %x0, %x1
   %t1 = fadd float %t0, %x2
   %t2 = fadd float %x3, %t1
@@ -43,12 +65,19 @@ define float @reassociate_adds3(float %x
 }
 
 define float @reassociate_adds4(float %x0, float %x1, float %x2, float %x3) {
-; CHECK-LABEL: reassociate_adds4:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vaddss %xmm3, %xmm2, %xmm1
-; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    retq
+; SSE-LABEL: reassociate_adds4:
+; SSE:       # BB#0:
+; SSE-NEXT:    addss %xmm1, %xmm0
+; SSE-NEXT:    addss %xmm3, %xmm2
+; SSE-NEXT:    addss %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: reassociate_adds4:
+; AVX:       # BB#0:
+; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vaddss %xmm3, %xmm2, %xmm1
+; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %t0 = fadd float %x0, %x1
   %t1 = fadd float %x2, %t0
   %t2 = fadd float %x3, %t1
@@ -59,16 +88,27 @@ define float @reassociate_adds4(float %x
 ; produced because that would cost more compile time.
 
 define float @reassociate_adds5(float %x0, float %x1, float %x2, float %x3, float %x4, float %x5, float %x6, float %x7) {
-; CHECK-LABEL: reassociate_adds5:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vaddss %xmm3, %xmm2, %xmm1
-; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vaddss %xmm5, %xmm4, %xmm1
-; CHECK-NEXT:    vaddss %xmm6, %xmm1, %xmm1
-; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vaddss %xmm7, %xmm0, %xmm0
-; CHECK-NEXT:    retq
+; SSE-LABEL: reassociate_adds5:
+; SSE:       # BB#0:
+; SSE-NEXT:    addss %xmm1, %xmm0
+; SSE-NEXT:    addss %xmm3, %xmm2
+; SSE-NEXT:    addss %xmm2, %xmm0
+; SSE-NEXT:    addss %xmm5, %xmm4
+; SSE-NEXT:    addss %xmm6, %xmm4
+; SSE-NEXT:    addss %xmm4, %xmm0
+; SSE-NEXT:    addss %xmm7, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: reassociate_adds5:
+; AVX:       # BB#0:
+; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vaddss %xmm3, %xmm2, %xmm1
+; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vaddss %xmm5, %xmm4, %xmm1
+; AVX-NEXT:    vaddss %xmm6, %xmm1, %xmm1
+; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vaddss %xmm7, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %t0 = fadd float %x0, %x1
   %t1 = fadd float %t0, %x2
   %t2 = fadd float %t1, %x3
@@ -83,14 +123,21 @@ define float @reassociate_adds5(float %x
 ; Also, we should reassociate such that the result of the high latency division
 ; is used by the final 'add' rather than reassociating the %x3 operand with the
 ; division. The latter reassociation would not improve anything.
- 
+
 define float @reassociate_adds6(float %x0, float %x1, float %x2, float %x3) {
-; CHECK-LABEL: reassociate_adds6:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vdivss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vaddss %xmm3, %xmm2, %xmm1
-; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    retq
+; SSE-LABEL: reassociate_adds6:
+; SSE:       # BB#0:
+; SSE-NEXT:    divss %xmm1, %xmm0
+; SSE-NEXT:    addss %xmm3, %xmm2
+; SSE-NEXT:    addss %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: reassociate_adds6:
+; AVX:       # BB#0:
+; AVX-NEXT:    vdivss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vaddss %xmm3, %xmm2, %xmm1
+; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %t0 = fdiv float %x0, %x1
   %t1 = fadd float %x2, %t0
   %t2 = fadd float %x3, %t1





More information about the llvm-commits mailing list