[PATCH][X86] Teach the backend how to simplify/canonicalize dag nodes introduced during type legalization.

Thu May 29 11:51:59 PDT 2014

Hi,

This patch teaches the backend how to simplify/canonicalize dag node
sequences normally introduced by the backend when promoting dag nodes
with an illegal vector types.

Example:

define double @foo(double %A, double %B) {
  %1 = bitcast double %A to <2 x i32>
  %2 = bitcast double %B to <2 x i32>
  %add = add <2 x i32> %1, %2
  %3 = bitcast <2 x i32> %add to double
  ret double %3
}

All the bitcasts in function @foo are promoted to type MVT::v2i64,
since type MVT::v2i32 is not a legal type. For the same reason, the
integer result of the vector add node also promoted.

Type promotion might introduce new build_vector nodes (which are then
combined into shuffles) and bitcast operations. This is what happens
for example with function 'foo' that is compiled to the following
assembly sequence (using -mcpu=corei7):
  pmovzxdq  %xmm0, %xmm0  # promotion from <2 x i32> to <2 x i64>
  pmovzxdq  %xmm1, %xmm1  # promotion from <2 x i32> to <2 x i64>
  paddq %xmm0, %xmm1         # promoted to a legal add of type <2 x i64>
  pshufd $8, %xmm1, %xmm0  # xmm0 = xmm1[0,2,u,u]

Ideally, the backend should be able to understand that the code
sequence above can be simplified into a single instruction:
  paddd %xmm0, %xmm1

This patch adds two new combine rules:
1)
  fold (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
       (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)

2)
  fold (BINOP (shuffle (A, Undef, <Mask>)), (shuffle (B, Undef, <Mask>))) ->
       (shuffle (BINOP A, B), Undef, <Mask>).

The goal is to simplify the dag node sequence when dealing with 64-bit
vector types.

Both rules are only triggered on the type-legalized DAG.
In particular, rule 1. is a target specific combine rule that attempts
to sink a bitconvert into the operands of a binary operation.
Rule 2. is a target independet rule that attempts to move a shuffle
immediately after a binary operation.

With this patch, all the functions from test 'combine-64bit-vbinop.ll'
(a new test) are now strongly simplified.
In the case of function @foo (from the example), the backend now
correctly generates a single 'paddd' instruction.

Please let me know if ok to submit.

Thanks,
Andrea Di Biagio
-------------- next part --------------
Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================

--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp	(revision 209833)
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp	(working copy)
@@ -10740,6 +10740,28 @@
       return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), LHS.getValueType(), Ops);
   }
 
+
+  // Type legalization might introduce new shuffles in the DAG.
+  // Fold (VBinOp (shuffle (A, Undef, Mask)), (shuffle (B, Undef, Mask)))
+  //   -> (shuffle (VBinOp (A, B)), Undef, Mask).
+  if (LegalTypes && isa<ShuffleVectorSDNode>(LHS) &&
+      isa<ShuffleVectorSDNode>(RHS) && LHS.hasOneUse() && RHS.hasOneUse() &&
+      LHS.getOperand(1).getOpcode() == ISD::UNDEF &&
+      RHS.getOperand(1).getOpcode() == ISD::UNDEF) {
+    ShuffleVectorSDNode *SVN0 = cast<ShuffleVectorSDNode>(LHS);
+    ShuffleVectorSDNode *SVN1 = cast<ShuffleVectorSDNode>(RHS);
+
+    if (SVN0->getMask().equals(SVN1->getMask())) {
+      EVT VT = N->getValueType(0);
+      SDValue UndefVector = LHS.getOperand(1);
+      SDValue NewBinOp = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
+                                     LHS.getOperand(0), RHS.getOperand(0));
+      AddUsersToWorkList(N);
+      return DAG.getVectorShuffle(VT, SDLoc(N), NewBinOp, UndefVector,
+                                  &SVN0->getMask()[0]);
+    }
+  }
+
   return SDValue();
 }
 
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp	(revision 209833)
+++ lib/Target/X86/X86ISelLowering.cpp	(working copy)
@@ -17478,6 +17478,8 @@
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const X86Subtarget *Subtarget) {
   SDLoc dl(N);
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
   EVT VT = N->getValueType(0);
 
   // Don't create instructions with illegal types after legalize types has run.
@@ -17490,6 +17492,62 @@
       N->getOpcode() == ISD::VECTOR_SHUFFLE)
     return PerformShuffleCombine256(N, DAG, DCI, Subtarget);
 
+  // During Type Legalization, when promoting illegal vector types,
+  // the backend might introduce new shuffle dag nodes and bitcasts.
+  //
+  // This code performs the following transformation:
+  // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
+  //       (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
+  //
+  // 
+  // We do this only if both the bitcast and the BINOP dag nodes have
+  // one use. Also, perform this transformation only if the new binary
+  // operation is legal. This is to avoid introducing dag nodes that
+  // potentially need to be further expanded (or custom lowered) into a
+  // less optimal sequence of dag nodes.
+  if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
+      N1.getOpcode() == ISD::UNDEF && N0.hasOneUse() &&
+      N0.getOpcode() == ISD::BITCAST) {
+    SDValue BC0 = N0.getOperand(0);
+    EVT SVT = BC0.getValueType();
+    unsigned Opcode = BC0.getOpcode();
+    unsigned NumElts = VT.getVectorNumElements();
+    bool IsValidCandidate = false;
+    
+    if (BC0.hasOneUse() && SVT.isVector() &&
+        SVT.getVectorNumElements() * 2 == NumElts &&
+        TLI.isOperationLegal(Opcode, VT)) {
+      bool CanFold = false;
+      switch (Opcode) {
+      default : break;
+      case ISD::ADD :
+      case ISD::FADD :
+      case ISD::SUB :
+      case ISD::FSUB :
+      case ISD::MUL :
+      case ISD::FMUL :
+      case ISD::AND :
+      case ISD::OR :
+      case ISD::XOR :
+        CanFold = true;
+      }
+
+      unsigned SVTNumElts = SVT.getVectorNumElements();
+      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+      for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
+        CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
+      for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
+        CanFold = SVOp->getMaskElt(i) < 0;
+
+      if (CanFold) {
+        SDValue BC00 = DAG.getNode(ISD::BITCAST, dl, VT, BC0.getOperand(0));
+        SDValue BC01 = DAG.getNode(ISD::BITCAST, dl, VT, BC0.getOperand(1));
+        SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
+        return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, &SVOp->getMask()[0]);
+      }
+    }
+  }
+
   // Only handle 128 wide vector from here on.
   if (!VT.is128BitVector())
     return SDValue();
Index: test/CodeGen/X86/lower-bitcast.ll
===================================================================
--- test/CodeGen/X86/lower-bitcast.ll	(revision 209833)
+++ test/CodeGen/X86/lower-bitcast.ll	(working copy)
@@ -14,7 +14,7 @@
 ; CHECK-LABEL: test1
 ; CHECK-NOT: movsd
 ; CHECK: pshufd
-; CHECK-NEXT: paddq
+; CHECK-NEXT: paddd
 ; CHECK-NEXT: pshufd
 ; CHECK-NEXT: ret
 
@@ -26,16 +26,9 @@
   %3 = bitcast <2 x i32> %add to double
   ret double %3
 }
-; FIXME: Ideally we should be able to fold the entire body of @test2 into a
-; single 'paddd %xmm1, %xmm0' instruction. At the moment we produce the
-; sequence pshufd+pshufd+paddq+pshufd.
-
 ; CHECK-LABEL: test2
 ; CHECK-NOT: movsd
-; CHECK: pshufd
-; CHECK-NEXT: pshufd
-; CHECK-NEXT: paddq
-; CHECK-NEXT: pshufd
+; CHECK: paddd
 ; CHECK-NEXT: ret
 
 
@@ -91,7 +84,7 @@
 ; CHECK-LABEL: test6
 ; CHECK-NOT: movsd
 ; CHECK: punpcklwd
-; CHECK-NEXT: paddd
+; CHECK-NEXT: paddw
 ; CHECK-NEXT: pshufb
 ; CHECK-NEXT: ret
 
@@ -103,16 +96,10 @@
   %3 = bitcast <4 x i16> %add to double
   ret double %3
 }
-; FIXME: Ideally we should be able to fold the entire body of @test7 into a
-; single 'paddw %xmm1, %xmm0' instruction. At the moment we produce the
-; sequence pshufd+pshufd+paddd+pshufd.
-
 ; CHECK-LABEL: test7
 ; CHECK-NOT: movsd
-; CHECK: punpcklwd
-; CHECK-NEXT: punpcklwd
-; CHECK-NEXT: paddd
-; CHECK-NEXT: pshufb
+; CHECK-NOT: punpcklwd
+; CHECK: paddw
 ; CHECK-NEXT: ret
 
 
@@ -129,7 +116,7 @@
 ; CHECK-LABEL: test8
 ; CHECK-NOT: movsd
 ; CHECK: punpcklbw
-; CHECK-NEXT: paddw
+; CHECK-NEXT: paddb
 ; CHECK-NEXT: pshufb
 ; CHECK-NEXT: ret
 
@@ -141,15 +128,9 @@
   %3 = bitcast <8 x i8> %add to double
   ret double %3
 }
-; FIXME: Ideally we should be able to fold the entire body of @test9 into a
-; single 'paddb %xmm1, %xmm0' instruction. At the moment we produce the
-; sequence pshufd+pshufd+paddw+pshufd.
-
 ; CHECK-LABEL: test9
 ; CHECK-NOT: movsd
-; CHECK: punpcklbw
-; CHECK-NEXT: punpcklbw
-; CHECK-NEXT: paddw
-; CHECK-NEXT: pshufb
+; CHECK-NOT: punpcklbw
+; CHECK: paddb
 ; CHECK-NEXT: ret
 
Index: test/CodeGen/X86/combine-64bit-vec-binop.ll
===================================================================
--- test/CodeGen/X86/combine-64bit-vec-binop.ll	(revision 0)
+++ test/CodeGen/X86/combine-64bit-vec-binop.ll	(working copy)
@@ -0,0 +1,273 @@
+; RUN: llc < %s -march=x86-64 -mcpu=corei7 -mtriple=x86_64-unknown-linux-gnu | FileCheck %s -check-prefix=CHECK -check-prefix=SSE41
+; RUN: llc < %s -march=x86-64 -mcpu=corei7-avx -mtriple=x86_64-unknown-linux-gnu | FileCheck %s -check-prefix=CHECK -check-prefix=AVX
+
+
+define double @test1_add(double %A, double %B) {
+  %1 = bitcast double %A to <2 x i32>
+  %2 = bitcast double %B to <2 x i32>
+  %add = add <2 x i32> %1, %2
+  %3 = bitcast <2 x i32> %add to double
+  ret double %3
+}
+; CHECK-LABEL: test1_add
+; SSE41: paddd
+; AVX: vpaddd
+; CHECK-NEXT: ret
+
+
+define double @test2_add(double %A, double %B) {
+  %1 = bitcast double %A to <4 x i16>
+  %2 = bitcast double %B to <4 x i16>
+  %add = add <4 x i16> %1, %2
+  %3 = bitcast <4 x i16> %add to double
+  ret double %3
+}
+; CHECK-LABEL: test2_add
+; SSE41: paddw
+; AVX: vpaddw
+; CHECK-NEXT: ret
+
+define double @test3_add(double %A, double %B) {
+  %1 = bitcast double %A to <8 x i8>
+  %2 = bitcast double %B to <8 x i8>
+  %add = add <8 x i8> %1, %2
+  %3 = bitcast <8 x i8> %add to double
+  ret double %3
+}
+; CHECK-LABEL: test3_add
+; SSE41: paddb
+; AVX: vpaddb
+; CHECK-NEXT: ret
+
+
+define double @test1_sub(double %A, double %B) {
+  %1 = bitcast double %A to <2 x i32>
+  %2 = bitcast double %B to <2 x i32>
+  %sub = sub <2 x i32> %1, %2
+  %3 = bitcast <2 x i32> %sub to double
+  ret double %3
+}
+; CHECK-LABEL: test1_sub
+; SSE41: psubd
+; AVX: vpsubd
+; CHECK-NEXT: ret
+
+
+define double @test2_sub(double %A, double %B) {
+  %1 = bitcast double %A to <4 x i16>
+  %2 = bitcast double %B to <4 x i16>
+  %sub = sub <4 x i16> %1, %2
+  %3 = bitcast <4 x i16> %sub to double
+  ret double %3
+}
+; CHECK-LABEL: test2_sub
+; SSE41: psubw
+; AVX: vpsubw
+; CHECK-NEXT: ret
+
+
+define double @test3_sub(double %A, double %B) {
+  %1 = bitcast double %A to <8 x i8>
+  %2 = bitcast double %B to <8 x i8>
+  %sub = sub <8 x i8> %1, %2
+  %3 = bitcast <8 x i8> %sub to double
+  ret double %3
+}
+; CHECK-LABEL: test3_sub
+; SSE41: psubb
+; AVX: vpsubb
+; CHECK-NEXT: ret
+
+
+define double @test1_mul(double %A, double %B) {
+  %1 = bitcast double %A to <2 x i32>
+  %2 = bitcast double %B to <2 x i32>
+  %mul = mul <2 x i32> %1, %2
+  %3 = bitcast <2 x i32> %mul to double
+  ret double %3
+}
+; CHECK-LABEL: test1_mul
+; SSE41: pmulld
+; AVX: vpmulld
+; CHECK-NEXT: ret
+
+
+define double @test2_mul(double %A, double %B) {
+  %1 = bitcast double %A to <4 x i16>
+  %2 = bitcast double %B to <4 x i16>
+  %mul = mul <4 x i16> %1, %2
+  %3 = bitcast <4 x i16> %mul to double
+  ret double %3
+}
+; CHECK-LABEL: test2_mul
+; SSE41: pmullw
+; AVX: vpmullw
+; CHECK-NEXT: ret
+
+; There is no legal ISD::MUL with type MVT::v8i16.
+define double @test3_mul(double %A, double %B) {
+  %1 = bitcast double %A to <8 x i8>
+  %2 = bitcast double %B to <8 x i8>
+  %mul = mul <8 x i8> %1, %2
+  %3 = bitcast <8 x i8> %mul to double
+  ret double %3
+}
+; CHECK-LABEL: test3_mul
+; CHECK: pmullw
+; CHECK-NEXT: pshufb
+; CHECK-NEXT: ret
+
+
+define double @test1_and(double %A, double %B) {
+  %1 = bitcast double %A to <2 x i32>
+  %2 = bitcast double %B to <2 x i32>
+  %and = and <2 x i32> %1, %2
+  %3 = bitcast <2 x i32> %and to double
+  ret double %3
+}
+; CHECK-LABEL: test1_and
+; SSE41: andps
+; AVX: vandps
+; CHECK-NEXT: ret
+
+
+define double @test2_and(double %A, double %B) {
+  %1 = bitcast double %A to <4 x i16>
+  %2 = bitcast double %B to <4 x i16>
+  %and = and <4 x i16> %1, %2
+  %3 = bitcast <4 x i16> %and to double
+  ret double %3
+}
+; CHECK-LABEL: test2_and
+; SSE41: andps
+; AVX: vandps
+; CHECK-NEXT: ret
+
+
+define double @test3_and(double %A, double %B) {
+  %1 = bitcast double %A to <8 x i8>
+  %2 = bitcast double %B to <8 x i8>
+  %and = and <8 x i8> %1, %2
+  %3 = bitcast <8 x i8> %and to double
+  ret double %3
+}
+; CHECK-LABEL: test3_and
+; SSE41: andps
+; AVX: vandps
+; CHECK-NEXT: ret
+
+
+define double @test1_or(double %A, double %B) {
+  %1 = bitcast double %A to <2 x i32>
+  %2 = bitcast double %B to <2 x i32>
+  %or = or <2 x i32> %1, %2
+  %3 = bitcast <2 x i32> %or to double
+  ret double %3
+}
+; CHECK-LABEL: test1_or
+; SSE41: orps
+; AVX: vorps
+; CHECK-NEXT: ret
+
+
+define double @test2_or(double %A, double %B) {
+  %1 = bitcast double %A to <4 x i16>
+  %2 = bitcast double %B to <4 x i16>
+  %or = or <4 x i16> %1, %2
+  %3 = bitcast <4 x i16> %or to double
+  ret double %3
+}
+; CHECK-LABEL: test2_or
+; SSE41: orps
+; AVX: vorps
+; CHECK-NEXT: ret
+
+
+define double @test3_or(double %A, double %B) {
+  %1 = bitcast double %A to <8 x i8>
+  %2 = bitcast double %B to <8 x i8>
+  %or = or <8 x i8> %1, %2
+  %3 = bitcast <8 x i8> %or to double
+  ret double %3
+}
+; CHECK-LABEL: test3_or
+; SSE41: orps
+; AVX: vorps
+; CHECK-NEXT: ret
+
+
+define double @test1_xor(double %A, double %B) {
+  %1 = bitcast double %A to <2 x i32>
+  %2 = bitcast double %B to <2 x i32>
+  %xor = xor <2 x i32> %1, %2
+  %3 = bitcast <2 x i32> %xor to double
+  ret double %3
+}
+; CHECK-LABEL: test1_xor
+; SSE41: xorps
+; AVX: vxorps
+; CHECK-NEXT: ret
+
+
+define double @test2_xor(double %A, double %B) {
+  %1 = bitcast double %A to <4 x i16>
+  %2 = bitcast double %B to <4 x i16>
+  %xor = xor <4 x i16> %1, %2
+  %3 = bitcast <4 x i16> %xor to double
+  ret double %3
+}
+; CHECK-LABEL: test2_xor
+; SSE41: xorps
+; AVX: vxorps
+; CHECK-NEXT: ret
+
+
+define double @test3_xor(double %A, double %B) {
+  %1 = bitcast double %A to <8 x i8>
+  %2 = bitcast double %B to <8 x i8>
+  %xor = xor <8 x i8> %1, %2
+  %3 = bitcast <8 x i8> %xor to double
+  ret double %3
+}
+; CHECK-LABEL: test3_xor
+; SSE41: xorps
+; AVX: vxorps
+; CHECK-NEXT: ret
+
+
+define double @test_fadd(double %A, double %B) {
+  %1 = bitcast double %A to <2 x float>
+  %2 = bitcast double %B to <2 x float>
+  %add = fadd <2 x float> %1, %2
+  %3 = bitcast <2 x float> %add to double
+  ret double %3
+}
+; CHECK-LABEL: test_fadd
+; SSE41: addps
+; AVX: vaddps
+; CHECK-NEXT: ret
+
+define double @test_fsub(double %A, double %B) {
+  %1 = bitcast double %A to <2 x float>
+  %2 = bitcast double %B to <2 x float>
+  %sub = fsub <2 x float> %1, %2
+  %3 = bitcast <2 x float> %sub to double
+  ret double %3
+}
+; CHECK-LABEL: test_fsub
+; SSE41: subps
+; AVX: vsubps
+; CHECK-NEXT: ret
+
+define double @test_fmul(double %A, double %B) {
+  %1 = bitcast double %A to <2 x float>
+  %2 = bitcast double %B to <2 x float>
+  %mul = fmul <2 x float> %1, %2
+  %3 = bitcast <2 x float> %mul to double
+  ret double %3
+}
+; CHECK-LABEL: test_fmul
+; SSE41: mulps
+; AVX: vmulps
+; CHECK-NEXT: ret
+