[llvm-branch-commits] [llvm] 770d1e0 - [X86][SSE] isHorizontalBinOp - reuse any existing horizontal ops.
Simon Pilgrim via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Mon Jan 18 02:19:05 PST 2021
Author: Simon Pilgrim
Date: 2021-01-18T10:14:45Z
New Revision: 770d1e0a8828010a7c95de4596e24d54ed2527c3
URL: https://github.com/llvm/llvm-project/commit/770d1e0a8828010a7c95de4596e24d54ed2527c3
DIFF: https://github.com/llvm/llvm-project/commit/770d1e0a8828010a7c95de4596e24d54ed2527c3.diff
LOG: [X86][SSE] isHorizontalBinOp - reuse any existing horizontal ops.
If similar horizontal ops using the same arguments already exist, match them as well, even on targets with slow horizontal ops.
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/haddsub-shuf.ll
llvm/test/CodeGen/X86/haddsub-undef.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 6bee21747bce..78a5d4a6dfbf 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -45628,8 +45628,9 @@ static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
/// A horizontal-op B, for some already available A and B, and if so then LHS is
/// set to A, RHS to B, and the routine returns 'true'.
-static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,
- const X86Subtarget &Subtarget, bool IsCommutative,
+static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS,
+ SelectionDAG &DAG, const X86Subtarget &Subtarget,
+ bool IsCommutative,
SmallVectorImpl<int> &PostShuffleMask) {
// If either operand is undef, bail out. The binop should be simplified.
if (LHS.isUndef() || RHS.isUndef())
@@ -45790,9 +45791,20 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,
isMultiLaneShuffleMask(128, VT.getScalarSizeInBits(), PostShuffleMask))
return false;
+ // If the source nodes are already used in HorizOps then always accept this.
+ // Shuffle folding should merge these back together.
+ bool FoundHorizLHS = llvm::any_of(NewLHS->uses(), [&](SDNode *User) {
+ return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
+ });
+ bool FoundHorizRHS = llvm::any_of(NewRHS->uses(), [&](SDNode *User) {
+ return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
+ });
+ bool ForceHorizOp = FoundHorizLHS && FoundHorizRHS;
+
// Assume a SingleSource HOP if we only shuffle one input and don't need to
// shuffle the result.
- if (!shouldUseHorizontalOp(NewLHS == NewRHS &&
+ if (!ForceHorizOp &&
+ !shouldUseHorizontalOp(NewLHS == NewRHS &&
(NumShuffles < 2 || !IsIdentityPostShuffle),
DAG, Subtarget))
return false;
@@ -45816,7 +45828,8 @@ static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
SmallVector<int, 8> PostShuffleMask;
if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
(Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
- isHorizontalBinOp(LHS, RHS, DAG, Subtarget, IsFadd, PostShuffleMask)) {
+ isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsFadd,
+ PostShuffleMask)) {
SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
if (!PostShuffleMask.empty())
HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
@@ -48931,17 +48944,18 @@ static SDValue combineAddOrSubToHADDorHSUB(SDNode *N, SelectionDAG &DAG,
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
bool IsAdd = N->getOpcode() == ISD::ADD;
+ auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB;
assert((IsAdd || N->getOpcode() == ISD::SUB) && "Wrong opcode");
SmallVector<int, 8> PostShuffleMask;
if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 ||
VT == MVT::v8i32) &&
Subtarget.hasSSSE3() &&
- isHorizontalBinOp(Op0, Op1, DAG, Subtarget, IsAdd, PostShuffleMask)) {
- auto HOpBuilder = [IsAdd](SelectionDAG &DAG, const SDLoc &DL,
- ArrayRef<SDValue> Ops) {
- return DAG.getNode(IsAdd ? X86ISD::HADD : X86ISD::HSUB, DL,
- Ops[0].getValueType(), Ops);
+ isHorizontalBinOp(HorizOpcode, Op0, Op1, DAG, Subtarget, IsAdd,
+ PostShuffleMask)) {
+ auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
+ ArrayRef<SDValue> Ops) {
+ return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
};
SDValue HorizBinOp =
SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1}, HOpBuilder);
diff --git a/llvm/test/CodeGen/X86/haddsub-shuf.ll b/llvm/test/CodeGen/X86/haddsub-shuf.ll
index 37eedcd54441..282ef37f6e52 100644
--- a/llvm/test/CodeGen/X86/haddsub-shuf.ll
+++ b/llvm/test/CodeGen/X86/haddsub-shuf.ll
@@ -873,45 +873,15 @@ define <4 x float> @broadcast_haddps_v4f32(<4 x float> %a0) {
declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>)
define <4 x float> @PR34724_1(<4 x float> %a, <4 x float> %b) {
-; SSSE3_SLOW-LABEL: PR34724_1:
-; SSSE3_SLOW: # %bb.0:
-; SSSE3_SLOW-NEXT: haddps %xmm1, %xmm0
-; SSSE3_SLOW-NEXT: movsldup {{.*#+}} xmm2 = xmm1[0,0,2,2]
-; SSSE3_SLOW-NEXT: addps %xmm1, %xmm2
-; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm0[2,0]
-; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
-; SSSE3_SLOW-NEXT: retq
-;
-; SSSE3_FAST-LABEL: PR34724_1:
-; SSSE3_FAST: # %bb.0:
-; SSSE3_FAST-NEXT: haddps %xmm1, %xmm0
-; SSSE3_FAST-NEXT: retq
-;
-; AVX1_SLOW-LABEL: PR34724_1:
-; AVX1_SLOW: # %bb.0:
-; AVX1_SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
-; AVX1_SLOW-NEXT: vmovsldup {{.*#+}} xmm2 = xmm1[0,0,2,2]
-; AVX1_SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1
-; AVX1_SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
-; AVX1_SLOW-NEXT: retq
-;
-; AVX1_FAST-LABEL: PR34724_1:
-; AVX1_FAST: # %bb.0:
-; AVX1_FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
-; AVX1_FAST-NEXT: retq
-;
-; AVX2_SLOW-LABEL: PR34724_1:
-; AVX2_SLOW: # %bb.0:
-; AVX2_SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
-; AVX2_SLOW-NEXT: vmovsldup {{.*#+}} xmm2 = xmm1[0,0,2,2]
-; AVX2_SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1
-; AVX2_SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
-; AVX2_SLOW-NEXT: retq
+; SSSE3-LABEL: PR34724_1:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: haddps %xmm1, %xmm0
+; SSSE3-NEXT: retq
;
-; AVX2_FAST-LABEL: PR34724_1:
-; AVX2_FAST: # %bb.0:
-; AVX2_FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
-; AVX2_FAST-NEXT: retq
+; AVX-LABEL: PR34724_1:
+; AVX: # %bb.0:
+; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%t0 = shufflevector <4 x float> %a, <4 x float> %b, <2 x i32> <i32 2, i32 4>
%t1 = shufflevector <4 x float> %a, <4 x float> %b, <2 x i32> <i32 3, i32 5>
%t2 = fadd <2 x float> %t0, %t1
diff --git a/llvm/test/CodeGen/X86/haddsub-undef.ll b/llvm/test/CodeGen/X86/haddsub-undef.ll
index d268438121ef..48ee31fe64fc 100644
--- a/llvm/test/CodeGen/X86/haddsub-undef.ll
+++ b/llvm/test/CodeGen/X86/haddsub-undef.ll
@@ -576,30 +576,17 @@ define <4 x float> @add_ps_008(<4 x float> %x) {
}
define <4 x float> @add_ps_016(<4 x float> %0, <4 x float> %1) {
-; SSE-SLOW-LABEL: add_ps_016:
-; SSE-SLOW: # %bb.0:
-; SSE-SLOW-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE-SLOW-NEXT: addps %xmm1, %xmm2
-; SSE-SLOW-NEXT: haddps %xmm0, %xmm1
-; SSE-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[0,0]
-; SSE-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[3,3]
-; SSE-SLOW-NEXT: movaps %xmm2, %xmm0
-; SSE-SLOW-NEXT: retq
-;
-; SSE-FAST-LABEL: add_ps_016:
-; SSE-FAST: # %bb.0:
-; SSE-FAST-NEXT: haddps %xmm0, %xmm1
-; SSE-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0,3,3]
-; SSE-FAST-NEXT: movaps %xmm1, %xmm0
-; SSE-FAST-NEXT: retq
+; SSE-LABEL: add_ps_016:
+; SSE: # %bb.0:
+; SSE-NEXT: haddps %xmm0, %xmm1
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0,3,3]
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: retq
;
; AVX-SLOW-LABEL: add_ps_016:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vhaddps %xmm0, %xmm1, %xmm0
-; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX-SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1
-; AVX-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
-; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,0,3,3]
+; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,3,3]
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: add_ps_016:
@@ -1006,32 +993,15 @@ define <4 x float> @PR45747_2(<4 x float> %a, <4 x float> %b) nounwind {
}
define <4 x float> @PR34724_add_v4f32_u123(<4 x float> %0, <4 x float> %1) {
-; SSE-SLOW-LABEL: PR34724_add_v4f32_u123:
-; SSE-SLOW: # %bb.0:
-; SSE-SLOW-NEXT: haddps %xmm1, %xmm0
-; SSE-SLOW-NEXT: movsldup {{.*#+}} xmm2 = xmm1[0,0,2,2]
-; SSE-SLOW-NEXT: addps %xmm1, %xmm2
-; SSE-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm0[2,0]
-; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
-; SSE-SLOW-NEXT: retq
-;
-; SSE-FAST-LABEL: PR34724_add_v4f32_u123:
-; SSE-FAST: # %bb.0:
-; SSE-FAST-NEXT: haddps %xmm1, %xmm0
-; SSE-FAST-NEXT: retq
-;
-; AVX-SLOW-LABEL: PR34724_add_v4f32_u123:
-; AVX-SLOW: # %bb.0:
-; AVX-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
-; AVX-SLOW-NEXT: vmovsldup {{.*#+}} xmm2 = xmm1[0,0,2,2]
-; AVX-SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1
-; AVX-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
-; AVX-SLOW-NEXT: retq
+; SSE-LABEL: PR34724_add_v4f32_u123:
+; SSE: # %bb.0:
+; SSE-NEXT: haddps %xmm1, %xmm0
+; SSE-NEXT: retq
;
-; AVX-FAST-LABEL: PR34724_add_v4f32_u123:
-; AVX-FAST: # %bb.0:
-; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
-; AVX-FAST-NEXT: retq
+; AVX-LABEL: PR34724_add_v4f32_u123:
+; AVX: # %bb.0:
+; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%3 = shufflevector <4 x float> %0, <4 x float> %1, <2 x i32> <i32 2, i32 4>
%4 = shufflevector <4 x float> %0, <4 x float> %1, <2 x i32> <i32 3, i32 5>
%5 = fadd <2 x float> %3, %4
More information about the llvm-branch-commits
mailing list