[llvm] r369937 - [DAGCombiner][X86] Teach SimplifyVBinOp to fold VBinOp (concat X, undef/constant), (concat Y, undef/constant) -> concat (VBinOp X, Y), VecC

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Mon Aug 26 10:59:11 PDT 2019


Author: ctopper
Date: Mon Aug 26 10:59:11 2019
New Revision: 369937

URL: http://llvm.org/viewvc/llvm-project?rev=369937&view=rev
Log:
[DAGCombiner][X86] Teach SimplifyVBinOp to fold VBinOp (concat X, undef/constant), (concat Y, undef/constant) -> concat (VBinOp X, Y), VecC

This improves the combine I included in D66504 to handle constants in the upper operands of the concat. If we can constant fold them away, we can pull the concat after the bin op. This helps with chains of madd reductions on X86 that come from loop unrolling. The loop madd reduction pattern creates a pmaddwd with half the width of the add that follows it, using zeroes to fill the upper bits. If we have two of these added together, we can pull the zeroes through the accumulating add and then shrink it.

Differential Revision: https://reviews.llvm.org/D66680
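
To make the motivation concrete, here is a small standalone C++ sketch of the
identity this combine relies on, with plain arrays standing in for the DAG
nodes; the names below are purely illustrative and are not LLVM APIs:

#include <array>
#include <cassert>

int main() {
  // Two "pmaddwd"-style partial results, each only half as wide as the
  // accumulating add that follows them.
  std::array<int, 4> X = {1, 2, 3, 4};
  std::array<int, 4> Y = {10, 20, 30, 40};

  // Before the combine: each narrow result is widened by concatenating a
  // zero upper half, and the accumulation happens at the wide width.
  std::array<int, 8> WideX = {1, 2, 3, 4, 0, 0, 0, 0};
  std::array<int, 8> WideY = {10, 20, 30, 40, 0, 0, 0, 0};
  std::array<int, 8> WideSum;
  for (int i = 0; i != 8; ++i)
    WideSum[i] = WideX[i] + WideY[i];

  // After the combine: do the narrow add first (VBinOp X, Y), constant fold
  // the upper halves (0 + 0 == 0), then concatenate.
  std::array<int, 8> NarrowThenConcat;
  for (int i = 0; i != 4; ++i)
    NarrowThenConcat[i] = X[i] + Y[i]; // narrow VBinOp
  for (int i = 4; i != 8; ++i)
    NarrowThenConcat[i] = 0;           // folded constant upper half

  assert(WideSum == NarrowThenConcat);
  return 0;
}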

Modified:
    llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
    llvm/trunk/test/CodeGen/X86/madd.ll

Modified: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp?rev=369937&r1=369936&r2=369937&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp Mon Aug 26 10:59:11 2019
@@ -19448,33 +19448,35 @@ SDValue DAGCombiner::SimplifyVBinOp(SDNo
     }
   }
 
-  // Make sure all but the first op are undef.
-  auto ConcatWithUndef = [](SDValue Concat) {
+  // Make sure all but the first op are undef or constant.
+  auto ConcatWithConstantOrUndef = [](SDValue Concat) {
     return Concat.getOpcode() == ISD::CONCAT_VECTORS &&
            std::all_of(std::next(Concat->op_begin()), Concat->op_end(),
-                       [](const SDValue &Op) {
-                         return Op.isUndef();
-                       });
+                     [](const SDValue &Op) {
+                       return Op.isUndef() ||
+                              ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
+                     });
   };
 
   // The following pattern is likely to emerge with vector reduction ops. Moving
   // the binary operation ahead of the concat may allow using a narrower vector
   // instruction that has better performance than the wide version of the op:
-  // VBinOp (concat X, undef), (concat Y, undef) --> concat (VBinOp X, Y), VecC
-  if (ConcatWithUndef(LHS) && ConcatWithUndef(RHS) &&
+  // VBinOp (concat X, undef/constant), (concat Y, undef/constant) -->
+  //   concat (VBinOp X, Y), VecC
+  if (ConcatWithConstantOrUndef(LHS) && ConcatWithConstantOrUndef(RHS) &&
       (LHS.hasOneUse() || RHS.hasOneUse())) {
-    SDValue X = LHS.getOperand(0);
-    SDValue Y = RHS.getOperand(0);
-    EVT NarrowVT = X.getValueType();
-    if (NarrowVT == Y.getValueType() &&
+    EVT NarrowVT = LHS.getOperand(0).getValueType();
+    if (NarrowVT == RHS.getOperand(0).getValueType() &&
         TLI.isOperationLegalOrCustomOrPromote(Opcode, NarrowVT)) {
-      // (binop undef, undef) may not return undef, so compute that result.
       SDLoc DL(N);
-      SDValue VecC =
-          DAG.getNode(Opcode, DL, NarrowVT, DAG.getUNDEF(NarrowVT),
-                      DAG.getUNDEF(NarrowVT));
-      SmallVector<SDValue, 4> Ops(LHS.getNumOperands(), VecC);
-      Ops[0] = DAG.getNode(Opcode, DL, NarrowVT, X, Y);
+      unsigned NumOperands = LHS.getNumOperands();
+      SmallVector<SDValue, 4> Ops;
+      for (unsigned i = 0; i != NumOperands; ++i) {
+        // This constant folds for operands 1 and up.
+        Ops.push_back(DAG.getNode(Opcode, DL, NarrowVT, LHS.getOperand(i),
+                                  RHS.getOperand(i)));
+      }
+
       return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Ops);
     }
   }

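As a rough scalar model of what the new loop does (again plain C++, not
SelectionDAG; all names are illustrative): the binop is applied to each pair of
concat operands, and for operands 1 and up both sides are constants (or
undef), so those applications fold to constants and only the operand-0 binop
survives at the narrow width.

#include <array>
#include <cstdio>
#include <vector>

using Lanes = std::array<int, 4>; // one concat operand's worth of lanes

static Lanes BinOp(const Lanes &A, const Lanes &B) {
  Lanes R;
  for (int i = 0; i != 4; ++i)
    R[i] = A[i] + B[i]; // stand-in for the vector binop
  return R;
}

int main() {
  // LHS = concat(X, C1) and RHS = concat(Y, C2): only operand 0 is
  // non-constant on each side.
  std::vector<Lanes> LHS = {Lanes{1, 2, 3, 4}, Lanes{0, 0, 0, 0}}; // X, C1
  std::vector<Lanes> RHS = {Lanes{5, 6, 7, 8}, Lanes{9, 9, 9, 9}}; // Y, C2

  // Mirror of the loop above: Ops[i] = BinOp(LHS[i], RHS[i]).
  std::vector<Lanes> Ops;
  for (size_t i = 0; i != LHS.size(); ++i)
    Ops.push_back(BinOp(LHS[i], RHS[i]));

  // Ops[0] is the narrow binop of X and Y; Ops[1] involves only constants
  // and folds to {9, 9, 9, 9}, so no wide binop is needed.
  for (const Lanes &Op : Ops)
    std::printf("%d %d %d %d\n", Op[0], Op[1], Op[2], Op[3]);
  return 0;
}
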
Modified: llvm/trunk/test/CodeGen/X86/madd.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/madd.ll?rev=369937&r1=369936&r2=369937&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/madd.ll (original)
+++ llvm/trunk/test/CodeGen/X86/madd.ll Mon Aug 26 10:59:11 2019
@@ -2720,52 +2720,27 @@ define i32 @madd_quad_reduction(<8 x i16
 ; SSE2-NEXT:    movd %xmm1, %eax
 ; SSE2-NEXT:    retq
 ;
-; AVX1-LABEL: madd_quad_reduction:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX1-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX1-NEXT:    vmovdqu (%rdi), %xmm0
-; AVX1-NEXT:    vmovdqu (%rdx), %xmm1
-; AVX1-NEXT:    vpmaddwd (%rcx), %xmm1, %xmm1
-; AVX1-NEXT:    vpmaddwd (%rsi), %xmm0, %xmm0
-; AVX1-NEXT:    vmovdqu (%r8), %xmm2
-; AVX1-NEXT:    vpmaddwd (%r9), %xmm2, %xmm2
-; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vmovdqu (%rax), %xmm2
-; AVX1-NEXT:    vpmaddwd (%r10), %xmm2, %xmm2
-; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vmovd %xmm0, %eax
-; AVX1-NEXT:    retq
-;
-; AVX256-LABEL: madd_quad_reduction:
-; AVX256:       # %bb.0:
-; AVX256-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX256-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX256-NEXT:    vmovdqu (%rdi), %xmm0
-; AVX256-NEXT:    vmovdqu (%rdx), %xmm1
-; AVX256-NEXT:    vpmaddwd (%rcx), %xmm1, %xmm1
-; AVX256-NEXT:    vpmaddwd (%rsi), %xmm0, %xmm0
-; AVX256-NEXT:    vmovdqu (%r8), %xmm2
-; AVX256-NEXT:    vpmaddwd (%r9), %xmm2, %xmm2
-; AVX256-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
-; AVX256-NEXT:    vmovdqu (%rax), %xmm2
-; AVX256-NEXT:    vpmaddwd (%r10), %xmm2, %xmm2
-; AVX256-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
-; AVX256-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
-; AVX256-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX256-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX256-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX256-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX256-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX256-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX256-NEXT:    vmovd %xmm0, %eax
-; AVX256-NEXT:    vzeroupper
-; AVX256-NEXT:    retq
+; AVX-LABEL: madd_quad_reduction:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; AVX-NEXT:    vmovdqu (%rdx), %xmm1
+; AVX-NEXT:    vpmaddwd (%rcx), %xmm1, %xmm1
+; AVX-NEXT:    vpmaddwd (%rsi), %xmm0, %xmm0
+; AVX-NEXT:    vmovdqu (%r8), %xmm2
+; AVX-NEXT:    vpmaddwd (%r9), %xmm2, %xmm2
+; AVX-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vmovdqu (%rax), %xmm2
+; AVX-NEXT:    vpmaddwd (%r10), %xmm2, %xmm2
+; AVX-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    retq
   %tmp = load <8 x i16>, <8 x i16>* %arg, align 1
   %tmp6 = load <8 x i16>, <8 x i16>* %arg1, align 1
   %tmp7 = sext <8 x i16> %tmp to <8 x i32>

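For reference, a rough C++-with-intrinsics analogue of what the merged AVX
check lines now expect: all of the accumulation stays at 128-bit width, so the
AVX2/AVX512 paths no longer need ymm adds or a trailing vzeroupper. The
function name and arguments below are made up for illustration and are not
part of the test.

#include <immintrin.h>

// Hypothetical analogue of madd_quad_reduction after this patch: four pmaddwd
// results are accumulated with 128-bit adds only. Names are illustrative.
int quad_madd_reduce_128(const short *a, const short *b, const short *c,
                         const short *d, const short *e, const short *f,
                         const short *g, const short *h) {
  __m128i s0 = _mm_madd_epi16(_mm_loadu_si128((const __m128i *)a),
                              _mm_loadu_si128((const __m128i *)b));
  __m128i s1 = _mm_madd_epi16(_mm_loadu_si128((const __m128i *)c),
                              _mm_loadu_si128((const __m128i *)d));
  __m128i s2 = _mm_madd_epi16(_mm_loadu_si128((const __m128i *)e),
                              _mm_loadu_si128((const __m128i *)f));
  __m128i s3 = _mm_madd_epi16(_mm_loadu_si128((const __m128i *)g),
                              _mm_loadu_si128((const __m128i *)h));

  // Accumulate with 128-bit vpaddd; before this patch the AVX2/AVX512 paths
  // performed the accumulating adds at 256-bit width because the pmaddwd
  // results had been zero-extended first.
  __m128i sum = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s3));

  // Horizontal reduction of the four 32-bit lanes, as in the checked asm.
  sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, _MM_SHUFFLE(1, 0, 3, 2)));
  sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, _MM_SHUFFLE(2, 3, 0, 1)));
  return _mm_cvtsi128_si32(sum);
}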


