[llvm] r312379 - [x86] eliminate redundant shuffle of horizontal math ops when both inputs are the same

Sanjay Patel via llvm-commits llvm-commits at lists.llvm.org
Fri Sep 1 14:09:04 PDT 2017


Author: spatel
Date: Fri Sep  1 14:09:04 2017
New Revision: 312379

URL: http://llvm.org/viewvc/llvm-project?rev=312379&view=rev
Log:
[x86] eliminate redundant shuffle of horizontal math ops when both inputs are the same

This is limited to a set of patterns based on the example in PR34111:
https://bugs.llvm.org/show_bug.cgi?id=34111
...but as I was investigating this, I saw that horizontal patterns can go wrong in many,
many other ways that would not be handled by this patch. Each data type may even take a
different path through the DAG after starting from the same basic IR pattern, so even
proper IR canonicalization won't fix it all.

Differential Revision: https://reviews.llvm.org/D37357
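
The effect is visible in the test diff below: for hadd_v4f32, the AVX output drops the
vmovddup that followed vhaddps, because a horizontal op whose two operands are the same
register already has identical low and high halves. A minimal scalar sketch of that
argument (illustrative only, not part of the patch; the v4f32 lane count and the
{0,1,0,1} mask are simply the case from the first test):

  #include <array>
  #include <cassert>

  // Scalar model of haddps with both operands equal to the same vector A:
  // the lanes are { A[0]+A[1], A[2]+A[3], A[0]+A[1], A[2]+A[3] }, so the
  // low 64 bits already equal the high 64 bits.
  static std::array<float, 4> hadd_same_op(const std::array<float, 4> &A) {
    return {A[0] + A[1], A[2] + A[3], A[0] + A[1], A[2] + A[3]};
  }

  // The shuffle being removed replicates the low half into the high half
  // (mask {0,1,0,1}, which is what movddup performs on a v4f32 value).
  static std::array<float, 4> shuffle_0101(const std::array<float, 4> &V) {
    return {V[0], V[1], V[0], V[1]};
  }

  int main() {
    std::array<float, 4> A = {1.0f, 2.0f, 3.0f, 4.0f};
    // Shuffling the hadd result changes nothing, so the shuffle is redundant.
    assert(shuffle_0101(hadd_same_op(A)) == hadd_same_op(A));
    return 0;
  }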

Modified:
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/test/CodeGen/X86/haddsub-shuf.ll

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=312379&r1=312378&r2=312379&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Fri Sep  1 14:09:04 2017
@@ -29053,6 +29053,40 @@ static SDValue combineShuffleOfConcatUnd
   return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
 }
 
+/// Eliminate a redundant shuffle of a horizontal math op.
+static SDValue foldShuffleOfHorizOp(SDNode *N) {
+  if (N->getOpcode() != ISD::VECTOR_SHUFFLE || !N->getOperand(1).isUndef())
+    return SDValue();
+
+  SDValue HOp = N->getOperand(0);
+  if (HOp.getOpcode() != X86ISD::HADD && HOp.getOpcode() != X86ISD::FHADD &&
+      HOp.getOpcode() != X86ISD::HSUB && HOp.getOpcode() != X86ISD::FHSUB)
+    return SDValue();
+
+  // 128-bit horizontal math instructions are defined to operate on adjacent
+  // lanes of each operand as:
+  // v4X32: A[0] + A[1] , A[2] + A[3] , B[0] + B[1] , B[2] + B[3]
+  // ...similarly for v2f64 and v8i16.
+  // TODO: 256-bit is not the same because...x86.
+  if (HOp.getOperand(0) != HOp.getOperand(1) || HOp.getValueSizeInBits() != 128)
+    return SDValue();
+
+  // When the operands of a horizontal math op are identical, the low half of
+  // the result is the same as the high half. If the shuffle is also replicating
+  // low and high halves, we don't need the shuffle.
+  // shuffle (hadd X, X), undef, [low half...high half] --> hadd X, X
+  ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
+  // TODO: Other mask possibilities like {1,1} and {1,0} could be added here,
+  // but this should be tied to whatever horizontal op matching and shuffle
+  // canonicalization are producing.
+  if (isTargetShuffleEquivalent(Mask, { 0, 0 }) ||
+      isTargetShuffleEquivalent(Mask, { 0, 1, 0, 1 }) ||
+      isTargetShuffleEquivalent(Mask, { 0, 1, 2, 3, 0, 1, 2, 3 }))
+    return HOp;
+
+  return SDValue();
+}
+
 static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
                               TargetLowering::DAGCombinerInfo &DCI,
                               const X86Subtarget &Subtarget) {
@@ -29061,10 +29095,14 @@ static SDValue combineShuffle(SDNode *N,
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   // If we have legalized the vector types, look for blends of FADD and FSUB
   // nodes that we can fuse into an ADDSUB node.
-  if (TLI.isTypeLegal(VT))
+  if (TLI.isTypeLegal(VT)) {
     if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
       return AddSub;
 
+    if (SDValue HAddSub = foldShuffleOfHorizOp(N))
+      return HAddSub;
+  }
+
   // During Type Legalization, when promoting illegal vector types,
   // the backend might introduce new shuffle dag nodes and bitcasts.
   //
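
For orientation, the three masks accepted by foldShuffleOfHorizOp all encode the same
operation for their respective element counts: replicate the low half of the vector into
the high half ({0,0} for v2f64, {0,1,0,1} for v4f32/v4i32, {0,1,2,3,0,1,2,3} for v8i16).
A hedged sketch of that predicate in isolation (the helper name below is made up for
illustration; the patch itself relies on isTargetShuffleEquivalent, which additionally
tolerates undef/sentinel mask elements):

  #include <cstddef>
  #include <vector>

  // A mask replicates the low half into the high half when element I
  // selects lane (I % (NumElts / 2)).
  static bool isLowHalfSplatMask(const std::vector<int> &Mask) {
    std::size_t Half = Mask.size() / 2;
    for (std::size_t I = 0; I != Mask.size(); ++I)
      if (Mask[I] != static_cast<int>(I % Half))
        return false;
    return true;
  }

  // isLowHalfSplatMask({0, 0})                   -> true (v2f64)
  // isLowHalfSplatMask({0, 1, 0, 1})             -> true (v4f32 / v4i32)
  // isLowHalfSplatMask({0, 1, 2, 3, 0, 1, 2, 3}) -> true (v8i16)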

Modified: llvm/trunk/test/CodeGen/X86/haddsub-shuf.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/haddsub-shuf.ll?rev=312379&r1=312378&r2=312379&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/haddsub-shuf.ll (original)
+++ llvm/trunk/test/CodeGen/X86/haddsub-shuf.ll Fri Sep  1 14:09:04 2017
@@ -9,13 +9,11 @@ define <4 x float> @hadd_v4f32(<4 x floa
 ; SSSE3-LABEL: hadd_v4f32:
 ; SSSE3:       # BB#0:
 ; SSSE3-NEXT:    haddps %xmm0, %xmm0
-; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
 ; SSSE3-NEXT:    retq
 ;
 ; AVX-LABEL: hadd_v4f32:
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 ; AVX-NEXT:    retq
   %a02 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 2>
   %a13 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 1, i32 3>
@@ -28,13 +26,11 @@ define <4 x float> @hsub_v4f32(<4 x floa
 ; SSSE3-LABEL: hsub_v4f32:
 ; SSSE3:       # BB#0:
 ; SSSE3-NEXT:    hsubps %xmm0, %xmm0
-; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
 ; SSSE3-NEXT:    retq
 ;
 ; AVX-LABEL: hsub_v4f32:
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 ; AVX-NEXT:    retq
   %a02 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 2>
   %a13 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 1, i32 3>
@@ -47,13 +43,11 @@ define <2 x double> @hadd_v2f64(<2 x dou
 ; SSSE3-LABEL: hadd_v2f64:
 ; SSSE3:       # BB#0:
 ; SSSE3-NEXT:    haddpd %xmm0, %xmm0
-; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
 ; SSSE3-NEXT:    retq
 ;
 ; AVX-LABEL: hadd_v2f64:
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 ; AVX-NEXT:    retq
   %a0 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
   %a1 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
@@ -66,13 +60,11 @@ define <2 x double> @hsub_v2f64(<2 x dou
 ; SSSE3-LABEL: hsub_v2f64:
 ; SSSE3:       # BB#0:
 ; SSSE3-NEXT:    hsubpd %xmm0, %xmm0
-; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
 ; SSSE3-NEXT:    retq
 ;
 ; AVX-LABEL: hsub_v2f64:
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vhsubpd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 ; AVX-NEXT:    retq
   %a0 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
   %a1 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
@@ -85,13 +77,11 @@ define <4 x i32> @hadd_v4i32(<4 x i32> %
 ; SSSE3-LABEL: hadd_v4i32:
 ; SSSE3:       # BB#0:
 ; SSSE3-NEXT:    phaddd %xmm0, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; SSSE3-NEXT:    retq
 ;
 ; AVX-LABEL: hadd_v4i32:
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; AVX-NEXT:    retq
   %a02 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
   %a13 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
@@ -104,13 +94,11 @@ define <4 x i32> @hsub_v4i32(<4 x i32> %
 ; SSSE3-LABEL: hsub_v4i32:
 ; SSSE3:       # BB#0:
 ; SSSE3-NEXT:    phsubd %xmm0, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; SSSE3-NEXT:    retq
 ;
 ; AVX-LABEL: hsub_v4i32:
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; AVX-NEXT:    retq
   %a02 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
   %a13 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
@@ -123,13 +111,11 @@ define <8 x i16> @hadd_v8i16(<8 x i16> %
 ; SSSE3-LABEL: hadd_v8i16:
 ; SSSE3:       # BB#0:
 ; SSSE3-NEXT:    phaddw %xmm0, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; SSSE3-NEXT:    retq
 ;
 ; AVX-LABEL: hadd_v8i16:
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; AVX-NEXT:    retq
   %a0246 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
   %a1357 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -142,13 +128,11 @@ define <8 x i16> @hsub_v8i16(<8 x i16> %
 ; SSSE3-LABEL: hsub_v8i16:
 ; SSSE3:       # BB#0:
 ; SSSE3-NEXT:    phsubw %xmm0, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; SSSE3-NEXT:    retq
 ;
 ; AVX-LABEL: hsub_v8i16:
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vphsubw %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; AVX-NEXT:    retq
   %a0246 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
   %a1357 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>



