[llvm] r211892 - [x86] Teach the target combine step to aggressively fold pshufd insturcions.

Fri Jun 27 04:40:14 PDT 2014

Author: chandlerc
Date: Fri Jun 27 06:40:13 2014
New Revision: 211892

URL: http://llvm.org/viewvc/llvm-project?rev=211892&view=rev
Log:
[x86] Teach the target combine step to aggressively fold pshufd insturcions.

Summary:
This allows it to fold pshufd instructions across intervening
half-shuffles and other noise. This pattern actually shows up in the
generic lowering tests, but I've also added direct tests using
intrinsics to make sure that the specific desired functionality is
working even if the lowering stuff changes in the future.

Differential Revision: http://reviews.llvm.org/D4292

Modified:
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll
    llvm/trunk/test/CodeGen/X86/vector-shuffle-combining.ll

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=211892&r1=211891&r2=211892&view=diff
==============================================================================

--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Fri Jun 27 06:40:13 2014
@@ -19061,6 +19061,79 @@ static SmallVector<int, 4> getPSHUFShuff
   }
 }
 
+/// \brief Search for a combinable shuffle across a chain ending in pshufd.
+///
+/// We walk up the chain and look for a combinable shuffle, skipping over
+/// shuffles that we could hoist this shuffle's transformation past without
+/// altering anything.
+static bool combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
+                                         SelectionDAG &DAG,
+                                         TargetLowering::DAGCombinerInfo &DCI) {
+  assert(N.getOpcode() == X86ISD::PSHUFD &&
+         "Called with something other than an x86 128-bit half shuffle!");
+  SDLoc DL(N);
+
+  // Walk up a single-use chain looking for a combinable shuffle.
+  SDValue V = N.getOperand(0);
+  for (; V.hasOneUse(); V = V.getOperand(0)) {
+    switch (V.getOpcode()) {
+    default:
+      return false; // Nothing combined!
+
+    case ISD::BITCAST:
+      // Skip bitcasts as we always know the type for the target specific
+      // instructions.
+      continue;
+
+    case X86ISD::PSHUFD:
+      // Found another dword shuffle.
+      break;
+
+    case X86ISD::PSHUFLW:
+      // Check that the low words (being shuffled) are the identity in the
+      // dword shuffle, and the high words are self-contained.
+      if (Mask[0] != 0 || Mask[1] != 1 ||
+          !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
+        return false;
+
+      continue;
+
+    case X86ISD::PSHUFHW:
+      // Check that the high words (being shuffled) are the identity in the
+      // dword shuffle, and the low words are self-contained.
+      if (Mask[2] != 2 || Mask[3] != 3 ||
+          !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
+        return false;
+
+      continue;
+    }
+    // Break out of the loop if we break out of the switch.
+    break;
+  }
+
+  if (!V.hasOneUse())
+    // We fell out of the loop without finding a viable combining instruction.
+    return false;
+
+  // Record the old value to use in RAUW-ing.
+  SDValue Old = V;
+
+  // Merge this node's mask and our incoming mask.
+  SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
+  for (int &M : Mask)
+    M = VMask[M];
+  V = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V.getOperand(0),
+                  getV4X86ShuffleImm8ForMask(Mask, DAG));
+
+  // Replace N with its operand as we're going to combine that shuffle away.
+  DAG.ReplaceAllUsesWith(N, N.getOperand(0));
+
+  // Replace the combinable shuffle with the combined one, updating all users
+  // so that we re-evaluate the chain here.
+  DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
+  return true;
+}
+
 /// \brief Search for a combinable shuffle across a chain ending in pshuflw or pshufhw.
 ///
 /// We walk up the chain, skipping shuffles of the other half and looking
@@ -19194,18 +19267,11 @@ static SDValue PerformTargetShuffleCombi
       return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
     }
 
-    // Fallthrough
+    break;
+
   case X86ISD::PSHUFD:
-    if (V.getOpcode() == N.getOpcode()) {
-      // If we have two sequential shuffles of the same kind we can always fold
-      // them. Even if there are multiple uses, this is beneficial because it
-      // breaks a dependency.
-      SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
-      for (int &M : Mask)
-        M = VMask[M];
-      return DAG.getNode(N.getOpcode(), DL, VT, V.getOperand(0),
-                         getV4X86ShuffleImm8ForMask(Mask, DAG));
-    }
+    if (combineRedundantDWordShuffle(N, Mask, DAG, DCI))
+      return SDValue(); // We combined away this shuffle.
 
     break;
   }

Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll?rev=211892&r1=211891&r2=211892&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll Fri Jun 27 06:40:13 2014
@@ -157,9 +157,8 @@ define <8 x i16> @shuffle_v8i16_26401375
 ; CHECK-SSE2:       # BB#0:
 ; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,2,1,3,4,5,6,7]
 ; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,4,6]
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,3,2,1]
+; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,3,1,2]
 ; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[1,3,2,0,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,1,3,2]
 ; CHECK-SSE2-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 6, i32 4, i32 0, i32 1, i32 3, i32 7, i32 5>
   ret <8 x i16> %shuffle

Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-combining.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-combining.ll?rev=211892&r1=211891&r2=211892&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-combining.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-combining.ll Fri Jun 27 06:40:13 2014
@@ -3,9 +3,69 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-unknown"
 
+declare <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32>, i8)
 declare <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16>, i8)
 declare <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16>, i8)
 
+define <4 x i32> @combine_pshufd1(<4 x i32> %a) {
+; CHECK-SSE2-LABEL: @combine_pshufd1
+; CHECK-SSE2:       # BB#0:
+; CHECK-SSE2-NEXT:    retq
+  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27) 
+  %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 27) 
+  ret <4 x i32> %c
+}
+
+define <4 x i32> @combine_pshufd2(<4 x i32> %a) {
+; CHECK-SSE2-LABEL: @combine_pshufd2
+; CHECK-SSE2:       # BB#0:
+; CHECK-SSE2-NEXT:    retq
+  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27) 
+  %b.cast = bitcast <4 x i32> %b to <8 x i16>
+  %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 -28)
+  %c.cast = bitcast <8 x i16> %c to <4 x i32>
+  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27) 
+  ret <4 x i32> %d
+}
+
+define <4 x i32> @combine_pshufd3(<4 x i32> %a) {
+; CHECK-SSE2-LABEL: @combine_pshufd3
+; CHECK-SSE2:       # BB#0:
+; CHECK-SSE2-NEXT:    retq
+  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27) 
+  %b.cast = bitcast <4 x i32> %b to <8 x i16>
+  %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b.cast, i8 -28)
+  %c.cast = bitcast <8 x i16> %c to <4 x i32>
+  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27) 
+  ret <4 x i32> %d
+}
+
+define <4 x i32> @combine_pshufd4(<4 x i32> %a) {
+; CHECK-SSE2-LABEL: @combine_pshufd4
+; CHECK-SSE2:       # BB#0:
+; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; CHECK-SSE2-NEXT:    retq
+  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -31) 
+  %b.cast = bitcast <4 x i32> %b to <8 x i16>
+  %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b.cast, i8 27)
+  %c.cast = bitcast <8 x i16> %c to <4 x i32>
+  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -31) 
+  ret <4 x i32> %d
+}
+
+define <4 x i32> @combine_pshufd5(<4 x i32> %a) {
+; CHECK-SSE2-LABEL: @combine_pshufd5
+; CHECK-SSE2:       # BB#0:
+; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; CHECK-SSE2-NEXT:    retq
+  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -76) 
+  %b.cast = bitcast <4 x i32> %b to <8 x i16>
+  %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 27)
+  %c.cast = bitcast <8 x i16> %c to <4 x i32>
+  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -76)
+  ret <4 x i32> %d
+}
+
 define <8 x i16> @combine_pshuflw1(<8 x i16> %a) {
 ; CHECK-SSE2-LABEL: @combine_pshuflw1
 ; CHECK-SSE2:       # BB#0: