[llvm] r211892 - [x86] Teach the target combine step to aggressively fold pshufd insturcions.

Wed Jul 2 05:56:53 PDT 2014

Hi Chandler,

This commit is causing a regression found by llvm-stress:

bin/llvm-stress -size 300 -seed 17093 | bin/llc -march=x86-64 -mcpu=corei7 -o /dev/null
llc: ../lib/CodeGen/SelectionDAG/SelectionDAG.cpp:5730: void llvm::SelectionDAG::ReplaceAllUsesWith(llvm::SDValue, llvm::SDValue): Assertion `From != To.getNode() && "Cannot replace uses of with self"' failed.
0  llc             0x0000000001129a75 llvm::sys::PrintStackTrace(_IO_FILE*) + 37
1  llc             0x0000000001129eb3
2  libpthread.so.0 0x00007f211fcae7c0
3  libc.so.6       0x00007f211efb2b35 gsignal + 53
4  libc.so.6       0x00007f211efb4111 abort + 385
5  libc.so.6       0x00007f211efab9f0 __assert_fail + 240
6  llc             0x0000000000fc73af
7  llc             0x0000000000fc7444 llvm::SelectionDAG::ReplaceAllUsesWith(llvm::SDNode*, llvm::SDValue const*) + 52
8  llc             0x0000000000f27ff9
9  llc             0x0000000000f28203 llvm::TargetLowering::DAGCombinerInfo::CombineTo(llvm::SDNode*, llvm::SDValue, bool) + 35
10 llc             0x0000000000af4308 llvm::X86TargetLowering::PerformDAGCombine(llvm::SDNode*, llvm::TargetLowering::DAGCombinerInfo&) const + 65080
11 llc             0x0000000000f28f7e
12 llc             0x0000000000f288bb llvm::SelectionDAG::Combine(llvm::CombineLevel, llvm::AliasAnalysis&, llvm::CodeGenOpt::Level) + 939
13 llc             0x000000000101819b llvm::SelectionDAGISel::CodeGenAndEmitDAG() + 3259
14 llc             0x0000000001016aa8 llvm::SelectionDAGISel::SelectAllBasicBlocks(llvm::Function const&) + 7096
15 llc             0x0000000001014154 llvm::SelectionDAGISel::runOnMachineFunction(llvm::MachineFunction&) + 1332
16 llc             0x0000000000a6ce26
17 llc             0x0000000000c6348c llvm::MachineFunctionPass::runOnFunction(llvm::Function&) + 124
18 llc             0x0000000000e559ca llvm::FPPassManager::runOnFunction(llvm::Function&) + 362
19 llc             0x0000000000e55c5b llvm::FPPassManager::runOnModule(llvm::Module&) + 43
20 llc             0x0000000000e561f7 llvm::legacy::PassManagerImpl::run(llvm::Module&) + 999
21 llc             0x000000000056d071 main + 6817
22 libc.so.6       0x00007f211ef9ec16 __libc_start_main + 230
23 llc             0x000000000056b4e9
Stack dump:
0.      Program arguments: bin/llc -march=x86-64 -mcpu=corei7 -o /dev/null
1.      Running pass 'Function Pass Manager' on module '<stdin>'.
2.      Running pass 'X86 DAG->DAG Instruction Selection' on function '@autogen_SD17093'
Abort

/Patrik Hägglund
-----Original Message-----
From: llvm-commits-bounces at cs.uiuc.edu [mailto:llvm-commits-bounces at cs.uiuc.edu] On Behalf Of Chandler Carruth
Sent: den 27 juni 2014 13:40
To: llvm-commits at cs.uiuc.edu
Subject: [llvm] r211892 - [x86] Teach the target combine step to aggressively fold pshufd insturcions.

Author: chandlerc
Date: Fri Jun 27 06:40:13 2014
New Revision: 211892

URL: http://llvm.org/viewvc/llvm-project?rev=211892&view=rev
Log:
[x86] Teach the target combine step to aggressively fold pshufd insturcions.

Summary:
This allows it to fold pshufd instructions across intervening
half-shuffles and other noise. This pattern actually shows up in the
generic lowering tests, but I've also added direct tests using
intrinsics to make sure that the specific desired functionality is
working even if the lowering stuff changes in the future.

Differential Revision: http://reviews.llvm.org/D4292

Modified:
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll
    llvm/trunk/test/CodeGen/X86/vector-shuffle-combining.ll

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=211892&r1=211891&r2=211892&view=diff
==============================================================================

--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Fri Jun 27 06:40:13 2014
@@ -19061,6 +19061,79 @@ static SmallVector<int, 4> getPSHUFShuff
   }
 }
 
+/// \brief Search for a combinable shuffle across a chain ending in pshufd.
+///
+/// We walk up the chain and look for a combinable shuffle, skipping over
+/// shuffles that we could hoist this shuffle's transformation past without
+/// altering anything.
+static bool combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
+                                         SelectionDAG &DAG,
+                                         TargetLowering::DAGCombinerInfo &DCI) {
+  assert(N.getOpcode() == X86ISD::PSHUFD &&
+         "Called with something other than an x86 128-bit half shuffle!");
+  SDLoc DL(N);
+
+  // Walk up a single-use chain looking for a combinable shuffle.
+  SDValue V = N.getOperand(0);
+  for (; V.hasOneUse(); V = V.getOperand(0)) {
+    switch (V.getOpcode()) {
+    default:
+      return false; // Nothing combined!
+
+    case ISD::BITCAST:
+      // Skip bitcasts as we always know the type for the target specific
+      // instructions.
+      continue;
+
+    case X86ISD::PSHUFD:
+      // Found another dword shuffle.
+      break;
+
+    case X86ISD::PSHUFLW:
+      // Check that the low words (being shuffled) are the identity in the
+      // dword shuffle, and the high words are self-contained.
+      if (Mask[0] != 0 || Mask[1] != 1 ||
+          !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
+        return false;
+
+      continue;
+
+    case X86ISD::PSHUFHW:
+      // Check that the high words (being shuffled) are the identity in the
+      // dword shuffle, and the low words are self-contained.
+      if (Mask[2] != 2 || Mask[3] != 3 ||
+          !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
+        return false;
+
+      continue;
+    }
+    // Break out of the loop if we break out of the switch.
+    break;
+  }
+
+  if (!V.hasOneUse())
+    // We fell out of the loop without finding a viable combining instruction.
+    return false;
+
+  // Record the old value to use in RAUW-ing.
+  SDValue Old = V;
+
+  // Merge this node's mask and our incoming mask.
+  SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
+  for (int &M : Mask)
+    M = VMask[M];
+  V = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V.getOperand(0),
+                  getV4X86ShuffleImm8ForMask(Mask, DAG));
+
+  // Replace N with its operand as we're going to combine that shuffle away.
+  DAG.ReplaceAllUsesWith(N, N.getOperand(0));
+
+  // Replace the combinable shuffle with the combined one, updating all users
+  // so that we re-evaluate the chain here.
+  DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
+  return true;
+}
+
 /// \brief Search for a combinable shuffle across a chain ending in pshuflw or pshufhw.
 ///
 /// We walk up the chain, skipping shuffles of the other half and looking
@@ -19194,18 +19267,11 @@ static SDValue PerformTargetShuffleCombi
       return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
     }
 
-    // Fallthrough
+    break;
+
   case X86ISD::PSHUFD:
-    if (V.getOpcode() == N.getOpcode()) {
-      // If we have two sequential shuffles of the same kind we can always fold
-      // them. Even if there are multiple uses, this is beneficial because it
-      // breaks a dependency.
-      SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
-      for (int &M : Mask)
-        M = VMask[M];
-      return DAG.getNode(N.getOpcode(), DL, VT, V.getOperand(0),
-                         getV4X86ShuffleImm8ForMask(Mask, DAG));
-    }
+    if (combineRedundantDWordShuffle(N, Mask, DAG, DCI))
+      return SDValue(); // We combined away this shuffle.
 
     break;
   }

Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll?rev=211892&r1=211891&r2=211892&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll Fri Jun 27 06:40:13 2014
@@ -157,9 +157,8 @@ define <8 x i16> @shuffle_v8i16_26401375
 ; CHECK-SSE2:       # BB#0:
 ; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,2,1,3,4,5,6,7]
 ; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,4,6]
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,3,2,1]
+; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,3,1,2]
 ; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[1,3,2,0,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,1,3,2]
 ; CHECK-SSE2-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 6, i32 4, i32 0, i32 1, i32 3, i32 7, i32 5>
   ret <8 x i16> %shuffle

Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-combining.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-combining.ll?rev=211892&r1=211891&r2=211892&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-combining.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-combining.ll Fri Jun 27 06:40:13 2014
@@ -3,9 +3,69 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-unknown"
 
+declare <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32>, i8)
 declare <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16>, i8)
 declare <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16>, i8)
 
+define <4 x i32> @combine_pshufd1(<4 x i32> %a) {
+; CHECK-SSE2-LABEL: @combine_pshufd1
+; CHECK-SSE2:       # BB#0:
+; CHECK-SSE2-NEXT:    retq
+  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27) 
+  %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 27) 
+  ret <4 x i32> %c
+}
+
+define <4 x i32> @combine_pshufd2(<4 x i32> %a) {
+; CHECK-SSE2-LABEL: @combine_pshufd2
+; CHECK-SSE2:       # BB#0:
+; CHECK-SSE2-NEXT:    retq
+  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27) 
+  %b.cast = bitcast <4 x i32> %b to <8 x i16>
+  %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 -28)
+  %c.cast = bitcast <8 x i16> %c to <4 x i32>
+  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27) 
+  ret <4 x i32> %d
+}
+
+define <4 x i32> @combine_pshufd3(<4 x i32> %a) {
+; CHECK-SSE2-LABEL: @combine_pshufd3
+; CHECK-SSE2:       # BB#0:
+; CHECK-SSE2-NEXT:    retq
+  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27) 
+  %b.cast = bitcast <4 x i32> %b to <8 x i16>
+  %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b.cast, i8 -28)
+  %c.cast = bitcast <8 x i16> %c to <4 x i32>
+  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27) 
+  ret <4 x i32> %d
+}
+
+define <4 x i32> @combine_pshufd4(<4 x i32> %a) {
+; CHECK-SSE2-LABEL: @combine_pshufd4
+; CHECK-SSE2:       # BB#0:
+; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; CHECK-SSE2-NEXT:    retq
+  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -31) 
+  %b.cast = bitcast <4 x i32> %b to <8 x i16>
+  %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b.cast, i8 27)
+  %c.cast = bitcast <8 x i16> %c to <4 x i32>
+  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -31) 
+  ret <4 x i32> %d
+}
+
+define <4 x i32> @combine_pshufd5(<4 x i32> %a) {
+; CHECK-SSE2-LABEL: @combine_pshufd5
+; CHECK-SSE2:       # BB#0:
+; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; CHECK-SSE2-NEXT:    retq
+  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -76) 
+  %b.cast = bitcast <4 x i32> %b to <8 x i16>
+  %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 27)
+  %c.cast = bitcast <8 x i16> %c to <4 x i32>
+  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -76)
+  ret <4 x i32> %d
+}
+
 define <8 x i16> @combine_pshuflw1(<8 x i16> %a) {
 ; CHECK-SSE2-LABEL: @combine_pshuflw1
 ; CHECK-SSE2:       # BB#0:


_______________________________________________
llvm-commits mailing list
llvm-commits at cs.uiuc.edu
http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits