[llvm] r212748 - [DAG] Further improve the logic in DAGCombiner that folds a pair of shuffles into a single shuffle if the resulting mask is legal.

Thu Jul 10 11:04:56 PDT 2014

Author: adibiagio
Date: Thu Jul 10 13:04:55 2014
New Revision: 212748

URL: http://llvm.org/viewvc/llvm-project?rev=212748&view=rev
Log:
[DAG] Further improve the logic in DAGCombiner that folds a pair of shuffles into a single shuffle if the resulting mask is legal.

This patch teaches the DAGCombiner how to fold shuffles according to the
following new rules:
  1. shuffle(shuffle(x, y), undef) -> x
  2. shuffle(shuffle(x, y), undef) -> y
  3. shuffle(shuffle(x, y), undef) -> shuffle(x, undef)
  4. shuffle(shuffle(x, y), undef) -> shuffle(y, undef)

The backend avoids to combine shuffles according to rules 3. and 4. if
the resulting shuffle does not have a legal mask. This is to avoid introducing
illegal shuffles that are potentially expanded into a sub-optimal sequence of
target specific dag nodes during vector legalization.

Added test case combine-vec-shuffle-2.ll to verify that we correctly triggers
the new rules when combining shuffles.


Added:
    llvm/trunk/test/CodeGen/X86/combine-vec-shuffle-2.ll
Modified:
    llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
    llvm/trunk/test/CodeGen/X86/widen_shuffle-1.ll

Modified: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp?rev=212748&r1=212747&r2=212748&view=diff
==============================================================================

--- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp Thu Jul 10 13:04:55 2014
@@ -10677,17 +10677,12 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE
   }
 
   // If this shuffle node is simply a swizzle of another shuffle node,
-  // and it reverses the swizzle of the previous shuffle then we can
-  // optimize shuffle(shuffle(x, undef), undef) -> x.
+  // then try to simplify it.
   if (N0.getOpcode() == ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG &&
       N1.getOpcode() == ISD::UNDEF) {
 
     ShuffleVectorSDNode *OtherSV = cast<ShuffleVectorSDNode>(N0);
 
-    // Shuffle nodes can only reverse shuffles with a single non-undef value.
-    if (N0.getOperand(1).getOpcode() != ISD::UNDEF)
-      return SDValue();
-
     // The incoming shuffle must be of the same type as the result of the
     // current shuffle.
     assert(OtherSV->getOperand(0).getValueType() == VT &&
@@ -10704,27 +10699,69 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE
         Idx = OtherSV->getMaskElt(Idx);
       Mask.push_back(Idx);
     }
+    
+    bool CommuteOperands = false;
+    if (N0.getOperand(1).getOpcode() != ISD::UNDEF) {
+      // To be valid, the combine shuffle mask should only reference elements
+      // from one of the two vectors in input to the inner shufflevector.
+      bool IsValidMask = true;
+      for (unsigned i = 0; i != NumElts && IsValidMask; ++i)
+        // See if the combined mask only reference undefs or elements coming
+        // from the first shufflevector operand.
+        IsValidMask = Mask[i] < 0 || (unsigned)Mask[i] < NumElts;
+
+      if (!IsValidMask) {
+        IsValidMask = true;
+        for (unsigned i = 0; i != NumElts && IsValidMask; ++i)
+          // Check that all the elements come from the second shuffle operand.
+          IsValidMask = Mask[i] < 0 || (unsigned)Mask[i] >= NumElts;
+        CommuteOperands = IsValidMask;
+      }
+
+      // Early exit if the combined shuffle mask is not valid.
+      if (!IsValidMask)
+        return SDValue();
+    }
 
+    // See if this pair of shuffles can be safely folded according to either
+    // of the following rules:
+    //   shuffle(shuffle(x, y), undef) -> x
+    //   shuffle(shuffle(x, undef), undef) -> x
+    //   shuffle(shuffle(x, y), undef) -> y
     bool IsIdentityMask = true;
+    unsigned BaseMaskIndex = CommuteOperands ? NumElts : 0;
     for (unsigned i = 0; i != NumElts && IsIdentityMask; ++i) {
       // Skip Undefs.
       if (Mask[i] < 0)
         continue;
 
       // The combined shuffle must map each index to itself.
-      IsIdentityMask = (unsigned)Mask[i] == i;
+      IsIdentityMask = (unsigned)Mask[i] == i + BaseMaskIndex;
     }
-
-    if (IsIdentityMask)
-      // optimize shuffle(shuffle(x, undef), undef) -> x.
+    
+    if (IsIdentityMask) {
+      if (CommuteOperands)
+        // optimize shuffle(shuffle(x, y), undef) -> y.
+        return OtherSV->getOperand(1);
+      
+      // optimize shuffle(shuffle(x, undef), undef) -> x
+      // optimize shuffle(shuffle(x, y), undef) -> x
       return OtherSV->getOperand(0);
+    }
 
     // It may still be beneficial to combine the two shuffles if the
     // resulting shuffle is legal.
-    //   shuffle(shuffle(x, undef, M1), undef, M2) -> shuffle(x, undef, M3).
-    if (TLI.isShuffleMaskLegal(Mask, VT))
-      return DAG.getVectorShuffle(VT, SDLoc(N), N0->getOperand(0), N1,
+    if (TLI.isShuffleMaskLegal(Mask, VT)) {
+      if (!CommuteOperands)
+        // shuffle(shuffle(x, undef, M1), undef, M2) -> shuffle(x, undef, M3).
+        // shuffle(shuffle(x, y, M1), undef, M2) -> shuffle(x, undef, M3)
+        return DAG.getVectorShuffle(VT, SDLoc(N), N0->getOperand(0), N1,
+                                    &Mask[0]);
+      
+      //   shuffle(shuffle(x, y, M1), undef, M2) -> shuffle(undef, y, M3)
+      return DAG.getVectorShuffle(VT, SDLoc(N), N1, N0->getOperand(1),
                                   &Mask[0]);
+    }
   }
 
   return SDValue();

Added: llvm/trunk/test/CodeGen/X86/combine-vec-shuffle-2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/combine-vec-shuffle-2.ll?rev=212748&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/combine-vec-shuffle-2.ll (added)
+++ llvm/trunk/test/CodeGen/X86/combine-vec-shuffle-2.ll Thu Jul 10 13:04:55 2014
@@ -0,0 +1,164 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s
+
+; Check that DAGCombiner correctly folds the following pairs of shuffles
+; using the following rules:
+;  1. shuffle(shuffle(x, y), undef) -> x
+;  2. shuffle(shuffle(x, y), undef) -> y
+;  3. shuffle(shuffle(x, y), undef) -> shuffle(x, undef)
+;  4. shuffle(shuffle(x, y), undef) -> shuffle(undef, y)
+;
+; Rules 3. and 4. are used only if the resulting shuffle mask is legal.
+
+define <4 x i32> @test1(<4 x i32> %A, <4 x i32> %B) {
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
+  ret <4 x i32> %2
+}
+; CHECK-LABEL: test1
+; Mask: [3,0,0,1]
+; CHECK: pshufd $67
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test2(<4 x i32> %A, <4 x i32> %B) {
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
+  ret <4 x i32> %2
+}
+; CHECK-LABEL: test2
+; Mask: [2,0,0,3]
+; CHECK: pshufd $-62
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test3(<4 x i32> %A, <4 x i32> %B) {
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
+  ret <4 x i32> %2
+}
+; CHECK-LABEL: test3
+; Mask: [2,0,0,3]
+; CHECK: pshufd $-62
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test4(<4 x i32> %A, <4 x i32> %B) {
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 7, i32 1>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 4, i32 0, i32 3>
+  ret <4 x i32> %2
+}
+; CHECK-LABEL: test4
+; Mask: [0,0,0,1]
+; CHECK: pshufd $64
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test5(<4 x i32> %A, <4 x i32> %B) {
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 5, i32 5, i32 2, i32 3>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 4, i32 3>
+  ret <4 x i32> %2
+}
+; CHECK-LABEL: test5
+; Mask: [1,1]
+; CHECK: movhlps
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test6(<4 x i32> %A, <4 x i32> %B) {
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 4>
+  ret <4 x i32> %2
+}
+; CHECK-LABEL: test6
+; Mask: [2,0,0,0]
+; CHECK: pshufd $2
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test7(<4 x i32> %A, <4 x i32> %B) {
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
+  ret <4 x i32> %2
+}
+; CHECK-LABEL: test7
+; Mask: [0,2,0,2]
+; CHECK: pshufd $-120
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test8(<4 x i32> %A, <4 x i32> %B) {
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 3, i32 4>
+  ret <4 x i32> %2
+}
+; CHECK-LABEL: test8
+; Mask: [1,0,3,0]
+; CHECK: pshufd $49
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test9(<4 x i32> %A, <4 x i32> %B) {
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 3, i32 2, i32 5>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 4, i32 2>
+  ret <4 x i32> %2
+}
+; CHECK-LABEL: test9
+; Mask: [1,3,0,2]
+; CHECK: pshufd $-115
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test10(<4 x i32> %A, <4 x i32> %B) {
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 4>
+  ret <4 x i32> %2
+}
+; CHECK-LABEL: test10
+; Mask: [1,0,1,0]
+; CHECK: pshufd $17
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test11(<4 x i32> %A, <4 x i32> %B) {
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 2, i32 5, i32 4>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 0>
+  ret <4 x i32> %2
+}
+; CHECK-LABEL: test11
+; Mask: [1,0,2,1]
+; CHECK: pshufd $97
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test12(<4 x i32> %A, <4 x i32> %B) {
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 0, i32 2, i32 4>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 0, i32 4>
+  ret <4 x i32> %2
+}
+; CHECK-LABEL: test12
+; Mask: [0,0,0,0]
+; CHECK: pshufd $0
+; CHECK-NEXT: ret
+
+
+; The following pair of shuffles is folded into vector %A.
+define <4 x i32> @test13(<4 x i32> %A, <4 x i32> %B) {
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 4, i32 2, i32 6>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 0, i32 2, i32 4>
+  ret <4 x i32> %2
+}
+; CHECK-LABEL: test13
+; CHECK-NOT: pshufd
+; CHECK: ret
+
+
+; The following pair of shuffles is folded into vector %B.
+define <4 x i32> @test14(<4 x i32> %A, <4 x i32> %B) {
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 4, i32 1, i32 4>
+  ret <4 x i32> %2
+}
+; CHECK-LABEL: test14
+; CHECK-NOT: pshufd
+; CHECK: ret
+

Modified: llvm/trunk/test/CodeGen/X86/widen_shuffle-1.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/widen_shuffle-1.ll?rev=212748&r1=212747&r2=212748&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/widen_shuffle-1.ll (original)
+++ llvm/trunk/test/CodeGen/X86/widen_shuffle-1.ll Thu Jul 10 13:04:55 2014
@@ -33,7 +33,9 @@ entry:
 define void @shuf3(<4 x float> %tmp10, <4 x float> %vecinit15, <4 x float>* %dst) nounwind {
 entry:
 ; CHECK-LABEL: shuf3:
-; CHECK: shufps
+; CHECK-NOT: movlhps
+; CHECK-NOT: shufps
+; CHECK: pshufd
   %shuffle.i.i.i12 = shufflevector <4 x float> %tmp10, <4 x float> %vecinit15, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
   %tmp25.i.i = shufflevector <4 x float> %shuffle.i.i.i12, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2> 
   %tmp1.i.i = shufflevector <3 x float> %tmp25.i.i, <3 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>