[PATCH] Teach the DAGCombiner how to fold a OR of two shufflevector into a single shufflevector node

Thu Mar 6 06:25:22 PST 2014

Hi Nadav, Hal and Tim,

I modified my original patch adding checks agains
'TLI.isShuffleMaskLegal' as suggested by Hal.
This new patch now avoids folding an OR of two shuffles if the
resulting shuffle would be illegal for the target.

The rules for this new combine are:
   // fold (or (shuf A, V_0, MaskA), (shuf B, V_0, MaskB)) -> (shuf A, B, Mask1)
   // fold (or (shuf A, V_0, MaskA), (shuf B, V_0, MaskB)) -> (shuf B, A, Mask2)

Basically:
we can take advantage of the fact that OR is commutative and compute
two different shuffle masks: Mask1, Mask2.
If the dag node in input can be folded, then we check if Mask1 is a
legal shuffle mask; in case it folds according to the first rule.
Otherwise we checks if Mask2 is legal to see if we can fold according
to the second rule.
If both Masks are illegal then we conservatively don't fold the OR
into a shuffle.

///
@Nadav
In my experiments on the x86 (Corei7 with and without AVX support) I
noticed that this new patch correctly folds all the interesting cases
except this one:

define <4 x i32> @foo(<4 x i32> %a, <4 x i32> %b) {
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x
i32><i32 0, i32 4, i32 4, i32 4>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x
i32><i32 4, i32 0, i32 3, i32 2>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}

The OR in function @foo is not folded because that would result in an
illegal shuffle.
However, that particular case could be still optimized into a sequence
of two legal shuffle operations (getting rid of the OR entirely).
Ideally we could generate the following assembly:
    movlhps %xmm1, %xmm0
    shufps $-71 %xmm1, %xmm0

The transformation would be valid in general if the target can emit
(V)MOVLHPS instructions and if each shuffle in input to the OR has
only one Use.
If ok for you, once this patch is accepted I will send another patch
to improve that particular case on the x86 only (adding a target
specific combine).
///

Please let me know if this new patch is reasonable for you.

Thanks in advance!
Andrea


On Wed, Mar 5, 2014 at 9:37 PM, Tim Northover <t.p.northover at gmail.com> wrote:
>> It is okay to generate new shuffles from a collection of insert/extracts
>> because the new shuffle instructions can't be worse then a collection
>> of inserts and extracts.
>
> Thanks Nadav; it sounds like I should remove the TODO and comment on
> the scope of that function then as part of whatever I do.
>
> Tim.
>
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
-------------- next part --------------
Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================

--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp	(revision 203120)
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp	(working copy)
@@ -3200,6 +3200,60 @@
       return N0;
     if (ISD::isBuildVectorAllOnes(N1.getNode()))
       return N1;
+
+    // fold (or (shuf A, V_0, MA), (shuf B, V_0, MB)) -> (shuf A, B, Mask1)
+    // fold (or (shuf A, V_0, MA), (shuf B, V_0, MB)) -> (shuf B, A, Mask2)
+    // Do this only if the resulting shuffle is legal.
+    if (isa<ShuffleVectorSDNode>(N0) &&
+        isa<ShuffleVectorSDNode>(N1) &&
+        N0->getOperand(1) == N1->getOperand(1) &&
+        ISD::isBuildVectorAllZeros(N0.getOperand(1).getNode())) {
+      bool CanFold = true;
+      unsigned NumElts = VT.getVectorNumElements();
+      const ShuffleVectorSDNode *SV0 = cast<ShuffleVectorSDNode>(N0);
+      const ShuffleVectorSDNode *SV1 = cast<ShuffleVectorSDNode>(N1);
+      // We construct two shuffle masks:
+      // - Mask1 is a shuffle mask for a shuffle with N0 as the first operand
+      // and N1 as the second operand.
+      // - Mask2 is a shuffle mask for a shuffle with N1 as the first operand
+      // and N0 as the second operand.
+      // We do this because OR is commutable and therefore there might be
+      // two ways to fold this node into a shuffle.
+      SmallVector<int,4> Mask1;
+      SmallVector<int,4> Mask2;
+      
+      for (unsigned i = 0; i != NumElts && CanFold; ++i) {
+        int M0 = SV0->getMaskElt(i);
+        int M1 = SV1->getMaskElt(i);
+   
+        // Both shuffle indexes are undef. Propagate Undef.
+        if (M0 < 0 && M1 < 0) {
+          Mask1.push_back(M0);
+          Mask2.push_back(M0);
+          continue;
+        }
+
+        if (M0 < 0 || M1 < 0 ||
+            (M0 < (int)NumElts && M1 < (int)NumElts) ||
+            (M0 >= (int)NumElts && M1 >= (int)NumElts)) {
+          CanFold = false;
+          break;
+        }
+        
+        Mask1.push_back(M0 < (int)NumElts ? M0 : M1 + NumElts);
+        Mask2.push_back(M1 < (int)NumElts ? M1 : M0 + NumElts);
+      }
+
+      if (CanFold) {
+        // Fold this sequence only if the resulting shuffle is 'legal'.
+        if (TLI.isShuffleMaskLegal(Mask1, VT))
+          return DAG.getVectorShuffle(VT, SDLoc(N), N0->getOperand(0),
+                                      N1->getOperand(0), &Mask1[0]);
+        if (TLI.isShuffleMaskLegal(Mask2, VT))
+          return DAG.getVectorShuffle(VT, SDLoc(N), N1->getOperand(0),
+                                      N0->getOperand(0), &Mask2[0]);
+      }
+    }
   }
 
   // fold (or x, undef) -> -1
Index: test/CodeGen/X86/combine-or.ll
===================================================================
--- test/CodeGen/X86/combine-or.ll	(revision 0)
+++ test/CodeGen/X86/combine-or.ll	(working copy)
@@ -0,0 +1,267 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s
+
+
+; Verify that each of the following test cases is folded into a single
+; instruction which performs a blend operation.
+
+define <2 x i64> @test1(<2 x i64> %a, <2 x i64> %b) {
+  %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
+  %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 1>
+  %or = or <2 x i64> %shuf1, %shuf2
+  ret <2 x i64> %or
+}
+; CHECK-LABEL: test1
+; CHECK-NOT: xorps
+; CHECK: movsd
+; CHECK-NOT: orps
+; CHECK: ret
+
+
+define <4 x i32> @test2(<4 x i32> %a, <4 x i32> %b) {
+  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
+  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
+  %or = or <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %or
+}
+; CHECK-LABEL: test2
+; CHECK-NOT: xorps
+; CHECK: shufps
+; CHECK: ret
+
+
+define <2 x i64> @test3(<2 x i64> %a, <2 x i64> %b) {
+  %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 1>
+  %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
+  %or = or <2 x i64> %shuf1, %shuf2
+  ret <2 x i64> %or
+}
+; CHECK-LABEL: test3
+; CHECK-NOT: xorps
+; CHECK: movsd
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test4(<4 x i32> %a, <4 x i32> %b) {
+  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 4, i32 4>
+  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 1, i32 2, i32 3>
+  %or = or <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %or
+}
+; CHECK-LABEL: test4
+; CHECK-NOT: xorps
+; CHECK: movss
+; CHECK-NOT: orps
+; CHECK: ret
+
+
+define <4 x i32> @test5(<4 x i32> %a, <4 x i32> %b) {
+  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 1, i32 2, i32 3>
+  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 4, i32 4>
+  %or = or <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %or
+}
+; CHECK-LABEL: test5
+; CHECK-NOT: xorps
+; CHECK: movss
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test6(<4 x i32> %a, <4 x i32> %b) {
+  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
+  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
+  %or = or <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %or
+}
+; CHECK-LABEL: test6
+; CHECK-NOT: xorps
+; CHECK: shufps
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test7(<4 x i32> %a, <4 x i32> %b) {
+  %and1 = and <4 x i32> %a, <i32 -1, i32 -1, i32 0, i32 0>
+  %and2 = and <4 x i32> %b, <i32 0, i32 0, i32 -1, i32 -1>
+  %or = or <4 x i32> %and1, %and2
+  ret <4 x i32> %or
+}
+; CHECK-LABEL: test7
+; CHECK-NOT: xorps
+; CHECK: shufps
+; CHECK-NEXT: ret
+
+
+define <2 x i64> @test8(<2 x i64> %a, <2 x i64> %b) {
+  %and1 = and <2 x i64> %a, <i64 -1, i64 0>
+  %and2 = and <2 x i64> %b, <i64 0, i64 -1>
+  %or = or <2 x i64> %and1, %and2
+  ret <2 x i64> %or
+}
+; CHECK-LABEL: test8
+; CHECK-NOT: xorps
+; CHECK: movsd
+; CHECK-NOT: orps
+; CHECK: ret
+
+
+define <4 x i32> @test9(<4 x i32> %a, <4 x i32> %b) {
+  %and1 = and <4 x i32> %a, <i32 0, i32 0, i32 -1, i32 -1>
+  %and2 = and <4 x i32> %b, <i32 -1, i32 -1, i32 0, i32 0>
+  %or = or <4 x i32> %and1, %and2
+  ret <4 x i32> %or
+}
+; CHECK-LABEL: test9
+; CHECK-NOT: xorps
+; CHECK: shufps
+; CHECK: ret
+
+
+define <2 x i64> @test10(<2 x i64> %a, <2 x i64> %b) {
+  %and1 = and <2 x i64> %a, <i64 0, i64 -1>
+  %and2 = and <2 x i64> %b, <i64 -1, i64 0>
+  %or = or <2 x i64> %and1, %and2
+  ret <2 x i64> %or
+}
+; CHECK-LABEL: test10
+; CHECK-NOT: xorps
+; CHECK: movsd
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test11(<4 x i32> %a, <4 x i32> %b) {
+  %and1 = and <4 x i32> %a, <i32 -1, i32 0, i32 0, i32 0>
+  %and2 = and <4 x i32> %b, <i32 0, i32 -1, i32 -1, i32 -1>
+  %or = or <4 x i32> %and1, %and2
+  ret <4 x i32> %or
+}
+; CHECK-LABEL: test11
+; CHECK-NOT: xorps
+; CHECK: movss
+; CHECK-NOT: orps
+; CHECK: ret
+
+
+define <4 x i32> @test12(<4 x i32> %a, <4 x i32> %b) {
+  %and1 = and <4 x i32> %a, <i32 0, i32 -1, i32 -1, i32 -1>
+  %and2 = and <4 x i32> %b, <i32 -1, i32 0, i32 0, i32 0>
+  %or = or <4 x i32> %and1, %and2
+  ret <4 x i32> %or
+}
+; CHECK-LABEL: test12
+; CHECK-NOT: xorps
+; CHECK: movss
+; CHECK-NEXT: ret
+
+
+; Verify that the following test cases are folded into single shuffles.
+
+define <4 x i32> @test13(<4 x i32> %a, <4 x i32> %b) {
+  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 1, i32 1, i32 4, i32 4>
+  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
+  %or = or <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %or
+}
+; CHECK-LABEL: test13
+; CHECK-NOT: xorps
+; CHECK: shufps
+; CHECK-NEXT: ret
+
+
+define <2 x i64> @test14(<2 x i64> %a, <2 x i64> %b) {
+  %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
+  %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
+  %or = or <2 x i64> %shuf1, %shuf2
+  ret <2 x i64> %or
+}
+; CHECK-LABEL: test14
+; CHECK-NOT: pslldq
+; CHECK-NOT: por
+; CHECK: punpcklqdq
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test15(<4 x i32> %a, <4 x i32> %b) {
+  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 1>
+  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 2, i32 1, i32 4, i32 4>
+  %or = or <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %or
+}
+; CHECK-LABEL: test15
+; CHECK-NOT: xorps
+; CHECK: shufps
+; CHECK-NOT: shufps
+; CHECK-NOT: orps
+; CHECK: ret
+
+
+define <2 x i64> @test16(<2 x i64> %a, <2 x i64> %b) {
+  %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
+  %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
+  %or = or <2 x i64> %shuf1, %shuf2
+  ret <2 x i64> %or
+}
+; CHECK-LABEL: test16
+; CHECK-NOT: pslldq
+; CHECK-NOT: por
+; CHECK: punpcklqdq
+; CHECK: ret
+
+
+; Verify that the dag-combiner does not fold a OR of two shuffles into a single
+; shuffle instruction when the shuffle indexes are not compatible.
+
+define <4 x i32> @test17(<4 x i32> %a, <4 x i32> %b) {
+  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 2>
+  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
+  %or = or <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %or
+}
+; CHECK-LABEL: test17
+; CHECK: por
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test18(<4 x i32> %a, <4 x i32> %b) {
+  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 4>
+  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 4, i32 4>
+  %or = or <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %or
+}
+; CHECK-LABEL: test18
+; CHECK: orps
+; CHECK: ret
+
+
+define <4 x i32> @test19(<4 x i32> %a, <4 x i32> %b) {
+  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 3>
+  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 2, i32 2>
+  %or = or <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %or
+}
+; CHECK-LABEL: test19
+; CHECK: por
+; CHECK-NEXT: ret
+
+
+define <2 x i64> @test20(<2 x i64> %a, <2 x i64> %b) {
+  %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
+  %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
+  %or = or <2 x i64> %shuf1, %shuf2
+  ret <2 x i64> %or
+}
+; CHECK-LABEL: test20
+; CHECK-NOT: xorps
+; CHECK: orps
+; CHECK-NEXT: ret
+
+
+define <2 x i64> @test21(<2 x i64> %a, <2 x i64> %b) {
+  %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
+  %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
+  %or = or <2 x i64> %shuf1, %shuf2
+  ret <2 x i64> %or
+}
+; CHECK-LABEL: test21
+; CHECK: por
+; CHECK-NEXT: ret
+
+