[PATCH][DAGCombiner] Teach how to simply a xor/and/or of two shuffles with same mask.

Tue Mar 18 09:40:54 PDT 2014

Hi,

this patch teaches the DAGCombiner how to simplify xor/and/or nodes
according to the following rules:

 1)  (AND (shuf (A, C, Mask), shuf (B, C, Mask)) -> shuf (AND (A, B), C, Mask)
 2)  (OR  (shuf (A, C, Mask), shuf (B, C, Mask)) -> shuf (OR  (A, B), C, Mask)
 3)  (XOR (shuf (A, C, Mask), shuf (B, C, Mask)) -> shuf (XOR (A, B), V_0, Mask)

 4)  (AND (shuf (C, A, Mask), shuf (C, B, Mask)) -> shuf (C, AND (A, B), Mask)
 5)  (OR  (shuf (C, A, Mask), shuf (C, B, Mask)) -> shuf (C, OR  (A, B), Mask)
 6)  (XOR (shuf (C, A, Mask), shuf (C, B, Mask)) -> shuf (V_0, XOR (A, B), Mask)

Before this patch,
the DAGCombiner was only able to simplify a xor/and/or of shuffles
that implemented a swizzle operation. The rules originally implemented
were similar to rules 1,2 and 3, with the difference that 'C' was only
allowed to be ISD::UNDEF.

There is however a number of other cases where it would be profitable
to move the shuffle operation immediately after the binary xor/and/or
operation.
This patch tries to improve the situation teaching the DAGCombiner how
to simplify the code in those other cases.

One main difference with respect to the original implementation is
that before, DAGCombiner always moved the swizzle after the xor/and/or
operation regardless of how many uses each shuffle nodes had.

For example, the code below triggered the dag combine rule but - as a
side effect - it caused the introduction of an extra shuffle in the
code.

define <4 x i32> @foo(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0,
i32 2, i32 1, i32 3>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0,
i32 2, i32 1, i32 3>
  %and = and <4 x i32> %shuf1, %shuf2
  %add = add <4 x i32> %and, %shuf1
  %add2 = add <4 x i32> %add, %shuf2
  ret <4 x i32> %add2
}

Before (mcpu=corei7-avx):
   vpshufd $Mask, %xmm0, %xmm2
   vpshufd $Mask, %xmm1, %xmm3
   vpand   %xmm1, %xmm0,  %xmm0
   vpshufd $Mask, %xmm0, %xmm0
   vpaddd  %xmm2, %xmm0,  %xmm0
   vpaddd  %xmm3, %xmm0, %xmm0


With this patch, it is no longer worthwhile to perform the
transformation on the AND node; this is to avoid introducing extra
nodes in the dag.

After:
   vpshufd $Mask, %xmm0, %xmm0
   vpshufd $Mask, %xmm1, %xmm1
   vpand   %xmm1, %xmm0, %xmm2
   vpaddd  %xmm0, %xmm2, %xmm0
   vpaddd  %xmm1, %xmm0, %xmm0

Now we don't introduce an extra shuffle.

The new x86 test  'combine-vec-shuffle.ll' contains some examples of
functions that are optimized by these new rules.

Example:

define <4 x i32> @test1b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0,
i32 5, i32 2, i32 7>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0,
i32 5, i32 2, i32 7>
  %and = and <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %and
}

Before produced the sequence "vblendps+vblendps+vandps".
Now it produces the sequence "vandps+vblendps".

Please let me know if ok to submit.

Thanks,
Andrea Di Biagio
SN Systems - Sony Computer Entertainment.
-------------- next part --------------
Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================

--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp	(revision 204158)
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp	(working copy)
@@ -2518,35 +2518,66 @@
   // The type-legalizer generates this pattern when loading illegal
   // vector types from memory. In many cases this allows additional shuffle
   // optimizations.
-  if (N0.getOpcode() == ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG &&
-      N0.getOperand(1).getOpcode() == ISD::UNDEF &&
-      N1.getOperand(1).getOpcode() == ISD::UNDEF) {
+  // There are other cases where moving the shuffle after the xor/and/or
+  // is profitable even if shuffles don't perform a swizzle.
+  // If both shuffles use the same mask, and the first or second operand of
+  // each shuffle is the same, then moving the shuffle after the xor/and/or
+  // operation might still be profitable.
+  if (N0.getOpcode() == ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG) {
     ShuffleVectorSDNode *SVN0 = cast<ShuffleVectorSDNode>(N0);
     ShuffleVectorSDNode *SVN1 = cast<ShuffleVectorSDNode>(N1);
 
-    assert(N0.getOperand(0).getValueType() == N1.getOperand(1).getValueType() &&
+    assert(N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType() &&
            "Inputs to shuffles are not the same type");
-
-    unsigned NumElts = VT.getVectorNumElements();
-
+    
     // Check that both shuffles use the same mask. The masks are known to be of
     // the same length because the result vector type is the same.
-    bool SameMask = true;
-    for (unsigned i = 0; i != NumElts; ++i) {
-      int Idx0 = SVN0->getMaskElt(i);
-      int Idx1 = SVN1->getMaskElt(i);
-      if (Idx0 != Idx1) {
-        SameMask = false;
-        break;
+    // Check also that shuffles have only one use to avoid introducing extra
+    // instructions.
+    if (SVN0->hasOneUse() && SVN1->hasOneUse() &&
+        SVN0->getMask().equals(SVN1->getMask())) {
+      SDValue ShOp = N0->getOperand(1);
+
+      // Don't try to fold this node if it requires introducing a
+      // build vector of all zeros that might be illegal at this stage.
+      if (N->getOpcode() == ISD::XOR && ShOp.getOpcode() != ISD::UNDEF) {
+        if (!LegalTypes)
+          ShOp = DAG.getConstant(0, VT);
+        else
+          ShOp = SDValue();
       }
-    }
 
-    if (SameMask) {
-      SDValue Op = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
-                               N0.getOperand(0), N1.getOperand(0));
-      AddToWorkList(Op.getNode());
-      return DAG.getVectorShuffle(VT, SDLoc(N), Op,
-                                  DAG.getUNDEF(VT), &SVN0->getMask()[0]);
+      // (AND (shuf (A, C), shuf (B, C)) -> shuf (AND (A, B), C)
+      // (OR  (shuf (A, C), shuf (B, C)) -> shuf (OR  (A, B), C)
+      // (XOR (shuf (A, C), shuf (B, C)) -> shuf (XOR (A, B), V_0)
+      if (N0.getOperand(1) == N1.getOperand(1) && ShOp.getNode()) {
+        SDValue NewNode = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
+                                      N0->getOperand(0), N1->getOperand(0));
+        AddToWorkList(NewNode.getNode());
+        return DAG.getVectorShuffle(VT, SDLoc(N), NewNode, ShOp,
+                                    &SVN0->getMask()[0]);
+      }
+
+      // Don't try to fold this node if it requires introducing a
+      // build vector of all zeros that might be illegal at this stage.
+      ShOp = N0->getOperand(0);
+      if (N->getOpcode() == ISD::XOR && ShOp.getOpcode() != ISD::UNDEF) {
+        if (!LegalTypes)
+          ShOp = DAG.getConstant(0, VT);
+        else
+          ShOp = SDValue();
+      }
+
+      // (AND (shuf (C, A), shuf (C, B)) -> shuf (C, AND (A, B))
+      // (OR  (shuf (C, A), shuf (C, B)) -> shuf (C, OR  (A, B))
+      // (XOR (shuf (C, A), shuf (C, B)) -> shuf (V_0, XOR (A, B))
+      if (N0->getOperand(0) == N1->getOperand(0) && ShOp.getNode()) {
+        SDValue NewNode = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
+                                      N0->getOperand(1), N1->getOperand(1));
+        AddToWorkList(NewNode.getNode());
+        return DAG.getVectorShuffle(VT, SDLoc(N), ShOp, NewNode,
+                                    &SVN0->getMask()[0]);
+      }
     }
   }
 
Index: test/CodeGen/X86/combine-vec-shuffle.ll
===================================================================
--- test/CodeGen/X86/combine-vec-shuffle.ll	(revision 0)
+++ test/CodeGen/X86/combine-vec-shuffle.ll	(working copy)
@@ -0,0 +1,253 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s
+
+; Verify that the DAGCombiner correctly folds according to the following rules:
+
+; fold (AND (shuf (A, C), shuf (B, C)) -> shuf (AND (A, B), C)
+; fold (OR  (shuf (A, C), shuf (B, C)) -> shuf (OR  (A, B), C)
+; fold (XOR (shuf (A, C), shuf (B, C)) -> shuf (XOR (A, B), V_0)
+
+; fold (AND (shuf (C, A), shuf (C, B)) -> shuf (C, AND (A, B))
+; fold (OR  (shuf (C, A), shuf (C, B)) -> shuf (C, OR  (A, B))
+; fold (XOR (shuf (C, A), shuf (C, B)) -> shuf (V_0, XOR (A, B))
+
+
+
+define <4 x i32> @test1(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
+  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
+  %and = and <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %and
+}
+; CHECK-LABEL: test1
+; CHECK-NOT: pshufd
+; CHECK: pand
+; CHECK-NEXT: pshufd
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test2(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
+  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
+  %or = or <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %or
+}
+; CHECK-LABEL: test2
+; CHECK-NOT: pshufd
+; CHECK: por
+; CHECK-NEXT: pshufd
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
+  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
+  %xor = xor <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %xor
+}
+; CHECK-LABEL: test3
+; CHECK-NOT: pshufd
+; CHECK: pxor
+; CHECK-NEXT: pshufd
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test4(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
+  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
+  %and = and <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %and
+}
+; CHECK-LABEL: test4
+; CHECK-NOT: pshufd
+; CHECK: pand
+; CHECK-NEXT: pshufd
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test5(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
+  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
+  %or = or <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %or
+}
+; CHECK-LABEL: test5
+; CHECK-NOT: pshufd
+; CHECK: por
+; CHECK-NEXT: pshufd
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test6(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
+  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
+  %xor = xor <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %xor
+}
+; CHECK-LABEL: test6
+; CHECK-NOT: pshufd
+; CHECK: pxor
+; CHECK-NEXT: pshufd
+; CHECK-NEXT: ret
+
+
+; Verify that DAGCombiner moves the shuffle after the xor/and/or even if shuffles
+; are not performing a swizzle operations.
+
+define <4 x i32> @test1b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+  %and = and <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %and
+}
+; CHECK-LABEL: test1b
+; CHECK-NOT: blendps
+; CHECK: andps
+; CHECK-NEXT: blendps
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test2b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+  %or = or <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %or
+}
+; CHECK-LABEL: test2b
+; CHECK-NOT: blendps
+; CHECK: orps
+; CHECK-NEXT: blendps
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test3b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+  %xor = xor <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %xor
+}
+; CHECK-LABEL: test3b
+; CHECK-NOT: blendps
+; CHECK: xorps
+; CHECK-NEXT: xorps
+; CHECK-NEXT: blendps
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test4b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+  %and = and <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %and
+}
+; CHECK-LABEL: test4b
+; CHECK-NOT: blendps
+; CHECK: andps
+; CHECK-NEXT: blendps
+; CHECK: ret
+
+
+define <4 x i32> @test5b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+  %or = or <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %or
+}
+; CHECK-LABEL: test5b
+; CHECK-NOT: blendps
+; CHECK: orps
+; CHECK-NEXT: blendps
+; CHECK: ret
+
+
+define <4 x i32> @test6b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+  %xor = xor <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %xor
+}
+; CHECK-LABEL: test6b
+; CHECK-NOT: blendps
+; CHECK: xorps
+; CHECK-NEXT: xorps
+; CHECK-NEXT: blendps
+; CHECK: ret
+
+define <4 x i32> @test1c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+  %and = and <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %and
+}
+; CHECK-LABEL: test1c
+; CHECK-NOT: shufps
+; CHECK: andps
+; CHECK-NEXT: shufps
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test2c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+  %or = or <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %or
+}
+; CHECK-LABEL: test2c
+; CHECK-NOT: shufps
+; CHECK: orps
+; CHECK-NEXT: shufps
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test3c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+  %xor = xor <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %xor
+}
+; CHECK-LABEL: test3c
+; CHECK-NOT: shufps
+; CHECK: xorps
+; CHECK-NEXT: xorps
+; CHECK-NEXT: shufps
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test4c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+  %and = and <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %and
+}
+; CHECK-LABEL: test4c
+; CHECK-NOT: shufps
+; CHECK: andps
+; CHECK-NEXT: shufps
+; CHECK: ret
+
+
+define <4 x i32> @test5c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+  %or = or <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %or
+}
+; CHECK-LABEL: test5c
+; CHECK-NOT: shufps
+; CHECK: orps
+; CHECK-NEXT: shufps
+; CHECK: ret
+
+
+define <4 x i32> @test6c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+  %xor = xor <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %xor
+}
+; CHECK-LABEL: test6c
+; CHECK-NOT: shufps
+; CHECK: xorps
+; CHECK-NEXT: xorps
+; CHECK-NEXT: shufps
+; CHECK: ret
+
Index: test/CodeGen/X86/combine-or.ll
===================================================================
--- test/CodeGen/X86/combine-or.ll	(revision 204158)
+++ test/CodeGen/X86/combine-or.ll	(working copy)
@@ -251,6 +251,7 @@
 ; CHECK-LABEL: test20
 ; CHECK-NOT: xorps
 ; CHECK: orps
+; CHECK-NEXT: movq
 ; CHECK-NEXT: ret
 
 
@@ -262,6 +263,7 @@
 }
 ; CHECK-LABEL: test21
 ; CHECK: por
+; CHECK-NEXT: pslldq
 ; CHECK-NEXT: ret