[llvm] r348534 - [DAGCombiner] don't group bswap with casts in logic hoisting fold

Thu Dec 6 14:10:44 PST 2018

Author: spatel
Date: Thu Dec  6 14:10:44 2018
New Revision: 348534

URL: http://llvm.org/viewvc/llvm-project?rev=348534&view=rev
Log:
[DAGCombiner] don't group bswap with casts in logic hoisting fold

This was probably organized as it was because bswap is a unary op.
But that's where the similarity to the other opcodes ends. We should
not limit this transform to scalars, and we should not try it if
either input has other uses. This is another step towards trying to
clean this whole function up to prevent it from causing infinite loops
and memory explosions. 

Earlier commits in this series:
rL348501
rL348508
rL348518

Modified:
    llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
    llvm/trunk/test/CodeGen/X86/bswap-vector.ll

Modified: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp?rev=348534&r1=348533&r2=348534&view=diff
==============================================================================

--- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp Thu Dec  6 14:10:44 2018
@@ -3718,13 +3718,13 @@ SDValue DAGCombiner::hoistLogicOpWithSam
   // FIXME: We should check number of uses of the operands to not increase
   //        the instruction count for all transforms.
 
+  // Handle size-changing casts.
   EVT Op0VT = N0.getOperand(0).getValueType();
   switch (HandOpcode) {
     case ISD::ANY_EXTEND:
     case ISD::TRUNCATE:
     case ISD::ZERO_EXTEND:
     case ISD::SIGN_EXTEND:
-    case ISD::BSWAP:
       // If both operands have other uses, this transform would create extra
       // instructions without eliminating anything.
       if (!N0.hasOneUse() && !N1.hasOneUse())
@@ -3732,7 +3732,7 @@ SDValue DAGCombiner::hoistLogicOpWithSam
       // We need matching integer source types.
       // Do not hoist logic op inside of a vector extend, since it may combine
       // into a vsetcc.
-      // TODO: Should the vector check apply to truncate and bswap though?
+      // TODO: Should the vector check apply to truncate though?
       if (VT.isVector() || Op0VT != N1.getOperand(0).getValueType())
         return SDValue();
       // Don't create an illegal op during or after legalization.
@@ -3757,10 +3757,8 @@ SDValue DAGCombiner::hoistLogicOpWithSam
       return DAG.getNode(HandOpcode, SDLoc(N), VT, Logic);
   }
 
-  // For each of OP in SHL/SRL/SRA/AND...
-  //   fold (and (OP x, z), (OP y, z)) -> (OP (and x, y), z)
-  //   fold (or  (OP x, z), (OP y, z)) -> (OP (or  x, y), z)
-  //   fold (xor (OP x, z), (OP y, z)) -> (OP (xor x, y), z)
+  // For binops SHL/SRL/SRA/AND:
+  //   logic_op (OP x, z), (OP y, z) --> OP (logic_op x, y), z
   if ((HandOpcode == ISD::SHL || HandOpcode == ISD::SRL ||
        HandOpcode == ISD::SRA || HandOpcode == ISD::AND) &&
       N0.getOperand(1) == N1.getOperand(1)) {
@@ -3773,6 +3771,17 @@ SDValue DAGCombiner::hoistLogicOpWithSam
     return DAG.getNode(HandOpcode, SDLoc(N), VT, Logic, N0.getOperand(1));
   }
 
+  // Unary ops: logic_op (bswap x), (bswap y) --> bswap (logic_op x, y)
+  if (HandOpcode == ISD::BSWAP) {
+    // If either operand has other uses, this transform is not an improvement.
+    if (!N0.hasOneUse() || !N1.hasOneUse())
+      return SDValue();
+    SDValue Logic = DAG.getNode(LogicOpcode, SDLoc(N0), Op0VT,
+                                N0.getOperand(0), N1.getOperand(0));
+    AddToWorklist(Logic.getNode());
+    return DAG.getNode(HandOpcode, SDLoc(N), VT, Logic);
+  }
+
   // Simplify xor/and/or (bitcast(A), bitcast(B)) -> bitcast(op (A,B))
   // Only perform this optimization up until type legalization, before
   // LegalizeVectorOprs. LegalizeVectorOprs promotes vector operations by

Modified: llvm/trunk/test/CodeGen/X86/bswap-vector.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/bswap-vector.ll?rev=348534&r1=348533&r2=348534&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/bswap-vector.ll (original)
+++ llvm/trunk/test/CodeGen/X86/bswap-vector.ll Thu Dec  6 14:10:44 2018
@@ -76,48 +76,34 @@ define <4 x i32> @test2(<4 x i32> %v) {
 define <4 x i32> @or_bswap(<4 x i32> %x, <4 x i32> %y, <4 x i32>* %p1, <4 x i32>* %p2) {
 ; CHECK-NOSSSE3-LABEL: or_bswap:
 ; CHECK-NOSSSE3:       # %bb.0:
-; CHECK-NOSSSE3-NEXT:    pxor %xmm2, %xmm2
-; CHECK-NOSSSE3-NEXT:    movdqa %xmm0, %xmm3
-; CHECK-NOSSSE3-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
-; CHECK-NOSSSE3-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
-; CHECK-NOSSSE3-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
-; CHECK-NOSSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; CHECK-NOSSSE3-NEXT:    por %xmm1, %xmm0
+; CHECK-NOSSSE3-NEXT:    pxor %xmm1, %xmm1
+; CHECK-NOSSSE3-NEXT:    movdqa %xmm0, %xmm2
+; CHECK-NOSSSE3-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; CHECK-NOSSSE3-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
+; CHECK-NOSSSE3-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
+; CHECK-NOSSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; CHECK-NOSSSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
-; CHECK-NOSSSE3-NEXT:    pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,7,6,5,4]
-; CHECK-NOSSSE3-NEXT:    packuswb %xmm3, %xmm4
-; CHECK-NOSSSE3-NEXT:    movdqa %xmm1, %xmm0
-; CHECK-NOSSSE3-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
-; CHECK-NOSSSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
-; CHECK-NOSSSE3-NEXT:    pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,7,6,5,4]
-; CHECK-NOSSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; CHECK-NOSSSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[3,2,1,0,4,5,6,7]
 ; CHECK-NOSSSE3-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
-; CHECK-NOSSSE3-NEXT:    packuswb %xmm3, %xmm0
-; CHECK-NOSSSE3-NEXT:    por %xmm4, %xmm0
+; CHECK-NOSSSE3-NEXT:    packuswb %xmm2, %xmm0
 ; CHECK-NOSSSE3-NEXT:    retq
 ;
 ; CHECK-SSSE3-LABEL: or_bswap:
 ; CHECK-SSSE3:       # %bb.0:
-; CHECK-SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
-; CHECK-SSSE3-NEXT:    pshufb %xmm2, %xmm0
-; CHECK-SSSE3-NEXT:    pshufb %xmm2, %xmm1
 ; CHECK-SSSE3-NEXT:    por %xmm1, %xmm0
+; CHECK-SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
 ; CHECK-SSSE3-NEXT:    retq
 ;
 ; CHECK-AVX-LABEL: or_bswap:
 ; CHECK-AVX:       # %bb.0:
-; CHECK-AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
-; CHECK-AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; CHECK-AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; CHECK-AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
 ; CHECK-AVX-NEXT:    retq
 ;
 ; CHECK-WIDE-AVX-LABEL: or_bswap:
 ; CHECK-WIDE-AVX:       # %bb.0:
-; CHECK-WIDE-AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
-; CHECK-WIDE-AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; CHECK-WIDE-AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; CHECK-WIDE-AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; CHECK-WIDE-AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
 ; CHECK-WIDE-AVX-NEXT:    retq
   %xt = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %x)
   %yt = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %y)