[llvm] r261023 - [X86] Don't turn (c?-v:v) into (c?-v:0) by blindly using PSIGN.

Ahmed Bougacha via llvm-commits llvm-commits at lists.llvm.org
Tue Feb 16 14:14:04 PST 2016


Author: ab
Date: Tue Feb 16 16:14:03 2016
New Revision: 261023

URL: http://llvm.org/viewvc/llvm-project?rev=261023&view=rev
Log:
[X86] Don't turn (c?-v:v) into (c?-v:0) by blindly using PSIGN.

Currently, we sometimes miscompile this vector pattern:
    (c ? -v : v)
Because "c" is <4 x i1> (lowered as a vector mask), we lower it to:
    (~c & v) | (c & -v)

When we have SSSE3, we incorrectly lower that to PSIGN, which does:
    (c < 0 ? -v : c > 0 ? v : 0)
in other words, when c is either all-ones or all-zero:
    (c ? -v : 0)
While this is an old bug, it rarely triggers because the PSIGN combine
is too sensitive to operand order. This will be improved separately.

Note that the PSIGN tests are also incorrect. Consider:
    %b.lobit = ashr <4 x i32> %b, <i32 31, i32 31, i32 31, i32 31>
    %sub = sub nsw <4 x i32> zeroinitializer, %a
    %0 = xor <4 x i32> %b.lobit, <i32 -1, i32 -1, i32 -1, i32 -1>
    %1 = and <4 x i32> %a, %0
    %2 = and <4 x i32> %b.lobit, %sub
    %cond = or <4 x i32> %1, %2
    ret <4 x i32> %cond
if %b is zero, this constant-folds to:
    %b.lobit = <4 x i32> zeroinitializer
    %sub = sub nsw <4 x i32> zeroinitializer, %a
    %0 = <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
    %1 = <4 x i32> %a
    %2 = <4 x i32> zeroinitializer
    %cond = or <4 x i32> %a, zeroinitializer
    ret <4 x i32> %a
whereas we currently generate:
    psignd %xmm1, %xmm0
    retq
which returns 0, as %xmm1 is 0.

Instead, use a pure logic sequence, as described in:
https://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate

Fixes PR26110.

Differential Revision: http://reviews.llvm.org/D17181

Modified:
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/test/CodeGen/X86/vector-blend.ll

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=261023&r1=261022&r2=261023&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Tue Feb 16 16:14:03 2016
@@ -26394,7 +26394,7 @@ static SDValue PerformAndCombine(SDNode
 // As a special case, try to fold:
 //   (or (and (m, (sub 0, x)), (pandn m, x)))
 // into:
-//   (psign m, x)
+//   (sub (xor X, M), M)
 static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
                                             const X86Subtarget &Subtarget) {
   assert(N->getOpcode() == ISD::OR);
@@ -26403,9 +26403,9 @@ static SDValue combineLogicBlendIntoPBLE
   SDValue N1 = N->getOperand(1);
   EVT VT = N->getValueType(0);
 
-  if (!((VT == MVT::v2i64 && Subtarget.hasSSSE3()) ||
-        (VT == MVT::v4i64 && Subtarget.hasInt256())))
+  if (!((VT == MVT::v2i64) || (VT == MVT::v4i64 && Subtarget.hasInt256())))
     return SDValue();
+  assert(Subtarget.hasSSE2() && "Unexpected i64 vector without SSE2!");
 
   // Canonicalize pandn to RHS
   if (N0.getOpcode() == X86ISD::ANDNP)
@@ -26454,16 +26454,29 @@ static SDValue combineLogicBlendIntoPBLE
 
   SDLoc DL(N);
 
-  // Now we know we at least have a plendvb with the mask val.  See if
-  // we can form a psignb/w/d.
-  // psign = x.type == y.type == mask.type && y = sub(0, x);
+  // Try to match:
+  //   (or (and (M, (sub 0, X)), (pandn M, X)))
+  // which is a special case of vselect:
+  //   (vselect M, (sub 0, X), X)
+  // Per:
+  // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
+  // We know that, if fNegate is 0 or 1:
+  //   (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
+  //
+  // Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
+  //   ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
+  //   ( M      ? -X : X) == ((X ^   M     ) + (M & 1))
+  // This lets us transform our vselect to:
+  //   (add (xor X, M), (and M, 1))
+  // And further to:
+  //   (sub (xor X, M), M)
   if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X &&
       ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) &&
       X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
-    assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
-           "Unsupported VT for PSIGN");
-    Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask.getOperand(0));
-    return DAG.getBitcast(VT, Mask);
+    assert(EltBits == 8 || EltBits == 16 || EltBits == 32);
+    return DAG.getBitcast(
+        VT, DAG.getNode(ISD::SUB, DL, MaskVT,
+                        DAG.getNode(ISD::XOR, DL, MaskVT, X, Mask), Mask));
   }
 
   // PBLENDVB is only available on SSE 4.1.

Modified: llvm/trunk/test/CodeGen/X86/vector-blend.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-blend.ll?rev=261023&r1=261022&r2=261023&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-blend.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-blend.ll Tue Feb 16 16:14:03 2016
@@ -908,27 +908,29 @@ define <4 x i32> @blend_neg_logic_v4i32(
 ; SSE2-LABEL: blend_neg_logic_v4i32:
 ; SSE2:       # BB#0: # %entry
 ; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pxor %xmm2, %xmm2
-; SSE2-NEXT:    psubd %xmm0, %xmm2
-; SSE2-NEXT:    pand %xmm1, %xmm2
-; SSE2-NEXT:    pandn %xmm0, %xmm1
-; SSE2-NEXT:    por %xmm1, %xmm2
-; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm0
+; SSE2-NEXT:    psubd %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: blend_neg_logic_v4i32:
 ; SSSE3:       # BB#0: # %entry
-; SSSE3-NEXT:    psignd %xmm1, %xmm0
+; SSSE3-NEXT:    psrad $31, %xmm1
+; SSSE3-NEXT:    pxor %xmm1, %xmm0
+; SSSE3-NEXT:    psubd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: blend_neg_logic_v4i32:
 ; SSE41:       # BB#0: # %entry
-; SSE41-NEXT:    psignd %xmm1, %xmm0
+; SSE41-NEXT:    psrad $31, %xmm1
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    psubd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: blend_neg_logic_v4i32:
 ; AVX:       # BB#0: # %entry
-; AVX-NEXT:    vpsignd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrad $31, %xmm1, %xmm1
+; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 entry:
   %b.lobit = ashr <4 x i32> %b, <i32 31, i32 31, i32 31, i32 31>
@@ -943,32 +945,32 @@ entry:
 define <8 x i32> @blend_neg_logic_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; SSE2-LABEL: blend_neg_logic_v8i32:
 ; SSE2:       # BB#0: # %entry
-; SSE2-NEXT:    psrad $31, %xmm2
 ; SSE2-NEXT:    psrad $31, %xmm3
-; SSE2-NEXT:    pxor %xmm4, %xmm4
-; SSE2-NEXT:    pxor %xmm5, %xmm5
-; SSE2-NEXT:    psubd %xmm0, %xmm5
-; SSE2-NEXT:    psubd %xmm1, %xmm4
-; SSE2-NEXT:    pand %xmm3, %xmm4
-; SSE2-NEXT:    pandn %xmm1, %xmm3
-; SSE2-NEXT:    pand %xmm2, %xmm5
-; SSE2-NEXT:    pandn %xmm0, %xmm2
-; SSE2-NEXT:    por %xmm2, %xmm5
-; SSE2-NEXT:    por %xmm3, %xmm4
-; SSE2-NEXT:    movdqa %xmm5, %xmm0
-; SSE2-NEXT:    movdqa %xmm4, %xmm1
+; SSE2-NEXT:    psrad $31, %xmm2
+; SSE2-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NEXT:    psubd %xmm2, %xmm0
+; SSE2-NEXT:    pxor %xmm3, %xmm1
+; SSE2-NEXT:    psubd %xmm3, %xmm1
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: blend_neg_logic_v8i32:
 ; SSSE3:       # BB#0: # %entry
-; SSSE3-NEXT:    psignd %xmm2, %xmm0
-; SSSE3-NEXT:    psignd %xmm3, %xmm1
+; SSSE3-NEXT:    psrad $31, %xmm3
+; SSSE3-NEXT:    psrad $31, %xmm2
+; SSSE3-NEXT:    pxor %xmm2, %xmm0
+; SSSE3-NEXT:    psubd %xmm2, %xmm0
+; SSSE3-NEXT:    pxor %xmm3, %xmm1
+; SSSE3-NEXT:    psubd %xmm3, %xmm1
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: blend_neg_logic_v8i32:
 ; SSE41:       # BB#0: # %entry
-; SSE41-NEXT:    psignd %xmm2, %xmm0
-; SSE41-NEXT:    psignd %xmm3, %xmm1
+; SSE41-NEXT:    psrad $31, %xmm3
+; SSE41-NEXT:    psrad $31, %xmm2
+; SSE41-NEXT:    pxor %xmm2, %xmm0
+; SSE41-NEXT:    psubd %xmm2, %xmm0
+; SSE41-NEXT:    pxor %xmm3, %xmm1
+; SSE41-NEXT:    psubd %xmm3, %xmm1
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: blend_neg_logic_v8i32:
@@ -989,7 +991,9 @@ define <8 x i32> @blend_neg_logic_v8i32(
 ;
 ; AVX2-LABEL: blend_neg_logic_v8i32:
 ; AVX2:       # BB#0: # %entry
-; AVX2-NEXT:    vpsignd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrad $31, %ymm1, %ymm1
+; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 entry:
   %b.lobit = ashr <8 x i32> %b, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>




More information about the llvm-commits mailing list