[llvm] 4fc1fc4 - [DAGCombiner] add fold for vselect based on mask of signbit

Sanjay Patel via llvm-commits llvm-commits at lists.llvm.org
Fri Nov 5 07:08:37 PDT 2021


Author: Sanjay Patel
Date: 2021-11-05T10:06:16-04:00
New Revision: 4fc1fc4005f7818e1f8a795ca15ca8f8c03e426a

URL: https://github.com/llvm/llvm-project/commit/4fc1fc4005f7818e1f8a795ca15ca8f8c03e426a
DIFF: https://github.com/llvm/llvm-project/commit/4fc1fc4005f7818e1f8a795ca15ca8f8c03e426a.diff

LOG: [DAGCombiner] add fold for vselect based on mask of signbit

(X s< 0) ? Y : 0 --> (X s>> BW-1) & Y

We canonicalize to the icmp+select form in IR, and we already have this fold
for scalar select in SDAG, so I think it's an oversight that we don't have
the fold for vectors. It seems neutral for AArch64 and saves some instructions
on x86.
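
For reference, here is a standalone C++ sketch of the identity (hypothetical
helper names, not part of the commit; it assumes arithmetic right shift of a
negative value sign-extends, which holds on the targets where this fold fires):

  #include <cassert>
  #include <cstdint>

  // (X s< 0) ? Y : 0
  static int32_t selectForm(int32_t x, int32_t y) { return x < 0 ? y : 0; }

  // (X s>> BW-1) & Y, with BW = 32: the arithmetic shift splats the sign
  // bit, producing an all-ones mask for negative x and zero otherwise.
  static int32_t signbitMaskForm(int32_t x, int32_t y) {
    return (x >> 31) & y;
  }

  int main() {
    for (int32_t x : {INT32_MIN, -7, -1, 0, 1, INT32_MAX})
      for (int32_t y : {-3, 0, 123})
        assert(selectForm(x, y) == signbitMaskForm(x, y));
  }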

Whether we should also have the sibling folds for the inverse condition or
all-ones true value may depend on target-specific factors such as whether
there's an "and-not" instruction.
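
Extending the same sketch, those sibling folds would look like this (again
hypothetical names; neither transform is added by this commit):

  // Inverse condition: (X s>= 0) ? Y : 0 --> ~(X s>> BW-1) & Y
  // The and-not maps to one instruction where available (x86 BMI andn,
  // AArch64 bic).
  static int32_t inverseCond(int32_t x, int32_t y) { return x >= 0 ? y : 0; }
  static int32_t inverseCondMask(int32_t x, int32_t y) {
    return ~(x >> 31) & y;
  }

  // All-ones true value: (X s< 0) ? -1 : Y --> (X s>> BW-1) | Y
  static int32_t allOnesTrue(int32_t x, int32_t y) { return x < 0 ? -1 : y; }
  static int32_t allOnesTrueMask(int32_t x, int32_t y) {
    return (x >> 31) | y;
  }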

Differential Revision: https://reviews.llvm.org/D113212

Added: 
    

Modified: 
    llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
    llvm/test/CodeGen/AArch64/vselect-constants.ll
    llvm/test/CodeGen/Thumb2/mve-vselect-constants.ll
    llvm/test/CodeGen/X86/avx512-logic.ll
    llvm/test/CodeGen/X86/avx512vl-logic.ll
    llvm/test/CodeGen/X86/vselect-zero.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index d1114a9802e3..bc48ea279cb4 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -9544,6 +9544,31 @@ static SDValue foldBoolSelectToLogic(SDNode *N, SelectionDAG &DAG) {
   return SDValue();
 }
 
+static SDValue foldVSelectToSignBitSplatMask(SDNode *N, SelectionDAG &DAG) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  SDValue N2 = N->getOperand(2);
+  EVT VT = N->getValueType(0);
+  if (N0.getOpcode() != ISD::SETCC || !N0.hasOneUse() || !isNullOrNullSplat(N2))
+    return SDValue();
+
+  SDValue Cond0 = N0.getOperand(0);
+  SDValue Cond1 = N0.getOperand(1);
+  ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
+  if (VT != Cond0.getValueType())
+    return SDValue();
+
+  // (Cond0 s< 0) ? N1 : 0 --> (Cond0 s>> BW-1) & N1
+  if (CC == ISD::SETLT && isNullOrNullSplat(Cond1)) {
+    SDLoc DL(N);
+    SDValue ShiftAmt = DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT);
+    SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Cond0, ShiftAmt);
+    return DAG.getNode(ISD::AND, DL, VT, Sra, N1);
+  }
+
+  return SDValue();
+}
+
 SDValue DAGCombiner::visitSELECT(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
@@ -10234,6 +10259,10 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) {
   if (SDValue V = foldVSelectOfConstants(N))
     return V;
 
+  if (hasOperation(ISD::SRA, VT))
+    if (SDValue V = foldVSelectToSignBitSplatMask(N, DAG))
+      return V;
+
   return SDValue();
 }
 

diff --git a/llvm/test/CodeGen/AArch64/vselect-constants.ll b/llvm/test/CodeGen/AArch64/vselect-constants.ll
index 992dd17ff444..130385d1bf18 100644
--- a/llvm/test/CodeGen/AArch64/vselect-constants.ll
+++ b/llvm/test/CodeGen/AArch64/vselect-constants.ll
@@ -196,8 +196,8 @@ define <4 x i32> @cmp_sel_0_or_1_vec(<4 x i32> %x, <4 x i32> %y) {
 define <16 x i8> @signbit_mask_v16i8(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK-LABEL: signbit_mask_v16i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    cmlt v0.16b, v0.16b, #0
-; CHECK-NEXT:    and v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    sshr v0.16b, v0.16b, #7
+; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %cond = icmp slt <16 x i8> %a, zeroinitializer
   %r = select <16 x i1> %cond, <16 x i8> %b, <16 x i8> zeroinitializer
@@ -207,8 +207,8 @@ define <16 x i8> @signbit_mask_v16i8(<16 x i8> %a, <16 x i8> %b) {
 define <8 x i16> @signbit_mask_v8i16(<8 x i16> %a, <8 x i16> %b) {
 ; CHECK-LABEL: signbit_mask_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    cmlt v0.8h, v0.8h, #0
-; CHECK-NEXT:    and v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    sshr v0.8h, v0.8h, #15
+; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %cond = icmp slt <8 x i16> %a, zeroinitializer
   %r = select <8 x i1> %cond, <8 x i16> %b, <8 x i16> zeroinitializer
@@ -218,8 +218,8 @@ define <8 x i16> @signbit_mask_v8i16(<8 x i16> %a, <8 x i16> %b) {
 define <4 x i32> @signbit_mask_v4i32(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: signbit_mask_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    cmlt v0.4s, v0.4s, #0
-; CHECK-NEXT:    and v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    sshr v0.4s, v0.4s, #31
+; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %cond = icmp slt <4 x i32> %a, zeroinitializer
   %r = select <4 x i1> %cond, <4 x i32> %b, <4 x i32> zeroinitializer
@@ -229,8 +229,8 @@ define <4 x i32> @signbit_mask_v4i32(<4 x i32> %a, <4 x i32> %b) {
 define <2 x i64> @signbit_mask_v2i64(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK-LABEL: signbit_mask_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    cmlt v0.2d, v0.2d, #0
-; CHECK-NEXT:    and v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    sshr v0.2d, v0.2d, #63
+; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %cond = icmp slt <2 x i64> %a, zeroinitializer
   %r = select <2 x i1> %cond, <2 x i64> %b, <2 x i64> zeroinitializer

diff --git a/llvm/test/CodeGen/Thumb2/mve-vselect-constants.ll b/llvm/test/CodeGen/Thumb2/mve-vselect-constants.ll
index 78d5494b3ce2..4f2acda375fe 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vselect-constants.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vselect-constants.ll
@@ -137,9 +137,8 @@ define arm_aapcs_vfpcc <4 x i32> @cmp_sel_0_or_1_vec(<4 x i32> %x, <4 x i32> %y)
 define arm_aapcs_vfpcc <16 x i8> @signbit_mask_v16i8(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK-LABEL: signbit_mask_v16i8:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    vmov.i32 q2, #0x0
-; CHECK-NEXT:    vcmp.s8 lt, q0, zr
-; CHECK-NEXT:    vpsel q0, q1, q2
+; CHECK-NEXT:    vshr.s8 q0, q0, #7
+; CHECK-NEXT:    vand q0, q0, q1
 ; CHECK-NEXT:    bx lr
   %cond = icmp slt <16 x i8> %a, zeroinitializer
   %r = select <16 x i1> %cond, <16 x i8> %b, <16 x i8> zeroinitializer
@@ -149,9 +148,8 @@ define arm_aapcs_vfpcc <16 x i8> @signbit_mask_v16i8(<16 x i8> %a, <16 x i8> %b)
 define arm_aapcs_vfpcc <8 x i16> @signbit_mask_v8i16(<8 x i16> %a, <8 x i16> %b) {
 ; CHECK-LABEL: signbit_mask_v8i16:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    vmov.i32 q2, #0x0
-; CHECK-NEXT:    vcmp.s16 lt, q0, zr
-; CHECK-NEXT:    vpsel q0, q1, q2
+; CHECK-NEXT:    vshr.s16 q0, q0, #15
+; CHECK-NEXT:    vand q0, q0, q1
 ; CHECK-NEXT:    bx lr
   %cond = icmp slt <8 x i16> %a, zeroinitializer
   %r = select <8 x i1> %cond, <8 x i16> %b, <8 x i16> zeroinitializer
@@ -161,9 +159,8 @@ define arm_aapcs_vfpcc <8 x i16> @signbit_mask_v8i16(<8 x i16> %a, <8 x i16> %b)
 define arm_aapcs_vfpcc <4 x i32> @signbit_mask_v4i32(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: signbit_mask_v4i32:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    vmov.i32 q2, #0x0
-; CHECK-NEXT:    vcmp.s32 lt, q0, zr
-; CHECK-NEXT:    vpsel q0, q1, q2
+; CHECK-NEXT:    vshr.s32 q0, q0, #31
+; CHECK-NEXT:    vand q0, q0, q1
 ; CHECK-NEXT:    bx lr
   %cond = icmp slt <4 x i32> %a, zeroinitializer
   %r = select <4 x i1> %cond, <4 x i32> %b, <4 x i32> zeroinitializer

diff --git a/llvm/test/CodeGen/X86/avx512-logic.ll b/llvm/test/CodeGen/X86/avx512-logic.ll
index ac35c5639cc8..4617f3f48b67 100644
--- a/llvm/test/CodeGen/X86/avx512-logic.ll
+++ b/llvm/test/CodeGen/X86/avx512-logic.ll
@@ -907,20 +907,12 @@ define <8 x i64> @ternlog_xor_and_mask(<8 x i64> %x, <8 x i64> %y) {
 }
 
 define <16 x i32> @ternlog_maskz_or_and_mask(<16 x i32> %x, <16 x i32> %y, <16 x i32> %mask) {
-; KNL-LABEL: ternlog_maskz_or_and_mask:
-; KNL:       ## %bb.0:
-; KNL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; KNL-NEXT:    vpcmpgtd %zmm2, %zmm3, %k1
-; KNL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
-; KNL-NEXT:    vpord %zmm1, %zmm0, %zmm0 {%k1} {z}
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: ternlog_maskz_or_and_mask:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vpmovd2m %zmm2, %k1
-; SKX-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
-; SKX-NEXT:    vorps %zmm1, %zmm0, %zmm0 {%k1} {z}
-; SKX-NEXT:    retq
+; ALL-LABEL: ternlog_maskz_or_and_mask:
+; ALL:       ## %bb.0:
+; ALL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm3
+; ALL-NEXT:    vpsrad $31, %zmm2, %zmm0
+; ALL-NEXT:    vpternlogd $224, %zmm1, %zmm3, %zmm0
+; ALL-NEXT:    retq
   %m = icmp slt <16 x i32> %mask, zeroinitializer
   %a = and <16 x i32> %x, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
   %b = or <16 x i32> %a, %y
@@ -929,20 +921,12 @@ define <16 x i32> @ternlog_maskz_or_and_mask(<16 x i32> %x, <16 x i32> %y, <16 x
 }
 
 define <8 x i64> @ternlog_maskz_xor_and_mask(<8 x i64> %x, <8 x i64> %y, <8 x i64> %mask) {
-; KNL-LABEL: ternlog_maskz_xor_and_mask:
-; KNL:       ## %bb.0:
-; KNL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; KNL-NEXT:    vpcmpgtq %zmm2, %zmm3, %k1
-; KNL-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
-; KNL-NEXT:    vpxorq %zmm1, %zmm0, %zmm0 {%k1} {z}
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: ternlog_maskz_xor_and_mask:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vpmovq2m %zmm2, %k1
-; SKX-NEXT:    vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
-; SKX-NEXT:    vxorpd %zmm1, %zmm0, %zmm0 {%k1} {z}
-; SKX-NEXT:    retq
+; ALL-LABEL: ternlog_maskz_xor_and_mask:
+; ALL:       ## %bb.0:
+; ALL-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm3
+; ALL-NEXT:    vpsraq $63, %zmm2, %zmm0
+; ALL-NEXT:    vpternlogq $96, %zmm1, %zmm3, %zmm0
+; ALL-NEXT:    retq
   %m = icmp slt <8 x i64> %mask, zeroinitializer
   %a = and <8 x i64> %x, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
   %b = xor <8 x i64> %a, %y

diff --git a/llvm/test/CodeGen/X86/avx512vl-logic.ll b/llvm/test/CodeGen/X86/avx512vl-logic.ll
index 26c30e950d0d..0d32ddc147fc 100644
--- a/llvm/test/CodeGen/X86/avx512vl-logic.ll
+++ b/llvm/test/CodeGen/X86/avx512vl-logic.ll
@@ -1077,20 +1077,12 @@ define <4 x i64> @ternlog_xor_and_mask_ymm(<4 x i64> %x, <4 x i64> %y) {
 }
 
 define <4 x i32> @ternlog_maskz_or_and_mask(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z, <4 x i32> %mask) {
-; KNL-LABEL: ternlog_maskz_or_and_mask:
-; KNL:       ## %bb.0:
-; KNL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; KNL-NEXT:    vpcmpgtd %xmm3, %xmm2, %k1
-; KNL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; KNL-NEXT:    vpord %xmm1, %xmm0, %xmm0 {%k1} {z}
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: ternlog_maskz_or_and_mask:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vpmovd2m %xmm3, %k1
-; SKX-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; SKX-NEXT:    vorps %xmm1, %xmm0, %xmm0 {%k1} {z}
-; SKX-NEXT:    retq
+; CHECK-LABEL: ternlog_maskz_or_and_mask:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; CHECK-NEXT:    vpsrad $31, %xmm3, %xmm0
+; CHECK-NEXT:    vpternlogd $224, %xmm1, %xmm2, %xmm0
+; CHECK-NEXT:    retq
   %m = icmp slt <4 x i32> %mask, zeroinitializer
   %a = and <4 x i32> %x, <i32 255, i32 255, i32 255, i32 255>
   %b = or <4 x i32> %a, %y
@@ -1099,20 +1091,12 @@ define <4 x i32> @ternlog_maskz_or_and_mask(<4 x i32> %x, <4 x i32> %y, <4 x i32
 }
 
 define <8 x i32> @ternlog_maskz_or_and_mask_ymm(<8 x i32> %x, <8 x i32> %y, <8 x i32> %mask) {
-; KNL-LABEL: ternlog_maskz_or_and_mask_ymm:
-; KNL:       ## %bb.0:
-; KNL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; KNL-NEXT:    vpcmpgtd %ymm2, %ymm3, %k1
-; KNL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; KNL-NEXT:    vpord %ymm1, %ymm0, %ymm0 {%k1} {z}
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: ternlog_maskz_or_and_mask_ymm:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vpmovd2m %ymm2, %k1
-; SKX-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; SKX-NEXT:    vorps %ymm1, %ymm0, %ymm0 {%k1} {z}
-; SKX-NEXT:    retq
+; CHECK-LABEL: ternlog_maskz_or_and_mask_ymm:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3
+; CHECK-NEXT:    vpsrad $31, %ymm2, %ymm0
+; CHECK-NEXT:    vpternlogd $224, %ymm1, %ymm3, %ymm0
+; CHECK-NEXT:    retq
   %m = icmp slt <8 x i32> %mask, zeroinitializer
   %a = and <8 x i32> %x, <i32 -16777216, i32 -16777216, i32 -16777216, i32 -16777216, i32 -16777216, i32 -16777216, i32 -16777216, i32 -16777216>
   %b = or <8 x i32> %a, %y
@@ -1121,20 +1105,12 @@ define <8 x i32> @ternlog_maskz_or_and_mask_ymm(<8 x i32> %x, <8 x i32> %y, <8 x
 }
 
 define <2 x i64> @ternlog_maskz_xor_and_mask(<2 x i64> %x, <2 x i64> %y, <2 x i64> %mask) {
-; KNL-LABEL: ternlog_maskz_xor_and_mask:
-; KNL:       ## %bb.0:
-; KNL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; KNL-NEXT:    vpcmpgtq %xmm2, %xmm3, %k1
-; KNL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; KNL-NEXT:    vpxorq %xmm1, %xmm0, %xmm0 {%k1} {z}
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: ternlog_maskz_xor_and_mask:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vpmovq2m %xmm2, %k1
-; SKX-NEXT:    vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; SKX-NEXT:    vxorpd %xmm1, %xmm0, %xmm0 {%k1} {z}
-; SKX-NEXT:    retq
+; CHECK-LABEL: ternlog_maskz_xor_and_mask:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
+; CHECK-NEXT:    vpsraq $63, %xmm2, %xmm0
+; CHECK-NEXT:    vpternlogq $96, %xmm1, %xmm3, %xmm0
+; CHECK-NEXT:    retq
   %m = icmp slt <2 x i64> %mask, zeroinitializer
   %a = and <2 x i64> %x, <i64 1099511627775, i64 1099511627775>
   %b = xor <2 x i64> %a, %y
@@ -1143,20 +1119,12 @@ define <2 x i64> @ternlog_maskz_xor_and_mask(<2 x i64> %x, <2 x i64> %y, <2 x i6
 }
 
 define <4 x i64> @ternlog_maskz_xor_and_mask_ymm(<4 x i64> %x, <4 x i64> %y, <4 x i64> %mask) {
-; KNL-LABEL: ternlog_maskz_xor_and_mask_ymm:
-; KNL:       ## %bb.0:
-; KNL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; KNL-NEXT:    vpcmpgtq %ymm2, %ymm3, %k1
-; KNL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; KNL-NEXT:    vpxorq %ymm1, %ymm0, %ymm0 {%k1} {z}
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: ternlog_maskz_xor_and_mask_ymm:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vpmovq2m %ymm2, %k1
-; SKX-NEXT:    vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; SKX-NEXT:    vxorpd %ymm1, %ymm0, %ymm0 {%k1} {z}
-; SKX-NEXT:    retq
+; CHECK-LABEL: ternlog_maskz_xor_and_mask_ymm:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3
+; CHECK-NEXT:    vpsraq $63, %ymm2, %ymm0
+; CHECK-NEXT:    vpternlogq $96, %ymm1, %ymm3, %ymm0
+; CHECK-NEXT:    retq
   %m = icmp slt <4 x i64> %mask, zeroinitializer
   %a = and <4 x i64> %x, <i64 72057594037927935, i64 72057594037927935, i64 72057594037927935, i64 72057594037927935>
   %b = xor <4 x i64> %a, %y

diff --git a/llvm/test/CodeGen/X86/vselect-zero.ll b/llvm/test/CodeGen/X86/vselect-zero.ll
index c1f54a5debe1..c48cc809ca64 100644
--- a/llvm/test/CodeGen/X86/vselect-zero.ll
+++ b/llvm/test/CodeGen/X86/vselect-zero.ll
@@ -192,16 +192,13 @@ define <16 x i8> @signbit_mask_v16i8(<16 x i8> %a, <16 x i8> %b) {
 define <8 x i16> @signbit_mask_v8i16(<8 x i16> %a, <8 x i16> %b) {
 ; SSE-LABEL: signbit_mask_v8i16:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pxor %xmm2, %xmm2
-; SSE-NEXT:    pcmpgtw %xmm0, %xmm2
-; SSE-NEXT:    pand %xmm1, %xmm2
-; SSE-NEXT:    movdqa %xmm2, %xmm0
+; SSE-NEXT:    psraw $15, %xmm0
+; SSE-NEXT:    pand %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: signbit_mask_v8i16:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX-NEXT:    vpcmpgtw %xmm0, %xmm2, %xmm0
+; AVX-NEXT:    vpsraw $15, %xmm0, %xmm0
 ; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %cond = icmp slt <8 x i16> %a, zeroinitializer
@@ -212,16 +209,13 @@ define <8 x i16> @signbit_mask_v8i16(<8 x i16> %a, <8 x i16> %b) {
 define <4 x i32> @signbit_mask_v4i32(<4 x i32> %a, <4 x i32> %b) {
 ; SSE-LABEL: signbit_mask_v4i32:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pxor %xmm2, %xmm2
-; SSE-NEXT:    pcmpgtd %xmm0, %xmm2
-; SSE-NEXT:    pand %xmm1, %xmm2
-; SSE-NEXT:    movdqa %xmm2, %xmm0
+; SSE-NEXT:    psrad $31, %xmm0
+; SSE-NEXT:    pand %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: signbit_mask_v4i32:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX-NEXT:    vpcmpgtd %xmm0, %xmm2, %xmm0
+; AVX-NEXT:    vpsrad $31, %xmm0, %xmm0
 ; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %cond = icmp slt <4 x i32> %a, zeroinitializer
@@ -232,9 +226,8 @@ define <4 x i32> @signbit_mask_v4i32(<4 x i32> %a, <4 x i32> %b) {
 define <2 x i64> @signbit_mask_v2i64(<2 x i64> %a, <2 x i64> %b) {
 ; SSE2-LABEL: signbit_mask_v2i64:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    pcmpgtd %xmm2, %xmm0
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; SSE2-NEXT:    pand %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
@@ -262,12 +255,12 @@ define <32 x i8> @signbit_mask_v32i8(<32 x i8> %a, <32 x i8> %b) {
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pxor %xmm4, %xmm4
 ; SSE-NEXT:    pxor %xmm5, %xmm5
-; SSE-NEXT:    pcmpgtb %xmm1, %xmm5
-; SSE-NEXT:    pcmpgtb %xmm0, %xmm4
-; SSE-NEXT:    pand %xmm2, %xmm4
-; SSE-NEXT:    pand %xmm3, %xmm5
-; SSE-NEXT:    movdqa %xmm4, %xmm0
-; SSE-NEXT:    movdqa %xmm5, %xmm1
+; SSE-NEXT:    pcmpgtb %xmm0, %xmm5
+; SSE-NEXT:    pand %xmm2, %xmm5
+; SSE-NEXT:    pcmpgtb %xmm1, %xmm4
+; SSE-NEXT:    pand %xmm3, %xmm4
+; SSE-NEXT:    movdqa %xmm5, %xmm0
+; SSE-NEXT:    movdqa %xmm4, %xmm1
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: signbit_mask_v32i8:
@@ -294,30 +287,24 @@ define <32 x i8> @signbit_mask_v32i8(<32 x i8> %a, <32 x i8> %b) {
 define <16 x i16> @signbit_mask_v16i16(<16 x i16> %a, <16 x i16> %b) {
 ; SSE-LABEL: signbit_mask_v16i16:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pxor %xmm4, %xmm4
-; SSE-NEXT:    pxor %xmm5, %xmm5
-; SSE-NEXT:    pcmpgtw %xmm1, %xmm5
-; SSE-NEXT:    pcmpgtw %xmm0, %xmm4
-; SSE-NEXT:    pand %xmm2, %xmm4
-; SSE-NEXT:    pand %xmm3, %xmm5
-; SSE-NEXT:    movdqa %xmm4, %xmm0
-; SSE-NEXT:    movdqa %xmm5, %xmm1
+; SSE-NEXT:    psraw $15, %xmm0
+; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    psraw $15, %xmm1
+; SSE-NEXT:    pand %xmm3, %xmm1
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: signbit_mask_v16i16:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpgtw %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpcmpgtw %xmm0, %xmm3, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vpsraw $15, %xmm0, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsraw $15, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
 ; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: signbit_mask_v16i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpcmpgtw %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpsraw $15, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
   %cond = icmp slt <16 x i16> %a, zeroinitializer
@@ -328,30 +315,24 @@ define <16 x i16> @signbit_mask_v16i16(<16 x i16> %a, <16 x i16> %b) {
 define <8 x i32> @signbit_mask_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; SSE-LABEL: signbit_mask_v8i32:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pxor %xmm4, %xmm4
-; SSE-NEXT:    pxor %xmm5, %xmm5
-; SSE-NEXT:    pcmpgtd %xmm1, %xmm5
-; SSE-NEXT:    pcmpgtd %xmm0, %xmm4
-; SSE-NEXT:    pand %xmm2, %xmm4
-; SSE-NEXT:    pand %xmm3, %xmm5
-; SSE-NEXT:    movdqa %xmm4, %xmm0
-; SSE-NEXT:    movdqa %xmm5, %xmm1
+; SSE-NEXT:    psrad $31, %xmm0
+; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    psrad $31, %xmm1
+; SSE-NEXT:    pand %xmm3, %xmm1
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: signbit_mask_v8i32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm3, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
 ; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: signbit_mask_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpcmpgtd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpsrad $31, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
   %cond = icmp slt <8 x i32> %a, zeroinitializer
@@ -362,27 +343,24 @@ define <8 x i32> @signbit_mask_v8i32(<8 x i32> %a, <8 x i32> %b) {
 define <4 x i64> @signbit_mask_v4i64(<4 x i64> %a, <4 x i64> %b) {
 ; SSE2-LABEL: signbit_mask_v4i64:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pxor %xmm4, %xmm4
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    pcmpgtd %xmm5, %xmm1
+; SSE2-NEXT:    psrad $31, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
-; SSE2-NEXT:    pand %xmm2, %xmm4
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    psrad $31, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSE2-NEXT:    pand %xmm3, %xmm1
-; SSE2-NEXT:    movdqa %xmm4, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE42-LABEL: signbit_mask_v4i64:
 ; SSE42:       # %bb.0:
 ; SSE42-NEXT:    pxor %xmm4, %xmm4
 ; SSE42-NEXT:    pxor %xmm5, %xmm5
-; SSE42-NEXT:    pcmpgtq %xmm1, %xmm5
-; SSE42-NEXT:    pcmpgtq %xmm0, %xmm4
-; SSE42-NEXT:    pand %xmm2, %xmm4
-; SSE42-NEXT:    pand %xmm3, %xmm5
-; SSE42-NEXT:    movdqa %xmm4, %xmm0
-; SSE42-NEXT:    movdqa %xmm5, %xmm1
+; SSE42-NEXT:    pcmpgtq %xmm0, %xmm5
+; SSE42-NEXT:    pand %xmm2, %xmm5
+; SSE42-NEXT:    pcmpgtq %xmm1, %xmm4
+; SSE42-NEXT:    pand %xmm3, %xmm4
+; SSE42-NEXT:    movdqa %xmm5, %xmm0
+; SSE42-NEXT:    movdqa %xmm4, %xmm1
 ; SSE42-NEXT:    retq
 ;
 ; AVX1-LABEL: signbit_mask_v4i64:



