[llvm] bf468f4 - [X86][SSE] Canonicalize UNARYSHUFFLE(XOR(X,-1)) -> XOR(UNARYSHUFFLE(X),-1)

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Thu Apr 30 11:19:31 PDT 2020


Author: Simon Pilgrim
Date: 2020-04-30T19:18:51+01:00
New Revision: bf468f4349071f28755ca7ea7a53f21fc8c8ddbf

URL: https://github.com/llvm/llvm-project/commit/bf468f4349071f28755ca7ea7a53f21fc8c8ddbf
DIFF: https://github.com/llvm/llvm-project/commit/bf468f4349071f28755ca7ea7a53f21fc8c8ddbf.diff

LOG: [X86][SSE] Canonicalize UNARYSHUFFLE(XOR(X,-1)) -> XOR(UNARYSHUFFLE(X),-1)

This pushes the NOT pattern up the DAG to help expose it for further combines (AND->ANDN in particular).
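For context, the payoff is the bitselect-style idiom the tests below exercise: once the NOT sits directly on an AND operand, the 'y & ~m' half maps onto a single PANDN/VPANDN instead of materializing the inverted mask separately. A minimal scalar sketch (illustrative only, not LLVM code; 'bitselect' is a hypothetical helper):

    #include <cstdint>

    // bitselect(x, y, m) = (x & m) | (y & ~m). With the NOT exposed next to
    // the AND, the 'y & ~m' half becomes one ANDN-style operation on x86.
    std::uint64_t bitselect(std::uint64_t x, std::uint64_t y, std::uint64_t m) {
      return (x & m) | (y & ~m);
    }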

The PSHUFD/MOVDDUP 'splat' cases are the only ones I've seen in the wild so far; we can generalize further if/when we need to.
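As a sanity check on the splat cases, here is a minimal scalar model of why the canonicalization is sound (an assumption-laden sketch, not LLVM code: V128, splatLo and vnot are made-up stand-ins for a two-lane vector, a MOVDDUP/PSHUFD-style splat, and XOR with all-ones):

    #include <cassert>
    #include <cstdint>

    struct V128 { std::uint64_t lo, hi; };                  // two 64-bit lanes
    static V128 splatLo(V128 v) { return {v.lo, v.lo}; }    // MOVDDUP-style splat
    static V128 vnot(V128 v)    { return {~v.lo, ~v.hi}; }  // XOR with all-ones

    int main() {
      V128 x = {0x0123456789abcdefULL, 0xfedcba9876543210ULL};
      V128 a = splatLo(vnot(x)); // UNARYSHUFFLE(XOR(X,-1))
      V128 b = vnot(splatLo(x)); // XOR(UNARYSHUFFLE(X),-1)
      assert(a.lo == b.lo && a.hi == b.hi); // both forms produce the same lanes
      return 0;
    }

Since every output lane is a copy of some input lane, the lanewise NOT commutes with the shuffle, which is what lets the XOR be hoisted above it.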

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/test/CodeGen/X86/avx512-vec-cmp.ll
    llvm/test/CodeGen/X86/combine-bitselect.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 78015544c232..14b8872b2ebb 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -6089,8 +6089,8 @@ static SDValue getExtendInVec(unsigned Opcode, const SDLoc &DL, EVT VT,
 // Match (xor X, -1) -> X.
 // Match extract_subvector(xor X, -1) -> extract_subvector(X).
 // Match concat_vectors(xor X, -1, xor Y, -1) -> concat_vectors(X, Y).
-static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
-  V = peekThroughBitcasts(V);
+static SDValue IsNOT(SDValue V, SelectionDAG &DAG, bool OneUse = false) {
+  V = OneUse ? peekThroughOneUseBitcasts(V) : peekThroughBitcasts(V);
   if (V.getOpcode() == ISD::XOR &&
       ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()))
     return V.getOperand(0);
@@ -35455,6 +35455,31 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
   if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
     return R;
 
+  // Canonicalize UNARYSHUFFLE(XOR(X,-1)) -> XOR(UNARYSHUFFLE(X),-1) to
+  // help expose the 'NOT' pattern further up the DAG.
+  // TODO: This might be beneficial for any binop with a 'splattable' operand.
+  switch (Opcode) {
+  case X86ISD::MOVDDUP:
+  case X86ISD::PSHUFD: {
+    SDValue Src = N.getOperand(0);
+    if (Src.hasOneUse() && Src.getValueType() == VT) {
+      if (SDValue Not = IsNOT(Src, DAG, /*OneUse*/ true)) {
+        Not = DAG.getBitcast(VT, Not);
+        Not = Opcode == X86ISD::MOVDDUP
+                  ? DAG.getNode(Opcode, DL, VT, Not)
+                  : DAG.getNode(Opcode, DL, VT, Not, N.getOperand(1));
+        EVT IntVT = Not.getValueType().changeTypeToInteger();
+        SDValue AllOnes = DAG.getConstant(-1, DL, IntVT);
+        Not = DAG.getBitcast(IntVT, Not);
+        Not = DAG.getNode(ISD::XOR, DL, IntVT, Not, AllOnes);
+        return DAG.getBitcast(VT, Not);
+      }
+    }
+    break;
+  }
+  }
+
+  // Handle specific target shuffles.
   switch (Opcode) {
   case X86ISD::MOVDDUP: {
     SDValue Src = N.getOperand(0);

diff --git a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll
index c782d9b5bcf0..719bd9f9d95f 100644
--- a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll
+++ b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll
@@ -1575,11 +1575,9 @@ define <2 x i32> @narrow_cmp_select_reverse(<2 x i64> %x, <2 x i32> %y) nounwind
 ; AVX512:       ## %bb.0:
 ; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xef,0xd2]
 ; AVX512-NEXT:    vpcmpeqq %xmm2, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x29,0xc2]
-; AVX512-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0 ## encoding: [0x62,0xf3,0xfd,0x48,0x25,0xc0,0x0f]
 ; AVX512-NEXT:    vpshufd $232, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x70,0xc0,0xe8]
 ; AVX512-NEXT:    ## xmm0 = xmm0[0,2,2,3]
-; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdb,0xc1]
-; AVX512-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; AVX512-NEXT:    vpandn %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdf,0xc1]
 ; AVX512-NEXT:    retq ## encoding: [0xc3]
 ;
 ; SKX-LABEL: narrow_cmp_select_reverse:

diff --git a/llvm/test/CodeGen/X86/combine-bitselect.ll b/llvm/test/CodeGen/X86/combine-bitselect.ll
index bd4c65b77197..a2d55ea8cf0b 100644
--- a/llvm/test/CodeGen/X86/combine-bitselect.ll
+++ b/llvm/test/CodeGen/X86/combine-bitselect.ll
@@ -186,36 +186,25 @@ define <2 x i64> @bitselect_v2i64_broadcast_rrr(<2 x i64> %a0, <2 x i64> %a1, i6
 ; SSE-LABEL: bitselect_v2i64_broadcast_rrr:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movq %rdi, %xmm2
-; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,1,0,1]
-; SSE-NEXT:    pcmpeqd %xmm4, %xmm4
-; SSE-NEXT:    pxor %xmm2, %xmm4
-; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[0,1,0,1]
-; SSE-NEXT:    pand %xmm3, %xmm0
-; SSE-NEXT:    pand %xmm1, %xmm2
+; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
+; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    pandn %xmm1, %xmm2
 ; SSE-NEXT:    por %xmm2, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; XOP-LABEL: bitselect_v2i64_broadcast_rrr:
 ; XOP:       # %bb.0:
 ; XOP-NEXT:    vmovq %rdi, %xmm2
-; XOP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,1]
-; XOP-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
-; XOP-NEXT:    vpxor %xmm4, %xmm2, %xmm2
 ; XOP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
-; XOP-NEXT:    vpand %xmm3, %xmm0, %xmm0
-; XOP-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; XOP-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
 ; XOP-NEXT:    retq
 ;
 ; AVX1-LABEL: bitselect_v2i64_broadcast_rrr:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vmovq %rdi, %xmm2
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,1]
-; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX1-NEXT:    vpxor %xmm4, %xmm2, %xmm2
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
-; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpandn %xmm1, %xmm2, %xmm1
 ; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    retq
 ;
@@ -256,37 +245,28 @@ define <2 x i64> @bitselect_v2i64_broadcast_rrm(<2 x i64> %a0, <2 x i64> %a1, i6
 ; SSE-LABEL: bitselect_v2i64_broadcast_rrm:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
-; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,1,0,1]
-; SSE-NEXT:    pcmpeqd %xmm4, %xmm4
-; SSE-NEXT:    pxor %xmm2, %xmm4
-; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[0,1,0,1]
-; SSE-NEXT:    pand %xmm3, %xmm0
-; SSE-NEXT:    pand %xmm1, %xmm2
+; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
+; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    pandn %xmm1, %xmm2
 ; SSE-NEXT:    por %xmm2, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; XOP-LABEL: bitselect_v2i64_broadcast_rrm:
 ; XOP:       # %bb.0:
-; XOP-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
-; XOP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,1]
-; XOP-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
-; XOP-NEXT:    vpxor %xmm4, %xmm2, %xmm2
-; XOP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
-; XOP-NEXT:    vpand %xmm3, %xmm0, %xmm0
-; XOP-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; XOP-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
+; XOP-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[0,1,0,1]
+; XOP-NEXT:    vandps %xmm2, %xmm0, %xmm0
+; XOP-NEXT:    vandnps %xmm1, %xmm2, %xmm1
+; XOP-NEXT:    vorps %xmm1, %xmm0, %xmm0
 ; XOP-NEXT:    retq
 ;
 ; AVX1-LABEL: bitselect_v2i64_broadcast_rrm:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,1]
-; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX1-NEXT:    vpxor %xmm4, %xmm2, %xmm2
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
-; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[0,1,0,1]
+; AVX1-NEXT:    vandps %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vandnps %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vorps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: bitselect_v2i64_broadcast_rrm:
@@ -524,16 +504,14 @@ define <4 x i64> @bitselect_v4i64_broadcast_rrr(<4 x i64> %a0, <4 x i64> %a1, i6
 ; SSE-LABEL: bitselect_v4i64_broadcast_rrr:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movq %rdi, %xmm4
-; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,1,0,1]
-; SSE-NEXT:    pcmpeqd %xmm6, %xmm6
-; SSE-NEXT:    pxor %xmm4, %xmm6
-; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm6[0,1,0,1]
-; SSE-NEXT:    pand %xmm5, %xmm1
-; SSE-NEXT:    pand %xmm5, %xmm0
-; SSE-NEXT:    pand %xmm4, %xmm3
-; SSE-NEXT:    por %xmm3, %xmm1
-; SSE-NEXT:    pand %xmm4, %xmm2
-; SSE-NEXT:    por %xmm2, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
+; SSE-NEXT:    pand %xmm4, %xmm1
+; SSE-NEXT:    pand %xmm4, %xmm0
+; SSE-NEXT:    movdqa %xmm4, %xmm5
+; SSE-NEXT:    pandn %xmm3, %xmm5
+; SSE-NEXT:    por %xmm5, %xmm1
+; SSE-NEXT:    pandn %xmm2, %xmm4
+; SSE-NEXT:    por %xmm4, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; XOP-LABEL: bitselect_v4i64_broadcast_rrr:
@@ -542,12 +520,10 @@ define <4 x i64> @bitselect_v4i64_broadcast_rrr(<4 x i64> %a0, <4 x i64> %a1, i6
 ; XOP-NEXT:    vmovq %rdi, %xmm3
 ; XOP-NEXT:    vmovddup {{.*#+}} xmm2 = xmm2[0,0]
 ; XOP-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
-; XOP-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
-; XOP-NEXT:    vpxor %xmm4, %xmm3, %xmm3
 ; XOP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1]
 ; XOP-NEXT:    vinsertf128 $1, %xmm3, %ymm3, %ymm3
 ; XOP-NEXT:    vandps %ymm2, %ymm0, %ymm0
-; XOP-NEXT:    vandps %ymm3, %ymm1, %ymm1
+; XOP-NEXT:    vandnps %ymm1, %ymm3, %ymm1
 ; XOP-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; XOP-NEXT:    retq
 ;
@@ -557,12 +533,10 @@ define <4 x i64> @bitselect_v4i64_broadcast_rrr(<4 x i64> %a0, <4 x i64> %a1, i6
 ; AVX1-NEXT:    vmovq %rdi, %xmm3
 ; AVX1-NEXT:    vmovddup {{.*#+}} xmm2 = xmm2[0,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
-; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm3, %ymm3
 ; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT:    vandnps %ymm1, %ymm3, %ymm1
 ; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
@@ -603,43 +577,31 @@ define <4 x i64> @bitselect_v4i64_broadcast_rrm(<4 x i64> %a0, <4 x i64> %a1, i6
 ; SSE-LABEL: bitselect_v4i64_broadcast_rrm:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movq {{.*#+}} xmm4 = mem[0],zero
-; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,1,0,1]
-; SSE-NEXT:    pcmpeqd %xmm6, %xmm6
-; SSE-NEXT:    pxor %xmm4, %xmm6
-; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm6[0,1,0,1]
-; SSE-NEXT:    pand %xmm5, %xmm1
-; SSE-NEXT:    pand %xmm5, %xmm0
-; SSE-NEXT:    pand %xmm4, %xmm3
-; SSE-NEXT:    por %xmm3, %xmm1
-; SSE-NEXT:    pand %xmm4, %xmm2
-; SSE-NEXT:    por %xmm2, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
+; SSE-NEXT:    pand %xmm4, %xmm1
+; SSE-NEXT:    pand %xmm4, %xmm0
+; SSE-NEXT:    movdqa %xmm4, %xmm5
+; SSE-NEXT:    pandn %xmm3, %xmm5
+; SSE-NEXT:    por %xmm5, %xmm1
+; SSE-NEXT:    pandn %xmm2, %xmm4
+; SSE-NEXT:    por %xmm4, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; XOP-LABEL: bitselect_v4i64_broadcast_rrm:
 ; XOP:       # %bb.0:
-; XOP-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
-; XOP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,1]
-; XOP-NEXT:    vinsertf128 $1, %xmm3, %ymm3, %ymm3
-; XOP-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
-; XOP-NEXT:    vpxor %xmm4, %xmm2, %xmm2
-; XOP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
-; XOP-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
-; XOP-NEXT:    vandps %ymm3, %ymm0, %ymm0
-; XOP-NEXT:    vandps %ymm2, %ymm1, %ymm1
+; XOP-NEXT:    vbroadcastsd (%rdi), %ymm2
+; XOP-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm3
+; XOP-NEXT:    vandps %ymm2, %ymm0, %ymm0
+; XOP-NEXT:    vandnps %ymm1, %ymm3, %ymm1
 ; XOP-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; XOP-NEXT:    retq
 ;
 ; AVX1-LABEL: bitselect_v4i64_broadcast_rrm:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,1]
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm3, %ymm3
-; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX1-NEXT:    vpxor %xmm4, %xmm2, %xmm2
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
-; AVX1-NEXT:    vandps %ymm3, %ymm0, %ymm0
-; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT:    vbroadcastsd (%rdi), %ymm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm3
+; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT:    vandnps %ymm1, %ymm3, %ymm1
 ; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
@@ -922,22 +884,22 @@ define <8 x i64> @bitselect_v8i64_broadcast_rrr(<8 x i64> %a0, <8 x i64> %a1, i6
 ; SSE-LABEL: bitselect_v8i64_broadcast_rrr:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movq %rdi, %xmm8
-; SSE-NEXT:    pshufd {{.*#+}} xmm9 = xmm8[0,1,0,1]
-; SSE-NEXT:    pcmpeqd %xmm10, %xmm10
-; SSE-NEXT:    pxor %xmm8, %xmm10
-; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm10[0,1,0,1]
-; SSE-NEXT:    pand %xmm9, %xmm3
-; SSE-NEXT:    pand %xmm9, %xmm2
-; SSE-NEXT:    pand %xmm9, %xmm1
-; SSE-NEXT:    pand %xmm9, %xmm0
-; SSE-NEXT:    pand %xmm8, %xmm7
-; SSE-NEXT:    por %xmm7, %xmm3
-; SSE-NEXT:    pand %xmm8, %xmm6
-; SSE-NEXT:    por %xmm6, %xmm2
-; SSE-NEXT:    pand %xmm8, %xmm5
-; SSE-NEXT:    por %xmm5, %xmm1
-; SSE-NEXT:    pand %xmm8, %xmm4
-; SSE-NEXT:    por %xmm4, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,1,0,1]
+; SSE-NEXT:    pand %xmm8, %xmm3
+; SSE-NEXT:    pand %xmm8, %xmm2
+; SSE-NEXT:    pand %xmm8, %xmm1
+; SSE-NEXT:    pand %xmm8, %xmm0
+; SSE-NEXT:    movdqa %xmm8, %xmm9
+; SSE-NEXT:    pandn %xmm7, %xmm9
+; SSE-NEXT:    por %xmm9, %xmm3
+; SSE-NEXT:    movdqa %xmm8, %xmm7
+; SSE-NEXT:    pandn %xmm6, %xmm7
+; SSE-NEXT:    por %xmm7, %xmm2
+; SSE-NEXT:    movdqa %xmm8, %xmm6
+; SSE-NEXT:    pandn %xmm5, %xmm6
+; SSE-NEXT:    por %xmm6, %xmm1
+; SSE-NEXT:    pandn %xmm4, %xmm8
+; SSE-NEXT:    por %xmm8, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; XOP-LABEL: bitselect_v8i64_broadcast_rrr:
@@ -946,15 +908,13 @@ define <8 x i64> @bitselect_v8i64_broadcast_rrr(<8 x i64> %a0, <8 x i64> %a1, i6
 ; XOP-NEXT:    vmovq %rdi, %xmm5
 ; XOP-NEXT:    vmovddup {{.*#+}} xmm4 = xmm4[0,0]
 ; XOP-NEXT:    vinsertf128 $1, %xmm4, %ymm4, %ymm4
-; XOP-NEXT:    vpcmpeqd %xmm6, %xmm6, %xmm6
-; XOP-NEXT:    vpxor %xmm6, %xmm5, %xmm5
 ; XOP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,1]
 ; XOP-NEXT:    vinsertf128 $1, %xmm5, %ymm5, %ymm5
 ; XOP-NEXT:    vandps %ymm4, %ymm1, %ymm1
 ; XOP-NEXT:    vandps %ymm4, %ymm0, %ymm0
-; XOP-NEXT:    vandps %ymm5, %ymm3, %ymm3
+; XOP-NEXT:    vandnps %ymm3, %ymm5, %ymm3
 ; XOP-NEXT:    vorps %ymm3, %ymm1, %ymm1
-; XOP-NEXT:    vandps %ymm5, %ymm2, %ymm2
+; XOP-NEXT:    vandnps %ymm2, %ymm5, %ymm2
 ; XOP-NEXT:    vorps %ymm2, %ymm0, %ymm0
 ; XOP-NEXT:    retq
 ;
@@ -964,15 +924,13 @@ define <8 x i64> @bitselect_v8i64_broadcast_rrr(<8 x i64> %a0, <8 x i64> %a1, i6
 ; AVX1-NEXT:    vmovq %rdi, %xmm5
 ; AVX1-NEXT:    vmovddup {{.*#+}} xmm4 = xmm4[0,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm4, %ymm4
-; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm6, %xmm6
-; AVX1-NEXT:    vpxor %xmm6, %xmm5, %xmm5
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,1]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm5, %ymm5
 ; AVX1-NEXT:    vandps %ymm4, %ymm1, %ymm1
 ; AVX1-NEXT:    vandps %ymm4, %ymm0, %ymm0
-; AVX1-NEXT:    vandps %ymm5, %ymm3, %ymm3
+; AVX1-NEXT:    vandnps %ymm3, %ymm5, %ymm3
 ; AVX1-NEXT:    vorps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT:    vandps %ymm5, %ymm2, %ymm2
+; AVX1-NEXT:    vandnps %ymm2, %ymm5, %ymm2
 ; AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
@@ -1007,55 +965,45 @@ define <8 x i64> @bitselect_v8i64_broadcast_rrm(<8 x i64> %a0, <8 x i64> %a1, i6
 ; SSE-LABEL: bitselect_v8i64_broadcast_rrm:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movq {{.*#+}} xmm8 = mem[0],zero
-; SSE-NEXT:    pshufd {{.*#+}} xmm9 = xmm8[0,1,0,1]
-; SSE-NEXT:    pcmpeqd %xmm10, %xmm10
-; SSE-NEXT:    pxor %xmm8, %xmm10
-; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm10[0,1,0,1]
-; SSE-NEXT:    pand %xmm9, %xmm3
-; SSE-NEXT:    pand %xmm9, %xmm2
-; SSE-NEXT:    pand %xmm9, %xmm1
-; SSE-NEXT:    pand %xmm9, %xmm0
-; SSE-NEXT:    pand %xmm8, %xmm7
-; SSE-NEXT:    por %xmm7, %xmm3
-; SSE-NEXT:    pand %xmm8, %xmm6
-; SSE-NEXT:    por %xmm6, %xmm2
-; SSE-NEXT:    pand %xmm8, %xmm5
-; SSE-NEXT:    por %xmm5, %xmm1
-; SSE-NEXT:    pand %xmm8, %xmm4
-; SSE-NEXT:    por %xmm4, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,1,0,1]
+; SSE-NEXT:    pand %xmm8, %xmm3
+; SSE-NEXT:    pand %xmm8, %xmm2
+; SSE-NEXT:    pand %xmm8, %xmm1
+; SSE-NEXT:    pand %xmm8, %xmm0
+; SSE-NEXT:    movdqa %xmm8, %xmm9
+; SSE-NEXT:    pandn %xmm7, %xmm9
+; SSE-NEXT:    por %xmm9, %xmm3
+; SSE-NEXT:    movdqa %xmm8, %xmm7
+; SSE-NEXT:    pandn %xmm6, %xmm7
+; SSE-NEXT:    por %xmm7, %xmm2
+; SSE-NEXT:    movdqa %xmm8, %xmm6
+; SSE-NEXT:    pandn %xmm5, %xmm6
+; SSE-NEXT:    por %xmm6, %xmm1
+; SSE-NEXT:    pandn %xmm4, %xmm8
+; SSE-NEXT:    por %xmm8, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; XOP-LABEL: bitselect_v8i64_broadcast_rrm:
 ; XOP:       # %bb.0:
-; XOP-NEXT:    vmovq {{.*#+}} xmm4 = mem[0],zero
-; XOP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm4[0,1,0,1]
-; XOP-NEXT:    vinsertf128 $1, %xmm5, %ymm5, %ymm5
-; XOP-NEXT:    vpcmpeqd %xmm6, %xmm6, %xmm6
-; XOP-NEXT:    vpxor %xmm6, %xmm4, %xmm4
-; XOP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
-; XOP-NEXT:    vinsertf128 $1, %xmm4, %ymm4, %ymm4
-; XOP-NEXT:    vandps %ymm5, %ymm1, %ymm1
-; XOP-NEXT:    vandps %ymm5, %ymm0, %ymm0
-; XOP-NEXT:    vandps %ymm4, %ymm3, %ymm3
+; XOP-NEXT:    vbroadcastsd (%rdi), %ymm4
+; XOP-NEXT:    vinsertf128 $1, %xmm4, %ymm4, %ymm5
+; XOP-NEXT:    vandps %ymm4, %ymm1, %ymm1
+; XOP-NEXT:    vandps %ymm4, %ymm0, %ymm0
+; XOP-NEXT:    vandnps %ymm3, %ymm5, %ymm3
 ; XOP-NEXT:    vorps %ymm3, %ymm1, %ymm1
-; XOP-NEXT:    vandps %ymm4, %ymm2, %ymm2
+; XOP-NEXT:    vandnps %ymm2, %ymm5, %ymm2
 ; XOP-NEXT:    vorps %ymm2, %ymm0, %ymm0
 ; XOP-NEXT:    retq
 ;
 ; AVX1-LABEL: bitselect_v8i64_broadcast_rrm:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovq {{.*#+}} xmm4 = mem[0],zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm4[0,1,0,1]
-; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm5, %ymm5
-; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm6, %xmm6
-; AVX1-NEXT:    vpxor %xmm6, %xmm4, %xmm4
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm4, %ymm4
-; AVX1-NEXT:    vandps %ymm5, %ymm1, %ymm1
-; AVX1-NEXT:    vandps %ymm5, %ymm0, %ymm0
-; AVX1-NEXT:    vandps %ymm4, %ymm3, %ymm3
+; AVX1-NEXT:    vbroadcastsd (%rdi), %ymm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm4, %ymm5
+; AVX1-NEXT:    vandps %ymm4, %ymm1, %ymm1
+; AVX1-NEXT:    vandps %ymm4, %ymm0, %ymm0
+; AVX1-NEXT:    vandnps %ymm3, %ymm5, %ymm3
 ; AVX1-NEXT:    vorps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT:    vandps %ymm4, %ymm2, %ymm2
+; AVX1-NEXT:    vandnps %ymm2, %ymm5, %ymm2
 ; AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
