[llvm] bf468f4 - [X86][SSE] Canonicalize UNARYSHUFFLE(XOR(X,-1)) -> XOR(UNARYSHUFFLE(X),-1)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Thu Apr 30 11:19:31 PDT 2020
Author: Simon Pilgrim
Date: 2020-04-30T19:18:51+01:00
New Revision: bf468f4349071f28755ca7ea7a53f21fc8c8ddbf
URL: https://github.com/llvm/llvm-project/commit/bf468f4349071f28755ca7ea7a53f21fc8c8ddbf
DIFF: https://github.com/llvm/llvm-project/commit/bf468f4349071f28755ca7ea7a53f21fc8c8ddbf.diff
LOG: [X86][SSE] Canonicalize UNARYSHUFFLE(XOR(X,-1)) -> XOR(UNARYSHUFFLE(X),-1)
This pushes the NOT pattern up the DAG to help expose it for further combines (AND->ANDN in particular).
The PSHUFD/MOVDDUP 'splat' cases are the only ones I've seen in the wild so far; we can generalize further if/when we need to.
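
As a quick sanity check of the fold itself (not part of the patch): bitwise NOT is elementwise, so it commutes with any unary, lane-permuting shuffle such as the PSHUFD splat used here. The standalone C++/SSE2 sketch below demonstrates the identity; the helper names are made up for illustration.

#include <emmintrin.h>
#include <cassert>
#include <cstring>

// XOR(X,-1) first, then a PSHUFD-style splat of the low 64 bits.
static __m128i not_then_splat(__m128i x) {
  __m128i not_x = _mm_xor_si128(x, _mm_set1_epi32(-1));
  return _mm_shuffle_epi32(not_x, _MM_SHUFFLE(1, 0, 1, 0)); // xmm[0,1,0,1]
}

// The canonical form: splat first, then XOR(...,-1).
static __m128i splat_then_not(__m128i x) {
  __m128i splat = _mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 1, 0)); // xmm[0,1,0,1]
  return _mm_xor_si128(splat, _mm_set1_epi32(-1));
}

int main() {
  __m128i x = _mm_set_epi32(0x11223344, 0x55667788, 0x0badf00d, 0x7eadbeef);
  __m128i a = not_then_splat(x);
  __m128i b = splat_then_not(x);
  assert(std::memcmp(&a, &b, sizeof(a)) == 0); // both orders produce identical bits
  return 0;
}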
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/avx512-vec-cmp.ll
llvm/test/CodeGen/X86/combine-bitselect.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 78015544c232..14b8872b2ebb 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -6089,8 +6089,8 @@ static SDValue getExtendInVec(unsigned Opcode, const SDLoc &DL, EVT VT,
// Match (xor X, -1) -> X.
// Match extract_subvector(xor X, -1) -> extract_subvector(X).
// Match concat_vectors(xor X, -1, xor Y, -1) -> concat_vectors(X, Y).
-static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
- V = peekThroughBitcasts(V);
+static SDValue IsNOT(SDValue V, SelectionDAG &DAG, bool OneUse = false) {
+ V = OneUse ? peekThroughOneUseBitcasts(V) : peekThroughBitcasts(V);
if (V.getOpcode() == ISD::XOR &&
ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()))
return V.getOperand(0);
@@ -35455,6 +35455,31 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
return R;
+ // Canonicalize UNARYSHUFFLE(XOR(X,-1)) -> XOR(UNARYSHUFFLE(X),-1) to
+ // help expose the 'NOT' pattern further up the DAG.
+ // TODO: This might be beneficial for any binop with a 'splattable' operand.
+ switch (Opcode) {
+ case X86ISD::MOVDDUP:
+ case X86ISD::PSHUFD: {
+ SDValue Src = N.getOperand(0);
+ if (Src.hasOneUse() && Src.getValueType() == VT) {
+ if (SDValue Not = IsNOT(Src, DAG, /*OneUse*/ true)) {
+ Not = DAG.getBitcast(VT, Not);
+ Not = Opcode == X86ISD::MOVDDUP
+ ? DAG.getNode(Opcode, DL, VT, Not)
+ : DAG.getNode(Opcode, DL, VT, Not, N.getOperand(1));
+ EVT IntVT = Not.getValueType().changeTypeToInteger();
+ SDValue AllOnes = DAG.getConstant(-1, DL, IntVT);
+ Not = DAG.getBitcast(IntVT, Not);
+ Not = DAG.getNode(ISD::XOR, DL, IntVT, Not, AllOnes);
+ return DAG.getBitcast(VT, Not);
+ }
+ }
+ break;
+ }
+ }
+
+ // Handle specific target shuffles.
switch (Opcode) {
case X86ISD::MOVDDUP: {
SDValue Src = N.getOperand(0);
diff --git a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll
index c782d9b5bcf0..719bd9f9d95f 100644
--- a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll
+++ b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll
@@ -1575,11 +1575,9 @@ define <2 x i32> @narrow_cmp_select_reverse(<2 x i64> %x, <2 x i32> %y) nounwind
; AVX512: ## %bb.0:
; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xef,0xd2]
; AVX512-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x29,0xc2]
-; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ## encoding: [0x62,0xf3,0xfd,0x48,0x25,0xc0,0x0f]
; AVX512-NEXT: vpshufd $232, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x70,0xc0,0xe8]
; AVX512-NEXT: ## xmm0 = xmm0[0,2,2,3]
-; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdb,0xc1]
-; AVX512-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; AVX512-NEXT: vpandn %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdf,0xc1]
; AVX512-NEXT: retq ## encoding: [0xc3]
;
; SKX-LABEL: narrow_cmp_select_reverse:
diff --git a/llvm/test/CodeGen/X86/combine-bitselect.ll b/llvm/test/CodeGen/X86/combine-bitselect.ll
index bd4c65b77197..a2d55ea8cf0b 100644
--- a/llvm/test/CodeGen/X86/combine-bitselect.ll
+++ b/llvm/test/CodeGen/X86/combine-bitselect.ll
@@ -186,36 +186,25 @@ define <2 x i64> @bitselect_v2i64_broadcast_rrr(<2 x i64> %a0, <2 x i64> %a1, i6
; SSE-LABEL: bitselect_v2i64_broadcast_rrr:
; SSE: # %bb.0:
; SSE-NEXT: movq %rdi, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,1,0,1]
-; SSE-NEXT: pcmpeqd %xmm4, %xmm4
-; SSE-NEXT: pxor %xmm2, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,1,0,1]
-; SSE-NEXT: pand %xmm3, %xmm0
-; SSE-NEXT: pand %xmm1, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: pandn %xmm1, %xmm2
; SSE-NEXT: por %xmm2, %xmm0
; SSE-NEXT: retq
;
; XOP-LABEL: bitselect_v2i64_broadcast_rrr:
; XOP: # %bb.0:
; XOP-NEXT: vmovq %rdi, %xmm2
-; XOP-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,1]
-; XOP-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
-; XOP-NEXT: vpxor %xmm4, %xmm2, %xmm2
; XOP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
-; XOP-NEXT: vpand %xmm3, %xmm0, %xmm0
-; XOP-NEXT: vpand %xmm2, %xmm1, %xmm1
-; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
+; XOP-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX1-LABEL: bitselect_v2i64_broadcast_rrr:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovq %rdi, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,1]
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
-; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpandn %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
@@ -256,37 +245,28 @@ define <2 x i64> @bitselect_v2i64_broadcast_rrm(<2 x i64> %a0, <2 x i64> %a1, i6
; SSE-LABEL: bitselect_v2i64_broadcast_rrm:
; SSE: # %bb.0:
; SSE-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,1,0,1]
-; SSE-NEXT: pcmpeqd %xmm4, %xmm4
-; SSE-NEXT: pxor %xmm2, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,1,0,1]
-; SSE-NEXT: pand %xmm3, %xmm0
-; SSE-NEXT: pand %xmm1, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: pandn %xmm1, %xmm2
; SSE-NEXT: por %xmm2, %xmm0
; SSE-NEXT: retq
;
; XOP-LABEL: bitselect_v2i64_broadcast_rrm:
; XOP: # %bb.0:
-; XOP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
-; XOP-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,1]
-; XOP-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
-; XOP-NEXT: vpxor %xmm4, %xmm2, %xmm2
-; XOP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
-; XOP-NEXT: vpand %xmm3, %xmm0, %xmm0
-; XOP-NEXT: vpand %xmm2, %xmm1, %xmm1
-; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
+; XOP-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
+; XOP-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,0,1]
+; XOP-NEXT: vandps %xmm2, %xmm0, %xmm0
+; XOP-NEXT: vandnps %xmm1, %xmm2, %xmm1
+; XOP-NEXT: vorps %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX1-LABEL: bitselect_v2i64_broadcast_rrm:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,1]
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
-; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
+; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,0,1]
+; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vandnps %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: bitselect_v2i64_broadcast_rrm:
@@ -524,16 +504,14 @@ define <4 x i64> @bitselect_v4i64_broadcast_rrr(<4 x i64> %a0, <4 x i64> %a1, i6
; SSE-LABEL: bitselect_v4i64_broadcast_rrr:
; SSE: # %bb.0:
; SSE-NEXT: movq %rdi, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,1,0,1]
-; SSE-NEXT: pcmpeqd %xmm6, %xmm6
-; SSE-NEXT: pxor %xmm4, %xmm6
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,1,0,1]
-; SSE-NEXT: pand %xmm5, %xmm1
-; SSE-NEXT: pand %xmm5, %xmm0
-; SSE-NEXT: pand %xmm4, %xmm3
-; SSE-NEXT: por %xmm3, %xmm1
-; SSE-NEXT: pand %xmm4, %xmm2
-; SSE-NEXT: por %xmm2, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
+; SSE-NEXT: pand %xmm4, %xmm1
+; SSE-NEXT: pand %xmm4, %xmm0
+; SSE-NEXT: movdqa %xmm4, %xmm5
+; SSE-NEXT: pandn %xmm3, %xmm5
+; SSE-NEXT: por %xmm5, %xmm1
+; SSE-NEXT: pandn %xmm2, %xmm4
+; SSE-NEXT: por %xmm4, %xmm0
; SSE-NEXT: retq
;
; XOP-LABEL: bitselect_v4i64_broadcast_rrr:
@@ -542,12 +520,10 @@ define <4 x i64> @bitselect_v4i64_broadcast_rrr(<4 x i64> %a0, <4 x i64> %a1, i6
; XOP-NEXT: vmovq %rdi, %xmm3
; XOP-NEXT: vmovddup {{.*#+}} xmm2 = xmm2[0,0]
; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2
-; XOP-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
-; XOP-NEXT: vpxor %xmm4, %xmm3, %xmm3
; XOP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1]
; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3
; XOP-NEXT: vandps %ymm2, %ymm0, %ymm0
-; XOP-NEXT: vandps %ymm3, %ymm1, %ymm1
+; XOP-NEXT: vandnps %ymm1, %ymm3, %ymm1
; XOP-NEXT: vorps %ymm1, %ymm0, %ymm0
; XOP-NEXT: retq
;
@@ -557,12 +533,10 @@ define <4 x i64> @bitselect_v4i64_broadcast_rrr(<4 x i64> %a0, <4 x i64> %a1, i6
; AVX1-NEXT: vmovq %rdi, %xmm3
; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = xmm2[0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT: vandnps %ymm1, %ymm3, %ymm1
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -603,43 +577,31 @@ define <4 x i64> @bitselect_v4i64_broadcast_rrm(<4 x i64> %a0, <4 x i64> %a1, i6
; SSE-LABEL: bitselect_v4i64_broadcast_rrm:
; SSE: # %bb.0:
; SSE-NEXT: movq {{.*#+}} xmm4 = mem[0],zero
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,1,0,1]
-; SSE-NEXT: pcmpeqd %xmm6, %xmm6
-; SSE-NEXT: pxor %xmm4, %xmm6
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,1,0,1]
-; SSE-NEXT: pand %xmm5, %xmm1
-; SSE-NEXT: pand %xmm5, %xmm0
-; SSE-NEXT: pand %xmm4, %xmm3
-; SSE-NEXT: por %xmm3, %xmm1
-; SSE-NEXT: pand %xmm4, %xmm2
-; SSE-NEXT: por %xmm2, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
+; SSE-NEXT: pand %xmm4, %xmm1
+; SSE-NEXT: pand %xmm4, %xmm0
+; SSE-NEXT: movdqa %xmm4, %xmm5
+; SSE-NEXT: pandn %xmm3, %xmm5
+; SSE-NEXT: por %xmm5, %xmm1
+; SSE-NEXT: pandn %xmm2, %xmm4
+; SSE-NEXT: por %xmm4, %xmm0
; SSE-NEXT: retq
;
; XOP-LABEL: bitselect_v4i64_broadcast_rrm:
; XOP: # %bb.0:
-; XOP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
-; XOP-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,1]
-; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3
-; XOP-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
-; XOP-NEXT: vpxor %xmm4, %xmm2, %xmm2
-; XOP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
-; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2
-; XOP-NEXT: vandps %ymm3, %ymm0, %ymm0
-; XOP-NEXT: vandps %ymm2, %ymm1, %ymm1
+; XOP-NEXT: vbroadcastsd (%rdi), %ymm2
+; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm3
+; XOP-NEXT: vandps %ymm2, %ymm0, %ymm0
+; XOP-NEXT: vandnps %ymm1, %ymm3, %ymm1
; XOP-NEXT: vorps %ymm1, %ymm0, %ymm0
; XOP-NEXT: retq
;
; AVX1-LABEL: bitselect_v4i64_broadcast_rrm:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2
-; AVX1-NEXT: vandps %ymm3, %ymm0, %ymm0
-; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vbroadcastsd (%rdi), %ymm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm3
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vandnps %ymm1, %ymm3, %ymm1
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -922,22 +884,22 @@ define <8 x i64> @bitselect_v8i64_broadcast_rrr(<8 x i64> %a0, <8 x i64> %a1, i6
; SSE-LABEL: bitselect_v8i64_broadcast_rrr:
; SSE: # %bb.0:
; SSE-NEXT: movq %rdi, %xmm8
-; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,1,0,1]
-; SSE-NEXT: pcmpeqd %xmm10, %xmm10
-; SSE-NEXT: pxor %xmm8, %xmm10
-; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm10[0,1,0,1]
-; SSE-NEXT: pand %xmm9, %xmm3
-; SSE-NEXT: pand %xmm9, %xmm2
-; SSE-NEXT: pand %xmm9, %xmm1
-; SSE-NEXT: pand %xmm9, %xmm0
-; SSE-NEXT: pand %xmm8, %xmm7
-; SSE-NEXT: por %xmm7, %xmm3
-; SSE-NEXT: pand %xmm8, %xmm6
-; SSE-NEXT: por %xmm6, %xmm2
-; SSE-NEXT: pand %xmm8, %xmm5
-; SSE-NEXT: por %xmm5, %xmm1
-; SSE-NEXT: pand %xmm8, %xmm4
-; SSE-NEXT: por %xmm4, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,1]
+; SSE-NEXT: pand %xmm8, %xmm3
+; SSE-NEXT: pand %xmm8, %xmm2
+; SSE-NEXT: pand %xmm8, %xmm1
+; SSE-NEXT: pand %xmm8, %xmm0
+; SSE-NEXT: movdqa %xmm8, %xmm9
+; SSE-NEXT: pandn %xmm7, %xmm9
+; SSE-NEXT: por %xmm9, %xmm3
+; SSE-NEXT: movdqa %xmm8, %xmm7
+; SSE-NEXT: pandn %xmm6, %xmm7
+; SSE-NEXT: por %xmm7, %xmm2
+; SSE-NEXT: movdqa %xmm8, %xmm6
+; SSE-NEXT: pandn %xmm5, %xmm6
+; SSE-NEXT: por %xmm6, %xmm1
+; SSE-NEXT: pandn %xmm4, %xmm8
+; SSE-NEXT: por %xmm8, %xmm0
; SSE-NEXT: retq
;
; XOP-LABEL: bitselect_v8i64_broadcast_rrr:
@@ -946,15 +908,13 @@ define <8 x i64> @bitselect_v8i64_broadcast_rrr(<8 x i64> %a0, <8 x i64> %a1, i6
; XOP-NEXT: vmovq %rdi, %xmm5
; XOP-NEXT: vmovddup {{.*#+}} xmm4 = xmm4[0,0]
; XOP-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4
-; XOP-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6
-; XOP-NEXT: vpxor %xmm6, %xmm5, %xmm5
; XOP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,1]
; XOP-NEXT: vinsertf128 $1, %xmm5, %ymm5, %ymm5
; XOP-NEXT: vandps %ymm4, %ymm1, %ymm1
; XOP-NEXT: vandps %ymm4, %ymm0, %ymm0
-; XOP-NEXT: vandps %ymm5, %ymm3, %ymm3
+; XOP-NEXT: vandnps %ymm3, %ymm5, %ymm3
; XOP-NEXT: vorps %ymm3, %ymm1, %ymm1
-; XOP-NEXT: vandps %ymm5, %ymm2, %ymm2
+; XOP-NEXT: vandnps %ymm2, %ymm5, %ymm2
; XOP-NEXT: vorps %ymm2, %ymm0, %ymm0
; XOP-NEXT: retq
;
@@ -964,15 +924,13 @@ define <8 x i64> @bitselect_v8i64_broadcast_rrr(<8 x i64> %a0, <8 x i64> %a1, i6
; AVX1-NEXT: vmovq %rdi, %xmm5
; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = xmm4[0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4
-; AVX1-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6
-; AVX1-NEXT: vpxor %xmm6, %xmm5, %xmm5
; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm5, %ymm5
; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
-; AVX1-NEXT: vandps %ymm5, %ymm3, %ymm3
+; AVX1-NEXT: vandnps %ymm3, %ymm5, %ymm3
; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT: vandps %ymm5, %ymm2, %ymm2
+; AVX1-NEXT: vandnps %ymm2, %ymm5, %ymm2
; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -1007,55 +965,45 @@ define <8 x i64> @bitselect_v8i64_broadcast_rrm(<8 x i64> %a0, <8 x i64> %a1, i6
; SSE-LABEL: bitselect_v8i64_broadcast_rrm:
; SSE: # %bb.0:
; SSE-NEXT: movq {{.*#+}} xmm8 = mem[0],zero
-; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,1,0,1]
-; SSE-NEXT: pcmpeqd %xmm10, %xmm10
-; SSE-NEXT: pxor %xmm8, %xmm10
-; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm10[0,1,0,1]
-; SSE-NEXT: pand %xmm9, %xmm3
-; SSE-NEXT: pand %xmm9, %xmm2
-; SSE-NEXT: pand %xmm9, %xmm1
-; SSE-NEXT: pand %xmm9, %xmm0
-; SSE-NEXT: pand %xmm8, %xmm7
-; SSE-NEXT: por %xmm7, %xmm3
-; SSE-NEXT: pand %xmm8, %xmm6
-; SSE-NEXT: por %xmm6, %xmm2
-; SSE-NEXT: pand %xmm8, %xmm5
-; SSE-NEXT: por %xmm5, %xmm1
-; SSE-NEXT: pand %xmm8, %xmm4
-; SSE-NEXT: por %xmm4, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,1]
+; SSE-NEXT: pand %xmm8, %xmm3
+; SSE-NEXT: pand %xmm8, %xmm2
+; SSE-NEXT: pand %xmm8, %xmm1
+; SSE-NEXT: pand %xmm8, %xmm0
+; SSE-NEXT: movdqa %xmm8, %xmm9
+; SSE-NEXT: pandn %xmm7, %xmm9
+; SSE-NEXT: por %xmm9, %xmm3
+; SSE-NEXT: movdqa %xmm8, %xmm7
+; SSE-NEXT: pandn %xmm6, %xmm7
+; SSE-NEXT: por %xmm7, %xmm2
+; SSE-NEXT: movdqa %xmm8, %xmm6
+; SSE-NEXT: pandn %xmm5, %xmm6
+; SSE-NEXT: por %xmm6, %xmm1
+; SSE-NEXT: pandn %xmm4, %xmm8
+; SSE-NEXT: por %xmm8, %xmm0
; SSE-NEXT: retq
;
; XOP-LABEL: bitselect_v8i64_broadcast_rrm:
; XOP: # %bb.0:
-; XOP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
-; XOP-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,1,0,1]
-; XOP-NEXT: vinsertf128 $1, %xmm5, %ymm5, %ymm5
-; XOP-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6
-; XOP-NEXT: vpxor %xmm6, %xmm4, %xmm4
-; XOP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
-; XOP-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4
-; XOP-NEXT: vandps %ymm5, %ymm1, %ymm1
-; XOP-NEXT: vandps %ymm5, %ymm0, %ymm0
-; XOP-NEXT: vandps %ymm4, %ymm3, %ymm3
+; XOP-NEXT: vbroadcastsd (%rdi), %ymm4
+; XOP-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm5
+; XOP-NEXT: vandps %ymm4, %ymm1, %ymm1
+; XOP-NEXT: vandps %ymm4, %ymm0, %ymm0
+; XOP-NEXT: vandnps %ymm3, %ymm5, %ymm3
; XOP-NEXT: vorps %ymm3, %ymm1, %ymm1
-; XOP-NEXT: vandps %ymm4, %ymm2, %ymm2
+; XOP-NEXT: vandnps %ymm2, %ymm5, %ymm2
; XOP-NEXT: vorps %ymm2, %ymm0, %ymm0
; XOP-NEXT: retq
;
; AVX1-LABEL: bitselect_v8i64_broadcast_rrm:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,1,0,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm5, %ymm5
-; AVX1-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6
-; AVX1-NEXT: vpxor %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4
-; AVX1-NEXT: vandps %ymm5, %ymm1, %ymm1
-; AVX1-NEXT: vandps %ymm5, %ymm0, %ymm0
-; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
+; AVX1-NEXT: vbroadcastsd (%rdi), %ymm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm5
+; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
+; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
+; AVX1-NEXT: vandnps %ymm3, %ymm5, %ymm3
; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
+; AVX1-NEXT: vandnps %ymm2, %ymm5, %ymm2
; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
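For context on the test churn above, here is a hedged scalar sketch of the pattern combine-bitselect.ll exercises (the function name and loop form are illustrative, not copied from the test file): a bit-select whose mask is a broadcast scalar. With the splat-of-NOT rewritten as NOT-of-splat, the AND of the inverted mask is absorbed into a single ANDN (the pandn / vandnps instructions in the checks above).

#include <cstdint>

// Select bits of 'a' where 'mask' is 1 and bits of 'b' where it is 0,
// reusing the same broadcast mask for every element.
void bitselect_broadcast(uint64_t a[2], const uint64_t b[2], uint64_t mask) {
  for (int i = 0; i < 2; ++i)
    a[i] = (a[i] & mask) | (b[i] & ~mask); // '~mask & b' maps to ANDN after the splat
}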