[llvm] 15dd5ed - [X86] Support ANDNP combine through vector_shuffle
Phoebe Wang via llvm-commits
llvm-commits at lists.llvm.org
Thu Dec 22 01:30:13 PST 2022
Author: Evgenii Kudriashov
Date: 2022-12-22T16:55:14+08:00
New Revision: 15dd5ed96cf1ecd29842ef6d3bc9b5eea12dccfd
URL: https://github.com/llvm/llvm-project/commit/15dd5ed96cf1ecd29842ef6d3bc9b5eea12dccfd
DIFF: https://github.com/llvm/llvm-project/commit/15dd5ed96cf1ecd29842ef6d3bc9b5eea12dccfd.diff
LOG: [X86] Support ANDNP combine through vector_shuffle
Combine
```
and (vector_shuffle<Z,...,Z>
(insert_vector_elt undef, (xor X, -1), Z), undef), Y
->
andnp (vector_shuffle<Z,...,Z>
(insert_vector_elt undef, X, Z), undef), Y
```
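For illustration, the IR shape this targets looks roughly like the sketch below, modelled on the `neg_scalar_broadcast_*` tests in `llvm/test/CodeGen/X86/combine-and.ll` updated by this patch (the function name here is made up, not taken verbatim from the test file). With the combine, the scalar `not` is absorbed into the broadcasted operand, so the final `and` lowers to an `andn*` instruction instead of a scalar `not` followed by a plain vector `and`:
```
; Illustrative sketch: a scalar NOT, broadcast to every lane, then ANDed
; with a vector operand.
define <4 x i64> @splat_not_then_and(i64 %a0, <4 x i64> %a1) {
  %not = xor i64 %a0, -1                              ; scalar (xor X, -1)
  %ins = insertelement <4 x i64> undef, i64 %not, i64 0   ; insert_vector_elt
  %splat = shufflevector <4 x i64> %ins, <4 x i64> undef, <4 x i32> zeroinitializer ; vector_shuffle<0,0,0,0>
  %r = and <4 x i64> %splat, %a1
  ret <4 x i64> %r
}
```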
Reviewed By: RKSimon, pengfei
Differential Revision: https://reviews.llvm.org/D138521
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/combine-and.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 096b457e91225..119d96f75ef37 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -7038,7 +7038,8 @@ static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT,
static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
V = peekThroughBitcasts(V);
if (V.getOpcode() == ISD::XOR &&
- ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()))
+ (ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()) ||
+ isAllOnesConstant(V.getOperand(1))))
return V.getOperand(0);
if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
(isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
@@ -48177,7 +48178,7 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
static SDValue combineAndNotIntoANDNP(SDNode *N, SelectionDAG &DAG) {
- assert(N->getOpcode() == ISD::AND);
+ assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
MVT VT = N->getSimpleValueType(0);
if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
@@ -48187,23 +48188,69 @@ static SDValue combineAndNotIntoANDNP(SDNode *N, SelectionDAG &DAG) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
- auto GetNot = [&VT, &DAG](SDValue V) {
- // Basic X = NOT(Y) detection.
- if (SDValue Not = IsNOT(V, DAG))
- return Not;
- // Fold BROADCAST(NOT(Y)) -> BROADCAST(Y).
- if (V.getOpcode() == X86ISD::VBROADCAST) {
- SDValue Src = V.getOperand(0);
- EVT SrcVT = Src.getValueType();
- if (!SrcVT.isVector())
- return SDValue();
- if (SDValue Not = IsNOT(Src, DAG))
- return DAG.getNode(X86ISD::VBROADCAST, SDLoc(V), VT,
- DAG.getBitcast(SrcVT, Not));
+ if (SDValue Not = IsNOT(N0, DAG)) {
+ X = Not;
+ Y = N1;
+ } else if (SDValue Not = IsNOT(N1, DAG)) {
+ X = Not;
+ Y = N0;
+ } else
+ return SDValue();
+
+ X = DAG.getBitcast(VT, X);
+ Y = DAG.getBitcast(VT, Y);
+ return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);
+}
+
+/// Try to fold:
+/// and (vector_shuffle<Z,...,Z>
+/// (insert_vector_elt undef, (xor X, -1), Z), undef), Y
+/// ->
+/// andnp (vector_shuffle<Z,...,Z>
+/// (insert_vector_elt undef, X, Z), undef), Y
+static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
+
+ EVT VT = N->getValueType(0);
+ // Do not split 256 and 512 bit vectors with SSE2 as they overwrite original
+ // value and require extra moves.
+ if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
+ ((VT.is256BitVector() || VT.is512BitVector()) && Subtarget.hasAVX())))
+ return SDValue();
+
+ auto GetNot = [&DAG](SDValue V) {
+ auto *SVN = dyn_cast<ShuffleVectorSDNode>(peekThroughOneUseBitcasts(V));
+ // TODO: SVN->hasOneUse() is a strong condition. It can be relaxed if all
+ // end-users are ISD::AND including cases
+ // (and(extract_vector_element(SVN), Y)).
+ if (!SVN || !SVN->hasOneUse() || !SVN->isSplat() ||
+ !SVN->getOperand(1).isUndef()) {
+ return SDValue();
+ }
+ SDValue IVEN = SVN->getOperand(0);
+ if (IVEN.getOpcode() != ISD::INSERT_VECTOR_ELT ||
+ !IVEN.getOperand(0).isUndef() || !IVEN.hasOneUse())
+ return SDValue();
+ if (!isa<ConstantSDNode>(IVEN.getOperand(2)) ||
+ IVEN.getConstantOperandAPInt(2) != SVN->getSplatIndex())
+ return SDValue();
+ SDValue Src = IVEN.getOperand(1);
+ if (SDValue Not = IsNOT(Src, DAG)) {
+ SDValue NotSrc = DAG.getBitcast(Src.getValueType(), Not);
+ SDValue NotIVEN =
+ DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(IVEN), IVEN.getValueType(),
+ IVEN.getOperand(0), NotSrc, IVEN.getOperand(2));
+ return DAG.getVectorShuffle(SVN->getValueType(0), SDLoc(SVN), NotIVEN,
+ SVN->getOperand(1), SVN->getMask());
}
return SDValue();
};
+ SDValue X, Y;
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
if (SDValue Not = GetNot(N0)) {
X = Not;
Y = N1;
@@ -48215,7 +48262,20 @@ static SDValue combineAndNotIntoANDNP(SDNode *N, SelectionDAG &DAG) {
X = DAG.getBitcast(VT, X);
Y = DAG.getBitcast(VT, Y);
- return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);
+ SDLoc DL(N);
+ // We do not split for SSE at all, but we need to split vectors for AVX1 and
+ // AVX2.
+ if (!Subtarget.useAVX512Regs() && VT.is512BitVector()) {
+ SDValue LoX, HiX;
+ std::tie(LoX, HiX) = splitVector(X, DAG, DL);
+ SDValue LoY, HiY;
+ std::tie(LoY, HiY) = splitVector(Y, DAG, DL);
+ EVT SplitVT = LoX.getValueType();
+ SDValue LoV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {LoX, LoY});
+ SDValue HiV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {HiX, HiY});
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, {LoV, HiV});
+ }
+ return DAG.getNode(X86ISD::ANDNP, DL, VT, {X, Y});
}
// Try to widen AND, OR and XOR nodes to VT in order to remove casts around
@@ -48795,6 +48855,9 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
return FPLogic;
+ if (SDValue R = combineAndShuffleNot(N, DAG, Subtarget))
+ return R;
+
if (DCI.isBeforeLegalizeOps())
return SDValue();
diff --git a/llvm/test/CodeGen/X86/combine-and.ll b/llvm/test/CodeGen/X86/combine-and.ll
index 09abb86c5e1a4..2c51971bfadc3 100644
--- a/llvm/test/CodeGen/X86/combine-and.ll
+++ b/llvm/test/CodeGen/X86/combine-and.ll
@@ -519,9 +519,8 @@ define <8 x i64> @neg_scalar_broadcast_v8i64_arg(i64 %a0, <8 x i64> %a1) {
;
; AVX512-LABEL: neg_scalar_broadcast_v8i64_arg:
; AVX512: # %bb.0:
-; AVX512-NEXT: notq %rdi
; AVX512-NEXT: vpbroadcastq %rdi, %zmm1
-; AVX512-NEXT: vpandq %zmm0, %zmm1, %zmm0
+; AVX512-NEXT: vpandnq %zmm0, %zmm1, %zmm0
; AVX512-NEXT: retq
%1 = xor i64 %a0, -1
%2 = insertelement <8 x i64> undef, i64 %1, i64 0
@@ -549,38 +548,35 @@ define <8 x i64> @neg_scalar_broadcast_v8i64(i64 %a0, <2 x i64> %a1) {
; AVX1-LABEL: neg_scalar_broadcast_v8i64:
; AVX1: # %bb.0:
; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX1-NEXT: notq %rdi
-; AVX1-NEXT: vmovq %rdi, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
-; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,1,0,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,3]
-; AVX1-NEXT: vandpd %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vandpd %ymm1, %ymm2, %ymm1
+; AVX1-NEXT: vmovq %rdi, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2
+; AVX1-NEXT: vandnpd %ymm0, %ymm2, %ymm0
+; AVX1-NEXT: vandnpd %ymm1, %ymm2, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: neg_scalar_broadcast_v8i64:
; AVX2: # %bb.0:
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX2-NEXT: notq %rdi
-; AVX2-NEXT: vmovq %rdi, %xmm1
-; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,0]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,0]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,1,1]
-; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vmovq %rdi, %xmm2
+; AVX2-NEXT: vpbroadcastq %xmm2, %ymm2
+; AVX2-NEXT: vpandn %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpandn %ymm1, %ymm2, %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: neg_scalar_broadcast_v8i64:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512-NEXT: notq %rdi
; AVX512-NEXT: vpbroadcastq %rdi, %zmm1
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,0,1,1,0,1,0,0]
; AVX512-NEXT: vpermq %zmm0, %zmm2, %zmm0
-; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpandnq %zmm0, %zmm1, %zmm0
; AVX512-NEXT: retq
%1 = xor i64 %a0, -1
%2 = insertelement <8 x i64> undef, i64 %1, i64 0
@@ -602,26 +598,23 @@ define <4 x i64> @neg_scalar_broadcast_v4i64_arg(i64 %a0, <4 x i64> %a1) {
;
; AVX1-LABEL: neg_scalar_broadcast_v4i64_arg:
; AVX1: # %bb.0:
-; AVX1-NEXT: notq %rdi
; AVX1-NEXT: vmovq %rdi, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
-; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT: vandnps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: neg_scalar_broadcast_v4i64_arg:
; AVX2: # %bb.0:
-; AVX2-NEXT: notq %rdi
; AVX2-NEXT: vmovq %rdi, %xmm1
; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1
-; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpandn %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: neg_scalar_broadcast_v4i64_arg:
; AVX512: # %bb.0:
-; AVX512-NEXT: notq %rdi
; AVX512-NEXT: vpbroadcastq %rdi, %ymm1
-; AVX512-NEXT: vpand %ymm0, %ymm1, %ymm0
+; AVX512-NEXT: vpandn %ymm0, %ymm1, %ymm0
; AVX512-NEXT: retq
%1 = xor i64 %a0, -1
%2 = insertelement <4 x i64> undef, i64 %1, i64 0
@@ -645,32 +638,29 @@ define <4 x i64> @neg_scalar_broadcast_v4i64(i64 %a0, <2 x i64> %a1) {
; AVX1-LABEL: neg_scalar_broadcast_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX1-NEXT: notq %rdi
; AVX1-NEXT: vmovq %rdi, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,3]
-; AVX1-NEXT: vandpd %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vandnpd %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: neg_scalar_broadcast_v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX2-NEXT: notq %rdi
; AVX2-NEXT: vmovq %rdi, %xmm1
; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,1,1]
-; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpandn %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: neg_scalar_broadcast_v4i64:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX512-NEXT: notq %rdi
; AVX512-NEXT: vpbroadcastq %rdi, %ymm1
; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,1,1]
-; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vpandn %ymm0, %ymm1, %ymm0
; AVX512-NEXT: retq
%1 = xor i64 %a0, -1
%2 = insertelement <4 x i64> undef, i64 %1, i64 0
@@ -683,33 +673,30 @@ define <4 x i64> @neg_scalar_broadcast_v4i64(i64 %a0, <2 x i64> %a1) {
define <2 x i64> @neg_scalar_broadcast_v2i64(i64 %a0, <2 x i64> %a1) {
; SSE-LABEL: neg_scalar_broadcast_v2i64:
; SSE: # %bb.0:
-; SSE-NEXT: notq %rdi
; SSE-NEXT: movq %rdi, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; SSE-NEXT: pand %xmm1, %xmm0
+; SSE-NEXT: pandn %xmm0, %xmm1
+; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: neg_scalar_broadcast_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: notq %rdi
; AVX1-NEXT: vmovq %rdi, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpandn %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: neg_scalar_broadcast_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: notq %rdi
; AVX2-NEXT: vmovq %rdi, %xmm1
; AVX2-NEXT: vpbroadcastq %xmm1, %xmm1
-; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpandn %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: neg_scalar_broadcast_v2i64:
; AVX512: # %bb.0:
-; AVX512-NEXT: notq %rdi
; AVX512-NEXT: vpbroadcastq %rdi, %xmm1
-; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: vpandn %xmm0, %xmm1, %xmm0
; AVX512-NEXT: retq
%1 = xor i64 %a0, -1
%2 = insertelement <2 x i64> undef, i64 %1, i64 0
@@ -762,26 +749,23 @@ define <8 x i32> @neg_scalar_broadcast_v8i32(i32 %a0, <8 x i32> %a1) {
;
; AVX1-LABEL: neg_scalar_broadcast_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: notl %edi
; AVX1-NEXT: vmovd %edi, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
-; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT: vandnps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: neg_scalar_broadcast_v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: notl %edi
; AVX2-NEXT: vmovd %edi, %xmm1
; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1
-; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpandn %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: neg_scalar_broadcast_v8i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: notl %edi
; AVX512-NEXT: vpbroadcastd %edi, %ymm1
-; AVX512-NEXT: vpand %ymm0, %ymm1, %ymm0
+; AVX512-NEXT: vpandn %ymm0, %ymm1, %ymm0
; AVX512-NEXT: retq
%1 = xor i32 %a0, -1
%2 = insertelement <8 x i32> undef, i32 %1, i64 0
@@ -793,35 +777,32 @@ define <8 x i32> @neg_scalar_broadcast_v8i32(i32 %a0, <8 x i32> %a1) {
define <8 x i16> @neg_scalar_broadcast_v8i16(i16 %a0, <8 x i16> %a1) {
; SSE-LABEL: neg_scalar_broadcast_v8i16:
; SSE: # %bb.0:
-; SSE-NEXT: notl %edi
; SSE-NEXT: movd %edi, %xmm1
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; SSE-NEXT: pand %xmm1, %xmm0
+; SSE-NEXT: pandn %xmm0, %xmm1
+; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: neg_scalar_broadcast_v8i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: notl %edi
; AVX1-NEXT: vmovd %edi, %xmm1
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpandn %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: neg_scalar_broadcast_v8i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: notl %edi
; AVX2-NEXT: vmovd %edi, %xmm1
; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1
-; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpandn %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: neg_scalar_broadcast_v8i16:
; AVX512: # %bb.0:
-; AVX512-NEXT: notl %edi
; AVX512-NEXT: vpbroadcastw %edi, %xmm1
-; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: vpandn %xmm0, %xmm1, %xmm0
; AVX512-NEXT: retq
%1 = xor i16 %a0, -1
%2 = insertelement <8 x i16> undef, i16 %1, i64 0
@@ -833,36 +814,32 @@ define <8 x i16> @neg_scalar_broadcast_v8i16(i16 %a0, <8 x i16> %a1) {
define <16 x i8> @neg_scalar_broadcast_v16i8(i8 %a0, <16 x i8> %a1) {
; SSE-LABEL: neg_scalar_broadcast_v16i8:
; SSE: # %bb.0:
-; SSE-NEXT: notb %dil
-; SSE-NEXT: movzbl %dil, %eax
-; SSE-NEXT: movd %eax, %xmm1
+; SSE-NEXT: movd %edi, %xmm1
; SSE-NEXT: pxor %xmm2, %xmm2
; SSE-NEXT: pshufb %xmm2, %xmm1
-; SSE-NEXT: pand %xmm1, %xmm0
+; SSE-NEXT: pandn %xmm0, %xmm1
+; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: neg_scalar_broadcast_v16i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: notb %dil
; AVX1-NEXT: vmovd %edi, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpandn %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: neg_scalar_broadcast_v16i8:
; AVX2: # %bb.0:
-; AVX2-NEXT: notb %dil
; AVX2-NEXT: vmovd %edi, %xmm1
; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpandn %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: neg_scalar_broadcast_v16i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: notb %dil
; AVX512-NEXT: vpbroadcastb %edi, %xmm1
-; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: vpandn %xmm0, %xmm1, %xmm0
; AVX512-NEXT: retq
%1 = xor i8 %a0, -1
%2 = insertelement <16 x i8> undef, i8 %1, i64 0
@@ -907,9 +884,8 @@ define <64 x i8> @neg_scalar_broadcast_v64i8(i8 %a0, <64 x i8> %a1) {
;
; AVX512-LABEL: neg_scalar_broadcast_v64i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: notb %dil
; AVX512-NEXT: vpbroadcastb %edi, %zmm1
-; AVX512-NEXT: vpandq %zmm0, %zmm1, %zmm0
+; AVX512-NEXT: vpandnq %zmm0, %zmm1, %zmm0
; AVX512-NEXT: retq
%1 = xor i8 %a0, -1
%2 = insertelement <64 x i8> undef, i8 %1, i64 0
@@ -954,9 +930,8 @@ define <8 x i64> @neg_scalar_broadcast_v64i8_v8i64(i8 %a0, <8 x i64> %a1) {
;
; AVX512-LABEL: neg_scalar_broadcast_v64i8_v8i64:
; AVX512: # %bb.0:
-; AVX512-NEXT: notb %dil
; AVX512-NEXT: vpbroadcastb %edi, %zmm1
-; AVX512-NEXT: vpandq %zmm0, %zmm1, %zmm0
+; AVX512-NEXT: vpandnq %zmm0, %zmm1, %zmm0
; AVX512-NEXT: retq
%1 = xor i8 %a0, -1
%2 = insertelement <64 x i8> undef, i8 %1, i64 0
@@ -980,27 +955,24 @@ define <4 x i64> @neg_scalar_broadcast_v32i8_v4i64(i8 %a0, <4 x i64> %a1) {
;
; AVX1-LABEL: neg_scalar_broadcast_v32i8_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: notb %dil
; AVX1-NEXT: vmovd %edi, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
-; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT: vandnps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: neg_scalar_broadcast_v32i8_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: notb %dil
; AVX2-NEXT: vmovd %edi, %xmm1
; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
-; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpandn %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: neg_scalar_broadcast_v32i8_v4i64:
; AVX512: # %bb.0:
-; AVX512-NEXT: notb %dil
; AVX512-NEXT: vpbroadcastb %edi, %ymm1
-; AVX512-NEXT: vpand %ymm0, %ymm1, %ymm0
+; AVX512-NEXT: vpandn %ymm0, %ymm1, %ymm0
; AVX512-NEXT: retq
%1 = xor i8 %a0, -1
%2 = insertelement <32 x i8> undef, i8 %1, i64 0
@@ -1013,36 +985,32 @@ define <4 x i64> @neg_scalar_broadcast_v32i8_v4i64(i8 %a0, <4 x i64> %a1) {
define <2 x i64> @neg_scalar_broadcast_v16i8_v2i64(i8 %a0, <2 x i64> %a1) {
; SSE-LABEL: neg_scalar_broadcast_v16i8_v2i64:
; SSE: # %bb.0:
-; SSE-NEXT: notb %dil
-; SSE-NEXT: movzbl %dil, %eax
-; SSE-NEXT: movd %eax, %xmm1
+; SSE-NEXT: movd %edi, %xmm1
; SSE-NEXT: pxor %xmm2, %xmm2
; SSE-NEXT: pshufb %xmm2, %xmm1
-; SSE-NEXT: pand %xmm1, %xmm0
+; SSE-NEXT: pandn %xmm0, %xmm1
+; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: neg_scalar_broadcast_v16i8_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: notb %dil
; AVX1-NEXT: vmovd %edi, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpandn %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: neg_scalar_broadcast_v16i8_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: notb %dil
; AVX2-NEXT: vmovd %edi, %xmm1
; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpandn %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: neg_scalar_broadcast_v16i8_v2i64:
; AVX512: # %bb.0:
-; AVX512-NEXT: notb %dil
; AVX512-NEXT: vpbroadcastb %edi, %xmm1
-; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: vpandn %xmm0, %xmm1, %xmm0
; AVX512-NEXT: retq
%1 = xor i8 %a0, -1
%2 = insertelement <16 x i8> undef, i8 %1, i64 0
@@ -1064,26 +1032,23 @@ define <4 x i64> @neg_scalar_broadcast_v8i32_v4i64(i32 %a0, <4 x i64> %a1) {
;
; AVX1-LABEL: neg_scalar_broadcast_v8i32_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: notl %edi
; AVX1-NEXT: vmovd %edi, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
-; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT: vandnps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: neg_scalar_broadcast_v8i32_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: notl %edi
; AVX2-NEXT: vmovd %edi, %xmm1
; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1
-; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpandn %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: neg_scalar_broadcast_v8i32_v4i64:
; AVX512: # %bb.0:
-; AVX512-NEXT: notl %edi
; AVX512-NEXT: vpbroadcastd %edi, %ymm1
-; AVX512-NEXT: vpand %ymm0, %ymm1, %ymm0
+; AVX512-NEXT: vpandn %ymm0, %ymm1, %ymm0
; AVX512-NEXT: retq
%1 = xor i32 %a0, -1
%2 = insertelement <8 x i32> undef, i32 %1, i64 0