[llvm] r344348 - [X86][SSE] LowerVectorCTPOP - pull out repeated byte sum stage.
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Fri Oct 12 07:18:47 PDT 2018
Author: rksimon
Date: Fri Oct 12 07:18:47 2018
New Revision: 344348
URL: http://llvm.org/viewvc/llvm-project?rev=344348&view=rev
Log:
[X86][SSE] LowerVectorCTPOP - pull out repeated byte sum stage.
Pull out the repeated byte sum stage used when computing the popcount of vector elements wider than 8 bits.
This lets us simplify the LUT/BITMATH popcnt code to always assume vXi8 vectors, and also improves avx512bitalg codegen, which only has access to vpopcntb/vpopcntw.
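For intuition, here is a minimal scalar C++ sketch of the decomposition this makes explicit (illustrative only, not the LLVM code; popcnt8/popcnt64 are hypothetical names): count bits per byte with the same 16-entry nibble LUT the lowering uses, then run one shared byte-sum stage to produce the wider element count.

#include <cstdint>
#include <cstdio>

// Per-byte popcount via a 16-entry nibble LUT, as in the vXi8 lowering.
static uint8_t popcnt8(uint8_t x) {
  static const uint8_t LUT[16] = {0, 1, 1, 2, 1, 2, 2, 3,
                                  1, 2, 2, 3, 2, 3, 3, 4};
  return uint8_t(LUT[x & 0xF] + LUT[x >> 4]);
}

// Popcount of an i64 element = byte-wise popcounts + a horizontal byte sum
// (the vector lowering performs this sum with PSADBW against zero).
static uint64_t popcnt64(uint64_t x) {
  uint64_t Sum = 0;
  for (int I = 0; I != 8; ++I)
    Sum += popcnt8(uint8_t(x >> (8 * I)));
  return Sum;
}

int main() {
  printf("%u\n", unsigned(popcnt64(0x00F000F000F000F0ull))); // prints 16
}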
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/vec_ctbits.ll
llvm/trunk/test/CodeGen/X86/vector-lzcnt-128.ll
llvm/trunk/test/CodeGen/X86/vector-popcnt-128.ll
llvm/trunk/test/CodeGen/X86/vector-popcnt-256.ll
llvm/trunk/test/CodeGen/X86/vector-popcnt-512.ll
llvm/trunk/test/CodeGen/X86/vector-tzcnt-128.ll
llvm/trunk/test/CodeGen/X86/vector-tzcnt-256.ll
llvm/trunk/test/CodeGen/X86/vector-tzcnt-512.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=344348&r1=344347&r2=344348&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Fri Oct 12 07:18:47 2018
@@ -25023,7 +25023,8 @@ static SDValue LowerVectorCTPOPInRegLUT(
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
- unsigned VecSize = VT.getSizeInBits();
+ int NumElts = VT.getVectorNumElements();
+ assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");
// Implement a lookup table in register by using an algorithm based on:
// http://wm.ite.pl/articles/sse-popcount.html
@@ -25035,56 +25036,37 @@ static SDValue LowerVectorCTPOPInRegLUT(
// masked out higher ones) for each byte. PSHUFB is used separately with both
// to index the in-register table. Next, both are added and the result is a
// i8 vector where each element contains the pop count for input byte.
- //
- // To obtain the pop count for elements != i8, we follow up with the same
- // approach and use additional tricks as described below.
- //
const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
/* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
/* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
/* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
- int NumByteElts = VecSize / 8;
- MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts);
- SDValue In = DAG.getBitcast(ByteVecVT, Op);
SmallVector<SDValue, 64> LUTVec;
- for (int i = 0; i < NumByteElts; ++i)
+ for (int i = 0; i < NumElts; ++i)
LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
- SDValue InRegLUT = DAG.getBuildVector(ByteVecVT, DL, LUTVec);
- SDValue M0F = DAG.getConstant(0x0F, DL, ByteVecVT);
+ SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec);
+ SDValue M0F = DAG.getConstant(0x0F, DL, VT);
// High nibbles
- SDValue FourV = DAG.getConstant(4, DL, ByteVecVT);
- SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV);
+ SDValue FourV = DAG.getConstant(4, DL, VT);
+ SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV);
// Low nibbles
- SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F);
+ SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F);
// The input vector is used as the shuffle mask that index elements into the
// LUT. After counting low and high nibbles, add the vector to obtain the
// final pop count per i8 element.
- SDValue HighPopCnt =
- DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles);
- SDValue LowPopCnt =
- DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles);
- SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt);
-
- if (EltVT == MVT::i8)
- return PopCnt;
-
- return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG);
+ SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles);
+ SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles);
+ return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
}
static SDValue LowerVectorCTPOPBitmath(SDValue Op, const SDLoc &DL,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
- assert(VT.is128BitVector() &&
- "Only 128-bit vector bitmath lowering supported.");
-
- int VecSize = VT.getSizeInBits();
- MVT EltVT = VT.getVectorElementType();
- int Len = EltVT.getSizeInBits();
+ assert(VT == MVT::v16i8 && "Only v16i8 vector CTPOP lowering supported.");
// This is the vectorized version of the "best" algorithm from
// http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
@@ -25108,36 +25090,27 @@ static SDValue LowerVectorCTPOPBitmath(S
// x86, so set the SRL type to have elements at least i16 wide. This is
// correct because all of our SRLs are followed immediately by a mask anyways
// that handles any bits that sneak into the high bits of the byte elements.
- MVT SrlVT = Len > 8 ? VT : MVT::getVectorVT(MVT::i16, VecSize / 16);
-
+ MVT SrlVT = MVT::v8i16;
SDValue V = Op;
// v = v - ((v >> 1) & 0x55555555...)
SDValue Srl =
DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 1));
- SDValue And = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x55)));
+ SDValue And = GetMask(Srl, APInt(8, 0x55));
V = DAG.getNode(ISD::SUB, DL, VT, V, And);
// v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
- SDValue AndLHS = GetMask(V, APInt::getSplat(Len, APInt(8, 0x33)));
+ SDValue AndLHS = GetMask(V, APInt(8, 0x33));
Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 2));
- SDValue AndRHS = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x33)));
+ SDValue AndRHS = GetMask(Srl, APInt(8, 0x33));
V = DAG.getNode(ISD::ADD, DL, VT, AndLHS, AndRHS);
// v = (v + (v >> 4)) & 0x0F0F0F0F...
Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 4));
SDValue Add = DAG.getNode(ISD::ADD, DL, VT, V, Srl);
- V = GetMask(Add, APInt::getSplat(Len, APInt(8, 0x0F)));
+ V = GetMask(Add, APInt(8, 0x0F));
- // At this point, V contains the byte-wise population count, and we are
- // merely doing a horizontal sum if necessary to get the wider element
- // counts.
- if (EltVT == MVT::i8)
- return V;
-
- return LowerHorizontalByteSum(
- DAG.getBitcast(MVT::getVectorVT(MVT::i8, VecSize / 8), V), VT, Subtarget,
- DAG);
+ return V;
}
// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
@@ -25163,12 +25136,6 @@ static SDValue LowerVectorCTPOP(SDValue
}
}
- if (!Subtarget.hasSSSE3()) {
- // We can't use the fast LUT approach, so fall back on vectorized bitmath.
- assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!");
- return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG);
- }
-
// Decompose 256-bit ops into smaller 128-bit ops.
if (VT.is256BitVector() && !Subtarget.hasInt256())
return Lower256IntUnary(Op, DAG);
@@ -25177,6 +25144,18 @@ static SDValue LowerVectorCTPOP(SDValue
if (VT.is512BitVector() && !Subtarget.hasBWI())
return Lower512IntUnary(Op, DAG);
+ // For element types greater than i8, do vXi8 pop counts and a bytesum.
+ if (VT.getScalarType() != MVT::i8) {
+ MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
+ SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);
+ SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);
+ return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
+ }
+
+ // We can't use the fast LUT approach, so fall back on vectorized bitmath.
+ if (!Subtarget.hasSSSE3())
+ return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG);
+
return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
}
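The LowerHorizontalByteSum call above then widens the vXi8 counts back to the original element type. A scalar model of what that stage computes (an illustrative sketch, assuming a 128-bit register packed as 16 byte counts; byteSum is a hypothetical name) would be:

#include <cstdint>

// Sum the byte counts covered by element 'EltIdx' of width 'EltBits'.
// The vector lowering realizes this with PSADBW against zero for i64/i32
// lanes and a shift-by-8 + add for i16 lanes, matching the psadbw and
// psllw sequences visible in the updated tests below.
static uint64_t byteSum(const uint8_t Counts[16], int EltBits, int EltIdx) {
  uint64_t Sum = 0;
  for (int B = 0; B != EltBits / 8; ++B)
    Sum += Counts[EltIdx * (EltBits / 8) + B];
  return Sum;
}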
Modified: llvm/trunk/test/CodeGen/X86/vec_ctbits.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_ctbits.ll?rev=344348&r1=344347&r2=344348&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vec_ctbits.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vec_ctbits.ll Fri Oct 12 07:18:47 2018
@@ -15,18 +15,18 @@ define <2 x i64> @footz(<2 x i64> %a) no
; CHECK-NEXT: pcmpeqd %xmm3, %xmm3
; CHECK-NEXT: paddq %xmm2, %xmm3
; CHECK-NEXT: movdqa %xmm3, %xmm0
-; CHECK-NEXT: psrlq $1, %xmm0
+; CHECK-NEXT: psrlw $1, %xmm0
; CHECK-NEXT: pand {{.*}}(%rip), %xmm0
-; CHECK-NEXT: psubq %xmm0, %xmm3
-; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323]
+; CHECK-NEXT: psubb %xmm0, %xmm3
+; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; CHECK-NEXT: movdqa %xmm3, %xmm2
; CHECK-NEXT: pand %xmm0, %xmm2
-; CHECK-NEXT: psrlq $2, %xmm3
+; CHECK-NEXT: psrlw $2, %xmm3
; CHECK-NEXT: pand %xmm0, %xmm3
-; CHECK-NEXT: paddq %xmm2, %xmm3
+; CHECK-NEXT: paddb %xmm2, %xmm3
; CHECK-NEXT: movdqa %xmm3, %xmm0
-; CHECK-NEXT: psrlq $4, %xmm0
-; CHECK-NEXT: paddq %xmm3, %xmm0
+; CHECK-NEXT: psrlw $4, %xmm0
+; CHECK-NEXT: paddb %xmm3, %xmm0
; CHECK-NEXT: pand {{.*}}(%rip), %xmm0
; CHECK-NEXT: psadbw %xmm1, %xmm0
; CHECK-NEXT: retq
@@ -58,18 +58,18 @@ define <2 x i64> @foolz(<2 x i64> %a) no
; CHECK-NEXT: pcmpeqd %xmm1, %xmm1
; CHECK-NEXT: pxor %xmm0, %xmm1
; CHECK-NEXT: movdqa %xmm1, %xmm0
-; CHECK-NEXT: psrlq $1, %xmm0
+; CHECK-NEXT: psrlw $1, %xmm0
; CHECK-NEXT: pand {{.*}}(%rip), %xmm0
-; CHECK-NEXT: psubq %xmm0, %xmm1
-; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323]
+; CHECK-NEXT: psubb %xmm0, %xmm1
+; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; CHECK-NEXT: movdqa %xmm1, %xmm2
; CHECK-NEXT: pand %xmm0, %xmm2
-; CHECK-NEXT: psrlq $2, %xmm1
+; CHECK-NEXT: psrlw $2, %xmm1
; CHECK-NEXT: pand %xmm0, %xmm1
-; CHECK-NEXT: paddq %xmm2, %xmm1
+; CHECK-NEXT: paddb %xmm2, %xmm1
; CHECK-NEXT: movdqa %xmm1, %xmm2
-; CHECK-NEXT: psrlq $4, %xmm2
-; CHECK-NEXT: paddq %xmm1, %xmm2
+; CHECK-NEXT: psrlw $4, %xmm2
+; CHECK-NEXT: paddb %xmm1, %xmm2
; CHECK-NEXT: pand {{.*}}(%rip), %xmm2
; CHECK-NEXT: pxor %xmm0, %xmm0
; CHECK-NEXT: psadbw %xmm2, %xmm0
@@ -83,18 +83,18 @@ define <2 x i64> @foopop(<2 x i64> %a) n
; CHECK-LABEL: foopop:
; CHECK: # %bb.0:
; CHECK-NEXT: movdqa %xmm0, %xmm1
-; CHECK-NEXT: psrlq $1, %xmm1
+; CHECK-NEXT: psrlw $1, %xmm1
; CHECK-NEXT: pand {{.*}}(%rip), %xmm1
-; CHECK-NEXT: psubq %xmm1, %xmm0
-; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [3689348814741910323,3689348814741910323]
+; CHECK-NEXT: psubb %xmm1, %xmm0
+; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; CHECK-NEXT: movdqa %xmm0, %xmm2
; CHECK-NEXT: pand %xmm1, %xmm2
-; CHECK-NEXT: psrlq $2, %xmm0
+; CHECK-NEXT: psrlw $2, %xmm0
; CHECK-NEXT: pand %xmm1, %xmm0
-; CHECK-NEXT: paddq %xmm2, %xmm0
+; CHECK-NEXT: paddb %xmm2, %xmm0
; CHECK-NEXT: movdqa %xmm0, %xmm1
-; CHECK-NEXT: psrlq $4, %xmm1
-; CHECK-NEXT: paddq %xmm0, %xmm1
+; CHECK-NEXT: psrlw $4, %xmm1
+; CHECK-NEXT: paddb %xmm0, %xmm1
; CHECK-NEXT: pand {{.*}}(%rip), %xmm1
; CHECK-NEXT: pxor %xmm0, %xmm0
; CHECK-NEXT: psadbw %xmm0, %xmm1
@@ -119,18 +119,18 @@ define <2 x i32> @promtz(<2 x i32> %a) n
; CHECK-NEXT: pcmpeqd %xmm3, %xmm3
; CHECK-NEXT: paddq %xmm2, %xmm3
; CHECK-NEXT: movdqa %xmm3, %xmm0
-; CHECK-NEXT: psrlq $1, %xmm0
+; CHECK-NEXT: psrlw $1, %xmm0
; CHECK-NEXT: pand {{.*}}(%rip), %xmm0
-; CHECK-NEXT: psubq %xmm0, %xmm3
-; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323]
+; CHECK-NEXT: psubb %xmm0, %xmm3
+; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; CHECK-NEXT: movdqa %xmm3, %xmm2
; CHECK-NEXT: pand %xmm0, %xmm2
-; CHECK-NEXT: psrlq $2, %xmm3
+; CHECK-NEXT: psrlw $2, %xmm3
; CHECK-NEXT: pand %xmm0, %xmm3
-; CHECK-NEXT: paddq %xmm2, %xmm3
+; CHECK-NEXT: paddb %xmm2, %xmm3
; CHECK-NEXT: movdqa %xmm3, %xmm0
-; CHECK-NEXT: psrlq $4, %xmm0
-; CHECK-NEXT: paddq %xmm3, %xmm0
+; CHECK-NEXT: psrlw $4, %xmm0
+; CHECK-NEXT: paddb %xmm3, %xmm0
; CHECK-NEXT: pand {{.*}}(%rip), %xmm0
; CHECK-NEXT: psadbw %xmm1, %xmm0
; CHECK-NEXT: retq
@@ -164,18 +164,18 @@ define <2 x i32> @promlz(<2 x i32> %a) n
; CHECK-NEXT: pcmpeqd %xmm2, %xmm2
; CHECK-NEXT: pxor %xmm0, %xmm2
; CHECK-NEXT: movdqa %xmm2, %xmm0
-; CHECK-NEXT: psrlq $1, %xmm0
+; CHECK-NEXT: psrlw $1, %xmm0
; CHECK-NEXT: pand {{.*}}(%rip), %xmm0
-; CHECK-NEXT: psubq %xmm0, %xmm2
-; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323]
+; CHECK-NEXT: psubb %xmm0, %xmm2
+; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; CHECK-NEXT: movdqa %xmm2, %xmm3
; CHECK-NEXT: pand %xmm0, %xmm3
-; CHECK-NEXT: psrlq $2, %xmm2
+; CHECK-NEXT: psrlw $2, %xmm2
; CHECK-NEXT: pand %xmm0, %xmm2
-; CHECK-NEXT: paddq %xmm3, %xmm2
+; CHECK-NEXT: paddb %xmm3, %xmm2
; CHECK-NEXT: movdqa %xmm2, %xmm0
-; CHECK-NEXT: psrlq $4, %xmm0
-; CHECK-NEXT: paddq %xmm2, %xmm0
+; CHECK-NEXT: psrlw $4, %xmm0
+; CHECK-NEXT: paddb %xmm2, %xmm0
; CHECK-NEXT: pand {{.*}}(%rip), %xmm0
; CHECK-NEXT: psadbw %xmm1, %xmm0
; CHECK-NEXT: psubq {{.*}}(%rip), %xmm0
@@ -191,18 +191,18 @@ define <2 x i32> @prompop(<2 x i32> %a)
; CHECK-NEXT: pand {{.*}}(%rip), %xmm0
; CHECK-NEXT: pxor %xmm2, %xmm2
; CHECK-NEXT: movdqa %xmm0, %xmm1
-; CHECK-NEXT: psrlq $1, %xmm1
+; CHECK-NEXT: psrlw $1, %xmm1
; CHECK-NEXT: pand {{.*}}(%rip), %xmm1
-; CHECK-NEXT: psubq %xmm1, %xmm0
-; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [3689348814741910323,3689348814741910323]
+; CHECK-NEXT: psubb %xmm1, %xmm0
+; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; CHECK-NEXT: movdqa %xmm0, %xmm3
; CHECK-NEXT: pand %xmm1, %xmm3
-; CHECK-NEXT: psrlq $2, %xmm0
+; CHECK-NEXT: psrlw $2, %xmm0
; CHECK-NEXT: pand %xmm1, %xmm0
-; CHECK-NEXT: paddq %xmm3, %xmm0
+; CHECK-NEXT: paddb %xmm3, %xmm0
; CHECK-NEXT: movdqa %xmm0, %xmm1
-; CHECK-NEXT: psrlq $4, %xmm1
-; CHECK-NEXT: paddq %xmm0, %xmm1
+; CHECK-NEXT: psrlw $4, %xmm1
+; CHECK-NEXT: paddb %xmm0, %xmm1
; CHECK-NEXT: pand {{.*}}(%rip), %xmm1
; CHECK-NEXT: psadbw %xmm2, %xmm1
; CHECK-NEXT: movdqa %xmm1, %xmm0
Modified: llvm/trunk/test/CodeGen/X86/vector-lzcnt-128.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-lzcnt-128.ll?rev=344348&r1=344347&r2=344348&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-lzcnt-128.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-lzcnt-128.ll Fri Oct 12 07:18:47 2018
@@ -37,18 +37,18 @@ define <2 x i64> @testv2i64(<2 x i64> %i
; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrlq $1, %xmm0
+; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: psubq %xmm0, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323]
+; SSE2-NEXT: psubb %xmm0, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: psrlq $2, %xmm1
+; SSE2-NEXT: psrlw $2, %xmm1
; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: paddq %xmm2, %xmm1
+; SSE2-NEXT: paddb %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psrlq $4, %xmm2
-; SSE2-NEXT: paddq %xmm1, %xmm2
+; SSE2-NEXT: psrlw $4, %xmm2
+; SSE2-NEXT: paddb %xmm1, %xmm2
; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: psadbw %xmm2, %xmm0
@@ -77,18 +77,18 @@ define <2 x i64> @testv2i64(<2 x i64> %i
; SSE3-NEXT: pcmpeqd %xmm1, %xmm1
; SSE3-NEXT: pxor %xmm0, %xmm1
; SSE3-NEXT: movdqa %xmm1, %xmm0
-; SSE3-NEXT: psrlq $1, %xmm0
+; SSE3-NEXT: psrlw $1, %xmm0
; SSE3-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE3-NEXT: psubq %xmm0, %xmm1
-; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323]
+; SSE3-NEXT: psubb %xmm0, %xmm1
+; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE3-NEXT: movdqa %xmm1, %xmm2
; SSE3-NEXT: pand %xmm0, %xmm2
-; SSE3-NEXT: psrlq $2, %xmm1
+; SSE3-NEXT: psrlw $2, %xmm1
; SSE3-NEXT: pand %xmm0, %xmm1
-; SSE3-NEXT: paddq %xmm2, %xmm1
+; SSE3-NEXT: paddb %xmm2, %xmm1
; SSE3-NEXT: movdqa %xmm1, %xmm2
-; SSE3-NEXT: psrlq $4, %xmm2
-; SSE3-NEXT: paddq %xmm1, %xmm2
+; SSE3-NEXT: psrlw $4, %xmm2
+; SSE3-NEXT: paddb %xmm1, %xmm2
; SSE3-NEXT: pand {{.*}}(%rip), %xmm2
; SSE3-NEXT: pxor %xmm0, %xmm0
; SSE3-NEXT: psadbw %xmm2, %xmm0
@@ -303,18 +303,18 @@ define <2 x i64> @testv2i64u(<2 x i64> %
; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrlq $1, %xmm0
+; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: psubq %xmm0, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323]
+; SSE2-NEXT: psubb %xmm0, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: psrlq $2, %xmm1
+; SSE2-NEXT: psrlw $2, %xmm1
; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: paddq %xmm2, %xmm1
+; SSE2-NEXT: paddb %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psrlq $4, %xmm2
-; SSE2-NEXT: paddq %xmm1, %xmm2
+; SSE2-NEXT: psrlw $4, %xmm2
+; SSE2-NEXT: paddb %xmm1, %xmm2
; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: psadbw %xmm2, %xmm0
@@ -343,18 +343,18 @@ define <2 x i64> @testv2i64u(<2 x i64> %
; SSE3-NEXT: pcmpeqd %xmm1, %xmm1
; SSE3-NEXT: pxor %xmm0, %xmm1
; SSE3-NEXT: movdqa %xmm1, %xmm0
-; SSE3-NEXT: psrlq $1, %xmm0
+; SSE3-NEXT: psrlw $1, %xmm0
; SSE3-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE3-NEXT: psubq %xmm0, %xmm1
-; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323]
+; SSE3-NEXT: psubb %xmm0, %xmm1
+; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE3-NEXT: movdqa %xmm1, %xmm2
; SSE3-NEXT: pand %xmm0, %xmm2
-; SSE3-NEXT: psrlq $2, %xmm1
+; SSE3-NEXT: psrlw $2, %xmm1
; SSE3-NEXT: pand %xmm0, %xmm1
-; SSE3-NEXT: paddq %xmm2, %xmm1
+; SSE3-NEXT: paddb %xmm2, %xmm1
; SSE3-NEXT: movdqa %xmm1, %xmm2
-; SSE3-NEXT: psrlq $4, %xmm2
-; SSE3-NEXT: paddq %xmm1, %xmm2
+; SSE3-NEXT: psrlw $4, %xmm2
+; SSE3-NEXT: paddb %xmm1, %xmm2
; SSE3-NEXT: pand {{.*}}(%rip), %xmm2
; SSE3-NEXT: pxor %xmm0, %xmm0
; SSE3-NEXT: psadbw %xmm2, %xmm0
@@ -566,18 +566,18 @@ define <4 x i32> @testv4i32(<4 x i32> %i
; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
; SSE2-NEXT: pxor %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: psrld $1, %xmm0
+; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: psubd %xmm0, %xmm2
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459]
+; SSE2-NEXT: psubb %xmm0, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: psrld $2, %xmm2
+; SSE2-NEXT: psrlw $2, %xmm2
; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: paddd %xmm1, %xmm2
+; SSE2-NEXT: paddb %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: psrld $4, %xmm0
-; SSE2-NEXT: paddd %xmm2, %xmm0
+; SSE2-NEXT: psrlw $4, %xmm0
+; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
@@ -608,18 +608,18 @@ define <4 x i32> @testv4i32(<4 x i32> %i
; SSE3-NEXT: pcmpeqd %xmm2, %xmm2
; SSE3-NEXT: pxor %xmm1, %xmm2
; SSE3-NEXT: movdqa %xmm2, %xmm0
-; SSE3-NEXT: psrld $1, %xmm0
+; SSE3-NEXT: psrlw $1, %xmm0
; SSE3-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE3-NEXT: psubd %xmm0, %xmm2
-; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459]
+; SSE3-NEXT: psubb %xmm0, %xmm2
+; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE3-NEXT: movdqa %xmm2, %xmm1
; SSE3-NEXT: pand %xmm0, %xmm1
-; SSE3-NEXT: psrld $2, %xmm2
+; SSE3-NEXT: psrlw $2, %xmm2
; SSE3-NEXT: pand %xmm0, %xmm2
-; SSE3-NEXT: paddd %xmm1, %xmm2
+; SSE3-NEXT: paddb %xmm1, %xmm2
; SSE3-NEXT: movdqa %xmm2, %xmm0
-; SSE3-NEXT: psrld $4, %xmm0
-; SSE3-NEXT: paddd %xmm2, %xmm0
+; SSE3-NEXT: psrlw $4, %xmm0
+; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: pand {{.*}}(%rip), %xmm0
; SSE3-NEXT: pxor %xmm1, %xmm1
; SSE3-NEXT: movdqa %xmm0, %xmm2
@@ -808,18 +808,18 @@ define <4 x i32> @testv4i32u(<4 x i32> %
; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
; SSE2-NEXT: pxor %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: psrld $1, %xmm0
+; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: psubd %xmm0, %xmm2
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459]
+; SSE2-NEXT: psubb %xmm0, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: psrld $2, %xmm2
+; SSE2-NEXT: psrlw $2, %xmm2
; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: paddd %xmm1, %xmm2
+; SSE2-NEXT: paddb %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: psrld $4, %xmm0
-; SSE2-NEXT: paddd %xmm2, %xmm0
+; SSE2-NEXT: psrlw $4, %xmm0
+; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
@@ -850,18 +850,18 @@ define <4 x i32> @testv4i32u(<4 x i32> %
; SSE3-NEXT: pcmpeqd %xmm2, %xmm2
; SSE3-NEXT: pxor %xmm1, %xmm2
; SSE3-NEXT: movdqa %xmm2, %xmm0
-; SSE3-NEXT: psrld $1, %xmm0
+; SSE3-NEXT: psrlw $1, %xmm0
; SSE3-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE3-NEXT: psubd %xmm0, %xmm2
-; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459]
+; SSE3-NEXT: psubb %xmm0, %xmm2
+; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE3-NEXT: movdqa %xmm2, %xmm1
; SSE3-NEXT: pand %xmm0, %xmm1
-; SSE3-NEXT: psrld $2, %xmm2
+; SSE3-NEXT: psrlw $2, %xmm2
; SSE3-NEXT: pand %xmm0, %xmm2
-; SSE3-NEXT: paddd %xmm1, %xmm2
+; SSE3-NEXT: paddb %xmm1, %xmm2
; SSE3-NEXT: movdqa %xmm2, %xmm0
-; SSE3-NEXT: psrld $4, %xmm0
-; SSE3-NEXT: paddd %xmm2, %xmm0
+; SSE3-NEXT: psrlw $4, %xmm0
+; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: pand {{.*}}(%rip), %xmm0
; SSE3-NEXT: pxor %xmm1, %xmm1
; SSE3-NEXT: movdqa %xmm0, %xmm2
@@ -1049,16 +1049,16 @@ define <8 x i16> @testv8i16(<8 x i16> %i
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: psubw %xmm0, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [13107,13107,13107,13107,13107,13107,13107,13107]
+; SSE2-NEXT: psubb %xmm0, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pand %xmm0, %xmm2
; SSE2-NEXT: psrlw $2, %xmm1
; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: paddw %xmm2, %xmm1
+; SSE2-NEXT: paddb %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psrlw $4, %xmm2
-; SSE2-NEXT: paddw %xmm1, %xmm2
+; SSE2-NEXT: paddb %xmm1, %xmm2
; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: psllw $8, %xmm0
@@ -1085,16 +1085,16 @@ define <8 x i16> @testv8i16(<8 x i16> %i
; SSE3-NEXT: movdqa %xmm1, %xmm0
; SSE3-NEXT: psrlw $1, %xmm0
; SSE3-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE3-NEXT: psubw %xmm0, %xmm1
-; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [13107,13107,13107,13107,13107,13107,13107,13107]
+; SSE3-NEXT: psubb %xmm0, %xmm1
+; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE3-NEXT: movdqa %xmm1, %xmm2
; SSE3-NEXT: pand %xmm0, %xmm2
; SSE3-NEXT: psrlw $2, %xmm1
; SSE3-NEXT: pand %xmm0, %xmm1
-; SSE3-NEXT: paddw %xmm2, %xmm1
+; SSE3-NEXT: paddb %xmm2, %xmm1
; SSE3-NEXT: movdqa %xmm1, %xmm2
; SSE3-NEXT: psrlw $4, %xmm2
-; SSE3-NEXT: paddw %xmm1, %xmm2
+; SSE3-NEXT: paddb %xmm1, %xmm2
; SSE3-NEXT: pand {{.*}}(%rip), %xmm2
; SSE3-NEXT: movdqa %xmm2, %xmm0
; SSE3-NEXT: psllw $8, %xmm0
@@ -1255,16 +1255,16 @@ define <8 x i16> @testv8i16u(<8 x i16> %
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: psubw %xmm0, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [13107,13107,13107,13107,13107,13107,13107,13107]
+; SSE2-NEXT: psubb %xmm0, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pand %xmm0, %xmm2
; SSE2-NEXT: psrlw $2, %xmm1
; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: paddw %xmm2, %xmm1
+; SSE2-NEXT: paddb %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psrlw $4, %xmm2
-; SSE2-NEXT: paddw %xmm1, %xmm2
+; SSE2-NEXT: paddb %xmm1, %xmm2
; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: psllw $8, %xmm0
@@ -1291,16 +1291,16 @@ define <8 x i16> @testv8i16u(<8 x i16> %
; SSE3-NEXT: movdqa %xmm1, %xmm0
; SSE3-NEXT: psrlw $1, %xmm0
; SSE3-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE3-NEXT: psubw %xmm0, %xmm1
-; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [13107,13107,13107,13107,13107,13107,13107,13107]
+; SSE3-NEXT: psubb %xmm0, %xmm1
+; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE3-NEXT: movdqa %xmm1, %xmm2
; SSE3-NEXT: pand %xmm0, %xmm2
; SSE3-NEXT: psrlw $2, %xmm1
; SSE3-NEXT: pand %xmm0, %xmm1
-; SSE3-NEXT: paddw %xmm2, %xmm1
+; SSE3-NEXT: paddb %xmm2, %xmm1
; SSE3-NEXT: movdqa %xmm1, %xmm2
; SSE3-NEXT: psrlw $4, %xmm2
-; SSE3-NEXT: paddw %xmm1, %xmm2
+; SSE3-NEXT: paddb %xmm1, %xmm2
; SSE3-NEXT: pand {{.*}}(%rip), %xmm2
; SSE3-NEXT: movdqa %xmm2, %xmm0
; SSE3-NEXT: psllw $8, %xmm0
Modified: llvm/trunk/test/CodeGen/X86/vector-popcnt-128.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-popcnt-128.ll?rev=344348&r1=344347&r2=344348&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-popcnt-128.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-popcnt-128.ll Fri Oct 12 07:18:47 2018
@@ -14,18 +14,18 @@ define <2 x i64> @testv2i64(<2 x i64> %i
; SSE2-LABEL: testv2i64:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlq $1, %xmm1
+; SSE2-NEXT: psrlw $1, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE2-NEXT: psubq %xmm1, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3689348814741910323,3689348814741910323]
+; SSE2-NEXT: psubb %xmm1, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: psrlq $2, %xmm0
+; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: paddq %xmm2, %xmm0
+; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlq $4, %xmm1
-; SSE2-NEXT: paddq %xmm0, %xmm1
+; SSE2-NEXT: psrlw $4, %xmm1
+; SSE2-NEXT: paddb %xmm0, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: psadbw %xmm0, %xmm1
@@ -35,18 +35,18 @@ define <2 x i64> @testv2i64(<2 x i64> %i
; SSE3-LABEL: testv2i64:
; SSE3: # %bb.0:
; SSE3-NEXT: movdqa %xmm0, %xmm1
-; SSE3-NEXT: psrlq $1, %xmm1
+; SSE3-NEXT: psrlw $1, %xmm1
; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE3-NEXT: psubq %xmm1, %xmm0
-; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [3689348814741910323,3689348814741910323]
+; SSE3-NEXT: psubb %xmm1, %xmm0
+; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE3-NEXT: movdqa %xmm0, %xmm2
; SSE3-NEXT: pand %xmm1, %xmm2
-; SSE3-NEXT: psrlq $2, %xmm0
+; SSE3-NEXT: psrlw $2, %xmm0
; SSE3-NEXT: pand %xmm1, %xmm0
-; SSE3-NEXT: paddq %xmm2, %xmm0
+; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
-; SSE3-NEXT: psrlq $4, %xmm1
-; SSE3-NEXT: paddq %xmm0, %xmm1
+; SSE3-NEXT: psrlw $4, %xmm1
+; SSE3-NEXT: paddb %xmm0, %xmm1
; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
; SSE3-NEXT: pxor %xmm0, %xmm0
; SSE3-NEXT: psadbw %xmm0, %xmm1
@@ -128,28 +128,16 @@ define <2 x i64> @testv2i64(<2 x i64> %i
;
; BITALG_NOVLX-LABEL: testv2i64:
; BITALG_NOVLX: # %bb.0:
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG_NOVLX-NEXT: vpand %xmm1, %xmm0, %xmm2
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG_NOVLX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; BITALG_NOVLX-NEXT: vpsrlw $4, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vpshufb %xmm0, %xmm3, %xmm0
-; BITALG_NOVLX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: testv2i64:
; BITALG: # %bb.0:
-; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT: vpand %xmm1, %xmm0, %xmm2
-; BITALG-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; BITALG-NEXT: vpsrlw $4, %xmm0, %xmm0
-; BITALG-NEXT: vpand %xmm1, %xmm0, %xmm0
-; BITALG-NEXT: vpshufb %xmm0, %xmm3, %xmm0
-; BITALG-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; BITALG-NEXT: vpopcntb %xmm0, %xmm0
; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; BITALG-NEXT: retq
@@ -161,18 +149,18 @@ define <4 x i32> @testv4i32(<4 x i32> %i
; SSE2-LABEL: testv4i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $1, %xmm1
+; SSE2-NEXT: psrlw $1, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE2-NEXT: psubd %xmm1, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [858993459,858993459,858993459,858993459]
+; SSE2-NEXT: psubb %xmm1, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: psrld $2, %xmm0
+; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: paddd %xmm2, %xmm0
+; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $4, %xmm1
-; SSE2-NEXT: paddd %xmm0, %xmm1
+; SSE2-NEXT: psrlw $4, %xmm1
+; SSE2-NEXT: paddb %xmm0, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm2
@@ -187,18 +175,18 @@ define <4 x i32> @testv4i32(<4 x i32> %i
; SSE3-LABEL: testv4i32:
; SSE3: # %bb.0:
; SSE3-NEXT: movdqa %xmm0, %xmm1
-; SSE3-NEXT: psrld $1, %xmm1
+; SSE3-NEXT: psrlw $1, %xmm1
; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE3-NEXT: psubd %xmm1, %xmm0
-; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [858993459,858993459,858993459,858993459]
+; SSE3-NEXT: psubb %xmm1, %xmm0
+; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE3-NEXT: movdqa %xmm0, %xmm2
; SSE3-NEXT: pand %xmm1, %xmm2
-; SSE3-NEXT: psrld $2, %xmm0
+; SSE3-NEXT: psrlw $2, %xmm0
; SSE3-NEXT: pand %xmm1, %xmm0
-; SSE3-NEXT: paddd %xmm2, %xmm0
+; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
-; SSE3-NEXT: psrld $4, %xmm1
-; SSE3-NEXT: paddd %xmm0, %xmm1
+; SSE3-NEXT: psrlw $4, %xmm1
+; SSE3-NEXT: paddb %xmm0, %xmm1
; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
; SSE3-NEXT: pxor %xmm0, %xmm0
; SSE3-NEXT: movdqa %xmm1, %xmm2
@@ -303,32 +291,20 @@ define <4 x i32> @testv4i32(<4 x i32> %i
;
; BITALG_NOVLX-LABEL: testv4i32:
; BITALG_NOVLX: # %bb.0:
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG_NOVLX-NEXT: vpand %xmm1, %xmm0, %xmm2
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG_NOVLX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; BITALG_NOVLX-NEXT: vpsrlw $4, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vpshufb %xmm0, %xmm3, %xmm0
-; BITALG_NOVLX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
; BITALG_NOVLX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; BITALG_NOVLX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: testv4i32:
; BITALG: # %bb.0:
-; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT: vpand %xmm1, %xmm0, %xmm2
-; BITALG-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; BITALG-NEXT: vpsrlw $4, %xmm0, %xmm0
-; BITALG-NEXT: vpand %xmm1, %xmm0, %xmm0
-; BITALG-NEXT: vpshufb %xmm0, %xmm3, %xmm0
-; BITALG-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; BITALG-NEXT: vpopcntb %xmm0, %xmm0
; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; BITALG-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
@@ -346,16 +322,16 @@ define <8 x i16> @testv8i16(<8 x i16> %i
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $1, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE2-NEXT: psubw %xmm1, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [13107,13107,13107,13107,13107,13107,13107,13107]
+; SSE2-NEXT: psubb %xmm1, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: paddw %xmm2, %xmm0
+; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddw %xmm0, %xmm1
+; SSE2-NEXT: paddb %xmm0, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psllw $8, %xmm0
@@ -368,16 +344,16 @@ define <8 x i16> @testv8i16(<8 x i16> %i
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $1, %xmm1
; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE3-NEXT: psubw %xmm1, %xmm0
-; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [13107,13107,13107,13107,13107,13107,13107,13107]
+; SSE3-NEXT: psubb %xmm1, %xmm0
+; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE3-NEXT: movdqa %xmm0, %xmm2
; SSE3-NEXT: pand %xmm1, %xmm2
; SSE3-NEXT: psrlw $2, %xmm0
; SSE3-NEXT: pand %xmm1, %xmm0
-; SSE3-NEXT: paddw %xmm2, %xmm0
+; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddw %xmm0, %xmm1
+; SSE3-NEXT: paddb %xmm0, %xmm1
; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
; SSE3-NEXT: movdqa %xmm1, %xmm0
; SSE3-NEXT: psllw $8, %xmm0
Modified: llvm/trunk/test/CodeGen/X86/vector-popcnt-256.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-popcnt-256.ll?rev=344348&r1=344347&r2=344348&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-popcnt-256.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-popcnt-256.ll Fri Oct 12 07:18:47 2018
@@ -58,28 +58,15 @@ define <4 x i64> @testv4i64(<4 x i64> %i
;
; BITALG_NOVLX-LABEL: testv4i64:
; BITALG_NOVLX: # %bb.0:
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG_NOVLX-NEXT: vpand %ymm1, %ymm0, %ymm2
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG_NOVLX-NEXT: vpshufb %ymm2, %ymm3, %ymm2
-; BITALG_NOVLX-NEXT: vpsrlw $4, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT: vpand %ymm1, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT: vpshufb %ymm0, %ymm3, %ymm0
-; BITALG_NOVLX-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: testv4i64:
; BITALG: # %bb.0:
-; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT: vpand %ymm1, %ymm0, %ymm2
-; BITALG-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT: vpshufb %ymm2, %ymm3, %ymm2
-; BITALG-NEXT: vpsrlw $4, %ymm0, %ymm0
-; BITALG-NEXT: vpand %ymm1, %ymm0, %ymm0
-; BITALG-NEXT: vpshufb %ymm0, %ymm3, %ymm0
-; BITALG-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; BITALG-NEXT: vpopcntb %ymm0, %ymm0
; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; BITALG-NEXT: retq
@@ -151,14 +138,8 @@ define <8 x i32> @testv8i32(<8 x i32> %i
;
; BITALG_NOVLX-LABEL: testv8i32:
; BITALG_NOVLX: # %bb.0:
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG_NOVLX-NEXT: vpand %ymm1, %ymm0, %ymm2
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG_NOVLX-NEXT: vpshufb %ymm2, %ymm3, %ymm2
-; BITALG_NOVLX-NEXT: vpsrlw $4, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT: vpand %ymm1, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT: vpshufb %ymm0, %ymm3, %ymm0
-; BITALG_NOVLX-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; BITALG_NOVLX-NEXT: vpsadbw %ymm1, %ymm2, %ymm2
@@ -169,14 +150,7 @@ define <8 x i32> @testv8i32(<8 x i32> %i
;
; BITALG-LABEL: testv8i32:
; BITALG: # %bb.0:
-; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT: vpand %ymm1, %ymm0, %ymm2
-; BITALG-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT: vpshufb %ymm2, %ymm3, %ymm2
-; BITALG-NEXT: vpsrlw $4, %ymm0, %ymm0
-; BITALG-NEXT: vpand %ymm1, %ymm0, %ymm0
-; BITALG-NEXT: vpshufb %ymm0, %ymm3, %ymm0
-; BITALG-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; BITALG-NEXT: vpopcntb %ymm0, %ymm0
; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; BITALG-NEXT: vpsadbw %ymm1, %ymm2, %ymm2
Modified: llvm/trunk/test/CodeGen/X86/vector-popcnt-512.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-popcnt-512.ll?rev=344348&r1=344347&r2=344348&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-popcnt-512.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-popcnt-512.ll Fri Oct 12 07:18:47 2018
@@ -50,14 +50,7 @@ define <8 x i64> @testv8i64(<8 x i64> %i
;
; BITALG-LABEL: testv8i64:
; BITALG: # %bb.0:
-; BITALG-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; BITALG-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT: vpshufb %zmm2, %zmm3, %zmm2
-; BITALG-NEXT: vpsrlw $4, %zmm0, %zmm0
-; BITALG-NEXT: vpandq %zmm1, %zmm0, %zmm0
-; BITALG-NEXT: vpshufb %zmm0, %zmm3, %zmm0
-; BITALG-NEXT: vpaddb %zmm2, %zmm0, %zmm0
+; BITALG-NEXT: vpopcntb %zmm0, %zmm0
; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
; BITALG-NEXT: retq
@@ -122,14 +115,7 @@ define <16 x i32> @testv16i32(<16 x i32>
;
; BITALG-LABEL: testv16i32:
; BITALG: # %bb.0:
-; BITALG-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; BITALG-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT: vpshufb %zmm2, %zmm3, %zmm2
-; BITALG-NEXT: vpsrlw $4, %zmm0, %zmm0
-; BITALG-NEXT: vpandq %zmm1, %zmm0, %zmm0
-; BITALG-NEXT: vpshufb %zmm0, %zmm3, %zmm0
-; BITALG-NEXT: vpaddb %zmm2, %zmm0, %zmm0
+; BITALG-NEXT: vpopcntb %zmm0, %zmm0
; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; BITALG-NEXT: vpsadbw %zmm1, %zmm2, %zmm2
Modified: llvm/trunk/test/CodeGen/X86/vector-tzcnt-128.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-tzcnt-128.ll?rev=344348&r1=344347&r2=344348&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-tzcnt-128.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-tzcnt-128.ll Fri Oct 12 07:18:47 2018
@@ -25,18 +25,18 @@ define <2 x i64> @testv2i64(<2 x i64> %i
; SSE2-NEXT: pcmpeqd %xmm3, %xmm3
; SSE2-NEXT: paddq %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: psrlq $1, %xmm0
+; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: psubq %xmm0, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323]
+; SSE2-NEXT: psubb %xmm0, %xmm3
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm3, %xmm2
; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: psrlq $2, %xmm3
+; SSE2-NEXT: psrlw $2, %xmm3
; SSE2-NEXT: pand %xmm0, %xmm3
-; SSE2-NEXT: paddq %xmm2, %xmm3
+; SSE2-NEXT: paddb %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: psrlq $4, %xmm0
-; SSE2-NEXT: paddq %xmm3, %xmm0
+; SSE2-NEXT: psrlw $4, %xmm0
+; SSE2-NEXT: paddb %xmm3, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: psadbw %xmm1, %xmm0
; SSE2-NEXT: retq
@@ -50,18 +50,18 @@ define <2 x i64> @testv2i64(<2 x i64> %i
; SSE3-NEXT: pcmpeqd %xmm3, %xmm3
; SSE3-NEXT: paddq %xmm2, %xmm3
; SSE3-NEXT: movdqa %xmm3, %xmm0
-; SSE3-NEXT: psrlq $1, %xmm0
+; SSE3-NEXT: psrlw $1, %xmm0
; SSE3-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE3-NEXT: psubq %xmm0, %xmm3
-; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323]
+; SSE3-NEXT: psubb %xmm0, %xmm3
+; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE3-NEXT: movdqa %xmm3, %xmm2
; SSE3-NEXT: pand %xmm0, %xmm2
-; SSE3-NEXT: psrlq $2, %xmm3
+; SSE3-NEXT: psrlw $2, %xmm3
; SSE3-NEXT: pand %xmm0, %xmm3
-; SSE3-NEXT: paddq %xmm2, %xmm3
+; SSE3-NEXT: paddb %xmm2, %xmm3
; SSE3-NEXT: movdqa %xmm3, %xmm0
-; SSE3-NEXT: psrlq $4, %xmm0
-; SSE3-NEXT: paddq %xmm3, %xmm0
+; SSE3-NEXT: psrlw $4, %xmm0
+; SSE3-NEXT: paddb %xmm3, %xmm0
; SSE3-NEXT: pand {{.*}}(%rip), %xmm0
; SSE3-NEXT: psadbw %xmm1, %xmm0
; SSE3-NEXT: retq
@@ -155,15 +155,9 @@ define <2 x i64> @testv2i64(<2 x i64> %i
; BITALG_NOVLX-NEXT: vpand %xmm2, %xmm0, %xmm0
; BITALG_NOVLX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; BITALG_NOVLX-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG_NOVLX-NEXT: vpand %xmm2, %xmm0, %xmm3
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG_NOVLX-NEXT: vpshufb %xmm3, %xmm4, %xmm3
-; BITALG_NOVLX-NEXT: vpsrlw $4, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vpand %xmm2, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vpshufb %xmm0, %xmm4, %xmm0
-; BITALG_NOVLX-NEXT: vpaddb %xmm3, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: testv2i64:
@@ -173,14 +167,7 @@ define <2 x i64> @testv2i64(<2 x i64> %i
; BITALG-NEXT: vpand %xmm2, %xmm0, %xmm0
; BITALG-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; BITALG-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; BITALG-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT: vpand %xmm2, %xmm0, %xmm3
-; BITALG-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT: vpshufb %xmm3, %xmm4, %xmm3
-; BITALG-NEXT: vpsrlw $4, %xmm0, %xmm0
-; BITALG-NEXT: vpand %xmm2, %xmm0, %xmm0
-; BITALG-NEXT: vpshufb %xmm0, %xmm4, %xmm0
-; BITALG-NEXT: vpaddb %xmm3, %xmm0, %xmm0
+; BITALG-NEXT: vpopcntb %xmm0, %xmm0
; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; BITALG-NEXT: retq
;
@@ -217,18 +204,18 @@ define <2 x i64> @testv2i64u(<2 x i64> %
; SSE2-NEXT: pcmpeqd %xmm3, %xmm3
; SSE2-NEXT: paddq %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: psrlq $1, %xmm0
+; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: psubq %xmm0, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323]
+; SSE2-NEXT: psubb %xmm0, %xmm3
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm3, %xmm2
; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: psrlq $2, %xmm3
+; SSE2-NEXT: psrlw $2, %xmm3
; SSE2-NEXT: pand %xmm0, %xmm3
-; SSE2-NEXT: paddq %xmm2, %xmm3
+; SSE2-NEXT: paddb %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: psrlq $4, %xmm0
-; SSE2-NEXT: paddq %xmm3, %xmm0
+; SSE2-NEXT: psrlw $4, %xmm0
+; SSE2-NEXT: paddb %xmm3, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: psadbw %xmm1, %xmm0
; SSE2-NEXT: retq
@@ -242,18 +229,18 @@ define <2 x i64> @testv2i64u(<2 x i64> %
; SSE3-NEXT: pcmpeqd %xmm3, %xmm3
; SSE3-NEXT: paddq %xmm2, %xmm3
; SSE3-NEXT: movdqa %xmm3, %xmm0
-; SSE3-NEXT: psrlq $1, %xmm0
+; SSE3-NEXT: psrlw $1, %xmm0
; SSE3-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE3-NEXT: psubq %xmm0, %xmm3
-; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323]
+; SSE3-NEXT: psubb %xmm0, %xmm3
+; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE3-NEXT: movdqa %xmm3, %xmm2
; SSE3-NEXT: pand %xmm0, %xmm2
-; SSE3-NEXT: psrlq $2, %xmm3
+; SSE3-NEXT: psrlw $2, %xmm3
; SSE3-NEXT: pand %xmm0, %xmm3
-; SSE3-NEXT: paddq %xmm2, %xmm3
+; SSE3-NEXT: paddb %xmm2, %xmm3
; SSE3-NEXT: movdqa %xmm3, %xmm0
-; SSE3-NEXT: psrlq $4, %xmm0
-; SSE3-NEXT: paddq %xmm3, %xmm0
+; SSE3-NEXT: psrlw $4, %xmm0
+; SSE3-NEXT: paddb %xmm3, %xmm0
; SSE3-NEXT: pand {{.*}}(%rip), %xmm0
; SSE3-NEXT: psadbw %xmm1, %xmm0
; SSE3-NEXT: retq
@@ -386,15 +373,9 @@ define <2 x i64> @testv2i64u(<2 x i64> %
; BITALG_NOVLX-NEXT: vpand %xmm2, %xmm0, %xmm0
; BITALG_NOVLX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; BITALG_NOVLX-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG_NOVLX-NEXT: vpand %xmm2, %xmm0, %xmm3
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG_NOVLX-NEXT: vpshufb %xmm3, %xmm4, %xmm3
-; BITALG_NOVLX-NEXT: vpsrlw $4, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vpand %xmm2, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vpshufb %xmm0, %xmm4, %xmm0
-; BITALG_NOVLX-NEXT: vpaddb %xmm3, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: testv2i64u:
@@ -404,14 +385,7 @@ define <2 x i64> @testv2i64u(<2 x i64> %
; BITALG-NEXT: vpand %xmm2, %xmm0, %xmm0
; BITALG-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; BITALG-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; BITALG-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT: vpand %xmm2, %xmm0, %xmm3
-; BITALG-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT: vpshufb %xmm3, %xmm4, %xmm3
-; BITALG-NEXT: vpsrlw $4, %xmm0, %xmm0
-; BITALG-NEXT: vpand %xmm2, %xmm0, %xmm0
-; BITALG-NEXT: vpshufb %xmm0, %xmm4, %xmm0
-; BITALG-NEXT: vpaddb %xmm3, %xmm0, %xmm0
+; BITALG-NEXT: vpopcntb %xmm0, %xmm0
; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; BITALG-NEXT: retq
;
@@ -448,18 +422,18 @@ define <4 x i32> @testv4i32(<4 x i32> %i
; SSE2-NEXT: pcmpeqd %xmm3, %xmm3
; SSE2-NEXT: paddd %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: psrld $1, %xmm0
+; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: psubd %xmm0, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459]
+; SSE2-NEXT: psubb %xmm0, %xmm3
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm3, %xmm2
; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: psrld $2, %xmm3
+; SSE2-NEXT: psrlw $2, %xmm3
; SSE2-NEXT: pand %xmm0, %xmm3
-; SSE2-NEXT: paddd %xmm2, %xmm3
+; SSE2-NEXT: paddb %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: psrld $4, %xmm0
-; SSE2-NEXT: paddd %xmm3, %xmm0
+; SSE2-NEXT: psrlw $4, %xmm0
+; SSE2-NEXT: paddb %xmm3, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
@@ -478,18 +452,18 @@ define <4 x i32> @testv4i32(<4 x i32> %i
; SSE3-NEXT: pcmpeqd %xmm3, %xmm3
; SSE3-NEXT: paddd %xmm2, %xmm3
; SSE3-NEXT: movdqa %xmm3, %xmm0
-; SSE3-NEXT: psrld $1, %xmm0
+; SSE3-NEXT: psrlw $1, %xmm0
; SSE3-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE3-NEXT: psubd %xmm0, %xmm3
-; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459]
+; SSE3-NEXT: psubb %xmm0, %xmm3
+; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE3-NEXT: movdqa %xmm3, %xmm2
; SSE3-NEXT: pand %xmm0, %xmm2
-; SSE3-NEXT: psrld $2, %xmm3
+; SSE3-NEXT: psrlw $2, %xmm3
; SSE3-NEXT: pand %xmm0, %xmm3
-; SSE3-NEXT: paddd %xmm2, %xmm3
+; SSE3-NEXT: paddb %xmm2, %xmm3
; SSE3-NEXT: movdqa %xmm3, %xmm0
-; SSE3-NEXT: psrld $4, %xmm0
-; SSE3-NEXT: paddd %xmm3, %xmm0
+; SSE3-NEXT: psrlw $4, %xmm0
+; SSE3-NEXT: paddb %xmm3, %xmm0
; SSE3-NEXT: pand {{.*}}(%rip), %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm2
; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
@@ -667,19 +641,13 @@ define <4 x i32> @testv4i32(<4 x i32> %i
; BITALG_NOVLX-NEXT: vpand %xmm2, %xmm0, %xmm0
; BITALG_NOVLX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; BITALG_NOVLX-NEXT: vpaddd %xmm2, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG_NOVLX-NEXT: vpand %xmm2, %xmm0, %xmm3
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG_NOVLX-NEXT: vpshufb %xmm3, %xmm4, %xmm3
-; BITALG_NOVLX-NEXT: vpsrlw $4, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vpand %xmm2, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vpshufb %xmm0, %xmm4, %xmm0
-; BITALG_NOVLX-NEXT: vpaddb %xmm3, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
; BITALG_NOVLX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; BITALG_NOVLX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: testv4i32:
@@ -689,14 +657,7 @@ define <4 x i32> @testv4i32(<4 x i32> %i
; BITALG-NEXT: vpand %xmm2, %xmm0, %xmm0
; BITALG-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; BITALG-NEXT: vpaddd %xmm2, %xmm0, %xmm0
-; BITALG-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT: vpand %xmm2, %xmm0, %xmm3
-; BITALG-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT: vpshufb %xmm3, %xmm4, %xmm3
-; BITALG-NEXT: vpsrlw $4, %xmm0, %xmm0
-; BITALG-NEXT: vpand %xmm2, %xmm0, %xmm0
-; BITALG-NEXT: vpshufb %xmm0, %xmm4, %xmm0
-; BITALG-NEXT: vpaddb %xmm3, %xmm0, %xmm0
+; BITALG-NEXT: vpopcntb %xmm0, %xmm0
; BITALG-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; BITALG-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
; BITALG-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -742,18 +703,18 @@ define <4 x i32> @testv4i32u(<4 x i32> %
; SSE2-NEXT: pcmpeqd %xmm3, %xmm3
; SSE2-NEXT: paddd %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: psrld $1, %xmm0
+; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: psubd %xmm0, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459]
+; SSE2-NEXT: psubb %xmm0, %xmm3
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm3, %xmm2
; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: psrld $2, %xmm3
+; SSE2-NEXT: psrlw $2, %xmm3
; SSE2-NEXT: pand %xmm0, %xmm3
-; SSE2-NEXT: paddd %xmm2, %xmm3
+; SSE2-NEXT: paddb %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: psrld $4, %xmm0
-; SSE2-NEXT: paddd %xmm3, %xmm0
+; SSE2-NEXT: psrlw $4, %xmm0
+; SSE2-NEXT: paddb %xmm3, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
@@ -772,18 +733,18 @@ define <4 x i32> @testv4i32u(<4 x i32> %
; SSE3-NEXT: pcmpeqd %xmm3, %xmm3
; SSE3-NEXT: paddd %xmm2, %xmm3
; SSE3-NEXT: movdqa %xmm3, %xmm0
-; SSE3-NEXT: psrld $1, %xmm0
+; SSE3-NEXT: psrlw $1, %xmm0
; SSE3-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE3-NEXT: psubd %xmm0, %xmm3
-; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459]
+; SSE3-NEXT: psubb %xmm0, %xmm3
+; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE3-NEXT: movdqa %xmm3, %xmm2
; SSE3-NEXT: pand %xmm0, %xmm2
-; SSE3-NEXT: psrld $2, %xmm3
+; SSE3-NEXT: psrlw $2, %xmm3
; SSE3-NEXT: pand %xmm0, %xmm3
-; SSE3-NEXT: paddd %xmm2, %xmm3
+; SSE3-NEXT: paddb %xmm2, %xmm3
; SSE3-NEXT: movdqa %xmm3, %xmm0
-; SSE3-NEXT: psrld $4, %xmm0
-; SSE3-NEXT: paddd %xmm3, %xmm0
+; SSE3-NEXT: psrlw $4, %xmm0
+; SSE3-NEXT: paddb %xmm3, %xmm0
; SSE3-NEXT: pand {{.*}}(%rip), %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm2
; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
@@ -938,19 +899,13 @@ define <4 x i32> @testv4i32u(<4 x i32> %
; BITALG_NOVLX-NEXT: vpand %xmm2, %xmm0, %xmm0
; BITALG_NOVLX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; BITALG_NOVLX-NEXT: vpaddd %xmm2, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG_NOVLX-NEXT: vpand %xmm2, %xmm0, %xmm3
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG_NOVLX-NEXT: vpshufb %xmm3, %xmm4, %xmm3
-; BITALG_NOVLX-NEXT: vpsrlw $4, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vpand %xmm2, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vpshufb %xmm0, %xmm4, %xmm0
-; BITALG_NOVLX-NEXT: vpaddb %xmm3, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
; BITALG_NOVLX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; BITALG_NOVLX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: testv4i32u:
@@ -960,14 +915,7 @@ define <4 x i32> @testv4i32u(<4 x i32> %
; BITALG-NEXT: vpand %xmm2, %xmm0, %xmm0
; BITALG-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; BITALG-NEXT: vpaddd %xmm2, %xmm0, %xmm0
-; BITALG-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT: vpand %xmm2, %xmm0, %xmm3
-; BITALG-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT: vpshufb %xmm3, %xmm4, %xmm3
-; BITALG-NEXT: vpsrlw $4, %xmm0, %xmm0
-; BITALG-NEXT: vpand %xmm2, %xmm0, %xmm0
-; BITALG-NEXT: vpshufb %xmm0, %xmm4, %xmm0
-; BITALG-NEXT: vpaddb %xmm3, %xmm0, %xmm0
+; BITALG-NEXT: vpopcntb %xmm0, %xmm0
; BITALG-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; BITALG-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
; BITALG-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -1014,16 +962,16 @@ define <8 x i16> @testv8i16(<8 x i16> %i
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $1, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE2-NEXT: psubw %xmm1, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [13107,13107,13107,13107,13107,13107,13107,13107]
+; SSE2-NEXT: psubb %xmm1, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: paddw %xmm2, %xmm0
+; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddw %xmm0, %xmm1
+; SSE2-NEXT: paddb %xmm0, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psllw $8, %xmm0
@@ -1041,16 +989,16 @@ define <8 x i16> @testv8i16(<8 x i16> %i
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $1, %xmm1
; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE3-NEXT: psubw %xmm1, %xmm0
-; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [13107,13107,13107,13107,13107,13107,13107,13107]
+; SSE3-NEXT: psubb %xmm1, %xmm0
+; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE3-NEXT: movdqa %xmm0, %xmm2
; SSE3-NEXT: pand %xmm1, %xmm2
; SSE3-NEXT: psrlw $2, %xmm0
; SSE3-NEXT: pand %xmm1, %xmm0
-; SSE3-NEXT: paddw %xmm2, %xmm0
+; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddw %xmm0, %xmm1
+; SSE3-NEXT: paddb %xmm0, %xmm1
; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
; SSE3-NEXT: movdqa %xmm1, %xmm0
; SSE3-NEXT: psllw $8, %xmm0
@@ -1210,16 +1158,16 @@ define <8 x i16> @testv8i16u(<8 x i16> %
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $1, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE2-NEXT: psubw %xmm1, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [13107,13107,13107,13107,13107,13107,13107,13107]
+; SSE2-NEXT: psubb %xmm1, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: paddw %xmm2, %xmm0
+; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddw %xmm0, %xmm1
+; SSE2-NEXT: paddb %xmm0, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psllw $8, %xmm0
@@ -1237,16 +1185,16 @@ define <8 x i16> @testv8i16u(<8 x i16> %
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $1, %xmm1
; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE3-NEXT: psubw %xmm1, %xmm0
-; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [13107,13107,13107,13107,13107,13107,13107,13107]
+; SSE3-NEXT: psubb %xmm1, %xmm0
+; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE3-NEXT: movdqa %xmm0, %xmm2
; SSE3-NEXT: pand %xmm1, %xmm2
; SSE3-NEXT: psrlw $2, %xmm0
; SSE3-NEXT: pand %xmm1, %xmm0
-; SSE3-NEXT: paddw %xmm2, %xmm0
+; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddw %xmm0, %xmm1
+; SSE3-NEXT: paddb %xmm0, %xmm1
; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
; SSE3-NEXT: movdqa %xmm1, %xmm0
; SSE3-NEXT: psllw $8, %xmm0
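For vXi16 the byte-sum stage is the shift/add trio at the tail of these hunks: psllw $8 copies each word's low count byte into its high byte, paddb adds the two counts, and (just past the shown context) psrlw $8 moves the total back down. A sketch under SSE2 intrinsics, with the illustrative name sum_bytes_to_epi16:

  /* Byte-sum stage for vXi16: fold the two count bytes of each word
     together, then shift the total into the low byte. */
  static __m128i sum_bytes_to_epi16(__m128i bytecnt) {
    __m128i t = _mm_slli_epi16(bytecnt, 8); /* low count into high byte */
    t = _mm_add_epi8(t, bytecnt);           /* high byte = lo + hi count */
    return _mm_srli_epi16(t, 8);            /* per-word totals */
  }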
Modified: llvm/trunk/test/CodeGen/X86/vector-tzcnt-256.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-tzcnt-256.ll?rev=344348&r1=344347&r2=344348&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-tzcnt-256.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-tzcnt-256.ll Fri Oct 12 07:18:47 2018
@@ -124,14 +124,7 @@ define <4 x i64> @testv4i64(<4 x i64> %i
; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm0
; BITALG_NOVLX-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; BITALG_NOVLX-NEXT: vpaddq %ymm2, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm3
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG_NOVLX-NEXT: vpshufb %ymm3, %ymm4, %ymm3
-; BITALG_NOVLX-NEXT: vpsrlw $4, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT: vpshufb %ymm0, %ymm4, %ymm0
-; BITALG_NOVLX-NEXT: vpaddb %ymm3, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; BITALG_NOVLX-NEXT: retq
;
@@ -142,14 +135,7 @@ define <4 x i64> @testv4i64(<4 x i64> %i
; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm0
; BITALG-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; BITALG-NEXT: vpaddq %ymm2, %ymm0, %ymm0
-; BITALG-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm3
-; BITALG-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT: vpshufb %ymm3, %ymm4, %ymm3
-; BITALG-NEXT: vpsrlw $4, %ymm0, %ymm0
-; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm0
-; BITALG-NEXT: vpshufb %ymm0, %ymm4, %ymm0
-; BITALG-NEXT: vpaddb %ymm3, %ymm0, %ymm0
+; BITALG-NEXT: vpopcntb %ymm0, %ymm0
; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; BITALG-NEXT: retq
;
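For vXi64 no unpacking is needed: a single vpsadbw against zero sums the eight count bytes of each qword. The instructions ahead of vpopcntb are the usual cttz-to-ctpop conversion: assuming ymm2 holds the negated input (computed just above the shown hunk context), vpand isolates the lowest set bit, vpcmpeqd materializes all-ones, and vpaddq subtracts one to leave a mask of exactly the trailing zeros. A scalar C model of that identity (the helper name is hypothetical):

  #include <stdint.h>

  /* cttz(x) == popcount((x & -x) - 1): the subtraction sets exactly
     the bits below the lowest set bit (all 64 bits when x == 0). */
  static unsigned cttz_via_popcount(uint64_t x) {
    uint64_t m = (x & (0 - x)) - 1;
    m -= (m >> 1) & 0x5555555555555555ULL;                           /* pairs   */
    m = (m & 0x3333333333333333ULL) + ((m >> 2) & 0x3333333333333333ULL);
    m = (m + (m >> 4)) & 0x0F0F0F0F0F0F0F0FULL;                      /* bytes   */
    return (unsigned)((m * 0x0101010101010101ULL) >> 56);            /* byte sum */
  }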
@@ -270,14 +256,7 @@ define <4 x i64> @testv4i64u(<4 x i64> %
; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm0
; BITALG_NOVLX-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; BITALG_NOVLX-NEXT: vpaddq %ymm2, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm3
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG_NOVLX-NEXT: vpshufb %ymm3, %ymm4, %ymm3
-; BITALG_NOVLX-NEXT: vpsrlw $4, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT: vpshufb %ymm0, %ymm4, %ymm0
-; BITALG_NOVLX-NEXT: vpaddb %ymm3, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; BITALG_NOVLX-NEXT: retq
;
@@ -288,14 +267,7 @@ define <4 x i64> @testv4i64u(<4 x i64> %
; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm0
; BITALG-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; BITALG-NEXT: vpaddq %ymm2, %ymm0, %ymm0
-; BITALG-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm3
-; BITALG-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT: vpshufb %ymm3, %ymm4, %ymm3
-; BITALG-NEXT: vpsrlw $4, %ymm0, %ymm0
-; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm0
-; BITALG-NEXT: vpshufb %ymm0, %ymm4, %ymm0
-; BITALG-NEXT: vpaddb %ymm3, %ymm0, %ymm0
+; BITALG-NEXT: vpopcntb %ymm0, %ymm0
; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; BITALG-NEXT: retq
;
@@ -452,14 +424,7 @@ define <8 x i32> @testv8i32(<8 x i32> %i
; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm0
; BITALG_NOVLX-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; BITALG_NOVLX-NEXT: vpaddd %ymm2, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm3
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG_NOVLX-NEXT: vpshufb %ymm3, %ymm4, %ymm3
-; BITALG_NOVLX-NEXT: vpsrlw $4, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT: vpshufb %ymm0, %ymm4, %ymm0
-; BITALG_NOVLX-NEXT: vpaddb %ymm3, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; BITALG_NOVLX-NEXT: vpsadbw %ymm1, %ymm2, %ymm2
; BITALG_NOVLX-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
@@ -474,14 +439,7 @@ define <8 x i32> @testv8i32(<8 x i32> %i
; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm0
; BITALG-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; BITALG-NEXT: vpaddd %ymm2, %ymm0, %ymm0
-; BITALG-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm3
-; BITALG-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT: vpshufb %ymm3, %ymm4, %ymm3
-; BITALG-NEXT: vpsrlw $4, %ymm0, %ymm0
-; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm0
-; BITALG-NEXT: vpshufb %ymm0, %ymm4, %ymm0
-; BITALG-NEXT: vpaddb %ymm3, %ymm0, %ymm0
+; BITALG-NEXT: vpopcntb %ymm0, %ymm0
; BITALG-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; BITALG-NEXT: vpsadbw %ymm1, %ymm2, %ymm2
; BITALG-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
@@ -623,14 +581,7 @@ define <8 x i32> @testv8i32u(<8 x i32> %
; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm0
; BITALG_NOVLX-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; BITALG_NOVLX-NEXT: vpaddd %ymm2, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm3
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG_NOVLX-NEXT: vpshufb %ymm3, %ymm4, %ymm3
-; BITALG_NOVLX-NEXT: vpsrlw $4, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT: vpshufb %ymm0, %ymm4, %ymm0
-; BITALG_NOVLX-NEXT: vpaddb %ymm3, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; BITALG_NOVLX-NEXT: vpsadbw %ymm1, %ymm2, %ymm2
; BITALG_NOVLX-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
@@ -645,14 +596,7 @@ define <8 x i32> @testv8i32u(<8 x i32> %
; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm0
; BITALG-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; BITALG-NEXT: vpaddd %ymm2, %ymm0, %ymm0
-; BITALG-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm3
-; BITALG-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT: vpshufb %ymm3, %ymm4, %ymm3
-; BITALG-NEXT: vpsrlw $4, %ymm0, %ymm0
-; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm0
-; BITALG-NEXT: vpshufb %ymm0, %ymm4, %ymm0
-; BITALG-NEXT: vpaddb %ymm3, %ymm0, %ymm0
+; BITALG-NEXT: vpopcntb %ymm0, %ymm0
; BITALG-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; BITALG-NEXT: vpsadbw %ymm1, %ymm2, %ymm2
; BITALG-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
Modified: llvm/trunk/test/CodeGen/X86/vector-tzcnt-512.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-tzcnt-512.ll?rev=344348&r1=344347&r2=344348&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-tzcnt-512.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-tzcnt-512.ll Fri Oct 12 07:18:47 2018
@@ -87,14 +87,7 @@ define <8 x i64> @testv8i64(<8 x i64> %i
; BITALG-NEXT: vpandq %zmm2, %zmm0, %zmm0
; BITALG-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2
; BITALG-NEXT: vpaddq %zmm2, %zmm0, %zmm0
-; BITALG-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT: vpandq %zmm2, %zmm0, %zmm3
-; BITALG-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT: vpshufb %zmm3, %zmm4, %zmm3
-; BITALG-NEXT: vpsrlw $4, %zmm0, %zmm0
-; BITALG-NEXT: vpandq %zmm2, %zmm0, %zmm0
-; BITALG-NEXT: vpshufb %zmm0, %zmm4, %zmm0
-; BITALG-NEXT: vpaddb %zmm3, %zmm0, %zmm0
+; BITALG-NEXT: vpopcntb %zmm0, %zmm0
; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
; BITALG-NEXT: retq
%out = call <8 x i64> @llvm.cttz.v8i64(<8 x i64> %in, i1 0)
@@ -157,14 +150,7 @@ define <8 x i64> @testv8i64u(<8 x i64> %
; BITALG-NEXT: vpandq %zmm2, %zmm0, %zmm0
; BITALG-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2
; BITALG-NEXT: vpaddq %zmm2, %zmm0, %zmm0
-; BITALG-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT: vpandq %zmm2, %zmm0, %zmm3
-; BITALG-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT: vpshufb %zmm3, %zmm4, %zmm3
-; BITALG-NEXT: vpsrlw $4, %zmm0, %zmm0
-; BITALG-NEXT: vpandq %zmm2, %zmm0, %zmm0
-; BITALG-NEXT: vpshufb %zmm0, %zmm4, %zmm0
-; BITALG-NEXT: vpaddb %zmm3, %zmm0, %zmm0
+; BITALG-NEXT: vpopcntb %zmm0, %zmm0
; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
; BITALG-NEXT: retq
%out = call <8 x i64> @llvm.cttz.v8i64(<8 x i64> %in, i1 -1)
@@ -269,14 +255,7 @@ define <16 x i32> @testv16i32(<16 x i32>
; BITALG-NEXT: vpandq %zmm2, %zmm0, %zmm0
; BITALG-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2
; BITALG-NEXT: vpaddd %zmm2, %zmm0, %zmm0
-; BITALG-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT: vpandq %zmm2, %zmm0, %zmm3
-; BITALG-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT: vpshufb %zmm3, %zmm4, %zmm3
-; BITALG-NEXT: vpsrlw $4, %zmm0, %zmm0
-; BITALG-NEXT: vpandq %zmm2, %zmm0, %zmm0
-; BITALG-NEXT: vpshufb %zmm0, %zmm4, %zmm0
-; BITALG-NEXT: vpaddb %zmm3, %zmm0, %zmm0
+; BITALG-NEXT: vpopcntb %zmm0, %zmm0
; BITALG-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; BITALG-NEXT: vpsadbw %zmm1, %zmm2, %zmm2
; BITALG-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
@@ -347,14 +326,7 @@ define <16 x i32> @testv16i32u(<16 x i32
; BITALG-NEXT: vpandq %zmm2, %zmm0, %zmm0
; BITALG-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2
; BITALG-NEXT: vpaddd %zmm2, %zmm0, %zmm0
-; BITALG-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT: vpandq %zmm2, %zmm0, %zmm3
-; BITALG-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT: vpshufb %zmm3, %zmm4, %zmm3
-; BITALG-NEXT: vpsrlw $4, %zmm0, %zmm0
-; BITALG-NEXT: vpandq %zmm2, %zmm0, %zmm0
-; BITALG-NEXT: vpshufb %zmm0, %zmm4, %zmm0
-; BITALG-NEXT: vpaddb %zmm3, %zmm0, %zmm0
+; BITALG-NEXT: vpopcntb %zmm0, %zmm0
; BITALG-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; BITALG-NEXT: vpsadbw %zmm1, %zmm2, %zmm2
; BITALG-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]