[llvm] r363841 - [X86][SSE] Combine shuffles to ANY_EXTEND/ANY_EXTEND_VECTOR_INREG.
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Wed Jun 19 10:21:16 PDT 2019
Author: rksimon
Date: Wed Jun 19 10:21:15 2019
New Revision: 363841
URL: http://llvm.org/viewvc/llvm-project?rev=363841&view=rev
Log:
[X86][SSE] Combine shuffles to ANY_EXTEND/ANY_EXTEND_VECTOR_INREG.
We already do this for ZERO_EXTEND/ZERO_EXTEND_VECTOR_INREG - this just extends the pattern matcher to recognize cases where we don't need the zeros in the extension.
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/vector-reduce-mul.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=363841&r1=363840&r2=363841&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Wed Jun 19 10:21:15 2019
@@ -31301,19 +31301,25 @@ static bool matchUnaryShuffle(MVT MaskVT
    return true;
  }

-  // Match against a ZERO_EXTEND_VECTOR_INREG/VZEXT instruction.
+  // Match against a ANY/ZERO_EXTEND_VECTOR_INREG instruction.
  // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
  if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
                         (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
    unsigned MaxScale = 64 / MaskEltSize;
    for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
-      bool Match = true;
+      bool MatchAny = true;
+      bool MatchZero = true;
      unsigned NumDstElts = NumMaskElts / Scale;
-      for (unsigned i = 0; i != NumDstElts && Match; ++i) {
-        Match &= isUndefOrEqual(Mask[i * Scale], (int)i);
-        Match &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
+      for (unsigned i = 0; i != NumDstElts && (MatchAny || MatchZero); ++i) {
+        if (!isUndefOrEqual(Mask[i * Scale], (int)i)) {
+          MatchAny = MatchZero = false;
+          break;
+        }
+        MatchAny &= isUndefInRange(Mask, (i * Scale) + 1, Scale - 1);
+        MatchZero &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
      }
-      if (Match) {
+      if (MatchAny || MatchZero) {
+        assert(MatchZero && "Failed to match zext but matched aext?");
        unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
        MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() :
                                            MVT::getIntegerVT(MaskEltSize);
@@ -31322,10 +31328,9 @@ static bool matchUnaryShuffle(MVT MaskVT
        if (SrcVT.getSizeInBits() != MaskVT.getSizeInBits())
          V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);

-        if (SrcVT.getVectorNumElements() == NumDstElts)
-          Shuffle = unsigned(ISD::ZERO_EXTEND);
-        else
-          Shuffle = unsigned(ISD::ZERO_EXTEND_VECTOR_INREG);
+        Shuffle = unsigned(MatchAny ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND);
+        if (SrcVT.getVectorNumElements() != NumDstElts)
+          Shuffle = getOpcode_EXTEND_VECTOR_INREG(Shuffle);

        DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
        DstVT = MVT::getVectorVT(DstVT, NumDstElts);
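
For readers who want to step through the new matching logic outside of LLVM, here is a minimal standalone sketch of what the hunk above now checks: the first lane of every Scale-wide group must select the i'th source element (or be undef), and the remaining lanes decide whether the shuffle can fold to an any-extend (all undef, so the zeros never need to be materialized) or only to a zero-extend (undef or zero). The names classifyExtend, SentinelUndef and SentinelZero are illustrative stand-ins for LLVM's mask sentinels; this is a hedged approximation of the isUndefInRange/isUndefOrZeroInRange checks, not the committed code.

// Standalone sketch (not LLVM code) of the any/zero-extend mask classification.
#include <cstdio>
#include <vector>

enum : int { SentinelUndef = -1, SentinelZero = -2 };

// Returns 0 for "no extend pattern", 1 for an any-extend match (high lanes all
// undef) and 2 for a zero-extend match (high lanes undef or known zero).
static int classifyExtend(const std::vector<int> &Mask, unsigned Scale) {
  bool MatchAny = true, MatchZero = true;
  unsigned NumDstElts = Mask.size() / Scale;
  for (unsigned i = 0; i != NumDstElts && (MatchAny || MatchZero); ++i) {
    // Each group of Scale lanes must start with the i'th source element
    // (or undef), mirroring the isUndefOrEqual check in the patch.
    int M = Mask[i * Scale];
    if (M != SentinelUndef && M != (int)i) {
      MatchAny = MatchZero = false;
      break;
    }
    // The remaining lanes of the group decide between the two extends.
    for (unsigned j = 1; j != Scale; ++j) {
      int H = Mask[i * Scale + j];
      MatchAny = MatchAny && (H == SentinelUndef);
      MatchZero = MatchZero && (H == SentinelUndef || H == SentinelZero);
    }
  }
  if (MatchAny)
    return 1; // prefer the any-extend: no zeroing shuffle is needed
  if (MatchZero)
    return 2;
  return 0;
}

int main() {
  // <0,u,1,u,2,u,3,u> -> any-extend; <0,z,1,z,2,z,3,z> -> zero-extend only.
  std::vector<int> AnyMask = {0, SentinelUndef, 1, SentinelUndef,
                              2, SentinelUndef, 3, SentinelUndef};
  std::vector<int> ZeroMask = {0, SentinelZero, 1, SentinelZero,
                               2, SentinelZero, 3, SentinelZero};
  std::printf("%d %d\n", classifyExtend(AnyMask, 2), classifyExtend(ZeroMask, 2));
  return 0;
}

Built with any C++11 compiler this prints "1 2": the first mask takes the new any-extend path, while the second still requires the zero-extend path, which is exactly the distinction the patch adds to matchUnaryShuffle.
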
Modified: llvm/trunk/test/CodeGen/X86/vector-reduce-mul.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-reduce-mul.ll?rev=363841&r1=363840&r2=363841&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-reduce-mul.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-reduce-mul.ll Wed Jun 19 10:21:15 2019
@@ -2023,32 +2023,25 @@ define i8 @test_v32i8(<32 x i8> %a0) {
; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX2-NEXT: vpmullw %xmm2, %xmm3, %xmm2
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT: vpmullw %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpackuswb %xmm4, %xmm0, %xmm0
-; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX2-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrld $16, %xmm0, %xmm2
-; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpsrld $16, %xmm2, %xmm2
; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $0, %xmm0, %eax
@@ -2118,32 +2111,25 @@ define i8 @test_v32i8(<32 x i8> %a0) {
; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512DQ-NEXT: vpmullw %xmm2, %xmm3, %xmm2
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; AVX512DQ-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512DQ-NEXT: vpmullw %xmm2, %xmm1, %xmm1
; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpand %xmm3, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpackuswb %xmm4, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512DQ-NEXT: vpmullw %xmm2, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpand %xmm3, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512DQ-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
-; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
+; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX512DQ-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX512DQ-NEXT: vpmullw %xmm2, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpand %xmm3, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpsrld $16, %xmm0, %xmm2
-; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
+; AVX512DQ-NEXT: vpsrld $16, %xmm2, %xmm2
; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX512DQ-NEXT: vpmullw %xmm2, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpand %xmm3, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm1
+; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
+; AVX512DQ-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512DQ-NEXT: vpextrb $0, %xmm0, %eax
@@ -2339,32 +2325,25 @@ define i8 @test_v64i8(<64 x i8> %a0) {
; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX2-NEXT: vpmullw %xmm2, %xmm3, %xmm2
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT: vpmullw %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpackuswb %xmm4, %xmm0, %xmm0
-; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX2-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrld $16, %xmm0, %xmm2
-; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpsrld $16, %xmm2, %xmm2
; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $0, %xmm0, %eax
@@ -2389,32 +2368,25 @@ define i8 @test_v64i8(<64 x i8> %a0) {
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; AVX512BW-NEXT: vpmullw %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512BW-NEXT: vpmullw %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpand %xmm3, %xmm0, %xmm0
-; AVX512BW-NEXT: vpackuswb %xmm4, %xmm0, %xmm0
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512BW-NEXT: vpmullw %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpand %xmm3, %xmm0, %xmm0
-; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX512BW-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX512BW-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX512BW-NEXT: vpmullw %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpand %xmm3, %xmm0, %xmm0
-; AVX512BW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm2
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512BW-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX512BW-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpsrld $16, %xmm2, %xmm2
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX512BW-NEXT: vpmullw %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpand %xmm3, %xmm0, %xmm0
-; AVX512BW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512BW-NEXT: vpand %xmm1, %xmm0, %xmm1
+; AVX512BW-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
+; AVX512BW-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
@@ -2481,32 +2453,25 @@ define i8 @test_v64i8(<64 x i8> %a0) {
; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512DQ-NEXT: vpmullw %xmm2, %xmm3, %xmm2
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; AVX512DQ-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512DQ-NEXT: vpmullw %xmm2, %xmm1, %xmm1
; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpand %xmm3, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpackuswb %xmm4, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512DQ-NEXT: vpmullw %xmm2, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpand %xmm3, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512DQ-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
-; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
+; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX512DQ-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX512DQ-NEXT: vpmullw %xmm2, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpand %xmm3, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpsrld $16, %xmm0, %xmm2
-; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
+; AVX512DQ-NEXT: vpsrld $16, %xmm2, %xmm2
; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX512DQ-NEXT: vpmullw %xmm2, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpand %xmm3, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm1
+; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
+; AVX512DQ-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512DQ-NEXT: vpextrb $0, %xmm0, %eax
@@ -2786,32 +2751,25 @@ define i8 @test_v128i8(<128 x i8> %a0) {
; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX2-NEXT: vpmullw %xmm2, %xmm3, %xmm2
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT: vpmullw %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpackuswb %xmm4, %xmm0, %xmm0
-; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX2-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrld $16, %xmm0, %xmm2
-; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpsrld $16, %xmm2, %xmm2
; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $0, %xmm0, %eax
@@ -2843,32 +2801,25 @@ define i8 @test_v128i8(<128 x i8> %a0) {
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; AVX512BW-NEXT: vpmullw %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512BW-NEXT: vpmullw %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpand %xmm3, %xmm0, %xmm0
-; AVX512BW-NEXT: vpackuswb %xmm4, %xmm0, %xmm0
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512BW-NEXT: vpmullw %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpand %xmm3, %xmm0, %xmm0
-; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX512BW-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX512BW-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX512BW-NEXT: vpmullw %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpand %xmm3, %xmm0, %xmm0
-; AVX512BW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm2
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512BW-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX512BW-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpsrld $16, %xmm2, %xmm2
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX512BW-NEXT: vpmullw %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpand %xmm3, %xmm0, %xmm0
-; AVX512BW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512BW-NEXT: vpand %xmm1, %xmm0, %xmm1
+; AVX512BW-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
+; AVX512BW-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
@@ -2950,32 +2901,25 @@ define i8 @test_v128i8(<128 x i8> %a0) {
; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512DQ-NEXT: vpmullw %xmm2, %xmm3, %xmm2
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; AVX512DQ-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512DQ-NEXT: vpmullw %xmm2, %xmm1, %xmm1
; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpand %xmm3, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpackuswb %xmm4, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512DQ-NEXT: vpmullw %xmm2, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpand %xmm3, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512DQ-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
-; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
+; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX512DQ-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX512DQ-NEXT: vpmullw %xmm2, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpand %xmm3, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpsrld $16, %xmm0, %xmm2
-; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
+; AVX512DQ-NEXT: vpsrld $16, %xmm2, %xmm2
; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX512DQ-NEXT: vpmullw %xmm2, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpand %xmm3, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm1
+; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
+; AVX512DQ-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512DQ-NEXT: vpextrb $0, %xmm0, %eax