[llvm] d510fd2 - [X86] combineMulToPMADDWD - handle any pow2 vector type and split to legal types
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Tue Nov 9 07:20:58 PST 2021
Author: Simon Pilgrim
Date: 2021-11-09T15:20:43Z
New Revision: d510fd2bed030ff2c42e1a365e1898ac6e822694
URL: https://github.com/llvm/llvm-project/commit/d510fd2bed030ff2c42e1a365e1898ac6e822694
DIFF: https://github.com/llvm/llvm-project/commit/d510fd2bed030ff2c42e1a365e1898ac6e822694.diff
LOG: [X86] combineMulToPMADDWD - handle any pow2 vector type and split to legal types
combineMulToPMADDWD is currently limited to legal types, but there's no reason we can't handle any larger pow2 vector type that the existing SplitOpsAndApply helper can split into legal X86ISD::VPMADDWD ops.
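As a purely illustrative sketch (the function name and the 16-wide type are chosen here for exposition, not taken from the patch): on an SSE2-only target v16i32 is not legal, so a multiply like the one below previously fell outside the combine, but it can now be handled because SplitOpsAndApply breaks it into legal 128-bit VPMADDWD ops and the zero-extended i8 operands keep every lane within i16 range.

    ; hypothetical example, in the spirit of the slow-pmulld.ll tests
    define <16 x i32> @mul_zext_v16i8(<16 x i8> %a, <16 x i8> %b) {
      %xa = zext <16 x i8> %a to <16 x i32>
      %xb = zext <16 x i8> %b to <16 x i32>
      %m  = mul <16 x i32> %xa, %xb
      ret <16 x i32> %m
    }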
This also exposed a missed opportunity for pre-SSE41 targets to handle SEXT ops from types smaller than vXi16: without PMOVSX instructions these extensions will always be expanded to unpack+shifts anyway, so we can cheat and convert them into a ZEXT(SEXT()) sequence to make a valid PMADDWD op.
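A sketch of the sign-extend shape that benefits, loosely modelled on the mul_2xi8_sext_zext test in shrink_vmul.ll (the function name and 4-wide type here are illustrative only, not from the test suite):

    ; on a pre-SSE41 target the sext operand would be expanded with
    ; unpack+shift anyway, so the combine can rewrite it as
    ; zext(sext-to-i16) and still form PMADDWD
    define <4 x i32> @mul_sext_zext_v4i8(<4 x i8> %a, <4 x i8> %b) {
      %sa = sext <4 x i8> %a to <4 x i32>
      %zb = zext <4 x i8> %b to <4 x i32>
      %m  = mul <4 x i32> %sa, %zb
      ret <4 x i32> %m
    }

When both operands are narrow extensions of the same kind, the combine still bails out on pre-SSE41 targets, since reducing the vmul width is preferred there.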
Differential Revision: https://reviews.llvm.org/D110995
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/pmaddubsw.ll
llvm/test/CodeGen/X86/pmulh.ll
llvm/test/CodeGen/X86/shrink_vmul.ll
llvm/test/CodeGen/X86/slow-pmulld.ll
llvm/test/CodeGen/X86/vshli-simplify-demanded-bits.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index aa60a8cbb3bf..c952a283a28a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -44441,29 +44441,45 @@ static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
EVT VT = N->getValueType(0);
// Only support vXi32 vectors.
+ // TODO: Can we support > 32-bit elements?
if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
return SDValue();
- // Make sure the type is legal or will be widened to a legal type.
- if (VT != MVT::v2i32 && !DAG.getTargetLoweringInfo().isTypeLegal(VT))
+ // Make sure the type is legal or can split/widen to a legal type.
+ // With AVX512 but without BWI, we would need to split v32i16.
+ unsigned NumElts = VT.getVectorNumElements();
+ if (NumElts == 1 || !isPowerOf2_32(NumElts))
return SDValue();
- MVT WVT = MVT::getVectorVT(MVT::i16, 2 * VT.getVectorNumElements());
+ EVT WVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, 2 * NumElts);
- // Without BWI, we would need to split v32i16.
- if (WVT == MVT::v32i16 && !Subtarget.hasBWI())
+ // With AVX512 but without BWI, we would need to split v32i16.
+ if (32 <= (2 * NumElts) && Subtarget.hasAVX512() && !Subtarget.hasBWI())
return SDValue();
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
- // If we are zero extending two steps without SSE4.1, its better to reduce
+ // If we are zero/sign extending two steps without SSE4.1, its better to
+ // reduce the vmul width instead.
+ if (!Subtarget.hasSSE41() &&
+ (((N0.getOpcode() == ISD::ZERO_EXTEND &&
+ N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
+ (N1.getOpcode() == ISD::ZERO_EXTEND &&
+ N1.getOperand(0).getScalarValueSizeInBits() <= 8)) ||
+ ((N0.getOpcode() == ISD::SIGN_EXTEND &&
+ N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
+ (N1.getOpcode() == ISD::SIGN_EXTEND &&
+ N1.getOperand(0).getScalarValueSizeInBits() <= 8))))
+ return SDValue();
+
+ // If we are sign extending a wide vector without SSE4.1, its better to reduce
// the vmul width instead.
if (!Subtarget.hasSSE41() &&
- (N0.getOpcode() == ISD::ZERO_EXTEND &&
- N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
- (N1.getOpcode() == ISD::ZERO_EXTEND &&
- N1.getOperand(0).getScalarValueSizeInBits() <= 8))
+ (N0.getOpcode() == ISD::SIGN_EXTEND &&
+ N0.getOperand(0).getValueSizeInBits() > 128) &&
+ (N1.getOpcode() == ISD::SIGN_EXTEND &&
+ N1.getOperand(0).getValueSizeInBits() > 128))
return SDValue();
// Sign bits must extend down to the lowest i16.
@@ -44480,12 +44496,18 @@ static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode()))
return DAG.getNode(ISD::AND, SDLoc(N), VT, Op,
DAG.getConstant(0xFFFF, SDLoc(N), VT));
- // Convert sext(vXi16) to zext(vXi16).
- if (Op.getOpcode() == ISD::SIGN_EXTEND && VT.getSizeInBits() <= 128 &&
- N->isOnlyUserOf(Op.getNode())) {
+ if (Op.getOpcode() == ISD::SIGN_EXTEND && N->isOnlyUserOf(Op.getNode())) {
SDValue Src = Op.getOperand(0);
- if (Src.getScalarValueSizeInBits() == 16)
+ // Convert sext(vXi16) to zext(vXi16).
+ if (Src.getScalarValueSizeInBits() == 16 && VT.getSizeInBits() <= 128)
return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Src);
+ // Convert sext(vXi8) to zext(vXi16 sext(vXi8)) on pre-SSE41 targets
+ // which will expand the extension.
+ if (Src.getScalarValueSizeInBits() < 16 && !Subtarget.hasSSE41()) {
+ EVT ExtVT = VT.changeVectorElementType(MVT::i16);
+ Src = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), ExtVT, Src);
+ return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Src);
+ }
}
// Convert SIGN_EXTEND_VECTOR_INREG to ZEXT_EXTEND_VECTOR_INREG.
if (Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
diff --git a/llvm/test/CodeGen/X86/pmaddubsw.ll b/llvm/test/CodeGen/X86/pmaddubsw.ll
index 8fc701954984..7229fd6bc855 100644
--- a/llvm/test/CodeGen/X86/pmaddubsw.ll
+++ b/llvm/test/CodeGen/X86/pmaddubsw.ll
@@ -296,29 +296,27 @@ define <8 x i16> @pmaddubsw_bad_extend(<16 x i8>* %Aptr, <16 x i8>* %Bptr) {
; SSE-LABEL: pmaddubsw_bad_extend:
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm1
-; SSE-NEXT: movdqa (%rsi), %xmm0
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; SSE-NEXT: pand %xmm0, %xmm2
-; SSE-NEXT: movdqa %xmm1, %xmm3
-; SSE-NEXT: psllw $8, %xmm3
-; SSE-NEXT: psraw $8, %xmm3
-; SSE-NEXT: movdqa %xmm3, %xmm4
-; SSE-NEXT: pmulhw %xmm2, %xmm4
-; SSE-NEXT: pmullw %xmm2, %xmm3
-; SSE-NEXT: movdqa %xmm3, %xmm2
-; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; SSE-NEXT: psraw $8, %xmm0
-; SSE-NEXT: psrlw $8, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm4
-; SSE-NEXT: pmulhw %xmm0, %xmm4
-; SSE-NEXT: pmullw %xmm0, %xmm1
+; SSE-NEXT: movdqa (%rsi), %xmm2
; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; SSE-NEXT: paddd %xmm2, %xmm0
-; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; SSE-NEXT: paddd %xmm3, %xmm1
-; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: psllw $8, %xmm0
+; SSE-NEXT: psraw $8, %xmm0
+; SSE-NEXT: movdqa %xmm2, %xmm3
+; SSE-NEXT: psraw $8, %xmm3
+; SSE-NEXT: movdqa %xmm2, %xmm4
+; SSE-NEXT: pshufb {{.*#+}} xmm4 = xmm4[0],zero,xmm4[2],zero,xmm4[4],zero,xmm4[6],zero,xmm4[u,u,u,u,u,u,u,u]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; SSE-NEXT: movdqa %xmm1, %xmm5
+; SSE-NEXT: psrlw $8, %xmm5
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,3,2,3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
+; SSE-NEXT: pmaddwd %xmm4, %xmm0
+; SSE-NEXT: pshufb {{.*#+}} xmm2 = xmm2[8],zero,xmm2[10],zero,xmm2[12],zero,xmm2[14],zero,xmm2[u,u,u,u,u,u,u,u]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; SSE-NEXT: pshufb {{.*#+}} xmm1 = xmm1[9],zero,xmm1[11],zero,xmm1[13],zero,xmm1[15],zero,xmm1[u,u,u,u,u,u,u,u]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3]
+; SSE-NEXT: pmaddwd %xmm2, %xmm6
+; SSE-NEXT: packssdw %xmm6, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: pmaddubsw_bad_extend:
@@ -395,30 +393,22 @@ define <8 x i16> @pmaddubsw_bad_indices(<16 x i8>* %Aptr, <16 x i8>* %Bptr) {
; SSE-LABEL: pmaddubsw_bad_indices:
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm1
-; SSE-NEXT: movdqa (%rsi), %xmm0
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; SSE-NEXT: pand %xmm0, %xmm2
-; SSE-NEXT: movdqa %xmm1, %xmm3
-; SSE-NEXT: pshufb {{.*#+}} xmm3 = xmm3[u,1,u,2,u,5,u,6,u,9,u,10,u,13,u,14]
-; SSE-NEXT: psraw $8, %xmm3
-; SSE-NEXT: movdqa %xmm3, %xmm4
-; SSE-NEXT: pmulhw %xmm2, %xmm4
-; SSE-NEXT: pmullw %xmm2, %xmm3
-; SSE-NEXT: movdqa %xmm3, %xmm2
-; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; SSE-NEXT: psrlw $8, %xmm0
+; SSE-NEXT: movdqa (%rsi), %xmm2
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[u,1,u,2,u,5,u,6,u,9,u,10,u,13,u,14]
+; SSE-NEXT: psraw $8, %xmm0
; SSE-NEXT: pshufb {{.*#+}} xmm1 = xmm1[u,0,u,3,u,4,u,7,u,8,u,11,u,12,u,15]
; SSE-NEXT: psraw $8, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm4
-; SSE-NEXT: pmulhw %xmm0, %xmm4
-; SSE-NEXT: pmullw %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; SSE-NEXT: paddd %xmm2, %xmm0
-; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; SSE-NEXT: paddd %xmm3, %xmm1
-; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: pxor %xmm3, %xmm3
+; SSE-NEXT: movdqa %xmm2, %xmm4
+; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
+; SSE-NEXT: movdqa %xmm0, %xmm5
+; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
+; SSE-NEXT: pmaddwd %xmm4, %xmm5
+; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: pmaddwd %xmm2, %xmm0
+; SSE-NEXT: packssdw %xmm5, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: pmaddubsw_bad_indices:
diff --git a/llvm/test/CodeGen/X86/pmulh.ll b/llvm/test/CodeGen/X86/pmulh.ll
index 4db1061de1da..c34f724855f9 100644
--- a/llvm/test/CodeGen/X86/pmulh.ll
+++ b/llvm/test/CodeGen/X86/pmulh.ll
@@ -326,32 +326,27 @@ define <16 x i16> @zext_mulhuw_v16i16(<16 x i16> %a, <16 x i16> %b) {
define <16 x i16> @and_mulhuw_v16i16(<16 x i32> %a, <16 x i32> %b) {
; SSE2-LABEL: and_mulhuw_v16i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [32767,32767,32767,32767]
-; SSE2-NEXT: pand %xmm8, %xmm3
-; SSE2-NEXT: pand %xmm8, %xmm2
-; SSE2-NEXT: packssdw %xmm3, %xmm2
-; SSE2-NEXT: pand %xmm8, %xmm1
-; SSE2-NEXT: pand %xmm8, %xmm0
-; SSE2-NEXT: packssdw %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm8, %xmm7
-; SSE2-NEXT: pand %xmm8, %xmm6
-; SSE2-NEXT: packssdw %xmm7, %xmm6
-; SSE2-NEXT: pmulhw %xmm2, %xmm6
-; SSE2-NEXT: pand %xmm8, %xmm5
-; SSE2-NEXT: pand %xmm8, %xmm4
-; SSE2-NEXT: packssdw %xmm5, %xmm4
-; SSE2-NEXT: pmulhw %xmm0, %xmm4
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm6, %xmm1
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
-; SSE2-NEXT: packssdw %xmm1, %xmm6
-; SSE2-NEXT: movdqa %xmm4, %xmm1
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
-; SSE2-NEXT: packssdw %xmm1, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm0
-; SSE2-NEXT: movdqa %xmm6, %xmm1
+; SSE2-NEXT: movdqa %xmm6, %xmm8
+; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [32767,32767,32767,32767]
+; SSE2-NEXT: pand %xmm6, %xmm3
+; SSE2-NEXT: pand %xmm6, %xmm2
+; SSE2-NEXT: pand %xmm6, %xmm1
+; SSE2-NEXT: pand %xmm6, %xmm0
+; SSE2-NEXT: pand %xmm6, %xmm7
+; SSE2-NEXT: pmaddwd %xmm3, %xmm7
+; SSE2-NEXT: pand %xmm6, %xmm8
+; SSE2-NEXT: pmaddwd %xmm2, %xmm8
+; SSE2-NEXT: pand %xmm6, %xmm5
+; SSE2-NEXT: pmaddwd %xmm1, %xmm5
+; SSE2-NEXT: pand %xmm4, %xmm6
+; SSE2-NEXT: pmaddwd %xmm6, %xmm0
+; SSE2-NEXT: psrld $16, %xmm7
+; SSE2-NEXT: psrld $16, %xmm8
+; SSE2-NEXT: packssdw %xmm7, %xmm8
+; SSE2-NEXT: psrld $16, %xmm5
+; SSE2-NEXT: psrld $16, %xmm0
+; SSE2-NEXT: packssdw %xmm5, %xmm0
+; SSE2-NEXT: movdqa %xmm8, %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: and_mulhuw_v16i16:
@@ -382,12 +377,12 @@ define <16 x i16> @and_mulhuw_v16i16(<16 x i32> %a, <16 x i32> %b) {
; AVX2-LABEL: and_mulhuw_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [32767,32767,32767,32767,32767,32767,32767,32767]
-; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
-; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
-; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
-; AVX2-NEXT: vpmaddwd %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
+; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm2
+; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1
; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/shrink_vmul.ll b/llvm/test/CodeGen/X86/shrink_vmul.ll
index 7557b3fc2844..82a633b6d8ca 100644
--- a/llvm/test/CodeGen/X86/shrink_vmul.ll
+++ b/llvm/test/CodeGen/X86/shrink_vmul.ll
@@ -985,16 +985,16 @@ define void @mul_2xi8_sext_zext(i8* nocapture readonly %a, i8* nocapture readonl
; X86-SSE-NEXT: movl c, %ecx
; X86-SSE-NEXT: movzwl (%esi,%eax), %esi
; X86-SSE-NEXT: movd %esi, %xmm0
-; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; X86-SSE-NEXT: psrad $24, %xmm0
; X86-SSE-NEXT: movzwl (%edx,%eax), %edx
; X86-SSE-NEXT: movd %edx, %xmm1
; X86-SSE-NEXT: pxor %xmm2, %xmm2
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X86-SSE-NEXT: pmaddwd %xmm0, %xmm1
-; X86-SSE-NEXT: movq %xmm1, (%ecx,%eax,4)
+; X86-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
+; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X86-SSE-NEXT: psraw $8, %xmm0
+; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X86-SSE-NEXT: pmaddwd %xmm1, %xmm0
+; X86-SSE-NEXT: movq %xmm0, (%ecx,%eax,4)
; X86-SSE-NEXT: popl %esi
; X86-SSE-NEXT: retl
;
@@ -1021,16 +1021,16 @@ define void @mul_2xi8_sext_zext(i8* nocapture readonly %a, i8* nocapture readonl
; X64-SSE-NEXT: movq c(%rip), %rax
; X64-SSE-NEXT: movzwl (%rdi,%rdx), %ecx
; X64-SSE-NEXT: movd %ecx, %xmm0
-; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; X64-SSE-NEXT: psrad $24, %xmm0
; X64-SSE-NEXT: movzwl (%rsi,%rdx), %ecx
; X64-SSE-NEXT: movd %ecx, %xmm1
; X64-SSE-NEXT: pxor %xmm2, %xmm2
; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X64-SSE-NEXT: pmaddwd %xmm0, %xmm1
-; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4)
+; X64-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
+; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X64-SSE-NEXT: psraw $8, %xmm0
+; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X64-SSE-NEXT: pmaddwd %xmm1, %xmm0
+; X64-SSE-NEXT: movq %xmm0, (%rax,%rdx,4)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_2xi8_sext_zext:
diff --git a/llvm/test/CodeGen/X86/slow-pmulld.ll b/llvm/test/CodeGen/X86/slow-pmulld.ll
index 1932905a78f0..5937b82264f3 100644
--- a/llvm/test/CodeGen/X86/slow-pmulld.ll
+++ b/llvm/test/CodeGen/X86/slow-pmulld.ll
@@ -113,50 +113,42 @@ define <4 x i32> @test_mul_v4i32_v4i8(<4 x i8> %A) {
define <8 x i32> @test_mul_v8i32_v8i8(<8 x i8> %A) {
; SLM32-LABEL: test_mul_v8i32_v8i8:
; SLM32: # %bb.0:
-; SLM32-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SLM32-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
-; SLM32-NEXT: movdqa %xmm1, %xmm2
-; SLM32-NEXT: pmullw %xmm0, %xmm1
-; SLM32-NEXT: pmulhw %xmm0, %xmm2
-; SLM32-NEXT: movdqa %xmm1, %xmm0
-; SLM32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SLM32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SLM32-NEXT: movdqa {{.*#+}} xmm2 = [18778,0,18778,0,18778,0,18778,0]
+; SLM32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SLM32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SLM32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SLM32-NEXT: pmaddwd %xmm2, %xmm0
+; SLM32-NEXT: pmaddwd %xmm2, %xmm1
; SLM32-NEXT: retl
;
; SLM64-LABEL: test_mul_v8i32_v8i8:
; SLM64: # %bb.0:
-; SLM64-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SLM64-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
-; SLM64-NEXT: movdqa %xmm1, %xmm2
-; SLM64-NEXT: pmullw %xmm0, %xmm1
-; SLM64-NEXT: pmulhw %xmm0, %xmm2
-; SLM64-NEXT: movdqa %xmm1, %xmm0
-; SLM64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SLM64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SLM64-NEXT: movdqa {{.*#+}} xmm2 = [18778,0,18778,0,18778,0,18778,0]
+; SLM64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SLM64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SLM64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SLM64-NEXT: pmaddwd %xmm2, %xmm0
+; SLM64-NEXT: pmaddwd %xmm2, %xmm1
; SLM64-NEXT: retq
;
; SLOW32-LABEL: test_mul_v8i32_v8i8:
; SLOW32: # %bb.0:
-; SLOW32-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SLOW32-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
-; SLOW32-NEXT: movdqa %xmm1, %xmm2
-; SLOW32-NEXT: pmulhw %xmm0, %xmm2
-; SLOW32-NEXT: pmullw %xmm0, %xmm1
-; SLOW32-NEXT: movdqa %xmm1, %xmm0
-; SLOW32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SLOW32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SLOW32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SLOW32-NEXT: movdqa {{.*#+}} xmm2 = [18778,0,18778,0,18778,0,18778,0]
+; SLOW32-NEXT: pmaddwd %xmm2, %xmm0
+; SLOW32-NEXT: pmaddwd %xmm2, %xmm1
; SLOW32-NEXT: retl
;
; SLOW64-LABEL: test_mul_v8i32_v8i8:
; SLOW64: # %bb.0:
-; SLOW64-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SLOW64-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
-; SLOW64-NEXT: movdqa %xmm1, %xmm2
-; SLOW64-NEXT: pmulhw %xmm0, %xmm2
-; SLOW64-NEXT: pmullw %xmm0, %xmm1
-; SLOW64-NEXT: movdqa %xmm1, %xmm0
-; SLOW64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SLOW64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SLOW64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SLOW64-NEXT: movdqa {{.*#+}} xmm2 = [18778,0,18778,0,18778,0,18778,0]
+; SLOW64-NEXT: pmaddwd %xmm2, %xmm0
+; SLOW64-NEXT: pmaddwd %xmm2, %xmm1
; SLOW64-NEXT: retq
;
; SSE4-32-LABEL: test_mul_v8i32_v8i8:
@@ -164,7 +156,7 @@ define <8 x i32> @test_mul_v8i32_v8i8(<8 x i8> %A) {
; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SSE4-32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
+; SSE4-32-NEXT: movdqa {{.*#+}} xmm2 = [18778,0,18778,0,18778,0,18778,0]
; SSE4-32-NEXT: pmaddwd %xmm2, %xmm0
; SSE4-32-NEXT: pmaddwd %xmm2, %xmm1
; SSE4-32-NEXT: retl
@@ -174,7 +166,7 @@ define <8 x i32> @test_mul_v8i32_v8i8(<8 x i8> %A) {
; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SSE4-64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
+; SSE4-64-NEXT: movdqa {{.*#+}} xmm2 = [18778,0,18778,0,18778,0,18778,0]
; SSE4-64-NEXT: pmaddwd %xmm2, %xmm0
; SSE4-64-NEXT: pmaddwd %xmm2, %xmm1
; SSE4-64-NEXT: retq
@@ -248,86 +240,66 @@ define <8 x i32> @test_mul_v8i32_v8i8(<8 x i8> %A) {
define <16 x i32> @test_mul_v16i32_v16i8(<16 x i8> %A) {
; SLM32-LABEL: test_mul_v16i32_v16i8:
; SLM32: # %bb.0:
-; SLM32-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SLM32-NEXT: movdqa %xmm0, %xmm3
-; SLM32-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
-; SLM32-NEXT: pxor %xmm4, %xmm4
-; SLM32-NEXT: movdqa %xmm1, %xmm2
-; SLM32-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
-; SLM32-NEXT: movdqa %xmm3, %xmm4
-; SLM32-NEXT: pmullw %xmm0, %xmm1
-; SLM32-NEXT: pmulhw %xmm0, %xmm2
-; SLM32-NEXT: pmullw %xmm0, %xmm3
-; SLM32-NEXT: pmulhw %xmm0, %xmm4
-; SLM32-NEXT: movdqa %xmm1, %xmm0
-; SLM32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SLM32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SLM32-NEXT: movdqa %xmm3, %xmm2
-; SLM32-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; SLM32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; SLM32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; SLM32-NEXT: movdqa {{.*#+}} xmm5 = [18778,0,18778,0,18778,0,18778,0]
+; SLM32-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
+; SLM32-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SLM32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SLM32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SLM32-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SLM32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
+; SLM32-NEXT: pmaddwd %xmm5, %xmm0
+; SLM32-NEXT: pmaddwd %xmm5, %xmm1
+; SLM32-NEXT: pmaddwd %xmm5, %xmm2
+; SLM32-NEXT: pmaddwd %xmm5, %xmm3
; SLM32-NEXT: retl
;
; SLM64-LABEL: test_mul_v16i32_v16i8:
; SLM64: # %bb.0:
-; SLM64-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SLM64-NEXT: movdqa %xmm0, %xmm3
-; SLM64-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
-; SLM64-NEXT: pxor %xmm4, %xmm4
-; SLM64-NEXT: movdqa %xmm1, %xmm2
-; SLM64-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
-; SLM64-NEXT: movdqa %xmm3, %xmm4
-; SLM64-NEXT: pmullw %xmm0, %xmm1
-; SLM64-NEXT: pmulhw %xmm0, %xmm2
-; SLM64-NEXT: pmullw %xmm0, %xmm3
-; SLM64-NEXT: pmulhw %xmm0, %xmm4
-; SLM64-NEXT: movdqa %xmm1, %xmm0
-; SLM64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SLM64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SLM64-NEXT: movdqa %xmm3, %xmm2
-; SLM64-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; SLM64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; SLM64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; SLM64-NEXT: movdqa {{.*#+}} xmm5 = [18778,0,18778,0,18778,0,18778,0]
+; SLM64-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
+; SLM64-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SLM64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SLM64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SLM64-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SLM64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
+; SLM64-NEXT: pmaddwd %xmm5, %xmm0
+; SLM64-NEXT: pmaddwd %xmm5, %xmm1
+; SLM64-NEXT: pmaddwd %xmm5, %xmm2
+; SLM64-NEXT: pmaddwd %xmm5, %xmm3
; SLM64-NEXT: retq
;
; SLOW32-LABEL: test_mul_v16i32_v16i8:
; SLOW32: # %bb.0:
-; SLOW32-NEXT: movdqa %xmm0, %xmm3
-; SLOW32-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SLOW32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
-; SLOW32-NEXT: movdqa %xmm1, %xmm4
-; SLOW32-NEXT: pmulhw %xmm2, %xmm4
-; SLOW32-NEXT: pmullw %xmm2, %xmm1
-; SLOW32-NEXT: movdqa %xmm1, %xmm0
-; SLOW32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; SLOW32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; SLOW32-NEXT: pxor %xmm4, %xmm4
-; SLOW32-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
-; SLOW32-NEXT: movdqa %xmm3, %xmm4
-; SLOW32-NEXT: pmulhw %xmm2, %xmm4
-; SLOW32-NEXT: pmullw %xmm2, %xmm3
-; SLOW32-NEXT: movdqa %xmm3, %xmm2
-; SLOW32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; SLOW32-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SLOW32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SLOW32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SLOW32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SLOW32-NEXT: movdqa {{.*#+}} xmm4 = [18778,0,18778,0,18778,0,18778,0]
+; SLOW32-NEXT: pmaddwd %xmm4, %xmm0
+; SLOW32-NEXT: pmaddwd %xmm4, %xmm1
+; SLOW32-NEXT: pmaddwd %xmm4, %xmm2
+; SLOW32-NEXT: pmaddwd %xmm4, %xmm3
; SLOW32-NEXT: retl
;
; SLOW64-LABEL: test_mul_v16i32_v16i8:
; SLOW64: # %bb.0:
-; SLOW64-NEXT: movdqa %xmm0, %xmm3
-; SLOW64-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SLOW64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
-; SLOW64-NEXT: movdqa %xmm1, %xmm4
-; SLOW64-NEXT: pmulhw %xmm2, %xmm4
-; SLOW64-NEXT: pmullw %xmm2, %xmm1
-; SLOW64-NEXT: movdqa %xmm1, %xmm0
-; SLOW64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; SLOW64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; SLOW64-NEXT: pxor %xmm4, %xmm4
-; SLOW64-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
-; SLOW64-NEXT: movdqa %xmm3, %xmm4
-; SLOW64-NEXT: pmulhw %xmm2, %xmm4
-; SLOW64-NEXT: pmullw %xmm2, %xmm3
-; SLOW64-NEXT: movdqa %xmm3, %xmm2
-; SLOW64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; SLOW64-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SLOW64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SLOW64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SLOW64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SLOW64-NEXT: movdqa {{.*#+}} xmm4 = [18778,0,18778,0,18778,0,18778,0]
+; SLOW64-NEXT: pmaddwd %xmm4, %xmm0
+; SLOW64-NEXT: pmaddwd %xmm4, %xmm1
+; SLOW64-NEXT: pmaddwd %xmm4, %xmm2
+; SLOW64-NEXT: pmaddwd %xmm4, %xmm3
; SLOW64-NEXT: retq
;
; SSE4-32-LABEL: test_mul_v16i32_v16i8:
@@ -339,7 +311,7 @@ define <16 x i32> @test_mul_v16i32_v16i8(<16 x i8> %A) {
; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SSE4-32-NEXT: movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778]
+; SSE4-32-NEXT: movdqa {{.*#+}} xmm4 = [18778,0,18778,0,18778,0,18778,0]
; SSE4-32-NEXT: pmaddwd %xmm4, %xmm0
; SSE4-32-NEXT: pmaddwd %xmm4, %xmm1
; SSE4-32-NEXT: pmaddwd %xmm4, %xmm2
@@ -355,7 +327,7 @@ define <16 x i32> @test_mul_v16i32_v16i8(<16 x i8> %A) {
; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SSE4-64-NEXT: movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778]
+; SSE4-64-NEXT: movdqa {{.*#+}} xmm4 = [18778,0,18778,0,18778,0,18778,0]
; SSE4-64-NEXT: pmaddwd %xmm4, %xmm0
; SSE4-64-NEXT: pmaddwd %xmm4, %xmm1
; SSE4-64-NEXT: pmaddwd %xmm4, %xmm2
@@ -364,18 +336,12 @@ define <16 x i32> @test_mul_v16i32_v16i8(<16 x i8> %A) {
;
; AVX2-SLOW-LABEL: test_mul_v16i32_v16i8:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778]
-; AVX2-SLOW-NEXT: vpmulhw %ymm1, %ymm0, %ymm2
-; AVX2-SLOW-NEXT: vpmullw %ymm1, %ymm0, %ymm1
-; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1
-; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX2-SLOW-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
+; AVX2-SLOW-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX2-SLOW-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
+; AVX2-SLOW-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1
; AVX2-SLOW-NEXT: ret{{[l|q]}}
;
; AVX2-32-LABEL: test_mul_v16i32_v16i8:
@@ -852,7 +818,7 @@ define <4 x i32> @test_mul_v4i32_v4i8_minsize(<4 x i8> %A) minsize {
define <8 x i32> @test_mul_v8i32_v8i8_minsize(<8 x i8> %A) minsize {
; SLM32-LABEL: test_mul_v8i32_v8i8_minsize:
; SLM32: # %bb.0:
-; SLM32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
+; SLM32-NEXT: movdqa {{.*#+}} xmm2 = [18778,0,18778,0,18778,0,18778,0]
; SLM32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SLM32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SLM32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
@@ -862,7 +828,7 @@ define <8 x i32> @test_mul_v8i32_v8i8_minsize(<8 x i8> %A) minsize {
;
; SLM64-LABEL: test_mul_v8i32_v8i8_minsize:
; SLM64: # %bb.0:
-; SLM64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
+; SLM64-NEXT: movdqa {{.*#+}} xmm2 = [18778,0,18778,0,18778,0,18778,0]
; SLM64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SLM64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SLM64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
@@ -875,7 +841,7 @@ define <8 x i32> @test_mul_v8i32_v8i8_minsize(<8 x i8> %A) minsize {
; SLOW32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SLOW32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
+; SLOW32-NEXT: movdqa {{.*#+}} xmm2 = [18778,0,18778,0,18778,0,18778,0]
; SLOW32-NEXT: pmaddwd %xmm2, %xmm0
; SLOW32-NEXT: pmaddwd %xmm2, %xmm1
; SLOW32-NEXT: retl
@@ -885,7 +851,7 @@ define <8 x i32> @test_mul_v8i32_v8i8_minsize(<8 x i8> %A) minsize {
; SLOW64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SLOW64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
+; SLOW64-NEXT: movdqa {{.*#+}} xmm2 = [18778,0,18778,0,18778,0,18778,0]
; SLOW64-NEXT: pmaddwd %xmm2, %xmm0
; SLOW64-NEXT: pmaddwd %xmm2, %xmm1
; SLOW64-NEXT: retq
@@ -895,7 +861,7 @@ define <8 x i32> @test_mul_v8i32_v8i8_minsize(<8 x i8> %A) minsize {
; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SSE4-32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
+; SSE4-32-NEXT: movdqa {{.*#+}} xmm2 = [18778,0,18778,0,18778,0,18778,0]
; SSE4-32-NEXT: pmaddwd %xmm2, %xmm0
; SSE4-32-NEXT: pmaddwd %xmm2, %xmm1
; SSE4-32-NEXT: retl
@@ -905,7 +871,7 @@ define <8 x i32> @test_mul_v8i32_v8i8_minsize(<8 x i8> %A) minsize {
; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SSE4-64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
+; SSE4-64-NEXT: movdqa {{.*#+}} xmm2 = [18778,0,18778,0,18778,0,18778,0]
; SSE4-64-NEXT: pmaddwd %xmm2, %xmm0
; SSE4-64-NEXT: pmaddwd %xmm2, %xmm1
; SSE4-64-NEXT: retq
@@ -980,7 +946,7 @@ define <16 x i32> @test_mul_v16i32_v16i8_minsize(<16 x i8> %A) minsize {
; SLM32-LABEL: test_mul_v16i32_v16i8_minsize:
; SLM32: # %bb.0:
; SLM32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; SLM32-NEXT: movdqa {{.*#+}} xmm5 = [18778,18778,18778,18778]
+; SLM32-NEXT: movdqa {{.*#+}} xmm5 = [18778,0,18778,0,18778,0,18778,0]
; SLM32-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
; SLM32-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SLM32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
@@ -996,7 +962,7 @@ define <16 x i32> @test_mul_v16i32_v16i8_minsize(<16 x i8> %A) minsize {
; SLM64-LABEL: test_mul_v16i32_v16i8_minsize:
; SLM64: # %bb.0:
; SLM64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; SLM64-NEXT: movdqa {{.*#+}} xmm5 = [18778,18778,18778,18778]
+; SLM64-NEXT: movdqa {{.*#+}} xmm5 = [18778,0,18778,0,18778,0,18778,0]
; SLM64-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
; SLM64-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SLM64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
@@ -1018,7 +984,7 @@ define <16 x i32> @test_mul_v16i32_v16i8_minsize(<16 x i8> %A) minsize {
; SLOW32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SLOW32-NEXT: movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778]
+; SLOW32-NEXT: movdqa {{.*#+}} xmm4 = [18778,0,18778,0,18778,0,18778,0]
; SLOW32-NEXT: pmaddwd %xmm4, %xmm0
; SLOW32-NEXT: pmaddwd %xmm4, %xmm1
; SLOW32-NEXT: pmaddwd %xmm4, %xmm2
@@ -1034,7 +1000,7 @@ define <16 x i32> @test_mul_v16i32_v16i8_minsize(<16 x i8> %A) minsize {
; SLOW64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SLOW64-NEXT: movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778]
+; SLOW64-NEXT: movdqa {{.*#+}} xmm4 = [18778,0,18778,0,18778,0,18778,0]
; SLOW64-NEXT: pmaddwd %xmm4, %xmm0
; SLOW64-NEXT: pmaddwd %xmm4, %xmm1
; SLOW64-NEXT: pmaddwd %xmm4, %xmm2
@@ -1050,7 +1016,7 @@ define <16 x i32> @test_mul_v16i32_v16i8_minsize(<16 x i8> %A) minsize {
; SSE4-32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SSE4-32-NEXT: movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778]
+; SSE4-32-NEXT: movdqa {{.*#+}} xmm4 = [18778,0,18778,0,18778,0,18778,0]
; SSE4-32-NEXT: pmaddwd %xmm4, %xmm0
; SSE4-32-NEXT: pmaddwd %xmm4, %xmm1
; SSE4-32-NEXT: pmaddwd %xmm4, %xmm2
@@ -1066,7 +1032,7 @@ define <16 x i32> @test_mul_v16i32_v16i8_minsize(<16 x i8> %A) minsize {
; SSE4-64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SSE4-64-NEXT: movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778]
+; SSE4-64-NEXT: movdqa {{.*#+}} xmm4 = [18778,0,18778,0,18778,0,18778,0]
; SSE4-64-NEXT: pmaddwd %xmm4, %xmm0
; SSE4-64-NEXT: pmaddwd %xmm4, %xmm1
; SSE4-64-NEXT: pmaddwd %xmm4, %xmm2
diff --git a/llvm/test/CodeGen/X86/vshli-simplify-demanded-bits.ll b/llvm/test/CodeGen/X86/vshli-simplify-demanded-bits.ll
index 4a3aac4d06b1..e7445344cfd5 100644
--- a/llvm/test/CodeGen/X86/vshli-simplify-demanded-bits.ll
+++ b/llvm/test/CodeGen/X86/vshli-simplify-demanded-bits.ll
@@ -11,38 +11,37 @@ define <8 x i8> @vshli_target_constant(<8 x i16> %arg, <8 x i32> %arg1) {
; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2863311531,2863311531,2863311531,2863311531]
; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; CHECK-NEXT: pmuludq %xmm0, %xmm1
-; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,3,2,3]
+; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; CHECK-NEXT: pmuludq %xmm0, %xmm3
-; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
-; CHECK-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
+; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; CHECK-NEXT: psrld $1, %xmm1
+; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; CHECK-NEXT: pmuludq %xmm0, %xmm2
; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; CHECK-NEXT: pmuludq %xmm0, %xmm1
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
+; CHECK-NEXT: pmuludq %xmm0, %xmm3
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3]
; CHECK-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; CHECK-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; CHECK-NEXT: pslld $15, %xmm2
-; CHECK-NEXT: psrad $16, %xmm2
-; CHECK-NEXT: pslld $15, %xmm4
-; CHECK-NEXT: psrad $16, %xmm4
-; CHECK-NEXT: packssdw %xmm2, %xmm4
-; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
-; CHECK-NEXT: pxor %xmm0, %xmm0
-; CHECK-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; CHECK-NEXT: pmullw %xmm4, %xmm1
-; CHECK-NEXT: movdqa %xmm1, %xmm0
-; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; CHECK-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
-; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128]
-; CHECK-NEXT: paddd %xmm2, %xmm1
-; CHECK-NEXT: paddd %xmm2, %xmm0
+; CHECK-NEXT: psrld $1, %xmm2
+; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; CHECK-NEXT: pand %xmm3, %xmm2
+; CHECK-NEXT: pand %xmm3, %xmm1
+; CHECK-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: pxor %xmm4, %xmm4
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; CHECK-NEXT: movdqa %xmm0, %xmm5
+; CHECK-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
+; CHECK-NEXT: pmaddwd %xmm2, %xmm5
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; CHECK-NEXT: pmaddwd %xmm1, %xmm0
+; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [128,128,128,128]
+; CHECK-NEXT: paddd %xmm1, %xmm5
+; CHECK-NEXT: paddd %xmm1, %xmm0
; CHECK-NEXT: psrld $8, %xmm0
-; CHECK-NEXT: psrld $8, %xmm1
-; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; CHECK-NEXT: pand %xmm2, %xmm1
-; CHECK-NEXT: pand %xmm2, %xmm0
-; CHECK-NEXT: packuswb %xmm1, %xmm0
+; CHECK-NEXT: psrld $8, %xmm5
+; CHECK-NEXT: pand %xmm3, %xmm5
+; CHECK-NEXT: pand %xmm3, %xmm0
+; CHECK-NEXT: packuswb %xmm5, %xmm0
; CHECK-NEXT: packuswb %xmm0, %xmm0
; CHECK-NEXT: retq
bb: