[llvm] r323367 - [X86][SSE] Aggressively use PMADDWD for v4i32 multiplies with 17 or more leading zeros

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Wed Jan 24 11:20:02 PST 2018


Author: rksimon
Date: Wed Jan 24 11:20:02 2018
New Revision: 323367

URL: http://llvm.org/viewvc/llvm-project?rev=323367&view=rev
Log:
[X86][SSE] Aggressively use PMADDWD for v4i32 multiplies with 17 or more leading zeros

As discussed in D41484, PMADDWD for 'zero extended' vXi32 is nearly always a better option than PMULLD:
On SNB it results in code that is no faster, but no slower either, so we may as well keep it.
On KNL it only has half the throughput, so I've disabled it there - ideally there'd be a better way than this.
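
For reference (an editorial note, not part of the commit message): PMADDWD multiplies adjacent pairs of signed 16-bit elements and sums each pair into a 32-bit lane. With the upper 17 bits of every 32-bit element known to be zero, the high 16-bit half of each lane is zero and the low half is a non-negative 15-bit value, so the pairwise sum collapses to the exact 32-bit product - which is why the combine requires 17 leading zero bits rather than 16 (bit 15 must also be clear because the 16-bit multiply is signed). A minimal standalone C++ sketch of a single lane (hypothetical code, not taken from the tree):

  #include <cassert>
  #include <cstdint>

  // Scalar model of one 32-bit lane of PMADDWD: split the lane into two
  // signed 16-bit halves, multiply the halves pairwise and add the products.
  static int32_t pmaddwd_lane(uint32_t a, uint32_t b) {
    int16_t alo = int16_t(a), ahi = int16_t(a >> 16);
    int16_t blo = int16_t(b), bhi = int16_t(b >> 16);
    return int32_t(alo) * int32_t(blo) + int32_t(ahi) * int32_t(bhi);
  }

  int main() {
    // When both inputs fit in 15 bits, the high halves are zero and the low
    // halves are non-negative, so the result equals the plain 32-bit product.
    for (uint32_t a = 0; a < (1u << 15); a += 997)
      for (uint32_t b = 0; b < (1u << 15); b += 991)
        assert(pmaddwd_lane(a, b) == int32_t(a * b));
    return 0;
  }

Compiling and running this sketch with any C++ compiler should trigger no assertion; widening either input past 15 bits breaks the equivalence, matching the guard in the combine below.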

Differential Revision: https://reviews.llvm.org/D42258

Modified:
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/test/CodeGen/X86/promote.ll
    llvm/trunk/test/CodeGen/X86/shrink_vmul.ll
    llvm/trunk/test/CodeGen/X86/slow-pmulld.ll

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=323367&r1=323366&r2=323367&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Wed Jan 24 11:20:02 2018
@@ -22326,7 +22326,7 @@ static SDValue LowerMUL(SDValue Op, cons
     assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
            "Should not custom lower when pmulld is available!");
 
-    // If the upper 17 bits of each element are zero then we can use PMADD.
+    // If the upper 17 bits of each element are zero then we can use PMADDWD.
     APInt Mask17 = APInt::getHighBitsSet(32, 17);
     if (DAG.MaskedValueIsZero(A, Mask17) && DAG.MaskedValueIsZero(B, Mask17))
       return DAG.getNode(X86ISD::VPMADDWD, dl, VT,
@@ -32707,13 +32707,6 @@ static SDValue reduceVMULWidth(SDNode *N
   if ((NumElts % 2) != 0)
     return SDValue();
 
-  // If the upper 17 bits of each element are zero then we can use PMADD.
-  APInt Mask17 = APInt::getHighBitsSet(32, 17);
-  if (VT == MVT::v4i32 && DAG.MaskedValueIsZero(N0, Mask17) &&
-      DAG.MaskedValueIsZero(N1, Mask17))
-    return DAG.getNode(X86ISD::VPMADDWD, DL, VT, DAG.getBitcast(MVT::v8i16, N0),
-                       DAG.getBitcast(MVT::v8i16, N1));
-
   unsigned RegSize = 128;
   MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16);
   EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
@@ -32885,6 +32878,25 @@ static SDValue combineMul(SDNode *N, Sel
                           TargetLowering::DAGCombinerInfo &DCI,
                           const X86Subtarget &Subtarget) {
   EVT VT = N->getValueType(0);
+
+  // If the upper 17 bits of each element are zero then we can use PMADDWD,
+  // which is always at least as quick as PMULLD, except on KNL.
+  if (Subtarget.getProcFamily() != X86Subtarget::IntelKNL &&
+      ((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
+       (VT == MVT::v8i32 && Subtarget.hasAVX2()) ||
+       (VT == MVT::v16i32 && Subtarget.hasBWI()))) {
+    SDValue N0 = N->getOperand(0);
+    SDValue N1 = N->getOperand(1);
+    APInt Mask17 = APInt::getHighBitsSet(32, 17);
+    if (DAG.MaskedValueIsZero(N0, Mask17) &&
+        DAG.MaskedValueIsZero(N1, Mask17)) {
+      unsigned NumElts = VT.getVectorNumElements();
+      MVT WVT = MVT::getVectorVT(MVT::i16, 2 * NumElts);
+      return DAG.getNode(X86ISD::VPMADDWD, SDLoc(N), VT,
+                         DAG.getBitcast(WVT, N0), DAG.getBitcast(WVT, N1));
+    }
+  }
+
   if (DCI.isBeforeLegalize() && VT.isVector())
     return reduceVMULWidth(N, DAG, Subtarget);
 

Modified: llvm/trunk/test/CodeGen/X86/promote.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/promote.ll?rev=323367&r1=323366&r2=323367&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/promote.ll (original)
+++ llvm/trunk/test/CodeGen/X86/promote.ll Wed Jan 24 11:20:02 2018
@@ -7,7 +7,7 @@ define i32 @mul_f(<4 x i8>* %A) {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X86-NEXT:    pmulld %xmm0, %xmm0
+; X86-NEXT:    pmaddwd %xmm0, %xmm0
 ; X86-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
 ; X86-NEXT:    movd %xmm0, (%eax)
 ; X86-NEXT:    xorl %eax, %eax
@@ -16,7 +16,7 @@ define i32 @mul_f(<4 x i8>* %A) {
 ; X64-LABEL: mul_f:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X64-NEXT:    pmulld %xmm0, %xmm0
+; X64-NEXT:    pmaddwd %xmm0, %xmm0
 ; X64-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
 ; X64-NEXT:    movd %xmm0, (%rax)
 ; X64-NEXT:    xorl %eax, %eax

Modified: llvm/trunk/test/CodeGen/X86/shrink_vmul.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/shrink_vmul.ll?rev=323367&r1=323366&r2=323367&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/shrink_vmul.ll (original)
+++ llvm/trunk/test/CodeGen/X86/shrink_vmul.ll Wed Jan 24 11:20:02 2018
@@ -48,7 +48,7 @@ define void @mul_2xi8(i8* nocapture read
 ; X86-AVX-NEXT:    movl c, %esi
 ; X86-AVX-NEXT:    vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
 ; X86-AVX-NEXT:    vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; X86-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT:    vpmaddwd %xmm0, %xmm1, %xmm0
 ; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X86-AVX-NEXT:    vmovq %xmm0, (%esi,%ecx,4)
 ; X86-AVX-NEXT:    popl %esi
@@ -74,7 +74,7 @@ define void @mul_2xi8(i8* nocapture read
 ; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
 ; X64-AVX-NEXT:    vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
 ; X64-AVX-NEXT:    vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; X64-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT:    vpmaddwd %xmm0, %xmm1, %xmm0
 ; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rdx,4)
 ; X64-AVX-NEXT:    retq
@@ -134,7 +134,7 @@ define void @mul_4xi8(i8* nocapture read
 ; X86-AVX-NEXT:    movl c, %esi
 ; X86-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
 ; X86-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X86-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT:    vpmaddwd %xmm0, %xmm1, %xmm0
 ; X86-AVX-NEXT:    vmovdqu %xmm0, (%esi,%ecx,4)
 ; X86-AVX-NEXT:    popl %esi
 ; X86-AVX-NEXT:    retl
@@ -158,7 +158,7 @@ define void @mul_4xi8(i8* nocapture read
 ; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
 ; X64-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
 ; X64-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X64-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT:    vpmaddwd %xmm0, %xmm1, %xmm0
 ; X64-AVX-NEXT:    vmovdqu %xmm0, (%rax,%rdx,4)
 ; X64-AVX-NEXT:    retq
 entry:
@@ -220,9 +220,9 @@ define void @mul_8xi8(i8* nocapture read
 ; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
 ; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
 ; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X86-AVX1-NEXT:    vpmulld %xmm0, %xmm2, %xmm0
+; X86-AVX1-NEXT:    vpmaddwd %xmm0, %xmm2, %xmm0
 ; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X86-AVX1-NEXT:    vpmulld %xmm1, %xmm2, %xmm1
+; X86-AVX1-NEXT:    vpmaddwd %xmm1, %xmm2, %xmm1
 ; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; X86-AVX1-NEXT:    vmovups %ymm0, (%esi,%ecx,4)
 ; X86-AVX1-NEXT:    popl %esi
@@ -240,7 +240,7 @@ define void @mul_8xi8(i8* nocapture read
 ; X86-AVX2-NEXT:    movl c, %esi
 ; X86-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
 ; X86-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; X86-AVX2-NEXT:    vpmulld %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT:    vpmaddwd %ymm0, %ymm1, %ymm0
 ; X86-AVX2-NEXT:    vmovdqu %ymm0, (%esi,%ecx,4)
 ; X86-AVX2-NEXT:    popl %esi
 ; X86-AVX2-NEXT:    vzeroupper
@@ -268,9 +268,9 @@ define void @mul_8xi8(i8* nocapture read
 ; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
 ; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
 ; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X64-AVX1-NEXT:    vpmulld %xmm0, %xmm2, %xmm0
+; X64-AVX1-NEXT:    vpmaddwd %xmm0, %xmm2, %xmm0
 ; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X64-AVX1-NEXT:    vpmulld %xmm1, %xmm2, %xmm1
+; X64-AVX1-NEXT:    vpmaddwd %xmm1, %xmm2, %xmm1
 ; X64-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; X64-AVX1-NEXT:    vmovups %ymm0, (%rax,%rdx,4)
 ; X64-AVX1-NEXT:    vzeroupper
@@ -281,7 +281,7 @@ define void @mul_8xi8(i8* nocapture read
 ; X64-AVX2-NEXT:    movq {{.*}}(%rip), %rax
 ; X64-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
 ; X64-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; X64-AVX2-NEXT:    vpmulld %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT:    vpmaddwd %ymm0, %ymm1, %ymm0
 ; X64-AVX2-NEXT:    vmovdqu %ymm0, (%rax,%rdx,4)
 ; X64-AVX2-NEXT:    vzeroupper
 ; X64-AVX2-NEXT:    retq
@@ -356,13 +356,13 @@ define void @mul_16xi8(i8* nocapture rea
 ; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
 ; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
 ; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X86-AVX1-NEXT:    vpmulld %xmm0, %xmm4, %xmm0
+; X86-AVX1-NEXT:    vpmaddwd %xmm0, %xmm4, %xmm0
 ; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X86-AVX1-NEXT:    vpmulld %xmm1, %xmm4, %xmm1
+; X86-AVX1-NEXT:    vpmaddwd %xmm1, %xmm4, %xmm1
 ; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X86-AVX1-NEXT:    vpmulld %xmm2, %xmm4, %xmm2
+; X86-AVX1-NEXT:    vpmaddwd %xmm2, %xmm4, %xmm2
 ; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X86-AVX1-NEXT:    vpmulld %xmm3, %xmm4, %xmm3
+; X86-AVX1-NEXT:    vpmaddwd %xmm3, %xmm4, %xmm3
 ; X86-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
 ; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; X86-AVX1-NEXT:    vmovups %ymm0, 32(%esi,%ecx,4)
@@ -383,9 +383,9 @@ define void @mul_16xi8(i8* nocapture rea
 ; X86-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
 ; X86-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
 ; X86-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; X86-AVX2-NEXT:    vpmulld %ymm0, %ymm2, %ymm0
+; X86-AVX2-NEXT:    vpmaddwd %ymm0, %ymm2, %ymm0
 ; X86-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; X86-AVX2-NEXT:    vpmulld %ymm1, %ymm2, %ymm1
+; X86-AVX2-NEXT:    vpmaddwd %ymm1, %ymm2, %ymm1
 ; X86-AVX2-NEXT:    vmovdqu %ymm0, 32(%esi,%ecx,4)
 ; X86-AVX2-NEXT:    vmovdqu %ymm1, (%esi,%ecx,4)
 ; X86-AVX2-NEXT:    popl %esi
@@ -426,13 +426,13 @@ define void @mul_16xi8(i8* nocapture rea
 ; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
 ; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
 ; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X64-AVX1-NEXT:    vpmulld %xmm0, %xmm4, %xmm0
+; X64-AVX1-NEXT:    vpmaddwd %xmm0, %xmm4, %xmm0
 ; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X64-AVX1-NEXT:    vpmulld %xmm1, %xmm4, %xmm1
+; X64-AVX1-NEXT:    vpmaddwd %xmm1, %xmm4, %xmm1
 ; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X64-AVX1-NEXT:    vpmulld %xmm2, %xmm4, %xmm2
+; X64-AVX1-NEXT:    vpmaddwd %xmm2, %xmm4, %xmm2
 ; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X64-AVX1-NEXT:    vpmulld %xmm3, %xmm4, %xmm3
+; X64-AVX1-NEXT:    vpmaddwd %xmm3, %xmm4, %xmm3
 ; X64-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
 ; X64-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; X64-AVX1-NEXT:    vmovups %ymm0, 32(%rax,%rdx,4)
@@ -446,9 +446,9 @@ define void @mul_16xi8(i8* nocapture rea
 ; X64-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
 ; X64-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
 ; X64-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; X64-AVX2-NEXT:    vpmulld %ymm0, %ymm2, %ymm0
+; X64-AVX2-NEXT:    vpmaddwd %ymm0, %ymm2, %ymm0
 ; X64-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; X64-AVX2-NEXT:    vpmulld %ymm1, %ymm2, %ymm1
+; X64-AVX2-NEXT:    vpmaddwd %ymm1, %ymm2, %ymm1
 ; X64-AVX2-NEXT:    vmovdqu %ymm0, 32(%rax,%rdx,4)
 ; X64-AVX2-NEXT:    vmovdqu %ymm1, (%rax,%rdx,4)
 ; X64-AVX2-NEXT:    vzeroupper
@@ -1488,7 +1488,7 @@ define void @mul_2xi8_varconst1(i8* noca
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX-NEXT:    movl c, %edx
 ; X86-AVX-NEXT:    vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; X86-AVX-NEXT:    vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT:    vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
 ; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
 ; X86-AVX-NEXT:    retl
@@ -1512,7 +1512,7 @@ define void @mul_2xi8_varconst1(i8* noca
 ; X64-AVX-NEXT:    movl $255, %ecx
 ; X64-AVX-NEXT:    vmovq %rcx, %xmm1
 ; X64-AVX-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
-; X64-AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm0
 ; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
 ; X64-AVX-NEXT:    retq
@@ -1624,7 +1624,7 @@ define void @mul_2xi8_varconst3(i8* noca
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX-NEXT:    movl c, %edx
 ; X86-AVX-NEXT:    vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; X86-AVX-NEXT:    vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT:    vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
 ; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
 ; X86-AVX-NEXT:    retl
@@ -1651,7 +1651,7 @@ define void @mul_2xi8_varconst3(i8* noca
 ; X64-AVX-NEXT:    movl $256, %ecx # imm = 0x100
 ; X64-AVX-NEXT:    vmovq %rcx, %xmm1
 ; X64-AVX-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
-; X64-AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm0
 ; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
 ; X64-AVX-NEXT:    retq
@@ -2299,8 +2299,8 @@ define void @PR34947() {
 ; X86-AVX1-NEXT:    movl $8199, %eax # imm = 0x2007
 ; X86-AVX1-NEXT:    vmovd %eax, %xmm3
 ; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [8199,8199,8199,8199]
-; X86-AVX1-NEXT:    vpmulld %xmm4, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpmulld %xmm4, %xmm1, %xmm1
+; X86-AVX1-NEXT:    vpmaddwd %xmm4, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpmaddwd %xmm4, %xmm1, %xmm1
 ; X86-AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; X86-AVX1-NEXT:    vpmulld %xmm3, %xmm2, %xmm1
 ; X86-AVX1-NEXT:    vmovd %xmm1, (%eax)
@@ -2368,7 +2368,7 @@ define void @PR34947() {
 ; X86-AVX2-NEXT:    divl (%eax)
 ; X86-AVX2-NEXT:    vmovd %edx, %xmm1
 ; X86-AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [8199,8199,8199,8199,8199,8199,8199,8199]
-; X86-AVX2-NEXT:    vpmulld %ymm2, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpmaddwd %ymm2, %ymm0, %ymm0
 ; X86-AVX2-NEXT:    movl $8199, %eax # imm = 0x2007
 ; X86-AVX2-NEXT:    vmovd %eax, %xmm2
 ; X86-AVX2-NEXT:    vpmulld %ymm2, %ymm1, %ymm1
@@ -2477,12 +2477,12 @@ define void @PR34947() {
 ; X64-AVX1-NEXT:    vpinsrd $2, %ecx, %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vpinsrd $3, %edi, %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [8199,8199,8199,8199]
-; X64-AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vmovd %esi, %xmm2
 ; X64-AVX1-NEXT:    vpinsrd $1, %r11d, %xmm2, %xmm2
 ; X64-AVX1-NEXT:    vpinsrd $2, %r10d, %xmm2, %xmm2
 ; X64-AVX1-NEXT:    vpinsrd $3, %r9d, %xmm2, %xmm2
-; X64-AVX1-NEXT:    vpmulld %xmm1, %xmm2, %xmm1
+; X64-AVX1-NEXT:    vpmaddwd %xmm1, %xmm2, %xmm1
 ; X64-AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; X64-AVX1-NEXT:    vmovd %r8d, %xmm1
 ; X64-AVX1-NEXT:    movl $8199, %eax # imm = 0x2007
@@ -2547,7 +2547,7 @@ define void @PR34947() {
 ; X64-AVX2-NEXT:    divl (%rax)
 ; X64-AVX2-NEXT:    vmovd %edx, %xmm1
 ; X64-AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [8199,8199,8199,8199,8199,8199,8199,8199]
-; X64-AVX2-NEXT:    vpmulld %ymm2, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpmaddwd %ymm2, %ymm0, %ymm0
 ; X64-AVX2-NEXT:    movl $8199, %eax # imm = 0x2007
 ; X64-AVX2-NEXT:    vmovd %eax, %xmm2
 ; X64-AVX2-NEXT:    vpmulld %ymm2, %ymm1, %ymm1

Modified: llvm/trunk/test/CodeGen/X86/slow-pmulld.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/slow-pmulld.ll?rev=323367&r1=323366&r2=323367&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/slow-pmulld.ll (original)
+++ llvm/trunk/test/CodeGen/X86/slow-pmulld.ll Wed Jan 24 11:20:02 2018
@@ -33,28 +33,64 @@ define <4 x i32> @test_mul_v4i32_v4i8(<4
 ; SSE4-32-LABEL: test_mul_v4i32_v4i8:
 ; SSE4-32:       # %bb.0:
 ; SSE4-32-NEXT:    pand {{\.LCPI.*}}, %xmm0
-; SSE4-32-NEXT:    pmulld {{\.LCPI.*}}, %xmm0
+; SSE4-32-NEXT:    pmaddwd {{\.LCPI.*}}, %xmm0
 ; SSE4-32-NEXT:    retl
 ;
 ; SSE4-64-LABEL: test_mul_v4i32_v4i8:
 ; SSE4-64:       # %bb.0:
 ; SSE4-64-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE4-64-NEXT:    pmulld {{.*}}(%rip), %xmm0
+; SSE4-64-NEXT:    pmaddwd {{.*}}(%rip), %xmm0
 ; SSE4-64-NEXT:    retq
 ;
-; AVX-32-LABEL: test_mul_v4i32_v4i8:
-; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
-; AVX-32-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
-; AVX-32-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
-; AVX-32-NEXT:    retl
+; AVX2-32-LABEL: test_mul_v4i32_v4i8:
+; AVX2-32:       # %bb.0:
+; AVX2-32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
+; AVX2-32-NEXT:    vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
+; AVX2-32-NEXT:    retl
 ;
-; AVX-64-LABEL: test_mul_v4i32_v4i8:
-; AVX-64:       # %bb.0:
-; AVX-64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX-64-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
-; AVX-64-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
-; AVX-64-NEXT:    retq
+; AVX2-64-LABEL: test_mul_v4i32_v4i8:
+; AVX2-64:       # %bb.0:
+; AVX2-64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-64-NEXT:    vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-64-NEXT:    retq
+;
+; AVX512DQ-32-LABEL: test_mul_v4i32_v4i8:
+; AVX512DQ-32:       # %bb.0:
+; AVX512DQ-32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
+; AVX512DQ-32-NEXT:    vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
+; AVX512DQ-32-NEXT:    retl
+;
+; AVX512DQ-64-LABEL: test_mul_v4i32_v4i8:
+; AVX512DQ-64:       # %bb.0:
+; AVX512DQ-64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512DQ-64-NEXT:    vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512DQ-64-NEXT:    retq
+;
+; AVX512BW-32-LABEL: test_mul_v4i32_v4i8:
+; AVX512BW-32:       # %bb.0:
+; AVX512BW-32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
+; AVX512BW-32-NEXT:    vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
+; AVX512BW-32-NEXT:    retl
+;
+; AVX512BW-64-LABEL: test_mul_v4i32_v4i8:
+; AVX512BW-64:       # %bb.0:
+; AVX512BW-64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-64-NEXT:    vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-64-NEXT:    retq
+;
+; KNL-32-LABEL: test_mul_v4i32_v4i8:
+; KNL-32:       # %bb.0:
+; KNL-32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
+; KNL-32-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
+; KNL-32-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; KNL-32-NEXT:    retl
+;
+; KNL-64-LABEL: test_mul_v4i32_v4i8:
+; KNL-64:       # %bb.0:
+; KNL-64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; KNL-64-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
+; KNL-64-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; KNL-64-NEXT:    retq
   %z = zext <4 x i8> %A to <4 x i32>
   %m = mul nuw nsw <4 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778>
   ret <4 x i32> %m
@@ -120,8 +156,8 @@ define <8 x i32> @test_mul_v8i32_v8i8(<8
 ; SSE4-32-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
 ; SSE4-32-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; SSE4-32-NEXT:    movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
-; SSE4-32-NEXT:    pmulld %xmm2, %xmm0
-; SSE4-32-NEXT:    pmulld %xmm2, %xmm1
+; SSE4-32-NEXT:    pmaddwd %xmm2, %xmm0
+; SSE4-32-NEXT:    pmaddwd %xmm2, %xmm1
 ; SSE4-32-NEXT:    retl
 ;
 ; SSE4-64-LABEL: test_mul_v8i32_v8i8:
@@ -131,25 +167,67 @@ define <8 x i32> @test_mul_v8i32_v8i8(<8
 ; SSE4-64-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
 ; SSE4-64-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; SSE4-64-NEXT:    movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
-; SSE4-64-NEXT:    pmulld %xmm2, %xmm0
-; SSE4-64-NEXT:    pmulld %xmm2, %xmm1
+; SSE4-64-NEXT:    pmaddwd %xmm2, %xmm0
+; SSE4-64-NEXT:    pmaddwd %xmm2, %xmm1
 ; SSE4-64-NEXT:    retq
 ;
-; AVX-32-LABEL: test_mul_v8i32_v8i8:
-; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
-; AVX-32-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX-32-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
-; AVX-32-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
-; AVX-32-NEXT:    retl
+; AVX2-32-LABEL: test_mul_v8i32_v8i8:
+; AVX2-32:       # %bb.0:
+; AVX2-32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
+; AVX2-32-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-32-NEXT:    vpmaddwd {{\.LCPI.*}}, %ymm0, %ymm0
+; AVX2-32-NEXT:    retl
 ;
-; AVX-64-LABEL: test_mul_v8i32_v8i8:
-; AVX-64:       # %bb.0:
-; AVX-64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX-64-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX-64-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
-; AVX-64-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
-; AVX-64-NEXT:    retq
+; AVX2-64-LABEL: test_mul_v8i32_v8i8:
+; AVX2-64:       # %bb.0:
+; AVX2-64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-64-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-64-NEXT:    vpmaddwd {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-64-NEXT:    retq
+;
+; AVX512DQ-32-LABEL: test_mul_v8i32_v8i8:
+; AVX512DQ-32:       # %bb.0:
+; AVX512DQ-32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
+; AVX512DQ-32-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512DQ-32-NEXT:    vpmaddwd {{\.LCPI.*}}, %ymm0, %ymm0
+; AVX512DQ-32-NEXT:    retl
+;
+; AVX512DQ-64-LABEL: test_mul_v8i32_v8i8:
+; AVX512DQ-64:       # %bb.0:
+; AVX512DQ-64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512DQ-64-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512DQ-64-NEXT:    vpmaddwd {{.*}}(%rip), %ymm0, %ymm0
+; AVX512DQ-64-NEXT:    retq
+;
+; AVX512BW-32-LABEL: test_mul_v8i32_v8i8:
+; AVX512BW-32:       # %bb.0:
+; AVX512BW-32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
+; AVX512BW-32-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512BW-32-NEXT:    vpmaddwd {{\.LCPI.*}}, %ymm0, %ymm0
+; AVX512BW-32-NEXT:    retl
+;
+; AVX512BW-64-LABEL: test_mul_v8i32_v8i8:
+; AVX512BW-64:       # %bb.0:
+; AVX512BW-64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-64-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512BW-64-NEXT:    vpmaddwd {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BW-64-NEXT:    retq
+;
+; KNL-32-LABEL: test_mul_v8i32_v8i8:
+; KNL-32:       # %bb.0:
+; KNL-32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
+; KNL-32-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; KNL-32-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
+; KNL-32-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
+; KNL-32-NEXT:    retl
+;
+; KNL-64-LABEL: test_mul_v8i32_v8i8:
+; KNL-64:       # %bb.0:
+; KNL-64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; KNL-64-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; KNL-64-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
+; KNL-64-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
+; KNL-64-NEXT:    retq
   %z = zext <8 x i8> %A to <8 x i32>
   %m = mul nuw nsw <8 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
   ret <8 x i32> %m
@@ -248,10 +326,10 @@ define <16 x i32> @test_mul_v16i32_v16i8
 ; SSE4-32-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
 ; SSE4-32-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; SSE4-32-NEXT:    movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778]
-; SSE4-32-NEXT:    pmulld %xmm4, %xmm0
-; SSE4-32-NEXT:    pmulld %xmm4, %xmm1
-; SSE4-32-NEXT:    pmulld %xmm4, %xmm2
-; SSE4-32-NEXT:    pmulld %xmm4, %xmm3
+; SSE4-32-NEXT:    pmaddwd %xmm4, %xmm0
+; SSE4-32-NEXT:    pmaddwd %xmm4, %xmm1
+; SSE4-32-NEXT:    pmaddwd %xmm4, %xmm2
+; SSE4-32-NEXT:    pmaddwd %xmm4, %xmm3
 ; SSE4-32-NEXT:    retl
 ;
 ; SSE4-64-LABEL: test_mul_v16i32_v16i8:
@@ -264,10 +342,10 @@ define <16 x i32> @test_mul_v16i32_v16i8
 ; SSE4-64-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
 ; SSE4-64-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; SSE4-64-NEXT:    movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778]
-; SSE4-64-NEXT:    pmulld %xmm4, %xmm0
-; SSE4-64-NEXT:    pmulld %xmm4, %xmm1
-; SSE4-64-NEXT:    pmulld %xmm4, %xmm2
-; SSE4-64-NEXT:    pmulld %xmm4, %xmm3
+; SSE4-64-NEXT:    pmaddwd %xmm4, %xmm0
+; SSE4-64-NEXT:    pmaddwd %xmm4, %xmm1
+; SSE4-64-NEXT:    pmaddwd %xmm4, %xmm2
+; SSE4-64-NEXT:    pmaddwd %xmm4, %xmm3
 ; SSE4-64-NEXT:    retq
 ;
 ; AVX2-32-LABEL: test_mul_v16i32_v16i8:
@@ -276,8 +354,8 @@ define <16 x i32> @test_mul_v16i32_v16i8
 ; AVX2-32-NEXT:    vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
 ; AVX2-32-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
 ; AVX2-32-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
-; AVX2-32-NEXT:    vpmulld %ymm2, %ymm0, %ymm0
-; AVX2-32-NEXT:    vpmulld %ymm2, %ymm1, %ymm1
+; AVX2-32-NEXT:    vpmaddwd %ymm2, %ymm0, %ymm0
+; AVX2-32-NEXT:    vpmaddwd %ymm2, %ymm1, %ymm1
 ; AVX2-32-NEXT:    retl
 ;
 ; AVX2-64-LABEL: test_mul_v16i32_v16i8:
@@ -286,21 +364,45 @@ define <16 x i32> @test_mul_v16i32_v16i8
 ; AVX2-64-NEXT:    vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
 ; AVX2-64-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
 ; AVX2-64-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
-; AVX2-64-NEXT:    vpmulld %ymm2, %ymm0, %ymm0
-; AVX2-64-NEXT:    vpmulld %ymm2, %ymm1, %ymm1
+; AVX2-64-NEXT:    vpmaddwd %ymm2, %ymm0, %ymm0
+; AVX2-64-NEXT:    vpmaddwd %ymm2, %ymm1, %ymm1
 ; AVX2-64-NEXT:    retq
 ;
-; AVX512-32-LABEL: test_mul_v16i32_v16i8:
-; AVX512-32:       # %bb.0:
-; AVX512-32-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512-32-NEXT:    vpmulld {{\.LCPI.*}}{1to16}, %zmm0, %zmm0
-; AVX512-32-NEXT:    retl
-;
-; AVX512-64-LABEL: test_mul_v16i32_v16i8:
-; AVX512-64:       # %bb.0:
-; AVX512-64-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512-64-NEXT:    vpmulld {{.*}}(%rip){1to16}, %zmm0, %zmm0
-; AVX512-64-NEXT:    retq
+; AVX512DQ-32-LABEL: test_mul_v16i32_v16i8:
+; AVX512DQ-32:       # %bb.0:
+; AVX512DQ-32-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-32-NEXT:    vpmulld {{\.LCPI.*}}{1to16}, %zmm0, %zmm0
+; AVX512DQ-32-NEXT:    retl
+;
+; AVX512DQ-64-LABEL: test_mul_v16i32_v16i8:
+; AVX512DQ-64:       # %bb.0:
+; AVX512DQ-64-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-64-NEXT:    vpmulld {{.*}}(%rip){1to16}, %zmm0, %zmm0
+; AVX512DQ-64-NEXT:    retq
+;
+; AVX512BW-32-LABEL: test_mul_v16i32_v16i8:
+; AVX512BW-32:       # %bb.0:
+; AVX512BW-32-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512BW-32-NEXT:    vpmaddwd {{\.LCPI.*}}, %zmm0, %zmm0
+; AVX512BW-32-NEXT:    retl
+;
+; AVX512BW-64-LABEL: test_mul_v16i32_v16i8:
+; AVX512BW-64:       # %bb.0:
+; AVX512BW-64-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512BW-64-NEXT:    vpmaddwd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512BW-64-NEXT:    retq
+;
+; KNL-32-LABEL: test_mul_v16i32_v16i8:
+; KNL-32:       # %bb.0:
+; KNL-32-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; KNL-32-NEXT:    vpmulld {{\.LCPI.*}}{1to16}, %zmm0, %zmm0
+; KNL-32-NEXT:    retl
+;
+; KNL-64-LABEL: test_mul_v16i32_v16i8:
+; KNL-64:       # %bb.0:
+; KNL-64-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; KNL-64-NEXT:    vpmulld {{.*}}(%rip){1to16}, %zmm0, %zmm0
+; KNL-64-NEXT:    retq
   %z = zext <16 x i8> %A to <16 x i32>
   %m = mul nuw nsw <16 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
   ret <16 x i32> %m
@@ -621,40 +723,76 @@ define <4 x i32> @test_mul_v4i32_v4i8_mi
 ; CHECK32-LABEL: test_mul_v4i32_v4i8_minsize:
 ; CHECK32:       # %bb.0:
 ; CHECK32-NEXT:    pand {{\.LCPI.*}}, %xmm0
-; CHECK32-NEXT:    pmulld {{\.LCPI.*}}, %xmm0
+; CHECK32-NEXT:    pmaddwd {{\.LCPI.*}}, %xmm0
 ; CHECK32-NEXT:    retl
 ;
 ; CHECK64-LABEL: test_mul_v4i32_v4i8_minsize:
 ; CHECK64:       # %bb.0:
 ; CHECK64-NEXT:    pand {{.*}}(%rip), %xmm0
-; CHECK64-NEXT:    pmulld {{.*}}(%rip), %xmm0
+; CHECK64-NEXT:    pmaddwd {{.*}}(%rip), %xmm0
 ; CHECK64-NEXT:    retq
 ;
 ; SSE4-32-LABEL: test_mul_v4i32_v4i8_minsize:
 ; SSE4-32:       # %bb.0:
 ; SSE4-32-NEXT:    pand {{\.LCPI.*}}, %xmm0
-; SSE4-32-NEXT:    pmulld {{\.LCPI.*}}, %xmm0
+; SSE4-32-NEXT:    pmaddwd {{\.LCPI.*}}, %xmm0
 ; SSE4-32-NEXT:    retl
 ;
 ; SSE4-64-LABEL: test_mul_v4i32_v4i8_minsize:
 ; SSE4-64:       # %bb.0:
 ; SSE4-64-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE4-64-NEXT:    pmulld {{.*}}(%rip), %xmm0
+; SSE4-64-NEXT:    pmaddwd {{.*}}(%rip), %xmm0
 ; SSE4-64-NEXT:    retq
 ;
-; AVX-32-LABEL: test_mul_v4i32_v4i8_minsize:
-; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
-; AVX-32-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
-; AVX-32-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
-; AVX-32-NEXT:    retl
+; AVX2-32-LABEL: test_mul_v4i32_v4i8_minsize:
+; AVX2-32:       # %bb.0:
+; AVX2-32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
+; AVX2-32-NEXT:    vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
+; AVX2-32-NEXT:    retl
 ;
-; AVX-64-LABEL: test_mul_v4i32_v4i8_minsize:
-; AVX-64:       # %bb.0:
-; AVX-64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX-64-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
-; AVX-64-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
-; AVX-64-NEXT:    retq
+; AVX2-64-LABEL: test_mul_v4i32_v4i8_minsize:
+; AVX2-64:       # %bb.0:
+; AVX2-64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-64-NEXT:    vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-64-NEXT:    retq
+;
+; AVX512DQ-32-LABEL: test_mul_v4i32_v4i8_minsize:
+; AVX512DQ-32:       # %bb.0:
+; AVX512DQ-32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
+; AVX512DQ-32-NEXT:    vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
+; AVX512DQ-32-NEXT:    retl
+;
+; AVX512DQ-64-LABEL: test_mul_v4i32_v4i8_minsize:
+; AVX512DQ-64:       # %bb.0:
+; AVX512DQ-64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512DQ-64-NEXT:    vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512DQ-64-NEXT:    retq
+;
+; AVX512BW-32-LABEL: test_mul_v4i32_v4i8_minsize:
+; AVX512BW-32:       # %bb.0:
+; AVX512BW-32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
+; AVX512BW-32-NEXT:    vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
+; AVX512BW-32-NEXT:    retl
+;
+; AVX512BW-64-LABEL: test_mul_v4i32_v4i8_minsize:
+; AVX512BW-64:       # %bb.0:
+; AVX512BW-64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-64-NEXT:    vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-64-NEXT:    retq
+;
+; KNL-32-LABEL: test_mul_v4i32_v4i8_minsize:
+; KNL-32:       # %bb.0:
+; KNL-32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
+; KNL-32-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
+; KNL-32-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; KNL-32-NEXT:    retl
+;
+; KNL-64-LABEL: test_mul_v4i32_v4i8_minsize:
+; KNL-64:       # %bb.0:
+; KNL-64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; KNL-64-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
+; KNL-64-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; KNL-64-NEXT:    retq
   %z = zext <4 x i8> %A to <4 x i32>
   %m = mul nuw nsw <4 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778>
   ret <4 x i32> %m
@@ -668,8 +806,8 @@ define <8 x i32> @test_mul_v8i32_v8i8_mi
 ; SLM32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; SLM32-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; SLM32-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; SLM32-NEXT:    pmulld %xmm2, %xmm0
-; SLM32-NEXT:    pmulld %xmm2, %xmm1
+; SLM32-NEXT:    pmaddwd %xmm2, %xmm0
+; SLM32-NEXT:    pmaddwd %xmm2, %xmm1
 ; SLM32-NEXT:    retl
 ;
 ; SLM64-LABEL: test_mul_v8i32_v8i8_minsize:
@@ -679,8 +817,8 @@ define <8 x i32> @test_mul_v8i32_v8i8_mi
 ; SLM64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; SLM64-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; SLM64-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; SLM64-NEXT:    pmulld %xmm2, %xmm0
-; SLM64-NEXT:    pmulld %xmm2, %xmm1
+; SLM64-NEXT:    pmaddwd %xmm2, %xmm0
+; SLM64-NEXT:    pmaddwd %xmm2, %xmm1
 ; SLM64-NEXT:    retq
 ;
 ; SLOW32-LABEL: test_mul_v8i32_v8i8_minsize:
@@ -690,8 +828,8 @@ define <8 x i32> @test_mul_v8i32_v8i8_mi
 ; SLOW32-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
 ; SLOW32-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; SLOW32-NEXT:    movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
-; SLOW32-NEXT:    pmulld %xmm2, %xmm0
-; SLOW32-NEXT:    pmulld %xmm2, %xmm1
+; SLOW32-NEXT:    pmaddwd %xmm2, %xmm0
+; SLOW32-NEXT:    pmaddwd %xmm2, %xmm1
 ; SLOW32-NEXT:    retl
 ;
 ; SLOW64-LABEL: test_mul_v8i32_v8i8_minsize:
@@ -701,8 +839,8 @@ define <8 x i32> @test_mul_v8i32_v8i8_mi
 ; SLOW64-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
 ; SLOW64-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; SLOW64-NEXT:    movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
-; SLOW64-NEXT:    pmulld %xmm2, %xmm0
-; SLOW64-NEXT:    pmulld %xmm2, %xmm1
+; SLOW64-NEXT:    pmaddwd %xmm2, %xmm0
+; SLOW64-NEXT:    pmaddwd %xmm2, %xmm1
 ; SLOW64-NEXT:    retq
 ;
 ; SSE4-32-LABEL: test_mul_v8i32_v8i8_minsize:
@@ -712,8 +850,8 @@ define <8 x i32> @test_mul_v8i32_v8i8_mi
 ; SSE4-32-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
 ; SSE4-32-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; SSE4-32-NEXT:    movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
-; SSE4-32-NEXT:    pmulld %xmm2, %xmm0
-; SSE4-32-NEXT:    pmulld %xmm2, %xmm1
+; SSE4-32-NEXT:    pmaddwd %xmm2, %xmm0
+; SSE4-32-NEXT:    pmaddwd %xmm2, %xmm1
 ; SSE4-32-NEXT:    retl
 ;
 ; SSE4-64-LABEL: test_mul_v8i32_v8i8_minsize:
@@ -723,25 +861,67 @@ define <8 x i32> @test_mul_v8i32_v8i8_mi
 ; SSE4-64-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
 ; SSE4-64-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; SSE4-64-NEXT:    movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
-; SSE4-64-NEXT:    pmulld %xmm2, %xmm0
-; SSE4-64-NEXT:    pmulld %xmm2, %xmm1
+; SSE4-64-NEXT:    pmaddwd %xmm2, %xmm0
+; SSE4-64-NEXT:    pmaddwd %xmm2, %xmm1
 ; SSE4-64-NEXT:    retq
 ;
-; AVX-32-LABEL: test_mul_v8i32_v8i8_minsize:
-; AVX-32:       # %bb.0:
-; AVX-32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
-; AVX-32-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX-32-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
-; AVX-32-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
-; AVX-32-NEXT:    retl
+; AVX2-32-LABEL: test_mul_v8i32_v8i8_minsize:
+; AVX2-32:       # %bb.0:
+; AVX2-32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
+; AVX2-32-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-32-NEXT:    vpmaddwd {{\.LCPI.*}}, %ymm0, %ymm0
+; AVX2-32-NEXT:    retl
 ;
-; AVX-64-LABEL: test_mul_v8i32_v8i8_minsize:
-; AVX-64:       # %bb.0:
-; AVX-64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX-64-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX-64-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
-; AVX-64-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
-; AVX-64-NEXT:    retq
+; AVX2-64-LABEL: test_mul_v8i32_v8i8_minsize:
+; AVX2-64:       # %bb.0:
+; AVX2-64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-64-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-64-NEXT:    vpmaddwd {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-64-NEXT:    retq
+;
+; AVX512DQ-32-LABEL: test_mul_v8i32_v8i8_minsize:
+; AVX512DQ-32:       # %bb.0:
+; AVX512DQ-32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
+; AVX512DQ-32-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512DQ-32-NEXT:    vpmaddwd {{\.LCPI.*}}, %ymm0, %ymm0
+; AVX512DQ-32-NEXT:    retl
+;
+; AVX512DQ-64-LABEL: test_mul_v8i32_v8i8_minsize:
+; AVX512DQ-64:       # %bb.0:
+; AVX512DQ-64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512DQ-64-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512DQ-64-NEXT:    vpmaddwd {{.*}}(%rip), %ymm0, %ymm0
+; AVX512DQ-64-NEXT:    retq
+;
+; AVX512BW-32-LABEL: test_mul_v8i32_v8i8_minsize:
+; AVX512BW-32:       # %bb.0:
+; AVX512BW-32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
+; AVX512BW-32-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512BW-32-NEXT:    vpmaddwd {{\.LCPI.*}}, %ymm0, %ymm0
+; AVX512BW-32-NEXT:    retl
+;
+; AVX512BW-64-LABEL: test_mul_v8i32_v8i8_minsize:
+; AVX512BW-64:       # %bb.0:
+; AVX512BW-64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-64-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512BW-64-NEXT:    vpmaddwd {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BW-64-NEXT:    retq
+;
+; KNL-32-LABEL: test_mul_v8i32_v8i8_minsize:
+; KNL-32:       # %bb.0:
+; KNL-32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
+; KNL-32-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; KNL-32-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
+; KNL-32-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
+; KNL-32-NEXT:    retl
+;
+; KNL-64-LABEL: test_mul_v8i32_v8i8_minsize:
+; KNL-64:       # %bb.0:
+; KNL-64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; KNL-64-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; KNL-64-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
+; KNL-64-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
+; KNL-64-NEXT:    retq
   %z = zext <8 x i8> %A to <8 x i32>
   %m = mul nuw nsw <8 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
   ret <8 x i32> %m
@@ -758,10 +938,10 @@ define <16 x i32> @test_mul_v16i32_v16i8
 ; SLM32-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; SLM32-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
 ; SLM32-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
-; SLM32-NEXT:    pmulld %xmm5, %xmm0
-; SLM32-NEXT:    pmulld %xmm5, %xmm1
-; SLM32-NEXT:    pmulld %xmm5, %xmm2
-; SLM32-NEXT:    pmulld %xmm5, %xmm3
+; SLM32-NEXT:    pmaddwd %xmm5, %xmm0
+; SLM32-NEXT:    pmaddwd %xmm5, %xmm1
+; SLM32-NEXT:    pmaddwd %xmm5, %xmm2
+; SLM32-NEXT:    pmaddwd %xmm5, %xmm3
 ; SLM32-NEXT:    retl
 ;
 ; SLM64-LABEL: test_mul_v16i32_v16i8_minsize:
@@ -774,10 +954,10 @@ define <16 x i32> @test_mul_v16i32_v16i8
 ; SLM64-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; SLM64-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
 ; SLM64-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
-; SLM64-NEXT:    pmulld %xmm5, %xmm0
-; SLM64-NEXT:    pmulld %xmm5, %xmm1
-; SLM64-NEXT:    pmulld %xmm5, %xmm2
-; SLM64-NEXT:    pmulld %xmm5, %xmm3
+; SLM64-NEXT:    pmaddwd %xmm5, %xmm0
+; SLM64-NEXT:    pmaddwd %xmm5, %xmm1
+; SLM64-NEXT:    pmaddwd %xmm5, %xmm2
+; SLM64-NEXT:    pmaddwd %xmm5, %xmm3
 ; SLM64-NEXT:    retq
 ;
 ; SLOW32-LABEL: test_mul_v16i32_v16i8_minsize:
@@ -790,10 +970,10 @@ define <16 x i32> @test_mul_v16i32_v16i8
 ; SLOW32-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
 ; SLOW32-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; SLOW32-NEXT:    movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778]
-; SLOW32-NEXT:    pmulld %xmm4, %xmm0
-; SLOW32-NEXT:    pmulld %xmm4, %xmm1
-; SLOW32-NEXT:    pmulld %xmm4, %xmm2
-; SLOW32-NEXT:    pmulld %xmm4, %xmm3
+; SLOW32-NEXT:    pmaddwd %xmm4, %xmm0
+; SLOW32-NEXT:    pmaddwd %xmm4, %xmm1
+; SLOW32-NEXT:    pmaddwd %xmm4, %xmm2
+; SLOW32-NEXT:    pmaddwd %xmm4, %xmm3
 ; SLOW32-NEXT:    retl
 ;
 ; SLOW64-LABEL: test_mul_v16i32_v16i8_minsize:
@@ -806,10 +986,10 @@ define <16 x i32> @test_mul_v16i32_v16i8
 ; SLOW64-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
 ; SLOW64-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; SLOW64-NEXT:    movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778]
-; SLOW64-NEXT:    pmulld %xmm4, %xmm0
-; SLOW64-NEXT:    pmulld %xmm4, %xmm1
-; SLOW64-NEXT:    pmulld %xmm4, %xmm2
-; SLOW64-NEXT:    pmulld %xmm4, %xmm3
+; SLOW64-NEXT:    pmaddwd %xmm4, %xmm0
+; SLOW64-NEXT:    pmaddwd %xmm4, %xmm1
+; SLOW64-NEXT:    pmaddwd %xmm4, %xmm2
+; SLOW64-NEXT:    pmaddwd %xmm4, %xmm3
 ; SLOW64-NEXT:    retq
 ;
 ; SSE4-32-LABEL: test_mul_v16i32_v16i8_minsize:
@@ -822,10 +1002,10 @@ define <16 x i32> @test_mul_v16i32_v16i8
 ; SSE4-32-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
 ; SSE4-32-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; SSE4-32-NEXT:    movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778]
-; SSE4-32-NEXT:    pmulld %xmm4, %xmm0
-; SSE4-32-NEXT:    pmulld %xmm4, %xmm1
-; SSE4-32-NEXT:    pmulld %xmm4, %xmm2
-; SSE4-32-NEXT:    pmulld %xmm4, %xmm3
+; SSE4-32-NEXT:    pmaddwd %xmm4, %xmm0
+; SSE4-32-NEXT:    pmaddwd %xmm4, %xmm1
+; SSE4-32-NEXT:    pmaddwd %xmm4, %xmm2
+; SSE4-32-NEXT:    pmaddwd %xmm4, %xmm3
 ; SSE4-32-NEXT:    retl
 ;
 ; SSE4-64-LABEL: test_mul_v16i32_v16i8_minsize:
@@ -838,10 +1018,10 @@ define <16 x i32> @test_mul_v16i32_v16i8
 ; SSE4-64-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
 ; SSE4-64-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; SSE4-64-NEXT:    movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778]
-; SSE4-64-NEXT:    pmulld %xmm4, %xmm0
-; SSE4-64-NEXT:    pmulld %xmm4, %xmm1
-; SSE4-64-NEXT:    pmulld %xmm4, %xmm2
-; SSE4-64-NEXT:    pmulld %xmm4, %xmm3
+; SSE4-64-NEXT:    pmaddwd %xmm4, %xmm0
+; SSE4-64-NEXT:    pmaddwd %xmm4, %xmm1
+; SSE4-64-NEXT:    pmaddwd %xmm4, %xmm2
+; SSE4-64-NEXT:    pmaddwd %xmm4, %xmm3
 ; SSE4-64-NEXT:    retq
 ;
 ; AVX2-32-LABEL: test_mul_v16i32_v16i8_minsize:
@@ -850,8 +1030,8 @@ define <16 x i32> @test_mul_v16i32_v16i8
 ; AVX2-32-NEXT:    vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
 ; AVX2-32-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
 ; AVX2-32-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
-; AVX2-32-NEXT:    vpmulld %ymm2, %ymm0, %ymm0
-; AVX2-32-NEXT:    vpmulld %ymm2, %ymm1, %ymm1
+; AVX2-32-NEXT:    vpmaddwd %ymm2, %ymm0, %ymm0
+; AVX2-32-NEXT:    vpmaddwd %ymm2, %ymm1, %ymm1
 ; AVX2-32-NEXT:    retl
 ;
 ; AVX2-64-LABEL: test_mul_v16i32_v16i8_minsize:
@@ -860,21 +1040,45 @@ define <16 x i32> @test_mul_v16i32_v16i8
 ; AVX2-64-NEXT:    vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
 ; AVX2-64-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
 ; AVX2-64-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
-; AVX2-64-NEXT:    vpmulld %ymm2, %ymm0, %ymm0
-; AVX2-64-NEXT:    vpmulld %ymm2, %ymm1, %ymm1
+; AVX2-64-NEXT:    vpmaddwd %ymm2, %ymm0, %ymm0
+; AVX2-64-NEXT:    vpmaddwd %ymm2, %ymm1, %ymm1
 ; AVX2-64-NEXT:    retq
 ;
-; AVX512-32-LABEL: test_mul_v16i32_v16i8_minsize:
-; AVX512-32:       # %bb.0:
-; AVX512-32-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512-32-NEXT:    vpmulld {{\.LCPI.*}}{1to16}, %zmm0, %zmm0
-; AVX512-32-NEXT:    retl
-;
-; AVX512-64-LABEL: test_mul_v16i32_v16i8_minsize:
-; AVX512-64:       # %bb.0:
-; AVX512-64-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512-64-NEXT:    vpmulld {{.*}}(%rip){1to16}, %zmm0, %zmm0
-; AVX512-64-NEXT:    retq
+; AVX512DQ-32-LABEL: test_mul_v16i32_v16i8_minsize:
+; AVX512DQ-32:       # %bb.0:
+; AVX512DQ-32-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-32-NEXT:    vpmulld {{\.LCPI.*}}{1to16}, %zmm0, %zmm0
+; AVX512DQ-32-NEXT:    retl
+;
+; AVX512DQ-64-LABEL: test_mul_v16i32_v16i8_minsize:
+; AVX512DQ-64:       # %bb.0:
+; AVX512DQ-64-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-64-NEXT:    vpmulld {{.*}}(%rip){1to16}, %zmm0, %zmm0
+; AVX512DQ-64-NEXT:    retq
+;
+; AVX512BW-32-LABEL: test_mul_v16i32_v16i8_minsize:
+; AVX512BW-32:       # %bb.0:
+; AVX512BW-32-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512BW-32-NEXT:    vpmaddwd {{\.LCPI.*}}, %zmm0, %zmm0
+; AVX512BW-32-NEXT:    retl
+;
+; AVX512BW-64-LABEL: test_mul_v16i32_v16i8_minsize:
+; AVX512BW-64:       # %bb.0:
+; AVX512BW-64-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512BW-64-NEXT:    vpmaddwd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512BW-64-NEXT:    retq
+;
+; KNL-32-LABEL: test_mul_v16i32_v16i8_minsize:
+; KNL-32:       # %bb.0:
+; KNL-32-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; KNL-32-NEXT:    vpmulld {{\.LCPI.*}}{1to16}, %zmm0, %zmm0
+; KNL-32-NEXT:    retl
+;
+; KNL-64-LABEL: test_mul_v16i32_v16i8_minsize:
+; KNL-64:       # %bb.0:
+; KNL-64-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; KNL-64-NEXT:    vpmulld {{.*}}(%rip){1to16}, %zmm0, %zmm0
+; KNL-64-NEXT:    retq
   %z = zext <16 x i8> %A to <16 x i32>
   %m = mul nuw nsw <16 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
   ret <16 x i32> %m



