[llvm] b8059e7 - [X86] Avoid extra PMADDUBSW(X,AND(Y)) in <X x i8> multiplication (#168262)
via llvm-commits
llvm-commits at lists.llvm.org
Sun Nov 16 06:51:37 PST 2025
Author: Simon Pilgrim
Date: 2025-11-16T14:51:32Z
New Revision: b8059e757fb95b1d3cd9b657e540bf2cd47dad82
URL: https://github.com/llvm/llvm-project/commit/b8059e757fb95b1d3cd9b657e540bf2cd47dad82
DIFF: https://github.com/llvm/llvm-project/commit/b8059e757fb95b1d3cd9b657e540bf2cd47dad82.diff
LOG: [X86] Avoid extra PMADDUBSW(X,AND(Y)) in <X x i8> multiplication (#168262)
On SSSE3 targets we use PMADDUBSW of the odd/even elements, with suitable masking, to
avoid having to extend/truncate through `<X x i16>` types and to avoid the
additional Port0/5 pressure that brings.
However, the lower i8 element of each pair can safely use PMULLW directly
without any pre-masking, as we only use the lower i8 bits of the
result, which are affected only by the lower i8 bits of the inputs.
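For illustration, a hand-written intrinsics sketch of the expansion the updated
tests now check for (this is not the lowering code itself, and the helper name
is made up):

#include <tmmintrin.h>  // SSSE3 _mm_maddubs_epi16; pulls in the SSE2 intrinsics too

// Multiply the 16 i8 lanes of a and b, keeping the low 8 bits of each product.
static inline __m128i mul_v16i8(__m128i a, __m128i b) {
  const __m128i mask = _mm_set1_epi16(0x00FF);  // selects the even (lower) byte of each pair

  // Even byte positions: plain PMULLW on the whole vectors. The low byte of
  // each 16-bit product depends only on the low bytes of the inputs, so b
  // needs no pre-masking; that pre-mask AND is what this patch removes.
  __m128i lo = _mm_and_si128(_mm_mullo_epi16(a, b), mask);

  // Odd byte positions: zero the even bytes of b so PMADDUBSW degenerates to a
  // per-pair multiply of the odd bytes, then shift the products back up by 8.
  // (Signedness and saturation don't matter: one term of each pair is zero and
  // only the low 8 bits of each product are kept.)
  __m128i b_hi = _mm_andnot_si128(mask, b);
  __m128i hi = _mm_slli_epi16(_mm_maddubs_epi16(a, b_hi), 8);

  return _mm_or_si128(lo, hi);
}

This mirrors the pmullw/pand/pmaddubsw/psllw/por sequences in the updated test
expectations below.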
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/avx2-arith.ll
llvm/test/CodeGen/X86/combine-mul.ll
llvm/test/CodeGen/X86/gfni-shifts.ll
llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
llvm/test/CodeGen/X86/min-legal-vector-width.ll
llvm/test/CodeGen/X86/pmul.ll
llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll
llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
llvm/test/CodeGen/X86/vector-fshr-128.ll
llvm/test/CodeGen/X86/vector-fshr-256.ll
llvm/test/CodeGen/X86/vector-fshr-512.ll
llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll
llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll
llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll
llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll
llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll
llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll
llvm/test/CodeGen/X86/vector-mul.ll
llvm/test/CodeGen/X86/vector-shift-shl-128.ll
llvm/test/CodeGen/X86/vector-shift-shl-256.ll
llvm/test/CodeGen/X86/vector-shift-shl-512.ll
llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 66f5802a67465..593c7627a6575 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -29629,9 +29629,9 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
}
if (!(IsLoLaneAllZeroOrUndef || IsHiLaneAllZeroOrUndef)) {
SDValue Mask = DAG.getBitcast(VT, DAG.getConstant(0x00FF, dl, ExVT));
- SDValue BLo = DAG.getNode(ISD::AND, dl, VT, Mask, B);
SDValue BHi = DAG.getNode(X86ISD::ANDNP, dl, VT, Mask, B);
- SDValue RLo = DAG.getNode(X86ISD::VPMADDUBSW, dl, ExVT, A, BLo);
+ SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, DAG.getBitcast(ExVT, A),
+ DAG.getBitcast(ExVT, B));
SDValue RHi = DAG.getNode(X86ISD::VPMADDUBSW, dl, ExVT, A, BHi);
RLo = DAG.getNode(ISD::AND, dl, VT, DAG.getBitcast(VT, RLo), Mask);
RHi = DAG.getNode(X86ISD::VSHLI, dl, ExVT, RHi,
diff --git a/llvm/test/CodeGen/X86/avx2-arith.ll b/llvm/test/CodeGen/X86/avx2-arith.ll
index 1133cdfd083be..d21df472f06cb 100644
--- a/llvm/test/CodeGen/X86/avx2-arith.ll
+++ b/llvm/test/CodeGen/X86/avx2-arith.ll
@@ -121,14 +121,13 @@ define <16 x i8> @mul_v16i8(<16 x i8> %i, <16 x i8> %j) nounwind readnone {
define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
; CHECK-LABEL: mul_v32i8:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; CHECK-NEXT: vpand %ymm2, %ymm1, %ymm3
-; CHECK-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3
-; CHECK-NEXT: vpand %ymm2, %ymm3, %ymm3
-; CHECK-NEXT: vpandn %ymm1, %ymm2, %ymm1
+; CHECK-NEXT: vpmullw %ymm1, %ymm0, %ymm2
+; CHECK-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; CHECK-NEXT: vpand %ymm3, %ymm2, %ymm2
+; CHECK-NEXT: vpandn %ymm1, %ymm3, %ymm1
; CHECK-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vpsllw $8, %ymm0, %ymm0
-; CHECK-NEXT: vpor %ymm0, %ymm3, %ymm0
+; CHECK-NEXT: vpor %ymm0, %ymm2, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%x = mul <32 x i8> %i, %j
ret <32 x i8> %x
diff --git a/llvm/test/CodeGen/X86/combine-mul.ll b/llvm/test/CodeGen/X86/combine-mul.ll
index 29c41cac222b2..15d187a5baeec 100644
--- a/llvm/test/CodeGen/X86/combine-mul.ll
+++ b/llvm/test/CodeGen/X86/combine-mul.ll
@@ -504,7 +504,7 @@ define <16 x i8> @PR35579(<16 x i8> %x) {
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; SSE-NEXT: psllw $8, %xmm1
-; SSE-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,0,2,0,4,0,2,0,8,0,2,0,4,0,2,0]
+; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,1,2,1,4,1,2,1,8,1,2,1,4,1,2,1]
; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/gfni-shifts.ll b/llvm/test/CodeGen/X86/gfni-shifts.ll
index 30f1874c51fed..638d88481f071 100644
--- a/llvm/test/CodeGen/X86/gfni-shifts.ll
+++ b/llvm/test/CodeGen/X86/gfni-shifts.ll
@@ -388,7 +388,7 @@ define <16 x i8> @constant_shl_v16i8(<16 x i8> %a) nounwind {
; GFNISSE-NEXT: movdqa %xmm0, %xmm1
; GFNISSE-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
; GFNISSE-NEXT: psllw $8, %xmm1
-; GFNISSE-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
+; GFNISSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; GFNISSE-NEXT: por %xmm1, %xmm0
; GFNISSE-NEXT: retq
@@ -397,7 +397,7 @@ define <16 x i8> @constant_shl_v16i8(<16 x i8> %a) nounwind {
; GFNIAVX1: # %bb.0:
; GFNIAVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
; GFNIAVX1-NEXT: vpsllw $8, %xmm1, %xmm1
-; GFNIAVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
+; GFNIAVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
; GFNIAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; GFNIAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; GFNIAVX1-NEXT: retq
@@ -1213,21 +1213,20 @@ define <32 x i8> @splatvar_ashr_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
define <32 x i8> @constant_shl_v32i8(<32 x i8> %a) nounwind {
; GFNISSE-LABEL: constant_shl_v32i8:
; GFNISSE: # %bb.0:
-; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm2 = [1,4,16,64,128,32,8,2]
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
; GFNISSE-NEXT: movdqa %xmm0, %xmm3
-; GFNISSE-NEXT: pmaddubsw %xmm2, %xmm3
+; GFNISSE-NEXT: pmullw %xmm2, %xmm3
; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; GFNISSE-NEXT: pand %xmm4, %xmm3
; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
; GFNISSE-NEXT: pmaddubsw %xmm5, %xmm0
; GFNISSE-NEXT: psllw $8, %xmm0
; GFNISSE-NEXT: por %xmm3, %xmm0
-; GFNISSE-NEXT: movdqa %xmm1, %xmm3
-; GFNISSE-NEXT: pmaddubsw %xmm2, %xmm3
-; GFNISSE-NEXT: pand %xmm4, %xmm3
+; GFNISSE-NEXT: pmullw %xmm1, %xmm2
+; GFNISSE-NEXT: pand %xmm4, %xmm2
; GFNISSE-NEXT: pmaddubsw %xmm5, %xmm1
; GFNISSE-NEXT: psllw $8, %xmm1
-; GFNISSE-NEXT: por %xmm3, %xmm1
+; GFNISSE-NEXT: por %xmm2, %xmm1
; GFNISSE-NEXT: retq
;
; GFNIAVX1-LABEL: constant_shl_v32i8:
@@ -1239,9 +1238,9 @@ define <32 x i8> @constant_shl_v32i8(<32 x i8> %a) nounwind {
; GFNIAVX1-NEXT: vpmaddubsw %xmm1, %xmm3, %xmm1
; GFNIAVX1-NEXT: vpsllw $8, %xmm1, %xmm1
; GFNIAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = [1,4,16,64,128,32,8,2]
-; GFNIAVX1-NEXT: vpmaddubsw %xmm2, %xmm3, %xmm3
-; GFNIAVX1-NEXT: vpmaddubsw %xmm2, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
+; GFNIAVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm3
+; GFNIAVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
; GFNIAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; GFNIAVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; GFNIAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
@@ -1251,14 +1250,14 @@ define <32 x i8> @constant_shl_v32i8(<32 x i8> %a) nounwind {
; GFNIAVX2: # %bb.0:
; GFNIAVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
; GFNIAVX2-NEXT: vpsllw $8, %ymm1, %ymm1
-; GFNIAVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
+; GFNIAVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; GFNIAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; GFNIAVX2-NEXT: retq
;
; GFNIAVX512VL-LABEL: constant_shl_v32i8:
; GFNIAVX512VL: # %bb.0:
-; GFNIAVX512VL-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
+; GFNIAVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
; GFNIAVX512VL-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
; GFNIAVX512VL-NEXT: vpsllw $8, %ymm0, %ymm0
; GFNIAVX512VL-NEXT: vpternlogd {{.*#+}} ymm0 = ymm0 | (ymm1 & m32bcst)
@@ -2521,9 +2520,9 @@ define <64 x i8> @splatvar_ashr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
define <64 x i8> @constant_shl_v64i8(<64 x i8> %a) nounwind {
; GFNISSE-LABEL: constant_shl_v64i8:
; GFNISSE: # %bb.0:
-; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm4 = [1,4,16,64,128,32,8,2]
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
; GFNISSE-NEXT: movdqa %xmm0, %xmm6
-; GFNISSE-NEXT: pmaddubsw %xmm4, %xmm6
+; GFNISSE-NEXT: pmullw %xmm4, %xmm6
; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
; GFNISSE-NEXT: pand %xmm5, %xmm6
; GFNISSE-NEXT: movdqa {{.*#+}} xmm7 = [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
@@ -2531,23 +2530,22 @@ define <64 x i8> @constant_shl_v64i8(<64 x i8> %a) nounwind {
; GFNISSE-NEXT: psllw $8, %xmm0
; GFNISSE-NEXT: por %xmm6, %xmm0
; GFNISSE-NEXT: movdqa %xmm1, %xmm6
-; GFNISSE-NEXT: pmaddubsw %xmm4, %xmm6
+; GFNISSE-NEXT: pmullw %xmm4, %xmm6
; GFNISSE-NEXT: pand %xmm5, %xmm6
; GFNISSE-NEXT: pmaddubsw %xmm7, %xmm1
; GFNISSE-NEXT: psllw $8, %xmm1
; GFNISSE-NEXT: por %xmm6, %xmm1
; GFNISSE-NEXT: movdqa %xmm2, %xmm6
-; GFNISSE-NEXT: pmaddubsw %xmm4, %xmm6
+; GFNISSE-NEXT: pmullw %xmm4, %xmm6
; GFNISSE-NEXT: pand %xmm5, %xmm6
; GFNISSE-NEXT: pmaddubsw %xmm7, %xmm2
; GFNISSE-NEXT: psllw $8, %xmm2
; GFNISSE-NEXT: por %xmm6, %xmm2
-; GFNISSE-NEXT: movdqa %xmm3, %xmm6
-; GFNISSE-NEXT: pmaddubsw %xmm4, %xmm6
-; GFNISSE-NEXT: pand %xmm5, %xmm6
+; GFNISSE-NEXT: pmullw %xmm3, %xmm4
+; GFNISSE-NEXT: pand %xmm5, %xmm4
; GFNISSE-NEXT: pmaddubsw %xmm7, %xmm3
; GFNISSE-NEXT: psllw $8, %xmm3
-; GFNISSE-NEXT: por %xmm6, %xmm3
+; GFNISSE-NEXT: por %xmm4, %xmm3
; GFNISSE-NEXT: retq
;
; GFNIAVX1-LABEL: constant_shl_v64i8:
@@ -2559,9 +2557,9 @@ define <64 x i8> @constant_shl_v64i8(<64 x i8> %a) nounwind {
; GFNIAVX1-NEXT: vpmaddubsw %xmm2, %xmm4, %xmm5
; GFNIAVX1-NEXT: vpsllw $8, %xmm5, %xmm5
; GFNIAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3
-; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = [1,4,16,64,128,32,8,2]
-; GFNIAVX1-NEXT: vpmaddubsw %xmm5, %xmm4, %xmm4
-; GFNIAVX1-NEXT: vpmaddubsw %xmm5, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
+; GFNIAVX1-NEXT: vpmullw %xmm5, %xmm4, %xmm4
+; GFNIAVX1-NEXT: vpmullw %xmm5, %xmm0, %xmm0
; GFNIAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} ymm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; GFNIAVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
@@ -2572,8 +2570,8 @@ define <64 x i8> @constant_shl_v64i8(<64 x i8> %a) nounwind {
; GFNIAVX1-NEXT: vpmaddubsw %xmm2, %xmm6, %xmm2
; GFNIAVX1-NEXT: vpsllw $8, %xmm2, %xmm2
; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; GFNIAVX1-NEXT: vpmaddubsw %xmm5, %xmm6, %xmm3
-; GFNIAVX1-NEXT: vpmaddubsw %xmm5, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vpmullw %xmm5, %xmm6, %xmm3
+; GFNIAVX1-NEXT: vpmullw %xmm5, %xmm1, %xmm1
; GFNIAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; GFNIAVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
; GFNIAVX1-NEXT: vorps %ymm2, %ymm1, %ymm1
@@ -2581,9 +2579,9 @@ define <64 x i8> @constant_shl_v64i8(<64 x i8> %a) nounwind {
;
; GFNIAVX2-LABEL: constant_shl_v64i8:
; GFNIAVX2: # %bb.0:
-; GFNIAVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
+; GFNIAVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
; GFNIAVX2-NEXT: # ymm2 = mem[0,1,0,1]
-; GFNIAVX2-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm3
+; GFNIAVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm3
; GFNIAVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; GFNIAVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
; GFNIAVX2-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
@@ -2591,7 +2589,7 @@ define <64 x i8> @constant_shl_v64i8(<64 x i8> %a) nounwind {
; GFNIAVX2-NEXT: vpmaddubsw %ymm5, %ymm0, %ymm0
; GFNIAVX2-NEXT: vpsllw $8, %ymm0, %ymm0
; GFNIAVX2-NEXT: vpor %ymm0, %ymm3, %ymm0
-; GFNIAVX2-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm2
+; GFNIAVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm2
; GFNIAVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
; GFNIAVX2-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm1
; GFNIAVX2-NEXT: vpsllw $8, %ymm1, %ymm1
@@ -2601,10 +2599,10 @@ define <64 x i8> @constant_shl_v64i8(<64 x i8> %a) nounwind {
; GFNIAVX512VL-LABEL: constant_shl_v64i8:
; GFNIAVX512VL: # %bb.0:
; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; GFNIAVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
+; GFNIAVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
; GFNIAVX512VL-NEXT: # ymm2 = mem[0,1,0,1]
-; GFNIAVX512VL-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm3
-; GFNIAVX512VL-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm2
+; GFNIAVX512VL-NEXT: vpmullw %ymm2, %ymm1, %ymm3
+; GFNIAVX512VL-NEXT: vpmullw %ymm2, %ymm0, %ymm2
; GFNIAVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
; GFNIAVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
; GFNIAVX512VL-NEXT: # ymm3 = mem[0,1,0,1]
@@ -2618,7 +2616,7 @@ define <64 x i8> @constant_shl_v64i8(<64 x i8> %a) nounwind {
;
; GFNIAVX512BW-LABEL: constant_shl_v64i8:
; GFNIAVX512BW: # %bb.0:
-; GFNIAVX512BW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
+; GFNIAVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 # [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
; GFNIAVX512BW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
; GFNIAVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0
; GFNIAVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 | (zmm1 & m32bcst)
diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
index a798f4c38f68f..541ca9d4f4096 100644
--- a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
+++ b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
@@ -2368,17 +2368,15 @@ define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwin
; SSE41-NEXT: psubb %xmm3, %xmm1
; SSE41-NEXT: psrlw $1, %xmm1
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: pand %xmm3, %xmm4
-; SSE41-NEXT: movdqa %xmm1, %xmm5
-; SSE41-NEXT: pmaddubsw %xmm4, %xmm5
-; SSE41-NEXT: pand %xmm3, %xmm5
-; SSE41-NEXT: pandn %xmm2, %xmm3
-; SSE41-NEXT: pmaddubsw %xmm3, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm3
+; SSE41-NEXT: pmullw %xmm2, %xmm3
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT: pand %xmm4, %xmm3
+; SSE41-NEXT: pandn %xmm2, %xmm4
+; SSE41-NEXT: pmaddubsw %xmm4, %xmm1
; SSE41-NEXT: psllw $8, %xmm1
-; SSE41-NEXT: por %xmm1, %xmm5
-; SSE41-NEXT: paddb %xmm5, %xmm0
+; SSE41-NEXT: por %xmm1, %xmm3
+; SSE41-NEXT: paddb %xmm3, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: vec128_i8_signed_reg_reg:
@@ -2390,14 +2388,13 @@ define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwin
; AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4
-; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm4
-; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpandn %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1
-; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
@@ -2429,12 +2426,10 @@ define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwin
; XOP-FALLBACK-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; XOP-FALLBACK-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; XOP-FALLBACK-NEXT: vpshlb %xmm3, %xmm1, %xmm1
-; XOP-FALLBACK-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; XOP-FALLBACK-NEXT: vpandn %xmm2, %xmm3, %xmm4
-; XOP-FALLBACK-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4
-; XOP-FALLBACK-NEXT: vpand %xmm3, %xmm2, %xmm2
-; XOP-FALLBACK-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1
-; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14]
+; XOP-FALLBACK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
+; XOP-FALLBACK-NEXT: vpmaddubsw %xmm3, %xmm1, %xmm3
+; XOP-FALLBACK-NEXT: vpmullw %xmm2, %xmm1, %xmm1
+; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[2],xmm3[2],xmm1[4],xmm3[4],xmm1[6],xmm3[6],xmm1[8],xmm3[8],xmm1[10],xmm3[10],xmm1[12],xmm3[12],xmm1[14],xmm3[14]
; XOP-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; XOP-FALLBACK-NEXT: retq
;
@@ -2447,12 +2442,10 @@ define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwin
; XOPAVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; XOPAVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT: vpshlb %xmm3, %xmm1, %xmm1
-; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; XOPAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm4
-; XOPAVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4
-; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1
-; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14]
+; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
+; XOPAVX1-NEXT: vpmaddubsw %xmm3, %xmm1, %xmm3
+; XOPAVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[2],xmm3[2],xmm1[4],xmm3[4],xmm1[6],xmm3[6],xmm1[8],xmm3[8],xmm1[10],xmm3[10],xmm1[12],xmm3[12],xmm1[14],xmm3[14]
; XOPAVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; XOPAVX1-NEXT: retq
;
@@ -2591,17 +2584,15 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw
; SSE41-NEXT: psubb %xmm2, %xmm1
; SSE41-NEXT: psrlw $1, %xmm1
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; SSE41-NEXT: movdqa %xmm4, %xmm3
-; SSE41-NEXT: pand %xmm2, %xmm3
-; SSE41-NEXT: movdqa %xmm1, %xmm5
-; SSE41-NEXT: pmaddubsw %xmm3, %xmm5
-; SSE41-NEXT: pand %xmm2, %xmm5
-; SSE41-NEXT: pandn %xmm4, %xmm2
-; SSE41-NEXT: pmaddubsw %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: pmullw %xmm4, %xmm2
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT: pand %xmm3, %xmm2
+; SSE41-NEXT: pandn %xmm4, %xmm3
+; SSE41-NEXT: pmaddubsw %xmm3, %xmm1
; SSE41-NEXT: psllw $8, %xmm1
-; SSE41-NEXT: por %xmm1, %xmm5
-; SSE41-NEXT: paddb %xmm5, %xmm0
+; SSE41-NEXT: por %xmm1, %xmm2
+; SSE41-NEXT: paddb %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: vec128_i8_unsigned_reg_reg:
@@ -2615,14 +2606,13 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw
; AVX1-NEXT: vpsubb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm4
-; AVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4
-; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm4
-; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm2
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpandn %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpmaddubsw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1
-; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpor %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
@@ -2656,12 +2646,10 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw
; XOP-FALLBACK-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; XOP-FALLBACK-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; XOP-FALLBACK-NEXT: vpshlb %xmm3, %xmm1, %xmm1
-; XOP-FALLBACK-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; XOP-FALLBACK-NEXT: vpandn %xmm2, %xmm3, %xmm4
-; XOP-FALLBACK-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4
-; XOP-FALLBACK-NEXT: vpand %xmm3, %xmm2, %xmm2
-; XOP-FALLBACK-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1
-; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14]
+; XOP-FALLBACK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
+; XOP-FALLBACK-NEXT: vpmaddubsw %xmm3, %xmm1, %xmm3
+; XOP-FALLBACK-NEXT: vpmullw %xmm2, %xmm1, %xmm1
+; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[2],xmm3[2],xmm1[4],xmm3[4],xmm1[6],xmm3[6],xmm1[8],xmm3[8],xmm1[10],xmm3[10],xmm1[12],xmm3[12],xmm1[14],xmm3[14]
; XOP-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; XOP-FALLBACK-NEXT: retq
;
@@ -2674,12 +2662,10 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw
; XOPAVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; XOPAVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT: vpshlb %xmm3, %xmm1, %xmm1
-; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; XOPAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm4
-; XOPAVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4
-; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1
-; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14]
+; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
+; XOPAVX1-NEXT: vpmaddubsw %xmm3, %xmm1, %xmm3
+; XOPAVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[2],xmm3[2],xmm1[4],xmm3[4],xmm1[6],xmm3[6],xmm1[8],xmm3[8],xmm1[10],xmm3[10],xmm1[12],xmm3[12],xmm1[14],xmm3[14]
; XOPAVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; XOPAVX1-NEXT: retq
;
@@ -2822,16 +2808,14 @@ define <16 x i8> @vec128_i8_signed_mem_reg(ptr %a1_addr, <16 x i8> %a2) nounwind
; SSE41-NEXT: psubb %xmm3, %xmm0
; SSE41-NEXT: psrlw $1, %xmm0
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: pand %xmm3, %xmm4
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: pmaddubsw %xmm4, %xmm5
-; SSE41-NEXT: pand %xmm3, %xmm5
-; SSE41-NEXT: pandn %xmm2, %xmm3
-; SSE41-NEXT: pmaddubsw %xmm3, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: pmullw %xmm2, %xmm3
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT: pand %xmm4, %xmm3
+; SSE41-NEXT: pandn %xmm2, %xmm4
+; SSE41-NEXT: pmaddubsw %xmm4, %xmm0
; SSE41-NEXT: psllw $8, %xmm0
-; SSE41-NEXT: por %xmm5, %xmm0
+; SSE41-NEXT: por %xmm3, %xmm0
; SSE41-NEXT: paddb %xmm1, %xmm0
; SSE41-NEXT: retq
;
@@ -2845,14 +2829,13 @@ define <16 x i8> @vec128_i8_signed_mem_reg(ptr %a1_addr, <16 x i8> %a2) nounwind
; AVX1-NEXT: vpsubb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX1-NEXT: vpmaddubsw %xmm4, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm4
-; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm3
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpandn %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpmaddubsw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX1-NEXT: vpor %xmm0, %xmm4, %xmm0
+; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
@@ -2886,12 +2869,10 @@ define <16 x i8> @vec128_i8_signed_mem_reg(ptr %a1_addr, <16 x i8> %a2) nounwind
; XOP-FALLBACK-NEXT: vpsubb %xmm3, %xmm0, %xmm0
; XOP-FALLBACK-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; XOP-FALLBACK-NEXT: vpshlb %xmm3, %xmm0, %xmm0
-; XOP-FALLBACK-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; XOP-FALLBACK-NEXT: vpandn %xmm2, %xmm3, %xmm4
-; XOP-FALLBACK-NEXT: vpmaddubsw %xmm4, %xmm0, %xmm4
-; XOP-FALLBACK-NEXT: vpand %xmm3, %xmm2, %xmm2
-; XOP-FALLBACK-NEXT: vpmaddubsw %xmm2, %xmm0, %xmm0
-; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2],xmm4[2],xmm0[4],xmm4[4],xmm0[6],xmm4[6],xmm0[8],xmm4[8],xmm0[10],xmm4[10],xmm0[12],xmm4[12],xmm0[14],xmm4[14]
+; XOP-FALLBACK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
+; XOP-FALLBACK-NEXT: vpmaddubsw %xmm3, %xmm0, %xmm3
+; XOP-FALLBACK-NEXT: vpmullw %xmm2, %xmm0, %xmm0
+; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2],xmm3[2],xmm0[4],xmm3[4],xmm0[6],xmm3[6],xmm0[8],xmm3[8],xmm0[10],xmm3[10],xmm0[12],xmm3[12],xmm0[14],xmm3[14]
; XOP-FALLBACK-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; XOP-FALLBACK-NEXT: retq
;
@@ -2905,12 +2886,10 @@ define <16 x i8> @vec128_i8_signed_mem_reg(ptr %a1_addr, <16 x i8> %a2) nounwind
; XOPAVX1-NEXT: vpsubb %xmm3, %xmm0, %xmm0
; XOPAVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT: vpshlb %xmm3, %xmm0, %xmm0
-; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; XOPAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm4
-; XOPAVX1-NEXT: vpmaddubsw %xmm4, %xmm0, %xmm4
-; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpmaddubsw %xmm2, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2],xmm4[2],xmm0[4],xmm4[4],xmm0[6],xmm4[6],xmm0[8],xmm4[8],xmm0[10],xmm4[10],xmm0[12],xmm4[12],xmm0[14],xmm4[14]
+; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
+; XOPAVX1-NEXT: vpmaddubsw %xmm3, %xmm0, %xmm3
+; XOPAVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2],xmm3[2],xmm0[4],xmm3[4],xmm0[6],xmm3[6],xmm0[8],xmm3[8],xmm0[10],xmm3[10],xmm0[12],xmm3[12],xmm0[14],xmm3[14]
; XOPAVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
@@ -3053,16 +3032,14 @@ define <16 x i8> @vec128_i8_signed_reg_mem(<16 x i8> %a1, ptr %a2_addr) nounwind
; SSE41-NEXT: psubb %xmm3, %xmm1
; SSE41-NEXT: psrlw $1, %xmm1
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: pand %xmm3, %xmm4
-; SSE41-NEXT: movdqa %xmm1, %xmm5
-; SSE41-NEXT: pmaddubsw %xmm4, %xmm5
-; SSE41-NEXT: pand %xmm3, %xmm5
-; SSE41-NEXT: pandn %xmm2, %xmm3
-; SSE41-NEXT: pmaddubsw %xmm3, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm3
+; SSE41-NEXT: pmullw %xmm2, %xmm3
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT: pand %xmm4, %xmm3
+; SSE41-NEXT: pandn %xmm2, %xmm4
+; SSE41-NEXT: pmaddubsw %xmm4, %xmm1
; SSE41-NEXT: psllw $8, %xmm1
-; SSE41-NEXT: por %xmm5, %xmm1
+; SSE41-NEXT: por %xmm3, %xmm1
; SSE41-NEXT: paddb %xmm1, %xmm0
; SSE41-NEXT: retq
;
@@ -3076,14 +3053,13 @@ define <16 x i8> @vec128_i8_signed_reg_mem(<16 x i8> %a1, ptr %a2_addr) nounwind
; AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4
-; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm4
-; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpandn %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1
-; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
@@ -3117,12 +3093,10 @@ define <16 x i8> @vec128_i8_signed_reg_mem(<16 x i8> %a1, ptr %a2_addr) nounwind
; XOP-FALLBACK-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; XOP-FALLBACK-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; XOP-FALLBACK-NEXT: vpshlb %xmm3, %xmm1, %xmm1
-; XOP-FALLBACK-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; XOP-FALLBACK-NEXT: vpandn %xmm2, %xmm3, %xmm4
-; XOP-FALLBACK-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4
-; XOP-FALLBACK-NEXT: vpand %xmm3, %xmm2, %xmm2
-; XOP-FALLBACK-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1
-; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14]
+; XOP-FALLBACK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
+; XOP-FALLBACK-NEXT: vpmaddubsw %xmm3, %xmm1, %xmm3
+; XOP-FALLBACK-NEXT: vpmullw %xmm2, %xmm1, %xmm1
+; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[2],xmm3[2],xmm1[4],xmm3[4],xmm1[6],xmm3[6],xmm1[8],xmm3[8],xmm1[10],xmm3[10],xmm1[12],xmm3[12],xmm1[14],xmm3[14]
; XOP-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; XOP-FALLBACK-NEXT: retq
;
@@ -3136,12 +3110,10 @@ define <16 x i8> @vec128_i8_signed_reg_mem(<16 x i8> %a1, ptr %a2_addr) nounwind
; XOPAVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; XOPAVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT: vpshlb %xmm3, %xmm1, %xmm1
-; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; XOPAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm4
-; XOPAVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4
-; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1
-; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14]
+; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
+; XOPAVX1-NEXT: vpmaddubsw %xmm3, %xmm1, %xmm3
+; XOPAVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[2],xmm3[2],xmm1[4],xmm3[4],xmm1[6],xmm3[6],xmm1[8],xmm3[8],xmm1[10],xmm3[10],xmm1[12],xmm3[12],xmm1[14],xmm3[14]
; XOPAVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; XOPAVX1-NEXT: retq
;
@@ -3286,16 +3258,14 @@ define <16 x i8> @vec128_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; SSE41-NEXT: psubb %xmm3, %xmm0
; SSE41-NEXT: psrlw $1, %xmm0
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: pand %xmm3, %xmm4
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: pmaddubsw %xmm4, %xmm5
-; SSE41-NEXT: pand %xmm3, %xmm5
-; SSE41-NEXT: pandn %xmm2, %xmm3
-; SSE41-NEXT: pmaddubsw %xmm3, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: pmullw %xmm2, %xmm3
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT: pand %xmm4, %xmm3
+; SSE41-NEXT: pandn %xmm2, %xmm4
+; SSE41-NEXT: pmaddubsw %xmm4, %xmm0
; SSE41-NEXT: psllw $8, %xmm0
-; SSE41-NEXT: por %xmm5, %xmm0
+; SSE41-NEXT: por %xmm3, %xmm0
; SSE41-NEXT: paddb %xmm1, %xmm0
; SSE41-NEXT: retq
;
@@ -3310,14 +3280,13 @@ define <16 x i8> @vec128_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4
-; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm4
-; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpandn %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1
-; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
@@ -3353,12 +3322,10 @@ define <16 x i8> @vec128_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; XOP-FALLBACK-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; XOP-FALLBACK-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; XOP-FALLBACK-NEXT: vpshlb %xmm3, %xmm1, %xmm1
-; XOP-FALLBACK-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; XOP-FALLBACK-NEXT: vpandn %xmm2, %xmm3, %xmm4
-; XOP-FALLBACK-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4
-; XOP-FALLBACK-NEXT: vpand %xmm3, %xmm2, %xmm2
-; XOP-FALLBACK-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1
-; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14]
+; XOP-FALLBACK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
+; XOP-FALLBACK-NEXT: vpmaddubsw %xmm3, %xmm1, %xmm3
+; XOP-FALLBACK-NEXT: vpmullw %xmm2, %xmm1, %xmm1
+; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[2],xmm3[2],xmm1[4],xmm3[4],xmm1[6],xmm3[6],xmm1[8],xmm3[8],xmm1[10],xmm3[10],xmm1[12],xmm3[12],xmm1[14],xmm3[14]
; XOP-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; XOP-FALLBACK-NEXT: retq
;
@@ -3373,12 +3340,10 @@ define <16 x i8> @vec128_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; XOPAVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; XOPAVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT: vpshlb %xmm3, %xmm1, %xmm1
-; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; XOPAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm4
-; XOPAVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4
-; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1
-; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14]
+; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
+; XOPAVX1-NEXT: vpmaddubsw %xmm3, %xmm1, %xmm3
+; XOPAVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[2],xmm3[2],xmm1[4],xmm3[4],xmm1[6],xmm3[6],xmm1[8],xmm3[8],xmm1[10],xmm3[10],xmm1[12],xmm3[12],xmm1[14],xmm3[14]
; XOPAVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; XOPAVX1-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
index 7c9adaf31aff5..85791cd65163a 100644
--- a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
+++ b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
@@ -1896,40 +1896,38 @@ define <16 x i16> @vec256_i16_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwin
define <32 x i8> @vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwind {
; AVX1-LABEL: vec256_i8_signed_reg_reg:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpcmpgtb %xmm2, %xmm3, %xmm4
; AVX1-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm5
; AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm6
; AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpsubb %xmm6, %xmm1, %xmm1
-; AVX1-NEXT: vpminsb %xmm3, %xmm2, %xmm6
-; AVX1-NEXT: vpmaxsb %xmm3, %xmm2, %xmm3
-; AVX1-NEXT: vpsubb %xmm6, %xmm3, %xmm3
-; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3
+; AVX1-NEXT: vpminsb %xmm2, %xmm3, %xmm6
+; AVX1-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsubb %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
+; AVX1-NEXT: vpand %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpor %xmm6, %xmm5, %xmm5
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm8
-; AVX1-NEXT: vpmaddubsw %xmm8, %xmm1, %xmm8
-; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm8
-; AVX1-NEXT: vpandn %xmm5, %xmm7, %xmm5
+; AVX1-NEXT: vpmullw %xmm5, %xmm1, %xmm7
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm7
+; AVX1-NEXT: vpandn %xmm5, %xmm8, %xmm5
; AVX1-NEXT: vpmaddubsw %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1
-; AVX1-NEXT: vpor %xmm1, %xmm8, %xmm1
+; AVX1-NEXT: vpor %xmm1, %xmm7, %xmm1
; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm5
-; AVX1-NEXT: vpmaddubsw %xmm5, %xmm3, %xmm5
-; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5
-; AVX1-NEXT: vpandn %xmm4, %xmm7, %xmm4
-; AVX1-NEXT: vpmaddubsw %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpsllw $8, %xmm3, %xmm3
-; AVX1-NEXT: vpor %xmm3, %xmm5, %xmm3
-; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpmullw %xmm4, %xmm2, %xmm5
+; AVX1-NEXT: vpand %xmm5, %xmm8, %xmm5
+; AVX1-NEXT: vpandn %xmm4, %xmm8, %xmm4
+; AVX1-NEXT: vpmaddubsw %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpsllw $8, %xmm2, %xmm2
+; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vpaddb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
@@ -1943,14 +1941,13 @@ define <32 x i8> @vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwin
; AVX2-NEXT: vpsubb %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX2-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4
-; AVX2-NEXT: vpand %ymm3, %ymm4, %ymm4
-; AVX2-NEXT: vpandn %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm3
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
+; AVX2-NEXT: vpandn %ymm2, %ymm4, %ymm2
; AVX2-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpsllw $8, %ymm1, %ymm1
-; AVX2-NEXT: vpor %ymm1, %ymm4, %ymm1
+; AVX2-NEXT: vpor %ymm1, %ymm3, %ymm1
; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
@@ -1974,15 +1971,13 @@ define <32 x i8> @vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwin
; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255]
; XOP-NEXT: vpandn %xmm5, %xmm7, %xmm8
; XOP-NEXT: vpmaddubsw %xmm8, %xmm1, %xmm8
-; XOP-NEXT: vpand %xmm7, %xmm5, %xmm5
-; XOP-NEXT: vpmaddubsw %xmm5, %xmm1, %xmm1
+; XOP-NEXT: vpmullw %xmm5, %xmm1, %xmm1
; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30]
; XOP-NEXT: vpperm %xmm5, %xmm8, %xmm1, %xmm1
; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4
; XOP-NEXT: vpandn %xmm4, %xmm7, %xmm6
; XOP-NEXT: vpmaddubsw %xmm6, %xmm2, %xmm6
-; XOP-NEXT: vpand %xmm7, %xmm4, %xmm4
-; XOP-NEXT: vpmaddubsw %xmm4, %xmm2, %xmm2
+; XOP-NEXT: vpmullw %xmm4, %xmm2, %xmm2
; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm2, %xmm2
; XOP-NEXT: vpaddb %xmm3, %xmm2, %xmm2
; XOP-NEXT: vpaddb %xmm0, %xmm1, %xmm0
@@ -1998,14 +1993,13 @@ define <32 x i8> @vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwin
; AVX512F-NEXT: vpsubb %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4
-; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm4
-; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpmullw %ymm2, %ymm1, %ymm3
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpandn %ymm2, %ymm4, %ymm2
; AVX512F-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpsllw $8, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm1, %ymm4, %ymm1
+; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1
; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: retq
;
@@ -2087,19 +2081,17 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpor %xmm4, %xmm6, %xmm6
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm8
-; AVX1-NEXT: vpmaddubsw %xmm8, %xmm1, %xmm8
-; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm8
-; AVX1-NEXT: vpandn %xmm6, %xmm7, %xmm6
+; AVX1-NEXT: vpmullw %xmm6, %xmm1, %xmm7
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm7
+; AVX1-NEXT: vpandn %xmm6, %xmm8, %xmm6
; AVX1-NEXT: vpmaddubsw %xmm6, %xmm1, %xmm1
; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1
-; AVX1-NEXT: vpor %xmm1, %xmm8, %xmm1
+; AVX1-NEXT: vpor %xmm1, %xmm7, %xmm1
; AVX1-NEXT: vpor %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm5
-; AVX1-NEXT: vpmaddubsw %xmm5, %xmm3, %xmm5
-; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5
-; AVX1-NEXT: vpandn %xmm4, %xmm7, %xmm4
+; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm5
+; AVX1-NEXT: vpand %xmm5, %xmm8, %xmm5
+; AVX1-NEXT: vpandn %xmm4, %xmm8, %xmm4
; AVX1-NEXT: vpmaddubsw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpsllw $8, %xmm3, %xmm3
; AVX1-NEXT: vpor %xmm3, %xmm5, %xmm3
@@ -2119,14 +2111,13 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw
; AVX2-NEXT: vpsubb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %ymm2, %ymm3, %ymm4
-; AVX2-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4
-; AVX2-NEXT: vpand %ymm2, %ymm4, %ymm4
-; AVX2-NEXT: vpandn %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpmullw %ymm3, %ymm1, %ymm2
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
+; AVX2-NEXT: vpandn %ymm3, %ymm4, %ymm3
+; AVX2-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpsllw $8, %ymm1, %ymm1
-; AVX2-NEXT: vpor %ymm1, %ymm4, %ymm1
+; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
@@ -2150,15 +2141,13 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw
; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255]
; XOP-NEXT: vpandn %xmm5, %xmm7, %xmm8
; XOP-NEXT: vpmaddubsw %xmm8, %xmm1, %xmm8
-; XOP-NEXT: vpand %xmm7, %xmm5, %xmm5
-; XOP-NEXT: vpmaddubsw %xmm5, %xmm1, %xmm1
+; XOP-NEXT: vpmullw %xmm5, %xmm1, %xmm1
; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30]
; XOP-NEXT: vpperm %xmm5, %xmm8, %xmm1, %xmm1
; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4
; XOP-NEXT: vpandn %xmm4, %xmm7, %xmm6
; XOP-NEXT: vpmaddubsw %xmm6, %xmm2, %xmm6
-; XOP-NEXT: vpand %xmm7, %xmm4, %xmm4
-; XOP-NEXT: vpmaddubsw %xmm4, %xmm2, %xmm2
+; XOP-NEXT: vpmullw %xmm4, %xmm2, %xmm2
; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm2, %xmm2
; XOP-NEXT: vpaddb %xmm3, %xmm2, %xmm2
; XOP-NEXT: vpaddb %xmm0, %xmm1, %xmm0
@@ -2175,14 +2164,13 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw
; AVX512F-NEXT: vpsubb %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm4
-; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4
-; AVX512F-NEXT: vpand %ymm2, %ymm4, %ymm4
-; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpmullw %ymm3, %ymm1, %ymm2
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT: vpandn %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpsllw $8, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm1, %ymm4, %ymm1
+; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1
; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: retq
;
@@ -2247,41 +2235,39 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw
define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind {
; AVX1-LABEL: vec256_i8_signed_mem_reg:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vmovdqa (%rdi), %xmm1
-; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2
-; AVX1-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm4
+; AVX1-NEXT: vmovdqa 16(%rdi), %xmm3
+; AVX1-NEXT: vpcmpgtb %xmm2, %xmm3, %xmm4
; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm5
; AVX1-NEXT: vpminsb %xmm0, %xmm1, %xmm6
; AVX1-NEXT: vpmaxsb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsubb %xmm6, %xmm0, %xmm0
-; AVX1-NEXT: vpminsb %xmm3, %xmm2, %xmm6
-; AVX1-NEXT: vpmaxsb %xmm3, %xmm2, %xmm3
-; AVX1-NEXT: vpsubb %xmm6, %xmm3, %xmm3
-; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3
+; AVX1-NEXT: vpminsb %xmm2, %xmm3, %xmm6
+; AVX1-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsubb %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
+; AVX1-NEXT: vpand %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm6, %xmm0, %xmm0
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpor %xmm6, %xmm5, %xmm5
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm8
-; AVX1-NEXT: vpmaddubsw %xmm8, %xmm0, %xmm8
-; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm8
-; AVX1-NEXT: vpandn %xmm5, %xmm7, %xmm5
+; AVX1-NEXT: vpmullw %xmm5, %xmm0, %xmm7
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm7
+; AVX1-NEXT: vpandn %xmm5, %xmm8, %xmm5
; AVX1-NEXT: vpmaddubsw %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX1-NEXT: vpor %xmm0, %xmm8, %xmm0
+; AVX1-NEXT: vpor %xmm0, %xmm7, %xmm0
; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm5
-; AVX1-NEXT: vpmaddubsw %xmm5, %xmm3, %xmm5
-; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5
-; AVX1-NEXT: vpandn %xmm4, %xmm7, %xmm4
-; AVX1-NEXT: vpmaddubsw %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpsllw $8, %xmm3, %xmm3
-; AVX1-NEXT: vpor %xmm3, %xmm5, %xmm3
-; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpmullw %xmm4, %xmm2, %xmm5
+; AVX1-NEXT: vpand %xmm5, %xmm8, %xmm5
+; AVX1-NEXT: vpandn %xmm4, %xmm8, %xmm4
+; AVX1-NEXT: vpmaddubsw %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpsllw $8, %xmm2, %xmm2
+; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vpaddb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
@@ -2296,14 +2282,13 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind
; AVX2-NEXT: vpsubb %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm0
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX2-NEXT: vpmaddubsw %ymm4, %ymm0, %ymm4
-; AVX2-NEXT: vpand %ymm3, %ymm4, %ymm4
-; AVX2-NEXT: vpandn %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm3
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
+; AVX2-NEXT: vpandn %ymm2, %ymm4, %ymm2
; AVX2-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsllw $8, %ymm0, %ymm0
-; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0
+; AVX2-NEXT: vpor %ymm0, %ymm3, %ymm0
; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
@@ -2328,15 +2313,13 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind
; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255]
; XOP-NEXT: vpandn %xmm5, %xmm7, %xmm8
; XOP-NEXT: vpmaddubsw %xmm8, %xmm0, %xmm8
-; XOP-NEXT: vpand %xmm7, %xmm5, %xmm5
-; XOP-NEXT: vpmaddubsw %xmm5, %xmm0, %xmm0
+; XOP-NEXT: vpmullw %xmm5, %xmm0, %xmm0
; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30]
; XOP-NEXT: vpperm %xmm5, %xmm8, %xmm0, %xmm0
; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4
; XOP-NEXT: vpandn %xmm4, %xmm7, %xmm6
; XOP-NEXT: vpmaddubsw %xmm6, %xmm1, %xmm6
-; XOP-NEXT: vpand %xmm7, %xmm4, %xmm4
-; XOP-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm1
+; XOP-NEXT: vpmullw %xmm4, %xmm1, %xmm1
; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm1, %xmm1
; XOP-NEXT: vpaddb %xmm3, %xmm1, %xmm1
; XOP-NEXT: vpaddb %xmm2, %xmm0, %xmm0
@@ -2353,14 +2336,13 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind
; AVX512F-NEXT: vpsubb %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm0, %ymm4
-; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm4
-; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpmullw %ymm2, %ymm0, %ymm3
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpandn %ymm2, %ymm4, %ymm2
; AVX512F-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm0, %ymm4, %ymm0
+; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0
; AVX512F-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
@@ -2443,19 +2425,17 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind
; AVX1-NEXT: vpand %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpor %xmm6, %xmm5, %xmm5
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm8
-; AVX1-NEXT: vpmaddubsw %xmm8, %xmm2, %xmm8
-; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm8
-; AVX1-NEXT: vpandn %xmm5, %xmm7, %xmm5
+; AVX1-NEXT: vpmullw %xmm5, %xmm2, %xmm7
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm7
+; AVX1-NEXT: vpandn %xmm5, %xmm8, %xmm5
; AVX1-NEXT: vpmaddubsw %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpsllw $8, %xmm2, %xmm2
-; AVX1-NEXT: vpor %xmm2, %xmm8, %xmm2
+; AVX1-NEXT: vpor %xmm2, %xmm7, %xmm2
; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm5
-; AVX1-NEXT: vpmaddubsw %xmm5, %xmm3, %xmm5
-; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5
-; AVX1-NEXT: vpandn %xmm4, %xmm7, %xmm4
+; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm5
+; AVX1-NEXT: vpand %xmm5, %xmm8, %xmm5
+; AVX1-NEXT: vpandn %xmm4, %xmm8, %xmm4
; AVX1-NEXT: vpmaddubsw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpsllw $8, %xmm3, %xmm3
; AVX1-NEXT: vpor %xmm3, %xmm5, %xmm3
@@ -2474,14 +2454,13 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind
; AVX2-NEXT: vpsubb %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX2-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4
-; AVX2-NEXT: vpand %ymm3, %ymm4, %ymm4
-; AVX2-NEXT: vpandn %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm3
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
+; AVX2-NEXT: vpandn %ymm2, %ymm4, %ymm2
; AVX2-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpsllw $8, %ymm1, %ymm1
-; AVX2-NEXT: vpor %ymm1, %ymm4, %ymm1
+; AVX2-NEXT: vpor %ymm1, %ymm3, %ymm1
; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
@@ -2506,15 +2485,13 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind
; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255]
; XOP-NEXT: vpandn %xmm5, %xmm7, %xmm8
; XOP-NEXT: vpmaddubsw %xmm8, %xmm2, %xmm8
-; XOP-NEXT: vpand %xmm7, %xmm5, %xmm5
-; XOP-NEXT: vpmaddubsw %xmm5, %xmm2, %xmm2
+; XOP-NEXT: vpmullw %xmm5, %xmm2, %xmm2
; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30]
; XOP-NEXT: vpperm %xmm5, %xmm8, %xmm2, %xmm2
; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4
; XOP-NEXT: vpandn %xmm4, %xmm7, %xmm6
; XOP-NEXT: vpmaddubsw %xmm6, %xmm3, %xmm6
-; XOP-NEXT: vpand %xmm7, %xmm4, %xmm4
-; XOP-NEXT: vpmaddubsw %xmm4, %xmm3, %xmm3
+; XOP-NEXT: vpmullw %xmm4, %xmm3, %xmm3
; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm3, %xmm3
; XOP-NEXT: vpaddb %xmm1, %xmm3, %xmm1
; XOP-NEXT: vpaddb %xmm0, %xmm2, %xmm0
@@ -2531,14 +2508,13 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind
; AVX512F-NEXT: vpsubb %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4
-; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm4
-; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpmullw %ymm2, %ymm1, %ymm3
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpandn %ymm2, %ymm4, %ymm2
; AVX512F-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpsllw $8, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm1, %ymm4, %ymm1
+; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1
; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: retq
;
@@ -2603,44 +2579,42 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind
define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind {
; AVX1-LABEL: vec256_i8_signed_mem_mem:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa (%rsi), %xmm1
-; AVX1-NEXT: vmovdqa 16(%rsi), %xmm2
-; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vmovdqa (%rsi), %xmm0
+; AVX1-NEXT: vmovdqa 16(%rsi), %xmm1
+; AVX1-NEXT: vmovdqa (%rdi), %xmm2
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm3
-; AVX1-NEXT: vpcmpgtb %xmm2, %xmm3, %xmm4
-; AVX1-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm6
-; AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpcmpgtb %xmm1, %xmm3, %xmm4
+; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm5
+; AVX1-NEXT: vpminsb %xmm0, %xmm2, %xmm6
+; AVX1-NEXT: vpmaxsb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpsubb %xmm6, %xmm0, %xmm0
+; AVX1-NEXT: vpminsb %xmm1, %xmm3, %xmm6
+; AVX1-NEXT: vpmaxsb %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpsubb %xmm6, %xmm1, %xmm1
-; AVX1-NEXT: vpminsb %xmm2, %xmm3, %xmm6
-; AVX1-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsubb %xmm6, %xmm2, %xmm2
-; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX1-NEXT: vpand %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm6, %xmm0, %xmm0
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpor %xmm6, %xmm5, %xmm5
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm8
-; AVX1-NEXT: vpmaddubsw %xmm8, %xmm1, %xmm8
-; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm8
-; AVX1-NEXT: vpandn %xmm5, %xmm7, %xmm5
-; AVX1-NEXT: vpmaddubsw %xmm5, %xmm1, %xmm1
-; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1
-; AVX1-NEXT: vpor %xmm1, %xmm8, %xmm1
+; AVX1-NEXT: vpmullw %xmm5, %xmm0, %xmm7
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm7
+; AVX1-NEXT: vpandn %xmm5, %xmm8, %xmm5
+; AVX1-NEXT: vpmaddubsw %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vpsllw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpor %xmm0, %xmm7, %xmm0
; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm5
-; AVX1-NEXT: vpmaddubsw %xmm5, %xmm2, %xmm5
-; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5
-; AVX1-NEXT: vpandn %xmm4, %xmm7, %xmm4
-; AVX1-NEXT: vpmaddubsw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpsllw $8, %xmm2, %xmm2
-; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2
-; AVX1-NEXT: vpaddb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vpmullw %xmm4, %xmm1, %xmm5
+; AVX1-NEXT: vpand %xmm5, %xmm8, %xmm5
+; AVX1-NEXT: vpandn %xmm4, %xmm8, %xmm4
+; AVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1
+; AVX1-NEXT: vpor %xmm1, %xmm5, %xmm1
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: vec256_i8_signed_mem_mem:
@@ -2654,14 +2628,13 @@ define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; AVX2-NEXT: vpsubb %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX2-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4
-; AVX2-NEXT: vpand %ymm3, %ymm4, %ymm4
-; AVX2-NEXT: vpandn %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm3
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
+; AVX2-NEXT: vpandn %ymm2, %ymm4, %ymm2
; AVX2-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpsllw $8, %ymm1, %ymm1
-; AVX2-NEXT: vpor %ymm1, %ymm4, %ymm1
+; AVX2-NEXT: vpor %ymm1, %ymm3, %ymm1
; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
@@ -2687,15 +2660,13 @@ define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255]
; XOP-NEXT: vpandn %xmm5, %xmm7, %xmm8
; XOP-NEXT: vpmaddubsw %xmm8, %xmm0, %xmm8
-; XOP-NEXT: vpand %xmm7, %xmm5, %xmm5
-; XOP-NEXT: vpmaddubsw %xmm5, %xmm0, %xmm0
+; XOP-NEXT: vpmullw %xmm5, %xmm0, %xmm0
; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30]
; XOP-NEXT: vpperm %xmm5, %xmm8, %xmm0, %xmm0
; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4
; XOP-NEXT: vpandn %xmm4, %xmm7, %xmm6
; XOP-NEXT: vpmaddubsw %xmm6, %xmm1, %xmm6
-; XOP-NEXT: vpand %xmm7, %xmm4, %xmm4
-; XOP-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm1
+; XOP-NEXT: vpmullw %xmm4, %xmm1, %xmm1
; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm1, %xmm1
; XOP-NEXT: vpaddb %xmm3, %xmm1, %xmm1
; XOP-NEXT: vpaddb %xmm2, %xmm0, %xmm0
@@ -2713,14 +2684,13 @@ define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; AVX512F-NEXT: vpsubb %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4
-; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm4
-; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpmullw %ymm2, %ymm1, %ymm3
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpandn %ymm2, %ymm4, %ymm2
; AVX512F-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpsllw $8, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm1, %ymm4, %ymm1
+; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1
; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
index 04f0a65c99da8..aa2dd00237b07 100644
--- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll
+++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
@@ -889,19 +889,17 @@ define dso_local void @mul256(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="
; CHECK-SKX-NOVBMI-NEXT: vmovdqa 32(%rdi), %ymm1
; CHECK-SKX-NOVBMI-NEXT: vmovdqa (%rsi), %ymm2
; CHECK-SKX-NOVBMI-NEXT: vmovdqa 32(%rsi), %ymm3
-; CHECK-SKX-NOVBMI-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; CHECK-SKX-NOVBMI-NEXT: vpand %ymm4, %ymm3, %ymm5
-; CHECK-SKX-NOVBMI-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm5
-; CHECK-SKX-NOVBMI-NEXT: vpandn %ymm3, %ymm4, %ymm3
+; CHECK-SKX-NOVBMI-NEXT: vpmullw %ymm3, %ymm1, %ymm4
+; CHECK-SKX-NOVBMI-NEXT: vpbroadcastd {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; CHECK-SKX-NOVBMI-NEXT: vpandn %ymm3, %ymm5, %ymm3
; CHECK-SKX-NOVBMI-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1
; CHECK-SKX-NOVBMI-NEXT: vpsllw $8, %ymm1, %ymm1
-; CHECK-SKX-NOVBMI-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm5 & ymm4)
-; CHECK-SKX-NOVBMI-NEXT: vpand %ymm4, %ymm2, %ymm3
-; CHECK-SKX-NOVBMI-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3
-; CHECK-SKX-NOVBMI-NEXT: vpandn %ymm2, %ymm4, %ymm2
+; CHECK-SKX-NOVBMI-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm4 & ymm5)
+; CHECK-SKX-NOVBMI-NEXT: vpmullw %ymm2, %ymm0, %ymm3
+; CHECK-SKX-NOVBMI-NEXT: vpandn %ymm2, %ymm5, %ymm2
; CHECK-SKX-NOVBMI-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0
; CHECK-SKX-NOVBMI-NEXT: vpsllw $8, %ymm0, %ymm0
-; CHECK-SKX-NOVBMI-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm3 & ymm4)
+; CHECK-SKX-NOVBMI-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm3 & ymm5)
; CHECK-SKX-NOVBMI-NEXT: vmovdqa %ymm0, (%rdx)
; CHECK-SKX-NOVBMI-NEXT: vmovdqa %ymm1, 32(%rdx)
; CHECK-SKX-NOVBMI-NEXT: vzeroupper
@@ -913,20 +911,18 @@ define dso_local void @mul256(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="
; CHECK-SKX-VBMI-NEXT: vmovdqa 32(%rdi), %ymm1
; CHECK-SKX-VBMI-NEXT: vmovdqa (%rsi), %ymm2
; CHECK-SKX-VBMI-NEXT: vmovdqa 32(%rsi), %ymm3
-; CHECK-SKX-VBMI-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; CHECK-SKX-VBMI-NEXT: vpandn %ymm3, %ymm4, %ymm5
-; CHECK-SKX-VBMI-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm5
-; CHECK-SKX-VBMI-NEXT: vpand %ymm4, %ymm3, %ymm3
+; CHECK-SKX-VBMI-NEXT: vpmullw %ymm3, %ymm1, %ymm4
+; CHECK-SKX-VBMI-NEXT: vpbroadcastd {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; CHECK-SKX-VBMI-NEXT: vpandn %ymm3, %ymm5, %ymm3
; CHECK-SKX-VBMI-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1
; CHECK-SKX-VBMI-NEXT: vmovdqa {{.*#+}} ymm3 = [0,32,2,34,4,36,6,38,8,40,10,42,12,44,14,46,16,48,18,50,20,52,22,54,24,56,26,58,28,60,30,62]
-; CHECK-SKX-VBMI-NEXT: vpermt2b %ymm5, %ymm3, %ymm1
-; CHECK-SKX-VBMI-NEXT: vpandn %ymm2, %ymm4, %ymm5
-; CHECK-SKX-VBMI-NEXT: vpmaddubsw %ymm5, %ymm0, %ymm5
-; CHECK-SKX-VBMI-NEXT: vpand %ymm4, %ymm2, %ymm2
+; CHECK-SKX-VBMI-NEXT: vpermt2b %ymm1, %ymm3, %ymm4
+; CHECK-SKX-VBMI-NEXT: vpmullw %ymm2, %ymm0, %ymm1
+; CHECK-SKX-VBMI-NEXT: vpandn %ymm2, %ymm5, %ymm2
; CHECK-SKX-VBMI-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0
-; CHECK-SKX-VBMI-NEXT: vpermt2b %ymm5, %ymm3, %ymm0
-; CHECK-SKX-VBMI-NEXT: vmovdqa %ymm0, (%rdx)
-; CHECK-SKX-VBMI-NEXT: vmovdqa %ymm1, 32(%rdx)
+; CHECK-SKX-VBMI-NEXT: vpermt2b %ymm0, %ymm3, %ymm1
+; CHECK-SKX-VBMI-NEXT: vmovdqa %ymm1, (%rdx)
+; CHECK-SKX-VBMI-NEXT: vmovdqa %ymm4, 32(%rdx)
; CHECK-SKX-VBMI-NEXT: vzeroupper
; CHECK-SKX-VBMI-NEXT: retq
;
@@ -936,19 +932,17 @@ define dso_local void @mul256(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="
; CHECK-AVX512-NEXT: vmovdqa 32(%rdi), %ymm1
; CHECK-AVX512-NEXT: vmovdqa (%rsi), %ymm2
; CHECK-AVX512-NEXT: vmovdqa 32(%rsi), %ymm3
-; CHECK-AVX512-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; CHECK-AVX512-NEXT: vpand %ymm4, %ymm3, %ymm5
-; CHECK-AVX512-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm5
-; CHECK-AVX512-NEXT: vpandn %ymm3, %ymm4, %ymm3
+; CHECK-AVX512-NEXT: vpmullw %ymm3, %ymm1, %ymm4
+; CHECK-AVX512-NEXT: vpbroadcastd {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; CHECK-AVX512-NEXT: vpandn %ymm3, %ymm5, %ymm3
; CHECK-AVX512-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1
; CHECK-AVX512-NEXT: vpsllw $8, %ymm1, %ymm1
-; CHECK-AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm5 & ymm4)
-; CHECK-AVX512-NEXT: vpand %ymm4, %ymm2, %ymm3
-; CHECK-AVX512-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3
-; CHECK-AVX512-NEXT: vpandn %ymm2, %ymm4, %ymm2
+; CHECK-AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm4 & ymm5)
+; CHECK-AVX512-NEXT: vpmullw %ymm2, %ymm0, %ymm3
+; CHECK-AVX512-NEXT: vpandn %ymm2, %ymm5, %ymm2
; CHECK-AVX512-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0
; CHECK-AVX512-NEXT: vpsllw $8, %ymm0, %ymm0
-; CHECK-AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm3 & ymm4)
+; CHECK-AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm3 & ymm5)
; CHECK-AVX512-NEXT: vmovdqa %ymm0, (%rdx)
; CHECK-AVX512-NEXT: vmovdqa %ymm1, 32(%rdx)
; CHECK-AVX512-NEXT: vzeroupper
@@ -960,20 +954,18 @@ define dso_local void @mul256(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="
; CHECK-VBMI-NEXT: vmovdqa 32(%rdi), %ymm1
; CHECK-VBMI-NEXT: vmovdqa (%rsi), %ymm2
; CHECK-VBMI-NEXT: vmovdqa 32(%rsi), %ymm3
-; CHECK-VBMI-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; CHECK-VBMI-NEXT: vpandn %ymm3, %ymm4, %ymm5
-; CHECK-VBMI-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm5
-; CHECK-VBMI-NEXT: vpand %ymm4, %ymm3, %ymm3
+; CHECK-VBMI-NEXT: vpmullw %ymm3, %ymm1, %ymm4
+; CHECK-VBMI-NEXT: vpbroadcastd {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; CHECK-VBMI-NEXT: vpandn %ymm3, %ymm5, %ymm3
; CHECK-VBMI-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1
; CHECK-VBMI-NEXT: vmovdqa {{.*#+}} ymm3 = [0,32,2,34,4,36,6,38,8,40,10,42,12,44,14,46,16,48,18,50,20,52,22,54,24,56,26,58,28,60,30,62]
-; CHECK-VBMI-NEXT: vpermt2b %ymm5, %ymm3, %ymm1
-; CHECK-VBMI-NEXT: vpandn %ymm2, %ymm4, %ymm5
-; CHECK-VBMI-NEXT: vpmaddubsw %ymm5, %ymm0, %ymm5
-; CHECK-VBMI-NEXT: vpand %ymm4, %ymm2, %ymm2
+; CHECK-VBMI-NEXT: vpermt2b %ymm1, %ymm3, %ymm4
+; CHECK-VBMI-NEXT: vpmullw %ymm2, %ymm0, %ymm1
+; CHECK-VBMI-NEXT: vpandn %ymm2, %ymm5, %ymm2
; CHECK-VBMI-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0
-; CHECK-VBMI-NEXT: vpermt2b %ymm5, %ymm3, %ymm0
-; CHECK-VBMI-NEXT: vmovdqa %ymm0, (%rdx)
-; CHECK-VBMI-NEXT: vmovdqa %ymm1, 32(%rdx)
+; CHECK-VBMI-NEXT: vpermt2b %ymm0, %ymm3, %ymm1
+; CHECK-VBMI-NEXT: vmovdqa %ymm1, (%rdx)
+; CHECK-VBMI-NEXT: vmovdqa %ymm4, 32(%rdx)
; CHECK-VBMI-NEXT: vzeroupper
; CHECK-VBMI-NEXT: retq
%d = load <64 x i8>, ptr %a
@@ -988,13 +980,12 @@ define dso_local void @mul512(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="
; CHECK-SKX-NOVBMI: # %bb.0:
; CHECK-SKX-NOVBMI-NEXT: vmovdqa64 (%rdi), %zmm0
; CHECK-SKX-NOVBMI-NEXT: vmovdqa64 (%rsi), %zmm1
-; CHECK-SKX-NOVBMI-NEXT: vpbroadcastd {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; CHECK-SKX-NOVBMI-NEXT: vpandq %zmm2, %zmm1, %zmm3
-; CHECK-SKX-NOVBMI-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3
-; CHECK-SKX-NOVBMI-NEXT: vpandnq %zmm1, %zmm2, %zmm1
+; CHECK-SKX-NOVBMI-NEXT: vpmullw %zmm1, %zmm0, %zmm2
+; CHECK-SKX-NOVBMI-NEXT: vpbroadcastd {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; CHECK-SKX-NOVBMI-NEXT: vpandnq %zmm1, %zmm3, %zmm1
; CHECK-SKX-NOVBMI-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0
; CHECK-SKX-NOVBMI-NEXT: vpsllw $8, %zmm0, %zmm0
-; CHECK-SKX-NOVBMI-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm3 & zmm2)
+; CHECK-SKX-NOVBMI-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm2 & zmm3)
; CHECK-SKX-NOVBMI-NEXT: vmovdqa64 %zmm0, (%rdx)
; CHECK-SKX-NOVBMI-NEXT: vzeroupper
; CHECK-SKX-NOVBMI-NEXT: retq
@@ -1003,13 +994,11 @@ define dso_local void @mul512(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="
; CHECK-SKX-VBMI: # %bb.0:
; CHECK-SKX-VBMI-NEXT: vmovdqa64 (%rdi), %zmm0
; CHECK-SKX-VBMI-NEXT: vmovdqa64 (%rsi), %zmm1
-; CHECK-SKX-VBMI-NEXT: vpbroadcastd {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; CHECK-SKX-VBMI-NEXT: vpandnq %zmm1, %zmm2, %zmm3
-; CHECK-SKX-VBMI-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3
-; CHECK-SKX-VBMI-NEXT: vpandq %zmm2, %zmm1, %zmm1
+; CHECK-SKX-VBMI-NEXT: vpmullw %zmm1, %zmm0, %zmm2
+; CHECK-SKX-VBMI-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
; CHECK-SKX-VBMI-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0
; CHECK-SKX-VBMI-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,64,2,66,4,68,6,70,8,72,10,74,12,76,14,78,16,80,18,82,20,84,22,86,24,88,26,90,28,92,30,94,32,96,34,98,36,100,38,102,40,104,42,106,44,108,46,110,48,112,50,114,52,116,54,118,56,120,58,122,60,124,62,126]
-; CHECK-SKX-VBMI-NEXT: vpermi2b %zmm3, %zmm0, %zmm1
+; CHECK-SKX-VBMI-NEXT: vpermi2b %zmm0, %zmm2, %zmm1
; CHECK-SKX-VBMI-NEXT: vmovdqa64 %zmm1, (%rdx)
; CHECK-SKX-VBMI-NEXT: vzeroupper
; CHECK-SKX-VBMI-NEXT: retq
@@ -1018,13 +1007,12 @@ define dso_local void @mul512(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="
; CHECK-AVX512: # %bb.0:
; CHECK-AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; CHECK-AVX512-NEXT: vmovdqa64 (%rsi), %zmm1
-; CHECK-AVX512-NEXT: vpbroadcastd {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; CHECK-AVX512-NEXT: vpandq %zmm2, %zmm1, %zmm3
-; CHECK-AVX512-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3
-; CHECK-AVX512-NEXT: vpandnq %zmm1, %zmm2, %zmm1
+; CHECK-AVX512-NEXT: vpmullw %zmm1, %zmm0, %zmm2
+; CHECK-AVX512-NEXT: vpbroadcastd {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; CHECK-AVX512-NEXT: vpandnq %zmm1, %zmm3, %zmm1
; CHECK-AVX512-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0
; CHECK-AVX512-NEXT: vpsllw $8, %zmm0, %zmm0
-; CHECK-AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm3 & zmm2)
+; CHECK-AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm2 & zmm3)
; CHECK-AVX512-NEXT: vmovdqa64 %zmm0, (%rdx)
; CHECK-AVX512-NEXT: vzeroupper
; CHECK-AVX512-NEXT: retq
@@ -1033,13 +1021,11 @@ define dso_local void @mul512(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="
; CHECK-VBMI: # %bb.0:
; CHECK-VBMI-NEXT: vmovdqa64 (%rdi), %zmm0
; CHECK-VBMI-NEXT: vmovdqa64 (%rsi), %zmm1
-; CHECK-VBMI-NEXT: vpbroadcastd {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; CHECK-VBMI-NEXT: vpandnq %zmm1, %zmm2, %zmm3
-; CHECK-VBMI-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3
-; CHECK-VBMI-NEXT: vpandq %zmm2, %zmm1, %zmm1
+; CHECK-VBMI-NEXT: vpmullw %zmm1, %zmm0, %zmm2
+; CHECK-VBMI-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
; CHECK-VBMI-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0
; CHECK-VBMI-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,64,2,66,4,68,6,70,8,72,10,74,12,76,14,78,16,80,18,82,20,84,22,86,24,88,26,90,28,92,30,94,32,96,34,98,36,100,38,102,40,104,42,106,44,108,46,110,48,112,50,114,52,116,54,118,56,120,58,122,60,124,62,126]
-; CHECK-VBMI-NEXT: vpermi2b %zmm3, %zmm0, %zmm1
+; CHECK-VBMI-NEXT: vpermi2b %zmm0, %zmm2, %zmm1
; CHECK-VBMI-NEXT: vmovdqa64 %zmm1, (%rdx)
; CHECK-VBMI-NEXT: vzeroupper
; CHECK-VBMI-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/pmul.ll b/llvm/test/CodeGen/X86/pmul.ll
index 00731fe3e9556..189c5aa9fee20 100644
--- a/llvm/test/CodeGen/X86/pmul.ll
+++ b/llvm/test/CodeGen/X86/pmul.ll
@@ -25,7 +25,7 @@ define <16 x i8> @mul_v16i8c(<16 x i8> %i) nounwind {
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117]
; SSE41-NEXT: psllw $8, %xmm1
-; SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0]
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: retq
@@ -160,16 +160,14 @@ define <16 x i8> @mul_v16i8(<16 x i8> %i, <16 x i8> %j) nounwind {
;
; SSE41-LABEL: mul_v16i8:
; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: pand %xmm2, %xmm3
-; SSE41-NEXT: movdqa %xmm0, %xmm4
-; SSE41-NEXT: pmaddubsw %xmm3, %xmm4
-; SSE41-NEXT: pand %xmm2, %xmm4
-; SSE41-NEXT: pandn %xmm1, %xmm2
-; SSE41-NEXT: pmaddubsw %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: pmullw %xmm1, %xmm2
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT: pand %xmm3, %xmm2
+; SSE41-NEXT: pandn %xmm1, %xmm3
+; SSE41-NEXT: pmaddubsw %xmm3, %xmm0
; SSE41-NEXT: psllw $8, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX2-LABEL: mul_v16i8:
@@ -400,28 +398,27 @@ define <32 x i8> @mul_v32i8c(<32 x i8> %i) nounwind {
;
; SSE41-LABEL: mul_v32i8c:
; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovsxbw {{.*#+}} xmm2 = [117,117,117,117,117,117,117,117]
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: pmaddubsw %xmm2, %xmm3
+; SSE41-NEXT: pmullw %xmm2, %xmm3
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT: pand %xmm4, %xmm3
; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117]
; SSE41-NEXT: pmaddubsw %xmm5, %xmm0
; SSE41-NEXT: psllw $8, %xmm0
; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: pmaddubsw %xmm2, %xmm3
-; SSE41-NEXT: pand %xmm4, %xmm3
+; SSE41-NEXT: pmullw %xmm1, %xmm2
+; SSE41-NEXT: pand %xmm4, %xmm2
; SSE41-NEXT: pmaddubsw %xmm5, %xmm1
; SSE41-NEXT: psllw $8, %xmm1
-; SSE41-NEXT: por %xmm3, %xmm1
+; SSE41-NEXT: por %xmm2, %xmm1
; SSE41-NEXT: retq
;
; AVX2-LABEL: mul_v32i8c:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117]
; AVX2-NEXT: vpsllw $8, %ymm1, %ymm1
-; AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0]
+; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
@@ -430,7 +427,7 @@ define <32 x i8> @mul_v32i8c(<32 x i8> %i) nounwind {
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117]
; AVX512F-NEXT: vpsllw $8, %ymm1, %ymm1
-; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
@@ -584,49 +581,44 @@ define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind {
;
; SSE41-LABEL: mul_v32i8:
; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
-; SSE41-NEXT: movdqa %xmm4, %xmm5
-; SSE41-NEXT: pandn %xmm2, %xmm5
-; SSE41-NEXT: pand %xmm4, %xmm2
-; SSE41-NEXT: movdqa %xmm0, %xmm6
-; SSE41-NEXT: pmaddubsw %xmm2, %xmm6
-; SSE41-NEXT: pand %xmm4, %xmm6
-; SSE41-NEXT: pmaddubsw %xmm5, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: pmullw %xmm2, %xmm4
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT: pand %xmm5, %xmm4
+; SSE41-NEXT: movdqa %xmm5, %xmm6
+; SSE41-NEXT: pandn %xmm2, %xmm6
+; SSE41-NEXT: pmaddubsw %xmm6, %xmm0
; SSE41-NEXT: psllw $8, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
-; SSE41-NEXT: movdqa %xmm3, %xmm2
-; SSE41-NEXT: pand %xmm4, %xmm2
-; SSE41-NEXT: movdqa %xmm1, %xmm5
-; SSE41-NEXT: pmaddubsw %xmm2, %xmm5
-; SSE41-NEXT: pand %xmm4, %xmm5
-; SSE41-NEXT: pandn %xmm3, %xmm4
-; SSE41-NEXT: pmaddubsw %xmm4, %xmm1
+; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: pmullw %xmm3, %xmm2
+; SSE41-NEXT: pand %xmm5, %xmm2
+; SSE41-NEXT: pandn %xmm3, %xmm5
+; SSE41-NEXT: pmaddubsw %xmm5, %xmm1
; SSE41-NEXT: psllw $8, %xmm1
-; SSE41-NEXT: por %xmm5, %xmm1
+; SSE41-NEXT: por %xmm2, %xmm1
; SSE41-NEXT: retq
;
; AVX2-LABEL: mul_v32i8:
; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX2-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3
-; AVX2-NEXT: vpand %ymm2, %ymm3, %ymm3
-; AVX2-NEXT: vpandn %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpandn %ymm1, %ymm3, %ymm1
; AVX2-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsllw $8, %ymm0, %ymm0
-; AVX2-NEXT: vpor %ymm0, %ymm3, %ymm0
+; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: mul_v32i8:
; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3
-; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm3
-; AVX512F-NEXT: vpandn %ymm1, %ymm2, %ymm1
+; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm2
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpandn %ymm1, %ymm3, %ymm1
; AVX512F-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0
+; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: mul_v32i8:
@@ -773,9 +765,9 @@ define <64 x i8> @mul_v64i8c(<64 x i8> %i) nounwind {
;
; SSE41-LABEL: mul_v64i8c:
; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovsxbw {{.*#+}} xmm4 = [117,117,117,117,117,117,117,117]
+; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
; SSE41-NEXT: movdqa %xmm0, %xmm6
-; SSE41-NEXT: pmaddubsw %xmm4, %xmm6
+; SSE41-NEXT: pmullw %xmm4, %xmm6
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT: pand %xmm5, %xmm6
; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117]
@@ -783,36 +775,35 @@ define <64 x i8> @mul_v64i8c(<64 x i8> %i) nounwind {
; SSE41-NEXT: psllw $8, %xmm0
; SSE41-NEXT: por %xmm6, %xmm0
; SSE41-NEXT: movdqa %xmm1, %xmm6
-; SSE41-NEXT: pmaddubsw %xmm4, %xmm6
+; SSE41-NEXT: pmullw %xmm4, %xmm6
; SSE41-NEXT: pand %xmm5, %xmm6
; SSE41-NEXT: pmaddubsw %xmm7, %xmm1
; SSE41-NEXT: psllw $8, %xmm1
; SSE41-NEXT: por %xmm6, %xmm1
; SSE41-NEXT: movdqa %xmm2, %xmm6
-; SSE41-NEXT: pmaddubsw %xmm4, %xmm6
+; SSE41-NEXT: pmullw %xmm4, %xmm6
; SSE41-NEXT: pand %xmm5, %xmm6
; SSE41-NEXT: pmaddubsw %xmm7, %xmm2
; SSE41-NEXT: psllw $8, %xmm2
; SSE41-NEXT: por %xmm6, %xmm2
-; SSE41-NEXT: movdqa %xmm3, %xmm6
-; SSE41-NEXT: pmaddubsw %xmm4, %xmm6
-; SSE41-NEXT: pand %xmm5, %xmm6
+; SSE41-NEXT: pmullw %xmm3, %xmm4
+; SSE41-NEXT: pand %xmm5, %xmm4
; SSE41-NEXT: pmaddubsw %xmm7, %xmm3
; SSE41-NEXT: psllw $8, %xmm3
-; SSE41-NEXT: por %xmm6, %xmm3
+; SSE41-NEXT: por %xmm4, %xmm3
; SSE41-NEXT: retq
;
; AVX2-LABEL: mul_v64i8c:
; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0]
-; AVX2-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm3
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
+; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm3
; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm5 = [0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117]
; AVX2-NEXT: vpmaddubsw %ymm5, %ymm0, %ymm0
; AVX2-NEXT: vpsllw $8, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm3, %ymm0
-; AVX2-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm2
+; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm2
; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm1
; AVX2-NEXT: vpsllw $8, %ymm1, %ymm1
@@ -822,9 +813,9 @@ define <64 x i8> @mul_v64i8c(<64 x i8> %i) nounwind {
; AVX512F-LABEL: mul_v64i8c:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0]
-; AVX512F-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm2
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
+; AVX512F-NEXT: vpmullw %ymm2, %ymm1, %ymm3
+; AVX512F-NEXT: vpmullw %ymm2, %ymm0, %ymm2
; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117]
; AVX512F-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm0
@@ -837,7 +828,7 @@ define <64 x i8> @mul_v64i8c(<64 x i8> %i) nounwind {
;
; AVX512BW-LABEL: mul_v64i8c:
; AVX512BW: # %bb.0: # %entry
-; AVX512BW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 # [117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0]
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 # [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
; AVX512BW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117]
; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0
; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 | (zmm1 & m32bcst)
@@ -899,59 +890,52 @@ define <64 x i8> @mul_v64i8(<64 x i8> %i, <64 x i8> %j) nounwind {
;
; SSE41-LABEL: mul_v64i8:
; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: movdqa %xmm0, %xmm9
+; SSE41-NEXT: pmullw %xmm4, %xmm9
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
-; SSE41-NEXT: movdqa %xmm8, %xmm9
-; SSE41-NEXT: pandn %xmm4, %xmm9
-; SSE41-NEXT: pand %xmm8, %xmm4
-; SSE41-NEXT: movdqa %xmm0, %xmm10
-; SSE41-NEXT: pmaddubsw %xmm4, %xmm10
-; SSE41-NEXT: pand %xmm8, %xmm10
-; SSE41-NEXT: pmaddubsw %xmm9, %xmm0
-; SSE41-NEXT: psllw $8, %xmm0
-; SSE41-NEXT: por %xmm10, %xmm0
-; SSE41-NEXT: movdqa %xmm8, %xmm4
-; SSE41-NEXT: pandn %xmm5, %xmm4
-; SSE41-NEXT: pand %xmm8, %xmm5
-; SSE41-NEXT: movdqa %xmm1, %xmm9
-; SSE41-NEXT: pmaddubsw %xmm5, %xmm9
; SSE41-NEXT: pand %xmm8, %xmm9
-; SSE41-NEXT: pmaddubsw %xmm4, %xmm1
+; SSE41-NEXT: movdqa %xmm8, %xmm10
+; SSE41-NEXT: pandn %xmm4, %xmm10
+; SSE41-NEXT: pmaddubsw %xmm10, %xmm0
+; SSE41-NEXT: psllw $8, %xmm0
+; SSE41-NEXT: por %xmm9, %xmm0
+; SSE41-NEXT: movdqa %xmm1, %xmm4
+; SSE41-NEXT: pmullw %xmm5, %xmm4
+; SSE41-NEXT: pand %xmm8, %xmm4
+; SSE41-NEXT: movdqa %xmm8, %xmm9
+; SSE41-NEXT: pandn %xmm5, %xmm9
+; SSE41-NEXT: pmaddubsw %xmm9, %xmm1
; SSE41-NEXT: psllw $8, %xmm1
-; SSE41-NEXT: por %xmm9, %xmm1
-; SSE41-NEXT: movdqa %xmm8, %xmm4
-; SSE41-NEXT: pandn %xmm6, %xmm4
-; SSE41-NEXT: pand %xmm8, %xmm6
-; SSE41-NEXT: movdqa %xmm2, %xmm5
-; SSE41-NEXT: pmaddubsw %xmm6, %xmm5
-; SSE41-NEXT: pand %xmm8, %xmm5
-; SSE41-NEXT: pmaddubsw %xmm4, %xmm2
+; SSE41-NEXT: por %xmm4, %xmm1
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: pmullw %xmm6, %xmm4
+; SSE41-NEXT: pand %xmm8, %xmm4
+; SSE41-NEXT: movdqa %xmm8, %xmm5
+; SSE41-NEXT: pandn %xmm6, %xmm5
+; SSE41-NEXT: pmaddubsw %xmm5, %xmm2
; SSE41-NEXT: psllw $8, %xmm2
-; SSE41-NEXT: por %xmm5, %xmm2
-; SSE41-NEXT: movdqa %xmm7, %xmm4
+; SSE41-NEXT: por %xmm4, %xmm2
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: pmullw %xmm7, %xmm4
; SSE41-NEXT: pand %xmm8, %xmm4
-; SSE41-NEXT: movdqa %xmm3, %xmm5
-; SSE41-NEXT: pmaddubsw %xmm4, %xmm5
-; SSE41-NEXT: pand %xmm8, %xmm5
; SSE41-NEXT: pandn %xmm7, %xmm8
; SSE41-NEXT: pmaddubsw %xmm8, %xmm3
; SSE41-NEXT: psllw $8, %xmm3
-; SSE41-NEXT: por %xmm5, %xmm3
+; SSE41-NEXT: por %xmm4, %xmm3
; SSE41-NEXT: retq
;
; AVX2-LABEL: mul_v64i8:
; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm5
-; AVX2-NEXT: vpmaddubsw %ymm5, %ymm0, %ymm5
-; AVX2-NEXT: vpand %ymm4, %ymm5, %ymm5
-; AVX2-NEXT: vpandn %ymm2, %ymm4, %ymm2
+; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm4
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm4
+; AVX2-NEXT: vpandn %ymm2, %ymm5, %ymm2
; AVX2-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsllw $8, %ymm0, %ymm0
-; AVX2-NEXT: vpor %ymm0, %ymm5, %ymm0
-; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm2
-; AVX2-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm2
-; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
-; AVX2-NEXT: vpandn %ymm3, %ymm4, %ymm3
+; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0
+; AVX2-NEXT: vpmullw %ymm3, %ymm1, %ymm2
+; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX2-NEXT: vpandn %ymm3, %ymm5, %ymm3
; AVX2-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpsllw $8, %ymm1, %ymm1
; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1
@@ -959,33 +943,30 @@ define <64 x i8> @mul_v64i8(<64 x i8> %i, <64 x i8> %j) nounwind {
;
; AVX512F-LABEL: mul_v64i8:
; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm4
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5
-; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm5, %ymm4
-; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm6
-; AVX512F-NEXT: vpmaddubsw %ymm6, %ymm0, %ymm6
-; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm4
-; AVX512F-NEXT: vpandn %ymm1, %ymm2, %ymm1
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpmullw %ymm2, %ymm3, %ymm4
+; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm5
+; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm5 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX512F-NEXT: vpandn %ymm1, %ymm5, %ymm1
; AVX512F-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm1
-; AVX512F-NEXT: vpmaddubsw %ymm1, %ymm5, %ymm1
+; AVX512F-NEXT: vpandn %ymm2, %ymm5, %ymm1
+; AVX512F-NEXT: vpmaddubsw %ymm1, %ymm3, %ymm1
; AVX512F-NEXT: vpsllw $8, %ymm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm4 & zmm2)
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm4 & zmm5)
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: mul_v64i8:
; AVX512BW: # %bb.0: # %entry
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm3
-; AVX512BW-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3
-; AVX512BW-NEXT: vpandnq %zmm1, %zmm2, %zmm1
+; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT: vpandnq %zmm1, %zmm3, %zmm1
; AVX512BW-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0
-; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm3 & zmm2)
+; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm2 & zmm3)
; AVX512BW-NEXT: retq
entry:
%A = mul <64 x i8> %i, %j
diff --git a/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll b/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll
index 59b03f8c02223..c9e48f817fb44 100644
--- a/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll
+++ b/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll
@@ -58,13 +58,12 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) {
define <32 x i8> @test_mul_32i8(<32 x i8> %a, <32 x i8> %b) {
; AVX256BW-LABEL: test_mul_32i8:
; AVX256BW: # %bb.0:
-; AVX256BW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX256BW-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX256BW-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3
-; AVX256BW-NEXT: vpandn %ymm1, %ymm2, %ymm1
+; AVX256BW-NEXT: vpmullw %ymm1, %ymm0, %ymm2
+; AVX256BW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX256BW-NEXT: vpandn %ymm1, %ymm3, %ymm1
; AVX256BW-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0
; AVX256BW-NEXT: vpsllw $8, %ymm0, %ymm0
-; AVX256BW-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm3 & ymm2)
+; AVX256BW-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm2 & ymm3)
; AVX256BW-NEXT: retq
;
; AVX512BWVL-LABEL: test_mul_32i8:
diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
index bb7245c31b326..ec94d003f10ea 100644
--- a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
@@ -2275,8 +2275,8 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) {
; CHECK-SSE41: # %bb.0:
; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm4
; CHECK-SSE41-NEXT: movq %rdi, %rax
-; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm0
-; CHECK-SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [171,0,183,0,61,0,127,0,9,0,41,0,1,0,161,0]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [171,103,183,171,61,1,127,183,9,0,41,183,1,1,161,221]
+; CHECK-SSE41-NEXT: pmullw %xmm1, %xmm0
; CHECK-SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
; CHECK-SSE41-NEXT: pand %xmm5, %xmm0
; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm6
@@ -2302,8 +2302,8 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) {
; CHECK-SSE41-NEXT: pcmpgtb %xmm6, %xmm1
; CHECK-SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
; CHECK-SSE41-NEXT: pblendvb %xmm0, %xmm7, %xmm1
-; CHECK-SSE41-NEXT: movdqa %xmm4, %xmm0
-; CHECK-SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [197,0,27,0,1,0,1,0,223,0,205,0,161,0,171,0]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [197,205,27,241,1,1,1,163,223,223,205,183,161,1,171,239]
+; CHECK-SSE41-NEXT: pmullw %xmm4, %xmm0
; CHECK-SSE41-NEXT: pand %xmm5, %xmm0
; CHECK-SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [0,205,0,241,0,1,0,163,0,223,0,183,0,1,0,239]
; CHECK-SSE41-NEXT: psllw $8, %xmm4
@@ -2341,7 +2341,7 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) {
; CHECK-AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [0,79,0,103,0,27,0,121,0,129,0,129,0,129,0,47]
; CHECK-AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
; CHECK-AVX1-NEXT: vpackuswb %xmm3, %xmm4, %xmm4
-; CHECK-AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm5 # [0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0]
+; CHECK-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm5 # [0,0,0,0,1,1,1,0,1,1,0,1,0,1,0,1]
; CHECK-AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; CHECK-AVX1-NEXT: vpand %xmm3, %xmm5, %xmm5
; CHECK-AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm6 # [0,0,0,0,0,1,0,0,0,1,0,1,0,1,0,1]
@@ -2361,7 +2361,7 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) {
; CHECK-AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; CHECK-AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4
; CHECK-AVX1-NEXT: vpaddb %xmm4, %xmm6, %xmm4
-; CHECK-AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm6 # [13,0,19,0,2,0,2,0,62,0,5,0,97,0,3,0]
+; CHECK-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm6 # [13,5,19,34,2,8,2,88,62,62,5,7,97,2,3,60]
; CHECK-AVX1-NEXT: vpand %xmm3, %xmm6, %xmm6
; CHECK-AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [0,5,0,34,0,8,0,88,0,62,0,7,0,2,0,60]
; CHECK-AVX1-NEXT: vpsllw $8, %xmm4, %xmm4
@@ -2375,7 +2375,7 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) {
; CHECK-AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7, %xmm7 # [0,86,0,95,0,147,0,43,0,49,0,127,0,65,0,147]
; CHECK-AVX1-NEXT: vpsrlw $8, %xmm7, %xmm7
; CHECK-AVX1-NEXT: vpackuswb %xmm6, %xmm7, %xmm6
-; CHECK-AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm7 # [0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0]
+; CHECK-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm7 # [0,0,1,0,0,255,0,1,0,1,0,1,1,1,0,1]
; CHECK-AVX1-NEXT: vpand %xmm3, %xmm7, %xmm7
; CHECK-AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm8 # [0,0,0,0,0,255,0,1,0,1,0,1,0,1,0,1]
; CHECK-AVX1-NEXT: vpsllw $8, %xmm8, %xmm8
@@ -2394,7 +2394,7 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) {
; CHECK-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6
; CHECK-AVX1-NEXT: vpand %xmm5, %xmm6, %xmm5
; CHECK-AVX1-NEXT: vpaddb %xmm5, %xmm7, %xmm5
-; CHECK-AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm6 # [3,0,7,0,84,0,127,0,114,0,50,0,2,0,97,0]
+; CHECK-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm6 # [3,87,7,6,84,128,127,56,114,1,50,7,2,8,97,117]
; CHECK-AVX1-NEXT: vpand %xmm3, %xmm6, %xmm3
; CHECK-AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm5 # [0,87,0,6,0,128,0,56,0,1,0,7,0,8,0,117]
; CHECK-AVX1-NEXT: vpsllw $8, %xmm5, %xmm5
@@ -2423,7 +2423,7 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) {
; CHECK-AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [0,79,0,103,0,27,0,121,0,129,0,129,0,129,0,47,0,86,0,95,0,147,0,43,0,49,0,127,0,65,0,147]
; CHECK-AVX2-NEXT: vpsrlw $8, %ymm4, %ymm4
; CHECK-AVX2-NEXT: vpackuswb %ymm3, %ymm4, %ymm3
-; CHECK-AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm4 # [0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0]
+; CHECK-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm4 # [0,0,0,0,1,1,1,0,1,1,0,1,0,1,0,1,0,0,1,0,0,255,0,1,0,1,0,1,1,1,0,1]
; CHECK-AVX2-NEXT: vpbroadcastw {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; CHECK-AVX2-NEXT: vpand %ymm5, %ymm4, %ymm4
; CHECK-AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm6 # [0,0,0,0,0,1,0,0,0,1,0,1,0,1,0,1,0,0,0,0,0,255,0,1,0,1,0,1,0,1,0,1]
@@ -2443,7 +2443,7 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) {
; CHECK-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
; CHECK-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
; CHECK-AVX2-NEXT: vpaddb %ymm3, %ymm4, %ymm3
-; CHECK-AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm4 # [13,0,19,0,2,0,2,0,62,0,5,0,97,0,3,0,3,0,7,0,84,0,127,0,114,0,50,0,2,0,97,0]
+; CHECK-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm4 # [13,5,19,34,2,8,2,88,62,62,5,7,97,2,3,60,3,87,7,6,84,128,127,56,114,1,50,7,2,8,97,117]
; CHECK-AVX2-NEXT: vpand %ymm5, %ymm4, %ymm4
; CHECK-AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [0,5,0,34,0,8,0,88,0,62,0,7,0,2,0,60,0,87,0,6,0,128,0,56,0,1,0,7,0,8,0,117]
; CHECK-AVX2-NEXT: vpsllw $8, %ymm3, %ymm3
@@ -2458,7 +2458,7 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) {
;
; CHECK-AVX512VL-LABEL: pr51133:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 # [197,0,27,0,1,0,1,0,223,0,205,0,161,0,171,0,171,0,183,0,61,0,127,0,9,0,41,0,1,0,161,0]
+; CHECK-AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 # [197,205,27,241,1,1,1,163,223,223,205,183,161,1,171,239,171,103,183,171,61,1,127,183,9,0,41,183,1,1,161,221]
; CHECK-AVX512VL-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 # [0,205,0,241,0,1,0,163,0,223,0,183,0,1,0,239,0,103,0,171,0,1,0,183,0,0,0,183,0,1,0,221]
; CHECK-AVX512VL-NEXT: vpsllw $8, %ymm3, %ymm3
; CHECK-AVX512VL-NEXT: vpternlogd {{.*#+}} ymm3 = ymm3 | (ymm2 & m32bcst)
diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll
index 33a6a7679bb9a..a5d6900f77f97 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll
@@ -2014,7 +2014,7 @@ define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
; SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,64,0,16,0,4,0,1,0,1,0,4,0,16,0,64]
; SSE41-NEXT: psllw $8, %xmm1
; SSE41-NEXT: por %xmm3, %xmm1
-; SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0]
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [128,64,32,16,8,4,2,1,128,1,2,4,8,16,32,64]
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: retq
@@ -2033,7 +2033,7 @@ define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 # [0,64,0,16,0,4,0,1,0,1,0,4,0,16,0,64]
; AVX1-NEXT: vpsllw $8, %xmm2, %xmm2
; AVX1-NEXT: vpor %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [128,64,32,16,8,4,2,1,128,1,2,4,8,16,32,64]
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll
index 217431be10d88..0cffa1b78a654 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll
@@ -1631,9 +1631,9 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
; AVX1-NEXT: vpsllw $8, %xmm3, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
; AVX1-NEXT: vorps %ymm1, %ymm3, %ymm1
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = [128,32,8,2,128,2,8,32]
-; AVX1-NEXT: vpmaddubsw %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpmaddubsw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [128,64,32,16,8,4,2,1,128,1,2,4,8,16,32,64]
+; AVX1-NEXT: vpmullw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
@@ -1653,7 +1653,7 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
; AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 # [0,64,0,16,0,4,0,1,0,1,0,4,0,16,0,64,0,64,0,16,0,4,0,1,0,1,0,4,0,16,0,64]
; AVX2-NEXT: vpsllw $8, %ymm2, %ymm2
; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0,128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0]
+; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [128,64,32,16,8,4,2,1,128,1,2,4,8,16,32,64,128,64,32,16,8,4,2,1,128,1,2,4,8,16,32,64]
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
@@ -1672,7 +1672,7 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 # [0,64,0,16,0,4,0,1,0,1,0,4,0,16,0,64,0,64,0,16,0,4,0,1,0,1,0,4,0,16,0,64]
; AVX512F-NEXT: vpsllw $8, %ymm2, %ymm2
; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1
-; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0,128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [128,64,32,16,8,4,2,1,128,1,2,4,8,16,32,64,128,64,32,16,8,4,2,1,128,1,2,4,8,16,32,64]
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
@@ -1690,7 +1690,7 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm0
; AVX512VL-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 # [0,64,0,16,0,4,0,1,0,1,0,4,0,16,0,64,0,64,0,16,0,4,0,1,0,1,0,4,0,16,0,64]
; AVX512VL-NEXT: vpsllw $8, %ymm2, %ymm2
-; AVX512VL-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0,128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0]
+; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [128,64,32,16,8,4,2,1,128,1,2,4,8,16,32,64,128,64,32,16,8,4,2,1,128,1,2,4,8,16,32,64]
; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
; AVX512VL-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | ymm1 | ymm2
; AVX512VL-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-fshr-512.ll b/llvm/test/CodeGen/X86/vector-fshr-512.ll
index 3a522ccb6214a..25f8f94eb834c 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-512.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-512.ll
@@ -915,10 +915,10 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
; AVX512F-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3
; AVX512F-NEXT: vpsllw $8, %ymm3, %ymm3
; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3
-; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0,128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [128,64,32,16,8,4,2,1,128,1,2,4,8,16,32,64,128,64,32,16,8,4,2,1,128,1,2,4,8,16,32,64]
; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
-; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT: vpmullw %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2
; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | zmm1 | zmm3
@@ -957,10 +957,10 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
; AVX512VL-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3
; AVX512VL-NEXT: vpsllw $8, %ymm3, %ymm3
; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3
-; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0,128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0]
+; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [128,64,32,16,8,4,2,1,128,1,2,4,8,16,32,64,128,64,32,16,8,4,2,1,128,1,2,4,8,16,32,64]
; AVX512VL-NEXT: # ymm4 = mem[0,1,0,1]
-; AVX512VL-NEXT: vpmaddubsw %ymm4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpmaddubsw %ymm4, %ymm2, %ymm2
+; AVX512VL-NEXT: vpmullw %ymm4, %ymm0, %ymm0
+; AVX512VL-NEXT: vpmullw %ymm4, %ymm2, %ymm2
; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; AVX512VL-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | zmm1 | zmm3
diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll
index e68d1d792c90a..9b7d66def8b5b 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll
@@ -731,7 +731,7 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind {
; SSE41-NEXT: movdqa %xmm2, %xmm1
; SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,8,0,10,0,12,0,14,0,13,0,11,0,9,0,7]
; SSE41-NEXT: psllw $8, %xmm1
-; SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [7,0,9,0,11,0,13,0,14,0,12,0,10,0,9,0]
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [7,8,9,10,11,12,13,14,14,13,12,11,10,9,9,7]
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT: por %xmm1, %xmm2
; SSE41-NEXT: psubb %xmm2, %xmm0
@@ -762,7 +762,7 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind {
; AVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 # [0,8,0,10,0,12,0,14,0,13,0,11,0,9,0,7]
; AVX1-NEXT: vpsllw $8, %xmm2, %xmm2
-; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [7,0,9,0,11,0,13,0,14,0,12,0,10,0,9,0]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [7,8,9,10,11,12,13,14,14,13,12,11,10,9,9,7]
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll
index 7355f3683fc2e..fa5692aa9cef1 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll
@@ -660,7 +660,7 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vpackuswb %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vpcmpgtb %xmm3, %xmm1, %xmm5
; AVX1-NEXT: vpsubb %xmm5, %xmm3, %xmm3
-; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm5 # [22,0,20,0,18,0,16,0,14,0,12,0,10,0,8,0]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm5 # [22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7]
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm6, %xmm5, %xmm5
; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [0,21,0,19,0,17,0,15,0,13,0,11,0,9,0,7]
@@ -686,7 +686,7 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpcmpgtb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpsubb %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3 # [7,0,9,0,11,0,13,0,15,0,17,0,19,0,21,0]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3 # [7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22]
; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [0,8,0,10,0,12,0,14,0,16,0,18,0,20,0,22]
; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1
@@ -720,7 +720,7 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind {
; AVX2NOBW-NEXT: vpsubb %ymm1, %ymm2, %ymm1
; AVX2NOBW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 # [0,8,0,10,0,12,0,14,0,16,0,18,0,20,0,22,0,21,0,19,0,17,0,15,0,13,0,11,0,9,0,7]
; AVX2NOBW-NEXT: vpsllw $8, %ymm2, %ymm2
-; AVX2NOBW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [7,0,9,0,11,0,13,0,15,0,17,0,19,0,21,0,22,0,20,0,18,0,16,0,14,0,12,0,10,0,8,0]
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7]
; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2NOBW-NEXT: vpor %ymm2, %ymm1, %ymm1
; AVX2NOBW-NEXT: vpsubb %ymm1, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll
index 5445330c82922..b11756a5e3b4e 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll
@@ -544,7 +544,7 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind {
; AVX512F-NEXT: vpackuswb %ymm5, %ymm3, %ymm3
; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm5
; AVX512F-NEXT: vpsubb %ymm5, %ymm3, %ymm3
-; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm5 # [38,0,36,0,34,0,32,0,30,0,28,0,26,0,24,0,22,0,20,0,18,0,16,0,14,0,12,0,10,0,8,0]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm5 # [38,37,36,35,34,33,32,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7]
; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5
; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [0,37,0,35,0,33,0,31,0,29,0,27,0,25,0,23,0,21,0,19,0,17,0,15,0,13,0,11,0,9,0,7]
@@ -570,7 +570,7 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind {
; AVX512F-NEXT: vpackuswb %ymm4, %ymm3, %ymm3
; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpsubb %ymm1, %ymm3, %ymm1
-; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3 # [7,0,9,0,11,0,13,0,15,0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31,0,33,0,35,0,37,0]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3 # [7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38]
; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [0,8,0,10,0,12,0,14,0,16,0,18,0,20,0,22,0,24,0,26,0,28,0,30,0,32,0,34,0,36,0,38]
; AVX512F-NEXT: vpsllw $8, %ymm1, %ymm1
@@ -603,7 +603,7 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind {
; AVX512BW-NEXT: vpmovb2m %zmm1, %k0
; AVX512BW-NEXT: vpmovm2b %k0, %zmm2
; AVX512BW-NEXT: vpsubb %zmm2, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm2 # [7,0,9,0,11,0,13,0,15,0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31,0,33,0,35,0,37,0,38,0,36,0,34,0,32,0,30,0,28,0,26,0,24,0,22,0,20,0,18,0,16,0,14,0,12,0,10,0,8,0]
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm2 # [7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,38,37,36,35,34,33,32,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7]
; AVX512BW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 # [0,8,0,10,0,12,0,14,0,16,0,18,0,20,0,22,0,24,0,26,0,28,0,30,0,32,0,34,0,36,0,38,0,37,0,35,0,33,0,31,0,29,0,27,0,25,0,23,0,21,0,19,0,17,0,15,0,13,0,11,0,9,0,7]
; AVX512BW-NEXT: vpsllw $8, %zmm1, %zmm1
; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm1 = zmm1 | (zmm2 & m32bcst)
diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll
index 6cd5098504f91..ef255e598e4a1 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll
@@ -840,7 +840,7 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind {
; SSE41-NEXT: movdqa %xmm2, %xmm1
; SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,8,0,10,0,12,0,14,0,13,0,11,0,9,0,7]
; SSE41-NEXT: psllw $8, %xmm1
-; SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [7,0,9,0,11,0,13,0,14,0,12,0,10,0,9,0]
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [7,8,9,10,11,12,13,14,14,13,12,11,10,9,9,7]
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT: por %xmm1, %xmm2
; SSE41-NEXT: psubb %xmm2, %xmm0
@@ -882,7 +882,7 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind {
; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 # [0,8,0,10,0,12,0,14,0,13,0,11,0,9,0,7]
; AVX1-NEXT: vpsllw $8, %xmm2, %xmm2
-; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [7,0,9,0,11,0,13,0,14,0,12,0,10,0,9,0]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [7,8,9,10,11,12,13,14,14,13,12,11,10,9,9,7]
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll
index 98ea87cbe18f3..ca57359183312 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll
@@ -702,7 +702,7 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [32,16,16,128,64,16,256,32]
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm5 # [22,0,20,0,18,0,16,0,14,0,12,0,10,0,8,0]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm5 # [22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7]
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm3, %xmm5, %xmm5
; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [0,21,0,19,0,17,0,15,0,13,0,11,0,9,0,7]
@@ -739,7 +739,7 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [64,256,128,32,32,32,64,64]
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
; AVX1-NEXT: vpackuswb %xmm1, %xmm4, %xmm1
-; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm4 # [7,0,9,0,11,0,13,0,15,0,17,0,19,0,21,0]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm4 # [7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22]
; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [0,8,0,10,0,12,0,14,0,16,0,18,0,20,0,22]
; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1
@@ -781,7 +781,7 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind {
; AVX2NOBW-NEXT: vpackuswb %ymm3, %ymm1, %ymm1
; AVX2NOBW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 # [0,8,0,10,0,12,0,14,0,16,0,18,0,20,0,22,0,21,0,19,0,17,0,15,0,13,0,11,0,9,0,7]
; AVX2NOBW-NEXT: vpsllw $8, %ymm2, %ymm2
-; AVX2NOBW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [7,0,9,0,11,0,13,0,15,0,17,0,19,0,21,0,22,0,20,0,18,0,16,0,14,0,12,0,10,0,8,0]
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7]
; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2NOBW-NEXT: vpor %ymm2, %ymm1, %ymm1
; AVX2NOBW-NEXT: vpsubb %ymm1, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll
index a11fa370a86b7..b8a131e628007 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll
@@ -575,7 +575,7 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind {
; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [64,16,32,8,8,8,256,16,32,16,16,128,64,16,256,32]
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512F-NEXT: vpackuswb %ymm4, %ymm3, %ymm4
-; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm5 # [38,0,36,0,34,0,32,0,30,0,28,0,26,0,24,0,22,0,20,0,18,0,16,0,14,0,12,0,10,0,8,0]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm5 # [38,37,36,35,34,33,32,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7]
; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpand %ymm3, %ymm5, %ymm5
; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [0,37,0,35,0,33,0,31,0,29,0,27,0,25,0,23,0,21,0,19,0,17,0,15,0,13,0,11,0,9,0,7]
@@ -609,7 +609,7 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind {
; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [64,256,128,32,32,32,64,64,16,16,64,32,128,256,16,16]
; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512F-NEXT: vpackuswb %ymm5, %ymm1, %ymm1
-; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm4 # [7,0,9,0,11,0,13,0,15,0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31,0,33,0,35,0,37,0]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm4 # [7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38]
; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [0,8,0,10,0,12,0,14,0,16,0,18,0,20,0,22,0,24,0,26,0,28,0,30,0,32,0,34,0,36,0,38]
; AVX512F-NEXT: vpsllw $8, %ymm1, %ymm1
@@ -648,7 +648,7 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind {
; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
; AVX512BW-NEXT: vpackuswb %zmm3, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm2 # [7,0,9,0,11,0,13,0,15,0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31,0,33,0,35,0,37,0,38,0,36,0,34,0,32,0,30,0,28,0,26,0,24,0,22,0,20,0,18,0,16,0,14,0,12,0,10,0,8,0]
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm2 # [7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,38,37,36,35,34,33,32,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7]
; AVX512BW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 # [0,8,0,10,0,12,0,14,0,16,0,18,0,20,0,22,0,24,0,26,0,28,0,30,0,32,0,34,0,36,0,38,0,37,0,35,0,33,0,31,0,29,0,27,0,25,0,23,0,21,0,19,0,17,0,15,0,13,0,11,0,9,0,7]
; AVX512BW-NEXT: vpsllw $8, %zmm1, %zmm1
; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm1 = zmm1 | (zmm2 & m32bcst)
diff --git a/llvm/test/CodeGen/X86/vector-mul.ll b/llvm/test/CodeGen/X86/vector-mul.ll
index d0bb90c5fc8ab..6d6f1c28ca282 100644
--- a/llvm/test/CodeGen/X86/vector-mul.ll
+++ b/llvm/test/CodeGen/X86/vector-mul.ll
@@ -265,7 +265,7 @@ define <16 x i8> @mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8(<16 x i8> %a0) nounw
; X86-SSE4-NEXT: movdqa %xmm0, %xmm1
; X86-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [0,2,0,8,0,2,0,8,0,2,0,8,0,2,0,8]
; X86-SSE4-NEXT: psllw $8, %xmm1
-; X86-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [1,0,4,0,1,0,4,0,1,0,4,0,1,0,4,0]
+; X86-SSE4-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [1,2,4,8,1,2,4,8,1,2,4,8,1,2,4,8]
; X86-SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE4-NEXT: por %xmm1, %xmm0
; X86-SSE4-NEXT: retl
@@ -275,7 +275,7 @@ define <16 x i8> @mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8(<16 x i8> %a0) nounw
; X64-SSE4-NEXT: movdqa %xmm0, %xmm1
; X64-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,2,0,8,0,2,0,8,0,2,0,8,0,2,0,8]
; X64-SSE4-NEXT: psllw $8, %xmm1
-; X64-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,0,4,0,1,0,4,0,1,0,4,0,1,0,4,0]
+; X64-SSE4-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,2,4,8,1,2,4,8,1,2,4,8,1,2,4,8]
; X64-SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE4-NEXT: por %xmm1, %xmm0
; X64-SSE4-NEXT: retq
@@ -1072,7 +1072,7 @@ define <16 x i8> @mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3(<16 x i8>
; X86-SSE4-NEXT: movdqa %xmm0, %xmm1
; X86-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [0,3,0,17,0,65,0,2,0,9,0,33,0,129,0,3]
; X86-SSE4-NEXT: psllw $8, %xmm1
-; X86-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [2,0,9,0,33,0,129,0,3,0,17,0,65,0,2,0]
+; X86-SSE4-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [2,3,9,17,33,65,129,2,3,9,17,33,65,129,2,3]
; X86-SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE4-NEXT: por %xmm1, %xmm0
; X86-SSE4-NEXT: retl
@@ -1095,7 +1095,7 @@ define <16 x i8> @mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3(<16 x i8>
; X64-SSE4-NEXT: movdqa %xmm0, %xmm1
; X64-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,3,0,17,0,65,0,2,0,9,0,33,0,129,0,3]
; X64-SSE4-NEXT: psllw $8, %xmm1
-; X64-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2,0,9,0,33,0,129,0,3,0,17,0,65,0,2,0]
+; X64-SSE4-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2,3,9,17,33,65,129,2,3,9,17,33,65,129,2,3]
; X64-SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE4-NEXT: por %xmm1, %xmm0
; X64-SSE4-NEXT: retq
@@ -1103,7 +1103,7 @@ define <16 x i8> @mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3(<16 x i8>
; X64-XOP-LABEL: mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [0,3,0,17,0,65,0,2,0,9,0,33,0,129,0,3]
-; X64-XOP-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2,0,9,0,33,0,129,0,3,0,17,0,65,0,2,0]
+; X64-XOP-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2,3,9,17,33,65,129,2,3,9,17,33,65,129,2,3]
; X64-XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2],xmm1[2],xmm0[4],xmm1[4],xmm0[6],xmm1[6],xmm0[8],xmm1[8],xmm0[10],xmm1[10],xmm0[12],xmm1[12],xmm0[14],xmm1[14]
; X64-XOP-NEXT: retq
;
@@ -1847,7 +1847,7 @@ define <16 x i8> @mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127(<16 x i8>
; X86-SSE4-NEXT: movdqa %xmm0, %xmm1
; X86-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [0,1,0,7,0,31,0,127,0,1,0,7,0,31,0,127]
; X86-SSE4-NEXT: psllw $8, %xmm1
-; X86-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [0,0,3,0,15,0,63,0,0,0,3,0,15,0,63,0]
+; X86-SSE4-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [0,1,3,7,15,31,63,127,0,1,3,7,15,31,63,127]
; X86-SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE4-NEXT: por %xmm1, %xmm0
; X86-SSE4-NEXT: retl
@@ -1857,7 +1857,7 @@ define <16 x i8> @mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127(<16 x i8>
; X64-SSE4-NEXT: movdqa %xmm0, %xmm1
; X64-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,1,0,7,0,31,0,127,0,1,0,7,0,31,0,127]
; X64-SSE4-NEXT: psllw $8, %xmm1
-; X64-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,0,3,0,15,0,63,0,0,0,3,0,15,0,63,0]
+; X64-SSE4-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,1,3,7,15,31,63,127,0,1,3,7,15,31,63,127]
; X64-SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE4-NEXT: por %xmm1, %xmm0
; X64-SSE4-NEXT: retq
@@ -1865,7 +1865,7 @@ define <16 x i8> @mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127(<16 x i8>
; X64-XOP-LABEL: mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [0,1,0,7,0,31,0,127,0,1,0,7,0,31,0,127]
-; X64-XOP-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,0,3,0,15,0,63,0,0,0,3,0,15,0,63,0]
+; X64-XOP-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,1,3,7,15,31,63,127,0,1,3,7,15,31,63,127]
; X64-XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2],xmm1[2],xmm0[4],xmm1[4],xmm0[6],xmm1[6],xmm0[8],xmm1[8],xmm0[10],xmm1[10],xmm0[12],xmm1[12],xmm0[14],xmm1[14]
; X64-XOP-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
index 3085c325e0968..37b96b8f3f927 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
@@ -1165,7 +1165,7 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
; SSE41-NEXT: psllw $8, %xmm1
-; SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: retq
@@ -1174,7 +1174,7 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1
-; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
index f9ccd1e8ca156..c7d2532e9acb2 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
@@ -1313,9 +1313,9 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vpmaddubsw %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = [1,4,16,64,128,32,8,2]
-; AVX1-NEXT: vpmaddubsw %xmm2, %xmm3, %xmm3
-; AVX1-NEXT: vpmaddubsw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
+; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm3
+; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
@@ -1325,7 +1325,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX2: # %bb.0:
; AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
; AVX2-NEXT: vpsllw $8, %ymm1, %ymm1
-; AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
+; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
@@ -1352,7 +1352,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
; AVX512DQ-NEXT: vpsllw $8, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
+; AVX512DQ-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512DQ-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: retq
@@ -1366,7 +1366,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
;
; AVX512DQVL-LABEL: constant_shift_v32i8:
; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
+; AVX512DQVL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
; AVX512DQVL-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
; AVX512DQVL-NEXT: vpsllw $8, %ymm0, %ymm0
; AVX512DQVL-NEXT: vpternlogd {{.*#+}} ymm0 = ymm0 | (ymm1 & m32bcst)
@@ -1388,9 +1388,9 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
; X86-AVX1-NEXT: vpmaddubsw %xmm1, %xmm3, %xmm1
; X86-AVX1-NEXT: vpsllw $8, %xmm1, %xmm1
; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; X86-AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = [1,4,16,64,128,32,8,2]
-; X86-AVX1-NEXT: vpmaddubsw %xmm2, %xmm3, %xmm3
-; X86-AVX1-NEXT: vpmaddubsw %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
+; X86-AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm3
+; X86-AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; X86-AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; X86-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
@@ -1400,7 +1400,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm1 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
; X86-AVX2-NEXT: vpsllw $8, %ymm1, %ymm1
-; X86-AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
+; X86-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; X86-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: retl
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-512.ll b/llvm/test/CodeGen/X86/vector-shift-shl-512.ll
index 41238acc4b74d..1e5f1b8729d47 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-512.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-512.ll
@@ -307,10 +307,10 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind {
; AVX512DQ-LABEL: constant_shift_v64i8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
+; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
; AVX512DQ-NEXT: # ymm2 = mem[0,1,0,1]
-; AVX512DQ-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm3
-; AVX512DQ-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm2
+; AVX512DQ-NEXT: vpmullw %ymm2, %ymm1, %ymm3
+; AVX512DQ-NEXT: vpmullw %ymm2, %ymm0, %ymm2
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
; AVX512DQ-NEXT: # ymm3 = mem[0,1,0,1]
@@ -324,7 +324,7 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind {
;
; AVX512BW-LABEL: constant_shift_v64i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 # [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
; AVX512BW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0
; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 | (zmm1 & m32bcst)
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll
index 3590c4d027be7..ac5830604461c 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll
@@ -100,16 +100,14 @@ define <16 x i8> @PR50049(ptr %p1, ptr %p2) {
; SSE-NEXT: pshufb %xmm3, %xmm4
; SSE-NEXT: pshufb %xmm8, %xmm1
; SSE-NEXT: por %xmm4, %xmm1
-; SSE-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; SSE-NEXT: movdqa %xmm1, %xmm3
-; SSE-NEXT: pand %xmm2, %xmm3
-; SSE-NEXT: movdqa %xmm0, %xmm4
-; SSE-NEXT: pmaddubsw %xmm3, %xmm4
-; SSE-NEXT: pand %xmm2, %xmm4
-; SSE-NEXT: pandn %xmm1, %xmm2
-; SSE-NEXT: pmaddubsw %xmm2, %xmm0
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: pmullw %xmm1, %xmm2
+; SSE-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT: pand %xmm3, %xmm2
+; SSE-NEXT: pandn %xmm1, %xmm3
+; SSE-NEXT: pmaddubsw %xmm3, %xmm0
; SSE-NEXT: psllw $8, %xmm0
-; SSE-NEXT: por %xmm4, %xmm0
+; SSE-NEXT: por %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: PR50049:
@@ -129,21 +127,20 @@ define <16 x i8> @PR50049(ptr %p1, ptr %p2) {
; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4]
-; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,128,128,128,2,5,8,11,14,128,128,128,128,128]
+; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm5
+; AVX1-NEXT: vpor %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm1
-; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm2
-; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
-; AVX1-NEXT: vpmaddubsw %xmm3, %xmm0, %xmm3
-; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm3
-; AVX1-NEXT: vpandn %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpmaddubsw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: PR50049: