[llvm] a7b85e4 - [X86] Freeze shl(x, 1) -> add(x, x) vector fold (PR50468)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Mon Aug 15 08:17:52 PDT 2022
Author: Simon Pilgrim
Date: 2022-08-15T16:17:21+01:00
New Revision: a7b85e4c0c2d71018949f409ecf47981fe71f3a0
URL: https://github.com/llvm/llvm-project/commit/a7b85e4c0c2d71018949f409ecf47981fe71f3a0
DIFF: https://github.com/llvm/llvm-project/commit/a7b85e4c0c2d71018949f409ecf47981fe71f3a0.diff
LOG: [X86] Freeze shl(x,1) -> add(x,x) vector fold (PR50468)
Fold vector shl(x,1) -> add(freeze(x),freeze(x)) to avoid the undef issues identified in PR50468
Differential Revision: https://reviews.llvm.org/D106675
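
For context, here is a minimal hand-written IR sketch of the hazard this fold has to avoid (illustrative only; the function name and types are made up and are not taken from the PR50468 reproducer):

  define <2 x i64> @shl_by_one(<2 x i64> %x) {
    ; The shift result has bit 0 clear in every lane, even when %x is undef.
    %r = shl <2 x i64> %x, <i64 1, i64 1>
    ret <2 x i64> %r
  }

Rewriting this as (add %x, %x) without a freeze is unsound: each use of an undef %x may observe a different value, so the sum is not guaranteed to be even. Freezing first pins both operands to the same arbitrary-but-fixed value and preserves the evenness guarantee:

  %f = freeze <2 x i64> %x
  %r = add <2 x i64> %f, %f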
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/combine-mul.ll
llvm/test/CodeGen/X86/freeze-binary.ll
llvm/test/CodeGen/X86/oddsubvector.ll
llvm/test/CodeGen/X86/rotate_vec.ll
llvm/test/CodeGen/X86/sdiv_fix_sat.ll
llvm/test/CodeGen/X86/udiv_fix_sat.ll
llvm/test/CodeGen/X86/vector-fshr-128.ll
llvm/test/CodeGen/X86/vector-fshr-256.ll
llvm/test/CodeGen/X86/vector-fshr-512.ll
llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
llvm/test/CodeGen/X86/vector-fshr-rot-512.ll
llvm/test/CodeGen/X86/vector-mulfix-legalize.ll
llvm/test/CodeGen/X86/vector-shift-shl-128.ll
llvm/test/CodeGen/X86/vector-shift-shl-256.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index cb6a87ab9c78..58cb2c603499 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -29735,8 +29735,22 @@ static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG,
uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
- if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
+ if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) {
+ // Hardware support for vector shifts is sparse which makes us scalarize the
+ // vector operations in many cases. Also, on sandybridge ADD is faster than
+ // shl: (shl V, 1) -> (add (freeze V), (freeze V))
+ if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
+ // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
+ // must be 0). (add undef, undef) however can be any value. To make this
+ // safe, we must freeze R to ensure that register allocation uses the same
+ // register for an undefined value. This ensures that the result will
+ // still be even and preserves the original semantics.
+ R = DAG.getFreeze(R);
+ return DAG.getNode(ISD::ADD, dl, VT, R, R);
+ }
+
return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
+ }
// i64 SRA needs to be performed as partial shifts.
if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
@@ -46674,20 +46688,6 @@ static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
}
}
- // Hardware support for vector shifts is sparse which makes us scalarize the
- // vector operations in many cases. Also, on sandybridge ADD is faster than
- // shl.
- // (shl V, 1) -> add V,V
- if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
- if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
- assert(N0.getValueType().isVector() && "Invalid vector shift type");
- // We shift all of the values by one. In many cases we do not have
- // hardware support for this operation. This is better expressed as an ADD
- // of two values.
- if (N1SplatC->isOne())
- return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
- }
-
return SDValue();
}
diff --git a/llvm/test/CodeGen/X86/combine-mul.ll b/llvm/test/CodeGen/X86/combine-mul.ll
index da8ba3659914..461b4c40be10 100644
--- a/llvm/test/CodeGen/X86/combine-mul.ll
+++ b/llvm/test/CodeGen/X86/combine-mul.ll
@@ -80,13 +80,13 @@ define <4 x i32> @combine_vec_mul_pow2b(<4 x i32> %x) {
define <4 x i64> @combine_vec_mul_pow2c(<4 x i64> %x) {
; SSE-LABEL: combine_vec_mul_pow2c:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: psllq $1, %xmm2
-; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: psllq $4, %xmm2
; SSE-NEXT: psllq $2, %xmm1
; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: paddq %xmm0, %xmm2
+; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_mul_pow2c:
diff --git a/llvm/test/CodeGen/X86/freeze-binary.ll b/llvm/test/CodeGen/X86/freeze-binary.ll
index 8ab5fa212860..3d41927a57a8 100644
--- a/llvm/test/CodeGen/X86/freeze-binary.ll
+++ b/llvm/test/CodeGen/X86/freeze-binary.ll
@@ -398,7 +398,7 @@ define <2 x i64> @freeze_shl_vec(<2 x i64> %a0) nounwind {
define <2 x i64> @freeze_shl_vec_outofrange(<2 x i64> %a0) nounwind {
; X86-LABEL: freeze_shl_vec_outofrange:
; X86: # %bb.0:
-; X86-NEXT: psllq $1, %xmm0
+; X86-NEXT: paddq %xmm0, %xmm0
; X86-NEXT: psllq $2, %xmm0
; X86-NEXT: retl
;
diff --git a/llvm/test/CodeGen/X86/oddsubvector.ll b/llvm/test/CodeGen/X86/oddsubvector.ll
index d8729aee217f..ce0e1048ed04 100644
--- a/llvm/test/CodeGen/X86/oddsubvector.ll
+++ b/llvm/test/CodeGen/X86/oddsubvector.ll
@@ -157,71 +157,71 @@ define void @PR42833() {
; SSE2-LABEL: PR42833:
; SSE2: # %bb.0:
; SSE2-NEXT: movl b(%rip), %eax
-; SSE2-NEXT: movdqa c+144(%rip), %xmm0
-; SSE2-NEXT: movdqa c+128(%rip), %xmm1
+; SSE2-NEXT: movdqa c+128(%rip), %xmm0
+; SSE2-NEXT: movdqa c+144(%rip), %xmm1
; SSE2-NEXT: addl c+128(%rip), %eax
; SSE2-NEXT: movd %eax, %xmm2
; SSE2-NEXT: movd %eax, %xmm3
-; SSE2-NEXT: paddd %xmm1, %xmm3
+; SSE2-NEXT: paddd %xmm0, %xmm3
; SSE2-NEXT: movdqa d+144(%rip), %xmm4
-; SSE2-NEXT: psubd %xmm0, %xmm4
-; SSE2-NEXT: paddd %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm5
-; SSE2-NEXT: paddd %xmm1, %xmm5
+; SSE2-NEXT: psubd %xmm1, %xmm4
+; SSE2-NEXT: paddd %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: paddd %xmm0, %xmm5
; SSE2-NEXT: movss {{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3]
-; SSE2-NEXT: movdqa %xmm0, c+144(%rip)
+; SSE2-NEXT: movdqa %xmm1, c+144(%rip)
; SSE2-NEXT: movaps %xmm5, c+128(%rip)
-; SSE2-NEXT: movdqa c+160(%rip), %xmm0
+; SSE2-NEXT: movdqa c+160(%rip), %xmm1
; SSE2-NEXT: movdqa c+176(%rip), %xmm3
; SSE2-NEXT: movdqa d+160(%rip), %xmm5
; SSE2-NEXT: movdqa d+176(%rip), %xmm6
; SSE2-NEXT: movdqa d+128(%rip), %xmm7
-; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
-; SSE2-NEXT: psubd %xmm1, %xmm7
+; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
+; SSE2-NEXT: psubd %xmm0, %xmm7
; SSE2-NEXT: psubd %xmm3, %xmm6
-; SSE2-NEXT: psubd %xmm0, %xmm5
+; SSE2-NEXT: psubd %xmm1, %xmm5
; SSE2-NEXT: movdqa %xmm5, d+160(%rip)
; SSE2-NEXT: movdqa %xmm6, d+176(%rip)
; SSE2-NEXT: movdqa %xmm4, d+144(%rip)
; SSE2-NEXT: movdqa %xmm7, d+128(%rip)
; SSE2-NEXT: paddd %xmm3, %xmm3
-; SSE2-NEXT: paddd %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm0, c+160(%rip)
+; SSE2-NEXT: paddd %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm1, c+160(%rip)
; SSE2-NEXT: movdqa %xmm3, c+176(%rip)
; SSE2-NEXT: retq
;
; SSE42-LABEL: PR42833:
; SSE42: # %bb.0:
; SSE42-NEXT: movl b(%rip), %eax
-; SSE42-NEXT: movdqa c+144(%rip), %xmm0
-; SSE42-NEXT: movdqa c+128(%rip), %xmm1
+; SSE42-NEXT: movdqa c+128(%rip), %xmm0
+; SSE42-NEXT: movdqa c+144(%rip), %xmm1
; SSE42-NEXT: addl c+128(%rip), %eax
; SSE42-NEXT: movd %eax, %xmm2
-; SSE42-NEXT: paddd %xmm1, %xmm2
+; SSE42-NEXT: paddd %xmm0, %xmm2
; SSE42-NEXT: movdqa d+144(%rip), %xmm3
-; SSE42-NEXT: psubd %xmm0, %xmm3
-; SSE42-NEXT: paddd %xmm0, %xmm0
-; SSE42-NEXT: movdqa %xmm1, %xmm4
-; SSE42-NEXT: paddd %xmm1, %xmm4
+; SSE42-NEXT: psubd %xmm1, %xmm3
+; SSE42-NEXT: paddd %xmm1, %xmm1
+; SSE42-NEXT: movdqa %xmm0, %xmm4
+; SSE42-NEXT: paddd %xmm0, %xmm4
; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0,1],xmm4[2,3,4,5,6,7]
-; SSE42-NEXT: movdqa %xmm0, c+144(%rip)
+; SSE42-NEXT: movdqa %xmm1, c+144(%rip)
; SSE42-NEXT: movdqa %xmm4, c+128(%rip)
-; SSE42-NEXT: movdqa c+160(%rip), %xmm0
+; SSE42-NEXT: movdqa c+160(%rip), %xmm1
; SSE42-NEXT: movdqa c+176(%rip), %xmm2
; SSE42-NEXT: movdqa d+160(%rip), %xmm4
; SSE42-NEXT: movdqa d+176(%rip), %xmm5
; SSE42-NEXT: movdqa d+128(%rip), %xmm6
-; SSE42-NEXT: pinsrd $0, %eax, %xmm1
-; SSE42-NEXT: psubd %xmm1, %xmm6
+; SSE42-NEXT: pinsrd $0, %eax, %xmm0
+; SSE42-NEXT: psubd %xmm0, %xmm6
; SSE42-NEXT: psubd %xmm2, %xmm5
-; SSE42-NEXT: psubd %xmm0, %xmm4
+; SSE42-NEXT: psubd %xmm1, %xmm4
; SSE42-NEXT: movdqa %xmm4, d+160(%rip)
; SSE42-NEXT: movdqa %xmm5, d+176(%rip)
; SSE42-NEXT: movdqa %xmm3, d+144(%rip)
; SSE42-NEXT: movdqa %xmm6, d+128(%rip)
; SSE42-NEXT: paddd %xmm2, %xmm2
-; SSE42-NEXT: paddd %xmm0, %xmm0
-; SSE42-NEXT: movdqa %xmm0, c+160(%rip)
+; SSE42-NEXT: paddd %xmm1, %xmm1
+; SSE42-NEXT: movdqa %xmm1, c+160(%rip)
; SSE42-NEXT: movdqa %xmm2, c+176(%rip)
; SSE42-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/rotate_vec.ll b/llvm/test/CodeGen/X86/rotate_vec.ll
index af7e24887328..ac493a5df0dd 100644
--- a/llvm/test/CodeGen/X86/rotate_vec.ll
+++ b/llvm/test/CodeGen/X86/rotate_vec.ll
@@ -111,21 +111,18 @@ define <4 x i32> @rot_v4i32_mask_ashr0(<4 x i32> %a0) {
; XOPAVX1-LABEL: rot_v4i32_mask_ashr0:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpshad {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; XOPAVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: rot_v4i32_mask_ashr0:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; XOPAVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0
; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: rot_v4i32_mask_ashr0:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vpaddd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
%1 = ashr <4 x i32> %a0, <i32 25, i32 26, i32 27, i32 28>
@@ -139,7 +136,6 @@ define <4 x i32> @rot_v4i32_mask_ashr1(<4 x i32> %a0) {
; XOPAVX1-LABEL: rot_v4i32_mask_ashr1:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpsrad $25, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT: retq
@@ -147,7 +143,6 @@ define <4 x i32> @rot_v4i32_mask_ashr1(<4 x i32> %a0) {
; XOPAVX2-LABEL: rot_v4i32_mask_ashr1:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsrad $25, %xmm0, %xmm0
-; XOPAVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0
; XOPAVX2-NEXT: vpbroadcastd %xmm0, %xmm0
; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT: retq
@@ -155,7 +150,6 @@ define <4 x i32> @rot_v4i32_mask_ashr1(<4 x i32> %a0) {
; AVX512-LABEL: rot_v4i32_mask_ashr1:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsrad $25, %xmm0, %xmm0
-; AVX512-NEXT: vpaddd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
index 9e890e666250..20ea9c5aeab2 100644
--- a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
@@ -581,28 +581,33 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64-NEXT: subq $104, %rsp
; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-NEXT: pxor %xmm2, %xmm2
-; X64-NEXT: pcmpgtd %xmm0, %xmm2
+; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
+; X64-NEXT: psllq $32, %xmm3
+; X64-NEXT: movdqa %xmm3, %xmm2
+; X64-NEXT: psrad $31, %xmm2
+; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; X64-NEXT: psrlq $31, %xmm3
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; X64-NEXT: paddq %xmm0, %xmm0
; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-NEXT: movq %xmm0, %rbx
-; X64-NEXT: movq %rbx, %rbp
+; X64-NEXT: movq %xmm0, %r15
+; X64-NEXT: movq %r15, %rbp
; X64-NEXT: sarq $63, %rbp
-; X64-NEXT: shldq $31, %rbx, %rbp
+; X64-NEXT: shldq $31, %r15, %rbp
+; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; X64-NEXT: pxor %xmm0, %xmm0
; X64-NEXT: pcmpgtd %xmm1, %xmm0
; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; X64-NEXT: movq %xmm1, %rdx
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rdx, %r15
-; X64-NEXT: sarq $63, %r15
-; X64-NEXT: movq %rbx, %r12
+; X64-NEXT: movq %rdx, %rbx
+; X64-NEXT: sarq $63, %rbx
+; X64-NEXT: movq %r15, %r12
; X64-NEXT: shlq $31, %r12
; X64-NEXT: movq %r12, %rdi
; X64-NEXT: movq %rbp, %rsi
-; X64-NEXT: movq %r15, %rcx
+; X64-NEXT: movq %rbx, %rcx
; X64-NEXT: callq __divti3@PLT
; X64-NEXT: movq %rax, %r13
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -610,16 +615,16 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: subq $1, %r13
; X64-NEXT: sbbq $0, %r14
-; X64-NEXT: shrq $63, %rbx
-; X64-NEXT: xorl %r15d, %ebx
+; X64-NEXT: shrq $63, %r15
+; X64-NEXT: xorl %ebx, %r15d
; X64-NEXT: movq %r12, %rdi
; X64-NEXT: movq %rbp, %rsi
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT: movq %r15, %rcx
+; X64-NEXT: movq %rbx, %rcx
; X64-NEXT: callq __modti3@PLT
; X64-NEXT: orq %rax, %rdx
; X64-NEXT: setne %al
-; X64-NEXT: testb %bl, %al
+; X64-NEXT: testb %r15b, %al
; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
; X64-NEXT: movl $4294967295, %edx # imm = 0xFFFFFFFF
@@ -699,43 +704,45 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; X64-NEXT: psrlq $1, %xmm1
; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; X64-NEXT: # xmm1 = mem[2,3,2,3]
-; X64-NEXT: pxor %xmm0, %xmm0
-; X64-NEXT: pcmpgtd %xmm1, %xmm0
-; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X64-NEXT: paddq %xmm1, %xmm1
-; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-NEXT: movq %xmm1, %rbx
-; X64-NEXT: movq %rbx, %r12
-; X64-NEXT: sarq $63, %r12
-; X64-NEXT: shldq $31, %rbx, %r12
-; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; X64-NEXT: # xmm1 = mem[2,3,2,3]
-; X64-NEXT: pxor %xmm0, %xmm0
-; X64-NEXT: pcmpgtd %xmm1, %xmm0
-; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; X64-NEXT: movq %xmm1, %rdx
+; X64-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; X64-NEXT: # xmm0 = mem[0,1,1,3]
+; X64-NEXT: psllq $32, %xmm0
+; X64-NEXT: movdqa %xmm0, %xmm1
+; X64-NEXT: psrad $31, %xmm1
+; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; X64-NEXT: psrlq $31, %xmm0
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-NEXT: movq %xmm0, %rbx
+; X64-NEXT: movq %rbx, %r13
+; X64-NEXT: sarq $63, %r13
+; X64-NEXT: shldq $31, %rbx, %r13
+; X64-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; X64-NEXT: pxor %xmm1, %xmm1
+; X64-NEXT: pcmpgtd %xmm0, %xmm1
+; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-NEXT: movq %xmm0, %rdx
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: movq %rdx, %rbp
; X64-NEXT: sarq $63, %rbp
; X64-NEXT: movq %rbx, %r15
; X64-NEXT: shlq $31, %r15
; X64-NEXT: movq %r15, %rdi
-; X64-NEXT: movq %r12, %rsi
+; X64-NEXT: movq %r13, %rsi
; X64-NEXT: movq %rbp, %rcx
; X64-NEXT: callq __divti3@PLT
-; X64-NEXT: movq %rax, %r13
+; X64-NEXT: movq %rax, %r12
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: movq %rdx, %r14
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: subq $1, %r13
+; X64-NEXT: subq $1, %r12
; X64-NEXT: sbbq $0, %r14
; X64-NEXT: shrq $63, %rbx
; X64-NEXT: xorl %ebp, %ebx
; X64-NEXT: movq %r15, %rdi
-; X64-NEXT: movq %r12, %rsi
+; X64-NEXT: movq %r13, %rsi
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; X64-NEXT: movq %rbp, %rcx
; X64-NEXT: callq __modti3@PLT
@@ -743,25 +750,25 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64-NEXT: setne %al
; X64-NEXT: testb %bl, %al
; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
-; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
; X64-NEXT: movl $4294967295, %ecx # imm = 0xFFFFFFFF
-; X64-NEXT: cmpq %rcx, %r13
+; X64-NEXT: cmpq %rcx, %r12
; X64-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF
-; X64-NEXT: cmovbq %r13, %rax
+; X64-NEXT: cmovbq %r12, %rax
; X64-NEXT: testq %r14, %r14
-; X64-NEXT: cmovnsq %rcx, %r13
-; X64-NEXT: cmoveq %rax, %r13
+; X64-NEXT: cmovnsq %rcx, %r12
+; X64-NEXT: cmoveq %rax, %r12
; X64-NEXT: movl $0, %eax
; X64-NEXT: cmovnsq %rax, %r14
; X64-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
-; X64-NEXT: cmpq %rcx, %r13
+; X64-NEXT: cmpq %rcx, %r12
; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: cmovaq %r13, %rax
+; X64-NEXT: cmovaq %r12, %rax
; X64-NEXT: testq %r14, %r14
-; X64-NEXT: cmovsq %rcx, %r13
+; X64-NEXT: cmovsq %rcx, %r12
; X64-NEXT: cmpq $-1, %r14
-; X64-NEXT: cmoveq %rax, %r13
-; X64-NEXT: movq %r13, %xmm0
+; X64-NEXT: cmoveq %rax, %r12
+; X64-NEXT: movq %r12, %xmm0
; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; X64-NEXT: # xmm0 = mem[2,3,2,3]
@@ -816,12 +823,12 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64-NEXT: cmovsq %rcx, %r12
; X64-NEXT: cmpq $-1, %r14
; X64-NEXT: cmoveq %rax, %r12
-; X64-NEXT: movq %r12, %xmm0
-; X64-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; X64-NEXT: psrlq $1, %xmm1
-; X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X64-NEXT: movq %r12, %xmm1
+; X64-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-NEXT: psrlq $1, %xmm0
+; X64-NEXT: shufps $136, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; X64-NEXT: # xmm0 = xmm0[0,2],mem[0,2]
; X64-NEXT: addq $104, %rsp
; X64-NEXT: popq %rbx
; X64-NEXT: popq %r12
@@ -840,116 +847,108 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X86-NEXT: pushl %esi
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $256, %esp # imm = 0x100
-; X86-NEXT: movl 24(%ebp), %edx
-; X86-NEXT: movl 40(%ebp), %edi
-; X86-NEXT: leal {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl %edi, %esi
-; X86-NEXT: sarl $31, %esi
-; X86-NEXT: movl %edx, %eax
-; X86-NEXT: sarl $31, %eax
-; X86-NEXT: addl %edx, %edx
-; X86-NEXT: adcl %eax, %eax
+; X86-NEXT: movl 16(%ebp), %edi
+; X86-NEXT: movl 32(%ebp), %eax
+; X86-NEXT: movl %eax, %esi
; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: shldl $31, %edx, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shll $31, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl $1, %eax
-; X86-NEXT: negl %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sarl $31, %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, %ebx
+; X86-NEXT: sarl $31, %ebx
+; X86-NEXT: leal (%edi,%edi), %eax
+; X86-NEXT: shrl $31, %edi
+; X86-NEXT: shldl $31, %eax, %edi
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
; X86-NEXT: pushl %esi
; X86-NEXT: pushl %esi
; X86-NEXT: pushl %esi
+; X86-NEXT: pushl %ecx
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
+; X86-NEXT: pushl $0
; X86-NEXT: pushl %eax
-; X86-NEXT: pushl %eax
-; X86-NEXT: pushl %ecx
-; X86-NEXT: pushl %edx
+; X86-NEXT: calll __divti3
+; X86-NEXT: addl $32, %esp
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: pushl %esi
+; X86-NEXT: pushl %esi
+; X86-NEXT: pushl %esi
+; X86-NEXT: pushl 32(%ebp)
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl $0
+; X86-NEXT: pushl %eax
; X86-NEXT: calll __modti3
; X86-NEXT: addl $32, %esp
-; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl 36(%ebp), %esi
-; X86-NEXT: movl %esi, %edi
-; X86-NEXT: sarl $31, %edi
-; X86-NEXT: movl 20(%ebp), %ecx
-; X86-NEXT: movl %ecx, %edx
-; X86-NEXT: sarl $31, %edx
-; X86-NEXT: addl %ecx, %ecx
-; X86-NEXT: adcl %edx, %edx
+; X86-NEXT: movl 36(%ebp), %edx
; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: shldl $31, %ecx, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shll $31, %ecx
+; X86-NEXT: sarl $31, %ebx
+; X86-NEXT: movl 20(%ebp), %ecx
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: sarl $31, %esi
+; X86-NEXT: leal (%ecx,%ecx), %eax
+; X86-NEXT: shrl $31, %ecx
+; X86-NEXT: shldl $31, %eax, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl $1, %edx
-; X86-NEXT: negl %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: pushl %edx
-; X86-NEXT: pushl %edx
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edx
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: pushl %esi
+; X86-NEXT: pushl %esi
; X86-NEXT: pushl %ecx
+; X86-NEXT: pushl $0
; X86-NEXT: pushl %eax
; X86-NEXT: calll __modti3
; X86-NEXT: addl $32, %esp
-; X86-NEXT: leal {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl 28(%ebp), %ebx
-; X86-NEXT: movl %ebx, %edx
-; X86-NEXT: sarl $31, %edx
-; X86-NEXT: movl 12(%ebp), %eax
-; X86-NEXT: movl %eax, %esi
+; X86-NEXT: movl 28(%ebp), %edx
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: sarl $31, %edi
+; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: movl %ecx, %esi
; X86-NEXT: sarl $31, %esi
-; X86-NEXT: addl %eax, %eax
-; X86-NEXT: adcl %esi, %esi
-; X86-NEXT: movl %esi, %ecx
+; X86-NEXT: leal (%ecx,%ecx), %eax
+; X86-NEXT: shrl $31, %ecx
; X86-NEXT: shldl $31, %eax, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shll $31, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl $1, %esi
-; X86-NEXT: negl %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: pushl %edx
-; X86-NEXT: pushl %edx
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %edi
; X86-NEXT: pushl %edx
-; X86-NEXT: pushl %ebx
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: pushl %esi
; X86-NEXT: pushl %esi
; X86-NEXT: pushl %ecx
+; X86-NEXT: pushl $0
; X86-NEXT: pushl %eax
-; X86-NEXT: pushl %edi
; X86-NEXT: calll __divti3
; X86-NEXT: addl $32, %esp
-; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl 32(%ebp), %edx
+; X86-NEXT: movl 40(%ebp), %edx
; X86-NEXT: movl %edx, %esi
; X86-NEXT: sarl $31, %esi
-; X86-NEXT: movl 16(%ebp), %ecx
-; X86-NEXT: movl %ecx, %ebx
-; X86-NEXT: sarl $31, %ebx
-; X86-NEXT: addl %ecx, %ecx
-; X86-NEXT: adcl %ebx, %ebx
-; X86-NEXT: movl %ebx, %edi
-; X86-NEXT: shldl $31, %ecx, %edi
-; X86-NEXT: shll $31, %ecx
+; X86-NEXT: movl 24(%ebp), %ecx
+; X86-NEXT: movl %ecx, %edi
+; X86-NEXT: sarl $31, %edi
+; X86-NEXT: leal (%ecx,%ecx), %eax
+; X86-NEXT: shrl $31, %ecx
+; X86-NEXT: shldl $31, %eax, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl $1, %ebx
-; X86-NEXT: negl %ebx
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
; X86-NEXT: pushl %esi
; X86-NEXT: pushl %esi
; X86-NEXT: pushl %esi
; X86-NEXT: pushl %edx
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %ecx
+; X86-NEXT: pushl $0
; X86-NEXT: pushl %eax
; X86-NEXT: calll __modti3
; X86-NEXT: addl $32, %esp
@@ -958,39 +957,25 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X86-NEXT: pushl %esi
; X86-NEXT: pushl %esi
; X86-NEXT: pushl %esi
-; X86-NEXT: pushl 32(%ebp)
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT: pushl %eax
-; X86-NEXT: calll __divti3
-; X86-NEXT: addl $32, %esp
-; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: pushl %ecx
-; X86-NEXT: pushl %ecx
-; X86-NEXT: pushl %ecx
; X86-NEXT: pushl 40(%ebp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: pushl %ecx
-; X86-NEXT: pushl %ecx
-; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %edi
; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: pushl $0
; X86-NEXT: pushl %eax
; X86-NEXT: calll __divti3
; X86-NEXT: addl $32, %esp
; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: pushl %ecx
-; X86-NEXT: pushl %ecx
-; X86-NEXT: pushl %ecx
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %ebx
; X86-NEXT: pushl 36(%ebp)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %edi
; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: pushl $0
; X86-NEXT: pushl %eax
; X86-NEXT: calll __divti3
; X86-NEXT: addl $32, %esp
@@ -1005,22 +990,22 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: sbbl $0, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, %eax
; X86-NEXT: sbbl $0, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: testl %ebx, %ebx
+; X86-NEXT: sets %bl
; X86-NEXT: testl %edi, %edi
-; X86-NEXT: sets %al
-; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT: sets %ah
-; X86-NEXT: xorb %al, %ah
+; X86-NEXT: sets %bh
+; X86-NEXT: xorb %bl, %bh
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: orl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: orl %edi, %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: orl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: orl %edi, %eax
; X86-NEXT: setne %al
-; X86-NEXT: testb %ah, %al
+; X86-NEXT: testb %bh, %al
; X86-NEXT: cmovel %esi, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -1030,7 +1015,7 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: cmovel %ecx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -1047,7 +1032,7 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: sbbl $0, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: testl %ebx, %ebx
+; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
; X86-NEXT: sets %bl
; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
; X86-NEXT: sets %bh
@@ -1085,11 +1070,11 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: sbbl $0, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: testl %ecx, %ecx
-; X86-NEXT: sets %al
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT: testl %edx, %edx
+; X86-NEXT: sets %al
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: testl %ecx, %ecx
; X86-NEXT: sets %bl
; X86-NEXT: xorb %al, %bl
; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
@@ -1100,7 +1085,7 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X86-NEXT: pushl %ecx
; X86-NEXT: pushl %ecx
; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: pushl $0
; X86-NEXT: pushl %eax
; X86-NEXT: calll __modti3
; X86-NEXT: addl $32, %esp
diff --git a/llvm/test/CodeGen/X86/udiv_fix_sat.ll b/llvm/test/CodeGen/X86/udiv_fix_sat.ll
index 04c9befa40fb..a3be2999354c 100644
--- a/llvm/test/CodeGen/X86/udiv_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/udiv_fix_sat.ll
@@ -382,93 +382,85 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: subl $12, %esp
+; X86-NEXT: subl $8, %esp
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: xorl %ecx, %ecx
-; X86-NEXT: addl %eax, %eax
-; X86-NEXT: setb %cl
-; X86-NEXT: shldl $31, %eax, %ecx
-; X86-NEXT: shll $31, %eax
+; X86-NEXT: leal (%eax,%eax), %ecx
+; X86-NEXT: shrl $31, %eax
+; X86-NEXT: shldl $31, %ecx, %eax
; X86-NEXT: pushl $0
; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl %ecx
; X86-NEXT: pushl %eax
+; X86-NEXT: pushl $0
; X86-NEXT: calll __udivdi3
; X86-NEXT: addl $16, %esp
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: addl %ebp, %ebp
-; X86-NEXT: setb %al
-; X86-NEXT: shldl $31, %ebp, %eax
-; X86-NEXT: shll $31, %ebp
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: leal (%ebx,%ebx), %eax
+; X86-NEXT: shrl $31, %ebx
+; X86-NEXT: shldl $31, %eax, %ebx
; X86-NEXT: pushl $0
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %eax
; X86-NEXT: pushl %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl $0
; X86-NEXT: calll __udivdi3
; X86-NEXT: addl $16, %esp
; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: addl %edi, %edi
-; X86-NEXT: setb %al
-; X86-NEXT: shldl $31, %edi, %eax
-; X86-NEXT: shll $31, %edi
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: leal (%esi,%esi), %eax
+; X86-NEXT: shrl $31, %esi
+; X86-NEXT: shldl $31, %eax, %esi
; X86-NEXT: pushl $0
; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl %eax
-; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: pushl $0
; X86-NEXT: calll __udivdi3
; X86-NEXT: addl $16, %esp
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: addl %esi, %esi
-; X86-NEXT: setb %al
-; X86-NEXT: shldl $31, %esi, %eax
-; X86-NEXT: shll $31, %esi
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: leal (%edx,%edx), %ecx
+; X86-NEXT: shrl $31, %edx
+; X86-NEXT: shldl $31, %ecx, %edx
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: cmpl $2, %esi
+; X86-NEXT: movl $-1, %edx
+; X86-NEXT: cmovael %edx, %eax
+; X86-NEXT: cmpl $1, %esi
+; X86-NEXT: movl $1, %ebp
+; X86-NEXT: cmovael %ebp, %esi
+; X86-NEXT: shldl $31, %eax, %esi
+; X86-NEXT: cmpl $2, %ebx
+; X86-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NEXT: cmovael %edx, %eax
+; X86-NEXT: cmpl $1, %ebx
+; X86-NEXT: cmovael %ebp, %ebx
+; X86-NEXT: shldl $31, %eax, %ebx
+; X86-NEXT: cmpl $2, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: cmovael %edx, %eax
+; X86-NEXT: cmpl $1, %edi
+; X86-NEXT: cmovael %ebp, %edi
+; X86-NEXT: shldl $31, %eax, %edi
; X86-NEXT: pushl $0
; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl %eax
-; X86-NEXT: pushl %esi
+; X86-NEXT: pushl %ecx
+; X86-NEXT: pushl $0
; X86-NEXT: calll __udivdi3
; X86-NEXT: addl $16, %esp
; X86-NEXT: cmpl $2, %edx
-; X86-NEXT: movl $-1, %esi
-; X86-NEXT: cmovael %esi, %eax
+; X86-NEXT: movl $-1, %ecx
+; X86-NEXT: cmovael %ecx, %eax
; X86-NEXT: cmpl $1, %edx
-; X86-NEXT: movl $1, %ecx
-; X86-NEXT: cmovael %ecx, %edx
-; X86-NEXT: shldl $31, %eax, %edx
-; X86-NEXT: cmpl $2, %edi
-; X86-NEXT: cmovael %esi, %ebx
-; X86-NEXT: cmpl $1, %edi
-; X86-NEXT: cmovael %ecx, %edi
-; X86-NEXT: shldl $31, %ebx, %edi
-; X86-NEXT: cmpl $2, %ebp
-; X86-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NEXT: cmovael %esi, %eax
-; X86-NEXT: cmpl $1, %ebp
-; X86-NEXT: cmovael %ecx, %ebp
+; X86-NEXT: cmovbl %edx, %ebp
; X86-NEXT: shldl $31, %eax, %ebp
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: cmpl $2, %ebx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: cmovael %esi, %eax
-; X86-NEXT: cmpl $1, %ebx
-; X86-NEXT: cmovbl %ebx, %ecx
-; X86-NEXT: shldl $31, %eax, %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %ecx, 12(%eax)
-; X86-NEXT: movl %ebp, 8(%eax)
-; X86-NEXT: movl %edi, 4(%eax)
-; X86-NEXT: movl %edx, (%eax)
-; X86-NEXT: addl $12, %esp
+; X86-NEXT: movl %ebp, 12(%eax)
+; X86-NEXT: movl %edi, 8(%eax)
+; X86-NEXT: movl %ebx, 4(%eax)
+; X86-NEXT: movl %esi, (%eax)
+; X86-NEXT: addl $8, %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll
index 2858a2a2bdb0..97046ac71270 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll
@@ -36,7 +36,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
; SSE2-NEXT: psrlq %xmm4, %xmm1
; SSE2-NEXT: shufpd {{.*#+}} xmm5 = xmm5[0],xmm1[1]
; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: psllq $1, %xmm0
+; SSE2-NEXT: paddq %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psllq %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
@@ -56,12 +56,12 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
; SSE41-NEXT: psrlq %xmm4, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: pandn %xmm3, %xmm2
-; SSE41-NEXT: psllq $1, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psllq %xmm2, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
+; SSE41-NEXT: paddq %xmm0, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: psllq %xmm1, %xmm3
; SSE41-NEXT: psllq %xmm2, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
; SSE41-NEXT: por %xmm5, %xmm0
; SSE41-NEXT: retq
;
@@ -74,11 +74,11 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpsllq $1, %xmm0, %xmm0
-; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
+; AVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vpsllq %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
@@ -88,7 +88,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX2-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpsllq $1, %xmm0, %xmm0
+; AVX2-NEXT: vpaddq %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
@@ -99,7 +99,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512F-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT: vpsllq $1, %xmm0, %xmm0
+; AVX512F-NEXT: vpaddq %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: retq
@@ -110,7 +110,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VL-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; AVX512VL-NEXT: vpsllq $1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpaddq %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
@@ -121,7 +121,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512BW-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT: vpsllq $1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpaddq %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: retq
@@ -142,7 +142,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VLBW-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; AVX512VLBW-NEXT: vpsllq $1, %xmm0, %xmm0
+; AVX512VLBW-NEXT: vpaddq %xmm0, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
@@ -156,13 +156,13 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
; XOPAVX1-LABEL: var_funnnel_v2i64:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
-; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
-; XOPAVX1-NEXT: vpsllq $1, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpshlq %xmm4, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; XOPAVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2
-; XOPAVX1-NEXT: vpshlq %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
+; XOPAVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; XOPAVX1-NEXT: vpsubq %xmm4, %xmm5, %xmm4
+; XOPAVX1-NEXT: vpshlq %xmm4, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpshlq %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
@@ -172,7 +172,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
; XOPAVX2-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpsllq $1, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpaddq %xmm0, %xmm0, %xmm0
; XOPAVX2-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
@@ -188,7 +188,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
; X86-SSE2-NEXT: psrlq %xmm5, %xmm1
; X86-SSE2-NEXT: shufpd {{.*#+}} xmm3 = xmm3[0],xmm1[1]
; X86-SSE2-NEXT: pandn %xmm4, %xmm2
-; X86-SSE2-NEXT: psllq $1, %xmm0
+; X86-SSE2-NEXT: paddq %xmm0, %xmm0
; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
; X86-SSE2-NEXT: psllq %xmm2, %xmm1
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
@@ -225,7 +225,7 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt)
; SSE2-NEXT: pslld $23, %xmm2
; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE2-NEXT: cvttps2dq %xmm2, %xmm1
-; SSE2-NEXT: pslld $1, %xmm0
+; SSE2-NEXT: paddd %xmm0, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -260,7 +260,7 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt)
; SSE41-NEXT: pslld $23, %xmm2
; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT: cvttps2dq %xmm2, %xmm1
-; SSE41-NEXT: pslld $1, %xmm0
+; SSE41-NEXT: paddd %xmm0, %xmm0
; SSE41-NEXT: pmulld %xmm1, %xmm0
; SSE41-NEXT: por %xmm6, %xmm0
; SSE41-NEXT: retq
@@ -285,7 +285,7 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt)
; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
-; AVX1-NEXT: vpslld $1, %xmm0, %xmm0
+; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
@@ -296,7 +296,7 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt)
; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX2-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpslld $1, %xmm0, %xmm0
+; AVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
@@ -307,7 +307,7 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt)
; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512F-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT: vpslld $1, %xmm0, %xmm0
+; AVX512F-NEXT: vpaddd %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: retq
@@ -318,7 +318,7 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt)
; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VL-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; AVX512VL-NEXT: vpslld $1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpaddd %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
@@ -329,7 +329,7 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt)
; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512BW-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT: vpslld $1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpaddd %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: retq
@@ -350,7 +350,7 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt)
; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VLBW-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; AVX512VLBW-NEXT: vpslld $1, %xmm0, %xmm0
+; AVX512VLBW-NEXT: vpaddd %xmm0, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
@@ -364,13 +364,13 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt)
; XOPAVX1-LABEL: var_funnnel_v4i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [31,31,31,31]
-; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
-; XOPAVX1-NEXT: vpslld $1, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpshld %xmm4, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; XOPAVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2
-; XOPAVX1-NEXT: vpshld %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
+; XOPAVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; XOPAVX1-NEXT: vpsubd %xmm4, %xmm5, %xmm4
+; XOPAVX1-NEXT: vpshld %xmm4, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpshld %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
@@ -380,7 +380,7 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt)
; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
; XOPAVX2-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpslld $1, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0
; XOPAVX2-NEXT: vpsllvd %xmm2, %xmm0, %xmm0
; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
@@ -409,7 +409,7 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt)
; X86-SSE2-NEXT: pslld $23, %xmm2
; X86-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
; X86-SSE2-NEXT: cvttps2dq %xmm2, %xmm1
-; X86-SSE2-NEXT: pslld $1, %xmm0
+; X86-SSE2-NEXT: paddd %xmm0, %xmm0
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; X86-SSE2-NEXT: pmuludq %xmm1, %xmm0
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -473,7 +473,7 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
; SSE2-NEXT: pslld $16, %xmm2
; SSE2-NEXT: psrad $16, %xmm2
; SSE2-NEXT: packssdw %xmm4, %xmm2
-; SSE2-NEXT: psllw $1, %xmm0
+; SSE2-NEXT: paddw %xmm0, %xmm0
; SSE2-NEXT: pmullw %xmm2, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: por %xmm3, %xmm0
@@ -519,7 +519,7 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
; SSE41-NEXT: paddd %xmm4, %xmm0
; SSE41-NEXT: cvttps2dq %xmm0, %xmm0
; SSE41-NEXT: packusdw %xmm2, %xmm0
-; SSE41-NEXT: psllw $1, %xmm3
+; SSE41-NEXT: paddw %xmm3, %xmm3
; SSE41-NEXT: pmullw %xmm0, %xmm3
; SSE41-NEXT: por %xmm1, %xmm3
; SSE41-NEXT: movdqa %xmm3, %xmm0
@@ -554,7 +554,7 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpsllw $1, %xmm0, %xmm0
+; AVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
@@ -608,7 +608,7 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT: vpsllw $1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpaddw %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
@@ -630,7 +630,7 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VLBW-NEXT: vpsrlvw %xmm4, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; AVX512VLBW-NEXT: vpsllw $1, %xmm0, %xmm0
+; AVX512VLBW-NEXT: vpaddw %xmm0, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpsllvw %xmm2, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
@@ -644,13 +644,13 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
; XOP-LABEL: var_funnnel_v8i16:
; XOP: # %bb.0:
; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
-; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm4
-; XOP-NEXT: vpsllw $1, %xmm0, %xmm0
-; XOP-NEXT: vpshlw %xmm4, %xmm0, %xmm0
-; XOP-NEXT: vpand %xmm3, %xmm2, %xmm2
-; XOP-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; XOP-NEXT: vpsubw %xmm2, %xmm3, %xmm2
-; XOP-NEXT: vpshlw %xmm2, %xmm1, %xmm1
+; XOP-NEXT: vpand %xmm3, %xmm2, %xmm4
+; XOP-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; XOP-NEXT: vpsubw %xmm4, %xmm5, %xmm4
+; XOP-NEXT: vpshlw %xmm4, %xmm1, %xmm1
+; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; XOP-NEXT: vpaddw %xmm0, %xmm0, %xmm0
+; XOP-NEXT: vpshlw %xmm2, %xmm0, %xmm0
; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
@@ -703,7 +703,7 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
; X86-SSE2-NEXT: pslld $16, %xmm2
; X86-SSE2-NEXT: psrad $16, %xmm2
; X86-SSE2-NEXT: packssdw %xmm4, %xmm2
-; X86-SSE2-NEXT: psllw $1, %xmm0
+; X86-SSE2-NEXT: paddw %xmm0, %xmm0
; X86-SSE2-NEXT: pmullw %xmm2, %xmm0
; X86-SSE2-NEXT: por %xmm1, %xmm0
; X86-SSE2-NEXT: por %xmm3, %xmm0
@@ -1036,7 +1036,7 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
; SSE-NEXT: pand %xmm3, %xmm4
; SSE-NEXT: psrlq %xmm4, %xmm1
; SSE-NEXT: pandn %xmm3, %xmm2
-; SSE-NEXT: psllq $1, %xmm0
+; SSE-NEXT: paddq %xmm0, %xmm0
; SSE-NEXT: psllq %xmm2, %xmm0
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: retq
@@ -1047,7 +1047,7 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
; AVX-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
; AVX-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vpsllq $1, %xmm0, %xmm0
+; AVX-NEXT: vpaddq %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpsllq %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -1058,7 +1058,7 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512F-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT: vpsllq $1, %xmm0, %xmm0
+; AVX512F-NEXT: vpaddq %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vpsllq %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: retq
@@ -1069,7 +1069,7 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VL-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; AVX512VL-NEXT: vpsllq $1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpaddq %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT: vpsllq %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
@@ -1080,7 +1080,7 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512BW-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT: vpsllq $1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpaddq %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT: vpsllq %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: retq
@@ -1101,7 +1101,7 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VLBW-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; AVX512VLBW-NEXT: vpsllq $1, %xmm0, %xmm0
+; AVX512VLBW-NEXT: vpaddq %xmm0, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpsllq %xmm2, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
@@ -1119,7 +1119,7 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
; XOP-NEXT: vpand %xmm3, %xmm2, %xmm4
; XOP-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; XOP-NEXT: vpsllq $1, %xmm0, %xmm0
+; XOP-NEXT: vpaddq %xmm0, %xmm0, %xmm0
; XOP-NEXT: vpsllq %xmm2, %xmm0, %xmm0
; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
@@ -1131,7 +1131,7 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
; X86-SSE2-NEXT: pand %xmm3, %xmm4
; X86-SSE2-NEXT: psrlq %xmm4, %xmm1
; X86-SSE2-NEXT: pandn %xmm3, %xmm2
-; X86-SSE2-NEXT: psllq $1, %xmm0
+; X86-SSE2-NEXT: paddq %xmm0, %xmm0
; X86-SSE2-NEXT: psllq %xmm2, %xmm0
; X86-SSE2-NEXT: por %xmm1, %xmm0
; X86-SSE2-NEXT: retl
@@ -1256,7 +1256,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
; SSE-NEXT: pand %xmm3, %xmm4
; SSE-NEXT: psrlw %xmm4, %xmm1
; SSE-NEXT: pandn %xmm3, %xmm2
-; SSE-NEXT: psllw $1, %xmm0
+; SSE-NEXT: paddw %xmm0, %xmm0
; SSE-NEXT: psllw %xmm2, %xmm0
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: retq
@@ -1267,7 +1267,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
; AVX-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
; AVX-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vpsllw $1, %xmm0, %xmm0
+; AVX-NEXT: vpaddw %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpsllw %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -1278,7 +1278,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512F-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT: vpsllw $1, %xmm0, %xmm0
+; AVX512F-NEXT: vpaddw %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vpsllw %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: retq
@@ -1289,7 +1289,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VL-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; AVX512VL-NEXT: vpsllw $1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpaddw %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT: vpsllw %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
@@ -1300,7 +1300,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512BW-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT: vpsllw $1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpaddw %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT: vpsllw %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: retq
@@ -1321,7 +1321,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VLBW-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; AVX512VLBW-NEXT: vpsllw $1, %xmm0, %xmm0
+; AVX512VLBW-NEXT: vpaddw %xmm0, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpsllw %xmm2, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
@@ -1339,7 +1339,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
; XOP-NEXT: vpand %xmm3, %xmm2, %xmm4
; XOP-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; XOP-NEXT: vpsllw $1, %xmm0, %xmm0
+; XOP-NEXT: vpaddw %xmm0, %xmm0, %xmm0
; XOP-NEXT: vpsllw %xmm2, %xmm0, %xmm0
; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
@@ -1351,7 +1351,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
; X86-SSE2-NEXT: pand %xmm3, %xmm4
; X86-SSE2-NEXT: psrlw %xmm4, %xmm1
; X86-SSE2-NEXT: pandn %xmm3, %xmm2
-; X86-SSE2-NEXT: psllw $1, %xmm0
+; X86-SSE2-NEXT: paddw %xmm0, %xmm0
; X86-SSE2-NEXT: psllw %xmm2, %xmm0
; X86-SSE2-NEXT: por %xmm1, %xmm0
; X86-SSE2-NEXT: retl
@@ -1761,7 +1761,7 @@ define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: psllw $1, %xmm0
+; SSE2-NEXT: paddw %xmm0, %xmm0
; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
@@ -1772,7 +1772,7 @@ define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <u,32768,16384,8192,4096,2048,1024,512>
; SSE41-NEXT: pmulhuw %xmm1, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3,4,5,6,7]
-; SSE41-NEXT: psllw $1, %xmm0
+; SSE41-NEXT: paddw %xmm0, %xmm0
; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: retq
@@ -1781,7 +1781,7 @@ define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
; AVX: # %bb.0:
; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
-; AVX-NEXT: vpsllw $1, %xmm0, %xmm0
+; AVX-NEXT: vpaddw %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -1790,7 +1790,7 @@ define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
-; AVX512F-NEXT: vpsllw $1, %xmm0, %xmm0
+; AVX512F-NEXT: vpaddw %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: retq
@@ -1799,7 +1799,7 @@ define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpsllw $1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpaddw %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
@@ -1810,7 +1810,7 @@ define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7]
; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,14,13,12,11,10,9,8]
-; AVX512BW-NEXT: vpsllw $1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpaddw %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
@@ -1829,7 +1829,7 @@ define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
; AVX512VLBW-LABEL: constant_funnnel_v8i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX512VLBW-NEXT: vpsllw $1, %xmm0, %xmm0
+; AVX512VLBW-NEXT: vpaddw %xmm0, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
@@ -1843,7 +1843,7 @@ define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
; XOP-LABEL: constant_funnnel_v8i16:
; XOP: # %bb.0:
; XOP-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; XOP-NEXT: vpsllw $1, %xmm0, %xmm0
+; XOP-NEXT: vpaddw %xmm0, %xmm0, %xmm0
; XOP-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
@@ -1853,7 +1853,7 @@ define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
; X86-SSE2-NEXT: pandn %xmm1, %xmm2
; X86-SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE2-NEXT: psllw $1, %xmm0
+; X86-SSE2-NEXT: paddw %xmm0, %xmm0
; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT: por %xmm2, %xmm0
; X86-SSE2-NEXT: por %xmm1, %xmm0
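(An aside, not part of the patch.) To see the lowering change in isolation, a reduced .ll along the following lines can be fed to llc for an x86-64 target; the function name is purely illustrative. The funnel-shift lowering pre-shifts the first operand left by one before applying the variable shift amount, and it is that constant one-bit shl that the checks above now select as an add of the register to itself (paddw/vpaddw) rather than psllw $1:

declare <8 x i16> @llvm.fshr.v8i16(<8 x i16>, <8 x i16>, <8 x i16>)

define <8 x i16> @fshr_demo(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) {
  ; Lowered roughly as: srl of %y by (%amt & 15), shl of %x by 1 and then by
  ; (~%amt & 15), followed by an or; the shl-by-1 is the instruction that
  ; changed in this patch.
  %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
  ret <8 x i16> %res
}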
diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll
index 3a78ca84e861..6d7496e94476 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll
@@ -37,17 +37,17 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt)
; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
; AVX1-NEXT: vandnps %ymm3, %ymm2, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vpsllq $1, %xmm4, %xmm4
-; AVX1-NEXT: vpsllq %xmm3, %xmm4, %xmm5
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
-; AVX1-NEXT: vpsllq %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7]
-; AVX1-NEXT: vpsllq $1, %xmm0, %xmm0
-; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT: vpaddq %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpsllq %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpsllq %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm2[2,3,2,3]
+; AVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vpsllq %xmm4, %xmm0, %xmm4
; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
@@ -58,7 +58,7 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt)
; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm4
; AVX2-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1
; AVX2-NEXT: vpandn %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vpsllq $1, %ymm0, %ymm0
+; AVX2-NEXT: vpaddq %ymm0, %ymm0, %ymm0
; AVX2-NEXT: vpsllvq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
@@ -69,7 +69,7 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt)
; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4
; AVX512F-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1
; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpsllq $1, %ymm0, %ymm0
+; AVX512F-NEXT: vpaddq %ymm0, %ymm0, %ymm0
; AVX512F-NEXT: vpsllvq %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
@@ -80,7 +80,7 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt)
; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm4
; AVX512VL-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1
; AVX512VL-NEXT: vpandn %ymm3, %ymm2, %ymm2
-; AVX512VL-NEXT: vpsllq $1, %ymm0, %ymm0
+; AVX512VL-NEXT: vpaddq %ymm0, %ymm0, %ymm0
; AVX512VL-NEXT: vpsllvq %ymm2, %ymm0, %ymm0
; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
@@ -91,7 +91,7 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt)
; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4
; AVX512BW-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1
; AVX512BW-NEXT: vpandn %ymm3, %ymm2, %ymm2
-; AVX512BW-NEXT: vpsllq $1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpaddq %ymm0, %ymm0, %ymm0
; AVX512BW-NEXT: vpsllvq %ymm2, %ymm0, %ymm0
; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: retq
@@ -111,7 +111,7 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt)
; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4
; AVX512VLBW-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1
; AVX512VLBW-NEXT: vpandn %ymm3, %ymm2, %ymm2
-; AVX512VLBW-NEXT: vpsllq $1, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vpaddq %ymm0, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpsllvq %ymm2, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
@@ -125,23 +125,23 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt)
; XOPAVX1-LABEL: var_funnnel_v4i64:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vmovaps {{.*#+}} ymm3 = [63,63,63,63]
-; XOPAVX1-NEXT: vandnps %ymm3, %ymm2, %ymm4
+; XOPAVX1-NEXT: vandps %ymm3, %ymm2, %ymm4
; XOPAVX1-NEXT: vextractf128 $1, %ymm4, %xmm5
-; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
-; XOPAVX1-NEXT: vpsllq $1, %xmm6, %xmm6
-; XOPAVX1-NEXT: vpshlq %xmm5, %xmm6, %xmm5
-; XOPAVX1-NEXT: vpsllq $1, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpshlq %xmm4, %xmm0, %xmm0
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
-; XOPAVX1-NEXT: vandps %ymm3, %ymm2, %ymm2
+; XOPAVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; XOPAVX1-NEXT: vpsubq %xmm5, %xmm6, %xmm5
+; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
+; XOPAVX1-NEXT: vpshlq %xmm5, %xmm7, %xmm5
+; XOPAVX1-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; XOPAVX1-NEXT: vpshlq %xmm4, %xmm1, %xmm1
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
+; XOPAVX1-NEXT: vandnps %ymm3, %ymm2, %ymm2
; XOPAVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
-; XOPAVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; XOPAVX1-NEXT: vpsubq %xmm3, %xmm4, %xmm3
-; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
-; XOPAVX1-NEXT: vpshlq %xmm3, %xmm5, %xmm3
-; XOPAVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm2
-; XOPAVX1-NEXT: vpshlq %xmm2, %xmm1, %xmm1
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; XOPAVX1-NEXT: vpaddq %xmm4, %xmm4, %xmm4
+; XOPAVX1-NEXT: vpshlq %xmm3, %xmm4, %xmm3
+; XOPAVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpshlq %xmm2, %xmm0, %xmm0
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
@@ -151,7 +151,7 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt)
; XOPAVX2-NEXT: vpand %ymm3, %ymm2, %ymm4
; XOPAVX2-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1
; XOPAVX2-NEXT: vpandn %ymm3, %ymm2, %ymm2
-; XOPAVX2-NEXT: vpsllq $1, %ymm0, %ymm0
+; XOPAVX2-NEXT: vpaddq %ymm0, %ymm0, %ymm0
; XOPAVX2-NEXT: vpsllvq %ymm2, %ymm0, %ymm0
; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
@@ -184,7 +184,7 @@ define <8 x i32> @var_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt)
; AVX1-NEXT: vpaddd %xmm6, %xmm4, %xmm4
; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
-; AVX1-NEXT: vpslld $1, %xmm7, %xmm7
+; AVX1-NEXT: vpaddd %xmm7, %xmm7, %xmm7
; AVX1-NEXT: vpmulld %xmm4, %xmm7, %xmm4
; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpsrldq {{.*#+}} xmm4 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -202,7 +202,7 @@ define <8 x i32> @var_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt)
; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
-; AVX1-NEXT: vpslld $1, %xmm0, %xmm0
+; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
@@ -214,7 +214,7 @@ define <8 x i32> @var_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt)
; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm4
; AVX2-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1
; AVX2-NEXT: vpandn %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vpslld $1, %ymm0, %ymm0
+; AVX2-NEXT: vpaddd %ymm0, %ymm0, %ymm0
; AVX2-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
@@ -225,7 +225,7 @@ define <8 x i32> @var_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt)
; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4
; AVX512F-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1
; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpslld $1, %ymm0, %ymm0
+; AVX512F-NEXT: vpaddd %ymm0, %ymm0, %ymm0
; AVX512F-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
@@ -236,7 +236,7 @@ define <8 x i32> @var_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt)
; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm4
; AVX512VL-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1
; AVX512VL-NEXT: vpandn %ymm3, %ymm2, %ymm2
-; AVX512VL-NEXT: vpslld $1, %ymm0, %ymm0
+; AVX512VL-NEXT: vpaddd %ymm0, %ymm0, %ymm0
; AVX512VL-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
@@ -247,7 +247,7 @@ define <8 x i32> @var_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt)
; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4
; AVX512BW-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1
; AVX512BW-NEXT: vpandn %ymm3, %ymm2, %ymm2
-; AVX512BW-NEXT: vpslld $1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpaddd %ymm0, %ymm0, %ymm0
; AVX512BW-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: retq
@@ -267,7 +267,7 @@ define <8 x i32> @var_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt)
; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4
; AVX512VLBW-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1
; AVX512VLBW-NEXT: vpandn %ymm3, %ymm2, %ymm2
-; AVX512VLBW-NEXT: vpslld $1, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vpaddd %ymm0, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
@@ -289,13 +289,13 @@ define <8 x i32> @var_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt)
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [31,31,31,31]
; XOPAVX1-NEXT: vpxor %xmm6, %xmm3, %xmm3
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
-; XOPAVX1-NEXT: vpslld $1, %xmm7, %xmm7
+; XOPAVX1-NEXT: vpaddd %xmm7, %xmm7, %xmm7
; XOPAVX1-NEXT: vpshld %xmm3, %xmm7, %xmm3
; XOPAVX1-NEXT: vpor %xmm5, %xmm3, %xmm3
; XOPAVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; XOPAVX1-NEXT: vpshld %xmm4, %xmm1, %xmm1
; XOPAVX1-NEXT: vpxor %xmm6, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpslld $1, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT: vpshld %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
@@ -307,7 +307,7 @@ define <8 x i32> @var_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt)
; XOPAVX2-NEXT: vpand %ymm3, %ymm2, %ymm4
; XOPAVX2-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1
; XOPAVX2-NEXT: vpandn %ymm3, %ymm2, %ymm2
-; XOPAVX2-NEXT: vpslld $1, %ymm0, %ymm0
+; XOPAVX2-NEXT: vpaddd %ymm0, %ymm0, %ymm0
; XOPAVX2-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
@@ -348,7 +348,7 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %
; AVX1-NEXT: vcvttps2dq %xmm6, %xmm6
; AVX1-NEXT: vpackusdw %xmm7, %xmm6, %xmm6
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
-; AVX1-NEXT: vpsllw $1, %xmm7, %xmm7
+; AVX1-NEXT: vpaddw %xmm7, %xmm7, %xmm7
; AVX1-NEXT: vpmullw %xmm6, %xmm7, %xmm6
; AVX1-NEXT: vpor %xmm4, %xmm6, %xmm4
; AVX1-NEXT: vpsllw $12, %xmm2, %xmm6
@@ -375,7 +375,7 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %
; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vpsllw $1, %xmm0, %xmm0
+; AVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
@@ -427,7 +427,7 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %
; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4
; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
; AVX512BW-NEXT: vpandn %ymm3, %ymm2, %ymm2
-; AVX512BW-NEXT: vpsllw $1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpaddw %ymm0, %ymm0, %ymm0
; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: retq
@@ -447,7 +447,7 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %
; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4
; AVX512VLBW-NEXT: vpsrlvw %ymm4, %ymm1, %ymm1
; AVX512VLBW-NEXT: vpandn %ymm3, %ymm2, %ymm2
-; AVX512VLBW-NEXT: vpsllw $1, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vpaddw %ymm0, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpsllvw %ymm2, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
@@ -469,13 +469,13 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [15,15,15,15,15,15,15,15]
; XOPAVX1-NEXT: vpxor %xmm6, %xmm3, %xmm3
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
-; XOPAVX1-NEXT: vpsllw $1, %xmm7, %xmm7
+; XOPAVX1-NEXT: vpaddw %xmm7, %xmm7, %xmm7
; XOPAVX1-NEXT: vpshlw %xmm3, %xmm7, %xmm3
; XOPAVX1-NEXT: vpor %xmm5, %xmm3, %xmm3
; XOPAVX1-NEXT: vpsubw %xmm2, %xmm4, %xmm4
; XOPAVX1-NEXT: vpshlw %xmm4, %xmm1, %xmm1
; XOPAVX1-NEXT: vpxor %xmm6, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpsllw $1, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT: vpshlw %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
@@ -484,22 +484,22 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %
; XOPAVX2-LABEL: var_funnnel_v16i16:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; XOPAVX2-NEXT: vpandn %ymm3, %ymm2, %ymm4
+; XOPAVX2-NEXT: vpand %ymm3, %ymm2, %ymm4
; XOPAVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
-; XOPAVX2-NEXT: vpsllw $1, %ymm0, %ymm0
-; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm6
-; XOPAVX2-NEXT: vpshlw %xmm5, %xmm6, %xmm5
-; XOPAVX2-NEXT: vpshlw %xmm4, %xmm0, %xmm0
-; XOPAVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
-; XOPAVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
+; XOPAVX2-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; XOPAVX2-NEXT: vpsubw %xmm5, %xmm6, %xmm5
+; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm7
+; XOPAVX2-NEXT: vpshlw %xmm5, %xmm7, %xmm5
+; XOPAVX2-NEXT: vpsubw %xmm4, %xmm6, %xmm4
+; XOPAVX2-NEXT: vpshlw %xmm4, %xmm1, %xmm1
+; XOPAVX2-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1
+; XOPAVX2-NEXT: vpandn %ymm3, %ymm2, %ymm2
; XOPAVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
-; XOPAVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; XOPAVX2-NEXT: vpsubw %xmm3, %xmm4, %xmm3
-; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm5
-; XOPAVX2-NEXT: vpshlw %xmm3, %xmm5, %xmm3
-; XOPAVX2-NEXT: vpsubw %xmm2, %xmm4, %xmm2
-; XOPAVX2-NEXT: vpshlw %xmm2, %xmm1, %xmm1
-; XOPAVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; XOPAVX2-NEXT: vpaddw %ymm0, %ymm0, %ymm0
+; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
+; XOPAVX2-NEXT: vpshlw %xmm3, %xmm4, %xmm3
+; XOPAVX2-NEXT: vpshlw %xmm2, %xmm0, %xmm0
+; XOPAVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
%res = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %amt)
@@ -782,9 +782,9 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %
; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpsllq $1, %xmm3, %xmm3
+; AVX1-NEXT: vpaddq %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm3
-; AVX1-NEXT: vpsllq $1, %xmm0, %xmm0
+; AVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
@@ -796,7 +796,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %
; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX2-NEXT: vpsrlq %xmm4, %ymm1, %ymm1
; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpsllq $1, %ymm0, %ymm0
+; AVX2-NEXT: vpaddq %ymm0, %ymm0, %ymm0
; AVX2-NEXT: vpsllq %xmm2, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
@@ -807,7 +807,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %
; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512F-NEXT: vpsrlq %xmm4, %ymm1, %ymm1
; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT: vpsllq $1, %ymm0, %ymm0
+; AVX512F-NEXT: vpaddq %ymm0, %ymm0, %ymm0
; AVX512F-NEXT: vpsllq %xmm2, %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
@@ -818,7 +818,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %
; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VL-NEXT: vpsrlq %xmm4, %ymm1, %ymm1
; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; AVX512VL-NEXT: vpsllq $1, %ymm0, %ymm0
+; AVX512VL-NEXT: vpaddq %ymm0, %ymm0, %ymm0
; AVX512VL-NEXT: vpsllq %xmm2, %ymm0, %ymm0
; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
@@ -829,7 +829,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %
; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512BW-NEXT: vpsrlq %xmm4, %ymm1, %ymm1
; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT: vpsllq $1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpaddq %ymm0, %ymm0, %ymm0
; AVX512BW-NEXT: vpsllq %xmm2, %ymm0, %ymm0
; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: retq
@@ -849,7 +849,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %
; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VLBW-NEXT: vpsrlq %xmm4, %ymm1, %ymm1
; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; AVX512VLBW-NEXT: vpsllq $1, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vpaddq %ymm0, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpsllq %xmm2, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
@@ -871,9 +871,9 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %
; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; XOPAVX1-NEXT: vpsllq $1, %xmm3, %xmm3
+; XOPAVX1-NEXT: vpaddq %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm3
-; XOPAVX1-NEXT: vpsllq $1, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
@@ -885,7 +885,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %
; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
; XOPAVX2-NEXT: vpsrlq %xmm4, %ymm1, %ymm1
; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpsllq $1, %ymm0, %ymm0
+; XOPAVX2-NEXT: vpaddq %ymm0, %ymm0, %ymm0
; XOPAVX2-NEXT: vpsllq %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
@@ -1020,11 +1020,11 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
; AVX1-NEXT: vpsrlw %xmm4, %xmm5, %xmm5
; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpsllw $1, %xmm3, %xmm3
+; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpsllw %xmm2, %xmm3, %xmm3
; AVX1-NEXT: vpor %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpsllw $1, %xmm0, %xmm0
+; AVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vpsllw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
@@ -1036,7 +1036,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX2-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpsllw $1, %ymm0, %ymm0
+; AVX2-NEXT: vpaddw %ymm0, %ymm0, %ymm0
; AVX2-NEXT: vpsllw %xmm2, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
@@ -1047,7 +1047,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512F-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT: vpsllw $1, %ymm0, %ymm0
+; AVX512F-NEXT: vpaddw %ymm0, %ymm0, %ymm0
; AVX512F-NEXT: vpsllw %xmm2, %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
@@ -1058,7 +1058,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VL-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; AVX512VL-NEXT: vpsllw $1, %ymm0, %ymm0
+; AVX512VL-NEXT: vpaddw %ymm0, %ymm0, %ymm0
; AVX512VL-NEXT: vpsllw %xmm2, %ymm0, %ymm0
; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
@@ -1069,7 +1069,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512BW-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT: vpsllw $1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpaddw %ymm0, %ymm0, %ymm0
; AVX512BW-NEXT: vpsllw %xmm2, %ymm0, %ymm0
; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: retq
@@ -1089,7 +1089,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VLBW-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; AVX512VLBW-NEXT: vpsllw $1, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vpaddw %ymm0, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpsllw %xmm2, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
@@ -1109,11 +1109,11 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
; XOPAVX1-NEXT: vpsrlw %xmm4, %xmm5, %xmm5
; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; XOPAVX1-NEXT: vpsllw $1, %xmm3, %xmm3
+; XOPAVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT: vpsllw %xmm2, %xmm3, %xmm3
; XOPAVX1-NEXT: vpor %xmm5, %xmm3, %xmm3
; XOPAVX1-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
-; XOPAVX1-NEXT: vpsllw $1, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT: vpsllw %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
@@ -1125,7 +1125,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
; XOPAVX2-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpsllw $1, %ymm0, %ymm0
+; XOPAVX2-NEXT: vpaddw %ymm0, %ymm0, %ymm0
; XOPAVX2-NEXT: vpsllw %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
@@ -1494,10 +1494,10 @@ define <16 x i16> @constant_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y) nounwin
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX1-NEXT: vpsllw $1, %xmm0, %xmm2
+; AVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm2
; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpsllw $1, %xmm0, %xmm0
+; AVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
@@ -1508,7 +1508,7 @@ define <16 x i16> @constant_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y) nounwin
; AVX2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-NEXT: vpsllw $1, %ymm0, %ymm0
+; AVX2-NEXT: vpaddw %ymm0, %ymm0, %ymm0
; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
@@ -1518,7 +1518,7 @@ define <16 x i16> @constant_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y) nounwin
; AVX512F-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2
; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX512F-NEXT: vpsllw $1, %ymm0, %ymm0
+; AVX512F-NEXT: vpaddw %ymm0, %ymm0, %ymm0
; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
@@ -1528,7 +1528,7 @@ define <16 x i16> @constant_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y) nounwin
; AVX512VL-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2
; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
; AVX512VL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX512VL-NEXT: vpsllw $1, %ymm0, %ymm0
+; AVX512VL-NEXT: vpaddw %ymm0, %ymm0, %ymm0
; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
@@ -1539,7 +1539,7 @@ define <16 x i16> @constant_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y) nounwin
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
-; AVX512BW-NEXT: vpsllw $1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpaddw %ymm0, %ymm0, %ymm0
; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: retq
@@ -1556,7 +1556,7 @@ define <16 x i16> @constant_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y) nounwin
; AVX512VLBW-LABEL: constant_funnnel_v16i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX512VLBW-NEXT: vpsllw $1, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vpaddw %ymm0, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
@@ -1573,10 +1573,10 @@ define <16 x i16> @constant_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y) nounwin
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; XOPAVX1-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; XOPAVX1-NEXT: vpsllw $1, %xmm0, %xmm2
+; XOPAVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm2
; XOPAVX1-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; XOPAVX1-NEXT: vpsllw $1, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
@@ -1587,7 +1587,7 @@ define <16 x i16> @constant_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y) nounwin
; XOPAVX2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2
; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; XOPAVX2-NEXT: vpsllw $1, %ymm0, %ymm0
+; XOPAVX2-NEXT: vpaddw %ymm0, %ymm0, %ymm0
; XOPAVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-fshr-512.ll b/llvm/test/CodeGen/X86/vector-fshr-512.ll
index e9dd400c1f78..38998b8ab295 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-512.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-512.ll
@@ -22,7 +22,7 @@ define <8 x i64> @var_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt)
; AVX512F-NEXT: vpandq %zmm3, %zmm2, %zmm4
; AVX512F-NEXT: vpsrlvq %zmm4, %zmm1, %zmm1
; AVX512F-NEXT: vpandnq %zmm3, %zmm2, %zmm2
-; AVX512F-NEXT: vpsllq $1, %zmm0, %zmm0
+; AVX512F-NEXT: vpaddq %zmm0, %zmm0, %zmm0
; AVX512F-NEXT: vpsllvq %zmm2, %zmm0, %zmm0
; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
@@ -33,7 +33,7 @@ define <8 x i64> @var_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt)
; AVX512VL-NEXT: vpandq %zmm3, %zmm2, %zmm4
; AVX512VL-NEXT: vpsrlvq %zmm4, %zmm1, %zmm1
; AVX512VL-NEXT: vpandnq %zmm3, %zmm2, %zmm2
-; AVX512VL-NEXT: vpsllq $1, %zmm0, %zmm0
+; AVX512VL-NEXT: vpaddq %zmm0, %zmm0, %zmm0
; AVX512VL-NEXT: vpsllvq %zmm2, %zmm0, %zmm0
; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
@@ -44,7 +44,7 @@ define <8 x i64> @var_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt)
; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm4
; AVX512BW-NEXT: vpsrlvq %zmm4, %zmm1, %zmm1
; AVX512BW-NEXT: vpandnq %zmm3, %zmm2, %zmm2
-; AVX512BW-NEXT: vpsllq $1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddq %zmm0, %zmm0, %zmm0
; AVX512BW-NEXT: vpsllvq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
@@ -61,7 +61,7 @@ define <8 x i64> @var_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt)
; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm4
; AVX512VLBW-NEXT: vpsrlvq %zmm4, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpandnq %zmm3, %zmm2, %zmm2
-; AVX512VLBW-NEXT: vpsllq $1, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vpaddq %zmm0, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpsllvq %zmm2, %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
@@ -82,7 +82,7 @@ define <16 x i32> @var_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> %
; AVX512F-NEXT: vpandd %zmm3, %zmm2, %zmm4
; AVX512F-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
; AVX512F-NEXT: vpandnd %zmm3, %zmm2, %zmm2
-; AVX512F-NEXT: vpslld $1, %zmm0, %zmm0
+; AVX512F-NEXT: vpaddd %zmm0, %zmm0, %zmm0
; AVX512F-NEXT: vpsllvd %zmm2, %zmm0, %zmm0
; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
@@ -93,7 +93,7 @@ define <16 x i32> @var_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> %
; AVX512VL-NEXT: vpandd %zmm3, %zmm2, %zmm4
; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
; AVX512VL-NEXT: vpandnd %zmm3, %zmm2, %zmm2
-; AVX512VL-NEXT: vpslld $1, %zmm0, %zmm0
+; AVX512VL-NEXT: vpaddd %zmm0, %zmm0, %zmm0
; AVX512VL-NEXT: vpsllvd %zmm2, %zmm0, %zmm0
; AVX512VL-NEXT: vpord %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
@@ -104,7 +104,7 @@ define <16 x i32> @var_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> %
; AVX512BW-NEXT: vpandd %zmm3, %zmm2, %zmm4
; AVX512BW-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
; AVX512BW-NEXT: vpandnd %zmm3, %zmm2, %zmm2
-; AVX512BW-NEXT: vpslld $1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddd %zmm0, %zmm0, %zmm0
; AVX512BW-NEXT: vpsllvd %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpord %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
@@ -121,7 +121,7 @@ define <16 x i32> @var_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> %
; AVX512VLBW-NEXT: vpandd %zmm3, %zmm2, %zmm4
; AVX512VLBW-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpandnd %zmm3, %zmm2, %zmm2
-; AVX512VLBW-NEXT: vpslld $1, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vpaddd %zmm0, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpsllvd %zmm2, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpord %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
@@ -188,7 +188,7 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %
; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm4
; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
; AVX512BW-NEXT: vpandnq %zmm3, %zmm2, %zmm2
-; AVX512BW-NEXT: vpsllw $1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm0, %zmm0
; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
@@ -205,7 +205,7 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %
; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm4
; AVX512VLBW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpandnq %zmm3, %zmm2, %zmm2
-; AVX512VLBW-NEXT: vpsllw $1, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vpaddw %zmm0, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
@@ -428,7 +428,7 @@ define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %
; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512F-NEXT: vpsrlq %xmm4, %zmm1, %zmm1
; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT: vpsllq $1, %zmm0, %zmm0
+; AVX512F-NEXT: vpaddq %zmm0, %zmm0, %zmm0
; AVX512F-NEXT: vpsllq %xmm2, %zmm0, %zmm0
; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
@@ -439,7 +439,7 @@ define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %
; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VL-NEXT: vpsrlq %xmm4, %zmm1, %zmm1
; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; AVX512VL-NEXT: vpsllq $1, %zmm0, %zmm0
+; AVX512VL-NEXT: vpaddq %zmm0, %zmm0, %zmm0
; AVX512VL-NEXT: vpsllq %xmm2, %zmm0, %zmm0
; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
@@ -450,7 +450,7 @@ define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %
; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512BW-NEXT: vpsrlq %xmm4, %zmm1, %zmm1
; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT: vpsllq $1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddq %zmm0, %zmm0, %zmm0
; AVX512BW-NEXT: vpsllq %xmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
@@ -468,7 +468,7 @@ define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %
; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VLBW-NEXT: vpsrlq %xmm4, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; AVX512VLBW-NEXT: vpsllq $1, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vpaddq %zmm0, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpsllq %xmm2, %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
@@ -554,9 +554,9 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i
; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1
; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; AVX512F-NEXT: vpsllw $1, %ymm3, %ymm3
+; AVX512F-NEXT: vpaddw %ymm3, %ymm3, %ymm3
; AVX512F-NEXT: vpsllw %xmm2, %ymm3, %ymm3
-; AVX512F-NEXT: vpsllw $1, %ymm0, %ymm0
+; AVX512F-NEXT: vpaddw %ymm0, %ymm0, %ymm0
; AVX512F-NEXT: vpsllw %xmm2, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0
@@ -572,9 +572,9 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i
; AVX512VL-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1
; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; AVX512VL-NEXT: vpsllw $1, %ymm3, %ymm3
+; AVX512VL-NEXT: vpaddw %ymm3, %ymm3, %ymm3
; AVX512VL-NEXT: vpsllw %xmm2, %ymm3, %ymm3
-; AVX512VL-NEXT: vpsllw $1, %ymm0, %ymm0
+; AVX512VL-NEXT: vpaddw %ymm0, %ymm0, %ymm0
; AVX512VL-NEXT: vpsllw %xmm2, %ymm0, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0
@@ -586,7 +586,7 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i
; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512BW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1
; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT: vpsllw $1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm0, %zmm0
; AVX512BW-NEXT: vpsllw %xmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
@@ -604,7 +604,7 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i
; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VLBW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; AVX512VLBW-NEXT: vpsllw $1, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vpaddw %zmm0, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpsllw %xmm2, %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
@@ -853,7 +853,7 @@ define <32 x i16> @constant_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y) nounwin
; AVX512BW-LABEL: constant_funnnel_v32i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
-; AVX512BW-NEXT: vpsllw $1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm0, %zmm0
; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
@@ -867,7 +867,7 @@ define <32 x i16> @constant_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y) nounwin
; AVX512VLBW-LABEL: constant_funnnel_v32i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
-; AVX512VLBW-NEXT: vpsllw $1, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vpaddw %zmm0, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
index e1dcaa82b327..fa9560059a16 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
@@ -963,7 +963,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind
; SSE41-NEXT: movdqa %xmm0, %xmm4
; SSE41-NEXT: psrlw %xmm3, %xmm4
; SSE41-NEXT: pandn %xmm2, %xmm1
-; SSE41-NEXT: psllw $1, %xmm0
+; SSE41-NEXT: paddw %xmm0, %xmm0
; SSE41-NEXT: psllw %xmm1, %xmm0
; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: retq
@@ -974,7 +974,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind
; AVX-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX-NEXT: vpsrlw %xmm3, %xmm0, %xmm3
; AVX-NEXT: vpandn %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpsllw $1, %xmm0, %xmm0
+; AVX-NEXT: vpaddw %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpsllw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpor %xmm3, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -985,7 +985,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind
; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX512F-NEXT: vpsrlw %xmm3, %xmm0, %xmm3
; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm1
-; AVX512F-NEXT: vpsllw $1, %xmm0, %xmm0
+; AVX512F-NEXT: vpaddw %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vpsllw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpor %xmm3, %xmm0, %xmm0
; AVX512F-NEXT: retq
@@ -996,7 +996,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind
; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX512VL-NEXT: vpsrlw %xmm3, %xmm0, %xmm3
; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm1
-; AVX512VL-NEXT: vpsllw $1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpaddw %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT: vpsllw %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vpor %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT: retq
@@ -1007,7 +1007,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind
; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX512BW-NEXT: vpsrlw %xmm3, %xmm0, %xmm3
; AVX512BW-NEXT: vpandn %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT: vpsllw $1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpaddw %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT: vpsllw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpor %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT: retq
@@ -1018,7 +1018,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind
; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX512VLBW-NEXT: vpsrlw %xmm3, %xmm0, %xmm3
; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm1
-; AVX512VLBW-NEXT: vpsllw $1, %xmm0, %xmm0
+; AVX512VLBW-NEXT: vpaddw %xmm0, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpsllw %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpor %xmm3, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
index 1c00644d8e77..1380a5496eaa 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
@@ -789,11 +789,11 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounw
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpsrlw %xmm3, %xmm4, %xmm5
; AVX1-NEXT: vpandn %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpsllw $1, %xmm4, %xmm2
+; AVX1-NEXT: vpaddw %xmm4, %xmm4, %xmm2
; AVX1-NEXT: vpsllw %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpor %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw %xmm3, %xmm0, %xmm3
-; AVX1-NEXT: vpsllw $1, %xmm0, %xmm0
+; AVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
@@ -805,7 +805,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounw
; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX2-NEXT: vpsrlw %xmm3, %ymm0, %ymm3
; AVX2-NEXT: vpandn %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpsllw $1, %ymm0, %ymm0
+; AVX2-NEXT: vpaddw %ymm0, %ymm0, %ymm0
; AVX2-NEXT: vpsllw %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm3, %ymm0, %ymm0
; AVX2-NEXT: retq
@@ -816,7 +816,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounw
; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX512F-NEXT: vpsrlw %xmm3, %ymm0, %ymm3
; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm1
-; AVX512F-NEXT: vpsllw $1, %ymm0, %ymm0
+; AVX512F-NEXT: vpaddw %ymm0, %ymm0, %ymm0
; AVX512F-NEXT: vpsllw %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: retq
@@ -827,7 +827,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounw
; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX512VL-NEXT: vpsrlw %xmm3, %ymm0, %ymm3
; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm1
-; AVX512VL-NEXT: vpsllw $1, %ymm0, %ymm0
+; AVX512VL-NEXT: vpaddw %ymm0, %ymm0, %ymm0
; AVX512VL-NEXT: vpsllw %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: retq
@@ -838,7 +838,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounw
; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX512BW-NEXT: vpsrlw %xmm3, %ymm0, %ymm3
; AVX512BW-NEXT: vpandn %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT: vpsllw $1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpaddw %ymm0, %ymm0, %ymm0
; AVX512BW-NEXT: vpsllw %xmm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpor %ymm3, %ymm0, %ymm0
; AVX512BW-NEXT: retq
@@ -849,7 +849,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounw
; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX512VLBW-NEXT: vpsrlw %xmm3, %ymm0, %ymm3
; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm1
-; AVX512VLBW-NEXT: vpsllw $1, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vpaddw %ymm0, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpsllw %xmm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpor %ymm3, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll
index 3a533f6bc20f..59f3e0c85573 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll
@@ -301,9 +301,9 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounw
; AVX512F-NEXT: vpsrlw %xmm3, %ymm0, %ymm3
; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3
; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm1
-; AVX512F-NEXT: vpsllw $1, %ymm4, %ymm2
+; AVX512F-NEXT: vpaddw %ymm4, %ymm4, %ymm2
; AVX512F-NEXT: vpsllw %xmm1, %ymm2, %ymm2
-; AVX512F-NEXT: vpsllw $1, %ymm0, %ymm0
+; AVX512F-NEXT: vpaddw %ymm0, %ymm0, %ymm0
; AVX512F-NEXT: vpsllw %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: vporq %zmm3, %zmm0, %zmm0
@@ -318,9 +318,9 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounw
; AVX512VL-NEXT: vpsrlw %xmm3, %ymm0, %ymm3
; AVX512VL-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3
; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm1
-; AVX512VL-NEXT: vpsllw $1, %ymm4, %ymm2
+; AVX512VL-NEXT: vpaddw %ymm4, %ymm4, %ymm2
; AVX512VL-NEXT: vpsllw %xmm1, %ymm2, %ymm2
-; AVX512VL-NEXT: vpsllw $1, %ymm0, %ymm0
+; AVX512VL-NEXT: vpaddw %ymm0, %ymm0, %ymm0
; AVX512VL-NEXT: vpsllw %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-NEXT: vporq %zmm3, %zmm0, %zmm0
@@ -332,7 +332,7 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounw
; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX512BW-NEXT: vpsrlw %xmm3, %zmm0, %zmm3
; AVX512BW-NEXT: vpandn %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT: vpsllw $1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm0, %zmm0
; AVX512BW-NEXT: vpsllw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0
; AVX512BW-NEXT: retq
@@ -343,7 +343,7 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounw
; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX512VLBW-NEXT: vpsrlw %xmm3, %zmm0, %zmm3
; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm1
-; AVX512VLBW-NEXT: vpsllw $1, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vpaddw %zmm0, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpsllw %xmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll b/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll
index 563056544e69..505d9c8463c1 100644
--- a/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll
+++ b/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll
@@ -18,7 +18,7 @@ define <4 x i16> @smulfix(<4 x i16> %a) {
; CHECK-NEXT: pmullw %xmm1, %xmm2
; CHECK-NEXT: psrlw $15, %xmm2
; CHECK-NEXT: pmulhw %xmm1, %xmm0
-; CHECK-NEXT: psllw $1, %xmm0
+; CHECK-NEXT: paddw %xmm0, %xmm0
; CHECK-NEXT: por %xmm2, %xmm0
; CHECK-NEXT: retq
%t = call <4 x i16> @llvm.smul.fix.v4i16(<4 x i16> <i16 1, i16 2, i16 3, i16 4>, <4 x i16> %a, i32 15)
@@ -33,7 +33,7 @@ define <4 x i16> @umulfix(<4 x i16> %a) {
; CHECK-NEXT: pmullw %xmm1, %xmm2
; CHECK-NEXT: psrlw $15, %xmm2
; CHECK-NEXT: pmulhuw %xmm1, %xmm0
-; CHECK-NEXT: psllw $1, %xmm0
+; CHECK-NEXT: paddw %xmm0, %xmm0
; CHECK-NEXT: por %xmm2, %xmm0
; CHECK-NEXT: retq
%t = call <4 x i16> @llvm.umul.fix.v4i16(<4 x i16> <i16 1, i16 2, i16 3, i16 4>, <4 x i16> %a, i32 15)
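(Also an aside.) The reason a one-bit shift appears in these scale-15 fixed-point multiplies at all: each 16-bit lane is computed as (mulhi << 1) | (mullo >> 15), i.e. the 32-bit product shifted right by 15, and it is the one-bit left shift of the high half that is now selected as paddw. A per-lane scalar model of what llvm.smul.fix with a scale of 15 computes — purely illustrative; the vector sequence above is the actual lowering — is:

define i16 @smulfix_lane(i16 %x, i16 %y) {
  %xe = sext i16 %x to i32
  %ye = sext i16 %y to i32
  %m  = mul i32 %xe, %ye        ; full 32-bit product
  %s  = ashr i32 %m, 15         ; drop the 15 fractional bits
  %r  = trunc i32 %s to i16
  ret i16 %r
}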
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
index a745edec73e8..80c6f77383f8 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
@@ -927,23 +927,23 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: constant_shift_v2i64:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psllq $1, %xmm1
-; SSE2-NEXT: psllq $7, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT: psllq $7, %xmm1
+; SSE2-NEXT: paddq %xmm0, %xmm0
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v2i64:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psllq $7, %xmm1
-; SSE41-NEXT: psllq $1, %xmm0
+; SSE41-NEXT: paddq %xmm0, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v2i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsllq $7, %xmm0, %xmm1
-; AVX1-NEXT: vpsllq $1, %xmm0, %xmm0
+; AVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: retq
;
@@ -975,9 +975,9 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind {
; X86-SSE-LABEL: constant_shift_v2i64:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE-NEXT: psllq $1, %xmm1
-; X86-SSE-NEXT: psllq $7, %xmm0
-; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X86-SSE-NEXT: psllq $7, %xmm1
+; X86-SSE-NEXT: paddq %xmm0, %xmm0
+; X86-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; X86-SSE-NEXT: retl
%shift = shl <2 x i64> %a, <i64 1, i64 7>
ret <2 x i64> %shift
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
index 3867094346ce..d45d9da94f52 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
@@ -1061,7 +1061,7 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind {
; AVX1-NEXT: vpsllq $31, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpsllq $7, %xmm0, %xmm2
-; AVX1-NEXT: vpsllq $1, %xmm0, %xmm0
+; AVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
@@ -1101,7 +1101,7 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind {
; X86-AVX1-NEXT: vpsllq $31, %xmm1, %xmm1
; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
; X86-AVX1-NEXT: vpsllq $7, %xmm0, %xmm2
-; X86-AVX1-NEXT: vpsllq $1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0
; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-AVX1-NEXT: retl
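(A final aside.) The constant-shift tests above use non-uniform shift amounts, so only the component shifted by one changes; the other amounts still go through an immediate vpsllq plus a blend. A sketch of the IR shape behind those checks — the lane constants here are chosen only for illustration and are not the exact values from the test file — is:

define <4 x i64> @shl_mixed(<4 x i64> %a) {
  ; the shift-by-1 component is emitted as vpaddq of the register with
  ; itself; the remaining lanes keep their immediate vpsllq shifts
  %r = shl <4 x i64> %a, <i64 1, i64 7, i64 31, i64 62>
  ret <4 x i64> %r
}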