[llvm] f1d8345 - [TwoAddressInstructionPass] Create register mapping for registers with multiple uses in the current MBB
Guozhi Wei via llvm-commits
llvm-commits at lists.llvm.org
Mon Nov 29 19:05:24 PST 2021
Author: Guozhi Wei
Date: 2021-11-29T19:01:59-08:00
New Revision: f1d8345a2ab3c343929212d1c62174cfaa46e71a
URL: https://github.com/llvm/llvm-project/commit/f1d8345a2ab3c343929212d1c62174cfaa46e71a
DIFF: https://github.com/llvm/llvm-project/commit/f1d8345a2ab3c343929212d1c62174cfaa46e71a.diff
LOG: [TwoAddressInstructionPass] Create register mapping for registers with multiple uses in the current MBB
Currently we create register mappings only for registers with a single use, and
only when that use is in the current MBB. For registers with multiple uses, when
all of the uses are in the current MBB, we can create mappings for them in the
same way, based on the last use.
For example:
  %reg101 = ...
           = ... %reg101
  %reg103 = ADD %reg101, %reg102
Since the ADD is the last use of %reg101 in the block, we can create a mapping
between %reg101 and %reg103.
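In C++ terms, the patch replaces the old hasOneNonDBGUse() check in
findOnlyInterestingUse with a scan over all non-debug uses. A minimal annotated
sketch of that core loop (restated from the diff below with comments added; a
sketch, not a drop-in replacement for the actual code):
  // Walk every non-debug use of Reg. If any use is outside the current MBB
  // we cannot identify a safe last use, so give up. Otherwise remember the
  // use that kills Reg -- by definition the last use in this block.
  MachineOperand *UseOp = nullptr;
  for (MachineOperand &MO : MRI->use_nodbg_operands(Reg)) {
    MachineInstr *MI = MO.getParent();
    if (MI->getParent() != MBB)
      return nullptr;                  // a use escapes the block
    if (isPlainlyKilled(MI, Reg, LIS))
      UseOp = &MO;                     // the killing (last) use
  }
  if (!UseOp)
    return nullptr;                    // no killing use found
The mapping is then created from that last use, just as it previously was from
the single use.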
Differential Revision: https://reviews.llvm.org/D113193
Added:
Modified:
llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
llvm/test/CodeGen/ARM/fpclamptosat.ll
llvm/test/CodeGen/ARM/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll
llvm/test/CodeGen/ARM/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
llvm/test/CodeGen/ARM/ssat.ll
llvm/test/CodeGen/ARM/usat.ll
llvm/test/CodeGen/SystemZ/int-div-01.ll
llvm/test/CodeGen/SystemZ/int-div-03.ll
llvm/test/CodeGen/SystemZ/int-div-04.ll
llvm/test/CodeGen/SystemZ/int-mul-08.ll
llvm/test/CodeGen/Thumb/srem-seteq-illegal-types.ll
llvm/test/CodeGen/X86/64-bit-shift-by-32-minus-y.ll
llvm/test/CodeGen/X86/8bit_cmov_of_trunc_promotion.ll
llvm/test/CodeGen/X86/atomic-unordered.ll
llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
llvm/test/CodeGen/X86/bitreverse.ll
llvm/test/CodeGen/X86/bmi2.ll
llvm/test/CodeGen/X86/bypass-slow-division-32.ll
llvm/test/CodeGen/X86/combine-bitselect.ll
llvm/test/CodeGen/X86/combine-sdiv.ll
llvm/test/CodeGen/X86/ctpop-combine.ll
llvm/test/CodeGen/X86/divide-by-constant.ll
llvm/test/CodeGen/X86/fpclamptosat.ll
llvm/test/CodeGen/X86/haddsub-3.ll
llvm/test/CodeGen/X86/haddsub-shuf.ll
llvm/test/CodeGen/X86/haddsub.ll
llvm/test/CodeGen/X86/horizontal-reduce-fadd.ll
llvm/test/CodeGen/X86/horizontal-sum.ll
llvm/test/CodeGen/X86/lzcnt-cmp.ll
llvm/test/CodeGen/X86/nontemporal-loads.ll
llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll
llvm/test/CodeGen/X86/pmulh.ll
llvm/test/CodeGen/X86/popcnt.ll
llvm/test/CodeGen/X86/pull-binop-through-shift.ll
llvm/test/CodeGen/X86/rem.ll
llvm/test/CodeGen/X86/sat-add.ll
llvm/test/CodeGen/X86/sdiv_fix_sat.ll
llvm/test/CodeGen/X86/setcc-combine.ll
llvm/test/CodeGen/X86/shift-combine.ll
llvm/test/CodeGen/X86/shl-crash-on-legalize.ll
llvm/test/CodeGen/X86/slow-pmulld.ll
llvm/test/CodeGen/X86/smul_fix.ll
llvm/test/CodeGen/X86/smul_fix_sat.ll
llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll
llvm/test/CodeGen/X86/uadd_sat.ll
llvm/test/CodeGen/X86/uadd_sat_vec.ll
llvm/test/CodeGen/X86/umul-with-overflow.ll
llvm/test/CodeGen/X86/umul_fix.ll
llvm/test/CodeGen/X86/umul_fix_sat.ll
llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
llvm/test/CodeGen/X86/vec-strict-cmp-128.ll
llvm/test/CodeGen/X86/vec_ctbits.ll
llvm/test/CodeGen/X86/vec_umulo.ll
llvm/test/CodeGen/X86/vector-bitreverse.ll
llvm/test/CodeGen/X86/vector-ext-logic.ll
llvm/test/CodeGen/X86/vector-fshl-128.ll
llvm/test/CodeGen/X86/vector-fshr-128.ll
llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll
llvm/test/CodeGen/X86/vector-lzcnt-128.ll
llvm/test/CodeGen/X86/vector-lzcnt-sub128.ll
llvm/test/CodeGen/X86/vector-narrow-binop.ll
llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll
llvm/test/CodeGen/X86/vector-popcnt-128.ll
llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll
llvm/test/CodeGen/X86/vector-reduce-fmul-fast.ll
llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
llvm/test/CodeGen/X86/vector-shift-lshr-128.ll
llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll
llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll
llvm/test/CodeGen/X86/vector-trunc-packus.ll
llvm/test/CodeGen/X86/vector-trunc-ssat.ll
llvm/test/CodeGen/X86/vector-trunc-usat.ll
llvm/test/CodeGen/X86/vector-tzcnt-128.ll
llvm/test/CodeGen/X86/vselect-packss.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
index 627f90b834bd0..dfd962be2882a 100644
--- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -373,19 +373,25 @@ static bool isTwoAddrUse(MachineInstr &MI, Register Reg, Register &DstReg) {
return false;
}
-/// Given a register, if has a single in-basic block use, return the use
-/// instruction if it's a copy or a two-address use.
+/// Given a register, if all its uses are in the same basic block, return the
+/// last use instruction if it's a copy or a two-address use.
static MachineInstr *
findOnlyInterestingUse(Register Reg, MachineBasicBlock *MBB,
MachineRegisterInfo *MRI, const TargetInstrInfo *TII,
- bool &IsCopy, Register &DstReg, bool &IsDstPhys) {
- if (!MRI->hasOneNonDBGUse(Reg))
- // None or more than one use.
- return nullptr;
- MachineOperand &UseOp = *MRI->use_nodbg_begin(Reg);
- MachineInstr &UseMI = *UseOp.getParent();
- if (UseMI.getParent() != MBB)
+ bool &IsCopy, Register &DstReg, bool &IsDstPhys,
+ LiveIntervals *LIS) {
+ MachineOperand *UseOp = nullptr;
+ for (MachineOperand &MO : MRI->use_nodbg_operands(Reg)) {
+ MachineInstr *MI = MO.getParent();
+ if (MI->getParent() != MBB)
+ return nullptr;
+ if (isPlainlyKilled(MI, Reg, LIS))
+ UseOp = &MO;
+ }
+ if (!UseOp)
return nullptr;
+ MachineInstr &UseMI = *UseOp->getParent();
+
Register SrcReg;
bool IsSrcPhys;
if (isCopyToReg(UseMI, TII, SrcReg, DstReg, IsSrcPhys, IsDstPhys)) {
@@ -399,7 +405,7 @@ findOnlyInterestingUse(Register Reg, MachineBasicBlock *MBB,
}
if (UseMI.isCommutable()) {
unsigned Src1 = TargetInstrInfo::CommuteAnyOperandIndex;
- unsigned Src2 = UseMI.getOperandNo(&UseOp);
+ unsigned Src2 = UseMI.getOperandNo(UseOp);
if (TII->findCommutedOpIndices(UseMI, Src1, Src2)) {
MachineOperand &MO = UseMI.getOperand(Src1);
if (MO.isReg() && MO.isUse() &&
@@ -722,7 +728,7 @@ void TwoAddressInstructionPass::scanUses(Register DstReg) {
Register NewReg;
Register Reg = DstReg;
while (MachineInstr *UseMI = findOnlyInterestingUse(Reg, MBB, MRI, TII,IsCopy,
- NewReg, IsDstPhys)) {
+ NewReg, IsDstPhys, LIS)) {
if (IsCopy && !Processed.insert(UseMI).second)
break;
diff --git a/llvm/test/CodeGen/ARM/fpclamptosat.ll b/llvm/test/CodeGen/ARM/fpclamptosat.ll
index 9d5308ba0e709..eff17552fe529 100644
--- a/llvm/test/CodeGen/ARM/fpclamptosat.ll
+++ b/llvm/test/CodeGen/ARM/fpclamptosat.ll
@@ -57,22 +57,21 @@ define i32 @stest_f64i32(double %x) {
; VFP2-NEXT: push {r7, lr}
; VFP2-NEXT: vmov r0, r1, d0
; VFP2-NEXT: bl __aeabi_d2lz
-; VFP2-NEXT: mvn r2, #-2147483648
-; VFP2-NEXT: subs r3, r0, r2
-; VFP2-NEXT: mov.w r12, #0
+; VFP2-NEXT: mvn r12, #-2147483648
+; VFP2-NEXT: subs.w r3, r0, r12
+; VFP2-NEXT: mov.w r2, #0
; VFP2-NEXT: sbcs r3, r1, #0
; VFP2-NEXT: it lt
-; VFP2-NEXT: movlt.w r12, #1
-; VFP2-NEXT: cmp.w r12, #0
-; VFP2-NEXT: itt ne
-; VFP2-NEXT: movne r12, r1
-; VFP2-NEXT: movne r2, r0
-; VFP2-NEXT: mov.w r0, #-1
-; VFP2-NEXT: rsbs.w r1, r2, #-2147483648
-; VFP2-NEXT: sbcs.w r0, r0, r12
+; VFP2-NEXT: movlt r2, #1
+; VFP2-NEXT: cmp r2, #0
+; VFP2-NEXT: ite ne
+; VFP2-NEXT: movne r2, r1
+; VFP2-NEXT: moveq r0, r12
+; VFP2-NEXT: mov.w r1, #-1
+; VFP2-NEXT: rsbs.w r3, r0, #-2147483648
+; VFP2-NEXT: sbcs r1, r2
; VFP2-NEXT: it ge
-; VFP2-NEXT: movge.w r2, #-2147483648
-; VFP2-NEXT: mov r0, r2
+; VFP2-NEXT: movge.w r0, #-2147483648
; VFP2-NEXT: pop {r7, pc}
;
; FULL-LABEL: stest_f64i32:
@@ -293,22 +292,21 @@ define i32 @stest_f32i32(float %x) {
; VFP2-NEXT: push {r7, lr}
; VFP2-NEXT: vmov r0, s0
; VFP2-NEXT: bl __aeabi_f2lz
-; VFP2-NEXT: mvn r2, #-2147483648
-; VFP2-NEXT: subs r3, r0, r2
-; VFP2-NEXT: mov.w r12, #0
+; VFP2-NEXT: mvn r12, #-2147483648
+; VFP2-NEXT: subs.w r3, r0, r12
+; VFP2-NEXT: mov.w r2, #0
; VFP2-NEXT: sbcs r3, r1, #0
; VFP2-NEXT: it lt
-; VFP2-NEXT: movlt.w r12, #1
-; VFP2-NEXT: cmp.w r12, #0
-; VFP2-NEXT: itt ne
-; VFP2-NEXT: movne r12, r1
-; VFP2-NEXT: movne r2, r0
-; VFP2-NEXT: mov.w r0, #-1
-; VFP2-NEXT: rsbs.w r1, r2, #-2147483648
-; VFP2-NEXT: sbcs.w r0, r0, r12
+; VFP2-NEXT: movlt r2, #1
+; VFP2-NEXT: cmp r2, #0
+; VFP2-NEXT: ite ne
+; VFP2-NEXT: movne r2, r1
+; VFP2-NEXT: moveq r0, r12
+; VFP2-NEXT: mov.w r1, #-1
+; VFP2-NEXT: rsbs.w r3, r0, #-2147483648
+; VFP2-NEXT: sbcs r1, r2
; VFP2-NEXT: it ge
-; VFP2-NEXT: movge.w r2, #-2147483648
-; VFP2-NEXT: mov r0, r2
+; VFP2-NEXT: movge.w r0, #-2147483648
; VFP2-NEXT: pop {r7, pc}
;
; FULL-LABEL: stest_f32i32:
@@ -532,22 +530,21 @@ define i32 @stest_f16i32(half %x) {
; VFP2-NEXT: vmov r0, s0
; VFP2-NEXT: bl __aeabi_h2f
; VFP2-NEXT: bl __aeabi_f2lz
-; VFP2-NEXT: mvn r2, #-2147483648
-; VFP2-NEXT: subs r3, r0, r2
-; VFP2-NEXT: mov.w r12, #0
+; VFP2-NEXT: mvn r12, #-2147483648
+; VFP2-NEXT: subs.w r3, r0, r12
+; VFP2-NEXT: mov.w r2, #0
; VFP2-NEXT: sbcs r3, r1, #0
; VFP2-NEXT: it lt
-; VFP2-NEXT: movlt.w r12, #1
-; VFP2-NEXT: cmp.w r12, #0
-; VFP2-NEXT: itt ne
-; VFP2-NEXT: movne r12, r1
-; VFP2-NEXT: movne r2, r0
-; VFP2-NEXT: mov.w r0, #-1
-; VFP2-NEXT: rsbs.w r1, r2, #-2147483648
-; VFP2-NEXT: sbcs.w r0, r0, r12
+; VFP2-NEXT: movlt r2, #1
+; VFP2-NEXT: cmp r2, #0
+; VFP2-NEXT: ite ne
+; VFP2-NEXT: movne r2, r1
+; VFP2-NEXT: moveq r0, r12
+; VFP2-NEXT: mov.w r1, #-1
+; VFP2-NEXT: rsbs.w r3, r0, #-2147483648
+; VFP2-NEXT: sbcs r1, r2
; VFP2-NEXT: it ge
-; VFP2-NEXT: movge.w r2, #-2147483648
-; VFP2-NEXT: mov r0, r2
+; VFP2-NEXT: movge.w r0, #-2147483648
; VFP2-NEXT: pop {r7, pc}
;
; FULL-LABEL: stest_f16i32:
@@ -1256,26 +1253,25 @@ define i64 @stest_f64i64(double %x) {
; VFP2-NEXT: push {r4, r5, r7, lr}
; VFP2-NEXT: bl __fixdfti
; VFP2-NEXT: subs.w r4, r0, #-1
-; VFP2-NEXT: mov r12, r1
-; VFP2-NEXT: mvn r1, #-2147483648
-; VFP2-NEXT: sbcs.w r4, r12, r1
+; VFP2-NEXT: mvn lr, #-2147483648
+; VFP2-NEXT: sbcs.w r4, r1, lr
+; VFP2-NEXT: mov.w r12, #0
; VFP2-NEXT: sbcs r4, r2, #0
-; VFP2-NEXT: mov.w lr, #0
; VFP2-NEXT: sbcs r4, r3, #0
; VFP2-NEXT: mov.w r4, #0
; VFP2-NEXT: it lt
; VFP2-NEXT: movlt r4, #1
; VFP2-NEXT: cmp r4, #0
-; VFP2-NEXT: itee eq
+; VFP2-NEXT: itet eq
; VFP2-NEXT: moveq r3, r4
; VFP2-NEXT: movne r4, r2
-; VFP2-NEXT: movne r1, r12
+; VFP2-NEXT: moveq r1, lr
; VFP2-NEXT: mov.w r2, #-1
; VFP2-NEXT: it eq
; VFP2-NEXT: moveq r0, r2
; VFP2-NEXT: rsbs r5, r0, #0
-; VFP2-NEXT: mov.w r12, #-2147483648
-; VFP2-NEXT: sbcs.w r5, r12, r1
+; VFP2-NEXT: mov.w lr, #-2147483648
+; VFP2-NEXT: sbcs.w r5, lr, r1
; VFP2-NEXT: sbcs.w r4, r2, r4
; VFP2-NEXT: sbcs r2, r3
; VFP2-NEXT: mov.w r2, #0
@@ -1283,8 +1279,8 @@ define i64 @stest_f64i64(double %x) {
; VFP2-NEXT: movlt r2, #1
; VFP2-NEXT: cmp r2, #0
; VFP2-NEXT: itt eq
-; VFP2-NEXT: moveq r0, lr
-; VFP2-NEXT: moveq r1, r12
+; VFP2-NEXT: moveq r0, r12
+; VFP2-NEXT: moveq r1, lr
; VFP2-NEXT: pop {r4, r5, r7, pc}
;
; FULL-LABEL: stest_f64i64:
@@ -1623,26 +1619,25 @@ define i64 @stest_f32i64(float %x) {
; VFP2-NEXT: push {r4, r5, r7, lr}
; VFP2-NEXT: bl __fixsfti
; VFP2-NEXT: subs.w r4, r0, #-1
-; VFP2-NEXT: mov r12, r1
-; VFP2-NEXT: mvn r1, #-2147483648
-; VFP2-NEXT: sbcs.w r4, r12, r1
+; VFP2-NEXT: mvn lr, #-2147483648
+; VFP2-NEXT: sbcs.w r4, r1, lr
+; VFP2-NEXT: mov.w r12, #0
; VFP2-NEXT: sbcs r4, r2, #0
-; VFP2-NEXT: mov.w lr, #0
; VFP2-NEXT: sbcs r4, r3, #0
; VFP2-NEXT: mov.w r4, #0
; VFP2-NEXT: it lt
; VFP2-NEXT: movlt r4, #1
; VFP2-NEXT: cmp r4, #0
-; VFP2-NEXT: itee eq
+; VFP2-NEXT: itet eq
; VFP2-NEXT: moveq r3, r4
; VFP2-NEXT: movne r4, r2
-; VFP2-NEXT: movne r1, r12
+; VFP2-NEXT: moveq r1, lr
; VFP2-NEXT: mov.w r2, #-1
; VFP2-NEXT: it eq
; VFP2-NEXT: moveq r0, r2
; VFP2-NEXT: rsbs r5, r0, #0
-; VFP2-NEXT: mov.w r12, #-2147483648
-; VFP2-NEXT: sbcs.w r5, r12, r1
+; VFP2-NEXT: mov.w lr, #-2147483648
+; VFP2-NEXT: sbcs.w r5, lr, r1
; VFP2-NEXT: sbcs.w r4, r2, r4
; VFP2-NEXT: sbcs r2, r3
; VFP2-NEXT: mov.w r2, #0
@@ -1650,8 +1645,8 @@ define i64 @stest_f32i64(float %x) {
; VFP2-NEXT: movlt r2, #1
; VFP2-NEXT: cmp r2, #0
; VFP2-NEXT: itt eq
-; VFP2-NEXT: moveq r0, lr
-; VFP2-NEXT: moveq r1, r12
+; VFP2-NEXT: moveq r0, r12
+; VFP2-NEXT: moveq r1, lr
; VFP2-NEXT: pop {r4, r5, r7, pc}
;
; FULL-LABEL: stest_f32i64:
@@ -1995,26 +1990,25 @@ define i64 @stest_f16i64(half %x) {
; VFP2-NEXT: vmov s0, r0
; VFP2-NEXT: bl __fixsfti
; VFP2-NEXT: subs.w r4, r0, #-1
-; VFP2-NEXT: mov r12, r1
-; VFP2-NEXT: mvn r1, #-2147483648
-; VFP2-NEXT: sbcs.w r4, r12, r1
+; VFP2-NEXT: mvn lr, #-2147483648
+; VFP2-NEXT: sbcs.w r4, r1, lr
+; VFP2-NEXT: mov.w r12, #0
; VFP2-NEXT: sbcs r4, r2, #0
-; VFP2-NEXT: mov.w lr, #0
; VFP2-NEXT: sbcs r4, r3, #0
; VFP2-NEXT: mov.w r4, #0
; VFP2-NEXT: it lt
; VFP2-NEXT: movlt r4, #1
; VFP2-NEXT: cmp r4, #0
-; VFP2-NEXT: itee eq
+; VFP2-NEXT: itet eq
; VFP2-NEXT: moveq r3, r4
; VFP2-NEXT: movne r4, r2
-; VFP2-NEXT: movne r1, r12
+; VFP2-NEXT: moveq r1, lr
; VFP2-NEXT: mov.w r2, #-1
; VFP2-NEXT: it eq
; VFP2-NEXT: moveq r0, r2
; VFP2-NEXT: rsbs r5, r0, #0
-; VFP2-NEXT: mov.w r12, #-2147483648
-; VFP2-NEXT: sbcs.w r5, r12, r1
+; VFP2-NEXT: mov.w lr, #-2147483648
+; VFP2-NEXT: sbcs.w r5, lr, r1
; VFP2-NEXT: sbcs.w r4, r2, r4
; VFP2-NEXT: sbcs r2, r3
; VFP2-NEXT: mov.w r2, #0
@@ -2022,8 +2016,8 @@ define i64 @stest_f16i64(half %x) {
; VFP2-NEXT: movlt r2, #1
; VFP2-NEXT: cmp r2, #0
; VFP2-NEXT: itt eq
-; VFP2-NEXT: moveq r0, lr
-; VFP2-NEXT: moveq r1, r12
+; VFP2-NEXT: moveq r0, r12
+; VFP2-NEXT: moveq r1, lr
; VFP2-NEXT: pop {r4, r5, r7, pc}
;
; FULL-LABEL: stest_f16i64:
@@ -2365,24 +2359,23 @@ define i32 @stest_f64i32_mm(double %x) {
; VFP2-NEXT: it lo
; VFP2-NEXT: movlo r3, r0
; VFP2-NEXT: cmp r1, #0
-; VFP2-NEXT: it mi
-; VFP2-NEXT: movmi r2, r0
-; VFP2-NEXT: mov.w r0, #-2147483648
+; VFP2-NEXT: it pl
+; VFP2-NEXT: movpl r0, r2
+; VFP2-NEXT: mov.w r2, #-2147483648
; VFP2-NEXT: it eq
-; VFP2-NEXT: moveq r2, r3
+; VFP2-NEXT: moveq r0, r3
; VFP2-NEXT: it pl
; VFP2-NEXT: movpl r1, #0
; VFP2-NEXT: cmp.w r1, #-1
; VFP2-NEXT: mov.w r3, #-2147483648
; VFP2-NEXT: it gt
-; VFP2-NEXT: movgt r3, r2
-; VFP2-NEXT: cmp.w r2, #-2147483648
+; VFP2-NEXT: movgt r3, r0
+; VFP2-NEXT: cmp.w r0, #-2147483648
; VFP2-NEXT: it ls
-; VFP2-NEXT: movls r2, r0
-; VFP2-NEXT: adds r0, r1, #1
+; VFP2-NEXT: movls r0, r2
+; VFP2-NEXT: adds r1, #1
; VFP2-NEXT: it ne
-; VFP2-NEXT: movne r2, r3
-; VFP2-NEXT: mov r0, r2
+; VFP2-NEXT: movne r0, r3
; VFP2-NEXT: pop {r7, pc}
;
; FULL-LABEL: stest_f64i32_mm:
@@ -2500,16 +2493,16 @@ define i32 @ustest_f64i32_mm(double %x) {
; VFP2-NEXT: cmp r1, #0
; VFP2-NEXT: it pl
; VFP2-NEXT: movpl.w r2, #-1
-; VFP2-NEXT: it eq
-; VFP2-NEXT: moveq r2, r0
-; VFP2-NEXT: mov.w r0, #0
+; VFP2-NEXT: it ne
+; VFP2-NEXT: movne r0, r2
+; VFP2-NEXT: mov.w r2, #0
; VFP2-NEXT: it pl
-; VFP2-NEXT: movpl r1, r0
+; VFP2-NEXT: movpl r1, r2
; VFP2-NEXT: cmp r1, #0
; VFP2-NEXT: it gt
-; VFP2-NEXT: movgt r0, r2
-; VFP2-NEXT: it eq
-; VFP2-NEXT: moveq r0, r2
+; VFP2-NEXT: movgt r2, r0
+; VFP2-NEXT: it ne
+; VFP2-NEXT: movne r0, r2
; VFP2-NEXT: pop {r7, pc}
;
; FULL-LABEL: ustest_f64i32_mm:
@@ -2613,24 +2606,23 @@ define i32 @stest_f32i32_mm(float %x) {
; VFP2-NEXT: it lo
; VFP2-NEXT: movlo r3, r0
; VFP2-NEXT: cmp r1, #0
-; VFP2-NEXT: it mi
-; VFP2-NEXT: movmi r2, r0
-; VFP2-NEXT: mov.w r0, #-2147483648
+; VFP2-NEXT: it pl
+; VFP2-NEXT: movpl r0, r2
+; VFP2-NEXT: mov.w r2, #-2147483648
; VFP2-NEXT: it eq
-; VFP2-NEXT: moveq r2, r3
+; VFP2-NEXT: moveq r0, r3
; VFP2-NEXT: it pl
; VFP2-NEXT: movpl r1, #0
; VFP2-NEXT: cmp.w r1, #-1
; VFP2-NEXT: mov.w r3, #-2147483648
; VFP2-NEXT: it gt
-; VFP2-NEXT: movgt r3, r2
-; VFP2-NEXT: cmp.w r2, #-2147483648
+; VFP2-NEXT: movgt r3, r0
+; VFP2-NEXT: cmp.w r0, #-2147483648
; VFP2-NEXT: it ls
-; VFP2-NEXT: movls r2, r0
-; VFP2-NEXT: adds r0, r1, #1
+; VFP2-NEXT: movls r0, r2
+; VFP2-NEXT: adds r1, #1
; VFP2-NEXT: it ne
-; VFP2-NEXT: movne r2, r3
-; VFP2-NEXT: mov r0, r2
+; VFP2-NEXT: movne r0, r3
; VFP2-NEXT: pop {r7, pc}
;
; FULL-LABEL: stest_f32i32_mm:
@@ -2744,16 +2736,16 @@ define i32 @ustest_f32i32_mm(float %x) {
; VFP2-NEXT: cmp r1, #0
; VFP2-NEXT: it pl
; VFP2-NEXT: movpl.w r2, #-1
-; VFP2-NEXT: it eq
-; VFP2-NEXT: moveq r2, r0
-; VFP2-NEXT: mov.w r0, #0
+; VFP2-NEXT: it ne
+; VFP2-NEXT: movne r0, r2
+; VFP2-NEXT: mov.w r2, #0
; VFP2-NEXT: it pl
-; VFP2-NEXT: movpl r1, r0
+; VFP2-NEXT: movpl r1, r2
; VFP2-NEXT: cmp r1, #0
; VFP2-NEXT: it gt
-; VFP2-NEXT: movgt r0, r2
-; VFP2-NEXT: it eq
-; VFP2-NEXT: moveq r0, r2
+; VFP2-NEXT: movgt r2, r0
+; VFP2-NEXT: it ne
+; VFP2-NEXT: movne r0, r2
; VFP2-NEXT: pop {r7, pc}
;
; FULL-LABEL: ustest_f32i32_mm:
@@ -2860,24 +2852,23 @@ define i32 @stest_f16i32_mm(half %x) {
; VFP2-NEXT: it lo
; VFP2-NEXT: movlo r3, r0
; VFP2-NEXT: cmp r1, #0
-; VFP2-NEXT: it mi
-; VFP2-NEXT: movmi r2, r0
-; VFP2-NEXT: mov.w r0, #-2147483648
+; VFP2-NEXT: it pl
+; VFP2-NEXT: movpl r0, r2
+; VFP2-NEXT: mov.w r2, #-2147483648
; VFP2-NEXT: it eq
-; VFP2-NEXT: moveq r2, r3
+; VFP2-NEXT: moveq r0, r3
; VFP2-NEXT: it pl
; VFP2-NEXT: movpl r1, #0
; VFP2-NEXT: cmp.w r1, #-1
; VFP2-NEXT: mov.w r3, #-2147483648
; VFP2-NEXT: it gt
-; VFP2-NEXT: movgt r3, r2
-; VFP2-NEXT: cmp.w r2, #-2147483648
+; VFP2-NEXT: movgt r3, r0
+; VFP2-NEXT: cmp.w r0, #-2147483648
; VFP2-NEXT: it ls
-; VFP2-NEXT: movls r2, r0
-; VFP2-NEXT: adds r0, r1, #1
+; VFP2-NEXT: movls r0, r2
+; VFP2-NEXT: adds r1, #1
; VFP2-NEXT: it ne
-; VFP2-NEXT: movne r2, r3
-; VFP2-NEXT: mov r0, r2
+; VFP2-NEXT: movne r0, r3
; VFP2-NEXT: pop {r7, pc}
;
; FULL-LABEL: stest_f16i32_mm:
@@ -3010,16 +3001,16 @@ define i32 @ustest_f16i32_mm(half %x) {
; VFP2-NEXT: cmp r1, #0
; VFP2-NEXT: it pl
; VFP2-NEXT: movpl.w r2, #-1
-; VFP2-NEXT: it eq
-; VFP2-NEXT: moveq r2, r0
-; VFP2-NEXT: mov.w r0, #0
+; VFP2-NEXT: it ne
+; VFP2-NEXT: movne r0, r2
+; VFP2-NEXT: mov.w r2, #0
; VFP2-NEXT: it pl
-; VFP2-NEXT: movpl r1, r0
+; VFP2-NEXT: movpl r1, r2
; VFP2-NEXT: cmp r1, #0
; VFP2-NEXT: it gt
-; VFP2-NEXT: movgt r0, r2
-; VFP2-NEXT: it eq
-; VFP2-NEXT: moveq r0, r2
+; VFP2-NEXT: movgt r2, r0
+; VFP2-NEXT: it ne
+; VFP2-NEXT: movne r0, r2
; VFP2-NEXT: pop {r7, pc}
;
; FULL-LABEL: ustest_f16i32_mm:
@@ -3084,13 +3075,13 @@ define i16 @stest_f64i16_mm(double %x) {
; VFP2-NEXT: bl __aeabi_d2iz
; VFP2-NEXT: movw r1, #32767
; VFP2-NEXT: cmp r0, r1
-; VFP2-NEXT: it lt
-; VFP2-NEXT: movlt r1, r0
-; VFP2-NEXT: movw r0, #32768
-; VFP2-NEXT: movt r0, #65535
-; VFP2-NEXT: cmn.w r1, #32768
-; VFP2-NEXT: it gt
-; VFP2-NEXT: movgt r0, r1
+; VFP2-NEXT: it ge
+; VFP2-NEXT: movge r0, r1
+; VFP2-NEXT: movw r1, #32768
+; VFP2-NEXT: movt r1, #65535
+; VFP2-NEXT: cmn.w r0, #32768
+; VFP2-NEXT: it le
+; VFP2-NEXT: movle r0, r1
; VFP2-NEXT: pop {r7, pc}
;
; FULL-LABEL: stest_f64i16_mm:
@@ -3704,36 +3695,35 @@ define i64 @stest_f64i64_mm(double %x) {
; VFP2-NEXT: it ne
; VFP2-NEXT: andne.w r6, r2, r6, asr #31
; VFP2-NEXT: and.w r2, r6, r5
-; VFP2-NEXT: mov.w lr, #0
-; VFP2-NEXT: adds r6, r2, #1
+; VFP2-NEXT: mov.w r6, #-1
+; VFP2-NEXT: adds r2, #1
; VFP2-NEXT: it ne
; VFP2-NEXT: movne r1, r7
; VFP2-NEXT: mov.w r7, #-1
; VFP2-NEXT: cmp r12, r8
; VFP2-NEXT: it lo
; VFP2-NEXT: movlo r7, r0
-; VFP2-NEXT: mov.w r2, #-1
+; VFP2-NEXT: mov.w lr, #0
; VFP2-NEXT: it eq
; VFP2-NEXT: moveq r7, r0
; VFP2-NEXT: cmp r3, #0
-; VFP2-NEXT: it mi
-; VFP2-NEXT: movmi r2, r0
+; VFP2-NEXT: it pl
+; VFP2-NEXT: movpl r0, r6
; VFP2-NEXT: cmp.w r9, #0
-; VFP2-NEXT: mov.w r0, #0
+; VFP2-NEXT: mov.w r3, #0
; VFP2-NEXT: it eq
-; VFP2-NEXT: moveq r2, r7
+; VFP2-NEXT: moveq r0, r7
; VFP2-NEXT: cmp.w r4, #-2147483648
; VFP2-NEXT: it hi
-; VFP2-NEXT: movhi r0, r2
+; VFP2-NEXT: movhi r3, r0
; VFP2-NEXT: it eq
-; VFP2-NEXT: moveq r0, r2
+; VFP2-NEXT: moveq r3, r0
; VFP2-NEXT: cmp.w r5, #-1
; VFP2-NEXT: it le
-; VFP2-NEXT: movle r2, lr
-; VFP2-NEXT: cmp r6, #0
+; VFP2-NEXT: movle r0, lr
+; VFP2-NEXT: cmp r2, #0
; VFP2-NEXT: it eq
-; VFP2-NEXT: moveq r2, r0
-; VFP2-NEXT: mov r0, r2
+; VFP2-NEXT: moveq r0, r3
; VFP2-NEXT: add sp, #4
; VFP2-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc}
;
@@ -4022,59 +4012,58 @@ define i64 @ustest_f64i64_mm(double %x) {
; VFP2-NEXT: push {r4, r5, r7, lr}
; VFP2-NEXT: bl __fixdfti
; VFP2-NEXT: subs r4, r2, #1
-; VFP2-NEXT: mov r12, r1
-; VFP2-NEXT: eor r1, r2, #1
+; VFP2-NEXT: eor r12, r2, #1
; VFP2-NEXT: sbcs r4, r3, #0
-; VFP2-NEXT: mov.w lr, #0
-; VFP2-NEXT: orr.w r5, r1, r3
+; VFP2-NEXT: orr.w lr, r12, r3
+; VFP2-NEXT: mov.w r4, #0
+; VFP2-NEXT: mov.w r5, #1
; VFP2-NEXT: it lt
-; VFP2-NEXT: movlt.w lr, #1
+; VFP2-NEXT: movlt r4, #1
+; VFP2-NEXT: cmp r4, #0
+; VFP2-NEXT: it eq
+; VFP2-NEXT: moveq r0, r4
; VFP2-NEXT: cmp.w lr, #0
; VFP2-NEXT: it eq
; VFP2-NEXT: moveq r0, lr
-; VFP2-NEXT: cmp r5, #0
+; VFP2-NEXT: cmp r4, #0
; VFP2-NEXT: it eq
-; VFP2-NEXT: moveq r0, r5
+; VFP2-NEXT: moveq r1, r4
; VFP2-NEXT: cmp.w lr, #0
-; VFP2-NEXT: it ne
-; VFP2-NEXT: movne lr, r12
-; VFP2-NEXT: cmp r5, #0
; VFP2-NEXT: it eq
-; VFP2-NEXT: moveq lr, r5
-; VFP2-NEXT: cmp.w lr, #0
-; VFP2-NEXT: mov r12, lr
+; VFP2-NEXT: moveq r1, lr
+; VFP2-NEXT: cmp r1, #0
+; VFP2-NEXT: mov lr, r1
; VFP2-NEXT: mov.w r4, #1
; VFP2-NEXT: ite ne
-; VFP2-NEXT: movne r12, r0
-; VFP2-NEXT: moveq r12, r0
+; VFP2-NEXT: movne lr, r0
+; VFP2-NEXT: moveq lr, r0
; VFP2-NEXT: cmp r2, #1
-; VFP2-NEXT: mov.w r5, #1
-; VFP2-NEXT: mov.w r1, #0
; VFP2-NEXT: it lo
; VFP2-NEXT: movlo r5, r2
; VFP2-NEXT: cmp r3, #0
; VFP2-NEXT: it mi
; VFP2-NEXT: movmi r4, r2
+; VFP2-NEXT: mov.w r12, #0
; VFP2-NEXT: it eq
; VFP2-NEXT: moveq r4, r5
; VFP2-NEXT: it pl
-; VFP2-NEXT: movpl r3, r1
+; VFP2-NEXT: movpl r3, r12
; VFP2-NEXT: rsbs r2, r4, #0
-; VFP2-NEXT: sbcs.w r2, r1, r3
+; VFP2-NEXT: sbcs.w r2, r12, r3
; VFP2-NEXT: it lt
-; VFP2-NEXT: movlt r1, #1
-; VFP2-NEXT: cmp r1, #0
+; VFP2-NEXT: movlt.w r12, #1
+; VFP2-NEXT: cmp.w r12, #0
; VFP2-NEXT: it eq
-; VFP2-NEXT: moveq r0, r1
+; VFP2-NEXT: moveq r0, r12
; VFP2-NEXT: orrs.w r2, r4, r3
; VFP2-NEXT: it eq
-; VFP2-NEXT: moveq r0, r12
-; VFP2-NEXT: cmp r1, #0
+; VFP2-NEXT: moveq r0, lr
+; VFP2-NEXT: cmp.w r12, #0
; VFP2-NEXT: it ne
-; VFP2-NEXT: movne r1, lr
+; VFP2-NEXT: movne r12, r1
; VFP2-NEXT: cmp r2, #0
-; VFP2-NEXT: it eq
-; VFP2-NEXT: moveq r1, lr
+; VFP2-NEXT: it ne
+; VFP2-NEXT: movne r1, r12
; VFP2-NEXT: pop {r4, r5, r7, pc}
;
; FULL-LABEL: ustest_f64i64_mm:
@@ -4293,36 +4282,35 @@ define i64 @stest_f32i64_mm(float %x) {
; VFP2-NEXT: it ne
; VFP2-NEXT: andne.w r6, r2, r6, asr #31
; VFP2-NEXT: and.w r2, r6, r5
-; VFP2-NEXT: mov.w lr, #0
-; VFP2-NEXT: adds r6, r2, #1
+; VFP2-NEXT: mov.w r6, #-1
+; VFP2-NEXT: adds r2, #1
; VFP2-NEXT: it ne
; VFP2-NEXT: movne r1, r7
; VFP2-NEXT: mov.w r7, #-1
; VFP2-NEXT: cmp r12, r8
; VFP2-NEXT: it lo
; VFP2-NEXT: movlo r7, r0
-; VFP2-NEXT: mov.w r2, #-1
+; VFP2-NEXT: mov.w lr, #0
; VFP2-NEXT: it eq
; VFP2-NEXT: moveq r7, r0
; VFP2-NEXT: cmp r3, #0
-; VFP2-NEXT: it mi
-; VFP2-NEXT: movmi r2, r0
+; VFP2-NEXT: it pl
+; VFP2-NEXT: movpl r0, r6
; VFP2-NEXT: cmp.w r9, #0
-; VFP2-NEXT: mov.w r0, #0
+; VFP2-NEXT: mov.w r3, #0
; VFP2-NEXT: it eq
-; VFP2-NEXT: moveq r2, r7
+; VFP2-NEXT: moveq r0, r7
; VFP2-NEXT: cmp.w r4, #-2147483648
; VFP2-NEXT: it hi
-; VFP2-NEXT: movhi r0, r2
+; VFP2-NEXT: movhi r3, r0
; VFP2-NEXT: it eq
-; VFP2-NEXT: moveq r0, r2
+; VFP2-NEXT: moveq r3, r0
; VFP2-NEXT: cmp.w r5, #-1
; VFP2-NEXT: it le
-; VFP2-NEXT: movle r2, lr
-; VFP2-NEXT: cmp r6, #0
+; VFP2-NEXT: movle r0, lr
+; VFP2-NEXT: cmp r2, #0
; VFP2-NEXT: it eq
-; VFP2-NEXT: moveq r2, r0
-; VFP2-NEXT: mov r0, r2
+; VFP2-NEXT: moveq r0, r3
; VFP2-NEXT: add sp, #4
; VFP2-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc}
;
@@ -4605,59 +4593,58 @@ define i64 @ustest_f32i64_mm(float %x) {
; VFP2-NEXT: push {r4, r5, r7, lr}
; VFP2-NEXT: bl __fixsfti
; VFP2-NEXT: subs r4, r2, #1
-; VFP2-NEXT: mov r12, r1
-; VFP2-NEXT: eor r1, r2, #1
+; VFP2-NEXT: eor r12, r2, #1
; VFP2-NEXT: sbcs r4, r3, #0
-; VFP2-NEXT: mov.w lr, #0
-; VFP2-NEXT: orr.w r5, r1, r3
+; VFP2-NEXT: orr.w lr, r12, r3
+; VFP2-NEXT: mov.w r4, #0
+; VFP2-NEXT: mov.w r5, #1
; VFP2-NEXT: it lt
-; VFP2-NEXT: movlt.w lr, #1
+; VFP2-NEXT: movlt r4, #1
+; VFP2-NEXT: cmp r4, #0
+; VFP2-NEXT: it eq
+; VFP2-NEXT: moveq r0, r4
; VFP2-NEXT: cmp.w lr, #0
; VFP2-NEXT: it eq
; VFP2-NEXT: moveq r0, lr
-; VFP2-NEXT: cmp r5, #0
+; VFP2-NEXT: cmp r4, #0
; VFP2-NEXT: it eq
-; VFP2-NEXT: moveq r0, r5
+; VFP2-NEXT: moveq r1, r4
; VFP2-NEXT: cmp.w lr, #0
-; VFP2-NEXT: it ne
-; VFP2-NEXT: movne lr, r12
-; VFP2-NEXT: cmp r5, #0
; VFP2-NEXT: it eq
-; VFP2-NEXT: moveq lr, r5
-; VFP2-NEXT: cmp.w lr, #0
-; VFP2-NEXT: mov r12, lr
+; VFP2-NEXT: moveq r1, lr
+; VFP2-NEXT: cmp r1, #0
+; VFP2-NEXT: mov lr, r1
; VFP2-NEXT: mov.w r4, #1
; VFP2-NEXT: ite ne
-; VFP2-NEXT: movne r12, r0
-; VFP2-NEXT: moveq r12, r0
+; VFP2-NEXT: movne lr, r0
+; VFP2-NEXT: moveq lr, r0
; VFP2-NEXT: cmp r2, #1
-; VFP2-NEXT: mov.w r5, #1
-; VFP2-NEXT: mov.w r1, #0
; VFP2-NEXT: it lo
; VFP2-NEXT: movlo r5, r2
; VFP2-NEXT: cmp r3, #0
; VFP2-NEXT: it mi
; VFP2-NEXT: movmi r4, r2
+; VFP2-NEXT: mov.w r12, #0
; VFP2-NEXT: it eq
; VFP2-NEXT: moveq r4, r5
; VFP2-NEXT: it pl
-; VFP2-NEXT: movpl r3, r1
+; VFP2-NEXT: movpl r3, r12
; VFP2-NEXT: rsbs r2, r4, #0
-; VFP2-NEXT: sbcs.w r2, r1, r3
+; VFP2-NEXT: sbcs.w r2, r12, r3
; VFP2-NEXT: it lt
-; VFP2-NEXT: movlt r1, #1
-; VFP2-NEXT: cmp r1, #0
+; VFP2-NEXT: movlt.w r12, #1
+; VFP2-NEXT: cmp.w r12, #0
; VFP2-NEXT: it eq
-; VFP2-NEXT: moveq r0, r1
+; VFP2-NEXT: moveq r0, r12
; VFP2-NEXT: orrs.w r2, r4, r3
; VFP2-NEXT: it eq
-; VFP2-NEXT: moveq r0, r12
-; VFP2-NEXT: cmp r1, #0
+; VFP2-NEXT: moveq r0, lr
+; VFP2-NEXT: cmp.w r12, #0
; VFP2-NEXT: it ne
-; VFP2-NEXT: movne r1, lr
+; VFP2-NEXT: movne r12, r1
; VFP2-NEXT: cmp r2, #0
-; VFP2-NEXT: it eq
-; VFP2-NEXT: moveq r1, lr
+; VFP2-NEXT: it ne
+; VFP2-NEXT: movne r1, r12
; VFP2-NEXT: pop {r4, r5, r7, pc}
;
; FULL-LABEL: ustest_f32i64_mm:
@@ -4881,36 +4868,35 @@ define i64 @stest_f16i64_mm(half %x) {
; VFP2-NEXT: it ne
; VFP2-NEXT: andne.w r6, r2, r6, asr #31
; VFP2-NEXT: and.w r2, r6, r5
-; VFP2-NEXT: mov.w lr, #0
-; VFP2-NEXT: adds r6, r2, #1
+; VFP2-NEXT: mov.w r6, #-1
+; VFP2-NEXT: adds r2, #1
; VFP2-NEXT: it ne
; VFP2-NEXT: movne r1, r7
; VFP2-NEXT: mov.w r7, #-1
; VFP2-NEXT: cmp r12, r8
; VFP2-NEXT: it lo
; VFP2-NEXT: movlo r7, r0
-; VFP2-NEXT: mov.w r2, #-1
+; VFP2-NEXT: mov.w lr, #0
; VFP2-NEXT: it eq
; VFP2-NEXT: moveq r7, r0
; VFP2-NEXT: cmp r3, #0
-; VFP2-NEXT: it mi
-; VFP2-NEXT: movmi r2, r0
+; VFP2-NEXT: it pl
+; VFP2-NEXT: movpl r0, r6
; VFP2-NEXT: cmp.w r9, #0
-; VFP2-NEXT: mov.w r0, #0
+; VFP2-NEXT: mov.w r3, #0
; VFP2-NEXT: it eq
-; VFP2-NEXT: moveq r2, r7
+; VFP2-NEXT: moveq r0, r7
; VFP2-NEXT: cmp.w r4, #-2147483648
; VFP2-NEXT: it hi
-; VFP2-NEXT: movhi r0, r2
+; VFP2-NEXT: movhi r3, r0
; VFP2-NEXT: it eq
-; VFP2-NEXT: moveq r0, r2
+; VFP2-NEXT: moveq r3, r0
; VFP2-NEXT: cmp.w r5, #-1
; VFP2-NEXT: it le
-; VFP2-NEXT: movle r2, lr
-; VFP2-NEXT: cmp r6, #0
+; VFP2-NEXT: movle r0, lr
+; VFP2-NEXT: cmp r2, #0
; VFP2-NEXT: it eq
-; VFP2-NEXT: moveq r2, r0
-; VFP2-NEXT: mov r0, r2
+; VFP2-NEXT: moveq r0, r3
; VFP2-NEXT: add sp, #4
; VFP2-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc}
;
@@ -5207,59 +5193,58 @@ define i64 @ustest_f16i64_mm(half %x) {
; VFP2-NEXT: vmov s0, r0
; VFP2-NEXT: bl __fixsfti
; VFP2-NEXT: subs r4, r2, #1
-; VFP2-NEXT: mov r12, r1
-; VFP2-NEXT: eor r1, r2, #1
+; VFP2-NEXT: eor r12, r2, #1
; VFP2-NEXT: sbcs r4, r3, #0
-; VFP2-NEXT: mov.w lr, #0
-; VFP2-NEXT: orr.w r5, r1, r3
+; VFP2-NEXT: orr.w lr, r12, r3
+; VFP2-NEXT: mov.w r4, #0
+; VFP2-NEXT: mov.w r5, #1
; VFP2-NEXT: it lt
-; VFP2-NEXT: movlt.w lr, #1
+; VFP2-NEXT: movlt r4, #1
+; VFP2-NEXT: cmp r4, #0
+; VFP2-NEXT: it eq
+; VFP2-NEXT: moveq r0, r4
; VFP2-NEXT: cmp.w lr, #0
; VFP2-NEXT: it eq
; VFP2-NEXT: moveq r0, lr
-; VFP2-NEXT: cmp r5, #0
+; VFP2-NEXT: cmp r4, #0
; VFP2-NEXT: it eq
-; VFP2-NEXT: moveq r0, r5
+; VFP2-NEXT: moveq r1, r4
; VFP2-NEXT: cmp.w lr, #0
-; VFP2-NEXT: it ne
-; VFP2-NEXT: movne lr, r12
-; VFP2-NEXT: cmp r5, #0
; VFP2-NEXT: it eq
-; VFP2-NEXT: moveq lr, r5
-; VFP2-NEXT: cmp.w lr, #0
-; VFP2-NEXT: mov r12, lr
+; VFP2-NEXT: moveq r1, lr
+; VFP2-NEXT: cmp r1, #0
+; VFP2-NEXT: mov lr, r1
; VFP2-NEXT: mov.w r4, #1
; VFP2-NEXT: ite ne
-; VFP2-NEXT: movne r12, r0
-; VFP2-NEXT: moveq r12, r0
+; VFP2-NEXT: movne lr, r0
+; VFP2-NEXT: moveq lr, r0
; VFP2-NEXT: cmp r2, #1
-; VFP2-NEXT: mov.w r5, #1
-; VFP2-NEXT: mov.w r1, #0
; VFP2-NEXT: it lo
; VFP2-NEXT: movlo r5, r2
; VFP2-NEXT: cmp r3, #0
; VFP2-NEXT: it mi
; VFP2-NEXT: movmi r4, r2
+; VFP2-NEXT: mov.w r12, #0
; VFP2-NEXT: it eq
; VFP2-NEXT: moveq r4, r5
; VFP2-NEXT: it pl
-; VFP2-NEXT: movpl r3, r1
+; VFP2-NEXT: movpl r3, r12
; VFP2-NEXT: rsbs r2, r4, #0
-; VFP2-NEXT: sbcs.w r2, r1, r3
+; VFP2-NEXT: sbcs.w r2, r12, r3
; VFP2-NEXT: it lt
-; VFP2-NEXT: movlt r1, #1
-; VFP2-NEXT: cmp r1, #0
+; VFP2-NEXT: movlt.w r12, #1
+; VFP2-NEXT: cmp.w r12, #0
; VFP2-NEXT: it eq
-; VFP2-NEXT: moveq r0, r1
+; VFP2-NEXT: moveq r0, r12
; VFP2-NEXT: orrs.w r2, r4, r3
; VFP2-NEXT: it eq
-; VFP2-NEXT: moveq r0, r12
-; VFP2-NEXT: cmp r1, #0
+; VFP2-NEXT: moveq r0, lr
+; VFP2-NEXT: cmp.w r12, #0
; VFP2-NEXT: it ne
-; VFP2-NEXT: movne r1, lr
+; VFP2-NEXT: movne r12, r1
; VFP2-NEXT: cmp r2, #0
-; VFP2-NEXT: it eq
-; VFP2-NEXT: moveq r1, lr
+; VFP2-NEXT: it ne
+; VFP2-NEXT: movne r1, r12
; VFP2-NEXT: pop {r4, r5, r7, pc}
;
; FULL-LABEL: ustest_f16i64_mm:
diff --git a/llvm/test/CodeGen/ARM/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll b/llvm/test/CodeGen/ARM/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll
index c4fce7751cf8e..cddbc5dc6508c 100644
--- a/llvm/test/CodeGen/ARM/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll
+++ b/llvm/test/CodeGen/ARM/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll
@@ -32,8 +32,8 @@ define i1 @scalar_i8_signbit_eq(i8 %x, i8 %y) nounwind {
; THUMB6-NEXT: uxtb r1, r1
; THUMB6-NEXT: lsls r0, r1
; THUMB6-NEXT: movs r1, #128
-; THUMB6-NEXT: ands r1, r0
-; THUMB6-NEXT: rsbs r0, r1, #0
+; THUMB6-NEXT: ands r0, r1
+; THUMB6-NEXT: rsbs r1, r0, #0
; THUMB6-NEXT: adcs r0, r1
; THUMB6-NEXT: bx lr
;
@@ -64,8 +64,8 @@ define i1 @scalar_i8_lowestbit_eq(i8 %x, i8 %y) nounwind {
; THUMB6-NEXT: uxtb r1, r1
; THUMB6-NEXT: lsls r0, r1
; THUMB6-NEXT: movs r1, #1
-; THUMB6-NEXT: ands r1, r0
-; THUMB6-NEXT: rsbs r0, r1, #0
+; THUMB6-NEXT: ands r0, r1
+; THUMB6-NEXT: rsbs r1, r0, #0
; THUMB6-NEXT: adcs r0, r1
; THUMB6-NEXT: bx lr
;
@@ -97,8 +97,8 @@ define i1 @scalar_i8_bitsinmiddle_eq(i8 %x, i8 %y) nounwind {
; THUMB6-NEXT: uxtb r1, r1
; THUMB6-NEXT: lsls r0, r1
; THUMB6-NEXT: movs r1, #24
-; THUMB6-NEXT: ands r1, r0
-; THUMB6-NEXT: rsbs r0, r1, #0
+; THUMB6-NEXT: ands r0, r1
+; THUMB6-NEXT: rsbs r1, r0, #0
; THUMB6-NEXT: adcs r0, r1
; THUMB6-NEXT: bx lr
;
@@ -134,8 +134,8 @@ define i1 @scalar_i16_signbit_eq(i16 %x, i16 %y) nounwind {
; THUMB6-NEXT: lsls r0, r1
; THUMB6-NEXT: movs r1, #1
; THUMB6-NEXT: lsls r1, r1, #15
-; THUMB6-NEXT: ands r1, r0
-; THUMB6-NEXT: rsbs r0, r1, #0
+; THUMB6-NEXT: ands r0, r1
+; THUMB6-NEXT: rsbs r1, r0, #0
; THUMB6-NEXT: adcs r0, r1
; THUMB6-NEXT: bx lr
;
@@ -166,8 +166,8 @@ define i1 @scalar_i16_lowestbit_eq(i16 %x, i16 %y) nounwind {
; THUMB6-NEXT: uxth r1, r1
; THUMB6-NEXT: lsls r0, r1
; THUMB6-NEXT: movs r1, #1
-; THUMB6-NEXT: ands r1, r0
-; THUMB6-NEXT: rsbs r0, r1, #0
+; THUMB6-NEXT: ands r0, r1
+; THUMB6-NEXT: rsbs r1, r0, #0
; THUMB6-NEXT: adcs r0, r1
; THUMB6-NEXT: bx lr
;
@@ -200,8 +200,8 @@ define i1 @scalar_i16_bitsinmiddle_eq(i16 %x, i16 %y) nounwind {
; THUMB6-NEXT: lsls r0, r1
; THUMB6-NEXT: movs r1, #255
; THUMB6-NEXT: lsls r1, r1, #4
-; THUMB6-NEXT: ands r1, r0
-; THUMB6-NEXT: rsbs r0, r1, #0
+; THUMB6-NEXT: ands r0, r1
+; THUMB6-NEXT: rsbs r1, r0, #0
; THUMB6-NEXT: adcs r0, r1
; THUMB6-NEXT: bx lr
;
@@ -233,8 +233,8 @@ define i1 @scalar_i32_signbit_eq(i32 %x, i32 %y) nounwind {
; THUMB6-NEXT: lsls r0, r1
; THUMB6-NEXT: movs r1, #1
; THUMB6-NEXT: lsls r1, r1, #31
-; THUMB6-NEXT: ands r1, r0
-; THUMB6-NEXT: rsbs r0, r1, #0
+; THUMB6-NEXT: ands r0, r1
+; THUMB6-NEXT: rsbs r1, r0, #0
; THUMB6-NEXT: adcs r0, r1
; THUMB6-NEXT: bx lr
;
@@ -261,8 +261,8 @@ define i1 @scalar_i32_lowestbit_eq(i32 %x, i32 %y) nounwind {
; THUMB6: @ %bb.0:
; THUMB6-NEXT: lsls r0, r1
; THUMB6-NEXT: movs r1, #1
-; THUMB6-NEXT: ands r1, r0
-; THUMB6-NEXT: rsbs r0, r1, #0
+; THUMB6-NEXT: ands r0, r1
+; THUMB6-NEXT: rsbs r1, r0, #0
; THUMB6-NEXT: adcs r0, r1
; THUMB6-NEXT: bx lr
;
@@ -301,8 +301,8 @@ define i1 @scalar_i32_bitsinmiddle_eq(i32 %x, i32 %y) nounwind {
; THUMB6: @ %bb.0:
; THUMB6-NEXT: lsls r0, r1
; THUMB6-NEXT: ldr r1, .LCPI8_0
-; THUMB6-NEXT: ands r1, r0
-; THUMB6-NEXT: rsbs r0, r1, #0
+; THUMB6-NEXT: ands r0, r1
+; THUMB6-NEXT: rsbs r1, r0, #0
; THUMB6-NEXT: adcs r0, r1
; THUMB6-NEXT: bx lr
; THUMB6-NEXT: .p2align 2
@@ -406,8 +406,8 @@ define i1 @scalar_i64_lowestbit_eq(i64 %x, i64 %y) nounwind {
; THUMB6-NEXT: push {r7, lr}
; THUMB6-NEXT: bl __ashldi3
; THUMB6-NEXT: movs r1, #1
-; THUMB6-NEXT: ands r1, r0
-; THUMB6-NEXT: rsbs r0, r1, #0
+; THUMB6-NEXT: ands r0, r1
+; THUMB6-NEXT: rsbs r1, r0, #0
; THUMB6-NEXT: adcs r0, r1
; THUMB6-NEXT: pop {r7, pc}
;
@@ -642,14 +642,14 @@ define <4 x i1> @vec_4xi32_nonsplat_eq(<4 x i32> %x, <4 x i32> %y) nounwind {
; THUMB6-NEXT: ldr r4, [sp, #16]
; THUMB6-NEXT: lsls r2, r4
; THUMB6-NEXT: ldr r4, .LCPI13_0
-; THUMB6-NEXT: ands r4, r2
-; THUMB6-NEXT: rsbs r2, r4, #0
+; THUMB6-NEXT: ands r2, r4
+; THUMB6-NEXT: rsbs r4, r2, #0
; THUMB6-NEXT: adcs r2, r4
; THUMB6-NEXT: ldr r4, [sp, #20]
; THUMB6-NEXT: lsls r3, r4
; THUMB6-NEXT: lsls r4, r0, #31
-; THUMB6-NEXT: ands r4, r3
-; THUMB6-NEXT: rsbs r3, r4, #0
+; THUMB6-NEXT: ands r3, r4
+; THUMB6-NEXT: rsbs r4, r3, #0
; THUMB6-NEXT: adcs r3, r4
; THUMB6-NEXT: pop {r4, pc}
; THUMB6-NEXT: .p2align 2
diff --git a/llvm/test/CodeGen/ARM/hoist-and-by-const-from-shl-in-eqcmp-zero.ll b/llvm/test/CodeGen/ARM/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
index a5d3347e664cf..4f112c3a6f41c 100644
--- a/llvm/test/CodeGen/ARM/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
+++ b/llvm/test/CodeGen/ARM/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
@@ -303,8 +303,8 @@ define i1 @scalar_i32_signbit_eq(i32 %x, i32 %y) nounwind {
; THUMB6-NEXT: lsrs r0, r1
; THUMB6-NEXT: movs r1, #1
; THUMB6-NEXT: lsls r1, r1, #31
-; THUMB6-NEXT: ands r1, r0
-; THUMB6-NEXT: rsbs r0, r1, #0
+; THUMB6-NEXT: ands r0, r1
+; THUMB6-NEXT: rsbs r1, r0, #0
; THUMB6-NEXT: adcs r0, r1
; THUMB6-NEXT: bx lr
;
@@ -331,8 +331,8 @@ define i1 @scalar_i32_lowestbit_eq(i32 %x, i32 %y) nounwind {
; THUMB6: @ %bb.0:
; THUMB6-NEXT: lsrs r0, r1
; THUMB6-NEXT: movs r1, #1
-; THUMB6-NEXT: ands r1, r0
-; THUMB6-NEXT: rsbs r0, r1, #0
+; THUMB6-NEXT: ands r0, r1
+; THUMB6-NEXT: rsbs r1, r0, #0
; THUMB6-NEXT: adcs r0, r1
; THUMB6-NEXT: bx lr
;
@@ -371,8 +371,8 @@ define i1 @scalar_i32_bitsinmiddle_eq(i32 %x, i32 %y) nounwind {
; THUMB6: @ %bb.0:
; THUMB6-NEXT: lsrs r0, r1
; THUMB6-NEXT: ldr r1, .LCPI8_0
-; THUMB6-NEXT: ands r1, r0
-; THUMB6-NEXT: rsbs r0, r1, #0
+; THUMB6-NEXT: ands r0, r1
+; THUMB6-NEXT: rsbs r1, r0, #0
; THUMB6-NEXT: adcs r0, r1
; THUMB6-NEXT: bx lr
; THUMB6-NEXT: .p2align 2
@@ -459,8 +459,8 @@ define i1 @scalar_i64_lowestbit_eq(i64 %x, i64 %y) nounwind {
; THUMB6-NEXT: push {r7, lr}
; THUMB6-NEXT: bl __lshrdi3
; THUMB6-NEXT: movs r1, #1
-; THUMB6-NEXT: ands r1, r0
-; THUMB6-NEXT: rsbs r0, r1, #0
+; THUMB6-NEXT: ands r0, r1
+; THUMB6-NEXT: rsbs r1, r0, #0
; THUMB6-NEXT: adcs r0, r1
; THUMB6-NEXT: pop {r7, pc}
;
@@ -713,14 +713,14 @@ define <4 x i1> @vec_4xi32_nonsplat_eq(<4 x i32> %x, <4 x i32> %y) nounwind {
; THUMB6-NEXT: ldr r4, [sp, #16]
; THUMB6-NEXT: lsrs r2, r4
; THUMB6-NEXT: ldr r4, .LCPI13_0
-; THUMB6-NEXT: ands r4, r2
-; THUMB6-NEXT: rsbs r2, r4, #0
+; THUMB6-NEXT: ands r2, r4
+; THUMB6-NEXT: rsbs r4, r2, #0
; THUMB6-NEXT: adcs r2, r4
; THUMB6-NEXT: ldr r4, [sp, #20]
; THUMB6-NEXT: lsrs r3, r4
; THUMB6-NEXT: lsls r4, r0, #31
-; THUMB6-NEXT: ands r4, r3
-; THUMB6-NEXT: rsbs r3, r4, #0
+; THUMB6-NEXT: ands r3, r4
+; THUMB6-NEXT: rsbs r4, r3, #0
; THUMB6-NEXT: adcs r3, r4
; THUMB6-NEXT: pop {r4, pc}
; THUMB6-NEXT: .p2align 2
diff --git a/llvm/test/CodeGen/ARM/ssat.ll b/llvm/test/CodeGen/ARM/ssat.ll
index 3e1cdd4f8238f..436f6edc2bbbb 100644
--- a/llvm/test/CodeGen/ARM/ssat.ll
+++ b/llvm/test/CodeGen/ARM/ssat.ll
@@ -21,11 +21,11 @@ define i32 @sat_base_32bit(i32 %x) #0 {
; V4T: @ %bb.0: @ %entry
; V4T-NEXT: ldr r1, .LCPI0_0
; V4T-NEXT: cmp r0, r1
-; V4T-NEXT: movlt r1, r0
-; V4T-NEXT: mov r0, #1065353216
-; V4T-NEXT: orr r0, r0, #-1073741824
-; V4T-NEXT: cmn r1, #8388608
-; V4T-NEXT: movgt r0, r1
+; V4T-NEXT: movge r0, r1
+; V4T-NEXT: mov r1, #1065353216
+; V4T-NEXT: orr r1, r1, #-1073741824
+; V4T-NEXT: cmn r0, #8388608
+; V4T-NEXT: movle r0, r1
; V4T-NEXT: bx lr
; V4T-NEXT: .p2align 2
; V4T-NEXT: @ %bb.1:
@@ -54,12 +54,12 @@ define i16 @sat_base_16bit(i16 %x) #0 {
; V4T-NEXT: orr r2, r2, #1792
; V4T-NEXT: asr r1, r1, #16
; V4T-NEXT: cmp r1, r2
-; V4T-NEXT: movlt r2, r0
-; V4T-NEXT: lsl r0, r2, #16
-; V4T-NEXT: asr r1, r0, #16
-; V4T-NEXT: ldr r0, .LCPI1_0
+; V4T-NEXT: movge r0, r2
+; V4T-NEXT: ldr r2, .LCPI1_0
+; V4T-NEXT: lsl r1, r0, #16
+; V4T-NEXT: asr r1, r1, #16
; V4T-NEXT: cmn r1, #2048
-; V4T-NEXT: movgt r0, r2
+; V4T-NEXT: movle r0, r2
; V4T-NEXT: bx lr
; V4T-NEXT: .p2align 2
; V4T-NEXT: @ %bb.1:
@@ -71,12 +71,12 @@ define i16 @sat_base_16bit(i16 %x) #0 {
; V6T2-NEXT: sxth r1, r0
; V6T2-NEXT: movw r2, #2047
; V6T2-NEXT: cmp r1, r2
-; V6T2-NEXT: movlt r2, r0
-; V6T2-NEXT: movw r0, #63488
-; V6T2-NEXT: sxth r1, r2
-; V6T2-NEXT: movt r0, #65535
+; V6T2-NEXT: movge r0, r2
+; V6T2-NEXT: movw r2, #63488
+; V6T2-NEXT: sxth r1, r0
+; V6T2-NEXT: movt r2, #65535
; V6T2-NEXT: cmn r1, #2048
-; V6T2-NEXT: movgt r0, r2
+; V6T2-NEXT: movle r0, r2
; V6T2-NEXT: bx lr
entry:
%0 = icmp slt i16 %x, 2047
@@ -130,11 +130,11 @@ define i32 @sat_lower_upper_1(i32 %x) #0 {
; V4T: @ %bb.0: @ %entry
; V4T-NEXT: ldr r1, .LCPI3_0
; V4T-NEXT: cmp r0, r1
-; V4T-NEXT: movlt r1, r0
-; V4T-NEXT: mov r0, #1065353216
-; V4T-NEXT: orr r0, r0, #-1073741824
-; V4T-NEXT: cmn r1, #8388608
-; V4T-NEXT: movgt r0, r1
+; V4T-NEXT: movge r0, r1
+; V4T-NEXT: mov r1, #1065353216
+; V4T-NEXT: orr r1, r1, #-1073741824
+; V4T-NEXT: cmn r0, #8388608
+; V4T-NEXT: movle r0, r1
; V4T-NEXT: bx lr
; V4T-NEXT: .p2align 2
; V4T-NEXT: @ %bb.1:
@@ -159,11 +159,11 @@ define i32 @sat_lower_upper_2(i32 %x) #0 {
; V4T: @ %bb.0: @ %entry
; V4T-NEXT: ldr r1, .LCPI4_0
; V4T-NEXT: cmp r0, r1
-; V4T-NEXT: movlt r1, r0
-; V4T-NEXT: mov r0, #1065353216
-; V4T-NEXT: orr r0, r0, #-1073741824
-; V4T-NEXT: cmn r1, #8388608
-; V4T-NEXT: movgt r0, r1
+; V4T-NEXT: movge r0, r1
+; V4T-NEXT: mov r1, #1065353216
+; V4T-NEXT: orr r1, r1, #-1073741824
+; V4T-NEXT: cmn r0, #8388608
+; V4T-NEXT: movle r0, r1
; V4T-NEXT: bx lr
; V4T-NEXT: .p2align 2
; V4T-NEXT: @ %bb.1:
@@ -189,11 +189,10 @@ define i32 @sat_upper_lower_1(i32 %x) #0 {
; V4T-NEXT: mov r1, #1065353216
; V4T-NEXT: cmn r0, #8388608
; V4T-NEXT: orr r1, r1, #-1073741824
-; V4T-NEXT: movgt r1, r0
-; V4T-NEXT: ldr r0, .LCPI5_0
-; V4T-NEXT: cmp r1, r0
-; V4T-NEXT: movge r1, r0
-; V4T-NEXT: mov r0, r1
+; V4T-NEXT: movle r0, r1
+; V4T-NEXT: ldr r1, .LCPI5_0
+; V4T-NEXT: cmp r0, r1
+; V4T-NEXT: movge r0, r1
; V4T-NEXT: bx lr
; V4T-NEXT: .p2align 2
; V4T-NEXT: @ %bb.1:
@@ -219,11 +218,10 @@ define i32 @sat_upper_lower_2(i32 %x) #0 {
; V4T-NEXT: mov r1, #1065353216
; V4T-NEXT: cmn r0, #8388608
; V4T-NEXT: orr r1, r1, #-1073741824
-; V4T-NEXT: movgt r1, r0
-; V4T-NEXT: ldr r0, .LCPI6_0
-; V4T-NEXT: cmp r1, r0
-; V4T-NEXT: movge r1, r0
-; V4T-NEXT: mov r0, r1
+; V4T-NEXT: movle r0, r1
+; V4T-NEXT: ldr r1, .LCPI6_0
+; V4T-NEXT: cmp r0, r1
+; V4T-NEXT: movge r0, r1
; V4T-NEXT: bx lr
; V4T-NEXT: .p2align 2
; V4T-NEXT: @ %bb.1:
@@ -249,11 +247,10 @@ define i32 @sat_upper_lower_3(i32 %x) #0 {
; V4T-NEXT: mov r1, #1065353216
; V4T-NEXT: cmn r0, #8388608
; V4T-NEXT: orr r1, r1, #-1073741824
-; V4T-NEXT: movgt r1, r0
-; V4T-NEXT: ldr r0, .LCPI7_0
-; V4T-NEXT: cmp r1, r0
-; V4T-NEXT: movge r1, r0
-; V4T-NEXT: mov r0, r1
+; V4T-NEXT: movle r0, r1
+; V4T-NEXT: ldr r1, .LCPI7_0
+; V4T-NEXT: cmp r0, r1
+; V4T-NEXT: movge r0, r1
; V4T-NEXT: bx lr
; V4T-NEXT: .p2align 2
; V4T-NEXT: @ %bb.1:
@@ -284,11 +281,10 @@ define i32 @sat_le_ge(i32 %x) #0 {
; V4T-NEXT: mov r1, #1065353216
; V4T-NEXT: cmn r0, #8388608
; V4T-NEXT: orr r1, r1, #-1073741824
-; V4T-NEXT: movgt r1, r0
-; V4T-NEXT: ldr r0, .LCPI8_0
-; V4T-NEXT: cmp r1, r0
-; V4T-NEXT: movge r1, r0
-; V4T-NEXT: mov r0, r1
+; V4T-NEXT: movle r0, r1
+; V4T-NEXT: ldr r1, .LCPI8_0
+; V4T-NEXT: cmp r0, r1
+; V4T-NEXT: movge r0, r1
; V4T-NEXT: bx lr
; V4T-NEXT: .p2align 2
; V4T-NEXT: @ %bb.1:
@@ -435,11 +431,10 @@ define i32 @no_sat_incorrect_interval(i32 %x) #0 {
; V4T: @ %bb.0: @ %entry
; V4T-NEXT: ldr r1, .LCPI12_0
; V4T-NEXT: cmp r0, r1
-; V4T-NEXT: movgt r1, r0
-; V4T-NEXT: ldr r0, .LCPI12_1
-; V4T-NEXT: cmp r1, r0
-; V4T-NEXT: movge r1, r0
-; V4T-NEXT: mov r0, r1
+; V4T-NEXT: movle r0, r1
+; V4T-NEXT: ldr r1, .LCPI12_1
+; V4T-NEXT: cmp r0, r1
+; V4T-NEXT: movge r0, r1
; V4T-NEXT: bx lr
; V4T-NEXT: .p2align 2
; V4T-NEXT: @ %bb.1:
@@ -453,12 +448,11 @@ define i32 @no_sat_incorrect_interval(i32 %x) #0 {
; V6T2-NEXT: movw r1, #47768
; V6T2-NEXT: movt r1, #65244
; V6T2-NEXT: cmp r0, r1
-; V6T2-NEXT: movgt r1, r0
-; V6T2-NEXT: movw r0, #65535
-; V6T2-NEXT: movt r0, #127
-; V6T2-NEXT: cmp r1, r0
-; V6T2-NEXT: movge r1, r0
-; V6T2-NEXT: mov r0, r1
+; V6T2-NEXT: movle r0, r1
+; V6T2-NEXT: movw r1, #65535
+; V6T2-NEXT: movt r1, #127
+; V6T2-NEXT: cmp r0, r1
+; V6T2-NEXT: movge r0, r1
; V6T2-NEXT: bx lr
entry:
%0 = icmp sgt i32 %x, -19088744
diff --git a/llvm/test/CodeGen/ARM/usat.ll b/llvm/test/CodeGen/ARM/usat.ll
index 9bb4104f6b71e..84de3c9a0ecae 100644
--- a/llvm/test/CodeGen/ARM/usat.ll
+++ b/llvm/test/CodeGen/ARM/usat.ll
@@ -52,42 +52,39 @@ entry:
define i16 @unsigned_sat_base_16bit(i16 %x) #0 {
; V4T-LABEL: unsigned_sat_base_16bit:
; V4T: @ %bb.0: @ %entry
+; V4T-NEXT: mov r2, #255
; V4T-NEXT: lsl r1, r0, #16
-; V4T-NEXT: asr r2, r1, #16
-; V4T-NEXT: mov r1, #255
-; V4T-NEXT: orr r1, r1, #1792
-; V4T-NEXT: cmp r2, r1
-; V4T-NEXT: movlt r1, r0
-; V4T-NEXT: lsl r0, r1, #16
-; V4T-NEXT: asr r0, r0, #16
-; V4T-NEXT: cmp r0, #0
-; V4T-NEXT: movle r1, #0
-; V4T-NEXT: mov r0, r1
+; V4T-NEXT: orr r2, r2, #1792
+; V4T-NEXT: asr r1, r1, #16
+; V4T-NEXT: cmp r1, r2
+; V4T-NEXT: movge r0, r2
+; V4T-NEXT: lsl r1, r0, #16
+; V4T-NEXT: asr r1, r1, #16
+; V4T-NEXT: cmp r1, #0
+; V4T-NEXT: movle r0, #0
; V4T-NEXT: bx lr
;
; V6-LABEL: unsigned_sat_base_16bit:
; V6: @ %bb.0: @ %entry
-; V6-NEXT: mov r1, #255
-; V6-NEXT: sxth r2, r0
-; V6-NEXT: orr r1, r1, #1792
-; V6-NEXT: cmp r2, r1
-; V6-NEXT: movlt r1, r0
-; V6-NEXT: sxth r0, r1
-; V6-NEXT: cmp r0, #0
-; V6-NEXT: movle r1, #0
-; V6-NEXT: mov r0, r1
+; V6-NEXT: mov r2, #255
+; V6-NEXT: sxth r1, r0
+; V6-NEXT: orr r2, r2, #1792
+; V6-NEXT: cmp r1, r2
+; V6-NEXT: movge r0, r2
+; V6-NEXT: sxth r1, r0
+; V6-NEXT: cmp r1, #0
+; V6-NEXT: movle r0, #0
; V6-NEXT: bx lr
;
; V6T2-LABEL: unsigned_sat_base_16bit:
; V6T2: @ %bb.0: @ %entry
-; V6T2-NEXT: sxth r2, r0
-; V6T2-NEXT: movw r1, #2047
-; V6T2-NEXT: cmp r2, r1
-; V6T2-NEXT: movlt r1, r0
-; V6T2-NEXT: sxth r0, r1
-; V6T2-NEXT: cmp r0, #0
-; V6T2-NEXT: movle r1, #0
-; V6T2-NEXT: mov r0, r1
+; V6T2-NEXT: sxth r1, r0
+; V6T2-NEXT: movw r2, #2047
+; V6T2-NEXT: cmp r1, r2
+; V6T2-NEXT: movge r0, r2
+; V6T2-NEXT: sxth r1, r0
+; V6T2-NEXT: cmp r1, #0
+; V6T2-NEXT: movle r0, #0
; V6T2-NEXT: bx lr
entry:
%0 = icmp slt i16 %x, 2047
diff --git a/llvm/test/CodeGen/SystemZ/int-div-01.ll b/llvm/test/CodeGen/SystemZ/int-div-01.ll
index f67d6dc181eb9..04bff41e4c9b7 100644
--- a/llvm/test/CodeGen/SystemZ/int-div-01.ll
+++ b/llvm/test/CodeGen/SystemZ/int-div-01.ll
@@ -51,7 +51,8 @@ define i32 @f4(i32 %dummy, i32 signext %a, i32 %b) {
; CHECK-NOT: {{%r[234]}}
; CHECK: dsgfr %r2, %r4
; CHECK-NOT: dsgfr
-; CHECK: or %r2, %r3
+; CHECK: or %r3, %r2
+; CHECK: lr %r2, %r3
; CHECK: br %r14
%div = sdiv i32 %a, %b
%rem = srem i32 %a, %b
diff --git a/llvm/test/CodeGen/SystemZ/int-div-03.ll b/llvm/test/CodeGen/SystemZ/int-div-03.ll
index 48f99c2a55c39..c4a00ca0aaafc 100644
--- a/llvm/test/CodeGen/SystemZ/int-div-03.ll
+++ b/llvm/test/CodeGen/SystemZ/int-div-03.ll
@@ -36,7 +36,8 @@ define i64 @f3(i64 %dummy, i64 %a, i32 %b) {
; CHECK-LABEL: f3:
; CHECK-NOT: {{%r[234]}}
; CHECK: dsgfr %r2, %r4
-; CHECK: ogr %r2, %r3
+; CHECK: ogr %r3, %r2
+; CHECK: lgr %r2, %r3
; CHECK: br %r14
%bext = sext i32 %b to i64
%div = sdiv i64 %a, %bext
@@ -102,7 +103,8 @@ define i64 @f8(i64 %dummy, i64 %a, i32 *%src) {
; CHECK-NOT: {{%r[234]}}
; CHECK: dsgf %r2, 0(%r4)
; CHECK-NOT: {{dsgf|dsgfr}}
-; CHECK: ogr %r2, %r3
+; CHECK: ogr %r3, %r2
+; CHECK: lgr %r2, %r3
; CHECK: br %r14
%b = load i32, i32 *%src
%bext = sext i32 %b to i64
diff --git a/llvm/test/CodeGen/SystemZ/int-div-04.ll b/llvm/test/CodeGen/SystemZ/int-div-04.ll
index c06d7bdadceb0..0448eedcba5ac 100644
--- a/llvm/test/CodeGen/SystemZ/int-div-04.ll
+++ b/llvm/test/CodeGen/SystemZ/int-div-04.ll
@@ -34,7 +34,8 @@ define i64 @f3(i64 %dummy1, i64 %a, i64 %b) {
; CHECK-NOT: {{%r[234]}}
; CHECK: dsgr %r2, %r4
; CHECK-NOT: dsgr
-; CHECK: ogr %r2, %r3
+; CHECK: ogr %r3, %r2
+; CHECK: lgr %r2, %r3
; CHECK: br %r14
%div = sdiv i64 %a, %b
%rem = srem i64 %a, %b
@@ -74,7 +75,8 @@ define i64 @f6(i64 %dummy, i64 %a, i64 *%src) {
; CHECK-NOT: {{%r[234]}}
; CHECK: dsg %r2, 0(%r4)
; CHECK-NOT: {{dsg|dsgr}}
-; CHECK: ogr %r2, %r3
+; CHECK: ogr %r3, %r2
+; CHECK: lgr %r2, %r3
; CHECK: br %r14
%b = load i64, i64 *%src
%div = sdiv i64 %a, %b
diff --git a/llvm/test/CodeGen/SystemZ/int-mul-08.ll b/llvm/test/CodeGen/SystemZ/int-mul-08.ll
index 75aba3ceee228..cb61a0c7c4edc 100644
--- a/llvm/test/CodeGen/SystemZ/int-mul-08.ll
+++ b/llvm/test/CodeGen/SystemZ/int-mul-08.ll
@@ -60,7 +60,8 @@ define i64 @f4(i64 %dummy, i64 %a, i64 %b) {
; CHECK-LABEL: f4:
; CHECK-NOT: {{%r[234]}}
; CHECK: mlgr %r2, %r4
-; CHECK: ogr %r2, %r3
+; CHECK: ogr %r3, %r2
+; CHECK: lgr %r2, %r3
; CHECK: br %r14
%ax = zext i64 %a to i128
%bx = zext i64 %b to i128
diff --git a/llvm/test/CodeGen/Thumb/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/Thumb/srem-seteq-illegal-types.ll
index 2eac1f450dcc7..a704d7b508333 100644
--- a/llvm/test/CodeGen/Thumb/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/Thumb/srem-seteq-illegal-types.ll
@@ -117,10 +117,10 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind {
; CHECK-NEXT: mvns r3, r7
; CHECK-NEXT: ldr r0, [sp, #32]
; CHECK-NEXT: bl __aeabi_ldivmod
-; CHECK-NEXT: ands r3, r5
+; CHECK-NEXT: ands r5, r3
; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: eors r2, r0
-; CHECK-NEXT: orrs r2, r3
+; CHECK-NEXT: orrs r2, r5
; CHECK-NEXT: subs r0, r2, #1
; CHECK-NEXT: sbcs r2, r0
; CHECK-NEXT: movs r0, r4
diff --git a/llvm/test/CodeGen/X86/64-bit-shift-by-32-minus-y.ll b/llvm/test/CodeGen/X86/64-bit-shift-by-32-minus-y.ll
index 6f8661f8a2551..6e903819c2ee2 100644
--- a/llvm/test/CodeGen/X86/64-bit-shift-by-32-minus-y.ll
+++ b/llvm/test/CodeGen/X86/64-bit-shift-by-32-minus-y.ll
@@ -284,9 +284,8 @@ define i64 @t4(i64 %val, i64 %shamt) nounwind {
define i64 @t5_cse(i64 %val, i64 %shamt, i64*%dst) nounwind {
; X64-NOBMI2-LABEL: t5_cse:
; X64-NOBMI2: # %bb.0:
-; X64-NOBMI2-NEXT: movq %rsi, %rcx
; X64-NOBMI2-NEXT: movq %rdi, %rax
-; X64-NOBMI2-NEXT: addq $32, %rcx
+; X64-NOBMI2-NEXT: leaq 32(%rsi), %rcx
; X64-NOBMI2-NEXT: movq %rcx, (%rdx)
; X64-NOBMI2-NEXT: negq %rcx
; X64-NOBMI2-NEXT: # kill: def $cl killed $cl killed $rcx
diff --git a/llvm/test/CodeGen/X86/8bit_cmov_of_trunc_promotion.ll b/llvm/test/CodeGen/X86/8bit_cmov_of_trunc_promotion.ll
index 2485599f28007..b8b3440f3aa54 100644
--- a/llvm/test/CodeGen/X86/8bit_cmov_of_trunc_promotion.ll
+++ b/llvm/test/CodeGen/X86/8bit_cmov_of_trunc_promotion.ll
@@ -61,11 +61,13 @@ define i8 @t0(i32 %a1_wide_orig, i32 %a2_wide_orig, i32 %inc) nounwind {
;
; X86_64-LABEL: t0:
; X86_64: # %bb.0:
-; X86_64-NEXT: movl %esi, %eax
-; X86_64-NEXT: addl %edx, %edi
-; X86_64-NEXT: addl %edx, %eax
-; X86_64-NEXT: cmpb %al, %dil
-; X86_64-NEXT: cmovgl %edi, %eax
+; X86_64-NEXT: # kill: def $edx killed $edx def $rdx
+; X86_64-NEXT: # kill: def $esi killed $esi def $rsi
+; X86_64-NEXT: # kill: def $edi killed $edi def $rdi
+; X86_64-NEXT: leal (%rdi,%rdx), %ecx
+; X86_64-NEXT: leal (%rsi,%rdx), %eax
+; X86_64-NEXT: cmpb %al, %cl
+; X86_64-NEXT: cmovgl %ecx, %eax
; X86_64-NEXT: # kill: def $al killed $al killed $eax
; X86_64-NEXT: retq
%a1_wide = add i32 %a1_wide_orig, %inc
@@ -134,11 +136,13 @@ define i8 @neg_only_one_truncation(i32 %a1_wide_orig, i8 %a2_orig, i32 %inc) nou
;
; X86_64-LABEL: neg_only_one_truncation:
; X86_64: # %bb.0:
-; X86_64-NEXT: addl %edx, %edi
+; X86_64-NEXT: # kill: def $edx killed $edx def $rdx
+; X86_64-NEXT: # kill: def $edi killed $edi def $rdi
+; X86_64-NEXT: leal (%rdi,%rdx), %ecx
; X86_64-NEXT: addb %sil, %dl
-; X86_64-NEXT: cmpb %dl, %dil
+; X86_64-NEXT: cmpb %dl, %cl
; X86_64-NEXT: movzbl %dl, %eax
-; X86_64-NEXT: cmovgl %edi, %eax
+; X86_64-NEXT: cmovgl %ecx, %eax
; X86_64-NEXT: # kill: def $al killed $al killed $eax
; X86_64-NEXT: retq
%a1_wide = add i32 %a1_wide_orig, %inc
@@ -205,11 +209,13 @@ define i8 @neg_type_mismatch(i32 %a1_wide_orig, i16 %a2_wide_orig, i32 %inc) nou
;
; X86_64-LABEL: neg_type_mismatch:
; X86_64: # %bb.0:
-; X86_64-NEXT: movl %esi, %eax
-; X86_64-NEXT: addl %edx, %edi
-; X86_64-NEXT: addl %edx, %eax
-; X86_64-NEXT: cmpb %al, %dil
-; X86_64-NEXT: cmovgl %edi, %eax
+; X86_64-NEXT: # kill: def $edx killed $edx def $rdx
+; X86_64-NEXT: # kill: def $esi killed $esi def $rsi
+; X86_64-NEXT: # kill: def $edi killed $edi def $rdi
+; X86_64-NEXT: leal (%rdi,%rdx), %ecx
+; X86_64-NEXT: leal (%rsi,%rdx), %eax
+; X86_64-NEXT: cmpb %al, %cl
+; X86_64-NEXT: cmovgl %ecx, %eax
; X86_64-NEXT: # kill: def $al killed $al killed $eax
; X86_64-NEXT: retq
%a1_wide = add i32 %a1_wide_orig, %inc
@@ -271,8 +277,9 @@ define i8 @negative_CopyFromReg(i32 %a1_wide, i32 %a2_wide_orig, i32 %inc) nounw
;
; X86_64-LABEL: negative_CopyFromReg:
; X86_64: # %bb.0:
-; X86_64-NEXT: movl %esi, %eax
-; X86_64-NEXT: addl %edx, %eax
+; X86_64-NEXT: # kill: def $edx killed $edx def $rdx
+; X86_64-NEXT: # kill: def $esi killed $esi def $rsi
+; X86_64-NEXT: leal (%rsi,%rdx), %eax
; X86_64-NEXT: cmpb %al, %dil
; X86_64-NEXT: cmovgl %edi, %eax
; X86_64-NEXT: # kill: def $al killed $al killed $eax
diff --git a/llvm/test/CodeGen/X86/atomic-unordered.ll b/llvm/test/CodeGen/X86/atomic-unordered.ll
index 5cfb7a3320c7e..0162a0e66ec3c 100644
--- a/llvm/test/CodeGen/X86/atomic-unordered.ll
+++ b/llvm/test/CodeGen/X86/atomic-unordered.ll
@@ -734,11 +734,12 @@ define i64 @load_fold_sdiv1(i64* %p) {
; CHECK-O3-NEXT: movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889
; CHECK-O3-NEXT: movq %rcx, %rax
; CHECK-O3-NEXT: imulq %rdx
-; CHECK-O3-NEXT: addq %rcx, %rdx
-; CHECK-O3-NEXT: movq %rdx, %rax
+; CHECK-O3-NEXT: addq %rdx, %rcx
+; CHECK-O3-NEXT: movq %rcx, %rax
; CHECK-O3-NEXT: shrq $63, %rax
-; CHECK-O3-NEXT: sarq $3, %rdx
-; CHECK-O3-NEXT: addq %rdx, %rax
+; CHECK-O3-NEXT: sarq $3, %rcx
+; CHECK-O3-NEXT: addq %rax, %rcx
+; CHECK-O3-NEXT: movq %rcx, %rax
; CHECK-O3-NEXT: retq
%v = load atomic i64, i64* %p unordered, align 8
%ret = sdiv i64 %v, 15
diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
index 71682094d64e9..9d71fe78a6a89 100644
--- a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
+++ b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
@@ -393,10 +393,10 @@ define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask2(<32 x i16> %vec, <1
define <16 x i16> @test_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec) {
; CHECK-LABEL: test_32xi16_to_16xi16_perm_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5]
-; CHECK-NEXT: vpermi2w %ymm2, %ymm0, %ymm1
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5]
+; CHECK-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; CHECK-NEXT: retq
%res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 1, i32 0, i32 30, i32 5, i32 3, i32 6, i32 25, i32 29, i32 0, i32 13, i32 3, i32 8, i32 7, i32 20, i32 11, i32 5>
ret <16 x i16> %res
@@ -419,11 +419,11 @@ define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec, <16
define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec, <16 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5]
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5]
; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
-; CHECK-NEXT: vpermi2w %ymm3, %ymm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vmovdqa %ymm2, %ymm0
+; CHECK-NEXT: vpermt2w %ymm2, %ymm3, %ymm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; CHECK-NEXT: retq
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 1, i32 0, i32 30, i32 5, i32 3, i32 6, i32 25, i32 29, i32 0, i32 13, i32 3, i32 8, i32 7, i32 20, i32 11, i32 5>
%cmp = icmp eq <16 x i16> %mask, zeroinitializer
@@ -2248,11 +2248,11 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64
define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask7:
; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm3
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [2,0,3,4]
+; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm2
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [2,0,3,4]
; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
-; CHECK-NEXT: vpermi2q %ymm3, %ymm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vmovdqa %ymm2, %ymm0
+; CHECK-NEXT: vpermt2q %ymm2, %ymm3, %ymm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 4>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -4029,12 +4029,12 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask4(<8 x double> %v
define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask4(<8 x double> %vec, <4 x double> %mask) {
; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask4:
; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: vextractf32x4 $2, %zmm0, %xmm3
-; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm2 = [1,1,5,5]
+; CHECK-FAST-NEXT: vextractf32x4 $2, %zmm0, %xmm2
+; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm3 = [1,1,5,5]
; CHECK-FAST-NEXT: vxorpd %xmm4, %xmm4, %xmm4
; CHECK-FAST-NEXT: vcmpeqpd %ymm4, %ymm1, %k1
-; CHECK-FAST-NEXT: vpermi2pd %ymm3, %ymm0, %ymm2 {%k1} {z}
-; CHECK-FAST-NEXT: vmovapd %ymm2, %ymm0
+; CHECK-FAST-NEXT: vpermt2pd %ymm2, %ymm3, %ymm0 {%k1} {z}
+; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; CHECK-FAST-NEXT: retq
;
; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask4:
diff --git a/llvm/test/CodeGen/X86/bitreverse.ll b/llvm/test/CodeGen/X86/bitreverse.ll
index 8433c7583d138..7ac030139efd8 100644
--- a/llvm/test/CodeGen/X86/bitreverse.ll
+++ b/llvm/test/CodeGen/X86/bitreverse.ll
@@ -372,13 +372,13 @@ define i8 @test_bitreverse_i8(i8 %a) {
; X64-NEXT: shlb $2, %al
; X64-NEXT: shrb $2, %dil
; X64-NEXT: andb $51, %dil
-; X64-NEXT: orb %al, %dil
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: andb $85, %al
-; X64-NEXT: addb %al, %al
-; X64-NEXT: shrb %dil
-; X64-NEXT: andb $85, %dil
; X64-NEXT: orb %dil, %al
+; X64-NEXT: movl %eax, %ecx
+; X64-NEXT: andb $85, %cl
+; X64-NEXT: addb %cl, %cl
+; X64-NEXT: shrb %al
+; X64-NEXT: andb $85, %al
+; X64-NEXT: orb %cl, %al
; X64-NEXT: retq
;
; X86XOP-LABEL: test_bitreverse_i8:
@@ -422,13 +422,13 @@ define i4 @test_bitreverse_i4(i4 %a) {
; X64-NEXT: shlb $2, %al
; X64-NEXT: shrb $2, %dil
; X64-NEXT: andb $51, %dil
-; X64-NEXT: orb %al, %dil
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: andb $80, %al
-; X64-NEXT: addb %al, %al
-; X64-NEXT: shrb %dil
-; X64-NEXT: andb $80, %dil
; X64-NEXT: orb %dil, %al
+; X64-NEXT: movl %eax, %ecx
+; X64-NEXT: andb $80, %cl
+; X64-NEXT: addb %cl, %cl
+; X64-NEXT: shrb %al
+; X64-NEXT: andb $80, %al
+; X64-NEXT: orb %cl, %al
; X64-NEXT: shrb $4, %al
; X64-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/bmi2.ll b/llvm/test/CodeGen/X86/bmi2.ll
index 8c86c5eb3007b..a32c3097561bd 100644
--- a/llvm/test/CodeGen/X86/bmi2.ll
+++ b/llvm/test/CodeGen/X86/bmi2.ll
@@ -305,10 +305,10 @@ define i32 @mulx32(i32 %x, i32 %y, i32* %p) {
;
; X64-LABEL: mulx32:
; X64: # %bb.0:
-; X64-NEXT: movl %esi, %eax
+; X64-NEXT: # kill: def $esi killed $esi def $rsi
; X64-NEXT: # kill: def $edi killed $edi def $rdi
; X64-NEXT: addl %edi, %edi
-; X64-NEXT: addl %eax, %eax
+; X64-NEXT: leal (%rsi,%rsi), %eax
; X64-NEXT: imulq %rdi, %rax
; X64-NEXT: movq %rax, %rcx
; X64-NEXT: shrq $32, %rcx
@@ -340,8 +340,8 @@ define i32 @mulx32_load(i32 %x, i32* %y, i32* %p) {
;
; X64-LABEL: mulx32_load:
; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: addl %eax, %eax
+; X64-NEXT: # kill: def $edi killed $edi def $rdi
+; X64-NEXT: leal (%rdi,%rdi), %eax
; X64-NEXT: movl (%rsi), %ecx
; X64-NEXT: imulq %rcx, %rax
; X64-NEXT: movq %rax, %rcx
diff --git a/llvm/test/CodeGen/X86/bypass-slow-division-32.ll b/llvm/test/CodeGen/X86/bypass-slow-division-32.ll
index 988e4f9b9ca85..fe4201d9a2f73 100644
--- a/llvm/test/CodeGen/X86/bypass-slow-division-32.ll
+++ b/llvm/test/CodeGen/X86/bypass-slow-division-32.ll
@@ -175,12 +175,12 @@ define i32 @Test_use_divrem_reg_imm(i32 %a) nounwind {
; CHECK-NEXT: movl %edx, %eax
; CHECK-NEXT: shrl $31, %eax
; CHECK-NEXT: sarl $3, %edx
-; CHECK-NEXT: addl %eax, %edx
-; CHECK-NEXT: movl %edx, %eax
-; CHECK-NEXT: shll $5, %eax
; CHECK-NEXT: addl %edx, %eax
-; CHECK-NEXT: subl %eax, %ecx
-; CHECK-NEXT: addl %edx, %ecx
+; CHECK-NEXT: movl %eax, %edx
+; CHECK-NEXT: shll $5, %edx
+; CHECK-NEXT: addl %eax, %edx
+; CHECK-NEXT: subl %edx, %ecx
+; CHECK-NEXT: addl %eax, %ecx
; CHECK-NEXT: movl %ecx, %eax
; CHECK-NEXT: retl
%resultdiv = sdiv i32 %a, 33
diff --git a/llvm/test/CodeGen/X86/combine-bitselect.ll b/llvm/test/CodeGen/X86/combine-bitselect.ll
index 207bfeedcfecb..d25baf1b54e3b 100644
--- a/llvm/test/CodeGen/X86/combine-bitselect.ll
+++ b/llvm/test/CodeGen/X86/combine-bitselect.ll
@@ -991,14 +991,13 @@ define <4 x i1> @bitselect_v4i1_loop(<4 x i32> %a0, <4 x i32> %a1) {
; SSE-LABEL: bitselect_v4i1_loop:
; SSE: # %bb.0: # %bb
; SSE-NEXT: pxor %xmm2, %xmm2
-; SSE-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE-NEXT: movdqa {{.*#+}} xmm0 = [12,12,12,12]
-; SSE-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE-NEXT: pcmpeqd %xmm2, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [12,12,12,12]
+; SSE-NEXT: pcmpeqd %xmm1, %xmm2
; SSE-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE-NEXT: pand %xmm2, %xmm1
-; SSE-NEXT: pandn %xmm0, %xmm2
-; SSE-NEXT: por %xmm1, %xmm2
-; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: pand %xmm0, %xmm1
+; SSE-NEXT: pandn %xmm2, %xmm0
+; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: retq
;
; XOP-LABEL: bitselect_v4i1_loop:
diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll
index 4ac4056bc0dde..b60c1d3f1317d 100644
--- a/llvm/test/CodeGen/X86/combine-sdiv.ll
+++ b/llvm/test/CodeGen/X86/combine-sdiv.ll
@@ -1613,15 +1613,14 @@ define <4 x i64> @combine_vec_sdiv_by_pow2b_v4i64(<4 x i64> %x) {
; SSE2-NEXT: psrlq $61, %xmm3
; SSE2-NEXT: psrlq $60, %xmm2
; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
-; SSE2-NEXT: paddq %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: psrlq $3, %xmm1
-; SSE2-NEXT: psrlq $4, %xmm2
-; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
-; SSE2-NEXT: movapd {{.*#+}} xmm1 = [1152921504606846976,576460752303423488]
-; SSE2-NEXT: xorpd %xmm1, %xmm2
-; SSE2-NEXT: psubq %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: paddq %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psrlq $3, %xmm2
+; SSE2-NEXT: psrlq $4, %xmm1
+; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
+; SSE2-NEXT: movapd {{.*#+}} xmm2 = [1152921504606846976,576460752303423488]
+; SSE2-NEXT: xorpd %xmm2, %xmm1
+; SSE2-NEXT: psubq %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
@@ -1642,15 +1641,14 @@ define <4 x i64> @combine_vec_sdiv_by_pow2b_v4i64(<4 x i64> %x) {
; SSE41-NEXT: psrlq $60, %xmm3
; SSE41-NEXT: psrlq $61, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
-; SSE41-NEXT: paddq %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm1
-; SSE41-NEXT: psrlq $4, %xmm1
-; SSE41-NEXT: psrlq $3, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1152921504606846976,576460752303423488]
-; SSE41-NEXT: pxor %xmm1, %xmm2
-; SSE41-NEXT: psubq %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm1
+; SSE41-NEXT: paddq %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psrlq $4, %xmm2
+; SSE41-NEXT: psrlq $3, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1152921504606846976,576460752303423488]
+; SSE41-NEXT: pxor %xmm2, %xmm1
+; SSE41-NEXT: psubq %xmm2, %xmm1
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
@@ -1762,14 +1760,14 @@ define <8 x i64> @combine_vec_sdiv_by_pow2b_v8i64(<8 x i64> %x) {
; SSE2-NEXT: psrlq $61, %xmm5
; SSE2-NEXT: psrlq $60, %xmm4
; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm5[0],xmm4[1]
-; SSE2-NEXT: paddq %xmm1, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm1
-; SSE2-NEXT: psrlq $3, %xmm1
-; SSE2-NEXT: psrlq $4, %xmm4
-; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1]
-; SSE2-NEXT: movapd {{.*#+}} xmm1 = [1152921504606846976,576460752303423488]
-; SSE2-NEXT: xorpd %xmm1, %xmm4
-; SSE2-NEXT: psubq %xmm1, %xmm4
+; SSE2-NEXT: paddq %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: psrlq $3, %xmm4
+; SSE2-NEXT: psrlq $4, %xmm1
+; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1]
+; SSE2-NEXT: movapd {{.*#+}} xmm4 = [1152921504606846976,576460752303423488]
+; SSE2-NEXT: xorpd %xmm4, %xmm1
+; SSE2-NEXT: psubq %xmm4, %xmm1
; SSE2-NEXT: movdqa %xmm3, %xmm5
; SSE2-NEXT: psrad $31, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
@@ -1777,68 +1775,64 @@ define <8 x i64> @combine_vec_sdiv_by_pow2b_v8i64(<8 x i64> %x) {
; SSE2-NEXT: psrlq $61, %xmm6
; SSE2-NEXT: psrlq $60, %xmm5
; SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm6[0],xmm5[1]
-; SSE2-NEXT: paddq %xmm3, %xmm5
-; SSE2-NEXT: movdqa %xmm5, %xmm3
-; SSE2-NEXT: psrlq $3, %xmm3
-; SSE2-NEXT: psrlq $4, %xmm5
-; SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1]
-; SSE2-NEXT: xorpd %xmm1, %xmm5
-; SSE2-NEXT: psubq %xmm1, %xmm5
-; SSE2-NEXT: movdqa %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm5, %xmm3
+; SSE2-NEXT: paddq %xmm5, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: psrlq $3, %xmm5
+; SSE2-NEXT: psrlq $4, %xmm3
+; SSE2-NEXT: movsd {{.*#+}} xmm3 = xmm5[0],xmm3[1]
+; SSE2-NEXT: xorpd %xmm4, %xmm3
+; SSE2-NEXT: psubq %xmm4, %xmm3
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm1, %xmm4
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrad $31, %xmm1
-; SSE41-NEXT: psrlq $62, %xmm1
-; SSE41-NEXT: paddq %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm5
+; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: psrad $31, %xmm4
+; SSE41-NEXT: psrlq $62, %xmm4
+; SSE41-NEXT: paddq %xmm0, %xmm4
+; SSE41-NEXT: movdqa %xmm4, %xmm5
; SSE41-NEXT: psrad $2, %xmm5
-; SSE41-NEXT: psrlq $2, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5],xmm5[6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; SSE41-NEXT: movdqa %xmm2, %xmm1
-; SSE41-NEXT: psrad $31, %xmm1
-; SSE41-NEXT: psrlq $62, %xmm1
-; SSE41-NEXT: paddq %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm5
+; SSE41-NEXT: psrlq $2, %xmm4
+; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7]
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: psrad $31, %xmm4
+; SSE41-NEXT: psrlq $62, %xmm4
+; SSE41-NEXT: paddq %xmm2, %xmm4
+; SSE41-NEXT: movdqa %xmm4, %xmm5
; SSE41-NEXT: psrad $2, %xmm5
-; SSE41-NEXT: psrlq $2, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5],xmm5[6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; SSE41-NEXT: movdqa %xmm4, %xmm1
-; SSE41-NEXT: psrad $31, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT: movdqa %xmm1, %xmm5
+; SSE41-NEXT: psrlq $2, %xmm4
+; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7]
+; SSE41-NEXT: movdqa %xmm1, %xmm4
+; SSE41-NEXT: psrad $31, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE41-NEXT: movdqa %xmm4, %xmm5
; SSE41-NEXT: psrlq $60, %xmm5
-; SSE41-NEXT: psrlq $61, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm5[4,5,6,7]
+; SSE41-NEXT: psrlq $61, %xmm4
+; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4,5,6,7]
; SSE41-NEXT: paddq %xmm4, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm4
; SSE41-NEXT: psrlq $4, %xmm4
; SSE41-NEXT: psrlq $3, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7]
-; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [1152921504606846976,576460752303423488]
-; SSE41-NEXT: pxor %xmm5, %xmm1
-; SSE41-NEXT: psubq %xmm5, %xmm1
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: psrad $31, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE41-NEXT: movdqa %xmm4, %xmm6
+; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [1152921504606846976,576460752303423488]
+; SSE41-NEXT: pxor %xmm4, %xmm1
+; SSE41-NEXT: psubq %xmm4, %xmm1
+; SSE41-NEXT: movdqa %xmm3, %xmm5
+; SSE41-NEXT: psrad $31, %xmm5
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE41-NEXT: movdqa %xmm5, %xmm6
; SSE41-NEXT: psrlq $60, %xmm6
-; SSE41-NEXT: psrlq $61, %xmm4
-; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4,5,6,7]
-; SSE41-NEXT: paddq %xmm3, %xmm4
-; SSE41-NEXT: movdqa %xmm4, %xmm3
-; SSE41-NEXT: psrlq $4, %xmm3
-; SSE41-NEXT: psrlq $3, %xmm4
-; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
-; SSE41-NEXT: pxor %xmm5, %xmm4
-; SSE41-NEXT: psubq %xmm5, %xmm4
-; SSE41-NEXT: movdqa %xmm4, %xmm3
+; SSE41-NEXT: psrlq $61, %xmm5
+; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4,5,6,7]
+; SSE41-NEXT: paddq %xmm5, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm5
+; SSE41-NEXT: psrlq $4, %xmm5
+; SSE41-NEXT: psrlq $3, %xmm3
+; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm5[4,5,6,7]
+; SSE41-NEXT: pxor %xmm4, %xmm3
+; SSE41-NEXT: psubq %xmm4, %xmm3
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
@@ -1968,29 +1962,28 @@ define <4 x i32> @combine_vec_sdiv_by_pow2b_PosAndNeg(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psrld $28, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: psrld $29, %xmm3
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
-; SSE2-NEXT: psrld $30, %xmm0
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0,3]
-; SSE2-NEXT: paddd %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psrld $30, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,3]
+; SSE2-NEXT: paddd %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psrad $4, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: psrad $3, %xmm3
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
-; SSE2-NEXT: psrad $2, %xmm0
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0,3]
-; SSE2-NEXT: movaps %xmm0, %xmm2
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm0[2,3]
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: psubd %xmm2, %xmm3
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT: psrad $2, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: psubd %xmm1, %xmm2
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg:
@@ -2340,10 +2333,10 @@ define <4 x i32> @non_splat_minus_one_divisor_2(<4 x i32> %A) {
; SSE41-NEXT: psrld $31, %xmm1
; SSE41-NEXT: paddd %xmm0, %xmm1
; SSE41-NEXT: psrad $1, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; SSE41-NEXT: pxor %xmm0, %xmm0
-; SSE41-NEXT: psubd %xmm1, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT: pxor %xmm1, %xmm1
+; SSE41-NEXT: psubd %xmm0, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: non_splat_minus_one_divisor_2:
@@ -2480,30 +2473,28 @@ define <8 x i16> @combine_vec_sdiv_nonuniform3(<8 x i16> %x) {
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [45591,45591,45591,45591,32833,32833,32833,32833]
; SSE2-NEXT: pmulhw %xmm0, %xmm1
-; SSE2-NEXT: paddw %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psraw $4, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: paddw %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psraw $4, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psraw $8, %xmm2
-; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
-; SSE2-NEXT: psrlw $15, %xmm1
-; SSE2-NEXT: paddw %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
+; SSE2-NEXT: psrlw $15, %xmm0
+; SSE2-NEXT: paddw %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_sdiv_nonuniform3:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [45591,45591,45591,45591,32833,32833,32833,32833]
; SSE41-NEXT: pmulhw %xmm0, %xmm1
-; SSE41-NEXT: paddw %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psraw $8, %xmm0
-; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: paddw %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psraw $8, %xmm1
+; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psraw $4, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
-; SSE41-NEXT: psrlw $15, %xmm1
-; SSE41-NEXT: paddw %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT: psrlw $15, %xmm0
+; SSE41-NEXT: paddw %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_vec_sdiv_nonuniform3:
@@ -2918,26 +2909,25 @@ define <8 x i16> @combine_vec_sdiv_nonuniform7(<8 x i16> %x) {
define <16 x i8> @pr38658(<16 x i8> %x) {
; SSE2-LABEL: pr38658:
; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
-; SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE2-NEXT: psrlw $8, %xmm3
; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: packuswb %xmm3, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; SSE2-NEXT: psraw $8, %xmm2
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE2-NEXT: psrlw $8, %xmm2
-; SSE2-NEXT: packuswb %xmm2, %xmm0
-; SSE2-NEXT: psrlw $7, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: packuswb %xmm2, %xmm3
+; SSE2-NEXT: paddb %xmm3, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; SSE2-NEXT: psraw $8, %xmm1
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: psrlw $8, %xmm1
+; SSE2-NEXT: packuswb %xmm1, %xmm2
+; SSE2-NEXT: psrlw $7, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: pr38658:
@@ -2948,21 +2938,20 @@ define <16 x i8> @pr38658(<16 x i8> %x) {
; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT: psrlw $8, %xmm2
; SSE41-NEXT: packuswb %xmm2, %xmm1
-; SSE41-NEXT: paddb %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; SSE41-NEXT: psraw $8, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: paddb %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; SSE41-NEXT: psraw $8, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psllw $6, %xmm2
-; SSE41-NEXT: psllw $8, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
-; SSE41-NEXT: psrlw $8, %xmm0
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; SSE41-NEXT: packuswb %xmm0, %xmm2
-; SSE41-NEXT: psrlw $7, %xmm1
-; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE41-NEXT: paddb %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: psllw $8, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm2[7]
+; SSE41-NEXT: psrlw $8, %xmm1
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT: packuswb %xmm1, %xmm2
+; SSE41-NEXT: psrlw $7, %xmm0
+; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: paddb %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: pr38658:
diff --git a/llvm/test/CodeGen/X86/ctpop-combine.ll b/llvm/test/CodeGen/X86/ctpop-combine.ll
index 0040ee2ffeb5a..0070b1a4ac922 100644
--- a/llvm/test/CodeGen/X86/ctpop-combine.ll
+++ b/llvm/test/CodeGen/X86/ctpop-combine.ll
@@ -93,14 +93,14 @@ define i8 @test4(i8 %x) nounwind readnone {
; NO-POPCOUNT-NEXT: shrb %al
; NO-POPCOUNT-NEXT: andb $21, %al
; NO-POPCOUNT-NEXT: subb %al, %dil
-; NO-POPCOUNT-NEXT: movl %edi, %eax
-; NO-POPCOUNT-NEXT: andb $51, %al
+; NO-POPCOUNT-NEXT: movl %edi, %ecx
+; NO-POPCOUNT-NEXT: andb $51, %cl
; NO-POPCOUNT-NEXT: shrb $2, %dil
; NO-POPCOUNT-NEXT: andb $51, %dil
-; NO-POPCOUNT-NEXT: addb %al, %dil
-; NO-POPCOUNT-NEXT: movl %edi, %eax
+; NO-POPCOUNT-NEXT: addb %dil, %cl
+; NO-POPCOUNT-NEXT: movl %ecx, %eax
; NO-POPCOUNT-NEXT: shrb $4, %al
-; NO-POPCOUNT-NEXT: addb %dil, %al
+; NO-POPCOUNT-NEXT: addb %cl, %al
; NO-POPCOUNT-NEXT: andb $15, %al
; NO-POPCOUNT-NEXT: retq
%x2 = and i8 %x, 127
diff --git a/llvm/test/CodeGen/X86/divide-by-constant.ll b/llvm/test/CodeGen/X86/divide-by-constant.ll
index 23a3d1e095ca7..6cb36ea5609b7 100644
--- a/llvm/test/CodeGen/X86/divide-by-constant.ll
+++ b/llvm/test/CodeGen/X86/divide-by-constant.ll
@@ -442,10 +442,9 @@ define { i64, i32 } @PR38622_signed(i64) nounwind {
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: imulq %rcx
; X64-NEXT: movq %rdx, %rax
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: shrq $63, %rcx
-; X64-NEXT: sarq $28, %rax
-; X64-NEXT: addq %rcx, %rax
+; X64-NEXT: shrq $63, %rax
+; X64-NEXT: sarq $28, %rdx
+; X64-NEXT: addq %rdx, %rax
; X64-NEXT: imull $-294967296, %eax, %ecx # imm = 0xEE6B2800
; X64-NEXT: subl %ecx, %edi
; X64-NEXT: movl %edi, %edx
diff --git a/llvm/test/CodeGen/X86/fpclamptosat.ll b/llvm/test/CodeGen/X86/fpclamptosat.ll
index dd21c081f891a..aba2740778ac2 100644
--- a/llvm/test/CodeGen/X86/fpclamptosat.ll
+++ b/llvm/test/CodeGen/X86/fpclamptosat.ll
@@ -430,19 +430,18 @@ define i64 @stest_f64i64(double %x) {
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: callq __fixdfti at PLT
-; CHECK-NEXT: xorl %esi, %esi
-; CHECK-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
-; CHECK-NEXT: cmpq %rcx, %rax
+; CHECK-NEXT: xorl %ecx, %ecx
+; CHECK-NEXT: movabsq $9223372036854775807, %rsi # imm = 0x7FFFFFFFFFFFFFFF
+; CHECK-NEXT: cmpq %rsi, %rax
; CHECK-NEXT: movq %rdx, %rdi
; CHECK-NEXT: sbbq $0, %rdi
-; CHECK-NEXT: cmovlq %rdx, %rsi
-; CHECK-NEXT: cmovlq %rax, %rcx
-; CHECK-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; CHECK-NEXT: cmpq %rcx, %rax
-; CHECK-NEXT: movq $-1, %rdx
-; CHECK-NEXT: sbbq %rsi, %rdx
-; CHECK-NEXT: cmovgeq %rax, %rcx
-; CHECK-NEXT: movq %rcx, %rax
+; CHECK-NEXT: cmovlq %rdx, %rcx
+; CHECK-NEXT: cmovgeq %rsi, %rax
+; CHECK-NEXT: movabsq $-9223372036854775808, %rdx # imm = 0x8000000000000000
+; CHECK-NEXT: cmpq %rax, %rdx
+; CHECK-NEXT: movq $-1, %rsi
+; CHECK-NEXT: sbbq %rcx, %rsi
+; CHECK-NEXT: cmovgeq %rdx, %rax
; CHECK-NEXT: popq %rcx
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
@@ -511,19 +510,18 @@ define i64 @stest_f32i64(float %x) {
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: callq __fixsfti at PLT
-; CHECK-NEXT: xorl %esi, %esi
-; CHECK-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
-; CHECK-NEXT: cmpq %rcx, %rax
+; CHECK-NEXT: xorl %ecx, %ecx
+; CHECK-NEXT: movabsq $9223372036854775807, %rsi # imm = 0x7FFFFFFFFFFFFFFF
+; CHECK-NEXT: cmpq %rsi, %rax
; CHECK-NEXT: movq %rdx, %rdi
; CHECK-NEXT: sbbq $0, %rdi
-; CHECK-NEXT: cmovlq %rdx, %rsi
-; CHECK-NEXT: cmovlq %rax, %rcx
-; CHECK-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; CHECK-NEXT: cmpq %rcx, %rax
-; CHECK-NEXT: movq $-1, %rdx
-; CHECK-NEXT: sbbq %rsi, %rdx
-; CHECK-NEXT: cmovgeq %rax, %rcx
-; CHECK-NEXT: movq %rcx, %rax
+; CHECK-NEXT: cmovlq %rdx, %rcx
+; CHECK-NEXT: cmovgeq %rsi, %rax
+; CHECK-NEXT: movabsq $-9223372036854775808, %rdx # imm = 0x8000000000000000
+; CHECK-NEXT: cmpq %rax, %rdx
+; CHECK-NEXT: movq $-1, %rsi
+; CHECK-NEXT: sbbq %rcx, %rsi
+; CHECK-NEXT: cmovgeq %rdx, %rax
; CHECK-NEXT: popq %rcx
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
@@ -594,19 +592,18 @@ define i64 @stest_f16i64(half %x) {
; CHECK-NEXT: movzwl %di, %edi
; CHECK-NEXT: callq __gnu_h2f_ieee at PLT
; CHECK-NEXT: callq __fixsfti at PLT
-; CHECK-NEXT: xorl %esi, %esi
-; CHECK-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
-; CHECK-NEXT: cmpq %rcx, %rax
+; CHECK-NEXT: xorl %ecx, %ecx
+; CHECK-NEXT: movabsq $9223372036854775807, %rsi # imm = 0x7FFFFFFFFFFFFFFF
+; CHECK-NEXT: cmpq %rsi, %rax
; CHECK-NEXT: movq %rdx, %rdi
; CHECK-NEXT: sbbq $0, %rdi
-; CHECK-NEXT: cmovlq %rdx, %rsi
-; CHECK-NEXT: cmovlq %rax, %rcx
-; CHECK-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; CHECK-NEXT: cmpq %rcx, %rax
-; CHECK-NEXT: movq $-1, %rdx
-; CHECK-NEXT: sbbq %rsi, %rdx
-; CHECK-NEXT: cmovgeq %rax, %rcx
-; CHECK-NEXT: movq %rcx, %rax
+; CHECK-NEXT: cmovlq %rdx, %rcx
+; CHECK-NEXT: cmovgeq %rsi, %rax
+; CHECK-NEXT: movabsq $-9223372036854775808, %rdx # imm = 0x8000000000000000
+; CHECK-NEXT: cmpq %rax, %rdx
+; CHECK-NEXT: movq $-1, %rsi
+; CHECK-NEXT: sbbq %rcx, %rsi
+; CHECK-NEXT: cmovgeq %rdx, %rax
; CHECK-NEXT: popq %rcx
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
@@ -1084,18 +1081,17 @@ define i64 @stest_f64i64_mm(double %x) {
; CHECK-NEXT: cmovbq %rax, %rsi
; CHECK-NEXT: xorl %edi, %edi
; CHECK-NEXT: testq %rdx, %rdx
-; CHECK-NEXT: cmovsq %rax, %rcx
-; CHECK-NEXT: cmoveq %rsi, %rcx
+; CHECK-NEXT: cmovnsq %rcx, %rax
+; CHECK-NEXT: cmoveq %rsi, %rax
; CHECK-NEXT: cmovsq %rdx, %rdi
; CHECK-NEXT: testq %rdi, %rdi
-; CHECK-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; CHECK-NEXT: movq %rax, %rdx
-; CHECK-NEXT: cmovnsq %rcx, %rdx
-; CHECK-NEXT: cmpq %rax, %rcx
-; CHECK-NEXT: cmovbeq %rax, %rcx
+; CHECK-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
+; CHECK-NEXT: movq %rcx, %rdx
+; CHECK-NEXT: cmovnsq %rax, %rdx
+; CHECK-NEXT: cmpq %rcx, %rax
+; CHECK-NEXT: cmovbeq %rcx, %rax
; CHECK-NEXT: cmpq $-1, %rdi
-; CHECK-NEXT: cmovneq %rdx, %rcx
-; CHECK-NEXT: movq %rcx, %rax
+; CHECK-NEXT: cmovneq %rdx, %rax
; CHECK-NEXT: popq %rcx
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
@@ -1166,18 +1162,17 @@ define i64 @stest_f32i64_mm(float %x) {
; CHECK-NEXT: cmovbq %rax, %rsi
; CHECK-NEXT: xorl %edi, %edi
; CHECK-NEXT: testq %rdx, %rdx
-; CHECK-NEXT: cmovsq %rax, %rcx
-; CHECK-NEXT: cmoveq %rsi, %rcx
+; CHECK-NEXT: cmovnsq %rcx, %rax
+; CHECK-NEXT: cmoveq %rsi, %rax
; CHECK-NEXT: cmovsq %rdx, %rdi
; CHECK-NEXT: testq %rdi, %rdi
-; CHECK-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; CHECK-NEXT: movq %rax, %rdx
-; CHECK-NEXT: cmovnsq %rcx, %rdx
-; CHECK-NEXT: cmpq %rax, %rcx
-; CHECK-NEXT: cmovbeq %rax, %rcx
+; CHECK-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
+; CHECK-NEXT: movq %rcx, %rdx
+; CHECK-NEXT: cmovnsq %rax, %rdx
+; CHECK-NEXT: cmpq %rcx, %rax
+; CHECK-NEXT: cmovbeq %rcx, %rax
; CHECK-NEXT: cmpq $-1, %rdi
-; CHECK-NEXT: cmovneq %rdx, %rcx
-; CHECK-NEXT: movq %rcx, %rax
+; CHECK-NEXT: cmovneq %rdx, %rax
; CHECK-NEXT: popq %rcx
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
@@ -1250,18 +1245,17 @@ define i64 @stest_f16i64_mm(half %x) {
; CHECK-NEXT: cmovbq %rax, %rsi
; CHECK-NEXT: xorl %edi, %edi
; CHECK-NEXT: testq %rdx, %rdx
-; CHECK-NEXT: cmovsq %rax, %rcx
-; CHECK-NEXT: cmoveq %rsi, %rcx
+; CHECK-NEXT: cmovnsq %rcx, %rax
+; CHECK-NEXT: cmoveq %rsi, %rax
; CHECK-NEXT: cmovsq %rdx, %rdi
; CHECK-NEXT: testq %rdi, %rdi
-; CHECK-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; CHECK-NEXT: movq %rax, %rdx
-; CHECK-NEXT: cmovnsq %rcx, %rdx
-; CHECK-NEXT: cmpq %rax, %rcx
-; CHECK-NEXT: cmovbeq %rax, %rcx
+; CHECK-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
+; CHECK-NEXT: movq %rcx, %rdx
+; CHECK-NEXT: cmovnsq %rax, %rdx
+; CHECK-NEXT: cmpq %rcx, %rax
+; CHECK-NEXT: cmovbeq %rcx, %rax
; CHECK-NEXT: cmpq $-1, %rdi
-; CHECK-NEXT: cmovneq %rdx, %rcx
-; CHECK-NEXT: movq %rcx, %rax
+; CHECK-NEXT: cmovneq %rdx, %rax
; CHECK-NEXT: popq %rcx
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/haddsub-3.ll b/llvm/test/CodeGen/X86/haddsub-3.ll
index baf7be7afccb1..4805945161a9f 100644
--- a/llvm/test/CodeGen/X86/haddsub-3.ll
+++ b/llvm/test/CodeGen/X86/haddsub-3.ll
@@ -11,18 +11,18 @@ define float @pr26491(<4 x float> %a0) {
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3]
-; SSE2-NEXT: addps %xmm0, %xmm1
-; SSE2-NEXT: movaps %xmm1, %xmm0
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE2-NEXT: addps %xmm1, %xmm0
+; SSE2-NEXT: movaps %xmm0, %xmm1
+; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-SLOW-LABEL: pr26491:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSSE3-SLOW-NEXT: addps %xmm0, %xmm1
-; SSSE3-SLOW-NEXT: movaps %xmm1, %xmm0
-; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSSE3-SLOW-NEXT: addps %xmm1, %xmm0
+; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm1
+; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSSE3-SLOW-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/haddsub-shuf.ll b/llvm/test/CodeGen/X86/haddsub-shuf.ll
index 74c10d365493a..06f36146baae5 100644
--- a/llvm/test/CodeGen/X86/haddsub-shuf.ll
+++ b/llvm/test/CodeGen/X86/haddsub-shuf.ll
@@ -479,8 +479,8 @@ define <8 x i32> @hadd_v8i32a(<8 x i32> %a) {
; SSE3-LABEL: hadd_v8i32a:
; SSE3: # %bb.0:
; SSE3-NEXT: movaps %xmm0, %xmm2
-; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm1[1,3]
+; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2]
+; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; SSE3-NEXT: paddd %xmm0, %xmm2
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
; SSE3-NEXT: movdqa %xmm2, %xmm1
diff --git a/llvm/test/CodeGen/X86/haddsub.ll b/llvm/test/CodeGen/X86/haddsub.ll
index 8e7cadcd15565..19f562fa57262 100644
--- a/llvm/test/CodeGen/X86/haddsub.ll
+++ b/llvm/test/CodeGen/X86/haddsub.ll
@@ -1777,8 +1777,8 @@ define float @hadd32_4(<4 x float> %x225) {
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE3-SLOW-NEXT: addps %xmm0, %xmm1
-; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE3-SLOW-NEXT: addps %xmm1, %xmm0
+; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
@@ -1786,9 +1786,8 @@ define float @hadd32_4(<4 x float> %x225) {
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: movaps %xmm0, %xmm1
; SSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE3-FAST-NEXT: addps %xmm0, %xmm1
-; SSE3-FAST-NEXT: haddps %xmm1, %xmm1
-; SSE3-FAST-NEXT: movaps %xmm1, %xmm0
+; SSE3-FAST-NEXT: addps %xmm1, %xmm0
+; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: hadd32_4:
@@ -1818,8 +1817,8 @@ define float @hadd32_8(<8 x float> %x225) {
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE3-SLOW-NEXT: addps %xmm0, %xmm1
-; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE3-SLOW-NEXT: addps %xmm1, %xmm0
+; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
@@ -1827,9 +1826,8 @@ define float @hadd32_8(<8 x float> %x225) {
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: movaps %xmm0, %xmm1
; SSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE3-FAST-NEXT: addps %xmm0, %xmm1
-; SSE3-FAST-NEXT: haddps %xmm1, %xmm1
-; SSE3-FAST-NEXT: movaps %xmm1, %xmm0
+; SSE3-FAST-NEXT: addps %xmm1, %xmm0
+; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: hadd32_8:
@@ -1861,8 +1859,8 @@ define float @hadd32_16(<16 x float> %x225) {
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE3-SLOW-NEXT: addps %xmm0, %xmm1
-; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE3-SLOW-NEXT: addps %xmm1, %xmm0
+; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
@@ -1870,9 +1868,8 @@ define float @hadd32_16(<16 x float> %x225) {
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: movaps %xmm0, %xmm1
; SSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE3-FAST-NEXT: addps %xmm0, %xmm1
-; SSE3-FAST-NEXT: haddps %xmm1, %xmm1
-; SSE3-FAST-NEXT: movaps %xmm1, %xmm0
+; SSE3-FAST-NEXT: addps %xmm1, %xmm0
+; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: hadd32_16:
@@ -1904,9 +1901,8 @@ define float @hadd32_4_optsize(<4 x float> %x225) optsize {
; SSE3: # %bb.0:
; SSE3-NEXT: movaps %xmm0, %xmm1
; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE3-NEXT: addps %xmm0, %xmm1
-; SSE3-NEXT: haddps %xmm1, %xmm1
-; SSE3-NEXT: movaps %xmm1, %xmm0
+; SSE3-NEXT: addps %xmm1, %xmm0
+; SSE3-NEXT: haddps %xmm0, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: hadd32_4_optsize:
@@ -1928,9 +1924,8 @@ define float @hadd32_8_optsize(<8 x float> %x225) optsize {
; SSE3: # %bb.0:
; SSE3-NEXT: movaps %xmm0, %xmm1
; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE3-NEXT: addps %xmm0, %xmm1
-; SSE3-NEXT: haddps %xmm1, %xmm1
-; SSE3-NEXT: movaps %xmm1, %xmm0
+; SSE3-NEXT: addps %xmm1, %xmm0
+; SSE3-NEXT: haddps %xmm0, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: hadd32_8_optsize:
@@ -1953,9 +1948,8 @@ define float @hadd32_16_optsize(<16 x float> %x225) optsize {
; SSE3: # %bb.0:
; SSE3-NEXT: movaps %xmm0, %xmm1
; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE3-NEXT: addps %xmm0, %xmm1
-; SSE3-NEXT: haddps %xmm1, %xmm1
-; SSE3-NEXT: movaps %xmm1, %xmm0
+; SSE3-NEXT: addps %xmm1, %xmm0
+; SSE3-NEXT: haddps %xmm0, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: hadd32_16_optsize:
@@ -1978,9 +1972,8 @@ define float @hadd32_4_pgso(<4 x float> %x225) !prof !14 {
; SSE3: # %bb.0:
; SSE3-NEXT: movaps %xmm0, %xmm1
; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE3-NEXT: addps %xmm0, %xmm1
-; SSE3-NEXT: haddps %xmm1, %xmm1
-; SSE3-NEXT: movaps %xmm1, %xmm0
+; SSE3-NEXT: addps %xmm1, %xmm0
+; SSE3-NEXT: haddps %xmm0, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: hadd32_4_pgso:
@@ -2002,9 +1995,8 @@ define float @hadd32_8_pgso(<8 x float> %x225) !prof !14 {
; SSE3: # %bb.0:
; SSE3-NEXT: movaps %xmm0, %xmm1
; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE3-NEXT: addps %xmm0, %xmm1
-; SSE3-NEXT: haddps %xmm1, %xmm1
-; SSE3-NEXT: movaps %xmm1, %xmm0
+; SSE3-NEXT: addps %xmm1, %xmm0
+; SSE3-NEXT: haddps %xmm0, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: hadd32_8_pgso:
@@ -2027,9 +2019,8 @@ define float @hadd32_16_pgso(<16 x float> %x225) !prof !14 {
; SSE3: # %bb.0:
; SSE3-NEXT: movaps %xmm0, %xmm1
; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE3-NEXT: addps %xmm0, %xmm1
-; SSE3-NEXT: haddps %xmm1, %xmm1
-; SSE3-NEXT: movaps %xmm1, %xmm0
+; SSE3-NEXT: addps %xmm1, %xmm0
+; SSE3-NEXT: haddps %xmm0, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: hadd32_16_pgso:
@@ -2052,8 +2043,8 @@ define float @partial_reduction_fadd_v8f32(<8 x float> %x) {
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE3-SLOW-NEXT: addps %xmm0, %xmm1
-; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE3-SLOW-NEXT: addps %xmm1, %xmm0
+; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
@@ -2061,9 +2052,8 @@ define float @partial_reduction_fadd_v8f32(<8 x float> %x) {
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: movaps %xmm0, %xmm1
; SSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE3-FAST-NEXT: addps %xmm0, %xmm1
-; SSE3-FAST-NEXT: haddps %xmm1, %xmm1
-; SSE3-FAST-NEXT: movaps %xmm1, %xmm0
+; SSE3-FAST-NEXT: addps %xmm1, %xmm0
+; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: partial_reduction_fadd_v8f32:
@@ -2097,8 +2087,8 @@ define float @partial_reduction_fadd_v8f32_wrong_flags(<8 x float> %x) {
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE3-SLOW-NEXT: addps %xmm0, %xmm1
-; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE3-SLOW-NEXT: addps %xmm1, %xmm0
+; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
@@ -2106,9 +2096,8 @@ define float @partial_reduction_fadd_v8f32_wrong_flags(<8 x float> %x) {
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: movaps %xmm0, %xmm1
; SSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE3-FAST-NEXT: addps %xmm0, %xmm1
-; SSE3-FAST-NEXT: haddps %xmm1, %xmm1
-; SSE3-FAST-NEXT: movaps %xmm1, %xmm0
+; SSE3-FAST-NEXT: addps %xmm1, %xmm0
+; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: partial_reduction_fadd_v8f32_wrong_flags:
@@ -2140,8 +2129,8 @@ define float @partial_reduction_fadd_v16f32(<16 x float> %x) {
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE3-SLOW-NEXT: addps %xmm0, %xmm1
-; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE3-SLOW-NEXT: addps %xmm1, %xmm0
+; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
@@ -2149,9 +2138,8 @@ define float @partial_reduction_fadd_v16f32(<16 x float> %x) {
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: movaps %xmm0, %xmm1
; SSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE3-FAST-NEXT: addps %xmm0, %xmm1
-; SSE3-FAST-NEXT: haddps %xmm1, %xmm1
-; SSE3-FAST-NEXT: movaps %xmm1, %xmm0
+; SSE3-FAST-NEXT: addps %xmm1, %xmm0
+; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: partial_reduction_fadd_v16f32:
diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-fadd.ll b/llvm/test/CodeGen/X86/horizontal-reduce-fadd.ll
index 5a5fa774c8477..3e97c22e48e3d 100644
--- a/llvm/test/CodeGen/X86/horizontal-reduce-fadd.ll
+++ b/llvm/test/CodeGen/X86/horizontal-reduce-fadd.ll
@@ -13,9 +13,9 @@ define float @PR37890_v4f32(<4 x float> %a) {
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT: addps %xmm0, %xmm1
-; SSE2-NEXT: movaps %xmm1, %xmm0
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
+; SSE2-NEXT: addps %xmm1, %xmm0
+; SSE2-NEXT: movaps %xmm0, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: retq
;
@@ -23,8 +23,8 @@ define float @PR37890_v4f32(<4 x float> %a) {
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSSE3-SLOW-NEXT: addps %xmm0, %xmm1
-; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSSE3-SLOW-NEXT: addps %xmm1, %xmm0
+; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSSE3-SLOW-NEXT: retq
;
@@ -127,9 +127,9 @@ define float @PR37890_v8f32(<8 x float> %a) {
; SSE2-NEXT: addps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT: addps %xmm0, %xmm1
-; SSE2-NEXT: movaps %xmm1, %xmm0
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
+; SSE2-NEXT: addps %xmm1, %xmm0
+; SSE2-NEXT: movaps %xmm0, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: retq
;
@@ -138,8 +138,8 @@ define float @PR37890_v8f32(<8 x float> %a) {
; SSSE3-SLOW-NEXT: addps %xmm1, %xmm0
; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSSE3-SLOW-NEXT: addps %xmm0, %xmm1
-; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSSE3-SLOW-NEXT: addps %xmm1, %xmm0
+; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSSE3-SLOW-NEXT: retq
;
@@ -197,9 +197,9 @@ define double @PR37890_v8f64(<8 x double> %a) {
; SSE2: # %bb.0:
; SSE2-NEXT: addpd %xmm3, %xmm1
; SSE2-NEXT: addpd %xmm2, %xmm1
-; SSE2-NEXT: addpd %xmm0, %xmm1
-; SSE2-NEXT: movapd %xmm1, %xmm0
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE2-NEXT: addpd %xmm1, %xmm0
+; SSE2-NEXT: movapd %xmm0, %xmm1
+; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT: addsd %xmm1, %xmm0
; SSE2-NEXT: retq
;
@@ -207,9 +207,9 @@ define double @PR37890_v8f64(<8 x double> %a) {
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: addpd %xmm3, %xmm1
; SSSE3-SLOW-NEXT: addpd %xmm2, %xmm1
-; SSSE3-SLOW-NEXT: addpd %xmm0, %xmm1
-; SSSE3-SLOW-NEXT: movapd %xmm1, %xmm0
-; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSSE3-SLOW-NEXT: addpd %xmm1, %xmm0
+; SSSE3-SLOW-NEXT: movapd %xmm0, %xmm1
+; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSSE3-SLOW-NEXT: addsd %xmm1, %xmm0
; SSSE3-SLOW-NEXT: retq
;
@@ -217,9 +217,8 @@ define double @PR37890_v8f64(<8 x double> %a) {
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: addpd %xmm3, %xmm1
; SSSE3-FAST-NEXT: addpd %xmm2, %xmm1
-; SSSE3-FAST-NEXT: addpd %xmm0, %xmm1
-; SSSE3-FAST-NEXT: haddpd %xmm1, %xmm1
-; SSSE3-FAST-NEXT: movapd %xmm1, %xmm0
+; SSSE3-FAST-NEXT: addpd %xmm1, %xmm0
+; SSSE3-FAST-NEXT: haddpd %xmm0, %xmm0
; SSSE3-FAST-NEXT: retq
;
; AVX1-SLOW-LABEL: PR37890_v8f64:
@@ -267,34 +266,34 @@ define float @PR37890_v16f32(<16 x float> %a) {
; SSE2: # %bb.0:
; SSE2-NEXT: addps %xmm3, %xmm1
; SSE2-NEXT: addps %xmm2, %xmm1
-; SSE2-NEXT: addps %xmm0, %xmm1
-; SSE2-NEXT: movaps %xmm1, %xmm2
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
-; SSE2-NEXT: addps %xmm1, %xmm2
-; SSE2-NEXT: movaps %xmm2, %xmm0
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[1,1]
-; SSE2-NEXT: addss %xmm2, %xmm0
+; SSE2-NEXT: addps %xmm1, %xmm0
+; SSE2-NEXT: movaps %xmm0, %xmm1
+; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE2-NEXT: addps %xmm1, %xmm0
+; SSE2-NEXT: movaps %xmm0, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
+; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-SLOW-LABEL: PR37890_v16f32:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: addps %xmm3, %xmm1
; SSSE3-SLOW-NEXT: addps %xmm2, %xmm1
-; SSSE3-SLOW-NEXT: addps %xmm0, %xmm1
-; SSSE3-SLOW-NEXT: movaps %xmm1, %xmm2
-; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
-; SSSE3-SLOW-NEXT: addps %xmm1, %xmm2
-; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSSE3-SLOW-NEXT: addss %xmm2, %xmm0
+; SSSE3-SLOW-NEXT: addps %xmm1, %xmm0
+; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm1
+; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSSE3-SLOW-NEXT: addps %xmm1, %xmm0
+; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: PR37890_v16f32:
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: addps %xmm3, %xmm1
; SSSE3-FAST-NEXT: addps %xmm2, %xmm1
-; SSSE3-FAST-NEXT: addps %xmm0, %xmm1
-; SSSE3-FAST-NEXT: movaps %xmm1, %xmm0
-; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSSE3-FAST-NEXT: addps %xmm1, %xmm0
+; SSSE3-FAST-NEXT: movaps %xmm0, %xmm1
+; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSSE3-FAST-NEXT: addps %xmm1, %xmm0
; SSSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSSE3-FAST-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/horizontal-sum.ll b/llvm/test/CodeGen/X86/horizontal-sum.ll
index 98414c34864cd..3565165dc863a 100644
--- a/llvm/test/CodeGen/X86/horizontal-sum.ll
+++ b/llvm/test/CodeGen/X86/horizontal-sum.ll
@@ -535,53 +535,53 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
define <4 x float> @sequential_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3) {
; SSSE3-SLOW-LABEL: sequential_sum_v4f32_v4f32:
; SSSE3-SLOW: # %bb.0:
-; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm4
-; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm4
; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm5
-; SSSE3-SLOW-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3]
+; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm5
+; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm4
+; SSSE3-SLOW-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3]
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3]
; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3]
; SSSE3-SLOW-NEXT: addps %xmm2, %xmm0
-; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,1]
-; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[2,3]
-; SSSE3-SLOW-NEXT: addps %xmm4, %xmm5
+; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[0,1]
+; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3]
+; SSSE3-SLOW-NEXT: addps %xmm5, %xmm4
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,3]
-; SSSE3-SLOW-NEXT: addps %xmm5, %xmm1
+; SSSE3-SLOW-NEXT: addps %xmm1, %xmm4
; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm3[1,1,3,3]
; SSSE3-SLOW-NEXT: addps %xmm3, %xmm0
-; SSSE3-SLOW-NEXT: movaps %xmm3, %xmm2
-; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
-; SSSE3-SLOW-NEXT: addps %xmm0, %xmm2
+; SSSE3-SLOW-NEXT: movaps %xmm3, %xmm1
+; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
+; SSSE3-SLOW-NEXT: addps %xmm0, %xmm1
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
-; SSSE3-SLOW-NEXT: addps %xmm2, %xmm3
-; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3]
-; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,0]
-; SSSE3-SLOW-NEXT: movaps %xmm1, %xmm0
+; SSSE3-SLOW-NEXT: addps %xmm1, %xmm3
+; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
+; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0]
+; SSSE3-SLOW-NEXT: movaps %xmm4, %xmm0
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: sequential_sum_v4f32_v4f32:
; SSSE3-FAST: # %bb.0:
-; SSSE3-FAST-NEXT: movaps %xmm0, %xmm4
-; SSSE3-FAST-NEXT: haddps %xmm1, %xmm4
; SSSE3-FAST-NEXT: movaps %xmm0, %xmm5
-; SSSE3-FAST-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3]
+; SSSE3-FAST-NEXT: haddps %xmm1, %xmm5
+; SSSE3-FAST-NEXT: movaps %xmm0, %xmm4
+; SSSE3-FAST-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3]
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3]
-; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[2,3]
+; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3]
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,3]
; SSSE3-FAST-NEXT: haddps %xmm2, %xmm2
-; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm2[0,1]
-; SSSE3-FAST-NEXT: addps %xmm4, %xmm5
-; SSSE3-FAST-NEXT: addps %xmm5, %xmm1
+; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm2[0,1]
+; SSSE3-FAST-NEXT: addps %xmm5, %xmm4
+; SSSE3-FAST-NEXT: addps %xmm1, %xmm4
; SSSE3-FAST-NEXT: movaps %xmm3, %xmm0
; SSSE3-FAST-NEXT: haddps %xmm3, %xmm0
-; SSSE3-FAST-NEXT: movaps %xmm3, %xmm2
-; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
-; SSSE3-FAST-NEXT: addps %xmm0, %xmm2
+; SSSE3-FAST-NEXT: movaps %xmm3, %xmm1
+; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
+; SSSE3-FAST-NEXT: addps %xmm0, %xmm1
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
-; SSSE3-FAST-NEXT: addps %xmm2, %xmm3
-; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3]
-; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,0]
-; SSSE3-FAST-NEXT: movaps %xmm1, %xmm0
+; SSSE3-FAST-NEXT: addps %xmm1, %xmm3
+; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
+; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0]
+; SSSE3-FAST-NEXT: movaps %xmm4, %xmm0
; SSSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: sequential_sum_v4f32_v4f32:
@@ -961,13 +961,13 @@ define <4 x float> @reduction_sum_v4f32_v4f32_reassoc(<4 x float> %0, <4 x float
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm4
; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
-; SSSE3-SLOW-NEXT: addps %xmm0, %xmm4
-; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm4[1,1,3,3]
+; SSSE3-SLOW-NEXT: addps %xmm4, %xmm0
+; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
; SSSE3-SLOW-NEXT: movaps %xmm1, %xmm5
; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
; SSSE3-SLOW-NEXT: addps %xmm1, %xmm5
; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm5[1,1,3,3]
-; SSSE3-SLOW-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-SLOW-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm1
; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSSE3-SLOW-NEXT: addps %xmm2, %xmm1
@@ -976,10 +976,10 @@ define <4 x float> @reduction_sum_v4f32_v4f32_reassoc(<4 x float> %0, <4 x float
; SSSE3-SLOW-NEXT: addps %xmm3, %xmm2
; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm3
; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0]
-; SSSE3-SLOW-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0]
+; SSSE3-SLOW-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
+; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0]
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1]
-; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
+; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0]
; SSSE3-SLOW-NEXT: addps %xmm4, %xmm0
; SSSE3-SLOW-NEXT: retq
;
@@ -1050,12 +1050,12 @@ define <4 x i32> @reduction_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32
; SSSE3-SLOW-LABEL: reduction_sum_v4i32_v4i32:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
-; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm4
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1]
+; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm0
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm5
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1]
-; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm1
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
@@ -1063,12 +1063,11 @@ define <4 x i32> @reduction_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32
; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm6
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,1,1]
; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]
; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1]
-; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm1[0]
-; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm4
-; SSSE3-SLOW-NEXT: movdqa %xmm4, %xmm0
+; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
+; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm0
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: reduction_sum_v4i32_v4i32:
diff --git a/llvm/test/CodeGen/X86/lzcnt-cmp.ll b/llvm/test/CodeGen/X86/lzcnt-cmp.ll
index c094920d59eba..01d6c0ed483b7 100644
--- a/llvm/test/CodeGen/X86/lzcnt-cmp.ll
+++ b/llvm/test/CodeGen/X86/lzcnt-cmp.ll
@@ -194,11 +194,11 @@ define <2 x i64> @lshr_ctlz_cmpeq_zero_v2i64(<2 x i64> %in) nounwind {
; X64-LABEL: lshr_ctlz_cmpeq_zero_v2i64:
; X64: # %bb.0:
; X64-NEXT: pxor %xmm1, %xmm1
-; X64-NEXT: pcmpeqd %xmm0, %xmm1
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,3,2]
-; X64-NEXT: pand %xmm1, %xmm2
-; X64-NEXT: pcmpeqd %xmm0, %xmm0
-; X64-NEXT: pxor %xmm2, %xmm0
+; X64-NEXT: pcmpeqd %xmm1, %xmm0
+; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
+; X64-NEXT: pand %xmm1, %xmm0
+; X64-NEXT: pcmpeqd %xmm1, %xmm1
+; X64-NEXT: pxor %xmm1, %xmm0
; X64-NEXT: retq
%ctlz = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %in, i1 0)
%lshr = lshr <2 x i64> %ctlz, <i64 6, i64 6>
@@ -232,8 +232,8 @@ define <2 x i64> @lshr_ctlz_cmpne_zero_v2i64(<2 x i64> %in) nounwind {
; X64-LABEL: lshr_ctlz_cmpne_zero_v2i64:
; X64: # %bb.0:
; X64-NEXT: pxor %xmm1, %xmm1
-; X64-NEXT: pcmpeqd %xmm0, %xmm1
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,3,2]
+; X64-NEXT: pcmpeqd %xmm1, %xmm0
+; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X64-NEXT: pand %xmm1, %xmm0
; X64-NEXT: retq
%ctlz = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %in, i1 0)
diff --git a/llvm/test/CodeGen/X86/nontemporal-loads.ll b/llvm/test/CodeGen/X86/nontemporal-loads.ll
index 16e684b633349..88d8aa7f4a9b4 100644
--- a/llvm/test/CodeGen/X86/nontemporal-loads.ll
+++ b/llvm/test/CodeGen/X86/nontemporal-loads.ll
@@ -1764,10 +1764,10 @@ define <16 x i32> @test_masked_v16i32(i8 * %addr, <16 x i32> %old, <16 x i32> %m
; SSE2-NEXT: pcmpeqd %xmm8, %xmm7
; SSE2-NEXT: pcmpeqd %xmm8, %xmm6
; SSE2-NEXT: pcmpeqd %xmm8, %xmm5
-; SSE2-NEXT: pcmpeqd %xmm8, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: pandn (%rdi), %xmm4
-; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm8
+; SSE2-NEXT: pand %xmm8, %xmm0
+; SSE2-NEXT: pandn (%rdi), %xmm8
+; SSE2-NEXT: por %xmm8, %xmm0
; SSE2-NEXT: pand %xmm5, %xmm1
; SSE2-NEXT: pandn 16(%rdi), %xmm5
; SSE2-NEXT: por %xmm5, %xmm1
diff --git a/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll b/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll
index 2ba5cb9d65df7..85ed99831e5e1 100644
--- a/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll
+++ b/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll
@@ -159,10 +159,10 @@ define <4 x i1> @p5_vector_urem_by_const__nonsplat(<4 x i32> %x, <4 x i32> %y) {
; SSE4-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; SSE4-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; SSE4-NEXT: por %xmm2, %xmm1
-; SSE4-NEXT: movdqa {{.*#+}} xmm0 = [1431655765,858993459,715827882,477218588]
-; SSE4-NEXT: pminud %xmm1, %xmm0
+; SSE4-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; SSE4-NEXT: por %xmm2, %xmm0
+; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [1431655765,858993459,715827882,477218588]
+; SSE4-NEXT: pminud %xmm0, %xmm1
; SSE4-NEXT: pcmpeqd %xmm1, %xmm0
; SSE4-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/pmulh.ll b/llvm/test/CodeGen/X86/pmulh.ll
index c34f724855f98..8ac9357177d89 100644
--- a/llvm/test/CodeGen/X86/pmulh.ll
+++ b/llvm/test/CodeGen/X86/pmulh.ll
@@ -820,13 +820,11 @@ define <4 x i32> @mulhsw_v4i16_ashr(<4 x i16> %a, <4 x i16> %b) {
define <8 x i32> @zext_mulhuw_v8i16_lshr(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: zext_mulhuw_v8i16_lshr:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pmulhuw %xmm1, %xmm2
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: pmulhuw %xmm0, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE2-NEXT: retq
;
; SSE41-LABEL: zext_mulhuw_v8i16_lshr:
@@ -854,13 +852,11 @@ define <8 x i32> @zext_mulhuw_v8i16_lshr(<8 x i16> %a, <8 x i16> %b) {
define <8 x i32> @mulhsw_v8i16_lshr(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: mulhsw_v8i16_lshr:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pmulhw %xmm1, %xmm2
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: pmulhw %xmm0, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE2-NEXT: retq
;
; SSE41-LABEL: mulhsw_v8i16_lshr:
@@ -920,18 +916,17 @@ define <8 x i32> @mulhsw_v8i16_ashr(<8 x i16> %a, <8 x i16> %b) {
define <16 x i32> @zext_mulhuw_v16i16_lshr(<16 x i16> %a, <16 x i16> %b) {
; SSE2-LABEL: zext_mulhuw_v16i16_lshr:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pmulhuw %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: pmulhuw %xmm2, %xmm4
; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm4, %xmm0
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
-; SSE2-NEXT: pmulhuw %xmm3, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm2
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
-; SSE2-NEXT: movdqa %xmm4, %xmm3
+; SSE2-NEXT: pmulhuw %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
+; SSE2-NEXT: movdqa %xmm4, %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: zext_mulhuw_v16i16_lshr:
@@ -971,18 +966,17 @@ define <16 x i32> @zext_mulhuw_v16i16_lshr(<16 x i16> %a, <16 x i16> %b) {
define <16 x i32> @mulhsw_v16i16_lshr(<16 x i16> %a, <16 x i16> %b) {
; SSE2-LABEL: mulhsw_v16i16_lshr:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pmulhw %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: pmulhw %xmm2, %xmm4
; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm4, %xmm0
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
-; SSE2-NEXT: pmulhw %xmm3, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm2
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
-; SSE2-NEXT: movdqa %xmm4, %xmm3
+; SSE2-NEXT: pmulhw %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
+; SSE2-NEXT: movdqa %xmm4, %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: mulhsw_v16i16_lshr:
diff --git a/llvm/test/CodeGen/X86/popcnt.ll b/llvm/test/CodeGen/X86/popcnt.ll
index ad0fb4ccfd85b..010a615eef755 100644
--- a/llvm/test/CodeGen/X86/popcnt.ll
+++ b/llvm/test/CodeGen/X86/popcnt.ll
@@ -31,14 +31,14 @@ define i8 @cnt8(i8 %x) nounwind readnone {
; X64-NEXT: shrb %al
; X64-NEXT: andb $85, %al
; X64-NEXT: subb %al, %dil
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: andb $51, %al
+; X64-NEXT: movl %edi, %ecx
+; X64-NEXT: andb $51, %cl
; X64-NEXT: shrb $2, %dil
; X64-NEXT: andb $51, %dil
-; X64-NEXT: addb %al, %dil
-; X64-NEXT: movl %edi, %eax
+; X64-NEXT: addb %dil, %cl
+; X64-NEXT: movl %ecx, %eax
; X64-NEXT: shrb $4, %al
-; X64-NEXT: addb %dil, %al
+; X64-NEXT: addb %cl, %al
; X64-NEXT: andb $15, %al
; X64-NEXT: retq
;
@@ -225,15 +225,15 @@ define i64 @cnt64(i64 %x) nounwind readnone {
; X64-NEXT: movq %rdi, %rcx
; X64-NEXT: andq %rax, %rcx
; X64-NEXT: shrq $2, %rdi
-; X64-NEXT: andq %rax, %rdi
-; X64-NEXT: addq %rcx, %rdi
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: shrq $4, %rax
-; X64-NEXT: addq %rdi, %rax
-; X64-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
-; X64-NEXT: andq %rax, %rcx
+; X64-NEXT: andq %rdi, %rax
+; X64-NEXT: addq %rcx, %rax
+; X64-NEXT: movq %rax, %rcx
+; X64-NEXT: shrq $4, %rcx
+; X64-NEXT: addq %rax, %rcx
+; X64-NEXT: movabsq $1085102592571150095, %rdx # imm = 0xF0F0F0F0F0F0F0F
+; X64-NEXT: andq %rcx, %rdx
; X64-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101
-; X64-NEXT: imulq %rcx, %rax
+; X64-NEXT: imulq %rdx, %rax
; X64-NEXT: shrq $56, %rax
; X64-NEXT: retq
;
@@ -385,36 +385,36 @@ define i128 @cnt128(i128 %x) nounwind readnone {
; X64-NEXT: movabsq $6148914691236517205, %r8 # imm = 0x5555555555555555
; X64-NEXT: andq %r8, %rax
; X64-NEXT: subq %rax, %rsi
-; X64-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
-; X64-NEXT: movq %rsi, %rcx
-; X64-NEXT: andq %rax, %rcx
+; X64-NEXT: movabsq $3689348814741910323, %rcx # imm = 0x3333333333333333
+; X64-NEXT: movq %rsi, %rax
+; X64-NEXT: andq %rcx, %rax
; X64-NEXT: shrq $2, %rsi
-; X64-NEXT: andq %rax, %rsi
-; X64-NEXT: addq %rcx, %rsi
-; X64-NEXT: movq %rsi, %rcx
-; X64-NEXT: shrq $4, %rcx
-; X64-NEXT: addq %rsi, %rcx
+; X64-NEXT: andq %rcx, %rsi
+; X64-NEXT: addq %rsi, %rax
+; X64-NEXT: movq %rax, %rdx
+; X64-NEXT: shrq $4, %rdx
+; X64-NEXT: addq %rax, %rdx
; X64-NEXT: movabsq $1085102592571150095, %r9 # imm = 0xF0F0F0F0F0F0F0F
-; X64-NEXT: andq %r9, %rcx
-; X64-NEXT: movabsq $72340172838076673, %rdx # imm = 0x101010101010101
-; X64-NEXT: imulq %rdx, %rcx
-; X64-NEXT: shrq $56, %rcx
-; X64-NEXT: movq %rdi, %rsi
-; X64-NEXT: shrq %rsi
-; X64-NEXT: andq %r8, %rsi
-; X64-NEXT: subq %rsi, %rdi
-; X64-NEXT: movq %rdi, %rsi
-; X64-NEXT: andq %rax, %rsi
-; X64-NEXT: shrq $2, %rdi
-; X64-NEXT: andq %rax, %rdi
-; X64-NEXT: addq %rsi, %rdi
+; X64-NEXT: andq %r9, %rdx
+; X64-NEXT: movabsq $72340172838076673, %rsi # imm = 0x101010101010101
+; X64-NEXT: imulq %rsi, %rdx
+; X64-NEXT: shrq $56, %rdx
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: shrq %rax
+; X64-NEXT: andq %r8, %rax
+; X64-NEXT: subq %rax, %rdi
; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: andq %rcx, %rax
+; X64-NEXT: shrq $2, %rdi
+; X64-NEXT: andq %rdi, %rcx
+; X64-NEXT: addq %rax, %rcx
+; X64-NEXT: movq %rcx, %rax
; X64-NEXT: shrq $4, %rax
-; X64-NEXT: addq %rdi, %rax
+; X64-NEXT: addq %rcx, %rax
; X64-NEXT: andq %r9, %rax
-; X64-NEXT: imulq %rdx, %rax
+; X64-NEXT: imulq %rsi, %rax
; X64-NEXT: shrq $56, %rax
-; X64-NEXT: addq %rcx, %rax
+; X64-NEXT: addq %rdx, %rax
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: retq
;
@@ -579,15 +579,15 @@ define i64 @cnt64_noimplicitfloat(i64 %x) nounwind readnone noimplicitfloat {
; X64-NEXT: movq %rdi, %rcx
; X64-NEXT: andq %rax, %rcx
; X64-NEXT: shrq $2, %rdi
-; X64-NEXT: andq %rax, %rdi
-; X64-NEXT: addq %rcx, %rdi
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: shrq $4, %rax
-; X64-NEXT: addq %rdi, %rax
-; X64-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
-; X64-NEXT: andq %rax, %rcx
+; X64-NEXT: andq %rdi, %rax
+; X64-NEXT: addq %rcx, %rax
+; X64-NEXT: movq %rax, %rcx
+; X64-NEXT: shrq $4, %rcx
+; X64-NEXT: addq %rax, %rcx
+; X64-NEXT: movabsq $1085102592571150095, %rdx # imm = 0xF0F0F0F0F0F0F0F
+; X64-NEXT: andq %rcx, %rdx
; X64-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101
-; X64-NEXT: imulq %rcx, %rax
+; X64-NEXT: imulq %rdx, %rax
; X64-NEXT: shrq $56, %rax
; X64-NEXT: retq
;
@@ -721,15 +721,15 @@ define i64 @cnt64_optsize(i64 %x) nounwind readnone optsize {
; X64-NEXT: movq %rdi, %rcx
; X64-NEXT: andq %rax, %rcx
; X64-NEXT: shrq $2, %rdi
-; X64-NEXT: andq %rax, %rdi
-; X64-NEXT: addq %rcx, %rdi
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: shrq $4, %rax
-; X64-NEXT: addq %rdi, %rax
-; X64-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
-; X64-NEXT: andq %rax, %rcx
+; X64-NEXT: andq %rdi, %rax
+; X64-NEXT: addq %rcx, %rax
+; X64-NEXT: movq %rax, %rcx
+; X64-NEXT: shrq $4, %rcx
+; X64-NEXT: addq %rax, %rcx
+; X64-NEXT: movabsq $1085102592571150095, %rdx # imm = 0xF0F0F0F0F0F0F0F
+; X64-NEXT: andq %rcx, %rdx
; X64-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101
-; X64-NEXT: imulq %rcx, %rax
+; X64-NEXT: imulq %rdx, %rax
; X64-NEXT: shrq $56, %rax
; X64-NEXT: retq
;
@@ -890,36 +890,36 @@ define i128 @cnt128_optsize(i128 %x) nounwind readnone optsize {
; X64-NEXT: movabsq $6148914691236517205, %r8 # imm = 0x5555555555555555
; X64-NEXT: andq %r8, %rax
; X64-NEXT: subq %rax, %rsi
-; X64-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
-; X64-NEXT: movq %rsi, %rcx
-; X64-NEXT: andq %rax, %rcx
+; X64-NEXT: movabsq $3689348814741910323, %rcx # imm = 0x3333333333333333
+; X64-NEXT: movq %rsi, %rax
+; X64-NEXT: andq %rcx, %rax
; X64-NEXT: shrq $2, %rsi
-; X64-NEXT: andq %rax, %rsi
-; X64-NEXT: addq %rcx, %rsi
-; X64-NEXT: movq %rsi, %rcx
-; X64-NEXT: shrq $4, %rcx
-; X64-NEXT: addq %rsi, %rcx
+; X64-NEXT: andq %rcx, %rsi
+; X64-NEXT: addq %rsi, %rax
+; X64-NEXT: movq %rax, %rdx
+; X64-NEXT: shrq $4, %rdx
+; X64-NEXT: addq %rax, %rdx
; X64-NEXT: movabsq $1085102592571150095, %r9 # imm = 0xF0F0F0F0F0F0F0F
-; X64-NEXT: andq %r9, %rcx
-; X64-NEXT: movabsq $72340172838076673, %rdx # imm = 0x101010101010101
-; X64-NEXT: imulq %rdx, %rcx
-; X64-NEXT: shrq $56, %rcx
-; X64-NEXT: movq %rdi, %rsi
-; X64-NEXT: shrq %rsi
-; X64-NEXT: andq %r8, %rsi
-; X64-NEXT: subq %rsi, %rdi
-; X64-NEXT: movq %rdi, %rsi
-; X64-NEXT: andq %rax, %rsi
-; X64-NEXT: shrq $2, %rdi
-; X64-NEXT: andq %rax, %rdi
-; X64-NEXT: addq %rsi, %rdi
+; X64-NEXT: andq %r9, %rdx
+; X64-NEXT: movabsq $72340172838076673, %rsi # imm = 0x101010101010101
+; X64-NEXT: imulq %rsi, %rdx
+; X64-NEXT: shrq $56, %rdx
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: shrq %rax
+; X64-NEXT: andq %r8, %rax
+; X64-NEXT: subq %rax, %rdi
; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: andq %rcx, %rax
+; X64-NEXT: shrq $2, %rdi
+; X64-NEXT: andq %rdi, %rcx
+; X64-NEXT: addq %rax, %rcx
+; X64-NEXT: movq %rcx, %rax
; X64-NEXT: shrq $4, %rax
-; X64-NEXT: addq %rdi, %rax
+; X64-NEXT: addq %rcx, %rax
; X64-NEXT: andq %r9, %rax
-; X64-NEXT: imulq %rdx, %rax
+; X64-NEXT: imulq %rsi, %rax
; X64-NEXT: shrq $56, %rax
-; X64-NEXT: addq %rcx, %rax
+; X64-NEXT: addq %rdx, %rax
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: retq
;
@@ -1151,15 +1151,15 @@ define i64 @cnt64_pgso(i64 %x) nounwind readnone !prof !14 {
; X64-NEXT: movq %rdi, %rcx
; X64-NEXT: andq %rax, %rcx
; X64-NEXT: shrq $2, %rdi
-; X64-NEXT: andq %rax, %rdi
-; X64-NEXT: addq %rcx, %rdi
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: shrq $4, %rax
-; X64-NEXT: addq %rdi, %rax
-; X64-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
-; X64-NEXT: andq %rax, %rcx
+; X64-NEXT: andq %rdi, %rax
+; X64-NEXT: addq %rcx, %rax
+; X64-NEXT: movq %rax, %rcx
+; X64-NEXT: shrq $4, %rcx
+; X64-NEXT: addq %rax, %rcx
+; X64-NEXT: movabsq $1085102592571150095, %rdx # imm = 0xF0F0F0F0F0F0F0F
+; X64-NEXT: andq %rcx, %rdx
; X64-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101
-; X64-NEXT: imulq %rcx, %rax
+; X64-NEXT: imulq %rdx, %rax
; X64-NEXT: shrq $56, %rax
; X64-NEXT: retq
;
@@ -1320,36 +1320,36 @@ define i128 @cnt128_pgso(i128 %x) nounwind readnone !prof !14 {
; X64-NEXT: movabsq $6148914691236517205, %r8 # imm = 0x5555555555555555
; X64-NEXT: andq %r8, %rax
; X64-NEXT: subq %rax, %rsi
-; X64-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
-; X64-NEXT: movq %rsi, %rcx
-; X64-NEXT: andq %rax, %rcx
+; X64-NEXT: movabsq $3689348814741910323, %rcx # imm = 0x3333333333333333
+; X64-NEXT: movq %rsi, %rax
+; X64-NEXT: andq %rcx, %rax
; X64-NEXT: shrq $2, %rsi
-; X64-NEXT: andq %rax, %rsi
-; X64-NEXT: addq %rcx, %rsi
-; X64-NEXT: movq %rsi, %rcx
-; X64-NEXT: shrq $4, %rcx
-; X64-NEXT: addq %rsi, %rcx
+; X64-NEXT: andq %rcx, %rsi
+; X64-NEXT: addq %rsi, %rax
+; X64-NEXT: movq %rax, %rdx
+; X64-NEXT: shrq $4, %rdx
+; X64-NEXT: addq %rax, %rdx
; X64-NEXT: movabsq $1085102592571150095, %r9 # imm = 0xF0F0F0F0F0F0F0F
-; X64-NEXT: andq %r9, %rcx
-; X64-NEXT: movabsq $72340172838076673, %rdx # imm = 0x101010101010101
-; X64-NEXT: imulq %rdx, %rcx
-; X64-NEXT: shrq $56, %rcx
-; X64-NEXT: movq %rdi, %rsi
-; X64-NEXT: shrq %rsi
-; X64-NEXT: andq %r8, %rsi
-; X64-NEXT: subq %rsi, %rdi
-; X64-NEXT: movq %rdi, %rsi
-; X64-NEXT: andq %rax, %rsi
-; X64-NEXT: shrq $2, %rdi
-; X64-NEXT: andq %rax, %rdi
-; X64-NEXT: addq %rsi, %rdi
+; X64-NEXT: andq %r9, %rdx
+; X64-NEXT: movabsq $72340172838076673, %rsi # imm = 0x101010101010101
+; X64-NEXT: imulq %rsi, %rdx
+; X64-NEXT: shrq $56, %rdx
; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: shrq %rax
+; X64-NEXT: andq %r8, %rax
+; X64-NEXT: subq %rax, %rdi
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: andq %rcx, %rax
+; X64-NEXT: shrq $2, %rdi
+; X64-NEXT: andq %rdi, %rcx
+; X64-NEXT: addq %rax, %rcx
+; X64-NEXT: movq %rcx, %rax
; X64-NEXT: shrq $4, %rax
-; X64-NEXT: addq %rdi, %rax
+; X64-NEXT: addq %rcx, %rax
; X64-NEXT: andq %r9, %rax
-; X64-NEXT: imulq %rdx, %rax
+; X64-NEXT: imulq %rsi, %rax
; X64-NEXT: shrq $56, %rax
-; X64-NEXT: addq %rcx, %rax
+; X64-NEXT: addq %rdx, %rax
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/pull-binop-through-shift.ll b/llvm/test/CodeGen/X86/pull-binop-through-shift.ll
index 28f6c9f2e706f..180b609a0de31 100644
--- a/llvm/test/CodeGen/X86/pull-binop-through-shift.ll
+++ b/llvm/test/CodeGen/X86/pull-binop-through-shift.ll
@@ -140,9 +140,9 @@ define i32 @xor_nosignbit_shl(i32 %x, i32* %dst) {
define i32 @add_signbit_shl(i32 %x, i32* %dst) {
; X64-LABEL: add_signbit_shl:
; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: shll $8, %eax
-; X64-NEXT: addl $-16777216, %eax # imm = 0xFF000000
+; X64-NEXT: # kill: def $edi killed $edi def $rdi
+; X64-NEXT: shll $8, %edi
+; X64-NEXT: leal -16777216(%rdi), %eax
; X64-NEXT: movl %eax, (%rsi)
; X64-NEXT: retq
;
@@ -162,9 +162,9 @@ define i32 @add_signbit_shl(i32 %x, i32* %dst) {
define i32 @add_nosignbit_shl(i32 %x, i32* %dst) {
; X64-LABEL: add_nosignbit_shl:
; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: shll $8, %eax
-; X64-NEXT: addl $-16777216, %eax # imm = 0xFF000000
+; X64-NEXT: # kill: def $edi killed $edi def $rdi
+; X64-NEXT: shll $8, %edi
+; X64-NEXT: leal -16777216(%rdi), %eax
; X64-NEXT: movl %eax, (%rsi)
; X64-NEXT: retq
;
@@ -322,8 +322,8 @@ define i32 @xor_nosignbit_lshr(i32 %x, i32* %dst) {
define i32 @add_signbit_lshr(i32 %x, i32* %dst) {
; X64-LABEL: add_signbit_lshr:
; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: addl $-65536, %eax # imm = 0xFFFF0000
+; X64-NEXT: # kill: def $edi killed $edi def $rdi
+; X64-NEXT: leal -65536(%rdi), %eax
; X64-NEXT: shrl $8, %eax
; X64-NEXT: movl %eax, (%rsi)
; X64-NEXT: retq
@@ -344,8 +344,8 @@ define i32 @add_signbit_lshr(i32 %x, i32* %dst) {
define i32 @add_nosignbit_lshr(i32 %x, i32* %dst) {
; X64-LABEL: add_nosignbit_lshr:
; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: addl $2147418112, %eax # imm = 0x7FFF0000
+; X64-NEXT: # kill: def $edi killed $edi def $rdi
+; X64-NEXT: leal 2147418112(%rdi), %eax
; X64-NEXT: shrl $8, %eax
; X64-NEXT: movl %eax, (%rsi)
; X64-NEXT: retq
@@ -503,8 +503,8 @@ define i32 @xor_nosignbit_ashr(i32 %x, i32* %dst) {
define i32 @add_signbit_ashr(i32 %x, i32* %dst) {
; X64-LABEL: add_signbit_ashr:
; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: addl $-65536, %eax # imm = 0xFFFF0000
+; X64-NEXT: # kill: def $edi killed $edi def $rdi
+; X64-NEXT: leal -65536(%rdi), %eax
; X64-NEXT: sarl $8, %eax
; X64-NEXT: movl %eax, (%rsi)
; X64-NEXT: retq
@@ -525,8 +525,8 @@ define i32 @add_signbit_ashr(i32 %x, i32* %dst) {
define i32 @add_nosignbit_ashr(i32 %x, i32* %dst) {
; X64-LABEL: add_nosignbit_ashr:
; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: addl $2147418112, %eax # imm = 0x7FFF0000
+; X64-NEXT: # kill: def $edi killed $edi def $rdi
+; X64-NEXT: leal 2147418112(%rdi), %eax
; X64-NEXT: sarl $8, %eax
; X64-NEXT: movl %eax, (%rsi)
; X64-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/rem.ll b/llvm/test/CodeGen/X86/rem.ll
index c3f6d61f4ab6a..893b49f9a0179 100644
--- a/llvm/test/CodeGen/X86/rem.ll
+++ b/llvm/test/CodeGen/X86/rem.ll
@@ -8,16 +8,15 @@ define i32 @test1(i32 %X) {
; CHECK-NEXT: movl $-2139062143, %edx # imm = 0x80808081
; CHECK-NEXT: movl %ecx, %eax
; CHECK-NEXT: imull %edx
-; CHECK-NEXT: addl %ecx, %edx
-; CHECK-NEXT: movl %edx, %eax
-; CHECK-NEXT: shrl $31, %eax
-; CHECK-NEXT: sarl $7, %edx
-; CHECK-NEXT: addl %eax, %edx
-; CHECK-NEXT: movl %edx, %eax
-; CHECK-NEXT: shll $8, %eax
-; CHECK-NEXT: subl %eax, %edx
-; CHECK-NEXT: addl %edx, %ecx
-; CHECK-NEXT: movl %ecx, %eax
+; CHECK-NEXT: leal (%edx,%ecx), %eax
+; CHECK-NEXT: movl %eax, %edx
+; CHECK-NEXT: shrl $31, %edx
+; CHECK-NEXT: sarl $7, %eax
+; CHECK-NEXT: addl %edx, %eax
+; CHECK-NEXT: movl %eax, %edx
+; CHECK-NEXT: shll $8, %edx
+; CHECK-NEXT: subl %edx, %eax
+; CHECK-NEXT: addl %ecx, %eax
; CHECK-NEXT: retl
%tmp1 = srem i32 %X, 255
ret i32 %tmp1
diff --git a/llvm/test/CodeGen/X86/sat-add.ll b/llvm/test/CodeGen/X86/sat-add.ll
index 81474dd338b21..3feff29b1773e 100644
--- a/llvm/test/CodeGen/X86/sat-add.ll
+++ b/llvm/test/CodeGen/X86/sat-add.ll
@@ -1265,11 +1265,12 @@ define <2 x i64> @unsigned_sat_variable_v2i64_using_cmp_sum(<2 x i64> %x, <2 x i
; SSE42-LABEL: unsigned_sat_variable_v2i64_using_cmp_sum:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; SSE42-NEXT: paddq %xmm0, %xmm1
-; SSE42-NEXT: pxor %xmm2, %xmm0
-; SSE42-NEXT: pxor %xmm1, %xmm2
-; SSE42-NEXT: pcmpgtq %xmm2, %xmm0
-; SSE42-NEXT: por %xmm1, %xmm0
+; SSE42-NEXT: movdqa %xmm0, %xmm3
+; SSE42-NEXT: pxor %xmm2, %xmm3
+; SSE42-NEXT: paddq %xmm1, %xmm0
+; SSE42-NEXT: pxor %xmm0, %xmm2
+; SSE42-NEXT: pcmpgtq %xmm2, %xmm3
+; SSE42-NEXT: por %xmm3, %xmm0
; SSE42-NEXT: retq
;
; AVX2-LABEL: unsigned_sat_variable_v2i64_using_cmp_sum:
diff --git a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
index 192a124dfe832..44acdc7e676a5 100644
--- a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
@@ -29,15 +29,15 @@ define i16 @func(i16 %x, i16 %y) nounwind {
; X64-NEXT: testl %edx, %edx
; X64-NEXT: setne %dl
; X64-NEXT: testb %cl, %dl
-; X64-NEXT: cmovel %eax, %edi
-; X64-NEXT: cmpl $65535, %edi # imm = 0xFFFF
+; X64-NEXT: cmovnel %edi, %eax
+; X64-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X64-NEXT: movl $65535, %ecx # imm = 0xFFFF
-; X64-NEXT: cmovll %edi, %ecx
-; X64-NEXT: cmpl $-65535, %ecx # imm = 0xFFFF0001
-; X64-NEXT: movl $-65536, %eax # imm = 0xFFFF0000
; X64-NEXT: cmovgel %ecx, %eax
+; X64-NEXT: cmpl $-65535, %eax # imm = 0xFFFF0001
+; X64-NEXT: movl $-65536, %ecx # imm = 0xFFFF0000
+; X64-NEXT: cmovll %ecx, %eax
; X64-NEXT: shrl %eax
-; X64-NEXT: # kill: def $ax killed $ax killed $eax
+; X64-NEXT: # kill: def $ax killed $ax killed $rax
; X64-NEXT: retq
;
; X86-LABEL: func:
@@ -45,14 +45,14 @@ define i16 @func(i16 %x, i16 %y) nounwind {
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movswl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movswl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movswl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: shll $8, %ecx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: cltd
-; X86-NEXT: idivl %edi
-; X86-NEXT: leal -1(%eax), %esi
-; X86-NEXT: testl %edi, %edi
+; X86-NEXT: idivl %esi
+; X86-NEXT: leal -1(%eax), %edi
+; X86-NEXT: testl %esi, %esi
; X86-NEXT: sets %bl
; X86-NEXT: testl %ecx, %ecx
; X86-NEXT: sets %cl
@@ -60,13 +60,13 @@ define i16 @func(i16 %x, i16 %y) nounwind {
; X86-NEXT: testl %edx, %edx
; X86-NEXT: setne %dl
; X86-NEXT: testb %cl, %dl
-; X86-NEXT: cmovel %eax, %esi
-; X86-NEXT: cmpl $65535, %esi # imm = 0xFFFF
+; X86-NEXT: cmovnel %edi, %eax
+; X86-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X86-NEXT: movl $65535, %ecx # imm = 0xFFFF
-; X86-NEXT: cmovll %esi, %ecx
-; X86-NEXT: cmpl $-65535, %ecx # imm = 0xFFFF0001
-; X86-NEXT: movl $-65536, %eax # imm = 0xFFFF0000
; X86-NEXT: cmovgel %ecx, %eax
+; X86-NEXT: cmpl $-65535, %eax # imm = 0xFFFF0001
+; X86-NEXT: movl $-65536, %ecx # imm = 0xFFFF0000
+; X86-NEXT: cmovll %ecx, %eax
; X86-NEXT: shrl %eax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: popl %esi
@@ -99,14 +99,14 @@ define i16 @func2(i8 %x, i8 %y) nounwind {
; X64-NEXT: testl %edx, %edx
; X64-NEXT: setne %dl
; X64-NEXT: testb %cl, %dl
-; X64-NEXT: cmovel %eax, %edi
-; X64-NEXT: cmpl $16383, %edi # imm = 0x3FFF
+; X64-NEXT: cmovnel %edi, %eax
+; X64-NEXT: cmpl $16383, %eax # imm = 0x3FFF
; X64-NEXT: movl $16383, %ecx # imm = 0x3FFF
-; X64-NEXT: cmovll %edi, %ecx
-; X64-NEXT: cmpl $-16383, %ecx # imm = 0xC001
-; X64-NEXT: movl $-16384, %eax # imm = 0xC000
; X64-NEXT: cmovgel %ecx, %eax
-; X64-NEXT: # kill: def $ax killed $ax killed $eax
+; X64-NEXT: cmpl $-16383, %eax # imm = 0xC001
+; X64-NEXT: movl $-16384, %ecx # imm = 0xC000
+; X64-NEXT: cmovll %ecx, %eax
+; X64-NEXT: # kill: def $ax killed $ax killed $rax
; X64-NEXT: retq
;
; X86-LABEL: func2:
@@ -114,14 +114,14 @@ define i16 @func2(i8 %x, i8 %y) nounwind {
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movsbl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movsbl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movsbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: shll $14, %ecx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: cltd
-; X86-NEXT: idivl %edi
-; X86-NEXT: leal -1(%eax), %esi
-; X86-NEXT: testl %edi, %edi
+; X86-NEXT: idivl %esi
+; X86-NEXT: leal -1(%eax), %edi
+; X86-NEXT: testl %esi, %esi
; X86-NEXT: sets %bl
; X86-NEXT: testl %ecx, %ecx
; X86-NEXT: sets %cl
@@ -129,13 +129,13 @@ define i16 @func2(i8 %x, i8 %y) nounwind {
; X86-NEXT: testl %edx, %edx
; X86-NEXT: setne %dl
; X86-NEXT: testb %cl, %dl
-; X86-NEXT: cmovel %eax, %esi
-; X86-NEXT: cmpl $16383, %esi # imm = 0x3FFF
+; X86-NEXT: cmovnel %edi, %eax
+; X86-NEXT: cmpl $16383, %eax # imm = 0x3FFF
; X86-NEXT: movl $16383, %ecx # imm = 0x3FFF
-; X86-NEXT: cmovll %esi, %ecx
-; X86-NEXT: cmpl $-16383, %ecx # imm = 0xC001
-; X86-NEXT: movl $-16384, %eax # imm = 0xC000
; X86-NEXT: cmovgel %ecx, %eax
+; X86-NEXT: cmpl $-16383, %eax # imm = 0xC001
+; X86-NEXT: movl $-16384, %ecx # imm = 0xC000
+; X86-NEXT: cmovll %ecx, %eax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
@@ -169,16 +169,16 @@ define i16 @func3(i15 %x, i8 %y) nounwind {
; X64-NEXT: testw %dx, %dx
; X64-NEXT: setne %dl
; X64-NEXT: testb %cl, %dl
-; X64-NEXT: cmovel %eax, %esi
-; X64-NEXT: movswl %si, %eax
-; X64-NEXT: cmpl $16383, %eax # imm = 0x3FFF
+; X64-NEXT: cmovnel %esi, %eax
+; X64-NEXT: movswl %ax, %ecx
+; X64-NEXT: cmpl $16383, %ecx # imm = 0x3FFF
; X64-NEXT: movl $16383, %ecx # imm = 0x3FFF
-; X64-NEXT: cmovll %esi, %ecx
-; X64-NEXT: movswl %cx, %eax
-; X64-NEXT: cmpl $-16383, %eax # imm = 0xC001
-; X64-NEXT: movl $49152, %eax # imm = 0xC000
; X64-NEXT: cmovgel %ecx, %eax
-; X64-NEXT: # kill: def $ax killed $ax killed $eax
+; X64-NEXT: movswl %ax, %ecx
+; X64-NEXT: cmpl $-16383, %ecx # imm = 0xC001
+; X64-NEXT: movl $49152, %ecx # imm = 0xC000
+; X64-NEXT: cmovll %ecx, %eax
+; X64-NEXT: # kill: def $ax killed $ax killed $rax
; X64-NEXT: retq
;
; X86-LABEL: func3:
@@ -188,31 +188,31 @@ define i16 @func3(i15 %x, i8 %y) nounwind {
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: shll $8, %eax
-; X86-NEXT: movswl %ax, %edi
+; X86-NEXT: movswl %ax, %esi
; X86-NEXT: addl %ecx, %ecx
-; X86-NEXT: shrl $4, %edi
+; X86-NEXT: shrl $4, %esi
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: cwtd
-; X86-NEXT: idivw %di
+; X86-NEXT: idivw %si
; X86-NEXT: # kill: def $ax killed $ax def $eax
-; X86-NEXT: leal -1(%eax), %esi
+; X86-NEXT: leal -1(%eax), %edi
; X86-NEXT: testw %cx, %cx
; X86-NEXT: sets %cl
-; X86-NEXT: testw %di, %di
+; X86-NEXT: testw %si, %si
; X86-NEXT: sets %ch
; X86-NEXT: xorb %cl, %ch
; X86-NEXT: testw %dx, %dx
; X86-NEXT: setne %cl
; X86-NEXT: testb %ch, %cl
-; X86-NEXT: cmovel %eax, %esi
-; X86-NEXT: movswl %si, %eax
-; X86-NEXT: cmpl $16383, %eax # imm = 0x3FFF
+; X86-NEXT: cmovnel %edi, %eax
+; X86-NEXT: movswl %ax, %ecx
+; X86-NEXT: cmpl $16383, %ecx # imm = 0x3FFF
; X86-NEXT: movl $16383, %ecx # imm = 0x3FFF
-; X86-NEXT: cmovll %esi, %ecx
-; X86-NEXT: movswl %cx, %eax
-; X86-NEXT: cmpl $-16383, %eax # imm = 0xC001
-; X86-NEXT: movl $49152, %eax # imm = 0xC000
; X86-NEXT: cmovgel %ecx, %eax
+; X86-NEXT: movswl %ax, %ecx
+; X86-NEXT: cmpl $-16383, %ecx # imm = 0xC001
+; X86-NEXT: movl $49152, %ecx # imm = 0xC000
+; X86-NEXT: cmovll %ecx, %eax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
@@ -521,13 +521,14 @@ define i18 @func6(i16 %x, i16 %y) nounwind {
; X64-NEXT: testl %edx, %edx
; X64-NEXT: setne %dl
; X64-NEXT: testb %cl, %dl
-; X64-NEXT: cmovel %eax, %edi
-; X64-NEXT: cmpl $131071, %edi # imm = 0x1FFFF
+; X64-NEXT: cmovnel %edi, %eax
+; X64-NEXT: cmpl $131071, %eax # imm = 0x1FFFF
; X64-NEXT: movl $131071, %ecx # imm = 0x1FFFF
-; X64-NEXT: cmovll %edi, %ecx
-; X64-NEXT: cmpl $-131071, %ecx # imm = 0xFFFE0001
-; X64-NEXT: movl $-131072, %eax # imm = 0xFFFE0000
; X64-NEXT: cmovgel %ecx, %eax
+; X64-NEXT: cmpl $-131071, %eax # imm = 0xFFFE0001
+; X64-NEXT: movl $-131072, %ecx # imm = 0xFFFE0000
+; X64-NEXT: cmovll %ecx, %eax
+; X64-NEXT: # kill: def $eax killed $eax killed $rax
; X64-NEXT: retq
;
; X86-LABEL: func6:
@@ -535,14 +536,14 @@ define i18 @func6(i16 %x, i16 %y) nounwind {
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movswl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movswl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movswl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: shll $7, %ecx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: cltd
-; X86-NEXT: idivl %edi
-; X86-NEXT: leal -1(%eax), %esi
-; X86-NEXT: testl %edi, %edi
+; X86-NEXT: idivl %esi
+; X86-NEXT: leal -1(%eax), %edi
+; X86-NEXT: testl %esi, %esi
; X86-NEXT: sets %bl
; X86-NEXT: testl %ecx, %ecx
; X86-NEXT: sets %cl
@@ -550,13 +551,13 @@ define i18 @func6(i16 %x, i16 %y) nounwind {
; X86-NEXT: testl %edx, %edx
; X86-NEXT: setne %dl
; X86-NEXT: testb %cl, %dl
-; X86-NEXT: cmovel %eax, %esi
-; X86-NEXT: cmpl $131071, %esi # imm = 0x1FFFF
+; X86-NEXT: cmovnel %edi, %eax
+; X86-NEXT: cmpl $131071, %eax # imm = 0x1FFFF
; X86-NEXT: movl $131071, %ecx # imm = 0x1FFFF
-; X86-NEXT: cmovll %esi, %ecx
-; X86-NEXT: cmpl $-131071, %ecx # imm = 0xFFFE0001
-; X86-NEXT: movl $-131072, %eax # imm = 0xFFFE0000
; X86-NEXT: cmovgel %ecx, %eax
+; X86-NEXT: cmpl $-131071, %eax # imm = 0xFFFE0001
+; X86-NEXT: movl $-131072, %ecx # imm = 0xFFFE0000
+; X86-NEXT: cmovll %ecx, %eax
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
diff --git a/llvm/test/CodeGen/X86/setcc-combine.ll b/llvm/test/CodeGen/X86/setcc-combine.ll
index 600a123681f32..85b920b2aea98 100644
--- a/llvm/test/CodeGen/X86/setcc-combine.ll
+++ b/llvm/test/CodeGen/X86/setcc-combine.ll
@@ -326,11 +326,10 @@ define <4 x float> @sub_to_shift_to_add_vec(<4 x i32> %x, <4 x i32> %y, <4 x flo
; SSE2-LABEL: sub_to_shift_to_add_vec:
; SSE2: # %bb.0:
; SSE2-NEXT: paddd %xmm1, %xmm1
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm3, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE2-NEXT: pand %xmm0, %xmm2
+; SSE2-NEXT: pandn %xmm3, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: sub_to_shift_to_add_vec:
diff --git a/llvm/test/CodeGen/X86/shift-combine.ll b/llvm/test/CodeGen/X86/shift-combine.ll
index e1f6f3e82fda5..b0e45f610c358 100644
--- a/llvm/test/CodeGen/X86/shift-combine.ll
+++ b/llvm/test/CodeGen/X86/shift-combine.ll
@@ -317,9 +317,9 @@ define dso_local i32 @ashr_add_shl_i32_i8_extra_use1(i32 %r, i32* %p) nounwind {
;
; X64-LABEL: ashr_add_shl_i32_i8_extra_use1:
; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: shll $24, %eax
-; X64-NEXT: addl $33554432, %eax # imm = 0x2000000
+; X64-NEXT: # kill: def $edi killed $edi def $rdi
+; X64-NEXT: shll $24, %edi
+; X64-NEXT: leal 33554432(%rdi), %eax
; X64-NEXT: movl %eax, (%rsi)
; X64-NEXT: sarl $24, %eax
; X64-NEXT: retq
@@ -371,10 +371,10 @@ define dso_local i32 @ashr_add_shl_i32_i8_extra_use3(i32 %r, i32* %p1, i32* %p2)
;
; X64-LABEL: ashr_add_shl_i32_i8_extra_use3:
; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: shll $24, %eax
-; X64-NEXT: movl %eax, (%rsi)
-; X64-NEXT: addl $33554432, %eax # imm = 0x2000000
+; X64-NEXT: # kill: def $edi killed $edi def $rdi
+; X64-NEXT: shll $24, %edi
+; X64-NEXT: movl %edi, (%rsi)
+; X64-NEXT: leal 33554432(%rdi), %eax
; X64-NEXT: movl %eax, (%rdx)
; X64-NEXT: sarl $24, %eax
; X64-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/shl-crash-on-legalize.ll b/llvm/test/CodeGen/X86/shl-crash-on-legalize.ll
index 301c26ee2a529..0168fd90a95de 100644
--- a/llvm/test/CodeGen/X86/shl-crash-on-legalize.ll
+++ b/llvm/test/CodeGen/X86/shl-crash-on-legalize.ll
@@ -11,14 +11,13 @@ target triple = "x86_64-unknown-linux-gnu"
define i32 @PR29058(i8 %x, i32 %y) {
; CHECK-LABEL: PR29058:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movl %esi, %ecx
; CHECK-NEXT: testb %dil, %dil
; CHECK-NEXT: movl $2147483646, %eax # imm = 0x7FFFFFFE
; CHECK-NEXT: cmovnel %esi, %eax
-; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: xorl %ecx, %ecx
; CHECK-NEXT: cmpb $1, %dil
-; CHECK-NEXT: sbbl %edx, %edx
-; CHECK-NEXT: orb %dl, %cl
+; CHECK-NEXT: sbbl %ecx, %ecx
+; CHECK-NEXT: orb %sil, %cl
; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx
; CHECK-NEXT: shll %cl, %eax
; CHECK-NEXT: movq %rax, structMember(%rip)
diff --git a/llvm/test/CodeGen/X86/slow-pmulld.ll b/llvm/test/CodeGen/X86/slow-pmulld.ll
index 5937b82264f34..5009056034c0a 100644
--- a/llvm/test/CodeGen/X86/slow-pmulld.ll
+++ b/llvm/test/CodeGen/X86/slow-pmulld.ll
@@ -464,11 +464,10 @@ define <4 x i32> @test_mul_v4i32_v4i16(<4 x i16> %A) {
define <8 x i32> @test_mul_v8i32_v8i16(<8 x i16> %A) {
; SLM32-LABEL: test_mul_v8i32_v8i16:
; SLM32: # %bb.0:
-; SLM32-NEXT: movdqa %xmm0, %xmm1
-; SLM32-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
-; SLM32-NEXT: movdqa %xmm1, %xmm2
+; SLM32-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
+; SLM32-NEXT: movdqa %xmm0, %xmm2
+; SLM32-NEXT: pmulhuw %xmm1, %xmm2
; SLM32-NEXT: pmullw %xmm0, %xmm1
-; SLM32-NEXT: pmulhuw %xmm0, %xmm2
; SLM32-NEXT: movdqa %xmm1, %xmm0
; SLM32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SLM32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
@@ -476,11 +475,10 @@ define <8 x i32> @test_mul_v8i32_v8i16(<8 x i16> %A) {
;
; SLM64-LABEL: test_mul_v8i32_v8i16:
; SLM64: # %bb.0:
-; SLM64-NEXT: movdqa %xmm0, %xmm1
-; SLM64-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
-; SLM64-NEXT: movdqa %xmm1, %xmm2
+; SLM64-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
+; SLM64-NEXT: movdqa %xmm0, %xmm2
+; SLM64-NEXT: pmulhuw %xmm1, %xmm2
; SLM64-NEXT: pmullw %xmm0, %xmm1
-; SLM64-NEXT: pmulhuw %xmm0, %xmm2
; SLM64-NEXT: movdqa %xmm1, %xmm0
; SLM64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SLM64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
@@ -488,10 +486,9 @@ define <8 x i32> @test_mul_v8i32_v8i16(<8 x i16> %A) {
;
; SLOW32-LABEL: test_mul_v8i32_v8i16:
; SLOW32: # %bb.0:
-; SLOW32-NEXT: movdqa %xmm0, %xmm1
-; SLOW32-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
-; SLOW32-NEXT: movdqa %xmm1, %xmm2
-; SLOW32-NEXT: pmulhuw %xmm0, %xmm2
+; SLOW32-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
+; SLOW32-NEXT: movdqa %xmm0, %xmm2
+; SLOW32-NEXT: pmulhuw %xmm1, %xmm2
; SLOW32-NEXT: pmullw %xmm0, %xmm1
; SLOW32-NEXT: movdqa %xmm1, %xmm0
; SLOW32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
@@ -500,10 +497,9 @@ define <8 x i32> @test_mul_v8i32_v8i16(<8 x i16> %A) {
;
; SLOW64-LABEL: test_mul_v8i32_v8i16:
; SLOW64: # %bb.0:
-; SLOW64-NEXT: movdqa %xmm0, %xmm1
-; SLOW64-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
-; SLOW64-NEXT: movdqa %xmm1, %xmm2
-; SLOW64-NEXT: pmulhuw %xmm0, %xmm2
+; SLOW64-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
+; SLOW64-NEXT: movdqa %xmm0, %xmm2
+; SLOW64-NEXT: pmulhuw %xmm1, %xmm2
; SLOW64-NEXT: pmullw %xmm0, %xmm1
; SLOW64-NEXT: movdqa %xmm1, %xmm0
; SLOW64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
@@ -563,78 +559,78 @@ define <8 x i32> @test_mul_v8i32_v8i16(<8 x i16> %A) {
define <16 x i32> @test_mul_v16i32_v16i16(<16 x i16> %A) {
; SLM32-LABEL: test_mul_v16i32_v16i16:
; SLM32: # %bb.0:
-; SLM32-NEXT: movdqa %xmm1, %xmm3
-; SLM32-NEXT: movdqa %xmm0, %xmm1
-; SLM32-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
-; SLM32-NEXT: movdqa %xmm1, %xmm2
-; SLM32-NEXT: movdqa %xmm3, %xmm4
-; SLM32-NEXT: pmullw %xmm0, %xmm1
-; SLM32-NEXT: pmulhuw %xmm0, %xmm2
-; SLM32-NEXT: pmullw %xmm0, %xmm3
-; SLM32-NEXT: pmulhuw %xmm0, %xmm4
-; SLM32-NEXT: movdqa %xmm1, %xmm0
-; SLM32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SLM32-NEXT: movdqa {{.*#+}} xmm3 = [18778,18778,18778,18778,18778,18778,18778,18778]
+; SLM32-NEXT: movdqa %xmm0, %xmm4
+; SLM32-NEXT: movdqa %xmm0, %xmm2
+; SLM32-NEXT: movdqa %xmm1, %xmm5
+; SLM32-NEXT: pmullw %xmm3, %xmm4
+; SLM32-NEXT: pmulhuw %xmm3, %xmm2
+; SLM32-NEXT: pmulhuw %xmm3, %xmm5
+; SLM32-NEXT: pmullw %xmm1, %xmm3
+; SLM32-NEXT: movdqa %xmm4, %xmm0
+; SLM32-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
; SLM32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLM32-NEXT: movdqa %xmm3, %xmm2
-; SLM32-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; SLM32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; SLM32-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
+; SLM32-NEXT: movdqa %xmm4, %xmm1
+; SLM32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
; SLM32-NEXT: retl
;
; SLM64-LABEL: test_mul_v16i32_v16i16:
; SLM64: # %bb.0:
-; SLM64-NEXT: movdqa %xmm1, %xmm3
-; SLM64-NEXT: movdqa %xmm0, %xmm1
-; SLM64-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
-; SLM64-NEXT: movdqa %xmm1, %xmm2
-; SLM64-NEXT: movdqa %xmm3, %xmm4
-; SLM64-NEXT: pmullw %xmm0, %xmm1
-; SLM64-NEXT: pmulhuw %xmm0, %xmm2
-; SLM64-NEXT: pmullw %xmm0, %xmm3
-; SLM64-NEXT: pmulhuw %xmm0, %xmm4
-; SLM64-NEXT: movdqa %xmm1, %xmm0
-; SLM64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SLM64-NEXT: movdqa {{.*#+}} xmm3 = [18778,18778,18778,18778,18778,18778,18778,18778]
+; SLM64-NEXT: movdqa %xmm0, %xmm4
+; SLM64-NEXT: movdqa %xmm0, %xmm2
+; SLM64-NEXT: movdqa %xmm1, %xmm5
+; SLM64-NEXT: pmullw %xmm3, %xmm4
+; SLM64-NEXT: pmulhuw %xmm3, %xmm2
+; SLM64-NEXT: pmulhuw %xmm3, %xmm5
+; SLM64-NEXT: pmullw %xmm1, %xmm3
+; SLM64-NEXT: movdqa %xmm4, %xmm0
+; SLM64-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
; SLM64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLM64-NEXT: movdqa %xmm3, %xmm2
-; SLM64-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; SLM64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; SLM64-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
+; SLM64-NEXT: movdqa %xmm4, %xmm1
+; SLM64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
; SLM64-NEXT: retq
;
; SLOW32-LABEL: test_mul_v16i32_v16i16:
; SLOW32: # %bb.0:
-; SLOW32-NEXT: movdqa %xmm1, %xmm3
-; SLOW32-NEXT: movdqa %xmm0, %xmm1
-; SLOW32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLOW32-NEXT: movdqa %xmm0, %xmm4
-; SLOW32-NEXT: pmulhuw %xmm2, %xmm4
-; SLOW32-NEXT: pmullw %xmm2, %xmm1
-; SLOW32-NEXT: movdqa %xmm1, %xmm0
-; SLOW32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; SLOW32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; SLOW32-NEXT: movdqa %xmm3, %xmm4
-; SLOW32-NEXT: pmulhuw %xmm2, %xmm4
-; SLOW32-NEXT: pmullw %xmm2, %xmm3
+; SLOW32-NEXT: movdqa {{.*#+}} xmm3 = [18778,18778,18778,18778,18778,18778,18778,18778]
+; SLOW32-NEXT: movdqa %xmm0, %xmm2
+; SLOW32-NEXT: pmulhuw %xmm3, %xmm2
+; SLOW32-NEXT: pmullw %xmm3, %xmm4
+; SLOW32-NEXT: movdqa %xmm4, %xmm0
+; SLOW32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SLOW32-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; SLOW32-NEXT: movdqa %xmm1, %xmm5
+; SLOW32-NEXT: pmulhuw %xmm3, %xmm5
+; SLOW32-NEXT: pmullw %xmm1, %xmm3
; SLOW32-NEXT: movdqa %xmm3, %xmm2
-; SLOW32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; SLOW32-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SLOW32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
+; SLOW32-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
+; SLOW32-NEXT: movdqa %xmm4, %xmm1
; SLOW32-NEXT: retl
;
; SLOW64-LABEL: test_mul_v16i32_v16i16:
; SLOW64: # %bb.0:
-; SLOW64-NEXT: movdqa %xmm1, %xmm3
-; SLOW64-NEXT: movdqa %xmm0, %xmm1
-; SLOW64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLOW64-NEXT: movdqa %xmm0, %xmm4
-; SLOW64-NEXT: pmulhuw %xmm2, %xmm4
-; SLOW64-NEXT: pmullw %xmm2, %xmm1
-; SLOW64-NEXT: movdqa %xmm1, %xmm0
-; SLOW64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; SLOW64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; SLOW64-NEXT: movdqa %xmm3, %xmm4
-; SLOW64-NEXT: pmulhuw %xmm2, %xmm4
-; SLOW64-NEXT: pmullw %xmm2, %xmm3
+; SLOW64-NEXT: movdqa {{.*#+}} xmm3 = [18778,18778,18778,18778,18778,18778,18778,18778]
+; SLOW64-NEXT: movdqa %xmm0, %xmm2
+; SLOW64-NEXT: pmulhuw %xmm3, %xmm2
+; SLOW64-NEXT: pmullw %xmm3, %xmm4
+; SLOW64-NEXT: movdqa %xmm4, %xmm0
+; SLOW64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SLOW64-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; SLOW64-NEXT: movdqa %xmm1, %xmm5
+; SLOW64-NEXT: pmulhuw %xmm3, %xmm5
+; SLOW64-NEXT: pmullw %xmm1, %xmm3
; SLOW64-NEXT: movdqa %xmm3, %xmm2
-; SLOW64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; SLOW64-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SLOW64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
+; SLOW64-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
+; SLOW64-NEXT: movdqa %xmm4, %xmm1
; SLOW64-NEXT: retq
;
; SSE4-32-LABEL: test_mul_v16i32_v16i16:
diff --git a/llvm/test/CodeGen/X86/smul_fix.ll b/llvm/test/CodeGen/X86/smul_fix.ll
index 45d089d28f1d6..ccfc53c02e9a3 100644
--- a/llvm/test/CodeGen/X86/smul_fix.ll
+++ b/llvm/test/CodeGen/X86/smul_fix.ll
@@ -56,12 +56,11 @@ define i64 @func2(i64 %x, i64 %y) {
; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: movl %eax, %ebp
; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull %ecx
; X86-NEXT: movl %eax, %esi
-; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: addl %ebx, %ebp
+; X86-NEXT: addl %edx, %ebp
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: adcl $0, %edi
; X86-NEXT: movl %ebx, %eax
@@ -69,16 +68,17 @@ define i64 @func2(i64 %x, i64 %y) {
; X86-NEXT: addl %ebp, %eax
; X86-NEXT: adcl %edi, %edx
; X86-NEXT: movl %ebx, %edi
-; X86-NEXT: imull {{[0-9]+}}(%esp), %edi
-; X86-NEXT: addl %edx, %edi
-; X86-NEXT: movl %edi, %ebp
-; X86-NEXT: subl %ecx, %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: imull %ebp, %edi
+; X86-NEXT: addl %edi, %edx
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: subl %ecx, %edi
; X86-NEXT: testl %ebx, %ebx
-; X86-NEXT: cmovnsl %edi, %ebp
-; X86-NEXT: movl %ebp, %edx
-; X86-NEXT: subl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: cmovnsl %ebp, %edx
+; X86-NEXT: cmovsl %edi, %edx
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: testl %ebp, %ebp
+; X86-NEXT: cmovsl %ecx, %edx
; X86-NEXT: shldl $30, %eax, %edx
; X86-NEXT: shldl $30, %esi, %eax
; X86-NEXT: popl %esi
@@ -333,16 +333,17 @@ define i64 @func7(i64 %x, i64 %y) nounwind {
; X86-NEXT: addl %ebx, %eax
; X86-NEXT: adcl %edi, %edx
; X86-NEXT: movl %ebp, %edi
-; X86-NEXT: imull {{[0-9]+}}(%esp), %edi
-; X86-NEXT: addl %edx, %edi
-; X86-NEXT: movl %edi, %ebx
-; X86-NEXT: subl %esi, %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: imull %ebx, %edi
+; X86-NEXT: addl %edi, %edx
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: subl %esi, %edi
; X86-NEXT: testl %ebp, %ebp
-; X86-NEXT: cmovnsl %edi, %ebx
-; X86-NEXT: movl %ebx, %edx
-; X86-NEXT: subl %ecx, %edx
-; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: cmovnsl %ebx, %edx
+; X86-NEXT: cmovsl %edi, %edx
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: testl %ebx, %ebx
+; X86-NEXT: cmovsl %ecx, %edx
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
@@ -368,33 +369,32 @@ define i64 @func8(i64 %x, i64 %y) nounwind {
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl %eax, %ebp
; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull %esi
+; X86-NEXT: mull {{[0-9]+}}(%esp)
; X86-NEXT: addl %edx, %ebp
-; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: imull %edi
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: imull {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, %ebx
; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %esi
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: mull {{[0-9]+}}(%esp)
; X86-NEXT: addl %ebp, %eax
-; X86-NEXT: adcl %ebx, %edx
-; X86-NEXT: adcl $0, %edi
-; X86-NEXT: addl %ecx, %edx
-; X86-NEXT: adcl $0, %edi
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: subl %esi, %ecx
-; X86-NEXT: movl %edi, %esi
+; X86-NEXT: adcl %edx, %edi
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: addl %ecx, %edi
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: movl %edi, %ecx
+; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ebx, %esi
; X86-NEXT: sbbl $0, %esi
; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: cmovnsl %edi, %esi
-; X86-NEXT: cmovnsl %edx, %ecx
+; X86-NEXT: cmovnsl %ebx, %esi
+; X86-NEXT: cmovnsl %edi, %ecx
; X86-NEXT: movl %ecx, %edi
; X86-NEXT: subl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl %esi, %edx
diff --git a/llvm/test/CodeGen/X86/smul_fix_sat.ll b/llvm/test/CodeGen/X86/smul_fix_sat.ll
index d752e367a9b70..0463886fe2285 100644
--- a/llvm/test/CodeGen/X86/smul_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/smul_fix_sat.ll
@@ -61,26 +61,27 @@ define i64 @func2(i64 %x, i64 %y) nounwind {
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: pushl %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %ecx, %ebx
; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: addl %edi, %ecx
+; X86-NEXT: addl %edx, %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: adcl $0, %ebp
; X86-NEXT: movl %edi, %eax
-; X86-NEXT: imull %esi
+; X86-NEXT: imull %ebx
; X86-NEXT: movl %edx, %ebx
; X86-NEXT: movl %eax, %esi
; X86-NEXT: movl %edi, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: addl %ecx, %edi
; X86-NEXT: adcl %ebp, %edx
; X86-NEXT: adcl $0, %ebx
; X86-NEXT: addl %esi, %edx
@@ -89,40 +90,40 @@ define i64 @func2(i64 %x, i64 %y) nounwind {
; X86-NEXT: subl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl %ebx, %ebp
; X86-NEXT: sbbl $0, %ebp
-; X86-NEXT: testl %edi, %edi
+; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp)
; X86-NEXT: cmovnsl %ebx, %ebp
; X86-NEXT: cmovnsl %edx, %esi
; X86-NEXT: movl %esi, %ecx
; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ebp, %edi
-; X86-NEXT: sbbl $0, %edi
+; X86-NEXT: movl %ebp, %edx
+; X86-NEXT: sbbl $0, %edx
; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: cmovnsl %ebp, %edi
+; X86-NEXT: cmovnsl %ebp, %edx
; X86-NEXT: cmovnsl %esi, %ecx
-; X86-NEXT: testl %edi, %edi
+; X86-NEXT: testl %edx, %edx
; X86-NEXT: setg %bl
; X86-NEXT: sete %bh
; X86-NEXT: cmpl $2, %ecx
-; X86-NEXT: setae %dl
-; X86-NEXT: andb %bh, %dl
-; X86-NEXT: orb %bl, %dl
-; X86-NEXT: movl (%esp), %ebx
-; X86-NEXT: shrdl $2, %eax, %ebx
-; X86-NEXT: shrdl $2, %ecx, %eax
-; X86-NEXT: testb %dl, %dl
+; X86-NEXT: setae %al
+; X86-NEXT: andb %bh, %al
+; X86-NEXT: orb %bl, %al
+; X86-NEXT: movl (%esp), %ebx # 4-byte Reload
+; X86-NEXT: shrdl $2, %edi, %ebx
+; X86-NEXT: shrdl $2, %ecx, %edi
+; X86-NEXT: testb %al, %al
; X86-NEXT: movl $2147483647, %esi # imm = 0x7FFFFFFF
-; X86-NEXT: cmovel %eax, %esi
-; X86-NEXT: movl $-1, %edx
-; X86-NEXT: cmovel %ebx, %edx
-; X86-NEXT: cmpl $-1, %edi
-; X86-NEXT: setl %bl
+; X86-NEXT: cmovel %edi, %esi
+; X86-NEXT: movl $-1, %edi
+; X86-NEXT: cmovel %ebx, %edi
+; X86-NEXT: cmpl $-1, %edx
+; X86-NEXT: setl %dl
; X86-NEXT: sete %al
; X86-NEXT: cmpl $-2, %ecx
; X86-NEXT: setb %cl
; X86-NEXT: andb %al, %cl
; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: orb %bl, %cl
-; X86-NEXT: cmovel %edx, %eax
+; X86-NEXT: orb %dl, %cl
+; X86-NEXT: cmovel %edi, %eax
; X86-NEXT: movl $-2147483648, %edx # imm = 0x80000000
; X86-NEXT: cmovel %esi, %edx
; X86-NEXT: addl $4, %esp
@@ -368,65 +369,62 @@ define i64 @func5(i64 %x, i64 %y) {
; X86-NEXT: .cfi_def_cfa_offset 16
; X86-NEXT: pushl %esi
; X86-NEXT: .cfi_def_cfa_offset 20
-; X86-NEXT: subl $12, %esp
-; X86-NEXT: .cfi_def_cfa_offset 32
+; X86-NEXT: subl $8, %esp
+; X86-NEXT: .cfi_def_cfa_offset 28
; X86-NEXT: .cfi_offset %esi, -20
; X86-NEXT: .cfi_offset %edi, -16
; X86-NEXT: .cfi_offset %ebx, -12
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %edi
-; X86-NEXT: sarl $31, %edi
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: imull %edi, %esi
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT: addl %esi, %edx
-; X86-NEXT: movl %ebx, %esi
+; X86-NEXT: movl %ecx, %ebx
+; X86-NEXT: sarl $31, %ebx
+; X86-NEXT: movl %eax, %edi
; X86-NEXT: imull %ebx, %edi
+; X86-NEXT: mull %ebx
+; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: addl %edi, %edx
+; X86-NEXT: movl %ebp, %edi
+; X86-NEXT: imull %ebp, %ebx
+; X86-NEXT: addl %edx, %ebx
+; X86-NEXT: sarl $31, %edi
+; X86-NEXT: movl %edi, %ebp
+; X86-NEXT: imull %ecx, %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: mull %esi
+; X86-NEXT: addl %ebp, %edx
+; X86-NEXT: imull %esi, %edi
; X86-NEXT: addl %edx, %edi
-; X86-NEXT: sarl $31, %esi
-; X86-NEXT: movl %esi, %ebx
-; X86-NEXT: imull %ecx, %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull %ebp
-; X86-NEXT: addl %ebx, %edx
-; X86-NEXT: imull %ebp, %esi
-; X86-NEXT: addl %edx, %esi
; X86-NEXT: addl (%esp), %eax # 4-byte Folded Reload
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %edi, %esi
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: mull %edi
+; X86-NEXT: adcl %ebx, %edi
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %ebp
; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull %edi
+; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: addl %ebp, %edi
+; X86-NEXT: addl %eax, %ebp
; X86-NEXT: adcl $0, %ebx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: addl %edi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %ebx, %ebp
-; X86-NEXT: setb %bl
-; X86-NEXT: movl %ecx, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: mull %edx
-; X86-NEXT: addl %ebp, %eax
-; X86-NEXT: movzbl %bl, %edi
-; X86-NEXT: adcl %edi, %edx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: addl %eax, %ebp
+; X86-NEXT: adcl %ebx, %esi
+; X86-NEXT: setb %bl
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: addl %esi, %eax
+; X86-NEXT: movzbl %bl, %esi
; X86-NEXT: adcl %esi, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movl %ebx, %edi
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: adcl %edi, %edx
+; X86-NEXT: movl %ebp, %edi
; X86-NEXT: sarl $31, %edi
; X86-NEXT: xorl %edi, %edx
; X86-NEXT: xorl %eax, %edi
@@ -437,10 +435,10 @@ define i64 @func5(i64 %x, i64 %y) {
; X86-NEXT: orl %edx, %edi
; X86-NEXT: notl %ecx
; X86-NEXT: cmovel (%esp), %ecx # 4-byte Folded Reload
-; X86-NEXT: cmovel %ebx, %esi
+; X86-NEXT: cmovel %ebp, %esi
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: movl %esi, %edx
-; X86-NEXT: addl $12, %esp
+; X86-NEXT: addl $8, %esp
; X86-NEXT: .cfi_def_cfa_offset 20
; X86-NEXT: popl %esi
; X86-NEXT: .cfi_def_cfa_offset 16
@@ -651,20 +649,20 @@ define i64 @func7(i64 %x, i64 %y) nounwind {
; X86-NEXT: adcl $0, %edi
; X86-NEXT: addl %ecx, %edx
; X86-NEXT: adcl $0, %edi
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: subl %esi, %ebx
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: subl %esi, %ecx
; X86-NEXT: movl %edi, %esi
; X86-NEXT: sbbl $0, %esi
; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp)
; X86-NEXT: cmovnsl %edi, %esi
-; X86-NEXT: cmovnsl %edx, %ebx
-; X86-NEXT: movl %ebx, %edx
-; X86-NEXT: subl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: cmovsl %ecx, %edx
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: subl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl %esi, %ecx
; X86-NEXT: sbbl $0, %ecx
; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp)
; X86-NEXT: cmovnsl %esi, %ecx
-; X86-NEXT: cmovnsl %ebx, %edx
+; X86-NEXT: cmovsl %edi, %edx
; X86-NEXT: testl %edx, %edx
; X86-NEXT: setns {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X86-NEXT: sets %bh
@@ -674,18 +672,18 @@ define i64 @func7(i64 %x, i64 %y) nounwind {
; X86-NEXT: andb %bh, %bl
; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %bl # 1-byte Folded Reload
; X86-NEXT: movl $2147483647, %esi # imm = 0x7FFFFFFF
-; X86-NEXT: cmovel %edx, %esi
-; X86-NEXT: movl $-1, %edx
-; X86-NEXT: cmovnel %edx, %eax
+; X86-NEXT: cmovnel %esi, %edx
+; X86-NEXT: movl $-1, %esi
+; X86-NEXT: cmovnel %esi, %eax
; X86-NEXT: cmpl $-1, %ecx
; X86-NEXT: setl %cl
-; X86-NEXT: sete %dl
-; X86-NEXT: andb {{[-0-9]+}}(%e{{[sb]}}p), %dl # 1-byte Folded Reload
-; X86-NEXT: xorl %edi, %edi
-; X86-NEXT: orb %cl, %dl
-; X86-NEXT: cmovnel %edi, %eax
-; X86-NEXT: movl $-2147483648, %edx # imm = 0x80000000
-; X86-NEXT: cmovel %esi, %edx
+; X86-NEXT: sete %ch
+; X86-NEXT: andb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Folded Reload
+; X86-NEXT: xorl %esi, %esi
+; X86-NEXT: orb %cl, %ch
+; X86-NEXT: cmovnel %esi, %eax
+; X86-NEXT: movl $-2147483648, %ecx # imm = 0x80000000
+; X86-NEXT: cmovnel %ecx, %edx
; X86-NEXT: addl $4, %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
@@ -720,52 +718,51 @@ define i64 @func8(i64 %x, i64 %y) nounwind {
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl %eax, %ebp
; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull %esi
+; X86-NEXT: mull {{[0-9]+}}(%esp)
; X86-NEXT: addl %edx, %ebp
-; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: imull %edi
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %esi
-; X86-NEXT: addl %ebp, %eax
-; X86-NEXT: adcl %ebx, %edx
-; X86-NEXT: adcl $0, %edi
-; X86-NEXT: addl %ecx, %edx
; X86-NEXT: adcl $0, %edi
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: imull {{[0-9]+}}(%esp)
; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: subl %esi, %ebx
-; X86-NEXT: movl %edi, %esi
-; X86-NEXT: sbbl $0, %esi
-; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: cmovnsl %edi, %esi
-; X86-NEXT: cmovnsl %edx, %ebx
-; X86-NEXT: movl %ebx, %edx
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: addl %ebp, %eax
+; X86-NEXT: adcl %edx, %edi
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: addl %ecx, %edi
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: movl %edi, %edx
; X86-NEXT: subl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl %esi, %ecx
+; X86-NEXT: movl %ebx, %ebp
+; X86-NEXT: sbbl $0, %ebp
+; X86-NEXT: testl %esi, %esi
+; X86-NEXT: cmovnsl %ebx, %ebp
+; X86-NEXT: cmovnsl %edi, %edx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: subl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl %ebp, %ecx
; X86-NEXT: sbbl $0, %ecx
; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: cmovnsl %esi, %ecx
-; X86-NEXT: cmovnsl %ebx, %edx
-; X86-NEXT: shrdl $31, %edx, %eax
-; X86-NEXT: shrdl $31, %ecx, %edx
+; X86-NEXT: cmovnsl %ebp, %ecx
+; X86-NEXT: cmovnsl %edx, %esi
+; X86-NEXT: shrdl $31, %esi, %eax
+; X86-NEXT: shrdl $31, %ecx, %esi
; X86-NEXT: cmpl $1073741824, %ecx # imm = 0x40000000
-; X86-NEXT: movl $2147483647, %esi # imm = 0x7FFFFFFF
-; X86-NEXT: cmovll %edx, %esi
+; X86-NEXT: movl $2147483647, %edi # imm = 0x7FFFFFFF
+; X86-NEXT: cmovll %esi, %edi
; X86-NEXT: movl $-1, %edx
; X86-NEXT: cmovgel %edx, %eax
; X86-NEXT: xorl %edx, %edx
; X86-NEXT: cmpl $-1073741824, %ecx # imm = 0xC0000000
; X86-NEXT: cmovll %edx, %eax
; X86-NEXT: movl $-2147483648, %edx # imm = 0x80000000
-; X86-NEXT: cmovgel %esi, %edx
+; X86-NEXT: cmovgel %edi, %edx
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
index b93ecdff39412..c0d9287ba16ed 100644
--- a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
@@ -42,10 +42,10 @@ define <4 x i32> @test_srem_odd_even(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: por %xmm2, %xmm1
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993458,306783378,171798690,42949672]
-; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: por %xmm2, %xmm0
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,306783378,171798690,42949672]
+; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
@@ -405,10 +405,10 @@ define <4 x i32> @test_srem_odd_even_allones_eq(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: por %xmm2, %xmm1
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993458,306783378,4294967295,42949672]
-; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: por %xmm2, %xmm0
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,306783378,4294967295,42949672]
+; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
@@ -492,10 +492,10 @@ define <4 x i32> @test_srem_odd_even_allones_ne(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: por %xmm2, %xmm1
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993458,306783378,4294967295,42949672]
-; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: por %xmm2, %xmm0
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,306783378,4294967295,42949672]
+; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: retq
@@ -585,10 +585,10 @@ define <4 x i32> @test_srem_odd_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: por %xmm2, %xmm1
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993458,858993458,268435454,858993458]
-; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: por %xmm2, %xmm0
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,858993458,268435454,858993458]
+; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
@@ -675,10 +675,10 @@ define <4 x i32> @test_srem_even_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: por %xmm2, %xmm1
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [306783378,306783378,268435454,306783378]
-; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: por %xmm2, %xmm0
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,306783378,268435454,306783378]
+; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
@@ -764,10 +764,10 @@ define <4 x i32> @test_srem_odd_even_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: por %xmm2, %xmm1
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993458,306783378,268435454,42949672]
-; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: por %xmm2, %xmm0
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,306783378,268435454,42949672]
+; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
@@ -992,10 +992,10 @@ define <4 x i32> @test_srem_odd_even_one(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: por %xmm2, %xmm1
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993458,306783378,4294967295,42949672]
-; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: por %xmm2, %xmm0
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,306783378,4294967295,42949672]
+; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
@@ -1406,10 +1406,10 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: por %xmm2, %xmm1
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993458,4294967295,268435454,858993458]
-; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: por %xmm2, %xmm0
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,4294967295,268435454,858993458]
+; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
@@ -1495,10 +1495,10 @@ define <4 x i32> @test_srem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: por %xmm2, %xmm1
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [306783378,4294967295,268435454,306783378]
-; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: por %xmm2, %xmm0
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,4294967295,268435454,306783378]
+; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
@@ -1584,10 +1584,10 @@ define <4 x i32> @test_srem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: por %xmm2, %xmm1
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993458,4294967295,268435454,42949672]
-; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: por %xmm2, %xmm0
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,4294967295,268435454,42949672]
+; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
@@ -1812,10 +1812,10 @@ define <4 x i32> @test_srem_odd_even_allones_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: por %xmm2, %xmm1
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993458,4294967295,4294967295,42949672]
-; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: por %xmm2, %xmm0
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,4294967295,4294967295,42949672]
+; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
@@ -1903,10 +1903,10 @@ define <4 x i32> @test_srem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: por %xmm2, %xmm1
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993458,268435454,4294967295,858993458]
-; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: por %xmm2, %xmm0
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,268435454,4294967295,858993458]
+; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
@@ -1992,10 +1992,10 @@ define <4 x i32> @test_srem_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: por %xmm2, %xmm1
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [306783378,268435454,4294967295,306783378]
-; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: por %xmm2, %xmm0
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,268435454,4294967295,306783378]
+; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
@@ -2081,10 +2081,10 @@ define <4 x i32> @test_srem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: por %xmm2, %xmm1
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993458,268435454,4294967295,42949672]
-; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: por %xmm2, %xmm0
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,268435454,4294967295,42949672]
+; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll b/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll
index e26b26f1dfa5d..95587ee7ae6b0 100644
--- a/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll
+++ b/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll
@@ -332,9 +332,8 @@ define <4 x float> @test13(<4 x float> %A, <4 x float> %B) {
; SSE: # %bb.0:
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; SSE-NEXT: addss %xmm0, %xmm1
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: addss %xmm1, %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE-NEXT: retq
;
; AVX1-LABEL: test13:
diff --git a/llvm/test/CodeGen/X86/uadd_sat.ll b/llvm/test/CodeGen/X86/uadd_sat.ll
index 39ad8ab47bcd7..cbecdefbec260 100644
--- a/llvm/test/CodeGen/X86/uadd_sat.ll
+++ b/llvm/test/CodeGen/X86/uadd_sat.ll
@@ -151,11 +151,12 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64-LABEL: vec:
; X64: # %bb.0:
; X64-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; X64-NEXT: paddd %xmm0, %xmm1
-; X64-NEXT: pxor %xmm2, %xmm0
-; X64-NEXT: pxor %xmm1, %xmm2
-; X64-NEXT: pcmpgtd %xmm2, %xmm0
-; X64-NEXT: por %xmm1, %xmm0
+; X64-NEXT: movdqa %xmm0, %xmm3
+; X64-NEXT: pxor %xmm2, %xmm3
+; X64-NEXT: paddd %xmm1, %xmm0
+; X64-NEXT: pxor %xmm0, %xmm2
+; X64-NEXT: pcmpgtd %xmm2, %xmm3
+; X64-NEXT: por %xmm3, %xmm0
; X64-NEXT: retq
%tmp = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> %x, <4 x i32> %y)
ret <4 x i32> %tmp
diff --git a/llvm/test/CodeGen/X86/uadd_sat_vec.ll b/llvm/test/CodeGen/X86/uadd_sat_vec.ll
index b49b0717a6749..997205b610179 100644
--- a/llvm/test/CodeGen/X86/uadd_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/uadd_sat_vec.ll
@@ -522,21 +522,23 @@ define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
; SSE2-LABEL: v2i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm0
-; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: paddd %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm0, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: v2i32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT: paddd %xmm0, %xmm1
-; SSSE3-NEXT: pxor %xmm2, %xmm0
-; SSSE3-NEXT: pxor %xmm1, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0
-; SSSE3-NEXT: por %xmm1, %xmm0
+; SSSE3-NEXT: movdqa %xmm0, %xmm3
+; SSSE3-NEXT: pxor %xmm2, %xmm3
+; SSSE3-NEXT: paddd %xmm1, %xmm0
+; SSSE3-NEXT: pxor %xmm0, %xmm2
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3
+; SSSE3-NEXT: por %xmm3, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: v2i32:
@@ -588,21 +590,23 @@ define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
; SSE2-LABEL: v4i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm0
-; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: paddd %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm0, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: v4i32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT: paddd %xmm0, %xmm1
-; SSSE3-NEXT: pxor %xmm2, %xmm0
-; SSSE3-NEXT: pxor %xmm1, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0
-; SSSE3-NEXT: por %xmm1, %xmm0
+; SSSE3-NEXT: movdqa %xmm0, %xmm3
+; SSSE3-NEXT: pxor %xmm2, %xmm3
+; SSSE3-NEXT: paddd %xmm1, %xmm0
+; SSSE3-NEXT: pxor %xmm0, %xmm2
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3
+; SSSE3-NEXT: por %xmm3, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: v4i32:
@@ -654,33 +658,37 @@ define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
; SSE2-LABEL: v8i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: paddd %xmm0, %xmm2
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: pxor %xmm4, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: paddd %xmm1, %xmm3
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: pxor %xmm3, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm1
-; SSE2-NEXT: por %xmm3, %xmm1
+; SSE2-NEXT: paddd %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pxor %xmm4, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm5
+; SSE2-NEXT: por %xmm5, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pxor %xmm4, %xmm2
+; SSE2-NEXT: paddd %xmm3, %xmm1
+; SSE2-NEXT: pxor %xmm1, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm2
+; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: v8i32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT: paddd %xmm0, %xmm2
-; SSSE3-NEXT: pxor %xmm4, %xmm0
-; SSSE3-NEXT: movdqa %xmm2, %xmm5
+; SSSE3-NEXT: movdqa %xmm0, %xmm5
; SSSE3-NEXT: pxor %xmm4, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm0
-; SSSE3-NEXT: por %xmm2, %xmm0
-; SSSE3-NEXT: paddd %xmm1, %xmm3
-; SSSE3-NEXT: pxor %xmm4, %xmm1
-; SSSE3-NEXT: pxor %xmm3, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm1
-; SSSE3-NEXT: por %xmm3, %xmm1
+; SSSE3-NEXT: paddd %xmm2, %xmm0
+; SSSE3-NEXT: movdqa %xmm0, %xmm2
+; SSSE3-NEXT: pxor %xmm4, %xmm2
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm5
+; SSSE3-NEXT: por %xmm5, %xmm0
+; SSSE3-NEXT: movdqa %xmm1, %xmm2
+; SSSE3-NEXT: pxor %xmm4, %xmm2
+; SSSE3-NEXT: paddd %xmm3, %xmm1
+; SSSE3-NEXT: pxor %xmm1, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm2
+; SSSE3-NEXT: por %xmm2, %xmm1
; SSSE3-NEXT: retq
;
; SSE41-LABEL: v8i32:
@@ -741,57 +749,65 @@ define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
; SSE2-LABEL: v16i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: paddd %xmm0, %xmm4
-; SSE2-NEXT: pxor %xmm8, %xmm0
-; SSE2-NEXT: movdqa %xmm4, %xmm9
+; SSE2-NEXT: movdqa %xmm0, %xmm9
; SSE2-NEXT: pxor %xmm8, %xmm9
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm0
-; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: paddd %xmm1, %xmm5
-; SSE2-NEXT: pxor %xmm8, %xmm1
-; SSE2-NEXT: movdqa %xmm5, %xmm4
+; SSE2-NEXT: paddd %xmm4, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: pxor %xmm8, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm1
-; SSE2-NEXT: por %xmm5, %xmm1
-; SSE2-NEXT: paddd %xmm2, %xmm6
-; SSE2-NEXT: pxor %xmm8, %xmm2
-; SSE2-NEXT: movdqa %xmm6, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm9
+; SSE2-NEXT: por %xmm9, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pxor %xmm8, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm2
-; SSE2-NEXT: por %xmm6, %xmm2
-; SSE2-NEXT: paddd %xmm3, %xmm7
-; SSE2-NEXT: pxor %xmm8, %xmm3
-; SSE2-NEXT: pxor %xmm7, %xmm8
-; SSE2-NEXT: pcmpgtd %xmm8, %xmm3
-; SSE2-NEXT: por %xmm7, %xmm3
+; SSE2-NEXT: paddd %xmm5, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: pxor %xmm8, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm4
+; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pxor %xmm8, %xmm4
+; SSE2-NEXT: paddd %xmm6, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: pxor %xmm8, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm4
+; SSE2-NEXT: por %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pxor %xmm8, %xmm4
+; SSE2-NEXT: paddd %xmm7, %xmm3
+; SSE2-NEXT: pxor %xmm3, %xmm8
+; SSE2-NEXT: pcmpgtd %xmm8, %xmm4
+; SSE2-NEXT: por %xmm4, %xmm3
; SSE2-NEXT: retq
;
; SSSE3-LABEL: v16i32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT: paddd %xmm0, %xmm4
-; SSSE3-NEXT: pxor %xmm8, %xmm0
-; SSSE3-NEXT: movdqa %xmm4, %xmm9
+; SSSE3-NEXT: movdqa %xmm0, %xmm9
; SSSE3-NEXT: pxor %xmm8, %xmm9
-; SSSE3-NEXT: pcmpgtd %xmm9, %xmm0
-; SSSE3-NEXT: por %xmm4, %xmm0
-; SSSE3-NEXT: paddd %xmm1, %xmm5
-; SSSE3-NEXT: pxor %xmm8, %xmm1
-; SSSE3-NEXT: movdqa %xmm5, %xmm4
+; SSSE3-NEXT: paddd %xmm4, %xmm0
+; SSSE3-NEXT: movdqa %xmm0, %xmm4
; SSSE3-NEXT: pxor %xmm8, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm1
-; SSSE3-NEXT: por %xmm5, %xmm1
-; SSSE3-NEXT: paddd %xmm2, %xmm6
-; SSSE3-NEXT: pxor %xmm8, %xmm2
-; SSSE3-NEXT: movdqa %xmm6, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm9
+; SSSE3-NEXT: por %xmm9, %xmm0
+; SSSE3-NEXT: movdqa %xmm1, %xmm4
; SSSE3-NEXT: pxor %xmm8, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm2
-; SSSE3-NEXT: por %xmm6, %xmm2
-; SSSE3-NEXT: paddd %xmm3, %xmm7
-; SSSE3-NEXT: pxor %xmm8, %xmm3
-; SSSE3-NEXT: pxor %xmm7, %xmm8
-; SSSE3-NEXT: pcmpgtd %xmm8, %xmm3
-; SSSE3-NEXT: por %xmm7, %xmm3
+; SSSE3-NEXT: paddd %xmm5, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm5
+; SSSE3-NEXT: pxor %xmm8, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm4
+; SSSE3-NEXT: por %xmm4, %xmm1
+; SSSE3-NEXT: movdqa %xmm2, %xmm4
+; SSSE3-NEXT: pxor %xmm8, %xmm4
+; SSSE3-NEXT: paddd %xmm6, %xmm2
+; SSSE3-NEXT: movdqa %xmm2, %xmm5
+; SSSE3-NEXT: pxor %xmm8, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm4
+; SSSE3-NEXT: por %xmm4, %xmm2
+; SSSE3-NEXT: movdqa %xmm3, %xmm4
+; SSSE3-NEXT: pxor %xmm8, %xmm4
+; SSSE3-NEXT: paddd %xmm7, %xmm3
+; SSSE3-NEXT: pxor %xmm3, %xmm8
+; SSSE3-NEXT: pcmpgtd %xmm8, %xmm4
+; SSSE3-NEXT: por %xmm4, %xmm3
; SSSE3-NEXT: retq
;
; SSE41-LABEL: v16i32:
@@ -863,16 +879,17 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
; SSE-LABEL: v2i64:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
-; SSE-NEXT: paddq %xmm0, %xmm1
-; SSE-NEXT: pxor %xmm2, %xmm0
-; SSE-NEXT: pxor %xmm1, %xmm2
; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: pcmpgtd %xmm2, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
-; SSE-NEXT: pcmpeqd %xmm0, %xmm2
+; SSE-NEXT: pxor %xmm2, %xmm3
+; SSE-NEXT: paddq %xmm1, %xmm0
+; SSE-NEXT: pxor %xmm0, %xmm2
+; SSE-NEXT: movdqa %xmm3, %xmm1
+; SSE-NEXT: pcmpgtd %xmm2, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2]
+; SSE-NEXT: pcmpeqd %xmm3, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE-NEXT: pand %xmm4, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: por %xmm2, %xmm0
; SSE-NEXT: retq
@@ -923,31 +940,33 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
; SSE-LABEL: v4i64:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456]
-; SSE-NEXT: paddq %xmm0, %xmm2
-; SSE-NEXT: pxor %xmm4, %xmm0
-; SSE-NEXT: movdqa %xmm2, %xmm5
+; SSE-NEXT: movdqa %xmm0, %xmm5
; SSE-NEXT: pxor %xmm4, %xmm5
-; SSE-NEXT: movdqa %xmm0, %xmm6
-; SSE-NEXT: pcmpgtd %xmm5, %xmm6
+; SSE-NEXT: paddq %xmm2, %xmm0
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: pxor %xmm4, %xmm2
+; SSE-NEXT: movdqa %xmm5, %xmm6
+; SSE-NEXT: pcmpgtd %xmm2, %xmm6
; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE-NEXT: pcmpeqd %xmm0, %xmm5
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE-NEXT: pand %xmm7, %xmm5
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSE-NEXT: por %xmm2, %xmm0
+; SSE-NEXT: pcmpeqd %xmm5, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE-NEXT: pand %xmm7, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
; SSE-NEXT: por %xmm5, %xmm0
-; SSE-NEXT: paddq %xmm1, %xmm3
-; SSE-NEXT: pxor %xmm4, %xmm1
-; SSE-NEXT: pxor %xmm3, %xmm4
+; SSE-NEXT: por %xmm2, %xmm0
; SSE-NEXT: movdqa %xmm1, %xmm2
-; SSE-NEXT: pcmpgtd %xmm4, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2]
-; SSE-NEXT: pcmpeqd %xmm1, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE-NEXT: pand %xmm5, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSE-NEXT: pxor %xmm4, %xmm2
+; SSE-NEXT: paddq %xmm3, %xmm1
+; SSE-NEXT: pxor %xmm1, %xmm4
+; SSE-NEXT: movdqa %xmm2, %xmm3
+; SSE-NEXT: pcmpgtd %xmm4, %xmm3
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; SSE-NEXT: pcmpeqd %xmm2, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSE-NEXT: pand %xmm5, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE-NEXT: por %xmm3, %xmm1
-; SSE-NEXT: por %xmm4, %xmm1
+; SSE-NEXT: por %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: v4i64:
@@ -1003,57 +1022,61 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
; SSE-LABEL: v8i64:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259456,9223372039002259456]
-; SSE-NEXT: paddq %xmm0, %xmm4
-; SSE-NEXT: pxor %xmm8, %xmm0
-; SSE-NEXT: movdqa %xmm4, %xmm9
+; SSE-NEXT: movdqa %xmm0, %xmm9
; SSE-NEXT: pxor %xmm8, %xmm9
-; SSE-NEXT: movdqa %xmm0, %xmm10
-; SSE-NEXT: pcmpgtd %xmm9, %xmm10
+; SSE-NEXT: paddq %xmm4, %xmm0
+; SSE-NEXT: movdqa %xmm0, %xmm4
+; SSE-NEXT: pxor %xmm8, %xmm4
+; SSE-NEXT: movdqa %xmm9, %xmm10
+; SSE-NEXT: pcmpgtd %xmm4, %xmm10
; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
-; SSE-NEXT: pcmpeqd %xmm0, %xmm9
-; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3]
+; SSE-NEXT: pcmpeqd %xmm9, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3]
; SSE-NEXT: pand %xmm11, %xmm9
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,1,3,3]
; SSE-NEXT: por %xmm4, %xmm0
; SSE-NEXT: por %xmm9, %xmm0
-; SSE-NEXT: paddq %xmm1, %xmm5
-; SSE-NEXT: pxor %xmm8, %xmm1
-; SSE-NEXT: movdqa %xmm5, %xmm4
-; SSE-NEXT: pxor %xmm8, %xmm4
; SSE-NEXT: movdqa %xmm1, %xmm9
-; SSE-NEXT: pcmpgtd %xmm4, %xmm9
-; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2]
-; SSE-NEXT: pcmpeqd %xmm1, %xmm4
+; SSE-NEXT: pxor %xmm8, %xmm9
+; SSE-NEXT: paddq %xmm5, %xmm1
+; SSE-NEXT: movdqa %xmm1, %xmm5
+; SSE-NEXT: pxor %xmm8, %xmm5
+; SSE-NEXT: movdqa %xmm9, %xmm4
+; SSE-NEXT: pcmpgtd %xmm5, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,2,2]
+; SSE-NEXT: pcmpeqd %xmm9, %xmm5
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE-NEXT: pand %xmm10, %xmm5
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE-NEXT: pand %xmm10, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,3,3]
-; SSE-NEXT: por %xmm5, %xmm1
; SSE-NEXT: por %xmm4, %xmm1
-; SSE-NEXT: paddq %xmm2, %xmm6
-; SSE-NEXT: pxor %xmm8, %xmm2
-; SSE-NEXT: movdqa %xmm6, %xmm4
+; SSE-NEXT: por %xmm5, %xmm1
+; SSE-NEXT: movdqa %xmm2, %xmm4
; SSE-NEXT: pxor %xmm8, %xmm4
+; SSE-NEXT: paddq %xmm6, %xmm2
; SSE-NEXT: movdqa %xmm2, %xmm5
-; SSE-NEXT: pcmpgtd %xmm4, %xmm5
-; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2]
-; SSE-NEXT: pcmpeqd %xmm2, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE-NEXT: pxor %xmm8, %xmm5
+; SSE-NEXT: movdqa %xmm4, %xmm6
+; SSE-NEXT: pcmpgtd %xmm5, %xmm6
+; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,0,2,2]
+; SSE-NEXT: pcmpeqd %xmm4, %xmm5
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
; SSE-NEXT: pand %xmm9, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3]
-; SSE-NEXT: por %xmm6, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
+; SSE-NEXT: por %xmm5, %xmm2
; SSE-NEXT: por %xmm4, %xmm2
-; SSE-NEXT: paddq %xmm3, %xmm7
-; SSE-NEXT: pxor %xmm8, %xmm3
-; SSE-NEXT: pxor %xmm7, %xmm8
; SSE-NEXT: movdqa %xmm3, %xmm4
-; SSE-NEXT: pcmpgtd %xmm8, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE-NEXT: pcmpeqd %xmm3, %xmm8
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm8[1,1,3,3]
-; SSE-NEXT: pand %xmm5, %xmm6
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; SSE-NEXT: por %xmm7, %xmm3
-; SSE-NEXT: por %xmm6, %xmm3
+; SSE-NEXT: pxor %xmm8, %xmm4
+; SSE-NEXT: paddq %xmm7, %xmm3
+; SSE-NEXT: pxor %xmm3, %xmm8
+; SSE-NEXT: movdqa %xmm4, %xmm5
+; SSE-NEXT: pcmpgtd %xmm8, %xmm5
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE-NEXT: pcmpeqd %xmm4, %xmm8
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[1,1,3,3]
+; SSE-NEXT: pand %xmm6, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE-NEXT: por %xmm5, %xmm3
+; SSE-NEXT: por %xmm4, %xmm3
; SSE-NEXT: retq
;
; AVX1-LABEL: v8i64:
diff --git a/llvm/test/CodeGen/X86/umul-with-overflow.ll b/llvm/test/CodeGen/X86/umul-with-overflow.ll
index 3afddbb7f0e4d..bc93d02e3deee 100644
--- a/llvm/test/CodeGen/X86/umul-with-overflow.ll
+++ b/llvm/test/CodeGen/X86/umul-with-overflow.ll
@@ -34,9 +34,10 @@ define i32 @test2(i32 %a, i32 %b) nounwind readnone {
;
; X64-LABEL: test2:
; X64: # %bb.0: # %entry
+; X64-NEXT: # kill: def $esi killed $esi def $rsi
; X64-NEXT: # kill: def $edi killed $edi def $rdi
-; X64-NEXT: addl %esi, %edi
-; X64-NEXT: leal (%rdi,%rdi), %eax
+; X64-NEXT: leal (%rdi,%rsi), %eax
+; X64-NEXT: addl %eax, %eax
; X64-NEXT: retq
entry:
%tmp0 = add i32 %b, %a
diff --git a/llvm/test/CodeGen/X86/umul_fix.ll b/llvm/test/CodeGen/X86/umul_fix.ll
index fa247791a025e..fce98cc448645 100644
--- a/llvm/test/CodeGen/X86/umul_fix.ll
+++ b/llvm/test/CodeGen/X86/umul_fix.ll
@@ -45,19 +45,19 @@ define i64 @func2(i64 %x, i64 %y) nounwind {
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl %eax, %ebx
; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: mull %ebp
; X86-NEXT: movl %eax, %esi
-; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: addl %ebx, %ebp
+; X86-NEXT: addl %edx, %ebx
; X86-NEXT: adcl $0, %edi
; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: addl %ebp, %eax
+; X86-NEXT: mull %ebp
+; X86-NEXT: addl %ebx, %eax
; X86-NEXT: adcl %edi, %edx
; X86-NEXT: imull {{[0-9]+}}(%esp), %ecx
; X86-NEXT: addl %ecx, %edx
@@ -306,31 +306,31 @@ define i64 @func8(i64 %x, i64 %y) nounwind {
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull %ebp
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, %edi
-; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull %ebx
; X86-NEXT: addl %edx, %edi
-; X86-NEXT: adcl $0, %esi
+; X86-NEXT: adcl $0, %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %ebp
-; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %eax, %ebp
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %ebx
; X86-NEXT: addl %edi, %eax
-; X86-NEXT: adcl %esi, %edx
-; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: addl %ebp, %edx
-; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: shldl $1, %edx, %ecx
-; X86-NEXT: shrdl $31, %edx, %eax
-; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: adcl %edx, %ecx
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: addl %ebp, %ecx
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: shldl $1, %ecx, %esi
+; X86-NEXT: shrdl $31, %ecx, %eax
+; X86-NEXT: movl %esi, %edx
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
diff --git a/llvm/test/CodeGen/X86/umul_fix_sat.ll b/llvm/test/CodeGen/X86/umul_fix_sat.ll
index a61090ac19e09..155dab5b47373 100644
--- a/llvm/test/CodeGen/X86/umul_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/umul_fix_sat.ll
@@ -52,27 +52,26 @@ define i64 @func2(i64 %x, i64 %y) nounwind {
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull %ebx
+; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl %eax, %esi
+; X86-NEXT: movl %eax, %ebx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: addl %esi, %ebp
+; X86-NEXT: addl %edx, %ebx
; X86-NEXT: adcl $0, %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ebx
+; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: movl %eax, %ebp
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: addl %ebp, %eax
+; X86-NEXT: addl %ebx, %eax
; X86-NEXT: adcl %edi, %edx
; X86-NEXT: adcl $0, %esi
-; X86-NEXT: addl %ebx, %edx
+; X86-NEXT: addl %ebp, %edx
; X86-NEXT: adcl $0, %esi
; X86-NEXT: shrdl $2, %eax, %ecx
; X86-NEXT: shrdl $2, %edx, %eax
diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
index cb7a93a31eeea..a9be2a5b9273e 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
@@ -35,10 +35,10 @@ define <4 x i32> @test_urem_odd_even(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: por %xmm2, %xmm1
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993459,306783378,171798691,42949672]
-; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: por %xmm2, %xmm0
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,306783378,171798691,42949672]
+; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
@@ -189,10 +189,10 @@ define <4 x i32> @test_urem_even_allones_eq(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: por %xmm2, %xmm1
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [306783378,306783378,1,306783378]
-; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: por %xmm2, %xmm0
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,306783378,1,306783378]
+; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
@@ -267,10 +267,10 @@ define <4 x i32> @test_urem_even_allones_ne(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: por %xmm2, %xmm1
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [306783379,306783379,2,306783379]
-; CHECK-SSE41-NEXT: pmaxud %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: por %xmm2, %xmm0
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783379,306783379,2,306783379]
+; CHECK-SSE41-NEXT: pmaxud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
@@ -346,10 +346,10 @@ define <4 x i32> @test_urem_odd_even_allones_eq(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: por %xmm2, %xmm1
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993459,306783378,1,42949672]
-; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: por %xmm2, %xmm0
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,306783378,1,42949672]
+; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
@@ -423,10 +423,10 @@ define <4 x i32> @test_urem_odd_even_allones_ne(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: por %xmm2, %xmm1
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993460,306783379,2,42949673]
-; CHECK-SSE41-NEXT: pmaxud %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: por %xmm2, %xmm0
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993460,306783379,2,42949673]
+; CHECK-SSE41-NEXT: pmaxud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
@@ -505,10 +505,10 @@ define <4 x i32> @test_urem_odd_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: por %xmm2, %xmm1
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993459,858993459,268435455,858993459]
-; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: por %xmm2, %xmm0
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,858993459,268435455,858993459]
+; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
@@ -585,10 +585,10 @@ define <4 x i32> @test_urem_even_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: por %xmm2, %xmm1
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [306783378,306783378,268435455,306783378]
-; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: por %xmm2, %xmm0
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,306783378,268435455,306783378]
+; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
@@ -664,10 +664,10 @@ define <4 x i32> @test_urem_odd_even_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: por %xmm2, %xmm1
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993459,306783378,268435455,42949672]
-; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: por %xmm2, %xmm0
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,306783378,268435455,42949672]
+; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
@@ -870,10 +870,10 @@ define <4 x i32> @test_urem_odd_even_one(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: por %xmm2, %xmm1
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993459,306783378,4294967295,42949672]
-; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: por %xmm2, %xmm0
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,306783378,4294967295,42949672]
+; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
@@ -952,10 +952,10 @@ define <4 x i32> @test_urem_odd_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: por %xmm2, %xmm1
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993459,858993459,1,858993459]
-; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: por %xmm2, %xmm0
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,858993459,1,858993459]
+; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
@@ -1032,10 +1032,10 @@ define <4 x i32> @test_urem_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: por %xmm2, %xmm1
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [306783378,306783378,1,306783378]
-; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: por %xmm2, %xmm0
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,306783378,1,306783378]
+; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
@@ -1111,10 +1111,10 @@ define <4 x i32> @test_urem_odd_even_INT_MIN(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: por %xmm2, %xmm1
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993459,306783378,1,42949672]
-; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: por %xmm2, %xmm0
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,306783378,1,42949672]
+; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
@@ -1193,10 +1193,10 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: por %xmm2, %xmm1
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993459,1,268435455,858993459]
-; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: por %xmm2, %xmm0
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,1,268435455,858993459]
+; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
@@ -1272,10 +1272,10 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: por %xmm2, %xmm1
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [306783378,1,268435455,306783378]
-; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: por %xmm2, %xmm0
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,1,268435455,306783378]
+; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
@@ -1351,10 +1351,10 @@ define <4 x i32> @test_urem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: por %xmm2, %xmm1
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993459,1,268435455,42949672]
-; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: por %xmm2, %xmm0
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,1,268435455,42949672]
+; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
@@ -1469,10 +1469,10 @@ define <4 x i32> @test_urem_even_allones_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: por %xmm2, %xmm1
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [306783378,1,4294967295,306783378]
-; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: por %xmm2, %xmm0
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,1,4294967295,306783378]
+; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
@@ -1548,10 +1548,10 @@ define <4 x i32> @test_urem_odd_even_allones_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: por %xmm2, %xmm1
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993459,1,4294967295,42949672]
-; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: por %xmm2, %xmm0
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,1,4294967295,42949672]
+; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
@@ -1629,10 +1629,10 @@ define <4 x i32> @test_urem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: por %xmm2, %xmm1
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993459,268435455,4294967295,858993459]
-; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: por %xmm2, %xmm0
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,268435455,4294967295,858993459]
+; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
@@ -1708,10 +1708,10 @@ define <4 x i32> @test_urem_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: por %xmm2, %xmm1
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [306783378,268435455,4294967295,306783378]
-; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: por %xmm2, %xmm0
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,268435455,4294967295,306783378]
+; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
@@ -1787,10 +1787,10 @@ define <4 x i32> @test_urem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: por %xmm2, %xmm1
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993459,268435455,4294967295,42949672]
-; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: por %xmm2, %xmm0
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,268435455,4294967295,42949672]
+; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
@@ -1868,10 +1868,10 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: por %xmm2, %xmm1
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993459,1,268435455,4294967295]
-; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: por %xmm2, %xmm0
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,1,268435455,4294967295]
+; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
@@ -1947,10 +1947,10 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) no
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: por %xmm2, %xmm1
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [306783378,1,268435455,4294967295]
-; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-SSE41-NEXT: por %xmm2, %xmm0
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,1,268435455,4294967295]
+; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vec-strict-cmp-128.ll b/llvm/test/CodeGen/X86/vec-strict-cmp-128.ll
index 3b4777664b666..008435b47b53a 100644
--- a/llvm/test/CodeGen/X86/vec-strict-cmp-128.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-cmp-128.ll
@@ -723,11 +723,11 @@ define <4 x i32> @test_v4f32_one_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1,
; SSE-32-NEXT: movaps 8(%ebp), %xmm3
; SSE-32-NEXT: movaps %xmm2, %xmm4
; SSE-32-NEXT: cmpneqps %xmm3, %xmm4
-; SSE-32-NEXT: cmpordps %xmm3, %xmm2
-; SSE-32-NEXT: andps %xmm4, %xmm2
-; SSE-32-NEXT: andps %xmm2, %xmm0
-; SSE-32-NEXT: andnps %xmm1, %xmm2
-; SSE-32-NEXT: orps %xmm2, %xmm0
+; SSE-32-NEXT: cmpordps %xmm2, %xmm3
+; SSE-32-NEXT: andps %xmm4, %xmm3
+; SSE-32-NEXT: andps %xmm3, %xmm0
+; SSE-32-NEXT: andnps %xmm1, %xmm3
+; SSE-32-NEXT: orps %xmm3, %xmm0
; SSE-32-NEXT: movl %ebp, %esp
; SSE-32-NEXT: popl %ebp
; SSE-32-NEXT: retl
@@ -737,10 +737,10 @@ define <4 x i32> @test_v4f32_one_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1,
; SSE-64-NEXT: movaps %xmm2, %xmm4
; SSE-64-NEXT: cmpneqps %xmm3, %xmm4
; SSE-64-NEXT: cmpordps %xmm3, %xmm2
-; SSE-64-NEXT: andps %xmm4, %xmm2
-; SSE-64-NEXT: andps %xmm2, %xmm0
-; SSE-64-NEXT: andnps %xmm1, %xmm2
-; SSE-64-NEXT: orps %xmm2, %xmm0
+; SSE-64-NEXT: andps %xmm2, %xmm4
+; SSE-64-NEXT: andps %xmm4, %xmm0
+; SSE-64-NEXT: andnps %xmm1, %xmm4
+; SSE-64-NEXT: orps %xmm4, %xmm0
; SSE-64-NEXT: retq
;
; AVX-32-LABEL: test_v4f32_one_q:
@@ -916,11 +916,11 @@ define <4 x i32> @test_v4f32_ueq_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1,
; SSE-32-NEXT: movaps 8(%ebp), %xmm3
; SSE-32-NEXT: movaps %xmm2, %xmm4
; SSE-32-NEXT: cmpeqps %xmm3, %xmm4
-; SSE-32-NEXT: cmpunordps %xmm3, %xmm2
-; SSE-32-NEXT: orps %xmm4, %xmm2
-; SSE-32-NEXT: andps %xmm2, %xmm0
-; SSE-32-NEXT: andnps %xmm1, %xmm2
-; SSE-32-NEXT: orps %xmm2, %xmm0
+; SSE-32-NEXT: cmpunordps %xmm2, %xmm3
+; SSE-32-NEXT: orps %xmm4, %xmm3
+; SSE-32-NEXT: andps %xmm3, %xmm0
+; SSE-32-NEXT: andnps %xmm1, %xmm3
+; SSE-32-NEXT: orps %xmm3, %xmm0
; SSE-32-NEXT: movl %ebp, %esp
; SSE-32-NEXT: popl %ebp
; SSE-32-NEXT: retl
@@ -930,10 +930,10 @@ define <4 x i32> @test_v4f32_ueq_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1,
; SSE-64-NEXT: movaps %xmm2, %xmm4
; SSE-64-NEXT: cmpeqps %xmm3, %xmm4
; SSE-64-NEXT: cmpunordps %xmm3, %xmm2
-; SSE-64-NEXT: orps %xmm4, %xmm2
-; SSE-64-NEXT: andps %xmm2, %xmm0
-; SSE-64-NEXT: andnps %xmm1, %xmm2
-; SSE-64-NEXT: orps %xmm2, %xmm0
+; SSE-64-NEXT: orps %xmm2, %xmm4
+; SSE-64-NEXT: andps %xmm4, %xmm0
+; SSE-64-NEXT: andnps %xmm1, %xmm4
+; SSE-64-NEXT: orps %xmm4, %xmm0
; SSE-64-NEXT: retq
;
; AVX-32-LABEL: test_v4f32_ueq_q:
@@ -2383,11 +2383,11 @@ define <2 x i64> @test_v2f64_one_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1,
; SSE-32-NEXT: movapd 8(%ebp), %xmm3
; SSE-32-NEXT: movapd %xmm2, %xmm4
; SSE-32-NEXT: cmpneqpd %xmm3, %xmm4
-; SSE-32-NEXT: cmpordpd %xmm3, %xmm2
-; SSE-32-NEXT: andpd %xmm4, %xmm2
-; SSE-32-NEXT: andpd %xmm2, %xmm0
-; SSE-32-NEXT: andnpd %xmm1, %xmm2
-; SSE-32-NEXT: orpd %xmm2, %xmm0
+; SSE-32-NEXT: cmpordpd %xmm2, %xmm3
+; SSE-32-NEXT: andpd %xmm4, %xmm3
+; SSE-32-NEXT: andpd %xmm3, %xmm0
+; SSE-32-NEXT: andnpd %xmm1, %xmm3
+; SSE-32-NEXT: orpd %xmm3, %xmm0
; SSE-32-NEXT: movl %ebp, %esp
; SSE-32-NEXT: popl %ebp
; SSE-32-NEXT: retl
@@ -2397,10 +2397,10 @@ define <2 x i64> @test_v2f64_one_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1,
; SSE-64-NEXT: movapd %xmm2, %xmm4
; SSE-64-NEXT: cmpneqpd %xmm3, %xmm4
; SSE-64-NEXT: cmpordpd %xmm3, %xmm2
-; SSE-64-NEXT: andpd %xmm4, %xmm2
-; SSE-64-NEXT: andpd %xmm2, %xmm0
-; SSE-64-NEXT: andnpd %xmm1, %xmm2
-; SSE-64-NEXT: orpd %xmm2, %xmm0
+; SSE-64-NEXT: andpd %xmm2, %xmm4
+; SSE-64-NEXT: andpd %xmm4, %xmm0
+; SSE-64-NEXT: andnpd %xmm1, %xmm4
+; SSE-64-NEXT: orpd %xmm4, %xmm0
; SSE-64-NEXT: retq
;
; AVX-32-LABEL: test_v2f64_one_q:
@@ -2576,11 +2576,11 @@ define <2 x i64> @test_v2f64_ueq_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1,
; SSE-32-NEXT: movapd 8(%ebp), %xmm3
; SSE-32-NEXT: movapd %xmm2, %xmm4
; SSE-32-NEXT: cmpeqpd %xmm3, %xmm4
-; SSE-32-NEXT: cmpunordpd %xmm3, %xmm2
-; SSE-32-NEXT: orpd %xmm4, %xmm2
-; SSE-32-NEXT: andpd %xmm2, %xmm0
-; SSE-32-NEXT: andnpd %xmm1, %xmm2
-; SSE-32-NEXT: orpd %xmm2, %xmm0
+; SSE-32-NEXT: cmpunordpd %xmm2, %xmm3
+; SSE-32-NEXT: orpd %xmm4, %xmm3
+; SSE-32-NEXT: andpd %xmm3, %xmm0
+; SSE-32-NEXT: andnpd %xmm1, %xmm3
+; SSE-32-NEXT: orpd %xmm3, %xmm0
; SSE-32-NEXT: movl %ebp, %esp
; SSE-32-NEXT: popl %ebp
; SSE-32-NEXT: retl
@@ -2590,10 +2590,10 @@ define <2 x i64> @test_v2f64_ueq_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1,
; SSE-64-NEXT: movapd %xmm2, %xmm4
; SSE-64-NEXT: cmpeqpd %xmm3, %xmm4
; SSE-64-NEXT: cmpunordpd %xmm3, %xmm2
-; SSE-64-NEXT: orpd %xmm4, %xmm2
-; SSE-64-NEXT: andpd %xmm2, %xmm0
-; SSE-64-NEXT: andnpd %xmm1, %xmm2
-; SSE-64-NEXT: orpd %xmm2, %xmm0
+; SSE-64-NEXT: orpd %xmm2, %xmm4
+; SSE-64-NEXT: andpd %xmm4, %xmm0
+; SSE-64-NEXT: andnpd %xmm1, %xmm4
+; SSE-64-NEXT: orpd %xmm4, %xmm0
; SSE-64-NEXT: retq
;
; AVX-32-LABEL: test_v2f64_ueq_q:
@@ -3338,10 +3338,10 @@ define <4 x i32> @test_v4f32_oeq_s(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1,
; SSE-32-NEXT: movaps 8(%ebp), %xmm3
; SSE-32-NEXT: movaps %xmm2, %xmm4
; SSE-32-NEXT: cmpltps %xmm3, %xmm4
-; SSE-32-NEXT: cmpeqps %xmm3, %xmm2
-; SSE-32-NEXT: andps %xmm2, %xmm0
-; SSE-32-NEXT: andnps %xmm1, %xmm2
-; SSE-32-NEXT: orps %xmm2, %xmm0
+; SSE-32-NEXT: cmpeqps %xmm2, %xmm3
+; SSE-32-NEXT: andps %xmm3, %xmm0
+; SSE-32-NEXT: andnps %xmm1, %xmm3
+; SSE-32-NEXT: orps %xmm3, %xmm0
; SSE-32-NEXT: movl %ebp, %esp
; SSE-32-NEXT: popl %ebp
; SSE-32-NEXT: retl
@@ -3816,11 +3816,11 @@ define <4 x i32> @test_v4f32_one_s(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1,
; SSE-32-NEXT: cmpltps %xmm3, %xmm4
; SSE-32-NEXT: movaps %xmm2, %xmm4
; SSE-32-NEXT: cmpneqps %xmm3, %xmm4
-; SSE-32-NEXT: cmpordps %xmm3, %xmm2
-; SSE-32-NEXT: andps %xmm4, %xmm2
-; SSE-32-NEXT: andps %xmm2, %xmm0
-; SSE-32-NEXT: andnps %xmm1, %xmm2
-; SSE-32-NEXT: orps %xmm2, %xmm0
+; SSE-32-NEXT: cmpordps %xmm2, %xmm3
+; SSE-32-NEXT: andps %xmm4, %xmm3
+; SSE-32-NEXT: andps %xmm3, %xmm0
+; SSE-32-NEXT: andnps %xmm1, %xmm3
+; SSE-32-NEXT: orps %xmm3, %xmm0
; SSE-32-NEXT: movl %ebp, %esp
; SSE-32-NEXT: popl %ebp
; SSE-32-NEXT: retl
@@ -3832,10 +3832,10 @@ define <4 x i32> @test_v4f32_one_s(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1,
; SSE-64-NEXT: movaps %xmm2, %xmm4
; SSE-64-NEXT: cmpneqps %xmm3, %xmm4
; SSE-64-NEXT: cmpordps %xmm3, %xmm2
-; SSE-64-NEXT: andps %xmm4, %xmm2
-; SSE-64-NEXT: andps %xmm2, %xmm0
-; SSE-64-NEXT: andnps %xmm1, %xmm2
-; SSE-64-NEXT: orps %xmm2, %xmm0
+; SSE-64-NEXT: andps %xmm2, %xmm4
+; SSE-64-NEXT: andps %xmm4, %xmm0
+; SSE-64-NEXT: andnps %xmm1, %xmm4
+; SSE-64-NEXT: orps %xmm4, %xmm0
; SSE-64-NEXT: retq
;
; AVX-32-LABEL: test_v4f32_one_s:
@@ -3918,10 +3918,10 @@ define <4 x i32> @test_v4f32_ord_s(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1,
; SSE-32-NEXT: movaps 8(%ebp), %xmm3
; SSE-32-NEXT: movaps %xmm2, %xmm4
; SSE-32-NEXT: cmpltps %xmm3, %xmm4
-; SSE-32-NEXT: cmpordps %xmm3, %xmm2
-; SSE-32-NEXT: andps %xmm2, %xmm0
-; SSE-32-NEXT: andnps %xmm1, %xmm2
-; SSE-32-NEXT: orps %xmm2, %xmm0
+; SSE-32-NEXT: cmpordps %xmm2, %xmm3
+; SSE-32-NEXT: andps %xmm3, %xmm0
+; SSE-32-NEXT: andnps %xmm1, %xmm3
+; SSE-32-NEXT: orps %xmm3, %xmm0
; SSE-32-NEXT: movl %ebp, %esp
; SSE-32-NEXT: popl %ebp
; SSE-32-NEXT: retl
@@ -4018,11 +4018,11 @@ define <4 x i32> @test_v4f32_ueq_s(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1,
; SSE-32-NEXT: cmpltps %xmm3, %xmm4
; SSE-32-NEXT: movaps %xmm2, %xmm4
; SSE-32-NEXT: cmpeqps %xmm3, %xmm4
-; SSE-32-NEXT: cmpunordps %xmm3, %xmm2
-; SSE-32-NEXT: orps %xmm4, %xmm2
-; SSE-32-NEXT: andps %xmm2, %xmm0
-; SSE-32-NEXT: andnps %xmm1, %xmm2
-; SSE-32-NEXT: orps %xmm2, %xmm0
+; SSE-32-NEXT: cmpunordps %xmm2, %xmm3
+; SSE-32-NEXT: orps %xmm4, %xmm3
+; SSE-32-NEXT: andps %xmm3, %xmm0
+; SSE-32-NEXT: andnps %xmm1, %xmm3
+; SSE-32-NEXT: orps %xmm3, %xmm0
; SSE-32-NEXT: movl %ebp, %esp
; SSE-32-NEXT: popl %ebp
; SSE-32-NEXT: retl
@@ -4034,10 +4034,10 @@ define <4 x i32> @test_v4f32_ueq_s(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1,
; SSE-64-NEXT: movaps %xmm2, %xmm4
; SSE-64-NEXT: cmpeqps %xmm3, %xmm4
; SSE-64-NEXT: cmpunordps %xmm3, %xmm2
-; SSE-64-NEXT: orps %xmm4, %xmm2
-; SSE-64-NEXT: andps %xmm2, %xmm0
-; SSE-64-NEXT: andnps %xmm1, %xmm2
-; SSE-64-NEXT: orps %xmm2, %xmm0
+; SSE-64-NEXT: orps %xmm2, %xmm4
+; SSE-64-NEXT: andps %xmm4, %xmm0
+; SSE-64-NEXT: andnps %xmm1, %xmm4
+; SSE-64-NEXT: orps %xmm4, %xmm0
; SSE-64-NEXT: retq
;
; AVX-32-LABEL: test_v4f32_ueq_s:
@@ -4498,10 +4498,10 @@ define <4 x i32> @test_v4f32_une_s(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1,
; SSE-32-NEXT: movaps 8(%ebp), %xmm3
; SSE-32-NEXT: movaps %xmm2, %xmm4
; SSE-32-NEXT: cmpltps %xmm3, %xmm4
-; SSE-32-NEXT: cmpneqps %xmm3, %xmm2
-; SSE-32-NEXT: andps %xmm2, %xmm0
-; SSE-32-NEXT: andnps %xmm1, %xmm2
-; SSE-32-NEXT: orps %xmm2, %xmm0
+; SSE-32-NEXT: cmpneqps %xmm2, %xmm3
+; SSE-32-NEXT: andps %xmm3, %xmm0
+; SSE-32-NEXT: andnps %xmm1, %xmm3
+; SSE-32-NEXT: orps %xmm3, %xmm0
; SSE-32-NEXT: movl %ebp, %esp
; SSE-32-NEXT: popl %ebp
; SSE-32-NEXT: retl
@@ -4596,10 +4596,10 @@ define <4 x i32> @test_v4f32_uno_s(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1,
; SSE-32-NEXT: movaps 8(%ebp), %xmm3
; SSE-32-NEXT: movaps %xmm2, %xmm4
; SSE-32-NEXT: cmpltps %xmm3, %xmm4
-; SSE-32-NEXT: cmpunordps %xmm3, %xmm2
-; SSE-32-NEXT: andps %xmm2, %xmm0
-; SSE-32-NEXT: andnps %xmm1, %xmm2
-; SSE-32-NEXT: orps %xmm2, %xmm0
+; SSE-32-NEXT: cmpunordps %xmm2, %xmm3
+; SSE-32-NEXT: andps %xmm3, %xmm0
+; SSE-32-NEXT: andnps %xmm1, %xmm3
+; SSE-32-NEXT: orps %xmm3, %xmm0
; SSE-32-NEXT: movl %ebp, %esp
; SSE-32-NEXT: popl %ebp
; SSE-32-NEXT: retl
@@ -4694,10 +4694,10 @@ define <2 x i64> @test_v2f64_oeq_s(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1,
; SSE-32-NEXT: movapd 8(%ebp), %xmm3
; SSE-32-NEXT: movapd %xmm2, %xmm4
; SSE-32-NEXT: cmpltpd %xmm3, %xmm4
-; SSE-32-NEXT: cmpeqpd %xmm3, %xmm2
-; SSE-32-NEXT: andpd %xmm2, %xmm0
-; SSE-32-NEXT: andnpd %xmm1, %xmm2
-; SSE-32-NEXT: orpd %xmm2, %xmm0
+; SSE-32-NEXT: cmpeqpd %xmm2, %xmm3
+; SSE-32-NEXT: andpd %xmm3, %xmm0
+; SSE-32-NEXT: andnpd %xmm1, %xmm3
+; SSE-32-NEXT: orpd %xmm3, %xmm0
; SSE-32-NEXT: movl %ebp, %esp
; SSE-32-NEXT: popl %ebp
; SSE-32-NEXT: retl
@@ -5172,11 +5172,11 @@ define <2 x i64> @test_v2f64_one_s(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1,
; SSE-32-NEXT: cmpltpd %xmm3, %xmm4
; SSE-32-NEXT: movapd %xmm2, %xmm4
; SSE-32-NEXT: cmpneqpd %xmm3, %xmm4
-; SSE-32-NEXT: cmpordpd %xmm3, %xmm2
-; SSE-32-NEXT: andpd %xmm4, %xmm2
-; SSE-32-NEXT: andpd %xmm2, %xmm0
-; SSE-32-NEXT: andnpd %xmm1, %xmm2
-; SSE-32-NEXT: orpd %xmm2, %xmm0
+; SSE-32-NEXT: cmpordpd %xmm2, %xmm3
+; SSE-32-NEXT: andpd %xmm4, %xmm3
+; SSE-32-NEXT: andpd %xmm3, %xmm0
+; SSE-32-NEXT: andnpd %xmm1, %xmm3
+; SSE-32-NEXT: orpd %xmm3, %xmm0
; SSE-32-NEXT: movl %ebp, %esp
; SSE-32-NEXT: popl %ebp
; SSE-32-NEXT: retl
@@ -5188,10 +5188,10 @@ define <2 x i64> @test_v2f64_one_s(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1,
; SSE-64-NEXT: movapd %xmm2, %xmm4
; SSE-64-NEXT: cmpneqpd %xmm3, %xmm4
; SSE-64-NEXT: cmpordpd %xmm3, %xmm2
-; SSE-64-NEXT: andpd %xmm4, %xmm2
-; SSE-64-NEXT: andpd %xmm2, %xmm0
-; SSE-64-NEXT: andnpd %xmm1, %xmm2
-; SSE-64-NEXT: orpd %xmm2, %xmm0
+; SSE-64-NEXT: andpd %xmm2, %xmm4
+; SSE-64-NEXT: andpd %xmm4, %xmm0
+; SSE-64-NEXT: andnpd %xmm1, %xmm4
+; SSE-64-NEXT: orpd %xmm4, %xmm0
; SSE-64-NEXT: retq
;
; AVX-32-LABEL: test_v2f64_one_s:
@@ -5274,10 +5274,10 @@ define <2 x i64> @test_v2f64_ord_s(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1,
; SSE-32-NEXT: movapd 8(%ebp), %xmm3
; SSE-32-NEXT: movapd %xmm2, %xmm4
; SSE-32-NEXT: cmpltpd %xmm3, %xmm4
-; SSE-32-NEXT: cmpordpd %xmm3, %xmm2
-; SSE-32-NEXT: andpd %xmm2, %xmm0
-; SSE-32-NEXT: andnpd %xmm1, %xmm2
-; SSE-32-NEXT: orpd %xmm2, %xmm0
+; SSE-32-NEXT: cmpordpd %xmm2, %xmm3
+; SSE-32-NEXT: andpd %xmm3, %xmm0
+; SSE-32-NEXT: andnpd %xmm1, %xmm3
+; SSE-32-NEXT: orpd %xmm3, %xmm0
; SSE-32-NEXT: movl %ebp, %esp
; SSE-32-NEXT: popl %ebp
; SSE-32-NEXT: retl
@@ -5374,11 +5374,11 @@ define <2 x i64> @test_v2f64_ueq_s(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1,
; SSE-32-NEXT: cmpltpd %xmm3, %xmm4
; SSE-32-NEXT: movapd %xmm2, %xmm4
; SSE-32-NEXT: cmpeqpd %xmm3, %xmm4
-; SSE-32-NEXT: cmpunordpd %xmm3, %xmm2
-; SSE-32-NEXT: orpd %xmm4, %xmm2
-; SSE-32-NEXT: andpd %xmm2, %xmm0
-; SSE-32-NEXT: andnpd %xmm1, %xmm2
-; SSE-32-NEXT: orpd %xmm2, %xmm0
+; SSE-32-NEXT: cmpunordpd %xmm2, %xmm3
+; SSE-32-NEXT: orpd %xmm4, %xmm3
+; SSE-32-NEXT: andpd %xmm3, %xmm0
+; SSE-32-NEXT: andnpd %xmm1, %xmm3
+; SSE-32-NEXT: orpd %xmm3, %xmm0
; SSE-32-NEXT: movl %ebp, %esp
; SSE-32-NEXT: popl %ebp
; SSE-32-NEXT: retl
@@ -5390,10 +5390,10 @@ define <2 x i64> @test_v2f64_ueq_s(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1,
; SSE-64-NEXT: movapd %xmm2, %xmm4
; SSE-64-NEXT: cmpeqpd %xmm3, %xmm4
; SSE-64-NEXT: cmpunordpd %xmm3, %xmm2
-; SSE-64-NEXT: orpd %xmm4, %xmm2
-; SSE-64-NEXT: andpd %xmm2, %xmm0
-; SSE-64-NEXT: andnpd %xmm1, %xmm2
-; SSE-64-NEXT: orpd %xmm2, %xmm0
+; SSE-64-NEXT: orpd %xmm2, %xmm4
+; SSE-64-NEXT: andpd %xmm4, %xmm0
+; SSE-64-NEXT: andnpd %xmm1, %xmm4
+; SSE-64-NEXT: orpd %xmm4, %xmm0
; SSE-64-NEXT: retq
;
; AVX-32-LABEL: test_v2f64_ueq_s:
@@ -5854,10 +5854,10 @@ define <2 x i64> @test_v2f64_une_s(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1,
; SSE-32-NEXT: movapd 8(%ebp), %xmm3
; SSE-32-NEXT: movapd %xmm2, %xmm4
; SSE-32-NEXT: cmpltpd %xmm3, %xmm4
-; SSE-32-NEXT: cmpneqpd %xmm3, %xmm2
-; SSE-32-NEXT: andpd %xmm2, %xmm0
-; SSE-32-NEXT: andnpd %xmm1, %xmm2
-; SSE-32-NEXT: orpd %xmm2, %xmm0
+; SSE-32-NEXT: cmpneqpd %xmm2, %xmm3
+; SSE-32-NEXT: andpd %xmm3, %xmm0
+; SSE-32-NEXT: andnpd %xmm1, %xmm3
+; SSE-32-NEXT: orpd %xmm3, %xmm0
; SSE-32-NEXT: movl %ebp, %esp
; SSE-32-NEXT: popl %ebp
; SSE-32-NEXT: retl
@@ -5952,10 +5952,10 @@ define <2 x i64> @test_v2f64_uno_s(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1,
; SSE-32-NEXT: movapd 8(%ebp), %xmm3
; SSE-32-NEXT: movapd %xmm2, %xmm4
; SSE-32-NEXT: cmpltpd %xmm3, %xmm4
-; SSE-32-NEXT: cmpunordpd %xmm3, %xmm2
-; SSE-32-NEXT: andpd %xmm2, %xmm0
-; SSE-32-NEXT: andnpd %xmm1, %xmm2
-; SSE-32-NEXT: orpd %xmm2, %xmm0
+; SSE-32-NEXT: cmpunordpd %xmm2, %xmm3
+; SSE-32-NEXT: andpd %xmm3, %xmm0
+; SSE-32-NEXT: andnpd %xmm1, %xmm3
+; SSE-32-NEXT: orpd %xmm3, %xmm0
; SSE-32-NEXT: movl %ebp, %esp
; SSE-32-NEXT: popl %ebp
; SSE-32-NEXT: retl
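A note on the vec-strict-cmp-128.ll hunks above: the operand swaps are confined to the symmetric predicates (cmpeq, cmpneq, cmpord, cmpunord), where comparing (a, b) and (b, a) produces the same mask; the ordered cmpltps/cmpltpd comparisons are left untouched throughout. A schematic of the swap, with register numbers as they appear in the test output:

  # before: the mask clobbers %xmm2
  cmpunordpd %xmm3, %xmm2   # %xmm2 = unord(%xmm2, %xmm3)
  orpd       %xmm4, %xmm2
  # after: the commuted form clobbers %xmm3 instead
  cmpunordpd %xmm2, %xmm3   # %xmm3 = unord(%xmm3, %xmm2), same mask
  orpd       %xmm4, %xmm3

The instruction count is unchanged in these functions; only the choice of which source register gets overwritten differs.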
diff --git a/llvm/test/CodeGen/X86/vec_ctbits.ll b/llvm/test/CodeGen/X86/vec_ctbits.ll
index e90611fbc1e7e..370f88d644b57 100644
--- a/llvm/test/CodeGen/X86/vec_ctbits.ll
+++ b/llvm/test/CodeGen/X86/vec_ctbits.ll
@@ -37,40 +37,40 @@ define <2 x i64> @foolz(<2 x i64> %a) nounwind {
; CHECK: # %bb.0:
; CHECK-NEXT: movdqa %xmm0, %xmm1
; CHECK-NEXT: psrlq $1, %xmm1
-; CHECK-NEXT: por %xmm0, %xmm1
-; CHECK-NEXT: movdqa %xmm1, %xmm0
-; CHECK-NEXT: psrlq $2, %xmm0
+; CHECK-NEXT: por %xmm1, %xmm0
+; CHECK-NEXT: movdqa %xmm0, %xmm1
+; CHECK-NEXT: psrlq $2, %xmm1
; CHECK-NEXT: por %xmm1, %xmm0
; CHECK-NEXT: movdqa %xmm0, %xmm1
; CHECK-NEXT: psrlq $4, %xmm1
-; CHECK-NEXT: por %xmm0, %xmm1
-; CHECK-NEXT: movdqa %xmm1, %xmm0
-; CHECK-NEXT: psrlq $8, %xmm0
+; CHECK-NEXT: por %xmm1, %xmm0
+; CHECK-NEXT: movdqa %xmm0, %xmm1
+; CHECK-NEXT: psrlq $8, %xmm1
; CHECK-NEXT: por %xmm1, %xmm0
; CHECK-NEXT: movdqa %xmm0, %xmm1
; CHECK-NEXT: psrlq $16, %xmm1
-; CHECK-NEXT: por %xmm0, %xmm1
-; CHECK-NEXT: movdqa %xmm1, %xmm0
-; CHECK-NEXT: psrlq $32, %xmm0
+; CHECK-NEXT: por %xmm1, %xmm0
+; CHECK-NEXT: movdqa %xmm0, %xmm1
+; CHECK-NEXT: psrlq $32, %xmm1
; CHECK-NEXT: por %xmm1, %xmm0
; CHECK-NEXT: pcmpeqd %xmm1, %xmm1
-; CHECK-NEXT: pxor %xmm0, %xmm1
-; CHECK-NEXT: movdqa %xmm1, %xmm0
-; CHECK-NEXT: psrlw $1, %xmm0
+; CHECK-NEXT: pxor %xmm1, %xmm0
+; CHECK-NEXT: movdqa %xmm0, %xmm1
+; CHECK-NEXT: psrlw $1, %xmm1
+; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-NEXT: psubb %xmm1, %xmm0
+; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; CHECK-NEXT: movdqa %xmm0, %xmm2
+; CHECK-NEXT: pand %xmm1, %xmm2
+; CHECK-NEXT: psrlw $2, %xmm0
+; CHECK-NEXT: pand %xmm1, %xmm0
+; CHECK-NEXT: paddb %xmm2, %xmm0
+; CHECK-NEXT: movdqa %xmm0, %xmm1
+; CHECK-NEXT: psrlw $4, %xmm1
+; CHECK-NEXT: paddb %xmm1, %xmm0
; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: psubb %xmm0, %xmm1
-; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; CHECK-NEXT: movdqa %xmm1, %xmm2
-; CHECK-NEXT: pand %xmm0, %xmm2
-; CHECK-NEXT: psrlw $2, %xmm1
-; CHECK-NEXT: pand %xmm0, %xmm1
-; CHECK-NEXT: paddb %xmm2, %xmm1
-; CHECK-NEXT: movdqa %xmm1, %xmm2
-; CHECK-NEXT: psrlw $4, %xmm2
-; CHECK-NEXT: paddb %xmm1, %xmm2
-; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; CHECK-NEXT: pxor %xmm0, %xmm0
-; CHECK-NEXT: psadbw %xmm2, %xmm0
+; CHECK-NEXT: pxor %xmm1, %xmm1
+; CHECK-NEXT: psadbw %xmm1, %xmm0
; CHECK-NEXT: retq
%c = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 true)
ret <2 x i64> %c
@@ -123,16 +123,15 @@ define <2 x i32> @promtz(<2 x i32> %a) nounwind {
; CHECK-NEXT: paddb %xmm2, %xmm0
; CHECK-NEXT: movdqa %xmm0, %xmm1
; CHECK-NEXT: psrlw $4, %xmm1
-; CHECK-NEXT: paddb %xmm0, %xmm1
-; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-NEXT: pxor %xmm0, %xmm0
-; CHECK-NEXT: movdqa %xmm1, %xmm2
-; CHECK-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; CHECK-NEXT: psadbw %xmm0, %xmm2
-; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; CHECK-NEXT: psadbw %xmm0, %xmm1
-; CHECK-NEXT: packuswb %xmm2, %xmm1
-; CHECK-NEXT: movdqa %xmm1, %xmm0
+; CHECK-NEXT: paddb %xmm1, %xmm0
+; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT: pxor %xmm1, %xmm1
+; CHECK-NEXT: movdqa %xmm0, %xmm2
+; CHECK-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; CHECK-NEXT: psadbw %xmm1, %xmm2
+; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-NEXT: psadbw %xmm1, %xmm0
+; CHECK-NEXT: packuswb %xmm2, %xmm0
; CHECK-NEXT: retq
%c = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %a, i1 false)
ret <2 x i32> %c
@@ -143,34 +142,34 @@ define <2 x i32> @promlz(<2 x i32> %a) nounwind {
; CHECK: # %bb.0:
; CHECK-NEXT: movdqa %xmm0, %xmm1
; CHECK-NEXT: psrld $1, %xmm1
-; CHECK-NEXT: por %xmm0, %xmm1
-; CHECK-NEXT: movdqa %xmm1, %xmm0
-; CHECK-NEXT: psrld $2, %xmm0
+; CHECK-NEXT: por %xmm1, %xmm0
+; CHECK-NEXT: movdqa %xmm0, %xmm1
+; CHECK-NEXT: psrld $2, %xmm1
; CHECK-NEXT: por %xmm1, %xmm0
; CHECK-NEXT: movdqa %xmm0, %xmm1
; CHECK-NEXT: psrld $4, %xmm1
-; CHECK-NEXT: por %xmm0, %xmm1
-; CHECK-NEXT: movdqa %xmm1, %xmm0
-; CHECK-NEXT: psrld $8, %xmm0
+; CHECK-NEXT: por %xmm1, %xmm0
+; CHECK-NEXT: movdqa %xmm0, %xmm1
+; CHECK-NEXT: psrld $8, %xmm1
; CHECK-NEXT: por %xmm1, %xmm0
; CHECK-NEXT: movdqa %xmm0, %xmm1
; CHECK-NEXT: psrld $16, %xmm1
-; CHECK-NEXT: por %xmm0, %xmm1
-; CHECK-NEXT: pcmpeqd %xmm2, %xmm2
-; CHECK-NEXT: pxor %xmm1, %xmm2
-; CHECK-NEXT: movdqa %xmm2, %xmm0
-; CHECK-NEXT: psrlw $1, %xmm0
-; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: psubb %xmm0, %xmm2
-; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; CHECK-NEXT: movdqa %xmm2, %xmm1
-; CHECK-NEXT: pand %xmm0, %xmm1
-; CHECK-NEXT: psrlw $2, %xmm2
-; CHECK-NEXT: pand %xmm0, %xmm2
-; CHECK-NEXT: paddb %xmm1, %xmm2
-; CHECK-NEXT: movdqa %xmm2, %xmm0
-; CHECK-NEXT: psrlw $4, %xmm0
+; CHECK-NEXT: por %xmm1, %xmm0
+; CHECK-NEXT: pcmpeqd %xmm1, %xmm1
+; CHECK-NEXT: pxor %xmm1, %xmm0
+; CHECK-NEXT: movdqa %xmm0, %xmm1
+; CHECK-NEXT: psrlw $1, %xmm1
+; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-NEXT: psubb %xmm1, %xmm0
+; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; CHECK-NEXT: movdqa %xmm0, %xmm2
+; CHECK-NEXT: pand %xmm1, %xmm2
+; CHECK-NEXT: psrlw $2, %xmm0
+; CHECK-NEXT: pand %xmm1, %xmm0
; CHECK-NEXT: paddb %xmm2, %xmm0
+; CHECK-NEXT: movdqa %xmm0, %xmm1
+; CHECK-NEXT: psrlw $4, %xmm1
+; CHECK-NEXT: paddb %xmm1, %xmm0
; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-NEXT: pxor %xmm1, %xmm1
; CHECK-NEXT: movdqa %xmm0, %xmm2
@@ -200,16 +199,15 @@ define <2 x i32> @prompop(<2 x i32> %a) nounwind {
; CHECK-NEXT: paddb %xmm2, %xmm0
; CHECK-NEXT: movdqa %xmm0, %xmm1
; CHECK-NEXT: psrlw $4, %xmm1
-; CHECK-NEXT: paddb %xmm0, %xmm1
-; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-NEXT: pxor %xmm0, %xmm0
-; CHECK-NEXT: movdqa %xmm1, %xmm2
-; CHECK-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; CHECK-NEXT: psadbw %xmm0, %xmm2
-; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; CHECK-NEXT: psadbw %xmm0, %xmm1
-; CHECK-NEXT: packuswb %xmm2, %xmm1
-; CHECK-NEXT: movdqa %xmm1, %xmm0
+; CHECK-NEXT: paddb %xmm1, %xmm0
+; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT: pxor %xmm1, %xmm1
+; CHECK-NEXT: movdqa %xmm0, %xmm2
+; CHECK-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; CHECK-NEXT: psadbw %xmm1, %xmm2
+; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-NEXT: psadbw %xmm1, %xmm0
+; CHECK-NEXT: packuswb %xmm2, %xmm0
; CHECK-NEXT: retq
%c = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a)
ret <2 x i32> %c
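The vec_ctbits.ll changes above illustrate the most common shape of these updates: the por/paddb chains are commuted so the running value stays in %xmm0, which is also the return register, and the copy that used to move the result back at the end disappears. A before/after sketch distilled from the promtz hunk (registers as in the test output):

  # before: the result is built in %xmm1 and copied to the return
  # register at the end
  psadbw   %xmm0, %xmm1
  packuswb %xmm2, %xmm1
  movdqa   %xmm1, %xmm0
  retq
  # after: with the operands commuted upstream, the result accumulates
  # in %xmm0 directly and the final copy is gone
  psadbw   %xmm1, %xmm0
  packuswb %xmm2, %xmm0
  retq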
diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll
index abb457cd9eaa8..51de68916596b 100644
--- a/llvm/test/CodeGen/X86/vec_umulo.ll
+++ b/llvm/test/CodeGen/X86/vec_umulo.ll
@@ -1094,9 +1094,9 @@ define <16 x i32> @umulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou
; SSE2-NEXT: psrlw $8, %xmm5
; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: packuswb %xmm5, %xmm0
-; SSE2-NEXT: pcmpeqb %xmm2, %xmm0
+; SSE2-NEXT: pcmpeqb %xmm0, %xmm2
; SSE2-NEXT: pcmpeqd %xmm3, %xmm3
-; SSE2-NEXT: pxor %xmm0, %xmm3
+; SSE2-NEXT: pxor %xmm2, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT: psrad $24, %xmm0
@@ -1135,9 +1135,9 @@ define <16 x i32> @umulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou
; SSSE3-NEXT: psrlw $8, %xmm5
; SSSE3-NEXT: psrlw $8, %xmm0
; SSSE3-NEXT: packuswb %xmm5, %xmm0
-; SSSE3-NEXT: pcmpeqb %xmm2, %xmm0
+; SSSE3-NEXT: pcmpeqb %xmm0, %xmm2
; SSSE3-NEXT: pcmpeqd %xmm3, %xmm3
-; SSSE3-NEXT: pxor %xmm0, %xmm3
+; SSSE3-NEXT: pxor %xmm2, %xmm3
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSSE3-NEXT: psrad $24, %xmm0
@@ -2385,11 +2385,11 @@ define <8 x i32> @umulo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) noun
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pmullw %xmm1, %xmm2
-; SSE2-NEXT: pmulhuw %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpeqw %xmm0, %xmm3
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE2-NEXT: pxor %xmm3, %xmm1
+; SSE2-NEXT: pmulhuw %xmm0, %xmm1
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: pcmpeqw %xmm0, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
+; SSE2-NEXT: pxor %xmm0, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT: psrad $16, %xmm0
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
@@ -2402,11 +2402,11 @@ define <8 x i32> @umulo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) noun
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pmullw %xmm1, %xmm2
-; SSSE3-NEXT: pmulhuw %xmm1, %xmm0
-; SSSE3-NEXT: pxor %xmm3, %xmm3
-; SSSE3-NEXT: pcmpeqw %xmm0, %xmm3
-; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1
-; SSSE3-NEXT: pxor %xmm3, %xmm1
+; SSSE3-NEXT: pmulhuw %xmm0, %xmm1
+; SSSE3-NEXT: pxor %xmm0, %xmm0
+; SSSE3-NEXT: pcmpeqw %xmm0, %xmm1
+; SSSE3-NEXT: pcmpeqd %xmm0, %xmm0
+; SSSE3-NEXT: pxor %xmm0, %xmm1
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSSE3-NEXT: psrad $16, %xmm0
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
@@ -2419,11 +2419,11 @@ define <8 x i32> @umulo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) noun
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: pmullw %xmm1, %xmm2
-; SSE41-NEXT: pmulhuw %xmm1, %xmm0
-; SSE41-NEXT: pxor %xmm3, %xmm3
-; SSE41-NEXT: pcmpeqw %xmm0, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE41-NEXT: pxor %xmm3, %xmm1
+; SSE41-NEXT: pmulhuw %xmm0, %xmm1
+; SSE41-NEXT: pxor %xmm0, %xmm0
+; SSE41-NEXT: pcmpeqw %xmm0, %xmm1
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm0
+; SSE41-NEXT: pxor %xmm0, %xmm1
; SSE41-NEXT: pmovsxwd %xmm1, %xmm0
; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE41-NEXT: pslld $31, %xmm1
diff --git a/llvm/test/CodeGen/X86/vector-bitreverse.ll b/llvm/test/CodeGen/X86/vector-bitreverse.ll
index a9b872c0254b5..dbf5db2aef685 100644
--- a/llvm/test/CodeGen/X86/vector-bitreverse.ll
+++ b/llvm/test/CodeGen/X86/vector-bitreverse.ll
@@ -25,13 +25,13 @@ define i8 @test_bitreverse_i8(i8 %a) nounwind {
; SSE-NEXT: shlb $2, %al
; SSE-NEXT: shrb $2, %dil
; SSE-NEXT: andb $51, %dil
-; SSE-NEXT: orb %al, %dil
-; SSE-NEXT: movl %edi, %eax
-; SSE-NEXT: andb $85, %al
-; SSE-NEXT: addb %al, %al
-; SSE-NEXT: shrb %dil
-; SSE-NEXT: andb $85, %dil
; SSE-NEXT: orb %dil, %al
+; SSE-NEXT: movl %eax, %ecx
+; SSE-NEXT: andb $85, %cl
+; SSE-NEXT: addb %cl, %cl
+; SSE-NEXT: shrb %al
+; SSE-NEXT: andb $85, %al
+; SSE-NEXT: orb %cl, %al
; SSE-NEXT: retq
;
; AVX-LABEL: test_bitreverse_i8:
@@ -42,13 +42,13 @@ define i8 @test_bitreverse_i8(i8 %a) nounwind {
; AVX-NEXT: shlb $2, %al
; AVX-NEXT: shrb $2, %dil
; AVX-NEXT: andb $51, %dil
-; AVX-NEXT: orb %al, %dil
-; AVX-NEXT: movl %edi, %eax
-; AVX-NEXT: andb $85, %al
-; AVX-NEXT: addb %al, %al
-; AVX-NEXT: shrb %dil
-; AVX-NEXT: andb $85, %dil
; AVX-NEXT: orb %dil, %al
+; AVX-NEXT: movl %eax, %ecx
+; AVX-NEXT: andb $85, %cl
+; AVX-NEXT: addb %cl, %cl
+; AVX-NEXT: shrb %al
+; AVX-NEXT: andb $85, %al
+; AVX-NEXT: orb %cl, %al
; AVX-NEXT: retq
;
; XOP-LABEL: test_bitreverse_i8:
@@ -67,13 +67,13 @@ define i8 @test_bitreverse_i8(i8 %a) nounwind {
; GFNISSE-NEXT: shlb $2, %al
; GFNISSE-NEXT: shrb $2, %dil
; GFNISSE-NEXT: andb $51, %dil
-; GFNISSE-NEXT: orb %al, %dil
-; GFNISSE-NEXT: movl %edi, %eax
-; GFNISSE-NEXT: andb $85, %al
-; GFNISSE-NEXT: addb %al, %al
-; GFNISSE-NEXT: shrb %dil
-; GFNISSE-NEXT: andb $85, %dil
; GFNISSE-NEXT: orb %dil, %al
+; GFNISSE-NEXT: movl %eax, %ecx
+; GFNISSE-NEXT: andb $85, %cl
+; GFNISSE-NEXT: addb %cl, %cl
+; GFNISSE-NEXT: shrb %al
+; GFNISSE-NEXT: andb $85, %al
+; GFNISSE-NEXT: orb %cl, %al
; GFNISSE-NEXT: retq
;
; GFNIAVX-LABEL: test_bitreverse_i8:
@@ -84,13 +84,13 @@ define i8 @test_bitreverse_i8(i8 %a) nounwind {
; GFNIAVX-NEXT: shlb $2, %al
; GFNIAVX-NEXT: shrb $2, %dil
; GFNIAVX-NEXT: andb $51, %dil
-; GFNIAVX-NEXT: orb %al, %dil
-; GFNIAVX-NEXT: movl %edi, %eax
-; GFNIAVX-NEXT: andb $85, %al
-; GFNIAVX-NEXT: addb %al, %al
-; GFNIAVX-NEXT: shrb %dil
-; GFNIAVX-NEXT: andb $85, %dil
; GFNIAVX-NEXT: orb %dil, %al
+; GFNIAVX-NEXT: movl %eax, %ecx
+; GFNIAVX-NEXT: andb $85, %cl
+; GFNIAVX-NEXT: addb %cl, %cl
+; GFNIAVX-NEXT: shrb %al
+; GFNIAVX-NEXT: andb $85, %al
+; GFNIAVX-NEXT: orb %cl, %al
; GFNIAVX-NEXT: retq
;
; GFNIAVX2-LABEL: test_bitreverse_i8:
@@ -101,13 +101,13 @@ define i8 @test_bitreverse_i8(i8 %a) nounwind {
; GFNIAVX2-NEXT: shlb $2, %al
; GFNIAVX2-NEXT: shrb $2, %dil
; GFNIAVX2-NEXT: andb $51, %dil
-; GFNIAVX2-NEXT: orb %al, %dil
-; GFNIAVX2-NEXT: movl %edi, %eax
-; GFNIAVX2-NEXT: andb $85, %al
-; GFNIAVX2-NEXT: addb %al, %al
-; GFNIAVX2-NEXT: shrb %dil
-; GFNIAVX2-NEXT: andb $85, %dil
; GFNIAVX2-NEXT: orb %dil, %al
+; GFNIAVX2-NEXT: movl %eax, %ecx
+; GFNIAVX2-NEXT: andb $85, %cl
+; GFNIAVX2-NEXT: addb %cl, %cl
+; GFNIAVX2-NEXT: shrb %al
+; GFNIAVX2-NEXT: andb $85, %al
+; GFNIAVX2-NEXT: orb %cl, %al
; GFNIAVX2-NEXT: retq
;
; GFNIAVX512F-LABEL: test_bitreverse_i8:
@@ -118,13 +118,13 @@ define i8 @test_bitreverse_i8(i8 %a) nounwind {
; GFNIAVX512F-NEXT: shlb $2, %al
; GFNIAVX512F-NEXT: shrb $2, %dil
; GFNIAVX512F-NEXT: andb $51, %dil
-; GFNIAVX512F-NEXT: orb %al, %dil
-; GFNIAVX512F-NEXT: movl %edi, %eax
-; GFNIAVX512F-NEXT: andb $85, %al
-; GFNIAVX512F-NEXT: addb %al, %al
-; GFNIAVX512F-NEXT: shrb %dil
-; GFNIAVX512F-NEXT: andb $85, %dil
; GFNIAVX512F-NEXT: orb %dil, %al
+; GFNIAVX512F-NEXT: movl %eax, %ecx
+; GFNIAVX512F-NEXT: andb $85, %cl
+; GFNIAVX512F-NEXT: addb %cl, %cl
+; GFNIAVX512F-NEXT: shrb %al
+; GFNIAVX512F-NEXT: andb $85, %al
+; GFNIAVX512F-NEXT: orb %cl, %al
; GFNIAVX512F-NEXT: retq
;
; GFNIAVX512BW-LABEL: test_bitreverse_i8:
@@ -135,13 +135,13 @@ define i8 @test_bitreverse_i8(i8 %a) nounwind {
; GFNIAVX512BW-NEXT: shlb $2, %al
; GFNIAVX512BW-NEXT: shrb $2, %dil
; GFNIAVX512BW-NEXT: andb $51, %dil
-; GFNIAVX512BW-NEXT: orb %al, %dil
-; GFNIAVX512BW-NEXT: movl %edi, %eax
-; GFNIAVX512BW-NEXT: andb $85, %al
-; GFNIAVX512BW-NEXT: addb %al, %al
-; GFNIAVX512BW-NEXT: shrb %dil
-; GFNIAVX512BW-NEXT: andb $85, %dil
; GFNIAVX512BW-NEXT: orb %dil, %al
+; GFNIAVX512BW-NEXT: movl %eax, %ecx
+; GFNIAVX512BW-NEXT: andb $85, %cl
+; GFNIAVX512BW-NEXT: addb %cl, %cl
+; GFNIAVX512BW-NEXT: shrb %al
+; GFNIAVX512BW-NEXT: andb $85, %al
+; GFNIAVX512BW-NEXT: orb %cl, %al
; GFNIAVX512BW-NEXT: retq
%b = call i8 @llvm.bitreverse.i8(i8 %a)
ret i8 %b
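Not every update above is a net win: in the test_bitreverse_i8 variants the sequence length is unchanged, and the only effect of the commuted orb is that the combined value lands in %al (the return register) immediately, with %cl taking over as the scratch register in place of %dil. Schematically:

  # before: combine into %dil, then move into the return register
  orb  %al, %dil
  movl %edi, %eax
  # after: combine directly into %al, scratch work moves to %cl
  orb  %dil, %al
  movl %eax, %ecx

Both tails end in the same final orb into %al, so the diff here is a register reshuffle rather than an instruction saving.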
diff --git a/llvm/test/CodeGen/X86/vector-ext-logic.ll b/llvm/test/CodeGen/X86/vector-ext-logic.ll
index 611c0d4019a70..a26d7e9486f1c 100644
--- a/llvm/test/CodeGen/X86/vector-ext-logic.ll
+++ b/llvm/test/CodeGen/X86/vector-ext-logic.ll
@@ -5,13 +5,11 @@
define <8 x i32> @zext_and_v8i32(<8 x i16> %x, <8 x i16> %y) {
; SSE2-LABEL: zext_and_v8i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE2-NEXT: retq
;
; AVX2-LABEL: zext_and_v8i32:
@@ -28,13 +26,11 @@ define <8 x i32> @zext_and_v8i32(<8 x i16> %x, <8 x i16> %y) {
define <8 x i32> @zext_or_v8i32(<8 x i16> %x, <8 x i16> %y) {
; SSE2-LABEL: zext_or_v8i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: por %xmm0, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE2-NEXT: retq
;
; AVX2-LABEL: zext_or_v8i32:
@@ -51,13 +47,11 @@ define <8 x i32> @zext_or_v8i32(<8 x i16> %x, <8 x i16> %y) {
define <8 x i32> @zext_xor_v8i32(<8 x i16> %x, <8 x i16> %y) {
; SSE2-LABEL: zext_xor_v8i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pxor %xmm1, %xmm2
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: pxor %xmm0, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE2-NEXT: retq
;
; AVX2-LABEL: zext_xor_v8i32:
@@ -298,14 +292,12 @@ define <8 x i32> @bool_zext_and(<8 x i1> %x, <8 x i1> %y) {
define <8 x i32> @bool_zext_or(<8 x i1> %x, <8 x i1> %y) {
; SSE2-LABEL: bool_zext_or:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: por %xmm0, %xmm1
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE2-NEXT: retq
;
; AVX2-LABEL: bool_zext_or:
@@ -323,14 +315,12 @@ define <8 x i32> @bool_zext_or(<8 x i1> %x, <8 x i1> %y) {
define <8 x i32> @bool_zext_xor(<8 x i1> %x, <8 x i1> %y) {
; SSE2-LABEL: bool_zext_xor:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pxor %xmm1, %xmm2
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: pxor %xmm0, %xmm1
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE2-NEXT: retq
;
; AVX2-LABEL: bool_zext_xor:
diff --git a/llvm/test/CodeGen/X86/vector-fshl-128.ll b/llvm/test/CodeGen/X86/vector-fshl-128.ll
index 0b4edf10cd9ed..97a485b04b41a 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-128.ll
@@ -727,75 +727,74 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt)
; SSE2-LABEL: var_funnnel_v16i8:
; SSE2: # %bb.0:
; SSE2-NEXT: psrlw $1, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; SSE2-NEXT: pand %xmm6, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; SSE2-NEXT: pand %xmm5, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; SSE2-NEXT: movdqa %xmm2, %xmm7
; SSE2-NEXT: pandn %xmm8, %xmm7
; SSE2-NEXT: psllw $5, %xmm7
; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pcmpgtb %xmm7, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: pandn %xmm1, %xmm5
+; SSE2-NEXT: pxor %xmm6, %xmm6
+; SSE2-NEXT: pcmpgtb %xmm7, %xmm6
+; SSE2-NEXT: movdqa %xmm6, %xmm4
+; SSE2-NEXT: pandn %xmm1, %xmm4
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: por %xmm5, %xmm1
+; SSE2-NEXT: pand %xmm1, %xmm6
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
+; SSE2-NEXT: por %xmm4, %xmm6
; SSE2-NEXT: paddb %xmm7, %xmm7
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pcmpgtb %xmm7, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: pandn %xmm1, %xmm5
-; SSE2-NEXT: psrlw $2, %xmm1
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: por %xmm5, %xmm1
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pcmpgtb %xmm7, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: pandn %xmm6, %xmm4
+; SSE2-NEXT: psrlw $2, %xmm6
+; SSE2-NEXT: pand %xmm1, %xmm6
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
+; SSE2-NEXT: por %xmm4, %xmm6
; SSE2-NEXT: paddb %xmm7, %xmm7
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pcmpgtb %xmm7, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: pandn %xmm1, %xmm5
-; SSE2-NEXT: psrlw $1, %xmm1
-; SSE2-NEXT: pand %xmm6, %xmm4
-; SSE2-NEXT: pand %xmm1, %xmm4
-; SSE2-NEXT: por %xmm5, %xmm4
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pcmpgtb %xmm7, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: pandn %xmm6, %xmm4
+; SSE2-NEXT: psrlw $1, %xmm6
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: pand %xmm6, %xmm1
+; SSE2-NEXT: por %xmm4, %xmm1
; SSE2-NEXT: pand %xmm8, %xmm2
; SSE2-NEXT: psllw $5, %xmm2
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: pcmpgtb %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: pcmpgtb %xmm2, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm5
; SSE2-NEXT: pandn %xmm0, %xmm5
; SSE2-NEXT: psllw $4, %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: por %xmm5, %xmm0
; SSE2-NEXT: paddb %xmm2, %xmm2
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: pcmpgtb %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: pcmpgtb %xmm2, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm5
; SSE2-NEXT: pandn %xmm0, %xmm5
; SSE2-NEXT: psllw $2, %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: por %xmm5, %xmm0
; SSE2-NEXT: paddb %xmm2, %xmm2
; SSE2-NEXT: pcmpgtb %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: por %xmm1, %xmm2
; SSE2-NEXT: paddb %xmm0, %xmm0
; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_funnnel_v16i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pandn %xmm4, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: pandn %xmm3, %xmm0
; SSE41-NEXT: psllw $5, %xmm0
; SSE41-NEXT: psrlw $1, %xmm1
; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
@@ -814,27 +813,27 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt)
; SSE41-NEXT: pand %xmm5, %xmm6
; SSE41-NEXT: paddb %xmm0, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm1
-; SSE41-NEXT: pand %xmm4, %xmm3
+; SSE41-NEXT: pand %xmm2, %xmm3
; SSE41-NEXT: psllw $5, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: paddb %xmm3, %xmm4
-; SSE41-NEXT: movdqa %xmm2, %xmm5
+; SSE41-NEXT: movdqa %xmm3, %xmm2
+; SSE41-NEXT: paddb %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm4, %xmm5
; SSE41-NEXT: psllw $4, %xmm5
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5
; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm5, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: pblendvb %xmm0, %xmm5, %xmm4
+; SSE41-NEXT: movdqa %xmm4, %xmm3
; SSE41-NEXT: psllw $2, %xmm3
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE41-NEXT: movdqa %xmm4, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: paddb %xmm2, %xmm3
-; SSE41-NEXT: paddb %xmm4, %xmm4
-; SSE41-NEXT: movdqa %xmm4, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
-; SSE41-NEXT: por %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm4
+; SSE41-NEXT: movdqa %xmm4, %xmm3
+; SSE41-NEXT: paddb %xmm4, %xmm3
+; SSE41-NEXT: paddb %xmm2, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm4
+; SSE41-NEXT: por %xmm1, %xmm4
+; SSE41-NEXT: movdqa %xmm4, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: var_funnnel_v16i8:
@@ -998,65 +997,65 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt)
; X86-SSE2-LABEL: var_funnnel_v16i8:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: psrlw $1, %xmm1
-; X86-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; X86-SSE2-NEXT: pand %xmm5, %xmm1
-; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X86-SSE2-NEXT: pand %xmm4, %xmm1
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; X86-SSE2-NEXT: movdqa %xmm2, %xmm6
-; X86-SSE2-NEXT: pandn %xmm4, %xmm6
+; X86-SSE2-NEXT: pandn %xmm3, %xmm6
; X86-SSE2-NEXT: psllw $5, %xmm6
-; X86-SSE2-NEXT: pxor %xmm3, %xmm3
-; X86-SSE2-NEXT: pcmpgtb %xmm6, %xmm3
-; X86-SSE2-NEXT: movdqa %xmm3, %xmm7
+; X86-SSE2-NEXT: pxor %xmm5, %xmm5
+; X86-SSE2-NEXT: pcmpgtb %xmm6, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm5, %xmm7
; X86-SSE2-NEXT: pandn %xmm1, %xmm7
; X86-SSE2-NEXT: psrlw $4, %xmm1
-; X86-SSE2-NEXT: pand %xmm3, %xmm1
-; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE2-NEXT: por %xmm7, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm5
+; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm5
+; X86-SSE2-NEXT: por %xmm7, %xmm5
; X86-SSE2-NEXT: paddb %xmm6, %xmm6
-; X86-SSE2-NEXT: pxor %xmm3, %xmm3
-; X86-SSE2-NEXT: pcmpgtb %xmm6, %xmm3
-; X86-SSE2-NEXT: movdqa %xmm3, %xmm7
-; X86-SSE2-NEXT: pandn %xmm1, %xmm7
-; X86-SSE2-NEXT: psrlw $2, %xmm1
-; X86-SSE2-NEXT: pand %xmm3, %xmm1
-; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE2-NEXT: por %xmm7, %xmm1
+; X86-SSE2-NEXT: pxor %xmm1, %xmm1
+; X86-SSE2-NEXT: pcmpgtb %xmm6, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm7
+; X86-SSE2-NEXT: pandn %xmm5, %xmm7
+; X86-SSE2-NEXT: psrlw $2, %xmm5
+; X86-SSE2-NEXT: pand %xmm1, %xmm5
+; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm5
+; X86-SSE2-NEXT: por %xmm7, %xmm5
; X86-SSE2-NEXT: paddb %xmm6, %xmm6
-; X86-SSE2-NEXT: pxor %xmm3, %xmm3
-; X86-SSE2-NEXT: pcmpgtb %xmm6, %xmm3
-; X86-SSE2-NEXT: movdqa %xmm3, %xmm6
-; X86-SSE2-NEXT: pand %xmm5, %xmm3
-; X86-SSE2-NEXT: pandn %xmm1, %xmm6
-; X86-SSE2-NEXT: psrlw $1, %xmm1
-; X86-SSE2-NEXT: pand %xmm1, %xmm3
-; X86-SSE2-NEXT: por %xmm6, %xmm3
-; X86-SSE2-NEXT: pand %xmm4, %xmm2
-; X86-SSE2-NEXT: psllw $5, %xmm2
; X86-SSE2-NEXT: pxor %xmm1, %xmm1
-; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
-; X86-SSE2-NEXT: movdqa %xmm1, %xmm4
+; X86-SSE2-NEXT: pcmpgtb %xmm6, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X86-SSE2-NEXT: pand %xmm4, %xmm1
+; X86-SSE2-NEXT: pandn %xmm5, %xmm6
+; X86-SSE2-NEXT: psrlw $1, %xmm5
+; X86-SSE2-NEXT: pand %xmm5, %xmm1
+; X86-SSE2-NEXT: por %xmm6, %xmm1
+; X86-SSE2-NEXT: pand %xmm3, %xmm2
+; X86-SSE2-NEXT: psllw $5, %xmm2
+; X86-SSE2-NEXT: pxor %xmm3, %xmm3
+; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm4
; X86-SSE2-NEXT: pandn %xmm0, %xmm4
; X86-SSE2-NEXT: psllw $4, %xmm0
-; X86-SSE2-NEXT: pand %xmm1, %xmm0
+; X86-SSE2-NEXT: pand %xmm3, %xmm0
; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT: por %xmm4, %xmm0
; X86-SSE2-NEXT: paddb %xmm2, %xmm2
-; X86-SSE2-NEXT: pxor %xmm1, %xmm1
-; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
-; X86-SSE2-NEXT: movdqa %xmm1, %xmm4
+; X86-SSE2-NEXT: pxor %xmm3, %xmm3
+; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm4
; X86-SSE2-NEXT: pandn %xmm0, %xmm4
; X86-SSE2-NEXT: psllw $2, %xmm0
-; X86-SSE2-NEXT: pand %xmm1, %xmm0
+; X86-SSE2-NEXT: pand %xmm3, %xmm0
; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT: por %xmm4, %xmm0
-; X86-SSE2-NEXT: pxor %xmm1, %xmm1
+; X86-SSE2-NEXT: pxor %xmm3, %xmm3
; X86-SSE2-NEXT: paddb %xmm2, %xmm2
-; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
-; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm2
; X86-SSE2-NEXT: pandn %xmm0, %xmm2
-; X86-SSE2-NEXT: por %xmm3, %xmm2
+; X86-SSE2-NEXT: por %xmm1, %xmm2
; X86-SSE2-NEXT: paddb %xmm0, %xmm0
-; X86-SSE2-NEXT: pand %xmm1, %xmm0
+; X86-SSE2-NEXT: pand %xmm3, %xmm0
; X86-SSE2-NEXT: por %xmm2, %xmm0
; X86-SSE2-NEXT: retl
%res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt)
diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll
index d7408e91f6f6c..baab6cb2424ad 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll
@@ -426,56 +426,56 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt)
define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) nounwind {
; SSE2-LABEL: var_funnnel_v8i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: psllw $12, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: psraw $15, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: psllw $12, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm3
+; SSE2-NEXT: psraw $15, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm5
; SSE2-NEXT: pandn %xmm1, %xmm5
; SSE2-NEXT: psrlw $8, %xmm1
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: por %xmm5, %xmm1
-; SSE2-NEXT: paddw %xmm3, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: psraw $15, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: pandn %xmm1, %xmm5
-; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: por %xmm5, %xmm1
-; SSE2-NEXT: paddw %xmm3, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: psraw $15, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: pandn %xmm1, %xmm5
-; SSE2-NEXT: psrlw $2, %xmm1
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: por %xmm5, %xmm1
-; SSE2-NEXT: paddw %xmm3, %xmm3
-; SSE2-NEXT: psraw $15, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm1, %xmm4
-; SSE2-NEXT: psrlw $1, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm3
+; SSE2-NEXT: por %xmm5, %xmm3
+; SSE2-NEXT: paddw %xmm4, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm1
+; SSE2-NEXT: psraw $15, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: pandn %xmm3, %xmm5
+; SSE2-NEXT: psrlw $4, %xmm3
+; SSE2-NEXT: pand %xmm1, %xmm3
+; SSE2-NEXT: por %xmm5, %xmm3
+; SSE2-NEXT: paddw %xmm4, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm1
+; SSE2-NEXT: psraw $15, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: pandn %xmm3, %xmm5
+; SSE2-NEXT: psrlw $2, %xmm3
+; SSE2-NEXT: pand %xmm1, %xmm3
+; SSE2-NEXT: por %xmm5, %xmm3
+; SSE2-NEXT: paddw %xmm4, %xmm4
+; SSE2-NEXT: psraw $15, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm1
+; SSE2-NEXT: pandn %xmm3, %xmm1
+; SSE2-NEXT: psrlw $1, %xmm3
+; SSE2-NEXT: pand %xmm4, %xmm3
; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pslld $23, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pslld $23, %xmm4
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
-; SSE2-NEXT: paddd %xmm5, %xmm1
-; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
-; SSE2-NEXT: pslld $16, %xmm1
-; SSE2-NEXT: psrad $16, %xmm1
+; SSE2-NEXT: paddd %xmm5, %xmm4
+; SSE2-NEXT: cvttps2dq %xmm4, %xmm4
+; SSE2-NEXT: pslld $16, %xmm4
+; SSE2-NEXT: psrad $16, %xmm4
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; SSE2-NEXT: pslld $23, %xmm2
; SSE2-NEXT: paddd %xmm5, %xmm2
; SSE2-NEXT: cvttps2dq %xmm2, %xmm2
; SSE2-NEXT: pslld $16, %xmm2
; SSE2-NEXT: psrad $16, %xmm2
-; SSE2-NEXT: packssdw %xmm1, %xmm2
+; SSE2-NEXT: packssdw %xmm4, %xmm2
; SSE2-NEXT: psllw $1, %xmm0
; SSE2-NEXT: pmullw %xmm2, %xmm0
-; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: retq
;
@@ -669,56 +669,56 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
;
; X86-SSE2-LABEL: var_funnnel_v8i16:
; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: movdqa %xmm2, %xmm3
-; X86-SSE2-NEXT: psllw $12, %xmm3
-; X86-SSE2-NEXT: movdqa %xmm3, %xmm4
-; X86-SSE2-NEXT: psraw $15, %xmm4
-; X86-SSE2-NEXT: movdqa %xmm4, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm4
+; X86-SSE2-NEXT: psllw $12, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm3
+; X86-SSE2-NEXT: psraw $15, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm5
; X86-SSE2-NEXT: pandn %xmm1, %xmm5
; X86-SSE2-NEXT: psrlw $8, %xmm1
-; X86-SSE2-NEXT: pand %xmm4, %xmm1
-; X86-SSE2-NEXT: por %xmm5, %xmm1
-; X86-SSE2-NEXT: paddw %xmm3, %xmm3
-; X86-SSE2-NEXT: movdqa %xmm3, %xmm4
-; X86-SSE2-NEXT: psraw $15, %xmm4
-; X86-SSE2-NEXT: movdqa %xmm4, %xmm5
-; X86-SSE2-NEXT: pandn %xmm1, %xmm5
-; X86-SSE2-NEXT: psrlw $4, %xmm1
-; X86-SSE2-NEXT: pand %xmm4, %xmm1
-; X86-SSE2-NEXT: por %xmm5, %xmm1
-; X86-SSE2-NEXT: paddw %xmm3, %xmm3
-; X86-SSE2-NEXT: movdqa %xmm3, %xmm4
-; X86-SSE2-NEXT: psraw $15, %xmm4
-; X86-SSE2-NEXT: movdqa %xmm4, %xmm5
-; X86-SSE2-NEXT: pandn %xmm1, %xmm5
-; X86-SSE2-NEXT: psrlw $2, %xmm1
-; X86-SSE2-NEXT: pand %xmm4, %xmm1
-; X86-SSE2-NEXT: por %xmm5, %xmm1
-; X86-SSE2-NEXT: paddw %xmm3, %xmm3
-; X86-SSE2-NEXT: psraw $15, %xmm3
-; X86-SSE2-NEXT: movdqa %xmm3, %xmm4
-; X86-SSE2-NEXT: pandn %xmm1, %xmm4
-; X86-SSE2-NEXT: psrlw $1, %xmm1
; X86-SSE2-NEXT: pand %xmm1, %xmm3
+; X86-SSE2-NEXT: por %xmm5, %xmm3
+; X86-SSE2-NEXT: paddw %xmm4, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X86-SSE2-NEXT: psraw $15, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm5
+; X86-SSE2-NEXT: pandn %xmm3, %xmm5
+; X86-SSE2-NEXT: psrlw $4, %xmm3
+; X86-SSE2-NEXT: pand %xmm1, %xmm3
+; X86-SSE2-NEXT: por %xmm5, %xmm3
+; X86-SSE2-NEXT: paddw %xmm4, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X86-SSE2-NEXT: psraw $15, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm5
+; X86-SSE2-NEXT: pandn %xmm3, %xmm5
+; X86-SSE2-NEXT: psrlw $2, %xmm3
+; X86-SSE2-NEXT: pand %xmm1, %xmm3
+; X86-SSE2-NEXT: por %xmm5, %xmm3
+; X86-SSE2-NEXT: paddw %xmm4, %xmm4
+; X86-SSE2-NEXT: psraw $15, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X86-SSE2-NEXT: pandn %xmm3, %xmm1
+; X86-SSE2-NEXT: psrlw $1, %xmm3
+; X86-SSE2-NEXT: pand %xmm4, %xmm3
; X86-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
-; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
-; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
-; X86-SSE2-NEXT: pslld $23, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm4
+; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7]
+; X86-SSE2-NEXT: pslld $23, %xmm4
; X86-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
-; X86-SSE2-NEXT: paddd %xmm5, %xmm1
-; X86-SSE2-NEXT: cvttps2dq %xmm1, %xmm1
-; X86-SSE2-NEXT: pslld $16, %xmm1
-; X86-SSE2-NEXT: psrad $16, %xmm1
+; X86-SSE2-NEXT: paddd %xmm5, %xmm4
+; X86-SSE2-NEXT: cvttps2dq %xmm4, %xmm4
+; X86-SSE2-NEXT: pslld $16, %xmm4
+; X86-SSE2-NEXT: psrad $16, %xmm4
; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; X86-SSE2-NEXT: pslld $23, %xmm2
; X86-SSE2-NEXT: paddd %xmm5, %xmm2
; X86-SSE2-NEXT: cvttps2dq %xmm2, %xmm2
; X86-SSE2-NEXT: pslld $16, %xmm2
; X86-SSE2-NEXT: psrad $16, %xmm2
-; X86-SSE2-NEXT: packssdw %xmm1, %xmm2
+; X86-SSE2-NEXT: packssdw %xmm4, %xmm2
; X86-SSE2-NEXT: psllw $1, %xmm0
; X86-SSE2-NEXT: pmullw %xmm2, %xmm0
-; X86-SSE2-NEXT: por %xmm4, %xmm0
+; X86-SSE2-NEXT: por %xmm1, %xmm0
; X86-SSE2-NEXT: por %xmm3, %xmm0
; X86-SSE2-NEXT: retl
%res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
@@ -738,24 +738,24 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt)
; SSE2-NEXT: movdqa %xmm4, %xmm7
; SSE2-NEXT: pandn %xmm1, %xmm7
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: por %xmm7, %xmm1
+; SSE2-NEXT: pand %xmm1, %xmm4
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
+; SSE2-NEXT: por %xmm7, %xmm4
; SSE2-NEXT: paddb %xmm6, %xmm6
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pcmpgtb %xmm6, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm7
-; SSE2-NEXT: pandn %xmm1, %xmm7
-; SSE2-NEXT: psrlw $2, %xmm1
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: por %xmm7, %xmm1
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pcmpgtb %xmm6, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm7
+; SSE2-NEXT: pandn %xmm4, %xmm7
+; SSE2-NEXT: psrlw $2, %xmm4
+; SSE2-NEXT: pand %xmm1, %xmm4
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
+; SSE2-NEXT: por %xmm7, %xmm4
; SSE2-NEXT: paddb %xmm6, %xmm6
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pcmpgtb %xmm6, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm6
-; SSE2-NEXT: pandn %xmm1, %xmm6
-; SSE2-NEXT: psrlw $1, %xmm1
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pcmpgtb %xmm6, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm6
+; SSE2-NEXT: pandn %xmm4, %xmm6
+; SSE2-NEXT: psrlw $1, %xmm4
; SSE2-NEXT: pand %xmm1, %xmm4
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
; SSE2-NEXT: por %xmm6, %xmm4
@@ -999,24 +999,24 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt)
; X86-SSE2-NEXT: movdqa %xmm4, %xmm7
; X86-SSE2-NEXT: pandn %xmm1, %xmm7
; X86-SSE2-NEXT: psrlw $4, %xmm1
-; X86-SSE2-NEXT: pand %xmm4, %xmm1
-; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE2-NEXT: por %xmm7, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm4
+; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm4
+; X86-SSE2-NEXT: por %xmm7, %xmm4
; X86-SSE2-NEXT: paddb %xmm6, %xmm6
-; X86-SSE2-NEXT: pxor %xmm4, %xmm4
-; X86-SSE2-NEXT: pcmpgtb %xmm6, %xmm4
-; X86-SSE2-NEXT: movdqa %xmm4, %xmm7
-; X86-SSE2-NEXT: pandn %xmm1, %xmm7
-; X86-SSE2-NEXT: psrlw $2, %xmm1
-; X86-SSE2-NEXT: pand %xmm4, %xmm1
-; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE2-NEXT: por %xmm7, %xmm1
+; X86-SSE2-NEXT: pxor %xmm1, %xmm1
+; X86-SSE2-NEXT: pcmpgtb %xmm6, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm7
+; X86-SSE2-NEXT: pandn %xmm4, %xmm7
+; X86-SSE2-NEXT: psrlw $2, %xmm4
+; X86-SSE2-NEXT: pand %xmm1, %xmm4
+; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm4
+; X86-SSE2-NEXT: por %xmm7, %xmm4
; X86-SSE2-NEXT: paddb %xmm6, %xmm6
-; X86-SSE2-NEXT: pxor %xmm4, %xmm4
-; X86-SSE2-NEXT: pcmpgtb %xmm6, %xmm4
-; X86-SSE2-NEXT: movdqa %xmm4, %xmm6
-; X86-SSE2-NEXT: pandn %xmm1, %xmm6
-; X86-SSE2-NEXT: psrlw $1, %xmm1
+; X86-SSE2-NEXT: pxor %xmm1, %xmm1
+; X86-SSE2-NEXT: pcmpgtb %xmm6, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X86-SSE2-NEXT: pandn %xmm4, %xmm6
+; X86-SSE2-NEXT: psrlw $1, %xmm4
; X86-SSE2-NEXT: pand %xmm1, %xmm4
; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm4
; X86-SSE2-NEXT: por %xmm6, %xmm4
diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll
index 9841a42163897..53e450856632d 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll
@@ -78,41 +78,39 @@ define <2 x i64> @test_div7_2i64(<2 x i64> %a) nounwind {
define <4 x i32> @test_div7_4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: test_div7_4i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pmuludq %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pmuludq %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm2, %xmm3
+; SSE2-NEXT: pmuludq %xmm1, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE2-NEXT: pand %xmm2, %xmm3
+; SSE2-NEXT: pand %xmm1, %xmm3
; SSE2-NEXT: paddd %xmm0, %xmm3
-; SSE2-NEXT: psubd %xmm3, %xmm1
-; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrld $31, %xmm0
-; SSE2-NEXT: psrad $2, %xmm1
-; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: psubd %xmm3, %xmm2
+; SSE2-NEXT: paddd %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrld $31, %xmm1
+; SSE2-NEXT: psrad $2, %xmm0
+; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_div7_4i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
-; SSE41-NEXT: pmuldq %xmm1, %xmm2
-; SSE41-NEXT: pmuldq %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; SSE41-NEXT: paddd %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrld $31, %xmm0
-; SSE41-NEXT: psrad $2, %xmm1
-; SSE41-NEXT: paddd %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
+; SSE41-NEXT: pmuldq %xmm2, %xmm1
+; SSE41-NEXT: pmuldq %xmm0, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
+; SSE41-NEXT: paddd %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psrld $31, %xmm1
+; SSE41-NEXT: psrad $2, %xmm0
+; SSE41-NEXT: paddd %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_div7_4i32:
@@ -170,17 +168,16 @@ define <8 x i16> @test_div7_8i16(<8 x i16> %a) nounwind {
define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
; SSE-LABEL: test_div7_16i8:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: pxor %xmm0, %xmm0
+; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: pxor %xmm2, %xmm2
-; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; SSE-NEXT: movdqa {{.*#+}} xmm3 = [37632,37632,37632,37632,37632,37632,37632,37632]
; SSE-NEXT: pmulhw %xmm3, %xmm2
; SSE-NEXT: psrlw $8, %xmm2
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE-NEXT: pmulhw %xmm3, %xmm0
-; SSE-NEXT: psrlw $8, %xmm0
-; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE-NEXT: pmulhw %xmm3, %xmm1
+; SSE-NEXT: psrlw $8, %xmm1
+; SSE-NEXT: packuswb %xmm2, %xmm1
; SSE-NEXT: paddb %xmm1, %xmm0
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrlw $2, %xmm1
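In test_div7_16i8 just above, the saving comes at the front of the sequence instead: the old code copied the input aside so %xmm0 could be zeroed, while the new code zeroes a fresh scratch register and unpacks against the input in place. A sketch, registers as in the test output:

  # before: copy the input so %xmm0 can be reused as the zero vector
  movdqa %xmm0, %xmm1
  pxor   %xmm0, %xmm0
  # after: zero a scratch register and leave the input where it is
  pxor   %xmm1, %xmm1

This removes a movdqa outright; test_div7_4i32 above gets the same benefit at the tail, where the commuted paddd lets the quotient accumulate in %xmm0 and the final copy before the return disappears.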
diff --git a/llvm/test/CodeGen/X86/vector-lzcnt-128.ll b/llvm/test/CodeGen/X86/vector-lzcnt-128.ll
index 7ae52d9ddcd64..0cb2b019123b0 100644
--- a/llvm/test/CodeGen/X86/vector-lzcnt-128.ll
+++ b/llvm/test/CodeGen/X86/vector-lzcnt-128.ll
@@ -18,80 +18,80 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlq $1, %xmm1
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrlq $2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrlq $2, %xmm1
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlq $4, %xmm1
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrlq $8, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrlq $8, %xmm1
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlq $16, %xmm1
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrlq $32, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrlq $32, %xmm1
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrlw $1, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrlw $1, %xmm1
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: psubb %xmm1, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pand %xmm1, %xmm2
+; SSE2-NEXT: psrlw $2, %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: paddb %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrlw $4, %xmm1
+; SSE2-NEXT: paddb %xmm1, %xmm0
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT: psubb %xmm0, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: psrlw $2, %xmm1
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: paddb %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psrlw $4, %xmm2
-; SSE2-NEXT: paddb %xmm1, %xmm2
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: psadbw %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: psadbw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: testv2i64:
; SSE3: # %bb.0:
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlq $1, %xmm1
-; SSE3-NEXT: por %xmm0, %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
-; SSE3-NEXT: psrlq $2, %xmm0
+; SSE3-NEXT: por %xmm1, %xmm0
+; SSE3-NEXT: movdqa %xmm0, %xmm1
+; SSE3-NEXT: psrlq $2, %xmm1
; SSE3-NEXT: por %xmm1, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlq $4, %xmm1
-; SSE3-NEXT: por %xmm0, %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
-; SSE3-NEXT: psrlq $8, %xmm0
+; SSE3-NEXT: por %xmm1, %xmm0
+; SSE3-NEXT: movdqa %xmm0, %xmm1
+; SSE3-NEXT: psrlq $8, %xmm1
; SSE3-NEXT: por %xmm1, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlq $16, %xmm1
-; SSE3-NEXT: por %xmm0, %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
-; SSE3-NEXT: psrlq $32, %xmm0
+; SSE3-NEXT: por %xmm1, %xmm0
+; SSE3-NEXT: movdqa %xmm0, %xmm1
+; SSE3-NEXT: psrlq $32, %xmm1
; SSE3-NEXT: por %xmm1, %xmm0
; SSE3-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE3-NEXT: pxor %xmm0, %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
-; SSE3-NEXT: psrlw $1, %xmm0
+; SSE3-NEXT: pxor %xmm1, %xmm0
+; SSE3-NEXT: movdqa %xmm0, %xmm1
+; SSE3-NEXT: psrlw $1, %xmm1
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE3-NEXT: psubb %xmm1, %xmm0
+; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE3-NEXT: movdqa %xmm0, %xmm2
+; SSE3-NEXT: pand %xmm1, %xmm2
+; SSE3-NEXT: psrlw $2, %xmm0
+; SSE3-NEXT: pand %xmm1, %xmm0
+; SSE3-NEXT: paddb %xmm2, %xmm0
+; SSE3-NEXT: movdqa %xmm0, %xmm1
+; SSE3-NEXT: psrlw $4, %xmm1
+; SSE3-NEXT: paddb %xmm1, %xmm0
; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE3-NEXT: psubb %xmm0, %xmm1
-; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; SSE3-NEXT: movdqa %xmm1, %xmm2
-; SSE3-NEXT: pand %xmm0, %xmm2
-; SSE3-NEXT: psrlw $2, %xmm1
-; SSE3-NEXT: pand %xmm0, %xmm1
-; SSE3-NEXT: paddb %xmm2, %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm2
-; SSE3-NEXT: psrlw $4, %xmm2
-; SSE3-NEXT: paddb %xmm1, %xmm2
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE3-NEXT: pxor %xmm0, %xmm0
-; SSE3-NEXT: psadbw %xmm2, %xmm0
+; SSE3-NEXT: pxor %xmm1, %xmm1
+; SSE3-NEXT: psadbw %xmm1, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: testv2i64:
@@ -268,80 +268,80 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlq $1, %xmm1
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrlq $2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrlq $2, %xmm1
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlq $4, %xmm1
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrlq $8, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrlq $8, %xmm1
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlq $16, %xmm1
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrlq $32, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrlq $32, %xmm1
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrlw $1, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrlw $1, %xmm1
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: psubb %xmm1, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pand %xmm1, %xmm2
+; SSE2-NEXT: psrlw $2, %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: paddb %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrlw $4, %xmm1
+; SSE2-NEXT: paddb %xmm1, %xmm0
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT: psubb %xmm0, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: psrlw $2, %xmm1
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: paddb %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psrlw $4, %xmm2
-; SSE2-NEXT: paddb %xmm1, %xmm2
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: psadbw %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: psadbw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: testv2i64u:
; SSE3: # %bb.0:
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlq $1, %xmm1
-; SSE3-NEXT: por %xmm0, %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
-; SSE3-NEXT: psrlq $2, %xmm0
+; SSE3-NEXT: por %xmm1, %xmm0
+; SSE3-NEXT: movdqa %xmm0, %xmm1
+; SSE3-NEXT: psrlq $2, %xmm1
; SSE3-NEXT: por %xmm1, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlq $4, %xmm1
-; SSE3-NEXT: por %xmm0, %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
-; SSE3-NEXT: psrlq $8, %xmm0
+; SSE3-NEXT: por %xmm1, %xmm0
+; SSE3-NEXT: movdqa %xmm0, %xmm1
+; SSE3-NEXT: psrlq $8, %xmm1
; SSE3-NEXT: por %xmm1, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlq $16, %xmm1
-; SSE3-NEXT: por %xmm0, %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
-; SSE3-NEXT: psrlq $32, %xmm0
+; SSE3-NEXT: por %xmm1, %xmm0
+; SSE3-NEXT: movdqa %xmm0, %xmm1
+; SSE3-NEXT: psrlq $32, %xmm1
; SSE3-NEXT: por %xmm1, %xmm0
; SSE3-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE3-NEXT: pxor %xmm0, %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
-; SSE3-NEXT: psrlw $1, %xmm0
+; SSE3-NEXT: pxor %xmm1, %xmm0
+; SSE3-NEXT: movdqa %xmm0, %xmm1
+; SSE3-NEXT: psrlw $1, %xmm1
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE3-NEXT: psubb %xmm1, %xmm0
+; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE3-NEXT: movdqa %xmm0, %xmm2
+; SSE3-NEXT: pand %xmm1, %xmm2
+; SSE3-NEXT: psrlw $2, %xmm0
+; SSE3-NEXT: pand %xmm1, %xmm0
+; SSE3-NEXT: paddb %xmm2, %xmm0
+; SSE3-NEXT: movdqa %xmm0, %xmm1
+; SSE3-NEXT: psrlw $4, %xmm1
+; SSE3-NEXT: paddb %xmm1, %xmm0
; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE3-NEXT: psubb %xmm0, %xmm1
-; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; SSE3-NEXT: movdqa %xmm1, %xmm2
-; SSE3-NEXT: pand %xmm0, %xmm2
-; SSE3-NEXT: psrlw $2, %xmm1
-; SSE3-NEXT: pand %xmm0, %xmm1
-; SSE3-NEXT: paddb %xmm2, %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm2
-; SSE3-NEXT: psrlw $4, %xmm2
-; SSE3-NEXT: paddb %xmm1, %xmm2
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE3-NEXT: pxor %xmm0, %xmm0
-; SSE3-NEXT: psadbw %xmm2, %xmm0
+; SSE3-NEXT: pxor %xmm1, %xmm1
+; SSE3-NEXT: psadbw %xmm1, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: testv2i64u:
@@ -518,34 +518,34 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $1, %xmm1
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrld $2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrld $2, %xmm1
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $4, %xmm1
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrld $8, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrld $8, %xmm1
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
-; SSE2-NEXT: pxor %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: psrlw $1, %xmm0
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT: psubb %xmm0, %xmm2
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: psrlw $2, %xmm2
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: paddb %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: psrlw $4, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrlw $1, %xmm1
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: psubb %xmm1, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pand %xmm1, %xmm2
+; SSE2-NEXT: psrlw $2, %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: paddb %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrlw $4, %xmm1
+; SSE2-NEXT: paddb %xmm1, %xmm0
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
@@ -560,34 +560,34 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
; SSE3: # %bb.0:
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrld $1, %xmm1
-; SSE3-NEXT: por %xmm0, %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
-; SSE3-NEXT: psrld $2, %xmm0
+; SSE3-NEXT: por %xmm1, %xmm0
+; SSE3-NEXT: movdqa %xmm0, %xmm1
+; SSE3-NEXT: psrld $2, %xmm1
; SSE3-NEXT: por %xmm1, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrld $4, %xmm1
-; SSE3-NEXT: por %xmm0, %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
-; SSE3-NEXT: psrld $8, %xmm0
+; SSE3-NEXT: por %xmm1, %xmm0
+; SSE3-NEXT: movdqa %xmm0, %xmm1
+; SSE3-NEXT: psrld $8, %xmm1
; SSE3-NEXT: por %xmm1, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrld $16, %xmm1
-; SSE3-NEXT: por %xmm0, %xmm1
-; SSE3-NEXT: pcmpeqd %xmm2, %xmm2
-; SSE3-NEXT: pxor %xmm1, %xmm2
-; SSE3-NEXT: movdqa %xmm2, %xmm0
-; SSE3-NEXT: psrlw $1, %xmm0
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE3-NEXT: psubb %xmm0, %xmm2
-; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; SSE3-NEXT: movdqa %xmm2, %xmm1
-; SSE3-NEXT: pand %xmm0, %xmm1
-; SSE3-NEXT: psrlw $2, %xmm2
-; SSE3-NEXT: pand %xmm0, %xmm2
-; SSE3-NEXT: paddb %xmm1, %xmm2
-; SSE3-NEXT: movdqa %xmm2, %xmm0
-; SSE3-NEXT: psrlw $4, %xmm0
+; SSE3-NEXT: por %xmm1, %xmm0
+; SSE3-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE3-NEXT: pxor %xmm1, %xmm0
+; SSE3-NEXT: movdqa %xmm0, %xmm1
+; SSE3-NEXT: psrlw $1, %xmm1
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE3-NEXT: psubb %xmm1, %xmm0
+; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE3-NEXT: movdqa %xmm0, %xmm2
+; SSE3-NEXT: pand %xmm1, %xmm2
+; SSE3-NEXT: psrlw $2, %xmm0
+; SSE3-NEXT: pand %xmm1, %xmm0
; SSE3-NEXT: paddb %xmm2, %xmm0
+; SSE3-NEXT: movdqa %xmm0, %xmm1
+; SSE3-NEXT: psrlw $4, %xmm1
+; SSE3-NEXT: paddb %xmm1, %xmm0
; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE3-NEXT: pxor %xmm1, %xmm1
; SSE3-NEXT: movdqa %xmm0, %xmm2
@@ -744,34 +744,34 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $1, %xmm1
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrld $2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrld $2, %xmm1
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $4, %xmm1
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrld $8, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrld $8, %xmm1
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
-; SSE2-NEXT: pxor %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: psrlw $1, %xmm0
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT: psubb %xmm0, %xmm2
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: psrlw $2, %xmm2
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: paddb %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: psrlw $4, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrlw $1, %xmm1
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: psubb %xmm1, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pand %xmm1, %xmm2
+; SSE2-NEXT: psrlw $2, %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: paddb %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrlw $4, %xmm1
+; SSE2-NEXT: paddb %xmm1, %xmm0
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
@@ -786,34 +786,34 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
; SSE3: # %bb.0:
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrld $1, %xmm1
-; SSE3-NEXT: por %xmm0, %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
-; SSE3-NEXT: psrld $2, %xmm0
+; SSE3-NEXT: por %xmm1, %xmm0
+; SSE3-NEXT: movdqa %xmm0, %xmm1
+; SSE3-NEXT: psrld $2, %xmm1
; SSE3-NEXT: por %xmm1, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrld $4, %xmm1
-; SSE3-NEXT: por %xmm0, %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
-; SSE3-NEXT: psrld $8, %xmm0
+; SSE3-NEXT: por %xmm1, %xmm0
+; SSE3-NEXT: movdqa %xmm0, %xmm1
+; SSE3-NEXT: psrld $8, %xmm1
; SSE3-NEXT: por %xmm1, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrld $16, %xmm1
-; SSE3-NEXT: por %xmm0, %xmm1
-; SSE3-NEXT: pcmpeqd %xmm2, %xmm2
-; SSE3-NEXT: pxor %xmm1, %xmm2
-; SSE3-NEXT: movdqa %xmm2, %xmm0
-; SSE3-NEXT: psrlw $1, %xmm0
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE3-NEXT: psubb %xmm0, %xmm2
-; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; SSE3-NEXT: movdqa %xmm2, %xmm1
-; SSE3-NEXT: pand %xmm0, %xmm1
-; SSE3-NEXT: psrlw $2, %xmm2
-; SSE3-NEXT: pand %xmm0, %xmm2
-; SSE3-NEXT: paddb %xmm1, %xmm2
-; SSE3-NEXT: movdqa %xmm2, %xmm0
-; SSE3-NEXT: psrlw $4, %xmm0
+; SSE3-NEXT: por %xmm1, %xmm0
+; SSE3-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE3-NEXT: pxor %xmm1, %xmm0
+; SSE3-NEXT: movdqa %xmm0, %xmm1
+; SSE3-NEXT: psrlw $1, %xmm1
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE3-NEXT: psubb %xmm1, %xmm0
+; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE3-NEXT: movdqa %xmm0, %xmm2
+; SSE3-NEXT: pand %xmm1, %xmm2
+; SSE3-NEXT: psrlw $2, %xmm0
+; SSE3-NEXT: pand %xmm1, %xmm0
; SSE3-NEXT: paddb %xmm2, %xmm0
+; SSE3-NEXT: movdqa %xmm0, %xmm1
+; SSE3-NEXT: psrlw $4, %xmm1
+; SSE3-NEXT: paddb %xmm1, %xmm0
; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE3-NEXT: pxor %xmm1, %xmm1
; SSE3-NEXT: movdqa %xmm0, %xmm2
@@ -970,35 +970,35 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $1, %xmm1
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrlw $2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrlw $2, %xmm1
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrlw $8, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrlw $8, %xmm1
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrlw $1, %xmm0
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT: psubb %xmm0, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: psrlw $2, %xmm1
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: paddb %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psrlw $4, %xmm2
-; SSE2-NEXT: paddb %xmm1, %xmm2
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: psllw $8, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrlw $1, %xmm1
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: psubb %xmm1, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pand %xmm1, %xmm2
+; SSE2-NEXT: psrlw $2, %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: paddb %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrlw $4, %xmm1
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psllw $8, %xmm1
+; SSE2-NEXT: paddb %xmm1, %xmm0
; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: retq
;
@@ -1006,35 +1006,35 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
; SSE3: # %bb.0:
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $1, %xmm1
-; SSE3-NEXT: por %xmm0, %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
-; SSE3-NEXT: psrlw $2, %xmm0
+; SSE3-NEXT: por %xmm1, %xmm0
+; SSE3-NEXT: movdqa %xmm0, %xmm1
+; SSE3-NEXT: psrlw $2, %xmm1
; SSE3-NEXT: por %xmm1, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: por %xmm0, %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
-; SSE3-NEXT: psrlw $8, %xmm0
+; SSE3-NEXT: por %xmm1, %xmm0
+; SSE3-NEXT: movdqa %xmm0, %xmm1
+; SSE3-NEXT: psrlw $8, %xmm1
; SSE3-NEXT: por %xmm1, %xmm0
; SSE3-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE3-NEXT: pxor %xmm0, %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
-; SSE3-NEXT: psrlw $1, %xmm0
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE3-NEXT: psubb %xmm0, %xmm1
-; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; SSE3-NEXT: movdqa %xmm1, %xmm2
-; SSE3-NEXT: pand %xmm0, %xmm2
-; SSE3-NEXT: psrlw $2, %xmm1
-; SSE3-NEXT: pand %xmm0, %xmm1
-; SSE3-NEXT: paddb %xmm2, %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm2
-; SSE3-NEXT: psrlw $4, %xmm2
-; SSE3-NEXT: paddb %xmm1, %xmm2
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE3-NEXT: movdqa %xmm2, %xmm0
-; SSE3-NEXT: psllw $8, %xmm0
+; SSE3-NEXT: pxor %xmm1, %xmm0
+; SSE3-NEXT: movdqa %xmm0, %xmm1
+; SSE3-NEXT: psrlw $1, %xmm1
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE3-NEXT: psubb %xmm1, %xmm0
+; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE3-NEXT: movdqa %xmm0, %xmm2
+; SSE3-NEXT: pand %xmm1, %xmm2
+; SSE3-NEXT: psrlw $2, %xmm0
+; SSE3-NEXT: pand %xmm1, %xmm0
; SSE3-NEXT: paddb %xmm2, %xmm0
+; SSE3-NEXT: movdqa %xmm0, %xmm1
+; SSE3-NEXT: psrlw $4, %xmm1
+; SSE3-NEXT: paddb %xmm1, %xmm0
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT: movdqa %xmm0, %xmm1
+; SSE3-NEXT: psllw $8, %xmm1
+; SSE3-NEXT: paddb %xmm1, %xmm0
; SSE3-NEXT: psrlw $8, %xmm0
; SSE3-NEXT: retq
;
@@ -1160,35 +1160,35 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $1, %xmm1
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrlw $2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrlw $2, %xmm1
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrlw $8, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrlw $8, %xmm1
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrlw $1, %xmm0
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT: psubb %xmm0, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: psrlw $2, %xmm1
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: paddb %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psrlw $4, %xmm2
-; SSE2-NEXT: paddb %xmm1, %xmm2
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: psllw $8, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrlw $1, %xmm1
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: psubb %xmm1, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pand %xmm1, %xmm2
+; SSE2-NEXT: psrlw $2, %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: paddb %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrlw $4, %xmm1
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psllw $8, %xmm1
+; SSE2-NEXT: paddb %xmm1, %xmm0
; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: retq
;
@@ -1196,35 +1196,35 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
; SSE3: # %bb.0:
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $1, %xmm1
-; SSE3-NEXT: por %xmm0, %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
-; SSE3-NEXT: psrlw $2, %xmm0
+; SSE3-NEXT: por %xmm1, %xmm0
+; SSE3-NEXT: movdqa %xmm0, %xmm1
+; SSE3-NEXT: psrlw $2, %xmm1
; SSE3-NEXT: por %xmm1, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: por %xmm0, %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
-; SSE3-NEXT: psrlw $8, %xmm0
+; SSE3-NEXT: por %xmm1, %xmm0
+; SSE3-NEXT: movdqa %xmm0, %xmm1
+; SSE3-NEXT: psrlw $8, %xmm1
; SSE3-NEXT: por %xmm1, %xmm0
; SSE3-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE3-NEXT: pxor %xmm0, %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
-; SSE3-NEXT: psrlw $1, %xmm0
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE3-NEXT: psubb %xmm0, %xmm1
-; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; SSE3-NEXT: movdqa %xmm1, %xmm2
-; SSE3-NEXT: pand %xmm0, %xmm2
-; SSE3-NEXT: psrlw $2, %xmm1
-; SSE3-NEXT: pand %xmm0, %xmm1
-; SSE3-NEXT: paddb %xmm2, %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm2
-; SSE3-NEXT: psrlw $4, %xmm2
-; SSE3-NEXT: paddb %xmm1, %xmm2
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE3-NEXT: movdqa %xmm2, %xmm0
-; SSE3-NEXT: psllw $8, %xmm0
+; SSE3-NEXT: pxor %xmm1, %xmm0
+; SSE3-NEXT: movdqa %xmm0, %xmm1
+; SSE3-NEXT: psrlw $1, %xmm1
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE3-NEXT: psubb %xmm1, %xmm0
+; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE3-NEXT: movdqa %xmm0, %xmm2
+; SSE3-NEXT: pand %xmm1, %xmm2
+; SSE3-NEXT: psrlw $2, %xmm0
+; SSE3-NEXT: pand %xmm1, %xmm0
; SSE3-NEXT: paddb %xmm2, %xmm0
+; SSE3-NEXT: movdqa %xmm0, %xmm1
+; SSE3-NEXT: psrlw $4, %xmm1
+; SSE3-NEXT: paddb %xmm1, %xmm0
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT: movdqa %xmm0, %xmm1
+; SSE3-NEXT: psllw $8, %xmm1
+; SSE3-NEXT: paddb %xmm1, %xmm0
; SSE3-NEXT: psrlw $8, %xmm0
; SSE3-NEXT: retq
;
@@ -1351,31 +1351,31 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $1, %xmm1
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrlw $2, %xmm0
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrlw $2, %xmm1
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm3
-; SSE2-NEXT: pxor %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: psrlw $1, %xmm0
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT: psubb %xmm0, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: psrlw $2, %xmm3
-; SSE2-NEXT: pand %xmm0, %xmm3
-; SSE2-NEXT: paddb %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: psrlw $4, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrlw $1, %xmm1
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: psubb %xmm1, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: pand %xmm1, %xmm3
+; SSE2-NEXT: psrlw $2, %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: paddb %xmm3, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrlw $4, %xmm1
+; SSE2-NEXT: paddb %xmm1, %xmm0
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: retq
;
@@ -1384,31 +1384,31 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $1, %xmm1
; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: por %xmm0, %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
-; SSE3-NEXT: psrlw $2, %xmm0
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT: por %xmm1, %xmm0
+; SSE3-NEXT: movdqa %xmm0, %xmm1
+; SSE3-NEXT: psrlw $2, %xmm1
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE3-NEXT: por %xmm1, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
; SSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE3-NEXT: pand %xmm2, %xmm1
-; SSE3-NEXT: por %xmm0, %xmm1
-; SSE3-NEXT: pcmpeqd %xmm3, %xmm3
-; SSE3-NEXT: pxor %xmm1, %xmm3
-; SSE3-NEXT: movdqa %xmm3, %xmm0
-; SSE3-NEXT: psrlw $1, %xmm0
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE3-NEXT: psubb %xmm0, %xmm3
-; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; SSE3-NEXT: movdqa %xmm3, %xmm1
-; SSE3-NEXT: pand %xmm0, %xmm1
-; SSE3-NEXT: psrlw $2, %xmm3
-; SSE3-NEXT: pand %xmm0, %xmm3
-; SSE3-NEXT: paddb %xmm1, %xmm3
-; SSE3-NEXT: movdqa %xmm3, %xmm0
-; SSE3-NEXT: psrlw $4, %xmm0
+; SSE3-NEXT: por %xmm1, %xmm0
+; SSE3-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE3-NEXT: pxor %xmm1, %xmm0
+; SSE3-NEXT: movdqa %xmm0, %xmm1
+; SSE3-NEXT: psrlw $1, %xmm1
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE3-NEXT: psubb %xmm1, %xmm0
+; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE3-NEXT: movdqa %xmm0, %xmm3
+; SSE3-NEXT: pand %xmm1, %xmm3
+; SSE3-NEXT: psrlw $2, %xmm0
+; SSE3-NEXT: pand %xmm1, %xmm0
; SSE3-NEXT: paddb %xmm3, %xmm0
+; SSE3-NEXT: movdqa %xmm0, %xmm1
+; SSE3-NEXT: psrlw $4, %xmm1
+; SSE3-NEXT: paddb %xmm1, %xmm0
; SSE3-NEXT: pand %xmm2, %xmm0
; SSE3-NEXT: retq
;
@@ -1501,31 +1501,31 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $1, %xmm1
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrlw $2, %xmm0
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrlw $2, %xmm1
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm3
-; SSE2-NEXT: pxor %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: psrlw $1, %xmm0
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT: psubb %xmm0, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: psrlw $2, %xmm3
-; SSE2-NEXT: pand %xmm0, %xmm3
-; SSE2-NEXT: paddb %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: psrlw $4, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrlw $1, %xmm1
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: psubb %xmm1, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: pand %xmm1, %xmm3
+; SSE2-NEXT: psrlw $2, %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: paddb %xmm3, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrlw $4, %xmm1
+; SSE2-NEXT: paddb %xmm1, %xmm0
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: retq
;
@@ -1534,31 +1534,31 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $1, %xmm1
; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: por %xmm0, %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
-; SSE3-NEXT: psrlw $2, %xmm0
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT: por %xmm1, %xmm0
+; SSE3-NEXT: movdqa %xmm0, %xmm1
+; SSE3-NEXT: psrlw $2, %xmm1
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE3-NEXT: por %xmm1, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
; SSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE3-NEXT: pand %xmm2, %xmm1
-; SSE3-NEXT: por %xmm0, %xmm1
-; SSE3-NEXT: pcmpeqd %xmm3, %xmm3
-; SSE3-NEXT: pxor %xmm1, %xmm3
-; SSE3-NEXT: movdqa %xmm3, %xmm0
-; SSE3-NEXT: psrlw $1, %xmm0
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE3-NEXT: psubb %xmm0, %xmm3
-; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; SSE3-NEXT: movdqa %xmm3, %xmm1
-; SSE3-NEXT: pand %xmm0, %xmm1
-; SSE3-NEXT: psrlw $2, %xmm3
-; SSE3-NEXT: pand %xmm0, %xmm3
-; SSE3-NEXT: paddb %xmm1, %xmm3
-; SSE3-NEXT: movdqa %xmm3, %xmm0
-; SSE3-NEXT: psrlw $4, %xmm0
+; SSE3-NEXT: por %xmm1, %xmm0
+; SSE3-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE3-NEXT: pxor %xmm1, %xmm0
+; SSE3-NEXT: movdqa %xmm0, %xmm1
+; SSE3-NEXT: psrlw $1, %xmm1
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE3-NEXT: psubb %xmm1, %xmm0
+; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE3-NEXT: movdqa %xmm0, %xmm3
+; SSE3-NEXT: pand %xmm1, %xmm3
+; SSE3-NEXT: psrlw $2, %xmm0
+; SSE3-NEXT: pand %xmm1, %xmm0
; SSE3-NEXT: paddb %xmm3, %xmm0
+; SSE3-NEXT: movdqa %xmm0, %xmm1
+; SSE3-NEXT: psrlw $4, %xmm1
+; SSE3-NEXT: paddb %xmm1, %xmm0
; SSE3-NEXT: pand %xmm2, %xmm0
; SSE3-NEXT: retq
;
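The testv2i64/testv2i64u pairs above exercise llvm.ctlz, with the "u" suffix conventionally marking the variant whose result is undefined for a zero input. A minimal IR sketch of the defined-on-zero variant (the function name here is illustrative, not taken from the test file):

declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1)

define <2 x i64> @ctlz_sketch(<2 x i64> %v) {
  ; i1 false: a zero input must yield the bit width (64), so the
  ; or-shift smear, NOT, and popcount sequence in the CHECK lines
  ; above has to be correct without a zero guard.
  %r = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %v, i1 false)
  ret <2 x i64> %r
}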
diff --git a/llvm/test/CodeGen/X86/vector-lzcnt-sub128.ll b/llvm/test/CodeGen/X86/vector-lzcnt-sub128.ll
index e903c09b9b8f1..a1b277efde6ff 100644
--- a/llvm/test/CodeGen/X86/vector-lzcnt-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-lzcnt-sub128.ll
@@ -8,34 +8,34 @@ define <2 x i32> @illegal_ctlz(<2 x i32> %v1) {
; CHECK: # %bb.0:
; CHECK-NEXT: movdqa %xmm0, %xmm1
; CHECK-NEXT: psrld $1, %xmm1
-; CHECK-NEXT: por %xmm0, %xmm1
-; CHECK-NEXT: movdqa %xmm1, %xmm0
-; CHECK-NEXT: psrld $2, %xmm0
+; CHECK-NEXT: por %xmm1, %xmm0
+; CHECK-NEXT: movdqa %xmm0, %xmm1
+; CHECK-NEXT: psrld $2, %xmm1
; CHECK-NEXT: por %xmm1, %xmm0
; CHECK-NEXT: movdqa %xmm0, %xmm1
; CHECK-NEXT: psrld $4, %xmm1
-; CHECK-NEXT: por %xmm0, %xmm1
-; CHECK-NEXT: movdqa %xmm1, %xmm0
-; CHECK-NEXT: psrld $8, %xmm0
+; CHECK-NEXT: por %xmm1, %xmm0
+; CHECK-NEXT: movdqa %xmm0, %xmm1
+; CHECK-NEXT: psrld $8, %xmm1
; CHECK-NEXT: por %xmm1, %xmm0
; CHECK-NEXT: movdqa %xmm0, %xmm1
; CHECK-NEXT: psrld $16, %xmm1
-; CHECK-NEXT: por %xmm0, %xmm1
-; CHECK-NEXT: pcmpeqd %xmm2, %xmm2
-; CHECK-NEXT: pxor %xmm1, %xmm2
-; CHECK-NEXT: movdqa %xmm2, %xmm0
-; CHECK-NEXT: psrlw $1, %xmm0
-; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: psubb %xmm0, %xmm2
-; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; CHECK-NEXT: movdqa %xmm2, %xmm1
-; CHECK-NEXT: pand %xmm0, %xmm1
-; CHECK-NEXT: psrlw $2, %xmm2
-; CHECK-NEXT: pand %xmm0, %xmm2
-; CHECK-NEXT: paddb %xmm1, %xmm2
-; CHECK-NEXT: movdqa %xmm2, %xmm0
-; CHECK-NEXT: psrlw $4, %xmm0
+; CHECK-NEXT: por %xmm1, %xmm0
+; CHECK-NEXT: pcmpeqd %xmm1, %xmm1
+; CHECK-NEXT: pxor %xmm1, %xmm0
+; CHECK-NEXT: movdqa %xmm0, %xmm1
+; CHECK-NEXT: psrlw $1, %xmm1
+; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-NEXT: psubb %xmm1, %xmm0
+; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; CHECK-NEXT: movdqa %xmm0, %xmm2
+; CHECK-NEXT: pand %xmm1, %xmm2
+; CHECK-NEXT: psrlw $2, %xmm0
+; CHECK-NEXT: pand %xmm1, %xmm0
; CHECK-NEXT: paddb %xmm2, %xmm0
+; CHECK-NEXT: movdqa %xmm0, %xmm1
+; CHECK-NEXT: psrlw $4, %xmm1
+; CHECK-NEXT: paddb %xmm1, %xmm0
; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-NEXT: pxor %xmm1, %xmm1
; CHECK-NEXT: movdqa %xmm0, %xmm2
@@ -69,16 +69,15 @@ define <2 x i32> @illegal_cttz(<2 x i32> %v1) {
; CHECK-NEXT: paddb %xmm2, %xmm0
; CHECK-NEXT: movdqa %xmm0, %xmm1
; CHECK-NEXT: psrlw $4, %xmm1
-; CHECK-NEXT: paddb %xmm0, %xmm1
-; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-NEXT: pxor %xmm0, %xmm0
-; CHECK-NEXT: movdqa %xmm1, %xmm2
-; CHECK-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; CHECK-NEXT: psadbw %xmm0, %xmm2
-; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; CHECK-NEXT: psadbw %xmm0, %xmm1
-; CHECK-NEXT: packuswb %xmm2, %xmm1
-; CHECK-NEXT: movdqa %xmm1, %xmm0
+; CHECK-NEXT: paddb %xmm1, %xmm0
+; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT: pxor %xmm1, %xmm1
+; CHECK-NEXT: movdqa %xmm0, %xmm2
+; CHECK-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; CHECK-NEXT: psadbw %xmm1, %xmm2
+; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-NEXT: psadbw %xmm1, %xmm0
+; CHECK-NEXT: packuswb %xmm2, %xmm0
; CHECK-NEXT: retq
%v2 = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %v1, i1 true)
ret <2 x i32> %v2
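; The `i1 true` argument to llvm.cttz marks a zero input as producing
; an undefined result, which is why the popcount-style sequence checked
; above needs no zero guard.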
diff --git a/llvm/test/CodeGen/X86/vector-narrow-binop.ll b/llvm/test/CodeGen/X86/vector-narrow-binop.ll
index 1d4b4574bf105..01190f580b880 100644
--- a/llvm/test/CodeGen/X86/vector-narrow-binop.ll
+++ b/llvm/test/CodeGen/X86/vector-narrow-binop.ll
@@ -153,9 +153,9 @@ define <4 x double> @fmul_v2f64(<2 x double> %x, <2 x double> %y) {
; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE-NEXT: mulpd %xmm2, %xmm2
; SSE-NEXT: mulpd %xmm1, %xmm1
-; SSE-NEXT: addpd %xmm2, %xmm1
-; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
-; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: addpd %xmm1, %xmm2
+; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
+; SSE-NEXT: movapd %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: fmul_v2f64:
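The remaining hunks update autogenerated checks for popcount range tests (ugt_N_v8i16, ugt_N_v4i32). Judging by those labels and the trailing pcmpgtw/pcmpgtd against a constant pool value, the IR under test is roughly of this shape (a sketch inferred from the labels, not copied from the file):

declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16>)

define <8 x i16> @ugt_2_sketch(<8 x i16> %0) {
  ; Per-element popcount, compared against the constant named in the
  ; label; the sext keeps the <8 x i16> result type seen in the asm.
  %2 = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0)
  %3 = icmp ugt <8 x i16> %2, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
  %4 = sext <8 x i1> %3 to <8 x i16>
  ret <8 x i16> %4
}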
diff --git a/llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll b/llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll
index 4a9439c158d86..f95178f1bfcdf 100644
--- a/llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll
+++ b/llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll
@@ -1669,10 +1669,10 @@ define <8 x i16> @ugt_2_v8i16(<8 x i16> %0) {
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psllw $8, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psllw $8, %xmm1
; SSE2-NEXT: paddb %xmm1, %xmm0
; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -1692,10 +1692,10 @@ define <8 x i16> @ugt_2_v8i16(<8 x i16> %0) {
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
-; SSE3-NEXT: psllw $8, %xmm0
+; SSE3-NEXT: paddb %xmm1, %xmm0
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT: movdqa %xmm0, %xmm1
+; SSE3-NEXT: psllw $8, %xmm1
; SSE3-NEXT: paddb %xmm1, %xmm0
; SSE3-NEXT: psrlw $8, %xmm0
; SSE3-NEXT: pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -1988,10 +1988,10 @@ define <8 x i16> @ugt_3_v8i16(<8 x i16> %0) {
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psllw $8, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psllw $8, %xmm1
; SSE2-NEXT: paddb %xmm1, %xmm0
; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -2011,10 +2011,10 @@ define <8 x i16> @ugt_3_v8i16(<8 x i16> %0) {
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
-; SSE3-NEXT: psllw $8, %xmm0
+; SSE3-NEXT: paddb %xmm1, %xmm0
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT: movdqa %xmm0, %xmm1
+; SSE3-NEXT: psllw $8, %xmm1
; SSE3-NEXT: paddb %xmm1, %xmm0
; SSE3-NEXT: psrlw $8, %xmm0
; SSE3-NEXT: pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -2307,10 +2307,10 @@ define <8 x i16> @ugt_4_v8i16(<8 x i16> %0) {
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psllw $8, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psllw $8, %xmm1
; SSE2-NEXT: paddb %xmm1, %xmm0
; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -2330,10 +2330,10 @@ define <8 x i16> @ugt_4_v8i16(<8 x i16> %0) {
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
-; SSE3-NEXT: psllw $8, %xmm0
+; SSE3-NEXT: paddb %xmm1, %xmm0
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT: movdqa %xmm0, %xmm1
+; SSE3-NEXT: psllw $8, %xmm1
; SSE3-NEXT: paddb %xmm1, %xmm0
; SSE3-NEXT: psrlw $8, %xmm0
; SSE3-NEXT: pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -2626,10 +2626,10 @@ define <8 x i16> @ugt_5_v8i16(<8 x i16> %0) {
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psllw $8, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psllw $8, %xmm1
; SSE2-NEXT: paddb %xmm1, %xmm0
; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -2649,10 +2649,10 @@ define <8 x i16> @ugt_5_v8i16(<8 x i16> %0) {
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
-; SSE3-NEXT: psllw $8, %xmm0
+; SSE3-NEXT: paddb %xmm1, %xmm0
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT: movdqa %xmm0, %xmm1
+; SSE3-NEXT: psllw $8, %xmm1
; SSE3-NEXT: paddb %xmm1, %xmm0
; SSE3-NEXT: psrlw $8, %xmm0
; SSE3-NEXT: pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -2945,10 +2945,10 @@ define <8 x i16> @ugt_6_v8i16(<8 x i16> %0) {
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psllw $8, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psllw $8, %xmm1
; SSE2-NEXT: paddb %xmm1, %xmm0
; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -2968,10 +2968,10 @@ define <8 x i16> @ugt_6_v8i16(<8 x i16> %0) {
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
-; SSE3-NEXT: psllw $8, %xmm0
+; SSE3-NEXT: paddb %xmm1, %xmm0
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT: movdqa %xmm0, %xmm1
+; SSE3-NEXT: psllw $8, %xmm1
; SSE3-NEXT: paddb %xmm1, %xmm0
; SSE3-NEXT: psrlw $8, %xmm0
; SSE3-NEXT: pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -3264,10 +3264,10 @@ define <8 x i16> @ugt_7_v8i16(<8 x i16> %0) {
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psllw $8, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psllw $8, %xmm1
; SSE2-NEXT: paddb %xmm1, %xmm0
; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -3287,10 +3287,10 @@ define <8 x i16> @ugt_7_v8i16(<8 x i16> %0) {
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
-; SSE3-NEXT: psllw $8, %xmm0
+; SSE3-NEXT: paddb %xmm1, %xmm0
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT: movdqa %xmm0, %xmm1
+; SSE3-NEXT: psllw $8, %xmm1
; SSE3-NEXT: paddb %xmm1, %xmm0
; SSE3-NEXT: psrlw $8, %xmm0
; SSE3-NEXT: pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -3583,10 +3583,10 @@ define <8 x i16> @ugt_8_v8i16(<8 x i16> %0) {
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psllw $8, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psllw $8, %xmm1
; SSE2-NEXT: paddb %xmm1, %xmm0
; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -3606,10 +3606,10 @@ define <8 x i16> @ugt_8_v8i16(<8 x i16> %0) {
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
-; SSE3-NEXT: psllw $8, %xmm0
+; SSE3-NEXT: paddb %xmm1, %xmm0
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT: movdqa %xmm0, %xmm1
+; SSE3-NEXT: psllw $8, %xmm1
; SSE3-NEXT: paddb %xmm1, %xmm0
; SSE3-NEXT: psrlw $8, %xmm0
; SSE3-NEXT: pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -3902,10 +3902,10 @@ define <8 x i16> @ugt_9_v8i16(<8 x i16> %0) {
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psllw $8, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psllw $8, %xmm1
; SSE2-NEXT: paddb %xmm1, %xmm0
; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -3925,10 +3925,10 @@ define <8 x i16> @ugt_9_v8i16(<8 x i16> %0) {
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
-; SSE3-NEXT: psllw $8, %xmm0
+; SSE3-NEXT: paddb %xmm1, %xmm0
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT: movdqa %xmm0, %xmm1
+; SSE3-NEXT: psllw $8, %xmm1
; SSE3-NEXT: paddb %xmm1, %xmm0
; SSE3-NEXT: psrlw $8, %xmm0
; SSE3-NEXT: pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -4221,10 +4221,10 @@ define <8 x i16> @ugt_10_v8i16(<8 x i16> %0) {
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psllw $8, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psllw $8, %xmm1
; SSE2-NEXT: paddb %xmm1, %xmm0
; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -4244,10 +4244,10 @@ define <8 x i16> @ugt_10_v8i16(<8 x i16> %0) {
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
-; SSE3-NEXT: psllw $8, %xmm0
+; SSE3-NEXT: paddb %xmm1, %xmm0
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT: movdqa %xmm0, %xmm1
+; SSE3-NEXT: psllw $8, %xmm1
; SSE3-NEXT: paddb %xmm1, %xmm0
; SSE3-NEXT: psrlw $8, %xmm0
; SSE3-NEXT: pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -4540,10 +4540,10 @@ define <8 x i16> @ugt_11_v8i16(<8 x i16> %0) {
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psllw $8, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psllw $8, %xmm1
; SSE2-NEXT: paddb %xmm1, %xmm0
; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -4563,10 +4563,10 @@ define <8 x i16> @ugt_11_v8i16(<8 x i16> %0) {
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
-; SSE3-NEXT: psllw $8, %xmm0
+; SSE3-NEXT: paddb %xmm1, %xmm0
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT: movdqa %xmm0, %xmm1
+; SSE3-NEXT: psllw $8, %xmm1
; SSE3-NEXT: paddb %xmm1, %xmm0
; SSE3-NEXT: psrlw $8, %xmm0
; SSE3-NEXT: pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -4859,10 +4859,10 @@ define <8 x i16> @ugt_12_v8i16(<8 x i16> %0) {
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psllw $8, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psllw $8, %xmm1
; SSE2-NEXT: paddb %xmm1, %xmm0
; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -4882,10 +4882,10 @@ define <8 x i16> @ugt_12_v8i16(<8 x i16> %0) {
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
-; SSE3-NEXT: psllw $8, %xmm0
+; SSE3-NEXT: paddb %xmm1, %xmm0
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT: movdqa %xmm0, %xmm1
+; SSE3-NEXT: psllw $8, %xmm1
; SSE3-NEXT: paddb %xmm1, %xmm0
; SSE3-NEXT: psrlw $8, %xmm0
; SSE3-NEXT: pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -5178,10 +5178,10 @@ define <8 x i16> @ugt_13_v8i16(<8 x i16> %0) {
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psllw $8, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psllw $8, %xmm1
; SSE2-NEXT: paddb %xmm1, %xmm0
; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -5201,10 +5201,10 @@ define <8 x i16> @ugt_13_v8i16(<8 x i16> %0) {
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
-; SSE3-NEXT: psllw $8, %xmm0
+; SSE3-NEXT: paddb %xmm1, %xmm0
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT: movdqa %xmm0, %xmm1
+; SSE3-NEXT: psllw $8, %xmm1
; SSE3-NEXT: paddb %xmm1, %xmm0
; SSE3-NEXT: psrlw $8, %xmm0
; SSE3-NEXT: pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -5497,10 +5497,10 @@ define <8 x i16> @ugt_14_v8i16(<8 x i16> %0) {
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psllw $8, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psllw $8, %xmm1
; SSE2-NEXT: paddb %xmm1, %xmm0
; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -5520,10 +5520,10 @@ define <8 x i16> @ugt_14_v8i16(<8 x i16> %0) {
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
-; SSE3-NEXT: psllw $8, %xmm0
+; SSE3-NEXT: paddb %xmm1, %xmm0
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT: movdqa %xmm0, %xmm1
+; SSE3-NEXT: psllw $8, %xmm1
; SSE3-NEXT: paddb %xmm1, %xmm0
; SSE3-NEXT: psrlw $8, %xmm0
; SSE3-NEXT: pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -5960,17 +5960,16 @@ define <4 x i32> @ugt_2_v4i32(<4 x i32> %0) {
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: psadbw %xmm0, %xmm2
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: psadbw %xmm0, %xmm1
-; SSE2-NEXT: packuswb %xmm2, %xmm1
-; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT: psadbw %xmm1, %xmm2
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: psadbw %xmm1, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: ugt_2_v4i32:
@@ -5987,17 +5986,16 @@ define <4 x i32> @ugt_2_v4i32(<4 x i32> %0) {
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: pxor %xmm0, %xmm0
-; SSE3-NEXT: movdqa %xmm1, %xmm2
-; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE3-NEXT: psadbw %xmm0, %xmm2
-; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT: psadbw %xmm0, %xmm1
-; SSE3-NEXT: packuswb %xmm2, %xmm1
-; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
+; SSE3-NEXT: paddb %xmm1, %xmm0
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT: pxor %xmm1, %xmm1
+; SSE3-NEXT: movdqa %xmm0, %xmm2
+; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT: psadbw %xmm1, %xmm2
+; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT: psadbw %xmm1, %xmm0
+; SSE3-NEXT: packuswb %xmm2, %xmm0
+; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: ugt_2_v4i32:
@@ -6340,17 +6338,16 @@ define <4 x i32> @ugt_3_v4i32(<4 x i32> %0) {
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: psadbw %xmm0, %xmm2
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: psadbw %xmm0, %xmm1
-; SSE2-NEXT: packuswb %xmm2, %xmm1
-; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT: psadbw %xmm1, %xmm2
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: psadbw %xmm1, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: ugt_3_v4i32:
@@ -6367,17 +6364,16 @@ define <4 x i32> @ugt_3_v4i32(<4 x i32> %0) {
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: pxor %xmm0, %xmm0
-; SSE3-NEXT: movdqa %xmm1, %xmm2
-; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE3-NEXT: psadbw %xmm0, %xmm2
-; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT: psadbw %xmm0, %xmm1
-; SSE3-NEXT: packuswb %xmm2, %xmm1
-; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
+; SSE3-NEXT: paddb %xmm1, %xmm0
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT: pxor %xmm1, %xmm1
+; SSE3-NEXT: movdqa %xmm0, %xmm2
+; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT: psadbw %xmm1, %xmm2
+; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT: psadbw %xmm1, %xmm0
+; SSE3-NEXT: packuswb %xmm2, %xmm0
+; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: ugt_3_v4i32:
@@ -6720,17 +6716,16 @@ define <4 x i32> @ugt_4_v4i32(<4 x i32> %0) {
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: psadbw %xmm0, %xmm2
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: psadbw %xmm0, %xmm1
-; SSE2-NEXT: packuswb %xmm2, %xmm1
-; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT: psadbw %xmm1, %xmm2
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: psadbw %xmm1, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: ugt_4_v4i32:
@@ -6747,17 +6742,16 @@ define <4 x i32> @ugt_4_v4i32(<4 x i32> %0) {
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: pxor %xmm0, %xmm0
-; SSE3-NEXT: movdqa %xmm1, %xmm2
-; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE3-NEXT: psadbw %xmm0, %xmm2
-; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT: psadbw %xmm0, %xmm1
-; SSE3-NEXT: packuswb %xmm2, %xmm1
-; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
+; SSE3-NEXT: paddb %xmm1, %xmm0
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT: pxor %xmm1, %xmm1
+; SSE3-NEXT: movdqa %xmm0, %xmm2
+; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT: psadbw %xmm1, %xmm2
+; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT: psadbw %xmm1, %xmm0
+; SSE3-NEXT: packuswb %xmm2, %xmm0
+; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: ugt_4_v4i32:
@@ -7100,17 +7094,16 @@ define <4 x i32> @ugt_5_v4i32(<4 x i32> %0) {
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: psadbw %xmm0, %xmm2
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: psadbw %xmm0, %xmm1
-; SSE2-NEXT: packuswb %xmm2, %xmm1
-; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT: psadbw %xmm1, %xmm2
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: psadbw %xmm1, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: ugt_5_v4i32:
@@ -7127,17 +7120,16 @@ define <4 x i32> @ugt_5_v4i32(<4 x i32> %0) {
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: pxor %xmm0, %xmm0
-; SSE3-NEXT: movdqa %xmm1, %xmm2
-; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE3-NEXT: psadbw %xmm0, %xmm2
-; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT: psadbw %xmm0, %xmm1
-; SSE3-NEXT: packuswb %xmm2, %xmm1
-; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
+; SSE3-NEXT: paddb %xmm1, %xmm0
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT: pxor %xmm1, %xmm1
+; SSE3-NEXT: movdqa %xmm0, %xmm2
+; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT: psadbw %xmm1, %xmm2
+; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT: psadbw %xmm1, %xmm0
+; SSE3-NEXT: packuswb %xmm2, %xmm0
+; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: ugt_5_v4i32:
@@ -7480,17 +7472,16 @@ define <4 x i32> @ugt_6_v4i32(<4 x i32> %0) {
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: psadbw %xmm0, %xmm2
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: psadbw %xmm0, %xmm1
-; SSE2-NEXT: packuswb %xmm2, %xmm1
-; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT: psadbw %xmm1, %xmm2
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: psadbw %xmm1, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: ugt_6_v4i32:
@@ -7507,17 +7498,16 @@ define <4 x i32> @ugt_6_v4i32(<4 x i32> %0) {
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: pxor %xmm0, %xmm0
-; SSE3-NEXT: movdqa %xmm1, %xmm2
-; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE3-NEXT: psadbw %xmm0, %xmm2
-; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT: psadbw %xmm0, %xmm1
-; SSE3-NEXT: packuswb %xmm2, %xmm1
-; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
+; SSE3-NEXT: paddb %xmm1, %xmm0
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT: pxor %xmm1, %xmm1
+; SSE3-NEXT: movdqa %xmm0, %xmm2
+; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT: psadbw %xmm1, %xmm2
+; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT: psadbw %xmm1, %xmm0
+; SSE3-NEXT: packuswb %xmm2, %xmm0
+; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: ugt_6_v4i32:
@@ -7860,17 +7850,16 @@ define <4 x i32> @ugt_7_v4i32(<4 x i32> %0) {
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: psadbw %xmm0, %xmm2
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: psadbw %xmm0, %xmm1
-; SSE2-NEXT: packuswb %xmm2, %xmm1
-; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT: psadbw %xmm1, %xmm2
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: psadbw %xmm1, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: ugt_7_v4i32:
@@ -7887,17 +7876,16 @@ define <4 x i32> @ugt_7_v4i32(<4 x i32> %0) {
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: pxor %xmm0, %xmm0
-; SSE3-NEXT: movdqa %xmm1, %xmm2
-; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE3-NEXT: psadbw %xmm0, %xmm2
-; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT: psadbw %xmm0, %xmm1
-; SSE3-NEXT: packuswb %xmm2, %xmm1
-; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
+; SSE3-NEXT: paddb %xmm1, %xmm0
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT: pxor %xmm1, %xmm1
+; SSE3-NEXT: movdqa %xmm0, %xmm2
+; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT: psadbw %xmm1, %xmm2
+; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT: psadbw %xmm1, %xmm0
+; SSE3-NEXT: packuswb %xmm2, %xmm0
+; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: ugt_7_v4i32:
@@ -8240,17 +8228,16 @@ define <4 x i32> @ugt_8_v4i32(<4 x i32> %0) {
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: psadbw %xmm0, %xmm2
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: psadbw %xmm0, %xmm1
-; SSE2-NEXT: packuswb %xmm2, %xmm1
-; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT: psadbw %xmm1, %xmm2
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: psadbw %xmm1, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: ugt_8_v4i32:
@@ -8267,17 +8254,16 @@ define <4 x i32> @ugt_8_v4i32(<4 x i32> %0) {
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: pxor %xmm0, %xmm0
-; SSE3-NEXT: movdqa %xmm1, %xmm2
-; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE3-NEXT: psadbw %xmm0, %xmm2
-; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT: psadbw %xmm0, %xmm1
-; SSE3-NEXT: packuswb %xmm2, %xmm1
-; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
+; SSE3-NEXT: paddb %xmm1, %xmm0
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT: pxor %xmm1, %xmm1
+; SSE3-NEXT: movdqa %xmm0, %xmm2
+; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT: psadbw %xmm1, %xmm2
+; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT: psadbw %xmm1, %xmm0
+; SSE3-NEXT: packuswb %xmm2, %xmm0
+; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: ugt_8_v4i32:
@@ -8620,17 +8606,16 @@ define <4 x i32> @ugt_9_v4i32(<4 x i32> %0) {
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: psadbw %xmm0, %xmm2
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: psadbw %xmm0, %xmm1
-; SSE2-NEXT: packuswb %xmm2, %xmm1
-; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT: psadbw %xmm1, %xmm2
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: psadbw %xmm1, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: ugt_9_v4i32:
@@ -8647,17 +8632,16 @@ define <4 x i32> @ugt_9_v4i32(<4 x i32> %0) {
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: pxor %xmm0, %xmm0
-; SSE3-NEXT: movdqa %xmm1, %xmm2
-; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE3-NEXT: psadbw %xmm0, %xmm2
-; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT: psadbw %xmm0, %xmm1
-; SSE3-NEXT: packuswb %xmm2, %xmm1
-; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
+; SSE3-NEXT: paddb %xmm1, %xmm0
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT: pxor %xmm1, %xmm1
+; SSE3-NEXT: movdqa %xmm0, %xmm2
+; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT: psadbw %xmm1, %xmm2
+; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT: psadbw %xmm1, %xmm0
+; SSE3-NEXT: packuswb %xmm2, %xmm0
+; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: ugt_9_v4i32:
@@ -9000,17 +8984,16 @@ define <4 x i32> @ugt_10_v4i32(<4 x i32> %0) {
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: psadbw %xmm0, %xmm2
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: psadbw %xmm0, %xmm1
-; SSE2-NEXT: packuswb %xmm2, %xmm1
-; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT: psadbw %xmm1, %xmm2
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: psadbw %xmm1, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: ugt_10_v4i32:
@@ -9027,17 +9010,16 @@ define <4 x i32> @ugt_10_v4i32(<4 x i32> %0) {
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: pxor %xmm0, %xmm0
-; SSE3-NEXT: movdqa %xmm1, %xmm2
-; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE3-NEXT: psadbw %xmm0, %xmm2
-; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT: psadbw %xmm0, %xmm1
-; SSE3-NEXT: packuswb %xmm2, %xmm1
-; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
+; SSE3-NEXT: paddb %xmm1, %xmm0
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT: pxor %xmm1, %xmm1
+; SSE3-NEXT: movdqa %xmm0, %xmm2
+; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT: psadbw %xmm1, %xmm2
+; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT: psadbw %xmm1, %xmm0
+; SSE3-NEXT: packuswb %xmm2, %xmm0
+; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: ugt_10_v4i32:
@@ -9380,17 +9362,16 @@ define <4 x i32> @ugt_11_v4i32(<4 x i32> %0) {
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: psadbw %xmm0, %xmm2
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: psadbw %xmm0, %xmm1
-; SSE2-NEXT: packuswb %xmm2, %xmm1
-; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT: psadbw %xmm1, %xmm2
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: psadbw %xmm1, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: ugt_11_v4i32:
@@ -9407,17 +9388,16 @@ define <4 x i32> @ugt_11_v4i32(<4 x i32> %0) {
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: pxor %xmm0, %xmm0
-; SSE3-NEXT: movdqa %xmm1, %xmm2
-; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE3-NEXT: psadbw %xmm0, %xmm2
-; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT: psadbw %xmm0, %xmm1
-; SSE3-NEXT: packuswb %xmm2, %xmm1
-; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
+; SSE3-NEXT: paddb %xmm1, %xmm0
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT: pxor %xmm1, %xmm1
+; SSE3-NEXT: movdqa %xmm0, %xmm2
+; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT: psadbw %xmm1, %xmm2
+; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT: psadbw %xmm1, %xmm0
+; SSE3-NEXT: packuswb %xmm2, %xmm0
+; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: ugt_11_v4i32:
@@ -9760,17 +9740,16 @@ define <4 x i32> @ugt_12_v4i32(<4 x i32> %0) {
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: psadbw %xmm0, %xmm2
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: psadbw %xmm0, %xmm1
-; SSE2-NEXT: packuswb %xmm2, %xmm1
-; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT: psadbw %xmm1, %xmm2
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: psadbw %xmm1, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: ugt_12_v4i32:
@@ -9787,17 +9766,16 @@ define <4 x i32> @ugt_12_v4i32(<4 x i32> %0) {
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: pxor %xmm0, %xmm0
-; SSE3-NEXT: movdqa %xmm1, %xmm2
-; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE3-NEXT: psadbw %xmm0, %xmm2
-; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT: psadbw %xmm0, %xmm1
-; SSE3-NEXT: packuswb %xmm2, %xmm1
-; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
+; SSE3-NEXT: paddb %xmm1, %xmm0
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT: pxor %xmm1, %xmm1
+; SSE3-NEXT: movdqa %xmm0, %xmm2
+; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT: psadbw %xmm1, %xmm2
+; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT: psadbw %xmm1, %xmm0
+; SSE3-NEXT: packuswb %xmm2, %xmm0
+; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: ugt_12_v4i32:
@@ -10140,17 +10118,16 @@ define <4 x i32> @ugt_13_v4i32(<4 x i32> %0) {
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: psadbw %xmm0, %xmm2
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: psadbw %xmm0, %xmm1
-; SSE2-NEXT: packuswb %xmm2, %xmm1
-; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT: psadbw %xmm1, %xmm2
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: psadbw %xmm1, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: ugt_13_v4i32:
@@ -10167,17 +10144,16 @@ define <4 x i32> @ugt_13_v4i32(<4 x i32> %0) {
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: pxor %xmm0, %xmm0
-; SSE3-NEXT: movdqa %xmm1, %xmm2
-; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE3-NEXT: psadbw %xmm0, %xmm2
-; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT: psadbw %xmm0, %xmm1
-; SSE3-NEXT: packuswb %xmm2, %xmm1
-; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
+; SSE3-NEXT: paddb %xmm1, %xmm0
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT: pxor %xmm1, %xmm1
+; SSE3-NEXT: movdqa %xmm0, %xmm2
+; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT: psadbw %xmm1, %xmm2
+; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT: psadbw %xmm1, %xmm0
+; SSE3-NEXT: packuswb %xmm2, %xmm0
+; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: ugt_13_v4i32:
@@ -10520,17 +10496,16 @@ define <4 x i32> @ugt_14_v4i32(<4 x i32> %0) {
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: psadbw %xmm0, %xmm2
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: psadbw %xmm0, %xmm1
-; SSE2-NEXT: packuswb %xmm2, %xmm1
-; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT: psadbw %xmm1, %xmm2
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: psadbw %xmm1, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: ugt_14_v4i32:
@@ -10547,17 +10522,16 @@ define <4 x i32> @ugt_14_v4i32(<4 x i32> %0) {
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: pxor %xmm0, %xmm0
-; SSE3-NEXT: movdqa %xmm1, %xmm2
-; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE3-NEXT: psadbw %xmm0, %xmm2
-; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT: psadbw %xmm0, %xmm1
-; SSE3-NEXT: packuswb %xmm2, %xmm1
-; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
+; SSE3-NEXT: paddb %xmm1, %xmm0
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT: pxor %xmm1, %xmm1
+; SSE3-NEXT: movdqa %xmm0, %xmm2
+; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT: psadbw %xmm1, %xmm2
+; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT: psadbw %xmm1, %xmm0
+; SSE3-NEXT: packuswb %xmm2, %xmm0
+; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: ugt_14_v4i32:
@@ -10900,17 +10874,16 @@ define <4 x i32> @ugt_15_v4i32(<4 x i32> %0) {
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: psadbw %xmm0, %xmm2
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: psadbw %xmm0, %xmm1
-; SSE2-NEXT: packuswb %xmm2, %xmm1
-; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT: psadbw %xmm1, %xmm2
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: psadbw %xmm1, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: ugt_15_v4i32:
@@ -10927,17 +10900,16 @@ define <4 x i32> @ugt_15_v4i32(<4 x i32> %0) {
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: pxor %xmm0, %xmm0
-; SSE3-NEXT: movdqa %xmm1, %xmm2
-; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE3-NEXT: psadbw %xmm0, %xmm2
-; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT: psadbw %xmm0, %xmm1
-; SSE3-NEXT: packuswb %xmm2, %xmm1
-; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
+; SSE3-NEXT: paddb %xmm1, %xmm0
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT: pxor %xmm1, %xmm1
+; SSE3-NEXT: movdqa %xmm0, %xmm2
+; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT: psadbw %xmm1, %xmm2
+; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT: psadbw %xmm1, %xmm0
+; SSE3-NEXT: packuswb %xmm2, %xmm0
+; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: ugt_15_v4i32:
@@ -11280,17 +11252,16 @@ define <4 x i32> @ugt_16_v4i32(<4 x i32> %0) {
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: psadbw %xmm0, %xmm2
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: psadbw %xmm0, %xmm1
-; SSE2-NEXT: packuswb %xmm2, %xmm1
-; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT: psadbw %xmm1, %xmm2
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: psadbw %xmm1, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: ugt_16_v4i32:
@@ -11307,17 +11278,16 @@ define <4 x i32> @ugt_16_v4i32(<4 x i32> %0) {
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: pxor %xmm0, %xmm0
-; SSE3-NEXT: movdqa %xmm1, %xmm2
-; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE3-NEXT: psadbw %xmm0, %xmm2
-; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT: psadbw %xmm0, %xmm1
-; SSE3-NEXT: packuswb %xmm2, %xmm1
-; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
+; SSE3-NEXT: paddb %xmm1, %xmm0
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT: pxor %xmm1, %xmm1
+; SSE3-NEXT: movdqa %xmm0, %xmm2
+; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT: psadbw %xmm1, %xmm2
+; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT: psadbw %xmm1, %xmm0
+; SSE3-NEXT: packuswb %xmm2, %xmm0
+; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: ugt_16_v4i32:
@@ -11660,17 +11630,16 @@ define <4 x i32> @ugt_17_v4i32(<4 x i32> %0) {
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: psadbw %xmm0, %xmm2
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: psadbw %xmm0, %xmm1
-; SSE2-NEXT: packuswb %xmm2, %xmm1
-; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT: psadbw %xmm1, %xmm2
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: psadbw %xmm1, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: ugt_17_v4i32:
@@ -11687,17 +11656,16 @@ define <4 x i32> @ugt_17_v4i32(<4 x i32> %0) {
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: pxor %xmm0, %xmm0
-; SSE3-NEXT: movdqa %xmm1, %xmm2
-; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE3-NEXT: psadbw %xmm0, %xmm2
-; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT: psadbw %xmm0, %xmm1
-; SSE3-NEXT: packuswb %xmm2, %xmm1
-; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
+; SSE3-NEXT: paddb %xmm1, %xmm0
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT: pxor %xmm1, %xmm1
+; SSE3-NEXT: movdqa %xmm0, %xmm2
+; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT: psadbw %xmm1, %xmm2
+; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT: psadbw %xmm1, %xmm0
+; SSE3-NEXT: packuswb %xmm2, %xmm0
+; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: ugt_17_v4i32:
@@ -12040,17 +12008,16 @@ define <4 x i32> @ugt_18_v4i32(<4 x i32> %0) {
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: psadbw %xmm0, %xmm2
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: psadbw %xmm0, %xmm1
-; SSE2-NEXT: packuswb %xmm2, %xmm1
-; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT: psadbw %xmm1, %xmm2
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: psadbw %xmm1, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: ugt_18_v4i32:
@@ -12067,17 +12034,16 @@ define <4 x i32> @ugt_18_v4i32(<4 x i32> %0) {
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: pxor %xmm0, %xmm0
-; SSE3-NEXT: movdqa %xmm1, %xmm2
-; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE3-NEXT: psadbw %xmm0, %xmm2
-; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT: psadbw %xmm0, %xmm1
-; SSE3-NEXT: packuswb %xmm2, %xmm1
-; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
+; SSE3-NEXT: paddb %xmm1, %xmm0
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT: pxor %xmm1, %xmm1
+; SSE3-NEXT: movdqa %xmm0, %xmm2
+; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT: psadbw %xmm1, %xmm2
+; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT: psadbw %xmm1, %xmm0
+; SSE3-NEXT: packuswb %xmm2, %xmm0
+; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: ugt_18_v4i32:
@@ -12420,17 +12386,16 @@ define <4 x i32> @ugt_19_v4i32(<4 x i32> %0) {
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: psadbw %xmm0, %xmm2
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: psadbw %xmm0, %xmm1
-; SSE2-NEXT: packuswb %xmm2, %xmm1
-; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT: psadbw %xmm1, %xmm2
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: psadbw %xmm1, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: ugt_19_v4i32:
@@ -12447,17 +12412,16 @@ define <4 x i32> @ugt_19_v4i32(<4 x i32> %0) {
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: pxor %xmm0, %xmm0
-; SSE3-NEXT: movdqa %xmm1, %xmm2
-; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE3-NEXT: psadbw %xmm0, %xmm2
-; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT: psadbw %xmm0, %xmm1
-; SSE3-NEXT: packuswb %xmm2, %xmm1
-; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
+; SSE3-NEXT: paddb %xmm1, %xmm0
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT: pxor %xmm1, %xmm1
+; SSE3-NEXT: movdqa %xmm0, %xmm2
+; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT: psadbw %xmm1, %xmm2
+; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT: psadbw %xmm1, %xmm0
+; SSE3-NEXT: packuswb %xmm2, %xmm0
+; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: ugt_19_v4i32:
@@ -12800,17 +12764,16 @@ define <4 x i32> @ugt_20_v4i32(<4 x i32> %0) {
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: psadbw %xmm0, %xmm2
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: psadbw %xmm0, %xmm1
-; SSE2-NEXT: packuswb %xmm2, %xmm1
-; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT: psadbw %xmm1, %xmm2
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: psadbw %xmm1, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: ugt_20_v4i32:
@@ -12827,17 +12790,16 @@ define <4 x i32> @ugt_20_v4i32(<4 x i32> %0) {
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: pxor %xmm0, %xmm0
-; SSE3-NEXT: movdqa %xmm1, %xmm2
-; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE3-NEXT: psadbw %xmm0, %xmm2
-; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT: psadbw %xmm0, %xmm1
-; SSE3-NEXT: packuswb %xmm2, %xmm1
-; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
+; SSE3-NEXT: paddb %xmm1, %xmm0
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT: pxor %xmm1, %xmm1
+; SSE3-NEXT: movdqa %xmm0, %xmm2
+; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT: psadbw %xmm1, %xmm2
+; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT: psadbw %xmm1, %xmm0
+; SSE3-NEXT: packuswb %xmm2, %xmm0
+; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: ugt_20_v4i32:
@@ -13180,17 +13142,16 @@ define <4 x i32> @ugt_21_v4i32(<4 x i32> %0) {
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: psadbw %xmm0, %xmm2
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: psadbw %xmm0, %xmm1
-; SSE2-NEXT: packuswb %xmm2, %xmm1
-; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT: psadbw %xmm1, %xmm2
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: psadbw %xmm1, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: ugt_21_v4i32:
@@ -13207,17 +13168,16 @@ define <4 x i32> @ugt_21_v4i32(<4 x i32> %0) {
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: pxor %xmm0, %xmm0
-; SSE3-NEXT: movdqa %xmm1, %xmm2
-; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE3-NEXT: psadbw %xmm0, %xmm2
-; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT: psadbw %xmm0, %xmm1
-; SSE3-NEXT: packuswb %xmm2, %xmm1
-; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
+; SSE3-NEXT: paddb %xmm1, %xmm0
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT: pxor %xmm1, %xmm1
+; SSE3-NEXT: movdqa %xmm0, %xmm2
+; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT: psadbw %xmm1, %xmm2
+; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT: psadbw %xmm1, %xmm0
+; SSE3-NEXT: packuswb %xmm2, %xmm0
+; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: ugt_21_v4i32:
@@ -13560,17 +13520,16 @@ define <4 x i32> @ugt_22_v4i32(<4 x i32> %0) {
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: psadbw %xmm0, %xmm2
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: psadbw %xmm0, %xmm1
-; SSE2-NEXT: packuswb %xmm2, %xmm1
-; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT: psadbw %xmm1, %xmm2
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: psadbw %xmm1, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: ugt_22_v4i32:
@@ -13587,17 +13546,16 @@ define <4 x i32> @ugt_22_v4i32(<4 x i32> %0) {
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: pxor %xmm0, %xmm0
-; SSE3-NEXT: movdqa %xmm1, %xmm2
-; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE3-NEXT: psadbw %xmm0, %xmm2
-; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT: psadbw %xmm0, %xmm1
-; SSE3-NEXT: packuswb %xmm2, %xmm1
-; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
+; SSE3-NEXT: paddb %xmm1, %xmm0
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT: pxor %xmm1, %xmm1
+; SSE3-NEXT: movdqa %xmm0, %xmm2
+; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT: psadbw %xmm1, %xmm2
+; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT: psadbw %xmm1, %xmm0
+; SSE3-NEXT: packuswb %xmm2, %xmm0
+; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: ugt_22_v4i32:
@@ -13940,17 +13898,16 @@ define <4 x i32> @ugt_23_v4i32(<4 x i32> %0) {
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: psadbw %xmm0, %xmm2
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: psadbw %xmm0, %xmm1
-; SSE2-NEXT: packuswb %xmm2, %xmm1
-; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT: psadbw %xmm1, %xmm2
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: psadbw %xmm1, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: ugt_23_v4i32:
@@ -13967,17 +13924,16 @@ define <4 x i32> @ugt_23_v4i32(<4 x i32> %0) {
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: pxor %xmm0, %xmm0
-; SSE3-NEXT: movdqa %xmm1, %xmm2
-; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE3-NEXT: psadbw %xmm0, %xmm2
-; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT: psadbw %xmm0, %xmm1
-; SSE3-NEXT: packuswb %xmm2, %xmm1
-; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
+; SSE3-NEXT: paddb %xmm1, %xmm0
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT: pxor %xmm1, %xmm1
+; SSE3-NEXT: movdqa %xmm0, %xmm2
+; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT: psadbw %xmm1, %xmm2
+; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT: psadbw %xmm1, %xmm0
+; SSE3-NEXT: packuswb %xmm2, %xmm0
+; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: ugt_23_v4i32:
@@ -14320,17 +14276,16 @@ define <4 x i32> @ugt_24_v4i32(<4 x i32> %0) {
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: psadbw %xmm0, %xmm2
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: psadbw %xmm0, %xmm1
-; SSE2-NEXT: packuswb %xmm2, %xmm1
-; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT: psadbw %xmm1, %xmm2
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: psadbw %xmm1, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: ugt_24_v4i32:
@@ -14347,17 +14302,16 @@ define <4 x i32> @ugt_24_v4i32(<4 x i32> %0) {
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: pxor %xmm0, %xmm0
-; SSE3-NEXT: movdqa %xmm1, %xmm2
-; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE3-NEXT: psadbw %xmm0, %xmm2
-; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT: psadbw %xmm0, %xmm1
-; SSE3-NEXT: packuswb %xmm2, %xmm1
-; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
+; SSE3-NEXT: paddb %xmm1, %xmm0
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT: pxor %xmm1, %xmm1
+; SSE3-NEXT: movdqa %xmm0, %xmm2
+; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT: psadbw %xmm1, %xmm2
+; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT: psadbw %xmm1, %xmm0
+; SSE3-NEXT: packuswb %xmm2, %xmm0
+; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: ugt_24_v4i32:
@@ -14700,17 +14654,16 @@ define <4 x i32> @ugt_25_v4i32(<4 x i32> %0) {
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: psadbw %xmm0, %xmm2
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: psadbw %xmm0, %xmm1
-; SSE2-NEXT: packuswb %xmm2, %xmm1
-; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT: psadbw %xmm1, %xmm2
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: psadbw %xmm1, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: ugt_25_v4i32:
@@ -14727,17 +14680,16 @@ define <4 x i32> @ugt_25_v4i32(<4 x i32> %0) {
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: pxor %xmm0, %xmm0
-; SSE3-NEXT: movdqa %xmm1, %xmm2
-; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE3-NEXT: psadbw %xmm0, %xmm2
-; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT: psadbw %xmm0, %xmm1
-; SSE3-NEXT: packuswb %xmm2, %xmm1
-; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
+; SSE3-NEXT: paddb %xmm1, %xmm0
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT: pxor %xmm1, %xmm1
+; SSE3-NEXT: movdqa %xmm0, %xmm2
+; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT: psadbw %xmm1, %xmm2
+; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT: psadbw %xmm1, %xmm0
+; SSE3-NEXT: packuswb %xmm2, %xmm0
+; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: ugt_25_v4i32:
@@ -15080,17 +15032,16 @@ define <4 x i32> @ugt_26_v4i32(<4 x i32> %0) {
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: psadbw %xmm0, %xmm2
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: psadbw %xmm0, %xmm1
-; SSE2-NEXT: packuswb %xmm2, %xmm1
-; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT: psadbw %xmm1, %xmm2
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: psadbw %xmm1, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: ugt_26_v4i32:
@@ -15107,17 +15058,16 @@ define <4 x i32> @ugt_26_v4i32(<4 x i32> %0) {
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: pxor %xmm0, %xmm0
-; SSE3-NEXT: movdqa %xmm1, %xmm2
-; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE3-NEXT: psadbw %xmm0, %xmm2
-; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT: psadbw %xmm0, %xmm1
-; SSE3-NEXT: packuswb %xmm2, %xmm1
-; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
+; SSE3-NEXT: paddb %xmm1, %xmm0
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT: pxor %xmm1, %xmm1
+; SSE3-NEXT: movdqa %xmm0, %xmm2
+; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT: psadbw %xmm1, %xmm2
+; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT: psadbw %xmm1, %xmm0
+; SSE3-NEXT: packuswb %xmm2, %xmm0
+; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: ugt_26_v4i32:
@@ -15460,17 +15410,16 @@ define <4 x i32> @ugt_27_v4i32(<4 x i32> %0) {
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: psadbw %xmm0, %xmm2
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: psadbw %xmm0, %xmm1
-; SSE2-NEXT: packuswb %xmm2, %xmm1
-; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT: psadbw %xmm1, %xmm2
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: psadbw %xmm1, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: ugt_27_v4i32:
@@ -15487,17 +15436,16 @@ define <4 x i32> @ugt_27_v4i32(<4 x i32> %0) {
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: pxor %xmm0, %xmm0
-; SSE3-NEXT: movdqa %xmm1, %xmm2
-; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE3-NEXT: psadbw %xmm0, %xmm2
-; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT: psadbw %xmm0, %xmm1
-; SSE3-NEXT: packuswb %xmm2, %xmm1
-; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
+; SSE3-NEXT: paddb %xmm1, %xmm0
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT: pxor %xmm1, %xmm1
+; SSE3-NEXT: movdqa %xmm0, %xmm2
+; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT: psadbw %xmm1, %xmm2
+; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT: psadbw %xmm1, %xmm0
+; SSE3-NEXT: packuswb %xmm2, %xmm0
+; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: ugt_27_v4i32:
@@ -15840,17 +15788,16 @@ define <4 x i32> @ugt_28_v4i32(<4 x i32> %0) {
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: psadbw %xmm0, %xmm2
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: psadbw %xmm0, %xmm1
-; SSE2-NEXT: packuswb %xmm2, %xmm1
-; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT: psadbw %xmm1, %xmm2
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: psadbw %xmm1, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: ugt_28_v4i32:
@@ -15867,17 +15814,16 @@ define <4 x i32> @ugt_28_v4i32(<4 x i32> %0) {
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: pxor %xmm0, %xmm0
-; SSE3-NEXT: movdqa %xmm1, %xmm2
-; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE3-NEXT: psadbw %xmm0, %xmm2
-; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT: psadbw %xmm0, %xmm1
-; SSE3-NEXT: packuswb %xmm2, %xmm1
-; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
+; SSE3-NEXT: paddb %xmm1, %xmm0
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT: pxor %xmm1, %xmm1
+; SSE3-NEXT: movdqa %xmm0, %xmm2
+; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT: psadbw %xmm1, %xmm2
+; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT: psadbw %xmm1, %xmm0
+; SSE3-NEXT: packuswb %xmm2, %xmm0
+; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: ugt_28_v4i32:
@@ -16220,17 +16166,16 @@ define <4 x i32> @ugt_29_v4i32(<4 x i32> %0) {
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: psadbw %xmm0, %xmm2
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: psadbw %xmm0, %xmm1
-; SSE2-NEXT: packuswb %xmm2, %xmm1
-; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT: psadbw %xmm1, %xmm2
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: psadbw %xmm1, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: ugt_29_v4i32:
@@ -16247,17 +16192,16 @@ define <4 x i32> @ugt_29_v4i32(<4 x i32> %0) {
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: pxor %xmm0, %xmm0
-; SSE3-NEXT: movdqa %xmm1, %xmm2
-; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE3-NEXT: psadbw %xmm0, %xmm2
-; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT: psadbw %xmm0, %xmm1
-; SSE3-NEXT: packuswb %xmm2, %xmm1
-; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
+; SSE3-NEXT: paddb %xmm1, %xmm0
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT: pxor %xmm1, %xmm1
+; SSE3-NEXT: movdqa %xmm0, %xmm2
+; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT: psadbw %xmm1, %xmm2
+; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT: psadbw %xmm1, %xmm0
+; SSE3-NEXT: packuswb %xmm2, %xmm0
+; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: ugt_29_v4i32:
@@ -16600,17 +16544,16 @@ define <4 x i32> @ugt_30_v4i32(<4 x i32> %0) {
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: psadbw %xmm0, %xmm2
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: psadbw %xmm0, %xmm1
-; SSE2-NEXT: packuswb %xmm2, %xmm1
-; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT: psadbw %xmm1, %xmm2
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: psadbw %xmm1, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: ugt_30_v4i32:
@@ -16627,17 +16570,16 @@ define <4 x i32> @ugt_30_v4i32(<4 x i32> %0) {
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: pxor %xmm0, %xmm0
-; SSE3-NEXT: movdqa %xmm1, %xmm2
-; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE3-NEXT: psadbw %xmm0, %xmm2
-; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT: psadbw %xmm0, %xmm1
-; SSE3-NEXT: packuswb %xmm2, %xmm1
-; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
+; SSE3-NEXT: paddb %xmm1, %xmm0
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT: pxor %xmm1, %xmm1
+; SSE3-NEXT: movdqa %xmm0, %xmm2
+; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT: psadbw %xmm1, %xmm2
+; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT: psadbw %xmm1, %xmm0
+; SSE3-NEXT: packuswb %xmm2, %xmm0
+; SSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: ugt_30_v4i32:
@@ -16971,11 +16913,11 @@ define <2 x i64> @ugt_1_v2i64(<2 x i64> %0) {
; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: paddq %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,0,3,2]
-; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,0,3,2]
+; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm0
; SSE2-NEXT: retq
;
@@ -16984,11 +16926,11 @@ define <2 x i64> @ugt_1_v2i64(<2 x i64> %0) {
; SSE3-NEXT: pcmpeqd %xmm1, %xmm1
; SSE3-NEXT: movdqa %xmm0, %xmm2
; SSE3-NEXT: paddq %xmm1, %xmm2
-; SSE3-NEXT: pand %xmm0, %xmm2
-; SSE3-NEXT: pxor %xmm3, %xmm3
-; SSE3-NEXT: pcmpeqd %xmm2, %xmm3
-; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,0,3,2]
-; SSE3-NEXT: pand %xmm3, %xmm0
+; SSE3-NEXT: pand %xmm2, %xmm0
+; SSE3-NEXT: pxor %xmm2, %xmm2
+; SSE3-NEXT: pcmpeqd %xmm2, %xmm0
+; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,0,3,2]
+; SSE3-NEXT: pand %xmm2, %xmm0
; SSE3-NEXT: pxor %xmm1, %xmm0
; SSE3-NEXT: retq
;
@@ -16997,11 +16939,11 @@ define <2 x i64> @ugt_1_v2i64(<2 x i64> %0) {
; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: paddq %xmm1, %xmm2
-; SSSE3-NEXT: pand %xmm0, %xmm2
-; SSSE3-NEXT: pxor %xmm3, %xmm3
-; SSSE3-NEXT: pcmpeqd %xmm2, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,0,3,2]
-; SSSE3-NEXT: pand %xmm3, %xmm0
+; SSSE3-NEXT: pand %xmm2, %xmm0
+; SSSE3-NEXT: pxor %xmm2, %xmm2
+; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,0,3,2]
+; SSSE3-NEXT: pand %xmm2, %xmm0
; SSSE3-NEXT: pxor %xmm1, %xmm0
; SSSE3-NEXT: retq
;
@@ -17084,33 +17026,33 @@ define <2 x i64> @ult_2_v2i64(<2 x i64> %0) {
; SSE2: # %bb.0:
; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
; SSE2-NEXT: paddq %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
-; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
+; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: ult_2_v2i64:
; SSE3: # %bb.0:
; SSE3-NEXT: pcmpeqd %xmm1, %xmm1
; SSE3-NEXT: paddq %xmm0, %xmm1
-; SSE3-NEXT: pand %xmm0, %xmm1
-; SSE3-NEXT: pxor %xmm2, %xmm2
-; SSE3-NEXT: pcmpeqd %xmm1, %xmm2
-; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
-; SSE3-NEXT: pand %xmm2, %xmm0
+; SSE3-NEXT: pand %xmm1, %xmm0
+; SSE3-NEXT: pxor %xmm1, %xmm1
+; SSE3-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
+; SSE3-NEXT: pand %xmm1, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: ult_2_v2i64:
; SSSE3: # %bb.0:
; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1
; SSSE3-NEXT: paddq %xmm0, %xmm1
-; SSSE3-NEXT: pand %xmm0, %xmm1
-; SSSE3-NEXT: pxor %xmm2, %xmm2
-; SSSE3-NEXT: pcmpeqd %xmm1, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
-; SSSE3-NEXT: pand %xmm2, %xmm0
+; SSSE3-NEXT: pand %xmm1, %xmm0
+; SSSE3-NEXT: pxor %xmm1, %xmm1
+; SSSE3-NEXT: pcmpeqd %xmm1, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
+; SSSE3-NEXT: pand %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: ult_2_v2i64:
diff --git a/llvm/test/CodeGen/X86/vector-popcnt-128.ll b/llvm/test/CodeGen/X86/vector-popcnt-128.ll
index 70aae67b755a4..ac7d4d6cb4445 100644
--- a/llvm/test/CodeGen/X86/vector-popcnt-128.ll
+++ b/llvm/test/CodeGen/X86/vector-popcnt-128.ll
@@ -158,16 +158,15 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: psadbw %xmm0, %xmm2
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: psadbw %xmm0, %xmm1
-; SSE2-NEXT: packuswb %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT: psadbw %xmm1, %xmm2
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: psadbw %xmm1, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: testv4i32:
@@ -184,16 +183,15 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: pxor %xmm0, %xmm0
-; SSE3-NEXT: movdqa %xmm1, %xmm2
-; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE3-NEXT: psadbw %xmm0, %xmm2
-; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT: psadbw %xmm0, %xmm1
-; SSE3-NEXT: packuswb %xmm2, %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
+; SSE3-NEXT: paddb %xmm1, %xmm0
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT: pxor %xmm1, %xmm1
+; SSE3-NEXT: movdqa %xmm0, %xmm2
+; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT: psadbw %xmm1, %xmm2
+; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT: psadbw %xmm1, %xmm0
+; SSE3-NEXT: packuswb %xmm2, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: testv4i32:
@@ -329,10 +327,10 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psllw $8, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psllw $8, %xmm1
; SSE2-NEXT: paddb %xmm1, %xmm0
; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: retq
@@ -351,10 +349,10 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
-; SSE3-NEXT: psllw $8, %xmm0
+; SSE3-NEXT: paddb %xmm1, %xmm0
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT: movdqa %xmm0, %xmm1
+; SSE3-NEXT: psllw $8, %xmm1
; SSE3-NEXT: paddb %xmm1, %xmm0
; SSE3-NEXT: psrlw $8, %xmm0
; SSE3-NEXT: retq
@@ -804,52 +802,55 @@ define <2 x i64> @ne_1_v2i64(<2 x i64> %0) {
; SSE2-LABEL: ne_1_v2i64:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
+; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: paddq %xmm2, %xmm3
-; SSE2-NEXT: pand %xmm0, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: paddq %xmm2, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: pcmpeqd %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,0,3,2]
-; SSE2-NEXT: pand %xmm0, %xmm4
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,0,3,2]
-; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
+; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: ne_1_v2i64:
; SSE3: # %bb.0:
; SSE3-NEXT: pxor %xmm1, %xmm1
+; SSE3-NEXT: movdqa %xmm0, %xmm2
+; SSE3-NEXT: pcmpeqd %xmm1, %xmm2
+; SSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
+; SSE3-NEXT: pand %xmm2, %xmm3
; SSE3-NEXT: pcmpeqd %xmm2, %xmm2
-; SSE3-NEXT: movdqa %xmm0, %xmm3
-; SSE3-NEXT: paddq %xmm2, %xmm3
-; SSE3-NEXT: pand %xmm0, %xmm3
+; SSE3-NEXT: movdqa %xmm0, %xmm4
+; SSE3-NEXT: paddq %xmm2, %xmm4
+; SSE3-NEXT: pand %xmm4, %xmm0
; SSE3-NEXT: pcmpeqd %xmm1, %xmm0
-; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,0,3,2]
-; SSE3-NEXT: pand %xmm0, %xmm4
-; SSE3-NEXT: pcmpeqd %xmm1, %xmm3
-; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,0,3,2]
-; SSE3-NEXT: pand %xmm3, %xmm0
+; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
+; SSE3-NEXT: pand %xmm1, %xmm0
; SSE3-NEXT: pxor %xmm2, %xmm0
-; SSE3-NEXT: por %xmm4, %xmm0
+; SSE3-NEXT: por %xmm3, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: ne_1_v2i64:
; SSSE3: # %bb.0:
; SSSE3-NEXT: pxor %xmm1, %xmm1
+; SSSE3-NEXT: movdqa %xmm0, %xmm2
+; SSSE3-NEXT: pcmpeqd %xmm1, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
+; SSSE3-NEXT: pand %xmm2, %xmm3
; SSSE3-NEXT: pcmpeqd %xmm2, %xmm2
-; SSSE3-NEXT: movdqa %xmm0, %xmm3
-; SSSE3-NEXT: paddq %xmm2, %xmm3
-; SSSE3-NEXT: pand %xmm0, %xmm3
+; SSSE3-NEXT: movdqa %xmm0, %xmm4
+; SSSE3-NEXT: paddq %xmm2, %xmm4
+; SSSE3-NEXT: pand %xmm4, %xmm0
; SSSE3-NEXT: pcmpeqd %xmm1, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,0,3,2]
-; SSSE3-NEXT: pand %xmm0, %xmm4
-; SSSE3-NEXT: pcmpeqd %xmm1, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,0,3,2]
-; SSSE3-NEXT: pand %xmm3, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
+; SSSE3-NEXT: pand %xmm1, %xmm0
; SSSE3-NEXT: pxor %xmm2, %xmm0
-; SSSE3-NEXT: por %xmm4, %xmm0
+; SSSE3-NEXT: por %xmm3, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: ne_1_v2i64:
diff --git a/llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll b/llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll
index e8738ec933821..254d8ede61b19 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll
@@ -321,9 +321,9 @@ define float @test_v4f32_zero(<4 x float> %a0) {
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT: addps %xmm0, %xmm1
-; SSE2-NEXT: movaps %xmm1, %xmm0
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
+; SSE2-NEXT: addps %xmm1, %xmm0
+; SSE2-NEXT: movaps %xmm0, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: retq
;
@@ -331,8 +331,8 @@ define float @test_v4f32_zero(<4 x float> %a0) {
; SSE41: # %bb.0:
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE41-NEXT: addps %xmm0, %xmm1
-; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT: addps %xmm1, %xmm0
+; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT: addss %xmm1, %xmm0
; SSE41-NEXT: retq
;
@@ -375,9 +375,9 @@ define float @test_v8f32_zero(<8 x float> %a0) {
; SSE2-NEXT: addps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT: addps %xmm0, %xmm1
-; SSE2-NEXT: movaps %xmm1, %xmm0
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
+; SSE2-NEXT: addps %xmm1, %xmm0
+; SSE2-NEXT: movaps %xmm0, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: retq
;
@@ -386,8 +386,8 @@ define float @test_v8f32_zero(<8 x float> %a0) {
; SSE41-NEXT: addps %xmm1, %xmm0
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE41-NEXT: addps %xmm0, %xmm1
-; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT: addps %xmm1, %xmm0
+; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT: addss %xmm1, %xmm0
; SSE41-NEXT: retq
;
@@ -444,9 +444,9 @@ define float @test_v16f32_zero(<16 x float> %a0) {
; SSE2-NEXT: addps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT: addps %xmm0, %xmm1
-; SSE2-NEXT: movaps %xmm1, %xmm0
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
+; SSE2-NEXT: addps %xmm1, %xmm0
+; SSE2-NEXT: movaps %xmm0, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: retq
;
@@ -457,8 +457,8 @@ define float @test_v16f32_zero(<16 x float> %a0) {
; SSE41-NEXT: addps %xmm1, %xmm0
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE41-NEXT: addps %xmm0, %xmm1
-; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT: addps %xmm1, %xmm0
+; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT: addss %xmm1, %xmm0
; SSE41-NEXT: retq
;
@@ -562,9 +562,9 @@ define float @test_v4f32_undef(<4 x float> %a0) {
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT: addps %xmm0, %xmm1
-; SSE2-NEXT: movaps %xmm1, %xmm0
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
+; SSE2-NEXT: addps %xmm1, %xmm0
+; SSE2-NEXT: movaps %xmm0, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: retq
;
@@ -572,8 +572,8 @@ define float @test_v4f32_undef(<4 x float> %a0) {
; SSE41: # %bb.0:
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE41-NEXT: addps %xmm0, %xmm1
-; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT: addps %xmm1, %xmm0
+; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT: addss %xmm1, %xmm0
; SSE41-NEXT: retq
;
@@ -616,9 +616,9 @@ define float @test_v8f32_undef(<8 x float> %a0) {
; SSE2-NEXT: addps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT: addps %xmm0, %xmm1
-; SSE2-NEXT: movaps %xmm1, %xmm0
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
+; SSE2-NEXT: addps %xmm1, %xmm0
+; SSE2-NEXT: movaps %xmm0, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: retq
;
@@ -627,8 +627,8 @@ define float @test_v8f32_undef(<8 x float> %a0) {
; SSE41-NEXT: addps %xmm1, %xmm0
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE41-NEXT: addps %xmm0, %xmm1
-; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT: addps %xmm1, %xmm0
+; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT: addss %xmm1, %xmm0
; SSE41-NEXT: retq
;
@@ -685,9 +685,9 @@ define float @test_v16f32_undef(<16 x float> %a0) {
; SSE2-NEXT: addps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT: addps %xmm0, %xmm1
-; SSE2-NEXT: movaps %xmm1, %xmm0
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
+; SSE2-NEXT: addps %xmm1, %xmm0
+; SSE2-NEXT: movaps %xmm0, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: retq
;
@@ -698,8 +698,8 @@ define float @test_v16f32_undef(<16 x float> %a0) {
; SSE41-NEXT: addps %xmm1, %xmm0
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE41-NEXT: addps %xmm0, %xmm1
-; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT: addps %xmm1, %xmm0
+; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT: addss %xmm1, %xmm0
; SSE41-NEXT: retq
;
@@ -1126,9 +1126,9 @@ define double @test_v16f64_zero(<16 x double> %a0) {
; SSE-NEXT: addpd %xmm7, %xmm3
; SSE-NEXT: addpd %xmm5, %xmm1
; SSE-NEXT: addpd %xmm3, %xmm1
-; SSE-NEXT: addpd %xmm0, %xmm1
-; SSE-NEXT: movapd %xmm1, %xmm0
-; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE-NEXT: addpd %xmm1, %xmm0
+; SSE-NEXT: movapd %xmm0, %xmm1
+; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: retq
;
@@ -1330,9 +1330,9 @@ define double @test_v16f64_undef(<16 x double> %a0) {
; SSE-NEXT: addpd %xmm7, %xmm3
; SSE-NEXT: addpd %xmm5, %xmm1
; SSE-NEXT: addpd %xmm3, %xmm1
-; SSE-NEXT: addpd %xmm0, %xmm1
-; SSE-NEXT: movapd %xmm1, %xmm0
-; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE-NEXT: addpd %xmm1, %xmm0
+; SSE-NEXT: movapd %xmm0, %xmm1
+; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmul-fast.ll b/llvm/test/CodeGen/X86/vector-reduce-fmul-fast.ll
index e0b30d1d69fe2..7b17b34519060 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-fmul-fast.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-fmul-fast.ll
@@ -233,9 +233,9 @@ define float @test_v4f32_zero(<4 x float> %a0) {
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT: mulps %xmm0, %xmm1
-; SSE2-NEXT: movaps %xmm1, %xmm0
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
+; SSE2-NEXT: mulps %xmm1, %xmm0
+; SSE2-NEXT: movaps %xmm0, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: retq
;
@@ -243,8 +243,8 @@ define float @test_v4f32_zero(<4 x float> %a0) {
; SSE41: # %bb.0:
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE41-NEXT: mulps %xmm0, %xmm1
-; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT: mulps %xmm1, %xmm0
+; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: retq
;
@@ -273,9 +273,9 @@ define float @test_v8f32_zero(<8 x float> %a0) {
; SSE2-NEXT: mulps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT: mulps %xmm0, %xmm1
-; SSE2-NEXT: movaps %xmm1, %xmm0
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
+; SSE2-NEXT: mulps %xmm1, %xmm0
+; SSE2-NEXT: movaps %xmm0, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: retq
;
@@ -284,8 +284,8 @@ define float @test_v8f32_zero(<8 x float> %a0) {
; SSE41-NEXT: mulps %xmm1, %xmm0
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE41-NEXT: mulps %xmm0, %xmm1
-; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT: mulps %xmm1, %xmm0
+; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: retq
;
@@ -322,9 +322,9 @@ define float @test_v16f32_zero(<16 x float> %a0) {
; SSE2-NEXT: mulps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT: mulps %xmm0, %xmm1
-; SSE2-NEXT: movaps %xmm1, %xmm0
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
+; SSE2-NEXT: mulps %xmm1, %xmm0
+; SSE2-NEXT: movaps %xmm0, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: retq
;
@@ -335,8 +335,8 @@ define float @test_v16f32_zero(<16 x float> %a0) {
; SSE41-NEXT: mulps %xmm1, %xmm0
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE41-NEXT: mulps %xmm0, %xmm1
-; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT: mulps %xmm1, %xmm0
+; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: retq
;
@@ -406,9 +406,9 @@ define float @test_v4f32_undef(<4 x float> %a0) {
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT: mulps %xmm0, %xmm1
-; SSE2-NEXT: movaps %xmm1, %xmm0
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
+; SSE2-NEXT: mulps %xmm1, %xmm0
+; SSE2-NEXT: movaps %xmm0, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: retq
;
@@ -416,8 +416,8 @@ define float @test_v4f32_undef(<4 x float> %a0) {
; SSE41: # %bb.0:
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE41-NEXT: mulps %xmm0, %xmm1
-; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT: mulps %xmm1, %xmm0
+; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: retq
;
@@ -446,9 +446,9 @@ define float @test_v8f32_undef(<8 x float> %a0) {
; SSE2-NEXT: mulps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT: mulps %xmm0, %xmm1
-; SSE2-NEXT: movaps %xmm1, %xmm0
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
+; SSE2-NEXT: mulps %xmm1, %xmm0
+; SSE2-NEXT: movaps %xmm0, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: retq
;
@@ -457,8 +457,8 @@ define float @test_v8f32_undef(<8 x float> %a0) {
; SSE41-NEXT: mulps %xmm1, %xmm0
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE41-NEXT: mulps %xmm0, %xmm1
-; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT: mulps %xmm1, %xmm0
+; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: retq
;
@@ -495,9 +495,9 @@ define float @test_v16f32_undef(<16 x float> %a0) {
; SSE2-NEXT: mulps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT: mulps %xmm0, %xmm1
-; SSE2-NEXT: movaps %xmm1, %xmm0
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
+; SSE2-NEXT: mulps %xmm1, %xmm0
+; SSE2-NEXT: movaps %xmm0, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT: mulss %xmm1, %xmm0
; SSE2-NEXT: retq
;
@@ -508,8 +508,8 @@ define float @test_v16f32_undef(<16 x float> %a0) {
; SSE41-NEXT: mulps %xmm1, %xmm0
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE41-NEXT: mulps %xmm0, %xmm1
-; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT: mulps %xmm1, %xmm0
+; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT: mulss %xmm1, %xmm0
; SSE41-NEXT: retq
;
@@ -788,9 +788,9 @@ define double @test_v16f64_zero(<16 x double> %a0) {
; SSE-NEXT: mulpd %xmm7, %xmm3
; SSE-NEXT: mulpd %xmm5, %xmm1
; SSE-NEXT: mulpd %xmm3, %xmm1
-; SSE-NEXT: mulpd %xmm0, %xmm1
-; SSE-NEXT: movapd %xmm1, %xmm0
-; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE-NEXT: mulpd %xmm1, %xmm0
+; SSE-NEXT: movapd %xmm0, %xmm1
+; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: retq
;
@@ -922,9 +922,9 @@ define double @test_v16f64_undef(<16 x double> %a0) {
; SSE-NEXT: mulpd %xmm7, %xmm3
; SSE-NEXT: mulpd %xmm5, %xmm1
; SSE-NEXT: mulpd %xmm3, %xmm1
-; SSE-NEXT: mulpd %xmm0, %xmm1
-; SSE-NEXT: movapd %xmm1, %xmm0
-; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE-NEXT: mulpd %xmm1, %xmm0
+; SSE-NEXT: movapd %xmm0, %xmm1
+; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
index 0244beb6745c9..3dcc2507ac6b5 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
@@ -257,33 +257,31 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
;
; SSE41-LABEL: var_shift_v8i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: psllw $12, %xmm0
-; SSE41-NEXT: psllw $4, %xmm2
-; SSE41-NEXT: por %xmm0, %xmm2
+; SSE41-NEXT: psllw $4, %xmm1
+; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: paddw %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: paddw %xmm2, %xmm3
-; SSE41-NEXT: movdqa %xmm1, %xmm4
-; SSE41-NEXT: psraw $8, %xmm4
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psraw $4, %xmm2
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psraw $2, %xmm2
-; SSE41-NEXT: paddw %xmm3, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psraw $1, %xmm2
-; SSE41-NEXT: paddw %xmm3, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: psraw $8, %xmm3
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psraw $4, %xmm3
; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psraw $2, %xmm3
+; SSE41-NEXT: paddw %xmm1, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psraw $1, %xmm3
+; SSE41-NEXT: paddw %xmm1, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_shift_v8i16:
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
index aebc0eb0bb528..1ba47a7adbbe7 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
@@ -161,33 +161,31 @@ define <4 x i16> @var_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
;
; SSE41-LABEL: var_shift_v4i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: psllw $12, %xmm0
-; SSE41-NEXT: psllw $4, %xmm2
-; SSE41-NEXT: por %xmm0, %xmm2
+; SSE41-NEXT: psllw $4, %xmm1
+; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: paddw %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: paddw %xmm2, %xmm3
-; SSE41-NEXT: movdqa %xmm1, %xmm4
-; SSE41-NEXT: psraw $8, %xmm4
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psraw $4, %xmm2
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psraw $2, %xmm2
-; SSE41-NEXT: paddw %xmm3, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psraw $1, %xmm2
-; SSE41-NEXT: paddw %xmm3, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: psraw $8, %xmm3
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psraw $4, %xmm3
; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psraw $2, %xmm3
+; SSE41-NEXT: paddw %xmm1, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psraw $1, %xmm3
+; SSE41-NEXT: paddw %xmm1, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_shift_v4i16:
@@ -334,33 +332,31 @@ define <2 x i16> @var_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind {
;
; SSE41-LABEL: var_shift_v2i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: psllw $12, %xmm0
-; SSE41-NEXT: psllw $4, %xmm2
-; SSE41-NEXT: por %xmm0, %xmm2
+; SSE41-NEXT: psllw $4, %xmm1
+; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: paddw %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: paddw %xmm2, %xmm3
-; SSE41-NEXT: movdqa %xmm1, %xmm4
-; SSE41-NEXT: psraw $8, %xmm4
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psraw $4, %xmm2
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psraw $2, %xmm2
-; SSE41-NEXT: paddw %xmm3, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psraw $1, %xmm2
-; SSE41-NEXT: paddw %xmm3, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: psraw $8, %xmm3
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psraw $4, %xmm3
; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psraw $2, %xmm3
+; SSE41-NEXT: paddw %xmm1, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psraw $1, %xmm3
+; SSE41-NEXT: paddw %xmm1, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_shift_v2i16:
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll
index 792cf6a261686..cb9e1341e0967 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll
@@ -227,33 +227,31 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
;
; SSE41-LABEL: var_shift_v8i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: psllw $12, %xmm0
-; SSE41-NEXT: psllw $4, %xmm2
-; SSE41-NEXT: por %xmm0, %xmm2
+; SSE41-NEXT: psllw $4, %xmm1
+; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: paddw %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: paddw %xmm2, %xmm3
-; SSE41-NEXT: movdqa %xmm1, %xmm4
-; SSE41-NEXT: psrlw $8, %xmm4
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psrlw $4, %xmm2
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psrlw $2, %xmm2
-; SSE41-NEXT: paddw %xmm3, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psrlw $1, %xmm2
-; SSE41-NEXT: paddw %xmm3, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: psrlw $8, %xmm3
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psrlw $4, %xmm3
; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psrlw $2, %xmm3
+; SSE41-NEXT: paddw %xmm1, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psrlw $1, %xmm3
+; SSE41-NEXT: paddw %xmm1, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_shift_v8i16:
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
index ac5f158f6f6be..4cc16c584e461 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
@@ -161,33 +161,31 @@ define <4 x i16> @var_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
;
; SSE41-LABEL: var_shift_v4i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: psllw $12, %xmm0
-; SSE41-NEXT: psllw $4, %xmm2
-; SSE41-NEXT: por %xmm0, %xmm2
+; SSE41-NEXT: psllw $4, %xmm1
+; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: paddw %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: paddw %xmm2, %xmm3
-; SSE41-NEXT: movdqa %xmm1, %xmm4
-; SSE41-NEXT: psrlw $8, %xmm4
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psrlw $4, %xmm2
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psrlw $2, %xmm2
-; SSE41-NEXT: paddw %xmm3, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psrlw $1, %xmm2
-; SSE41-NEXT: paddw %xmm3, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: psrlw $8, %xmm3
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psrlw $4, %xmm3
; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psrlw $2, %xmm3
+; SSE41-NEXT: paddw %xmm1, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psrlw $1, %xmm3
+; SSE41-NEXT: paddw %xmm1, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_shift_v4i16:
@@ -334,33 +332,31 @@ define <2 x i16> @var_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind {
;
; SSE41-LABEL: var_shift_v2i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: psllw $12, %xmm0
-; SSE41-NEXT: psllw $4, %xmm2
-; SSE41-NEXT: por %xmm0, %xmm2
+; SSE41-NEXT: psllw $4, %xmm1
+; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: paddw %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: paddw %xmm2, %xmm3
-; SSE41-NEXT: movdqa %xmm1, %xmm4
-; SSE41-NEXT: psrlw $8, %xmm4
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psrlw $4, %xmm2
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psrlw $2, %xmm2
-; SSE41-NEXT: paddw %xmm3, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psrlw $1, %xmm2
-; SSE41-NEXT: paddw %xmm3, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: psrlw $8, %xmm3
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psrlw $4, %xmm3
; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psrlw $2, %xmm3
+; SSE41-NEXT: paddw %xmm1, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psrlw $1, %xmm3
+; SSE41-NEXT: paddw %xmm1, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_shift_v2i16:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
index 442e7c4d373f0..b2d813dd440a6 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
@@ -23,18 +23,18 @@ define <32 x i16> @combine_vpermt2var_32i16_identity_mask(<32 x i16> %x0, <32 x
; X86: # %bb.0:
; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1
-; X86-NEXT: vpermi2w %zmm0, %zmm0, %zmm1 {%k1} {z}
-; X86-NEXT: vmovdqa64 {{.*#+}} zmm0 = [63,30,61,28,59,26,57,24,55,22,53,20,51,18,49,16,47,46,13,44,11,42,9,40,7,38,5,36,3,34,1,32]
-; X86-NEXT: vpermi2w %zmm1, %zmm1, %zmm0 {%k1} {z}
+; X86-NEXT: vpermt2w %zmm0, %zmm1, %zmm0 {%k1} {z}
+; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [63,30,61,28,59,26,57,24,55,22,53,20,51,18,49,16,47,46,13,44,11,42,9,40,7,38,5,36,3,34,1,32]
+; X86-NEXT: vpermt2w %zmm0, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: combine_vpermt2var_32i16_identity_mask:
; X64: # %bb.0:
; X64-NEXT: vmovdqa64 {{.*#+}} zmm1 = [31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X64-NEXT: kmovd %edi, %k1
-; X64-NEXT: vpermi2w %zmm0, %zmm0, %zmm1 {%k1} {z}
-; X64-NEXT: vmovdqa64 {{.*#+}} zmm0 = [63,30,61,28,59,26,57,24,55,22,53,20,51,18,49,16,47,46,13,44,11,42,9,40,7,38,5,36,3,34,1,32]
-; X64-NEXT: vpermi2w %zmm1, %zmm1, %zmm0 {%k1} {z}
+; X64-NEXT: vpermt2w %zmm0, %zmm1, %zmm0 {%k1} {z}
+; X64-NEXT: vmovdqa64 {{.*#+}} zmm1 = [63,30,61,28,59,26,57,24,55,22,53,20,51,18,49,16,47,46,13,44,11,42,9,40,7,38,5,36,3,34,1,32]
+; X64-NEXT: vpermt2w %zmm0, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT: retq
%res0 = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> <i16 31, i16 30, i16 29, i16 28, i16 27, i16 26, i16 25, i16 24, i16 23, i16 22, i16 21, i16 20, i16 19, i16 18, i16 17, i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0>, <32 x i16> %x0, <32 x i16> %x1, i32 %m)
%res1 = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> <i16 63, i16 30, i16 61, i16 28, i16 59, i16 26, i16 57, i16 24, i16 55, i16 22, i16 53, i16 20, i16 51, i16 18, i16 49, i16 16, i16 47, i16 46, i16 13, i16 44, i16 11, i16 42, i16 9, i16 40, i16 7, i16 38, i16 5, i16 36, i16 3, i16 34, i16 1, i16 32>, <32 x i16> %res0, <32 x i16> %res0, i32 %m)
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
index c358250305a7c..be65effbc7241 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
@@ -18,18 +18,18 @@ define <16 x i16> @combine_vpermt2var_16i16_identity_mask(<16 x i16> %x0, <16 x
; X86: # %bb.0:
; X86-NEXT: vmovdqa {{.*#+}} ymm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
-; X86-NEXT: vpermi2w %ymm0, %ymm0, %ymm1 {%k1} {z}
-; X86-NEXT: vmovdqa {{.*#+}} ymm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
-; X86-NEXT: vpermi2w %ymm1, %ymm1, %ymm0 {%k1} {z}
+; X86-NEXT: vpermt2w %ymm0, %ymm1, %ymm0 {%k1} {z}
+; X86-NEXT: vmovdqa {{.*#+}} ymm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
+; X86-NEXT: vpermt2w %ymm0, %ymm1, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: combine_vpermt2var_16i16_identity_mask:
; X64: # %bb.0:
; X64-NEXT: vmovdqa {{.*#+}} ymm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X64-NEXT: kmovd %edi, %k1
-; X64-NEXT: vpermi2w %ymm0, %ymm0, %ymm1 {%k1} {z}
-; X64-NEXT: vmovdqa {{.*#+}} ymm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
-; X64-NEXT: vpermi2w %ymm1, %ymm1, %ymm0 {%k1} {z}
+; X64-NEXT: vpermt2w %ymm0, %ymm1, %ymm0 {%k1} {z}
+; X64-NEXT: vmovdqa {{.*#+}} ymm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
+; X64-NEXT: vpermt2w %ymm0, %ymm1, %ymm0 {%k1} {z}
; X64-NEXT: retq
%res0 = call <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16> <i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0>, <16 x i16> %x0, <16 x i16> %x1, i16 %m)
%res1 = call <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16> <i16 15, i16 30, i16 13, i16 28, i16 11, i16 26, i16 9, i16 24, i16 7, i16 22, i16 5, i16 20, i16 3, i16 18, i16 1, i16 16>, <16 x i16> %res0, <16 x i16> %res0, i16 %m)
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll
index c9928d198a2c2..77e756bbb2dd1 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll
@@ -154,9 +154,9 @@ define <8 x double> @combine_vpermt2var_8f64_identity_mask(<8 x double> %x0, <8
; X86-AVX512F-NEXT: vmovapd {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
; X86-AVX512F-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-AVX512F-NEXT: kmovw %eax, %k1
-; X86-AVX512F-NEXT: vpermi2pd %zmm0, %zmm0, %zmm1 {%k1} {z}
-; X86-AVX512F-NEXT: vmovapd {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
-; X86-AVX512F-NEXT: vpermi2pd %zmm1, %zmm1, %zmm0 {%k1} {z}
+; X86-AVX512F-NEXT: vpermt2pd %zmm0, %zmm1, %zmm0 {%k1} {z}
+; X86-AVX512F-NEXT: vmovapd {{.*#+}} zmm1 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
+; X86-AVX512F-NEXT: vpermt2pd %zmm0, %zmm1, %zmm0 {%k1} {z}
; X86-AVX512F-NEXT: retl
;
; X86-AVX512BW-LABEL: combine_vpermt2var_8f64_identity_mask:
@@ -164,27 +164,27 @@ define <8 x double> @combine_vpermt2var_8f64_identity_mask(<8 x double> %x0, <8
; X86-AVX512BW-NEXT: vmovapd {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
; X86-AVX512BW-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-AVX512BW-NEXT: kmovd %eax, %k1
-; X86-AVX512BW-NEXT: vpermi2pd %zmm0, %zmm0, %zmm1 {%k1} {z}
-; X86-AVX512BW-NEXT: vmovapd {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
-; X86-AVX512BW-NEXT: vpermi2pd %zmm1, %zmm1, %zmm0 {%k1} {z}
+; X86-AVX512BW-NEXT: vpermt2pd %zmm0, %zmm1, %zmm0 {%k1} {z}
+; X86-AVX512BW-NEXT: vmovapd {{.*#+}} zmm1 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
+; X86-AVX512BW-NEXT: vpermt2pd %zmm0, %zmm1, %zmm0 {%k1} {z}
; X86-AVX512BW-NEXT: retl
;
; X64-AVX512F-LABEL: combine_vpermt2var_8f64_identity_mask:
; X64-AVX512F: # %bb.0:
; X64-AVX512F-NEXT: vmovapd {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
; X64-AVX512F-NEXT: kmovw %edi, %k1
-; X64-AVX512F-NEXT: vpermi2pd %zmm0, %zmm0, %zmm1 {%k1} {z}
-; X64-AVX512F-NEXT: vmovapd {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8]
-; X64-AVX512F-NEXT: vpermi2pd %zmm1, %zmm1, %zmm0 {%k1} {z}
+; X64-AVX512F-NEXT: vpermt2pd %zmm0, %zmm1, %zmm0 {%k1} {z}
+; X64-AVX512F-NEXT: vmovapd {{.*#+}} zmm1 = [7,14,5,12,3,10,1,8]
+; X64-AVX512F-NEXT: vpermt2pd %zmm0, %zmm1, %zmm0 {%k1} {z}
; X64-AVX512F-NEXT: retq
;
; X64-AVX512BW-LABEL: combine_vpermt2var_8f64_identity_mask:
; X64-AVX512BW: # %bb.0:
; X64-AVX512BW-NEXT: vmovapd {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
; X64-AVX512BW-NEXT: kmovd %edi, %k1
-; X64-AVX512BW-NEXT: vpermi2pd %zmm0, %zmm0, %zmm1 {%k1} {z}
-; X64-AVX512BW-NEXT: vmovapd {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8]
-; X64-AVX512BW-NEXT: vpermi2pd %zmm1, %zmm1, %zmm0 {%k1} {z}
+; X64-AVX512BW-NEXT: vpermt2pd %zmm0, %zmm1, %zmm0 {%k1} {z}
+; X64-AVX512BW-NEXT: vmovapd {{.*#+}} zmm1 = [7,14,5,12,3,10,1,8]
+; X64-AVX512BW-NEXT: vpermt2pd %zmm0, %zmm1, %zmm0 {%k1} {z}
; X64-AVX512BW-NEXT: retq
%res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x double> %x0, <8 x double> %x1, i8 %m)
%res1 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x double> %res0, <8 x double> %res0, i8 %m)
@@ -258,9 +258,9 @@ define <8 x i64> @combine_vpermt2var_8i64_identity_mask(<8 x i64> %x0, <8 x i64>
; X86-AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
; X86-AVX512F-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-AVX512F-NEXT: kmovw %eax, %k1
-; X86-AVX512F-NEXT: vpermi2q %zmm0, %zmm0, %zmm1 {%k1} {z}
-; X86-AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
-; X86-AVX512F-NEXT: vpermi2q %zmm1, %zmm1, %zmm0 {%k1} {z}
+; X86-AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm0 {%k1} {z}
+; X86-AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
+; X86-AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm0 {%k1} {z}
; X86-AVX512F-NEXT: retl
;
; X86-AVX512BW-LABEL: combine_vpermt2var_8i64_identity_mask:
@@ -268,27 +268,27 @@ define <8 x i64> @combine_vpermt2var_8i64_identity_mask(<8 x i64> %x0, <8 x i64>
; X86-AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
; X86-AVX512BW-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-AVX512BW-NEXT: kmovd %eax, %k1
-; X86-AVX512BW-NEXT: vpermi2q %zmm0, %zmm0, %zmm1 {%k1} {z}
-; X86-AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
-; X86-AVX512BW-NEXT: vpermi2q %zmm1, %zmm1, %zmm0 {%k1} {z}
+; X86-AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm0 {%k1} {z}
+; X86-AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
+; X86-AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm0 {%k1} {z}
; X86-AVX512BW-NEXT: retl
;
; X64-AVX512F-LABEL: combine_vpermt2var_8i64_identity_mask:
; X64-AVX512F: # %bb.0:
; X64-AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
; X64-AVX512F-NEXT: kmovw %edi, %k1
-; X64-AVX512F-NEXT: vpermi2q %zmm0, %zmm0, %zmm1 {%k1} {z}
-; X64-AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8]
-; X64-AVX512F-NEXT: vpermi2q %zmm1, %zmm1, %zmm0 {%k1} {z}
+; X64-AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm0 {%k1} {z}
+; X64-AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,14,5,12,3,10,1,8]
+; X64-AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm0 {%k1} {z}
; X64-AVX512F-NEXT: retq
;
; X64-AVX512BW-LABEL: combine_vpermt2var_8i64_identity_mask:
; X64-AVX512BW: # %bb.0:
; X64-AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
; X64-AVX512BW-NEXT: kmovd %edi, %k1
-; X64-AVX512BW-NEXT: vpermi2q %zmm0, %zmm0, %zmm1 {%k1} {z}
-; X64-AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8]
-; X64-AVX512BW-NEXT: vpermi2q %zmm1, %zmm1, %zmm0 {%k1} {z}
+; X64-AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm0 {%k1} {z}
+; X64-AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,14,5,12,3,10,1,8]
+; X64-AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm0 {%k1} {z}
; X64-AVX512BW-NEXT: retq
%res0 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x i64> %x0, <8 x i64> %x1, i8 %m)
%res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x i64> %res0, <8 x i64> %res0, i8 %m)
@@ -308,27 +308,27 @@ define <16 x float> @combine_vpermt2var_16f32_identity_mask(<16 x float> %x0, <1
; X86: # %bb.0:
; X86-NEXT: vmovaps {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
-; X86-NEXT: vpermi2ps %zmm0, %zmm0, %zmm1 {%k1} {z}
-; X86-NEXT: vmovaps {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
-; X86-NEXT: vpermi2ps %zmm1, %zmm1, %zmm0 {%k1} {z}
+; X86-NEXT: vpermt2ps %zmm0, %zmm1, %zmm0 {%k1} {z}
+; X86-NEXT: vmovaps {{.*#+}} zmm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
+; X86-NEXT: vpermt2ps %zmm0, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-AVX512F-LABEL: combine_vpermt2var_16f32_identity_mask:
; X64-AVX512F: # %bb.0:
; X64-AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X64-AVX512F-NEXT: kmovw %edi, %k1
-; X64-AVX512F-NEXT: vpermi2ps %zmm0, %zmm0, %zmm1 {%k1} {z}
-; X64-AVX512F-NEXT: vmovaps {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
-; X64-AVX512F-NEXT: vpermi2ps %zmm1, %zmm1, %zmm0 {%k1} {z}
+; X64-AVX512F-NEXT: vpermt2ps %zmm0, %zmm1, %zmm0 {%k1} {z}
+; X64-AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
+; X64-AVX512F-NEXT: vpermt2ps %zmm0, %zmm1, %zmm0 {%k1} {z}
; X64-AVX512F-NEXT: retq
;
; X64-AVX512BW-LABEL: combine_vpermt2var_16f32_identity_mask:
; X64-AVX512BW: # %bb.0:
; X64-AVX512BW-NEXT: vmovaps {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X64-AVX512BW-NEXT: kmovd %edi, %k1
-; X64-AVX512BW-NEXT: vpermi2ps %zmm0, %zmm0, %zmm1 {%k1} {z}
-; X64-AVX512BW-NEXT: vmovaps {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
-; X64-AVX512BW-NEXT: vpermi2ps %zmm1, %zmm1, %zmm0 {%k1} {z}
+; X64-AVX512BW-NEXT: vpermt2ps %zmm0, %zmm1, %zmm0 {%k1} {z}
+; X64-AVX512BW-NEXT: vmovaps {{.*#+}} zmm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
+; X64-AVX512BW-NEXT: vpermt2ps %zmm0, %zmm1, %zmm0 {%k1} {z}
; X64-AVX512BW-NEXT: retq
%res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>, <16 x float> %x0, <16 x float> %x1, i16 %m)
%res1 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 15, i32 30, i32 13, i32 28, i32 11, i32 26, i32 9, i32 24, i32 7, i32 22, i32 5, i32 20, i32 3, i32 18, i32 1, i32 16>, <16 x float> %res0, <16 x float> %res0, i16 %m)
@@ -597,27 +597,27 @@ define <16 x i32> @combine_vpermt2var_16i32_identity_mask(<16 x i32> %x0, <16 x
; X86: # %bb.0:
; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
-; X86-NEXT: vpermi2d %zmm0, %zmm0, %zmm1 {%k1} {z}
-; X86-NEXT: vmovdqa64 {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
-; X86-NEXT: vpermi2d %zmm1, %zmm1, %zmm0 {%k1} {z}
+; X86-NEXT: vpermt2d %zmm0, %zmm1, %zmm0 {%k1} {z}
+; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
+; X86-NEXT: vpermt2d %zmm0, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-AVX512F-LABEL: combine_vpermt2var_16i32_identity_mask:
; X64-AVX512F: # %bb.0:
; X64-AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X64-AVX512F-NEXT: kmovw %edi, %k1
-; X64-AVX512F-NEXT: vpermi2d %zmm0, %zmm0, %zmm1 {%k1} {z}
-; X64-AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
-; X64-AVX512F-NEXT: vpermi2d %zmm1, %zmm1, %zmm0 {%k1} {z}
+; X64-AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm0 {%k1} {z}
+; X64-AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
+; X64-AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm0 {%k1} {z}
; X64-AVX512F-NEXT: retq
;
; X64-AVX512BW-LABEL: combine_vpermt2var_16i32_identity_mask:
; X64-AVX512BW: # %bb.0:
; X64-AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X64-AVX512BW-NEXT: kmovd %edi, %k1
-; X64-AVX512BW-NEXT: vpermi2d %zmm0, %zmm0, %zmm1 {%k1} {z}
-; X64-AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
-; X64-AVX512BW-NEXT: vpermi2d %zmm1, %zmm1, %zmm0 {%k1} {z}
+; X64-AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm0 {%k1} {z}
+; X64-AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
+; X64-AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm0 {%k1} {z}
; X64-AVX512BW-NEXT: retq
%res0 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>, <16 x i32> %x0, <16 x i32> %x1, i16 %m)
%res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 15, i32 30, i32 13, i32 28, i32 11, i32 26, i32 9, i32 24, i32 7, i32 22, i32 5, i32 20, i32 3, i32 18, i32 1, i32 16>, <16 x i32> %res0, <16 x i32> %res0, i16 %m)
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll
index 86c206c6b1875..78ae0d23da978 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll
@@ -31,18 +31,18 @@ define <16 x i8> @combine_vpermt2var_16i8_identity_mask(<16 x i8> %x0, <16 x i8>
; X86: # %bb.0:
; X86-NEXT: vmovdqa {{.*#+}} xmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
-; X86-NEXT: vpermi2b %xmm0, %xmm0, %xmm1 {%k1} {z}
-; X86-NEXT: vmovdqa {{.*#+}} xmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
-; X86-NEXT: vpermi2b %xmm1, %xmm1, %xmm0 {%k1} {z}
+; X86-NEXT: vpermt2b %xmm0, %xmm1, %xmm0 {%k1} {z}
+; X86-NEXT: vmovdqa {{.*#+}} xmm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
+; X86-NEXT: vpermt2b %xmm0, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: combine_vpermt2var_16i8_identity_mask:
; X64: # %bb.0:
; X64-NEXT: vmovdqa {{.*#+}} xmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X64-NEXT: kmovd %edi, %k1
-; X64-NEXT: vpermi2b %xmm0, %xmm0, %xmm1 {%k1} {z}
-; X64-NEXT: vmovdqa {{.*#+}} xmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
-; X64-NEXT: vpermi2b %xmm1, %xmm1, %xmm0 {%k1} {z}
+; X64-NEXT: vpermt2b %xmm0, %xmm1, %xmm0 {%k1} {z}
+; X64-NEXT: vmovdqa {{.*#+}} xmm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
+; X64-NEXT: vpermt2b %xmm0, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT: retq
%res0 = call <16 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.128(<16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>, <16 x i8> %x0, <16 x i8> %x1, i16 %m)
%res1 = call <16 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.128(<16 x i8> <i8 15, i8 30, i8 13, i8 28, i8 11, i8 26, i8 9, i8 24, i8 7, i8 22, i8 5, i8 20, i8 3, i8 18, i8 1, i8 16>, <16 x i8> %res0, <16 x i8> %res0, i16 %m)
diff --git a/llvm/test/CodeGen/X86/vector-trunc-packus.ll b/llvm/test/CodeGen/X86/vector-trunc-packus.ll
index a4dc0b152d78d..9ab03ce654e8f 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-packus.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-packus.ll
@@ -321,110 +321,110 @@ define void @trunc_packus_v2i64_v2i32_store(<2 x i64> %a0, <2 x i32>* %p1) {
define <4 x i32> @trunc_packus_v4i64_v4i32(<4 x i64> %a0) {
; SSE2-LABEL: trunc_packus_v4i64_v4i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295]
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: pxor %xmm2, %xmm4
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647]
; SSE2-NEXT: movdqa %xmm5, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSE2-NEXT: pand %xmm7, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm4, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm8, %xmm3
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm5, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT: por %xmm4, %xmm6
; SSE2-NEXT: pand %xmm6, %xmm0
+; SSE2-NEXT: pandn %xmm3, %xmm6
+; SSE2-NEXT: por %xmm6, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: pxor %xmm2, %xmm4
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pandn %xmm8, %xmm4
-; SSE2-NEXT: por %xmm1, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: pand %xmm7, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
+; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: pandn %xmm3, %xmm5
+; SSE2-NEXT: por %xmm1, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: pand %xmm3, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT: por %xmm1, %xmm3
+; SSE2-NEXT: pand %xmm5, %xmm3
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm1
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm1
; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm4, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm1, %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_packus_v4i64_v4i32:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295]
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSSE3-NEXT: movdqa %xmm0, %xmm3
-; SSSE3-NEXT: pxor %xmm2, %xmm3
+; SSSE3-NEXT: movdqa %xmm0, %xmm4
+; SSSE3-NEXT: pxor %xmm2, %xmm4
; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647]
; SSSE3-NEXT: movdqa %xmm5, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm6
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm5, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSSE3-NEXT: pand %xmm7, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm4, %xmm3
-; SSSE3-NEXT: pand %xmm3, %xmm0
-; SSSE3-NEXT: pandn %xmm8, %xmm3
-; SSSE3-NEXT: por %xmm0, %xmm3
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: pxor %xmm2, %xmm0
-; SSSE3-NEXT: movdqa %xmm5, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSSE3-NEXT: por %xmm4, %xmm6
; SSSE3-NEXT: pand %xmm6, %xmm0
+; SSSE3-NEXT: pandn %xmm3, %xmm6
+; SSSE3-NEXT: por %xmm6, %xmm0
+; SSSE3-NEXT: movdqa %xmm1, %xmm4
+; SSSE3-NEXT: pxor %xmm2, %xmm4
+; SSSE3-NEXT: movdqa %xmm5, %xmm6
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm5, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm4
-; SSSE3-NEXT: pand %xmm4, %xmm1
-; SSSE3-NEXT: pandn %xmm8, %xmm4
-; SSSE3-NEXT: por %xmm1, %xmm4
-; SSSE3-NEXT: movdqa %xmm4, %xmm0
-; SSSE3-NEXT: pxor %xmm2, %xmm0
+; SSSE3-NEXT: pand %xmm7, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
+; SSSE3-NEXT: por %xmm4, %xmm5
+; SSSE3-NEXT: pand %xmm5, %xmm1
+; SSSE3-NEXT: pandn %xmm3, %xmm5
+; SSSE3-NEXT: por %xmm1, %xmm5
+; SSSE3-NEXT: movdqa %xmm5, %xmm1
+; SSSE3-NEXT: pxor %xmm2, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm3
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3
+; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSSE3-NEXT: pand %xmm3, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSSE3-NEXT: por %xmm1, %xmm3
+; SSSE3-NEXT: pand %xmm5, %xmm3
; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1
-; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm1, %xmm0
+; SSSE3-NEXT: pxor %xmm2, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4
+; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm1
; SSSE3-NEXT: pand %xmm4, %xmm1
-; SSSE3-NEXT: movdqa %xmm3, %xmm0
-; SSSE3-NEXT: pxor %xmm2, %xmm0
-; SSSE3-NEXT: movdqa %xmm0, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4
-; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm4, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm2, %xmm0
-; SSSE3-NEXT: pand %xmm3, %xmm0
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSSE3-NEXT: por %xmm1, %xmm2
+; SSSE3-NEXT: pand %xmm2, %xmm0
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_packus_v4i64_v4i32:
@@ -2312,10 +2312,10 @@ define <4 x i16> @trunc_packus_v4i32_v4i16(<4 x i32> %a0) {
; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2
; SSSE3-NEXT: pand %xmm2, %xmm0
; SSSE3-NEXT: pandn %xmm1, %xmm2
-; SSSE3-NEXT: por %xmm0, %xmm2
+; SSSE3-NEXT: por %xmm2, %xmm0
; SSSE3-NEXT: pxor %xmm1, %xmm1
-; SSSE3-NEXT: movdqa %xmm2, %xmm0
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
+; SSSE3-NEXT: movdqa %xmm0, %xmm2
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
; SSSE3-NEXT: pand %xmm2, %xmm0
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSSE3-NEXT: retq
@@ -2448,19 +2448,19 @@ define <8 x i16> @trunc_packus_v8i32_v8i16(<8 x i32> %a0) {
; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: pandn %xmm2, %xmm1
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pslld $16, %xmm1
-; SSE2-NEXT: psrad $16, %xmm1
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT: pand %xmm3, %xmm2
+; SSE2-NEXT: pslld $16, %xmm2
+; SSE2-NEXT: psrad $16, %xmm2
; SSE2-NEXT: pslld $16, %xmm0
; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: packssdw %xmm1, %xmm0
+; SSE2-NEXT: packssdw %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_packus_v8i32_v8i16:
@@ -2475,18 +2475,18 @@ define <8 x i16> @trunc_packus_v8i32_v8i16(<8 x i32> %a0) {
; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1
; SSSE3-NEXT: pand %xmm1, %xmm0
; SSSE3-NEXT: pandn %xmm2, %xmm1
-; SSSE3-NEXT: por %xmm0, %xmm1
-; SSSE3-NEXT: pxor %xmm2, %xmm2
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0
-; SSSE3-NEXT: pand %xmm1, %xmm0
-; SSSE3-NEXT: movdqa %xmm3, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1
-; SSSE3-NEXT: pand %xmm3, %xmm1
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSSE3-NEXT: pshufb %xmm2, %xmm1
-; SSSE3-NEXT: pshufb %xmm2, %xmm0
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT: por %xmm1, %xmm0
+; SSSE3-NEXT: pxor %xmm1, %xmm1
+; SSSE3-NEXT: movdqa %xmm0, %xmm2
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
+; SSSE3-NEXT: pand %xmm2, %xmm0
+; SSSE3-NEXT: movdqa %xmm3, %xmm2
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
+; SSSE3-NEXT: pand %xmm3, %xmm2
+; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSSE3-NEXT: pshufb %xmm1, %xmm2
+; SSSE3-NEXT: pshufb %xmm1, %xmm0
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_packus_v8i32_v8i16:
@@ -2725,18 +2725,18 @@ define <2 x i8> @trunc_packus_v2i64_v2i8(<2 x i64> %a0) {
; SSE2-NEXT: por %xmm2, %xmm3
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE2-NEXT: pxor %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; SSE2-NEXT: por %xmm1, %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
@@ -2759,18 +2759,18 @@ define <2 x i8> @trunc_packus_v2i64_v2i8(<2 x i64> %a0) {
; SSSE3-NEXT: por %xmm2, %xmm3
; SSSE3-NEXT: pand %xmm3, %xmm0
; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT: por %xmm0, %xmm3
-; SSSE3-NEXT: movdqa %xmm3, %xmm0
-; SSSE3-NEXT: pxor %xmm1, %xmm0
+; SSSE3-NEXT: por %xmm3, %xmm0
; SSSE3-NEXT: movdqa %xmm0, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm1, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSSE3-NEXT: pxor %xmm1, %xmm2
+; SSSE3-NEXT: movdqa %xmm2, %xmm3
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm1, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSSE3-NEXT: pand %xmm4, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm0
-; SSSE3-NEXT: pand %xmm3, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; SSSE3-NEXT: por %xmm1, %xmm2
+; SSSE3-NEXT: pand %xmm2, %xmm0
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSSE3-NEXT: retq
;
@@ -3043,37 +3043,37 @@ define <4 x i8> @trunc_packus_v4i64_v4i8(<4 x i64> %a0) {
; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm5, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; SSE2-NEXT: por %xmm5, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: pandn %xmm8, %xmm1
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm3, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: pand %xmm6, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT: por %xmm1, %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pandn %xmm8, %xmm2
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: pxor %xmm3, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pcmpgtd %xmm3, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE2-NEXT: por %xmm6, %xmm0
-; SSE2-NEXT: movdqa %xmm4, %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT: por %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm4, %xmm1
+; SSE2-NEXT: pxor %xmm3, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm2
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: pand %xmm6, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm3
+; SSE2-NEXT: por %xmm1, %xmm3
; SSE2-NEXT: pand %xmm8, %xmm3
; SSE2-NEXT: pand %xmm4, %xmm3
-; SSE2-NEXT: pand %xmm8, %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pand %xmm8, %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: packuswb %xmm3, %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
@@ -3109,17 +3109,17 @@ define <4 x i8> @trunc_packus_v4i64_v4i8(<4 x i64> %a0) {
; SSSE3-NEXT: por %xmm1, %xmm4
; SSSE3-NEXT: pand %xmm4, %xmm0
; SSSE3-NEXT: pandn %xmm8, %xmm4
-; SSSE3-NEXT: por %xmm0, %xmm4
-; SSSE3-NEXT: movdqa %xmm4, %xmm0
-; SSSE3-NEXT: pxor %xmm2, %xmm0
+; SSSE3-NEXT: por %xmm4, %xmm0
; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm5, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSSE3-NEXT: por %xmm6, %xmm0
+; SSSE3-NEXT: pxor %xmm2, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSSE3-NEXT: pand %xmm5, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT: por %xmm1, %xmm4
; SSSE3-NEXT: pand %xmm4, %xmm0
; SSSE3-NEXT: movdqa %xmm3, %xmm1
; SSSE3-NEXT: pxor %xmm2, %xmm1
@@ -5097,10 +5097,10 @@ define <4 x i8> @trunc_packus_v4i32_v4i8(<4 x i32> %a0) "min-legal-vector-width"
; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
+; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
@@ -5114,10 +5114,10 @@ define <4 x i8> @trunc_packus_v4i32_v4i8(<4 x i32> %a0) "min-legal-vector-width"
; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2
; SSSE3-NEXT: pand %xmm2, %xmm0
; SSSE3-NEXT: pandn %xmm1, %xmm2
-; SSSE3-NEXT: por %xmm0, %xmm2
+; SSSE3-NEXT: por %xmm2, %xmm0
; SSSE3-NEXT: pxor %xmm1, %xmm1
-; SSSE3-NEXT: movdqa %xmm2, %xmm0
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
+; SSSE3-NEXT: movdqa %xmm0, %xmm2
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
; SSSE3-NEXT: pand %xmm2, %xmm0
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; SSSE3-NEXT: retq
@@ -5126,10 +5126,9 @@ define <4 x i8> @trunc_packus_v4i32_v4i8(<4 x i32> %a0) "min-legal-vector-width"
; SSE41: # %bb.0:
; SSE41-NEXT: pminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pmaxsd %xmm0, %xmm1
-; SSE41-NEXT: packusdw %xmm1, %xmm1
-; SSE41-NEXT: packuswb %xmm1, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pmaxsd %xmm1, %xmm0
+; SSE41-NEXT: packusdw %xmm0, %xmm0
+; SSE41-NEXT: packuswb %xmm0, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc_packus_v4i32_v4i8:
diff --git a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
index e8a0ac2808ad4..2a727bcbb07aa 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
@@ -313,122 +313,122 @@ define void @trunc_ssat_v2i64_v2i32_store(<2 x i64> %a0, <2 x i32>* %p1) {
define <4 x i32> @trunc_ssat_v4i64_v4i32(<4 x i64> %a0) {
; SSE2-LABEL: trunc_ssat_v4i64_v4i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483647,2147483647]
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483647,2147483647]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: pxor %xmm2, %xmm4
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [4294967295,4294967295]
; SSE2-NEXT: movdqa %xmm5, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSE2-NEXT: pand %xmm7, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm4, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm8, %xmm3
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm5, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT: por %xmm4, %xmm6
; SSE2-NEXT: pand %xmm6, %xmm0
+; SSE2-NEXT: pandn %xmm3, %xmm6
+; SSE2-NEXT: por %xmm6, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: pxor %xmm2, %xmm4
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pandn %xmm8, %xmm4
-; SSE2-NEXT: por %xmm1, %xmm4
+; SSE2-NEXT: pand %xmm7, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
+; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: pandn %xmm3, %xmm5
+; SSE2-NEXT: por %xmm1, %xmm5
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968]
-; SSE2-NEXT: movdqa %xmm4, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [18446744069414584320,18446744069414584320]
-; SSE2-NEXT: movdqa %xmm0, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm6
+; SSE2-NEXT: movdqa %xmm5, %xmm3
+; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [18446744069414584320,18446744069414584320]
+; SSE2-NEXT: movdqa %xmm3, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm7, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT: pand %xmm7, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm6
-; SSE2-NEXT: pand %xmm6, %xmm4
+; SSE2-NEXT: por %xmm3, %xmm6
+; SSE2-NEXT: pand %xmm6, %xmm5
; SSE2-NEXT: pandn %xmm1, %xmm6
-; SSE2-NEXT: por %xmm4, %xmm6
-; SSE2-NEXT: pxor %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm2
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: pxor %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm4, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm3
-; SSE2-NEXT: pandn %xmm1, %xmm0
+; SSE2-NEXT: pand %xmm5, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT: por %xmm2, %xmm3
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pandn %xmm1, %xmm3
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[0,2]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_ssat_v4i64_v4i32:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [2147483647,2147483647]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483647,2147483647]
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSSE3-NEXT: movdqa %xmm0, %xmm3
-; SSSE3-NEXT: pxor %xmm2, %xmm3
+; SSSE3-NEXT: movdqa %xmm0, %xmm4
+; SSSE3-NEXT: pxor %xmm2, %xmm4
; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [4294967295,4294967295]
; SSSE3-NEXT: movdqa %xmm5, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm6
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm5, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSSE3-NEXT: pand %xmm7, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm4, %xmm3
-; SSSE3-NEXT: pand %xmm3, %xmm0
-; SSSE3-NEXT: pandn %xmm8, %xmm3
-; SSSE3-NEXT: por %xmm0, %xmm3
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: pxor %xmm2, %xmm0
-; SSSE3-NEXT: movdqa %xmm5, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSSE3-NEXT: por %xmm4, %xmm6
; SSSE3-NEXT: pand %xmm6, %xmm0
+; SSSE3-NEXT: pandn %xmm3, %xmm6
+; SSSE3-NEXT: por %xmm6, %xmm0
+; SSSE3-NEXT: movdqa %xmm1, %xmm4
+; SSSE3-NEXT: pxor %xmm2, %xmm4
+; SSSE3-NEXT: movdqa %xmm5, %xmm6
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm5, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm4
-; SSSE3-NEXT: pand %xmm4, %xmm1
-; SSSE3-NEXT: pandn %xmm8, %xmm4
-; SSSE3-NEXT: por %xmm1, %xmm4
+; SSSE3-NEXT: pand %xmm7, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
+; SSSE3-NEXT: por %xmm4, %xmm5
+; SSSE3-NEXT: pand %xmm5, %xmm1
+; SSSE3-NEXT: pandn %xmm3, %xmm5
+; SSSE3-NEXT: por %xmm1, %xmm5
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968]
-; SSSE3-NEXT: movdqa %xmm4, %xmm0
-; SSSE3-NEXT: pxor %xmm2, %xmm0
-; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [18446744069414584320,18446744069414584320]
-; SSSE3-NEXT: movdqa %xmm0, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6
+; SSSE3-NEXT: movdqa %xmm5, %xmm3
+; SSSE3-NEXT: pxor %xmm2, %xmm3
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [18446744069414584320,18446744069414584320]
+; SSSE3-NEXT: movdqa %xmm3, %xmm6
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm7, %xmm0
+; SSSE3-NEXT: pcmpeqd %xmm4, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSSE3-NEXT: pand %xmm7, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm6
-; SSSE3-NEXT: pand %xmm6, %xmm4
+; SSSE3-NEXT: por %xmm3, %xmm6
+; SSSE3-NEXT: pand %xmm6, %xmm5
; SSSE3-NEXT: pandn %xmm1, %xmm6
-; SSSE3-NEXT: por %xmm4, %xmm6
-; SSSE3-NEXT: pxor %xmm3, %xmm2
-; SSSE3-NEXT: movdqa %xmm2, %xmm0
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm2
+; SSSE3-NEXT: por %xmm5, %xmm6
+; SSSE3-NEXT: pxor %xmm0, %xmm2
+; SSSE3-NEXT: movdqa %xmm2, %xmm3
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm4, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT: pand %xmm4, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: por %xmm2, %xmm0
-; SSSE3-NEXT: pand %xmm0, %xmm3
-; SSSE3-NEXT: pandn %xmm1, %xmm0
+; SSSE3-NEXT: pand %xmm5, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSSE3-NEXT: por %xmm2, %xmm3
+; SSSE3-NEXT: pand %xmm3, %xmm0
+; SSSE3-NEXT: pandn %xmm1, %xmm3
; SSSE3-NEXT: por %xmm3, %xmm0
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[0,2]
; SSSE3-NEXT: retq
@@ -1376,61 +1376,61 @@ define void @trunc_ssat_v2i64_v2i16_store(<2 x i64> %a0, <2 x i16> *%p1) {
define <4 x i16> @trunc_ssat_v4i64_v4i16(<4 x i64> %a0) {
; SSE2-LABEL: trunc_ssat_v4i64_v4i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [32767,32767]
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [32767,32767]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: pxor %xmm2, %xmm4
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147516415,2147516415]
; SSE2-NEXT: movdqa %xmm5, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSE2-NEXT: pand %xmm7, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm4, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm8, %xmm3
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm5, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT: por %xmm4, %xmm6
; SSE2-NEXT: pand %xmm6, %xmm0
+; SSE2-NEXT: pandn %xmm3, %xmm6
+; SSE2-NEXT: por %xmm6, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: pxor %xmm2, %xmm4
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pandn %xmm8, %xmm4
-; SSE2-NEXT: por %xmm1, %xmm4
+; SSE2-NEXT: pand %xmm7, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
+; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: pandn %xmm3, %xmm5
+; SSE2-NEXT: por %xmm1, %xmm5
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
-; SSE2-NEXT: movdqa %xmm4, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [18446744071562035200,18446744071562035200]
-; SSE2-NEXT: movdqa %xmm0, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm6
+; SSE2-NEXT: movdqa %xmm5, %xmm3
+; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [18446744071562035200,18446744071562035200]
+; SSE2-NEXT: movdqa %xmm3, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm7, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT: pand %xmm7, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm6
-; SSE2-NEXT: pand %xmm6, %xmm4
+; SSE2-NEXT: por %xmm3, %xmm6
+; SSE2-NEXT: pand %xmm6, %xmm5
; SSE2-NEXT: pandn %xmm1, %xmm6
-; SSE2-NEXT: por %xmm4, %xmm6
-; SSE2-NEXT: pxor %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm2
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: pxor %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm4, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm3
-; SSE2-NEXT: pandn %xmm1, %xmm0
+; SSE2-NEXT: pand %xmm5, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT: por %xmm2, %xmm3
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pandn %xmm1, %xmm3
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: packssdw %xmm6, %xmm0
; SSE2-NEXT: packssdw %xmm0, %xmm0
@@ -1438,61 +1438,61 @@ define <4 x i16> @trunc_ssat_v4i64_v4i16(<4 x i64> %a0) {
;
; SSSE3-LABEL: trunc_ssat_v4i64_v4i16:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [32767,32767]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [32767,32767]
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSSE3-NEXT: movdqa %xmm0, %xmm3
-; SSSE3-NEXT: pxor %xmm2, %xmm3
+; SSSE3-NEXT: movdqa %xmm0, %xmm4
+; SSSE3-NEXT: pxor %xmm2, %xmm4
; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147516415,2147516415]
; SSSE3-NEXT: movdqa %xmm5, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm6
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm5, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSSE3-NEXT: pand %xmm7, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm4, %xmm3
-; SSSE3-NEXT: pand %xmm3, %xmm0
-; SSSE3-NEXT: pandn %xmm8, %xmm3
-; SSSE3-NEXT: por %xmm0, %xmm3
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: pxor %xmm2, %xmm0
-; SSSE3-NEXT: movdqa %xmm5, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSSE3-NEXT: por %xmm4, %xmm6
; SSSE3-NEXT: pand %xmm6, %xmm0
+; SSSE3-NEXT: pandn %xmm3, %xmm6
+; SSSE3-NEXT: por %xmm6, %xmm0
+; SSSE3-NEXT: movdqa %xmm1, %xmm4
+; SSSE3-NEXT: pxor %xmm2, %xmm4
+; SSSE3-NEXT: movdqa %xmm5, %xmm6
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm5, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm4
-; SSSE3-NEXT: pand %xmm4, %xmm1
-; SSSE3-NEXT: pandn %xmm8, %xmm4
-; SSSE3-NEXT: por %xmm1, %xmm4
+; SSSE3-NEXT: pand %xmm7, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
+; SSSE3-NEXT: por %xmm4, %xmm5
+; SSSE3-NEXT: pand %xmm5, %xmm1
+; SSSE3-NEXT: pandn %xmm3, %xmm5
+; SSSE3-NEXT: por %xmm1, %xmm5
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
-; SSSE3-NEXT: movdqa %xmm4, %xmm0
-; SSSE3-NEXT: pxor %xmm2, %xmm0
-; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [18446744071562035200,18446744071562035200]
-; SSSE3-NEXT: movdqa %xmm0, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6
+; SSSE3-NEXT: movdqa %xmm5, %xmm3
+; SSSE3-NEXT: pxor %xmm2, %xmm3
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [18446744071562035200,18446744071562035200]
+; SSSE3-NEXT: movdqa %xmm3, %xmm6
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm7, %xmm0
+; SSSE3-NEXT: pcmpeqd %xmm4, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSSE3-NEXT: pand %xmm7, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm6
-; SSSE3-NEXT: pand %xmm6, %xmm4
+; SSSE3-NEXT: por %xmm3, %xmm6
+; SSSE3-NEXT: pand %xmm6, %xmm5
; SSSE3-NEXT: pandn %xmm1, %xmm6
-; SSSE3-NEXT: por %xmm4, %xmm6
-; SSSE3-NEXT: pxor %xmm3, %xmm2
-; SSSE3-NEXT: movdqa %xmm2, %xmm0
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm2
+; SSSE3-NEXT: por %xmm5, %xmm6
+; SSSE3-NEXT: pxor %xmm0, %xmm2
+; SSSE3-NEXT: movdqa %xmm2, %xmm3
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm4, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT: pand %xmm4, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: por %xmm2, %xmm0
-; SSSE3-NEXT: pand %xmm0, %xmm3
-; SSSE3-NEXT: pandn %xmm1, %xmm0
+; SSSE3-NEXT: pand %xmm5, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSSE3-NEXT: por %xmm2, %xmm3
+; SSSE3-NEXT: pand %xmm3, %xmm0
+; SSSE3-NEXT: pandn %xmm1, %xmm3
; SSSE3-NEXT: por %xmm3, %xmm0
; SSSE3-NEXT: packssdw %xmm6, %xmm0
; SSSE3-NEXT: packssdw %xmm0, %xmm0
@@ -2485,20 +2485,20 @@ define <2 x i8> @trunc_ssat_v2i64_v2i8(<2 x i64> %a0) {
; SSE2-NEXT: por %xmm2, %xmm3
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm3, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [18446744071562067840,18446744071562067840]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm1
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: pxor %xmm0, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [18446744071562067840,18446744071562067840]
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm3
-; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; SSE2-NEXT: por %xmm1, %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
@@ -2521,20 +2521,20 @@ define <2 x i8> @trunc_ssat_v2i64_v2i8(<2 x i64> %a0) {
; SSSE3-NEXT: por %xmm2, %xmm3
; SSSE3-NEXT: pand %xmm3, %xmm0
; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSSE3-NEXT: por %xmm0, %xmm3
-; SSSE3-NEXT: pxor %xmm3, %xmm1
-; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [18446744071562067840,18446744071562067840]
-; SSSE3-NEXT: movdqa %xmm1, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm0, %xmm1
+; SSSE3-NEXT: por %xmm3, %xmm0
+; SSSE3-NEXT: pxor %xmm0, %xmm1
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [18446744071562067840,18446744071562067840]
+; SSSE3-NEXT: movdqa %xmm1, %xmm3
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSSE3-NEXT: pand %xmm4, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm0
-; SSSE3-NEXT: pand %xmm0, %xmm3
-; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSSE3-NEXT: por %xmm3, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; SSSE3-NEXT: por %xmm1, %xmm2
+; SSSE3-NEXT: pand %xmm2, %xmm0
+; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSSE3-NEXT: por %xmm2, %xmm0
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSSE3-NEXT: retq
;
@@ -2797,38 +2797,38 @@ define <4 x i8> @trunc_ssat_v4i64_v4i8(<4 x i64> %a0) {
; SSE2-NEXT: por %xmm1, %xmm4
; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: pandn %xmm8, %xmm4
-; SSE2-NEXT: por %xmm0, %xmm4
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709551488,18446744073709551488]
-; SSE2-NEXT: movdqa %xmm4, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488]
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: pxor %xmm2, %xmm4
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [18446744071562067840,18446744071562067840]
-; SSE2-NEXT: movdqa %xmm0, %xmm6
+; SSE2-NEXT: movdqa %xmm4, %xmm6
; SSE2-NEXT: pcmpgtd %xmm5, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm7, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm4
-; SSE2-NEXT: pandn %xmm8, %xmm0
-; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT: pand %xmm7, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT: por %xmm4, %xmm6
+; SSE2-NEXT: pand %xmm6, %xmm0
+; SSE2-NEXT: pandn %xmm1, %xmm6
+; SSE2-NEXT: por %xmm6, %xmm0
; SSE2-NEXT: pxor %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2]
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm5, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm4, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm3
-; SSE2-NEXT: pandn %xmm8, %xmm1
-; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0]
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: packuswb %xmm1, %xmm0
+; SSE2-NEXT: pand %xmm6, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm2, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm3
+; SSE2-NEXT: pandn %xmm1, %xmm4
+; SSE2-NEXT: por %xmm3, %xmm4
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,255,0,0,0]
+; SSE2-NEXT: pand %xmm1, %xmm4
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: packuswb %xmm4, %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: retq
@@ -2863,38 +2863,38 @@ define <4 x i8> @trunc_ssat_v4i64_v4i8(<4 x i64> %a0) {
; SSSE3-NEXT: por %xmm1, %xmm4
; SSSE3-NEXT: pand %xmm4, %xmm0
; SSSE3-NEXT: pandn %xmm8, %xmm4
-; SSSE3-NEXT: por %xmm0, %xmm4
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709551488,18446744073709551488]
-; SSSE3-NEXT: movdqa %xmm4, %xmm0
-; SSSE3-NEXT: pxor %xmm2, %xmm0
+; SSSE3-NEXT: por %xmm4, %xmm0
+; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488]
+; SSSE3-NEXT: movdqa %xmm0, %xmm4
+; SSSE3-NEXT: pxor %xmm2, %xmm4
; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [18446744071562067840,18446744071562067840]
-; SSSE3-NEXT: movdqa %xmm0, %xmm6
+; SSSE3-NEXT: movdqa %xmm4, %xmm6
; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6
; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm7, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm0
-; SSSE3-NEXT: pand %xmm0, %xmm4
-; SSSE3-NEXT: pandn %xmm8, %xmm0
-; SSSE3-NEXT: por %xmm4, %xmm0
+; SSSE3-NEXT: pcmpeqd %xmm5, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT: pand %xmm7, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSSE3-NEXT: por %xmm4, %xmm6
+; SSSE3-NEXT: pand %xmm6, %xmm0
+; SSSE3-NEXT: pandn %xmm1, %xmm6
+; SSSE3-NEXT: por %xmm6, %xmm0
; SSSE3-NEXT: pxor %xmm3, %xmm2
-; SSSE3-NEXT: movdqa %xmm2, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2]
+; SSSE3-NEXT: movdqa %xmm2, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm5, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT: pand %xmm4, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: por %xmm2, %xmm1
-; SSSE3-NEXT: pand %xmm1, %xmm3
-; SSSE3-NEXT: pandn %xmm8, %xmm1
-; SSSE3-NEXT: por %xmm3, %xmm1
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; SSSE3-NEXT: pshufb %xmm2, %xmm1
-; SSSE3-NEXT: pshufb %xmm2, %xmm0
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT: pand %xmm6, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT: por %xmm2, %xmm4
+; SSSE3-NEXT: pand %xmm4, %xmm3
+; SSSE3-NEXT: pandn %xmm1, %xmm4
+; SSSE3-NEXT: por %xmm3, %xmm4
+; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; SSSE3-NEXT: pshufb %xmm1, %xmm4
+; SSSE3-NEXT: pshufb %xmm1, %xmm0
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_ssat_v4i64_v4i8:
@@ -4913,12 +4913,12 @@ define <4 x i8> @trunc_ssat_v4i32_v4i8(<4 x i32> %a0) {
; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
+; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168]
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: pandn %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
@@ -4932,12 +4932,12 @@ define <4 x i8> @trunc_ssat_v4i32_v4i8(<4 x i32> %a0) {
; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2
; SSSE3-NEXT: pand %xmm2, %xmm0
; SSSE3-NEXT: pandn %xmm1, %xmm2
-; SSSE3-NEXT: por %xmm0, %xmm2
+; SSSE3-NEXT: por %xmm2, %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168]
-; SSSE3-NEXT: movdqa %xmm2, %xmm0
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
-; SSSE3-NEXT: pand %xmm0, %xmm2
-; SSSE3-NEXT: pandn %xmm1, %xmm0
+; SSSE3-NEXT: movdqa %xmm0, %xmm2
+; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
+; SSSE3-NEXT: pand %xmm2, %xmm0
+; SSSE3-NEXT: pandn %xmm1, %xmm2
; SSSE3-NEXT: por %xmm2, %xmm0
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; SSSE3-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-trunc-usat.ll b/llvm/test/CodeGen/X86/vector-trunc-usat.ll
index b4784ec441e23..f61977cda1cc7 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-usat.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-usat.ll
@@ -2048,18 +2048,17 @@ define <2 x i8> @trunc_usat_v2i64_v2i8(<2 x i64> %a0) {
; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm4, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: packuswb %xmm1, %xmm1
-; SSE2-NEXT: packuswb %xmm1, %xmm1
-; SSE2-NEXT: packuswb %xmm1, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: pand %xmm4, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; SSE2-NEXT: por %xmm1, %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: packuswb %xmm0, %xmm0
+; SSE2-NEXT: packuswb %xmm0, %xmm0
+; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_usat_v2i64_v2i8:
@@ -2258,37 +2257,37 @@ define void @trunc_usat_v2i64_v2i8_store(<2 x i64> %a0, <2 x i8>* %p1) {
define <4 x i8> @trunc_usat_v4i64_v4i8(<4 x i64> %a0) {
; SSE2-LABEL: trunc_usat_v4i64_v4i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255]
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456]
-; SSE2-NEXT: pxor %xmm4, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255]
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: pxor %xmm3, %xmm4
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259711,9223372039002259711]
; SSE2-NEXT: movdqa %xmm5, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm7, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: pandn %xmm8, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm4
-; SSE2-NEXT: movdqa %xmm5, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm5, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: pand %xmm3, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: por %xmm4, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm8, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm8, %xmm2
-; SSE2-NEXT: pand %xmm8, %xmm0
-; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: pand %xmm7, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT: por %xmm4, %xmm6
+; SSE2-NEXT: pand %xmm6, %xmm0
+; SSE2-NEXT: pandn %xmm2, %xmm6
+; SSE2-NEXT: por %xmm6, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm5, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT: pand %xmm6, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm3, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm1
+; SSE2-NEXT: pandn %xmm2, %xmm4
+; SSE2-NEXT: por %xmm1, %xmm4
+; SSE2-NEXT: pand %xmm2, %xmm4
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: packuswb %xmm4, %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: retq
@@ -3607,17 +3606,16 @@ define <16 x i8> @trunc_usat_v16i64_v16i8(<16 x i64>* %p0) {
define <4 x i8> @trunc_usat_v4i32_v4i8(<4 x i32> %a0) {
; SSE2-LABEL: trunc_usat_v4i32_v4i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: pxor %xmm0, %xmm2
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483903,2147483903,2147483903,2147483903]
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: packuswb %xmm1, %xmm1
-; SSE2-NEXT: packuswb %xmm1, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: pxor %xmm0, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483903,2147483903,2147483903,2147483903]
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: packuswb %xmm0, %xmm0
+; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_usat_v4i32_v4i8:
diff --git a/llvm/test/CodeGen/X86/vector-tzcnt-128.ll b/llvm/test/CodeGen/X86/vector-tzcnt-128.ll
index 4d37cafbbb191..7a7a0ac9024a4 100644
--- a/llvm/test/CodeGen/X86/vector-tzcnt-128.ll
+++ b/llvm/test/CodeGen/X86/vector-tzcnt-128.ll
@@ -435,16 +435,15 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: psadbw %xmm0, %xmm2
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: psadbw %xmm0, %xmm1
-; SSE2-NEXT: packuswb %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT: psadbw %xmm1, %xmm2
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: psadbw %xmm1, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: testv4i32:
@@ -464,16 +463,15 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: pxor %xmm0, %xmm0
-; SSE3-NEXT: movdqa %xmm1, %xmm2
-; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE3-NEXT: psadbw %xmm0, %xmm2
-; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT: psadbw %xmm0, %xmm1
-; SSE3-NEXT: packuswb %xmm2, %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
+; SSE3-NEXT: paddb %xmm1, %xmm0
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT: pxor %xmm1, %xmm1
+; SSE3-NEXT: movdqa %xmm0, %xmm2
+; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT: psadbw %xmm1, %xmm2
+; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT: psadbw %xmm1, %xmm0
+; SSE3-NEXT: packuswb %xmm2, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: testv4i32:
@@ -678,16 +676,15 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: psadbw %xmm0, %xmm2
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: psadbw %xmm0, %xmm1
-; SSE2-NEXT: packuswb %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT: psadbw %xmm1, %xmm2
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: psadbw %xmm1, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: testv4i32u:
@@ -707,16 +704,15 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: pxor %xmm0, %xmm0
-; SSE3-NEXT: movdqa %xmm1, %xmm2
-; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE3-NEXT: psadbw %xmm0, %xmm2
-; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT: psadbw %xmm0, %xmm1
-; SSE3-NEXT: packuswb %xmm2, %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
+; SSE3-NEXT: paddb %xmm1, %xmm0
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT: pxor %xmm1, %xmm1
+; SSE3-NEXT: movdqa %xmm0, %xmm2
+; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE3-NEXT: psadbw %xmm1, %xmm2
+; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT: psadbw %xmm1, %xmm0
+; SSE3-NEXT: packuswb %xmm2, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: testv4i32u:
@@ -921,10 +917,10 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psllw $8, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psllw $8, %xmm1
; SSE2-NEXT: paddb %xmm1, %xmm0
; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: retq
@@ -946,10 +942,10 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
-; SSE3-NEXT: psllw $8, %xmm0
+; SSE3-NEXT: paddb %xmm1, %xmm0
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT: movdqa %xmm0, %xmm1
+; SSE3-NEXT: psllw $8, %xmm1
; SSE3-NEXT: paddb %xmm1, %xmm0
; SSE3-NEXT: psrlw $8, %xmm0
; SSE3-NEXT: retq
@@ -1097,10 +1093,10 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psllw $8, %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psllw $8, %xmm1
; SSE2-NEXT: paddb %xmm1, %xmm0
; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: retq
@@ -1122,10 +1118,10 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
-; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE3-NEXT: movdqa %xmm1, %xmm0
-; SSE3-NEXT: psllw $8, %xmm0
+; SSE3-NEXT: paddb %xmm1, %xmm0
+; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE3-NEXT: movdqa %xmm0, %xmm1
+; SSE3-NEXT: psllw $8, %xmm1
; SSE3-NEXT: paddb %xmm1, %xmm0
; SSE3-NEXT: psrlw $8, %xmm0
; SSE3-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vselect-packss.ll b/llvm/test/CodeGen/X86/vselect-packss.ll
index fa782630ee19e..5e88143134cb0 100644
--- a/llvm/test/CodeGen/X86/vselect-packss.ll
+++ b/llvm/test/CodeGen/X86/vselect-packss.ll
@@ -225,15 +225,14 @@ define <16 x i8> @vselect_packss_v16i64(<16 x i64> %a0, <16 x i64> %a1, <16 x i8
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: pcmpeqd {{[0-9]+}}(%rsp), %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: packssdw %xmm2, %xmm1
-; SSE2-NEXT: packssdw %xmm3, %xmm1
-; SSE2-NEXT: packsswb %xmm5, %xmm1
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm1
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: packssdw %xmm2, %xmm0
+; SSE2-NEXT: packssdw %xmm3, %xmm0
+; SSE2-NEXT: packsswb %xmm5, %xmm0
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1
+; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: vselect_packss_v16i64: